1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_map.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Virtual memory mapping module.
64 */
65
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68
69 #include <vm/vm_options.h>
70
71 #include <libkern/OSAtomic.h>
72
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm_server.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90
91 #include <vm/cpm_internal.h>
92 #include <vm/memory_types.h>
93 #include <vm/vm_compressor_xnu.h>
94 #include <vm/vm_compressor_pager_internal.h>
95 #include <vm/vm_init_xnu.h>
96 #include <vm/vm_fault_internal.h>
97 #include <vm/vm_map_internal.h>
98 #include <vm/vm_object_internal.h>
99 #include <vm/vm_page_internal.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/pmap.h>
102 #include <vm/vm_kern_internal.h>
103 #include <ipc/ipc_port.h>
104 #include <kern/sched_prim.h>
105 #include <kern/misc_protos.h>
106
107 #include <mach/vm_map_server.h>
108 #include <mach/mach_host_server.h>
109 #include <vm/vm_memtag.h>
110 #include <vm/vm_protos_internal.h>
111 #include <vm/vm_purgeable_internal.h>
112
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_shared_region_internal.h>
115 #include <vm/vm_map_store_internal.h>
116 #include <vm/vm_memory_entry_xnu.h>
117 #include <vm/memory_object_internal.h>
118 #include <vm/vm_memory_entry.h>
119 #include <vm/vm_sanitize_internal.h>
120 #if DEVELOPMENT || DEBUG
121 #include <vm/vm_compressor_info.h>
122 #endif /* DEVELOPMENT || DEBUG */
123 #include <san/kasan.h>
124
125 #include <sys/resource.h>
126 #include <sys/random.h>
127 #include <sys/codesign.h>
128 #include <sys/code_signing.h>
129 #include <sys/mman.h>
130 #include <sys/reboot.h>
131 #include <sys/kdebug_triage.h>
132 #include <sys/reason.h>
133
134 #include <libkern/section_keywords.h>
135
136 #if DEVELOPMENT || DEBUG
137 extern int proc_selfcsflags(void);
138 int vm_log_xnu_user_debug = 0;
139 int panic_on_unsigned_execute = 0;
140 int panic_on_mlock_failure = 0;
141 #endif /* DEVELOPMENT || DEBUG */
142
143 #if DEVELOPMENT || DEBUG
144 int debug4k_filter = 0;
145 char debug4k_proc_name[1024] = "";
146 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
147 int debug4k_panic_on_misaligned_sharing = 0;
148 const char *debug4k_category_name[] = {
149 "error", /* 0 */
150 "life", /* 1 */
151 "load", /* 2 */
152 "fault", /* 3 */
153 "copy", /* 4 */
154 "share", /* 5 */
155 "adjust", /* 6 */
156 "pmap", /* 7 */
157 "mementry", /* 8 */
158 "iokit", /* 9 */
159 "upl", /* 10 */
160 "exc", /* 11 */
161 "vfs" /* 12 */
162 };
163 #endif /* DEVELOPMENT || DEBUG */
164 int debug4k_no_cow_copyin = 0;
165
166
167 #if __arm64__
168 extern const int fourk_binary_compatibility_unsafe;
169 #endif /* __arm64__ */
170 extern int proc_selfpid(void);
171 extern char *proc_name_address(void *p);
172 extern const char *proc_best_name(struct proc *p);
173
174 #if VM_MAP_DEBUG_APPLE_PROTECT
175 int vm_map_debug_apple_protect = 0;
176 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
177 #if VM_MAP_DEBUG_FOURK
178 int vm_map_debug_fourk = 0;
179 #endif /* VM_MAP_DEBUG_FOURK */
180
181 #if DEBUG || DEVELOPMENT
182 static TUNABLE(bool, vm_map_executable_immutable,
183 "vm_map_executable_immutable", true);
184 #else
185 #define vm_map_executable_immutable true
186 #endif
187
188 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
189
190 extern u_int32_t random(void); /* from <libkern/libkern.h> */
191 /* Internal prototypes
192 */
193
194 typedef struct vm_map_zap {
195 vm_map_entry_t vmz_head;
196 vm_map_entry_t *vmz_tail;
197 } *vm_map_zap_t;
198
199 #define VM_MAP_ZAP_DECLARE(zap) \
200 struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
201
202 extern kern_return_t vm_map_wire_external(
203 vm_map_t map,
204 vm_map_offset_ut start_u,
205 vm_map_offset_ut end_u,
206 vm_prot_ut prot_u,
207 boolean_t user_wire) __exported;
208
209 #if XNU_PLATFORM_MacOSX
210 extern /* exported via Private.<arch>.MacOSX.exports on macOS */
211 #else
212 static
213 #endif
214 kern_return_t vm_map_copyin_common(
215 vm_map_t src_map,
216 vm_map_address_ut src_addr,
217 vm_map_size_ut len,
218 boolean_t src_destroy,
219 boolean_t src_volatile,
220 vm_map_copy_t *copy_result, /* OUT */
221 boolean_t use_maxprot);
222
223 static vm_map_entry_t vm_map_entry_insert(
224 vm_map_t map,
225 vm_map_entry_t insp_entry,
226 vm_map_offset_t start,
227 vm_map_offset_t end,
228 vm_object_t object,
229 vm_object_offset_t offset,
230 vm_map_kernel_flags_t vmk_flags,
231 boolean_t needs_copy,
232 vm_prot_t cur_protection,
233 vm_prot_t max_protection,
234 vm_inherit_t inheritance,
235 boolean_t clear_map_aligned);
236
237 static void vm_map_simplify_range(
238 vm_map_t map,
239 vm_map_offset_t start,
240 vm_map_offset_t end); /* forward */
241
242 static boolean_t vm_map_range_check(
243 vm_map_t map,
244 vm_map_offset_t start,
245 vm_map_offset_t end,
246 vm_map_entry_t *entry);
247
248 static void vm_map_submap_pmap_clean(
249 vm_map_t map,
250 vm_map_offset_t start,
251 vm_map_offset_t end,
252 vm_map_t sub_map,
253 vm_map_offset_t offset);
254
255 static void vm_map_pmap_enter(
256 vm_map_t map,
257 vm_map_offset_t addr,
258 vm_map_offset_t end_addr,
259 vm_object_t object,
260 vm_object_offset_t offset,
261 vm_prot_t protection);
262
263 static void _vm_map_clip_end(
264 struct vm_map_header *map_header,
265 vm_map_entry_t entry,
266 vm_map_offset_t end);
267
268 static void _vm_map_clip_start(
269 struct vm_map_header *map_header,
270 vm_map_entry_t entry,
271 vm_map_offset_t start);
272
273 static kmem_return_t vm_map_delete(
274 vm_map_t map,
275 vm_map_offset_t start,
276 vm_map_offset_t end,
277 vmr_flags_t flags,
278 kmem_guard_t guard,
279 vm_map_zap_t zap);
280
281 static void vm_map_copy_insert(
282 vm_map_t map,
283 vm_map_entry_t after_where,
284 vm_map_copy_t copy);
285
286 static kern_return_t vm_map_copy_overwrite_unaligned(
287 vm_map_t dst_map,
288 vm_map_entry_t entry,
289 vm_map_copy_t copy,
290 vm_map_address_t start,
291 boolean_t discard_on_success);
292
293 static kern_return_t vm_map_copy_overwrite_aligned(
294 vm_map_t dst_map,
295 vm_map_entry_t tmp_entry,
296 vm_map_copy_t copy,
297 vm_map_offset_t start,
298 pmap_t pmap);
299
300 static kern_return_t vm_map_copyin_kernel_buffer(
301 vm_map_t src_map,
302 vm_map_address_t src_addr,
303 vm_map_size_t len,
304 boolean_t src_destroy,
305 vm_map_copy_t *copy_result); /* OUT */
306
307 static kern_return_t vm_map_copyout_kernel_buffer(
308 vm_map_t map,
309 vm_map_address_t *addr, /* IN/OUT */
310 vm_map_copy_t copy,
311 vm_map_size_t copy_size,
312 boolean_t overwrite,
313 boolean_t consume_on_success);
314
315 static void vm_map_fork_share(
316 vm_map_t old_map,
317 vm_map_entry_t old_entry,
318 vm_map_t new_map);
319
320 static boolean_t vm_map_fork_copy(
321 vm_map_t old_map,
322 vm_map_entry_t *old_entry_p,
323 vm_map_t new_map,
324 int vm_map_copyin_flags);
325
326 static kern_return_t vm_map_wire_nested(
327 vm_map_t map,
328 vm_map_offset_t start,
329 vm_map_offset_t end,
330 vm_prot_t caller_prot,
331 vm_tag_t tag,
332 boolean_t user_wire,
333 pmap_t map_pmap,
334 vm_map_offset_t pmap_addr,
335 ppnum_t *physpage_p);
336
337 static kern_return_t vm_map_unwire_nested(
338 vm_map_t map,
339 vm_map_offset_t start,
340 vm_map_offset_t end,
341 boolean_t user_wire,
342 pmap_t map_pmap,
343 vm_map_offset_t pmap_addr);
344
345 static kern_return_t vm_map_overwrite_submap_recurse(
346 vm_map_t dst_map,
347 vm_map_offset_t dst_addr,
348 vm_map_size_t dst_size);
349
350 static kern_return_t vm_map_copy_overwrite_nested(
351 vm_map_t dst_map,
352 vm_map_offset_t dst_addr,
353 vm_map_copy_t copy,
354 boolean_t interruptible,
355 pmap_t pmap,
356 boolean_t discard_on_success);
357
358 static kern_return_t vm_map_remap_extract(
359 vm_map_t map,
360 vm_map_offset_t addr,
361 vm_map_size_t size,
362 boolean_t copy,
363 vm_map_copy_t map_copy,
364 vm_prot_t *cur_protection,
365 vm_prot_t *max_protection,
366 vm_inherit_t inheritance,
367 vm_map_kernel_flags_t vmk_flags);
368
369 static void vm_map_region_look_for_page(
370 vm_map_t map,
371 vm_map_offset_t va,
372 vm_object_t object,
373 vm_object_offset_t offset,
374 int max_refcnt,
375 unsigned short depth,
376 vm_region_extended_info_t extended,
377 mach_msg_type_number_t count);
378
379 static boolean_t vm_map_region_has_obj_ref(
380 vm_map_entry_t entry,
381 vm_object_t object);
382
383
384 static kern_return_t vm_map_willneed(
385 vm_map_t map,
386 vm_map_offset_t start,
387 vm_map_offset_t end);
388
389 static kern_return_t vm_map_reuse_pages(
390 vm_map_t map,
391 vm_map_offset_t start,
392 vm_map_offset_t end);
393
394 static kern_return_t vm_map_reusable_pages(
395 vm_map_t map,
396 vm_map_offset_t start,
397 vm_map_offset_t end);
398
399 static kern_return_t vm_map_can_reuse(
400 vm_map_t map,
401 vm_map_offset_t start,
402 vm_map_offset_t end);
403
404 static kern_return_t vm_map_zero(
405 vm_map_t map,
406 vm_map_offset_t start,
407 vm_map_offset_t end);
408
409 static kern_return_t vm_map_random_address_for_size(
410 vm_map_t map,
411 vm_map_offset_t *address,
412 vm_map_size_t size,
413 vm_map_kernel_flags_t vmk_flags);
414
415
416 #if CONFIG_MAP_RANGES
417
418 static vm_map_range_id_t vm_map_user_range_resolve(
419 vm_map_t map,
420 mach_vm_address_t addr,
421 mach_vm_address_t size,
422 mach_vm_range_t range);
423
424 #endif /* CONFIG_MAP_RANGES */
425 #if MACH_ASSERT
426 static kern_return_t vm_map_pageout(
427 vm_map_t map,
428 vm_map_offset_t start,
429 vm_map_offset_t end);
430 #endif /* MACH_ASSERT */
431
432 kern_return_t vm_map_corpse_footprint_collect(
433 vm_map_t old_map,
434 vm_map_entry_t old_entry,
435 vm_map_t new_map);
436 void vm_map_corpse_footprint_collect_done(
437 vm_map_t new_map);
438 void vm_map_corpse_footprint_destroy(
439 vm_map_t map);
440 kern_return_t vm_map_corpse_footprint_query_page_info(
441 vm_map_t map,
442 vm_map_offset_t va,
443 int *disposition_p);
444 void vm_map_footprint_query_page_info(
445 vm_map_t map,
446 vm_map_entry_t map_entry,
447 vm_map_offset_t curr_s_offset,
448 int *disposition_p);
449
450 #if CONFIG_MAP_RANGES
451 static void vm_map_range_map_init(void);
452 #endif /* CONFIG_MAP_RANGES */
453
454 pid_t find_largest_process_vm_map_entries(void);
455
456 __attribute__((always_inline))
457 int
vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)458 vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)
459 {
460 int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
461
462 /* in vmk flags the meaning of fixed/anywhere is inverted */
463 return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
464 }
465
466 __attribute__((always_inline, overloadable))
467 void
vm_map_kernel_flags_set_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags,vm_tag_t vm_tag)468 vm_map_kernel_flags_set_vmflags(
469 vm_map_kernel_flags_t *vmk_flags,
470 int vm_flags,
471 vm_tag_t vm_tag)
472 {
473 vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
474 vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
475 vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK);
476 vmk_flags->vm_tag = vm_tag;
477 }
478
479 __attribute__((always_inline, overloadable))
480 void
vm_map_kernel_flags_set_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags_and_tag)481 vm_map_kernel_flags_set_vmflags(
482 vm_map_kernel_flags_t *vmk_flags,
483 int vm_flags_and_tag)
484 {
485 vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
486 vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
487 vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK);
488 VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag);
489 }
490
491 __attribute__((always_inline))
492 void
vm_map_kernel_flags_and_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags_mask)493 vm_map_kernel_flags_and_vmflags(
494 vm_map_kernel_flags_t *vmk_flags,
495 int vm_flags_mask)
496 {
497 /* this function doesn't handle the inverted FIXED/ANYWHERE */
498 assert(vm_flags_mask & VM_FLAGS_ANYWHERE);
499 vmk_flags->__vm_flags &= vm_flags_mask;
500 }
501
502 __attribute__((always_inline))
503 bool
vm_map_kernel_flags_check_vm_and_kflags(vm_map_kernel_flags_t vmk_flags,int vm_flags_mask)504 vm_map_kernel_flags_check_vm_and_kflags(
505 vm_map_kernel_flags_t vmk_flags,
506 int vm_flags_mask)
507 {
508 return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0;
509 }
510
/*
 * vm_map_kernel_flags_check_vmflags:
 *
 * Returns true when the VM_FLAGS_* bits carried by "vmk_flags" are all
 * covered by "vm_flags_mask".  On DEBUG/DEVELOPMENT builds, it also
 * sanity-checks (at no runtime cost when they pass) that the
 * vm_map_kernel_flags_t bitfields line up with the VM_FLAGS_* bit
 * positions they mirror.
 */
bool
vm_map_kernel_flags_check_vmflags(
	vm_map_kernel_flags_t vmk_flags,
	int vm_flags_mask)
{
	/* only the bits covered by VM_FLAGS_ANY_MASK participate in the check */
	int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;

	/* Note: up to 16 still has good calling conventions */
	static_assert(sizeof(vm_map_kernel_flags_t) == 8);

#if DEBUG || DEVELOPMENT
	/*
	 * All of this compiles to nothing if all checks pass.
	 */
	/*
	 * check(field, value): seed the union with `value`, clear `field`;
	 * if the whole word is then zero, `field` occupies exactly the bit
	 * position of the corresponding VM_FLAGS_* constant.
	 */
#define check(field, value)  ({ \
	vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
	fl.__vm_flags = (value); \
	fl.field = 0; \
	assert(fl.__vm_flags == 0); \
})

	/* bits 0-7 */
	check(vmf_fixed, VM_FLAGS_ANYWHERE); // kind of a lie this is inverted
	check(vmf_purgeable, VM_FLAGS_PURGABLE);
	check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
	check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
	check(vmf_no_cache, VM_FLAGS_NO_CACHE);
	check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
	check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
	check(vmf_permanent, VM_FLAGS_PERMANENT);

	/* bits 8-15 */
	check(vmf_tpro, VM_FLAGS_TPRO);
	check(vmf_overwrite, VM_FLAGS_OVERWRITE);

	/* bits 16-23 */
	check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
	check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
	check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);

	{
		vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;

		/* check user tags will never clip */
		fl.vm_tag = VM_MEMORY_COUNT - 1;
		assert(fl.vm_tag == VM_MEMORY_COUNT - 1);

		/* check kernel tags will never clip */
		fl.vm_tag = VM_MAX_TAG_VALUE - 1;
		assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
	}


#undef check
#endif /* DEBUG || DEVELOPMENT */

	/* pass when no VM_FLAGS_* bit outside the mask is set */
	return (vmflags & ~vm_flags_mask) == 0;
}
569
570 /*
571 * Macros to copy a vm_map_entry. We must be careful to correctly
572 * manage the wired page count. vm_map_entry_copy() creates a new
573 * map entry to the same memory - the wired count in the new entry
574 * must be set to zero. vm_map_entry_copy_full() creates a new
575 * entry that is identical to the old entry. This preserves the
576 * wire count; it's used for map splitting and zone changing in
577 * vm_map_copyout.
578 */
579
/*
 * vm_map_entry_copy_csm_assoc:
 *
 * Fix up the code-signing-monitor (CSM) related bits of a freshly
 * copied map entry; called from vm_map_entry_copy() after the raw copy.
 */
static inline void
vm_map_entry_copy_csm_assoc(
	vm_map_t map __unused,
	vm_map_entry_t new __unused,
	vm_map_entry_t old __unused)
{
#if CODE_SIGNING_MONITOR
	/* when code signing monitor is enabled, we want to reset on copy */
	new->csm_associated = FALSE;
#else
	/* when code signing monitor is not enabled, assert as a sanity check */
	assert(new->csm_associated == FALSE);
#endif
#if DEVELOPMENT || DEBUG
	/* optional debug logging when copying an entry marked for user debug */
	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    __FUNCTION__, __LINE__,
		    map, new, new->vme_start, new->vme_end);
	}
#endif /* DEVELOPMENT || DEBUG */
#if XNU_TARGET_OS_OSX
	/*
	 * On macOS, entries with "vme_xnu_user_debug" can be copied during fork()
	 * and we want the child's entry to keep its "vme_xnu_user_debug" to avoid
	 * triggering CSM assertions when the child accesses its mapping.
	 */
#else /* XNU_TARGET_OS_OSX */
	new->vme_xnu_user_debug = FALSE;
#endif /* XNU_TARGET_OS_OSX */
}
614
615 /*
616 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
617 * But for security reasons on some platforms, we don't want the
618 * new mapping to be "used for jit", so we reset the flag here.
619 */
620 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)621 vm_map_entry_copy_code_signing(
622 vm_map_t map,
623 vm_map_entry_t new,
624 vm_map_entry_t old __unused)
625 {
626 if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
627 assert(new->used_for_jit == old->used_for_jit);
628 } else {
629 if (old->used_for_jit) {
630 DTRACE_VM3(cs_wx,
631 uint64_t, new->vme_start,
632 uint64_t, new->vme_end,
633 vm_prot_t, new->protection);
634 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
635 proc_selfpid(),
636 (get_bsdtask_info(current_task())
637 ? proc_name_address(get_bsdtask_info(current_task()))
638 : "?"),
639 __FUNCTION__,
640 "removing execute access");
641 new->protection &= ~VM_PROT_EXECUTE;
642 new->max_protection &= ~VM_PROT_EXECUTE;
643 }
644 new->used_for_jit = FALSE;
645 }
646 }
647
/*
 * vm_map_entry_copy_full:
 *
 * Bitwise-copy OLD into NEW (preserving the wired counts, see the
 * comment block above), after balancing the backtrace references held
 * by the debug/accounting fields so that the duplicated refs in the
 * copy stay properly counted.
 */
static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
#if MAP_ENTRY_CREATION_DEBUG
	/* drop the ref NEW currently holds, take one for the copy of OLD's */
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	/* same balancing for the insertion backtrace ref */
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
#if VM_BTLOG_TAGS
	/* Discard the btref that might be in the new entry */
	if (new->vme_kernel_object) {
		btref_put(new->vme_tag_btref);
	}
	/* Retain the btref in the old entry to account for its copy */
	if (old->vme_kernel_object) {
		btref_retain(old->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
	*new = *old;
}
673
674 static inline void
vm_map_entry_copy(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old)675 vm_map_entry_copy(
676 vm_map_t map,
677 vm_map_entry_t new,
678 vm_map_entry_t old)
679 {
680 vm_map_entry_copy_full(new, old);
681
682 new->is_shared = FALSE;
683 new->needs_wakeup = FALSE;
684 new->in_transition = FALSE;
685 new->wired_count = 0;
686 new->user_wired_count = 0;
687 new->vme_permanent = FALSE;
688 vm_map_entry_copy_code_signing(map, new, old);
689 vm_map_entry_copy_csm_assoc(map, new, old);
690 if (new->iokit_acct) {
691 assertf(!new->use_pmap, "old %p new %p\n", old, new);
692 new->iokit_acct = FALSE;
693 new->use_pmap = TRUE;
694 }
695 new->vme_resilient_codesign = FALSE;
696 new->vme_resilient_media = FALSE;
697 new->vme_atomic = FALSE;
698 new->vme_no_copy_on_read = FALSE;
699 }
700
701 /*
702 * Normal lock_read_to_write() returns FALSE/0 on failure.
703 * These functions evaluate to zero on success and non-zero value on failure.
704 */
705 __attribute__((always_inline))
706 int
vm_map_lock_read_to_write(vm_map_t map)707 vm_map_lock_read_to_write(vm_map_t map)
708 {
709 if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
710 DTRACE_VM(vm_map_lock_upgrade);
711 return 0;
712 }
713 return 1;
714 }
715
716 __attribute__((always_inline))
717 boolean_t
vm_map_try_lock(vm_map_t map)718 vm_map_try_lock(vm_map_t map)
719 {
720 if (lck_rw_try_lock_exclusive(&(map)->lock)) {
721 DTRACE_VM(vm_map_lock_w);
722 return TRUE;
723 }
724 return FALSE;
725 }
726
727 __attribute__((always_inline))
728 boolean_t
vm_map_try_lock_read(vm_map_t map)729 vm_map_try_lock_read(vm_map_t map)
730 {
731 if (lck_rw_try_lock_shared(&(map)->lock)) {
732 DTRACE_VM(vm_map_lock_r);
733 return TRUE;
734 }
735 return FALSE;
736 }
737
738 /*!
739 * @function kdp_vm_map_is_acquired_exclusive
740 *
741 * @abstract
742 * Checks if vm map is acquired exclusive.
743 *
744 * @discussion
745 * NOT SAFE: To be used only by kernel debugger.
746 *
747 * @param map map to check
748 *
749 * @returns TRUE if the map is acquired exclusively.
750 */
boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)
{
	/* debugger-context-only probe of the map lock (see header comment) */
	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
}
756
757 /*
758 * Routines to get the page size the caller should
759 * use while inspecting the target address space.
760 * Use the "_safely" variant if the caller is dealing with a user-provided
761 * array whose size depends on the page size, to avoid any overflow or
762 * underflow of a user-allocated buffer.
763 */
764 int
vm_self_region_page_shift_safely(vm_map_t target_map)765 vm_self_region_page_shift_safely(
766 vm_map_t target_map)
767 {
768 int effective_page_shift = 0;
769
770 if (PAGE_SIZE == (4096)) {
771 /* x86_64 and 4k watches: always use 4k */
772 return PAGE_SHIFT;
773 }
774 /* did caller provide an explicit page size for this thread to use? */
775 effective_page_shift = thread_self_region_page_shift();
776 if (effective_page_shift) {
777 /* use the explicitly-provided page size */
778 return effective_page_shift;
779 }
780 /* no explicit page size: use the caller's page size... */
781 effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
782 if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
783 /* page size match: safe to use */
784 return effective_page_shift;
785 }
786 /* page size mismatch */
787 return -1;
788 }
789 int
vm_self_region_page_shift(vm_map_t target_map)790 vm_self_region_page_shift(
791 vm_map_t target_map)
792 {
793 int effective_page_shift;
794
795 effective_page_shift = vm_self_region_page_shift_safely(target_map);
796 if (effective_page_shift == -1) {
797 /* no safe value but OK to guess for caller */
798 effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
799 VM_MAP_PAGE_SHIFT(target_map));
800 }
801 return effective_page_shift;
802 }
803
804
805 /*
806 * Decide if we want to allow processes to execute from their data or stack areas.
807 * override_nx() returns true if we do. Data/stack execution can be enabled independently
808 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
809 * or allow_stack_exec to enable data execution for that type of data area for that particular
810 * ABI (or both by or'ing the flags together). These are initialized in the architecture
811 * specific pmap files since the default behavior varies according to architecture. The
812 * main reason it varies is because of the need to provide binary compatibility with old
813 * applications that were written before these restrictions came into being. In the old
814 * days, an app could execute anything it could read, but this has slowly been tightened
815 * up over time. The default behavior is:
816 *
817 * 32-bit PPC apps may execute from both stack and data areas
 * 32-bit Intel apps may execute from data areas but not stack
819 * 64-bit PPC/Intel apps may not execute from either data or stack
820 *
821 * An application on any architecture may override these defaults by explicitly
822 * adding PROT_EXEC permission to the page in question with the mprotect(2)
823 * system call. This code here just determines what happens when an app tries to
824 * execute from a page that lacks execute permission.
825 *
826 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
827 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
828 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
829 * execution from data areas for a particular binary even if the arch normally permits it. As
830 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
831 * to support some complicated use cases, notably browsers with out-of-process plugins that
832 * are not all NX-safe.
833 */
834
835 extern int allow_data_exec, allow_stack_exec;
836
837 int
override_nx(vm_map_t map,uint32_t user_tag)838 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
839 {
840 int current_abi;
841
842 if (map->pmap == kernel_pmap) {
843 return FALSE;
844 }
845
846 /*
847 * Determine if the app is running in 32 or 64 bit mode.
848 */
849
850 if (vm_map_is_64bit(map)) {
851 current_abi = VM_ABI_64;
852 } else {
853 current_abi = VM_ABI_32;
854 }
855
856 /*
857 * Determine if we should allow the execution based on whether it's a
858 * stack or data area and the current architecture.
859 */
860
861 if (user_tag == VM_MEMORY_STACK) {
862 return allow_stack_exec & current_abi;
863 }
864
865 return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
866 }
867
868
869 /*
870 * Virtual memory maps provide for the mapping, protection,
871 * and sharing of virtual memory objects. In addition,
872 * this module provides for an efficient virtual copy of
873 * memory from one map to another.
874 *
875 * Synchronization is required prior to most operations.
876 *
877 * Maps consist of an ordered doubly-linked list of simple
878 * entries; a single hint is used to speed up lookups.
879 *
880 * Sharing maps have been deleted from this version of Mach.
881 * All shared objects are now mapped directly into the respective
882 * maps. This requires a change in the copy on write strategy;
883 * the asymmetric (delayed) strategy is used for shared temporary
884 * objects instead of the symmetric (shadow) strategy. All maps
885 * are now "top level" maps (either task map, kernel map or submap
886 * of the kernel map).
887 *
 * Since portions of maps are specified by start/end addresses,
889 * which may not align with existing map entries, all
890 * routines merely "clip" entries to these start/end values.
891 * [That is, an entry is split into two, bordering at a
892 * start or end value.] Note that these clippings may not
893 * always be necessary (as the two resulting entries are then
894 * not changed); however, the clipping is done for convenience.
895 * No attempt is currently made to "glue back together" two
896 * abutting entries.
897 *
898 * The symmetric (shadow) copy strategy implements virtual copy
899 * by copying VM object references from one map to
900 * another, and then marking both regions as copy-on-write.
901 * It is important to note that only one writeable reference
902 * to a VM object region exists in any map when this strategy
903 * is used -- this means that shadow object creation can be
 * delayed until a write operation occurs. The asymmetric (delayed)
905 * strategy allows multiple maps to have writeable references to
906 * the same region of a vm object, and hence cannot delay creating
907 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
908 * Copying of permanent objects is completely different; see
909 * vm_object_copy_strategically() in vm_object.c.
910 */
911
912 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
913
914 #define VM_MAP_ZONE_NAME "maps"
915 #define VM_MAP_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
916
917 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
918 #define VM_MAP_ENTRY_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
919
920 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
921 #define VM_MAP_HOLES_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
922
923 /*
924 * Asserts that a vm_map_copy object is coming from the
925 * vm_map_copy_zone to ensure that it isn't a fake constructed
926 * anywhere else.
927 */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	/* fails (via zone_id_require) if "copy" is not from the copy zone */
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}
933
934 /*
935 * vm_map_require:
936 *
937 * Ensures that the argument is memory allocated from the genuine
938 * vm map zone. (See zone_id_require_allow_foreign).
939 */
void
vm_map_require(vm_map_t map)
{
	/* fails (via zone_id_require) if "map" is not from the vm map zone */
	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
}
945
946 #define VM_MAP_EARLY_COUNT_MAX 16
947 static __startup_data vm_offset_t map_data;
948 static __startup_data vm_size_t map_data_size;
949 static __startup_data vm_offset_t kentry_data;
950 static __startup_data vm_size_t kentry_data_size;
951 static __startup_data vm_offset_t map_holes_data;
952 static __startup_data vm_size_t map_holes_data_size;
953 static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
954 static __startup_data uint32_t early_map_count;
955
956 #if XNU_TARGET_OS_OSX
957 #define NO_COALESCE_LIMIT ((1024 * 128) - 1)
958 #else /* XNU_TARGET_OS_OSX */
959 #define NO_COALESCE_LIMIT 0
960 #endif /* XNU_TARGET_OS_OSX */
961
962 /* Skip acquiring locks if we're in the midst of a kernel core dump */
963 unsigned int not_in_kdp = 1;
964
965 unsigned int vm_map_set_cache_attr_count = 0;
966
967 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)968 vm_map_set_cache_attr(
969 vm_map_t map,
970 vm_map_offset_t va)
971 {
972 vm_map_entry_t map_entry;
973 vm_object_t object;
974 kern_return_t kr = KERN_SUCCESS;
975
976 vm_map_lock_read(map);
977
978 if (!vm_map_lookup_entry(map, va, &map_entry) ||
979 map_entry->is_sub_map) {
980 /*
981 * that memory is not properly mapped
982 */
983 kr = KERN_INVALID_ARGUMENT;
984 goto done;
985 }
986 object = VME_OBJECT(map_entry);
987
988 if (object == VM_OBJECT_NULL) {
989 /*
990 * there should be a VM object here at this point
991 */
992 kr = KERN_INVALID_ARGUMENT;
993 goto done;
994 }
995 vm_object_lock(object);
996 object->set_cache_attr = TRUE;
997 vm_object_unlock(object);
998
999 vm_map_set_cache_attr_count++;
1000 done:
1001 vm_map_unlock_read(map);
1002
1003 return kr;
1004 }
1005
1006
#if CONFIG_CODE_DECRYPTION
/*
 * vm_map_apple_protected:
 * This remaps the requested part of the object with an object backed by
 * the decrypting pager.
 * crypt_info contains entry points and session data for the crypt module.
 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
 */
kern_return_t
vm_map_apple_protected(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_object_offset_t crypto_backing_offset,
	struct pager_crypt_info *crypt_info,
	uint32_t cryptid)
{
	boolean_t map_locked;
	kern_return_t kr;
	vm_map_entry_t map_entry;
	struct vm_map_entry tmp_entry;
	memory_object_t unprotected_mem_obj;
	vm_object_t protected_object;
	vm_map_offset_t map_addr;
	vm_map_offset_t start_aligned, end_aligned;
	vm_object_offset_t crypto_start, crypto_end;
	boolean_t cache_pager;

	map_locked = FALSE;
	unprotected_mem_obj = MEMORY_OBJECT_NULL;

	/* reject ranges that wrap around the address space */
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}
	/* align first to the native page size, then to the map's page size */
	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));

#if __arm64__
	/*
	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
	 * so we might have to loop and establish up to 3 mappings:
	 *
	 * + the first 16K-page, which might overlap with the previous
	 *   4K-aligned mapping,
	 * + the center,
	 * + the last 16K-page, which might overlap with the next
	 *   4K-aligned mapping.
	 * Each of these mapping might be backed by a vnode pager (if
	 * properly page-aligned) or a "fourk_pager", itself backed by a
	 * vnode pager (if 4K-aligned but not page-aligned).
	 */
#endif /* __arm64__ */

	/* process one map entry per iteration, advancing to tmp_entry.vme_end */
	map_addr = start_aligned;
	for (map_addr = start_aligned;
	    map_addr < end;
	    map_addr = tmp_entry.vme_end) {
		vm_map_lock(map);
		map_locked = TRUE;

		/* lookup the protected VM object */
		if (!vm_map_lookup_entry(map,
		    map_addr,
		    &map_entry) ||
		    map_entry->is_sub_map ||
		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
			/* that memory is not properly mapped */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* ensure mapped memory is mapped as executable except
		 * for the model decryption flow */
		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
		    !(map_entry->protection & VM_PROT_EXECUTE)) {
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* get the protected object to be decrypted */
		protected_object = VME_OBJECT(map_entry);
		if (protected_object == VM_OBJECT_NULL) {
			/* there should be a VM object here at this point */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}
		/* ensure protected object stays alive while map is unlocked */
		vm_object_reference(protected_object);

		/* limit the map entry to the area we want to cover */
		vm_map_clip_start(map, map_entry, start_aligned);
		vm_map_clip_end(map, map_entry, end_aligned);

		tmp_entry = *map_entry;
		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
		vm_map_unlock(map);
		map_locked = FALSE;

		/*
		 * This map entry might be only partially encrypted
		 * (if not fully "page-aligned").
		 */
		crypto_start = 0;
		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
		if (tmp_entry.vme_start < start) {
			if (tmp_entry.vme_start != start_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_start += (start - tmp_entry.vme_start);
		}
		if (tmp_entry.vme_end > end) {
			if (tmp_entry.vme_end != end_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_end -= (tmp_entry.vme_end - end);
		}

		/*
		 * This "extra backing offset" is needed to get the decryption
		 * routine to use the right key. It adjusts for the possibly
		 * relative offset of an interposed "4K" pager...
		 */
		if (crypto_backing_offset == (vm_object_offset_t) -1) {
			crypto_backing_offset = VME_OFFSET(&tmp_entry);
		}

		cache_pager = TRUE;
#if XNU_TARGET_OS_OSX
		if (vm_map_is_alien(map)) {
			cache_pager = FALSE;
		}
#endif /* XNU_TARGET_OS_OSX */

		/*
		 * Lookup (and create if necessary) the protected memory object
		 * matching that VM object.
		 * If successful, this also grabs a reference on the memory object,
		 * to guarantee that it doesn't go away before we get a chance to map
		 * it.
		 */
		unprotected_mem_obj = apple_protect_pager_setup(
			protected_object,
			VME_OFFSET(&tmp_entry),
			crypto_backing_offset,
			crypt_info,
			crypto_start,
			crypto_end,
			cache_pager);

		/* release extra ref on protected object */
		vm_object_deallocate(protected_object);

		if (unprotected_mem_obj == NULL) {
			kr = KERN_FAILURE;
			goto done;
		}

		/* can overwrite an immutable mapping */
		vm_map_kernel_flags_t vmk_flags = {
			.vmf_fixed = true,
			.vmf_overwrite = true,
			.vmkf_overwrite_immutable = true,
		};
		/* make the new mapping as "permanent" as the one it replaces */
		vmk_flags.vmf_permanent = tmp_entry.vme_permanent;

		/* map this memory object in place of the current one */
		map_addr = tmp_entry.vme_start;
		kr = mach_vm_map_kernel(map,
		    vm_sanitize_wrap_addr_ref(&map_addr),
		    (tmp_entry.vme_end -
		    tmp_entry.vme_start),
		    (mach_vm_offset_t) 0,
		    vmk_flags,
		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
		    0,
		    TRUE,
		    tmp_entry.protection,
		    tmp_entry.max_protection,
		    tmp_entry.inheritance);
		/* an overwriting fixed mapping over an existing range must succeed */
		assertf(kr == KERN_SUCCESS,
		    "kr = 0x%x\n", kr);
		assertf(map_addr == tmp_entry.vme_start,
		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
		    (uint64_t)map_addr,
		    (uint64_t) tmp_entry.vme_start,
		    &tmp_entry);

#if VM_MAP_DEBUG_APPLE_PROTECT
		if (vm_map_debug_apple_protect) {
			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
			    " backing:[object:%p,offset:0x%llx,"
			    "crypto_backing_offset:0x%llx,"
			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
			    map,
			    (uint64_t) map_addr,
			    (uint64_t) (map_addr + (tmp_entry.vme_end -
			    tmp_entry.vme_start)),
			    unprotected_mem_obj,
			    protected_object,
			    VME_OFFSET(&tmp_entry),
			    crypto_backing_offset,
			    crypto_start,
			    crypto_end);
		}
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

		/*
		 * Release the reference obtained by
		 * apple_protect_pager_setup().
		 * The mapping (if it succeeded) is now holding a reference on
		 * the memory object.
		 */
		memory_object_deallocate(unprotected_mem_obj);
		unprotected_mem_obj = MEMORY_OBJECT_NULL;

		/* continue with next map entry */
		crypto_backing_offset += (tmp_entry.vme_end -
		    tmp_entry.vme_start);
		crypto_backing_offset -= crypto_start;
	}
	kr = KERN_SUCCESS;

done:
	if (map_locked) {
		vm_map_unlock(map);
	}
	return kr;
}
#endif /* CONFIG_CODE_DECRYPTION */
1244
1245
/* Lock group and attributes shared by all VM map locks. */
LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

/* Per-platform defaults for the malloc no-copy-on-write boot-args. */
#if XNU_TARGET_OS_OSX
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
#else /* XNU_TARGET_OS_OSX */
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
#endif /* XNU_TARGET_OS_OSX */
TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
/* Bitmask of VM_MEMORY_MALLOC* tags exempt from copy-on-write; built in vm_map_init(). */
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
/* Set via the "vm_check_map_sanity" boot-arg (see vm_map_init). */
int vm_check_map_sanity = 0;
#endif
1263
1264 /*
1265 * vm_map_init:
1266 *
1267 * Initialize the vm_map module. Must be called before
1268 * any other vm_map routines.
1269 *
1270 * Map and entry structures are allocated from zones -- we must
1271 * initialize those zones.
1272 *
1273 * There are three zones of interest:
1274 *
1275 * vm_map_zone: used to allocate maps.
1276 * vm_map_entry_zone: used to allocate map entries.
1277 *
1278 * LP32:
1279 * vm_map_entry_reserved_zone: fallback zone for kernel map entries
1280 *
1281 * The kernel allocates map entries from a special zone that is initially
1282 * "crammed" with memory. It would be difficult (perhaps impossible) for
1283 * the kernel to allocate more memory to a entry zone when it became
1284 * empty since the very act of allocating memory implies the creation
1285 * of a new entry.
1286 */
1287 __startup_func
1288 void
vm_map_init(void)1289 vm_map_init(void)
1290 {
1291
1292 #if MACH_ASSERT
1293 PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1294 sizeof(debug4k_filter));
1295 #endif /* MACH_ASSERT */
1296
1297 zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1298 VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1299
1300 /*
1301 * Don't quarantine because we always need elements available
1302 * Disallow GC on this zone... to aid the GC.
1303 */
1304 zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1305 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1306 ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1307 z->z_elems_rsv = (uint16_t)(32 *
1308 (ml_early_cpu_max_number() + 1));
1309 });
1310
1311 zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1312 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1313 ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1314 z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1315 });
1316
1317 zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1318 ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1319
1320 /*
1321 * Add the stolen memory to zones, adjust zone size and stolen counts.
1322 */
1323 zone_cram_early(vm_map_zone, map_data, map_data_size);
1324 zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1325 zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1326 printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1327 zone_count_free(vm_map_zone),
1328 zone_count_free(vm_map_entry_zone),
1329 zone_count_free(vm_map_holes_zone));
1330
1331 /*
1332 * Since these are covered by zones, remove them from stolen page accounting.
1333 */
1334 VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1335
1336 #if VM_MAP_DEBUG_APPLE_PROTECT
1337 PE_parse_boot_argn("vm_map_debug_apple_protect",
1338 &vm_map_debug_apple_protect,
1339 sizeof(vm_map_debug_apple_protect));
1340 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1341 #if VM_MAP_DEBUG_APPLE_FOURK
1342 PE_parse_boot_argn("vm_map_debug_fourk",
1343 &vm_map_debug_fourk,
1344 sizeof(vm_map_debug_fourk));
1345 #endif /* VM_MAP_DEBUG_FOURK */
1346
1347 if (malloc_no_cow) {
1348 vm_memory_malloc_no_cow_mask = 0ULL;
1349 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1350 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1351 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1352 #if XNU_TARGET_OS_OSX
1353 /*
1354 * On macOS, keep copy-on-write for MALLOC_LARGE because
1355 * realloc() may use vm_copy() to transfer the old contents
1356 * to the new location.
1357 */
1358 #else /* XNU_TARGET_OS_OSX */
1359 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1360 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1361 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1362 #endif /* XNU_TARGET_OS_OSX */
1363 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1364 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1365 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1366 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1367 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1368 PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1369 &vm_memory_malloc_no_cow_mask,
1370 sizeof(vm_memory_malloc_no_cow_mask));
1371 }
1372
1373 #if CONFIG_MAP_RANGES
1374 vm_map_range_map_init();
1375 #endif /* CONFIG_MAP_RANGES */
1376
1377 #if DEBUG
1378 PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1379 if (vm_check_map_sanity) {
1380 kprintf("VM sanity checking enabled\n");
1381 } else {
1382 kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1383 }
1384 #endif /* DEBUG */
1385
1386 #if DEVELOPMENT || DEBUG
1387 PE_parse_boot_argn("panic_on_unsigned_execute",
1388 &panic_on_unsigned_execute,
1389 sizeof(panic_on_unsigned_execute));
1390 PE_parse_boot_argn("panic_on_mlock_failure",
1391 &panic_on_mlock_failure,
1392 sizeof(panic_on_mlock_failure));
1393 #endif /* DEVELOPMENT || DEBUG */
1394 }
1395
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	/* the three regions are carved back-to-back out of one allocation */
	kentry_data = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1445
1446 __startup_func
1447 static void
vm_kernel_boostraped(void)1448 vm_kernel_boostraped(void)
1449 {
1450 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1451 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1452 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1453
1454 printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1455 zone_count_free(vm_map_zone),
1456 zone_count_free(vm_map_entry_zone),
1457 zone_count_free(vm_map_holes_zone));
1458 }
1459 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1460
1461 void
vm_map_disable_hole_optimization(vm_map_t map)1462 vm_map_disable_hole_optimization(vm_map_t map)
1463 {
1464 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1465
1466 if (map->holelistenabled) {
1467 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1468
1469 while (hole_entry != NULL) {
1470 next_hole_entry = hole_entry->vme_next;
1471
1472 hole_entry->vme_next = NULL;
1473 hole_entry->vme_prev = NULL;
1474 zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1475
1476 if (next_hole_entry == head_entry) {
1477 hole_entry = NULL;
1478 } else {
1479 hole_entry = next_hole_entry;
1480 }
1481 }
1482
1483 map->holes_list = NULL;
1484 map->holelistenabled = FALSE;
1485
1486 map->first_free = vm_map_first_entry(map);
1487 SAVE_HINT_HOLE_WRITE(map, NULL);
1488 }
1489 }
1490
1491 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1492 vm_kernel_map_is_kernel(vm_map_t map)
1493 {
1494 return map->pmap == kernel_pmap;
1495 }
1496
1497 /*
1498 * vm_map_create:
1499 *
1500 * Creates and returns a new empty VM map with
1501 * the given physical map structure, and having
1502 * the given lower and upper address bounds.
1503 */
1504
1505 extern vm_map_t vm_map_create_external(
1506 pmap_t pmap,
1507 vm_map_offset_t min_off,
1508 vm_map_offset_t max_off,
1509 boolean_t pageable);
1510
1511 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1512 vm_map_create_external(
1513 pmap_t pmap,
1514 vm_map_offset_t min,
1515 vm_map_offset_t max,
1516 boolean_t pageable)
1517 {
1518 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1519
1520 if (pageable) {
1521 options |= VM_MAP_CREATE_PAGEABLE;
1522 }
1523 return vm_map_create_options(pmap, min, max, options);
1524 }
1525
1526 __startup_func
1527 void
vm_map_will_allocate_early_map(vm_map_t * owner)1528 vm_map_will_allocate_early_map(vm_map_t *owner)
1529 {
1530 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1531 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1532 }
1533
1534 early_map_owners[early_map_count++] = owner;
1535 }
1536
1537 __startup_func
1538 void
vm_map_relocate_early_maps(vm_offset_t delta)1539 vm_map_relocate_early_maps(vm_offset_t delta)
1540 {
1541 for (uint32_t i = 0; i < early_map_count; i++) {
1542 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1543
1544 *early_map_owners[i] = (vm_map_t)(addr + delta);
1545 }
1546
1547 early_map_count = ~0u;
1548 }
1549
/*
 * Routine: vm_map_relocate_early_elem
 *
 * Purpose:
 * Early zone elements are allocated in a temporary part
 * of the address space.
 *
 * Once the zones live in their final place, the early
 * VM maps, map entries and map holes need to be relocated.
 *
 * It involves rewriting any vm_map_t, vm_map_entry_t or
 * pointers to vm_map_links. Other pointers to other types
 * are fine.
 *
 * Fortunately, pointers to those types are self-contained
 * in those zones, _except_ for pointers to VM maps,
 * which are tracked during early boot and fixed with
 * vm_map_relocate_early_maps().
 */
__startup_func
void
vm_map_relocate_early_elem(
	uint32_t zone_id,
	vm_offset_t new_addr,
	vm_offset_t delta)
{
	/* Slide the given pointer field of the element at new_addr by delta (NULL stays NULL). */
#define relocate(type_t, field) ({ \
	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
	if (*__field) { \
	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
	} \
})

	switch (zone_id) {
	case ZONE_ID_VM_MAP:
	case ZONE_ID_VM_MAP_ENTRY:
	case ZONE_ID_VM_MAP_HOLES:
		break;

	default:
		panic("Unexpected zone ID %d", zone_id);
	}

	if (zone_id == ZONE_ID_VM_MAP) {
		relocate(vm_map_t, hdr.links.prev);
		relocate(vm_map_t, hdr.links.next);
		/* early maps are kernel maps by construction */
		((vm_map_t)new_addr)->pmap = kernel_pmap;
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
#endif /* VM_MAP_STORE_USE_RB */
		relocate(vm_map_t, hint);
		relocate(vm_map_t, hole_hint);
		relocate(vm_map_t, first_free);
		return;
	}

	/* entries and holes both start with a vm_map_links: fix prev/next */
	relocate(struct vm_map_links *, prev);
	relocate(struct vm_map_links *, next);

	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_entry_t, store.entry.rbe_left);
		relocate(vm_map_entry_t, store.entry.rbe_right);
		relocate(vm_map_entry_t, store.entry.rbe_parent);
#endif /* VM_MAP_STORE_USE_RB */
		if (((vm_map_entry_t)new_addr)->is_sub_map) {
			/* no object to relocate because we haven't made any */
			((vm_map_entry_t)new_addr)->vme_submap +=
			    delta >> VME_SUBMAP_SHIFT;
		}
#if MAP_ENTRY_CREATION_DEBUG
		relocate(vm_map_entry_t, vme_creation_maphdr);
#endif /* MAP_ENTRY_CREATION_DEBUG */
	}

#undef relocate
}
1627
/*
 * vm_map_create_options:
 *
 * Allocate and initialize a new VM map over [min, max) backed by
 * the given pmap. "options" selects pageability, corpse footprint,
 * hole-list tracking and the never-faults property.
 * The returned map has a single reference.
 */
vm_map_t
vm_map_create_options(
	pmap_t pmap,
	vm_map_offset_t min,
	vm_map_offset_t max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	/* before zalloc is up, only registered early maps may be created */
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit = RLIM_INFINITY; /* default unlimited */
	result->data_limit = RLIM_INFINITY; /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		struct vm_map_links *hole_entry;

		/* one initial hole spanning the whole map */
		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
		hole_entry->start = min;
		/*
		 * Holes can be used to track ranges all the way up to
		 * MACH_VM_MAX_ADDRESS or more (e.g. kernel map).
		 */
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
		result->holes_list = result->hole_hint = hole_entry;
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1694
1695 /*
1696 * Adjusts a submap that was made by kmem_suballoc()
1697 * before it knew where it would be mapped,
1698 * so that it has the right min/max offsets.
1699 *
1700 * We do not need to hold any locks:
1701 * only the caller knows about this map,
1702 * and it is not published on any entry yet.
1703 */
1704 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1705 vm_map_adjust_offsets(
1706 vm_map_t map,
1707 vm_map_offset_t min_off,
1708 vm_map_offset_t max_off)
1709 {
1710 assert(map->min_offset == 0);
1711 assert(map->max_offset == max_off - min_off);
1712 assert(map->hdr.nentries == 0);
1713 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1714
1715 map->min_offset = min_off;
1716 map->max_offset = max_off;
1717
1718 if (map->holelistenabled) {
1719 struct vm_map_links *hole = map->holes_list;
1720
1721 hole->start = min_off;
1722 #if defined(__arm64__)
1723 hole->end = max_off;
1724 #else
1725 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1726 #endif
1727 }
1728 }
1729
1730
1731 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1732 vm_map_adjusted_size(vm_map_t map)
1733 {
1734 const struct vm_reserved_region *regions = NULL;
1735 size_t num_regions = 0;
1736 mach_vm_size_t reserved_size = 0, map_size = 0;
1737
1738 if (map == NULL || (map->size == 0)) {
1739 return 0;
1740 }
1741
1742 map_size = map->size;
1743
1744 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1745 /*
1746 * No special reserved regions or not an exotic map or the task
1747 * is terminating and these special regions might have already
1748 * been deallocated.
1749 */
1750 return map_size;
1751 }
1752
1753 num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), ®ions);
1754 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1755
1756 while (num_regions) {
1757 reserved_size += regions[--num_regions].vmrr_size;
1758 }
1759
1760 /*
1761 * There are a few places where the map is being switched out due to
1762 * 'termination' without that bit being set (e.g. exec and corpse purging).
1763 * In those cases, we could have the map's regions being deallocated on
1764 * a core while some accounting process is trying to get the map's size.
1765 * So this assert can't be enabled till all those places are uniform in
1766 * their use of the 'map->terminated' bit.
1767 *
1768 * assert(map_size >= reserved_size);
1769 */
1770
1771 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1772 }
1773
/*
 * vm_map_entry_create: [ internal use only ]
 *
 * Allocates a VM map entry for insertion in the
 * given map (or map copy). No fields are filled.
 *
 * The VM entry will be zero initialized, except for:
 * - behavior set to VM_BEHAVIOR_DEFAULT
 * - inheritance set to VM_INHERIT_DEFAULT
 */
#define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)

#define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)

static vm_map_entry_t
_vm_map_entry_create(
	struct vm_map_header *map_header __unused)
{
	vm_map_entry_t entry = NULL;

	/* Z_ZERO guarantees the assumptions asserted below */
	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);

	/*
	 * Help the compiler with what we know to be true,
	 * so that the further bitfields inits have good codegen.
	 *
	 * See rdar://87041299
	 */
	__builtin_assume(entry->vme_object_value == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);

	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
	    "VME_ALIAS_MASK covers tags");

	static_assert(VM_BEHAVIOR_DEFAULT == 0,
	    "can skip zeroing of the behavior field");
	entry->inheritance = VM_INHERIT_DEFAULT;

#if MAP_ENTRY_CREATION_DEBUG
	/* record who created this entry and from where */
	entry->vme_creation_maphdr = map_header;
	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
	    BTREF_GET_NOWAIT);
#endif
	return entry;
}
1820
/*
 * vm_map_entry_dispose: [ internal use only ]
 *
 * Inverse of vm_map_entry_create.
 *
 * write map lock held so no need to
 * do anything special to insure correctness
 * of the stores
 */
static void
vm_map_entry_dispose(
	vm_map_entry_t entry)
{
	/* drop any debug backtrace references held by the entry */
#if VM_BTLOG_TAGS
	if (entry->vme_kernel_object) {
		btref_put(entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(entry->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(entry->vme_insertion_bt);
#endif
	zfree(vm_map_entry_zone, entry);
}

#define vm_map_copy_entry_dispose(copy_entry) \
	vm_map_entry_dispose(copy_entry)
1850
/* Return the head of the zap list (VM_MAP_ENTRY_NULL when empty). */
static vm_map_entry_t
vm_map_zap_first_entry(
	vm_map_zap_t list)
{
	return list->vmz_head;
}
1857
/* Return the last entry of a non-empty zap list. */
static vm_map_entry_t
vm_map_zap_last_entry(
	vm_map_zap_t list)
{
	assert(vm_map_zap_first_entry(list));
	/* vmz_tail points at the last entry's vme_next field */
	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
}
1865
/* Append "entry" at the tail of the zap list. */
static void
vm_map_zap_append(
	vm_map_zap_t list,
	vm_map_entry_t entry)
{
	/* terminate the entry, hook it at the tail, advance the tail */
	entry->vme_next = VM_MAP_ENTRY_NULL;
	*list->vmz_tail = entry;
	list->vmz_tail = &entry->vme_next;
}
1875
1876 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1877 vm_map_zap_pop(
1878 vm_map_zap_t list)
1879 {
1880 vm_map_entry_t head = list->vmz_head;
1881
1882 if (head != VM_MAP_ENTRY_NULL &&
1883 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1884 list->vmz_tail = &list->vmz_head;
1885 }
1886
1887 return head;
1888 }
1889
1890 static void
vm_map_zap_dispose(vm_map_zap_t list)1891 vm_map_zap_dispose(
1892 vm_map_zap_t list)
1893 {
1894 vm_map_entry_t entry;
1895
1896 while ((entry = vm_map_zap_pop(list))) {
1897 if (entry->is_sub_map) {
1898 vm_map_deallocate(VME_SUBMAP(entry));
1899 } else {
1900 vm_object_deallocate(VME_OBJECT(entry));
1901 }
1902
1903 vm_map_entry_dispose(entry);
1904 }
1905 }
1906
#if MACH_ASSERT
static boolean_t first_free_check = FALSE;
boolean_t
first_free_is_valid(
	vm_map_t map)
{
	/* validation is opt-in via first_free_check */
	if (first_free_check) {
		return first_free_is_valid_store(map);
	}
	return TRUE;
}
#endif /* MACH_ASSERT */
1920
1921
#define vm_map_copy_entry_link(copy, after_where, entry) \
	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))

#define vm_map_copy_entry_unlink(copy, entry) \
	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)

/*
 * vm_map_destroy:
 *
 * Actually destroy a map.
 */
void
vm_map_destroy(
	vm_map_t map)
{
	/* final cleanup: this is not allowed to fail */
	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;

	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);

	map->terminated = true;
	/* clean up regular map entries */
	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
	    KMEM_GUARD_NONE, &zap);
	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
	    KMEM_GUARD_NONE, &zap);

	vm_map_disable_hole_optimization(map);
	vm_map_corpse_footprint_destroy(map);

	vm_map_unlock(map);

	/* entries collected above are deallocated outside the map lock */
	vm_map_zap_dispose(&zap);

	assert(map->hdr.nentries == 0);

	if (map->pmap) {
		pmap_destroy(map->pmap);
	}

	lck_rw_destroy(&map->lock, &vm_map_lck_grp);

#if CONFIG_MAP_RANGES
	kfree_data(map->extra_ranges,
	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
#endif

	zfree_id(ZONE_ID_VM_MAP, map);
}
1974
1975 /*
1976 * Returns pid of the task with the largest number of VM map entries.
1977 * Used in the zone-map-exhaustion jetsam path.
1978 */
1979 pid_t
find_largest_process_vm_map_entries(void)1980 find_largest_process_vm_map_entries(void)
1981 {
1982 pid_t victim_pid = -1;
1983 int max_vm_map_entries = 0;
1984 task_t task = TASK_NULL;
1985 queue_head_t *task_list = &tasks;
1986
1987 lck_mtx_lock(&tasks_threads_lock);
1988 queue_iterate(task_list, task, task_t, tasks) {
1989 if (task == kernel_task || !task->active) {
1990 continue;
1991 }
1992
1993 vm_map_t task_map = task->map;
1994 if (task_map != VM_MAP_NULL) {
1995 int task_vm_map_entries = task_map->hdr.nentries;
1996 if (task_vm_map_entries > max_vm_map_entries) {
1997 max_vm_map_entries = task_vm_map_entries;
1998 victim_pid = pid_from_task(task);
1999 }
2000 }
2001 }
2002 lck_mtx_unlock(&tasks_threads_lock);
2003
2004 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
2005 return victim_pid;
2006 }
2007
2008
2009 /*
2010 * vm_map_lookup_entry: [ internal use only ]
2011 *
2012 * Calls into the vm map store layer to find the map
2013 * entry containing (or immediately preceding) the
2014 * specified address in the given map; the entry is returned
2015 * in the "entry" parameter. The boolean
2016 * result indicates whether the address is
2017 * actually contained in the map.
2018 */
2019 boolean_t
vm_map_lookup_entry(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2020 vm_map_lookup_entry(
2021 vm_map_t map,
2022 vm_map_offset_t address,
2023 vm_map_entry_t *entry) /* OUT */
2024 {
2025 bool result = false;
2026 if (VM_KERNEL_ADDRESS(address)) {
2027 address = VM_KERNEL_STRIP_UPTR(address);
2028 }
2029
2030 #if CONFIG_PROB_GZALLOC
2031 if (map->pmap == kernel_pmap) {
2032 assertf(!pgz_owned(address),
2033 "it is the responsibility of callers to unguard PGZ addresses");
2034 }
2035 #endif /* CONFIG_PROB_GZALLOC */
2036 result = vm_map_store_lookup_entry( map, address, entry );
2037
2038 return result;
2039 }
2040
2041 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2042 vm_map_lookup_entry_or_next(
2043 vm_map_t map,
2044 vm_map_offset_t address,
2045 vm_map_entry_t *entry) /* OUT */
2046 {
2047 if (vm_map_lookup_entry(map, address, entry)) {
2048 return true;
2049 }
2050
2051 *entry = (*entry)->vme_next;
2052 return false;
2053 }
2054
#if CONFIG_PROB_GZALLOC
/*
 * PGZ-tolerant flavor of vm_map_lookup_entry(): performs the same
 * pointer-tag stripping and store-layer lookup, but without asserting
 * that the address is not PGZ-guarded.
 */
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t map,
	vm_map_offset_t address,
	vm_map_entry_t *entry) /* OUT */
{
	vm_map_offset_t lookup_addr = address;

	if (VM_KERNEL_ADDRESS(lookup_addr)) {
		lookup_addr = VM_KERNEL_STRIP_UPTR(lookup_addr);
	}
	return vm_map_store_lookup_entry(map, lookup_addr, entry);
}
#endif /* CONFIG_PROB_GZALLOC */
2068
2069 /*
2070 * Routine: vm_map_range_invalid_panic
2071 * Purpose:
2072 * Panic on detection of an invalid range id.
2073 */
2074 __abortlike
2075 static void
vm_map_range_invalid_panic(vm_map_t map,vm_map_range_id_t range_id)2076 vm_map_range_invalid_panic(
2077 vm_map_t map,
2078 vm_map_range_id_t range_id)
2079 {
2080 panic("invalid range ID (%u) for map %p", range_id, map);
2081 }
2082
2083 /*
2084 * Routine: vm_map_get_range
2085 * Purpose:
2086 * Adjust bounds based on security policy.
2087 */
2088 static struct mach_vm_range
vm_map_get_range(vm_map_t map,vm_map_address_t * address,vm_map_kernel_flags_t * vmk_flags,vm_map_size_t size,bool * is_ptr)2089 vm_map_get_range(
2090 vm_map_t map,
2091 vm_map_address_t *address,
2092 vm_map_kernel_flags_t *vmk_flags,
2093 vm_map_size_t size,
2094 bool *is_ptr)
2095 {
2096 struct mach_vm_range effective_range = {};
2097 vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
2098
2099 if (map == kernel_map) {
2100 effective_range = kmem_ranges[range_id];
2101
2102 if (startup_phase >= STARTUP_SUB_KMEM) {
2103 /*
2104 * Hint provided by caller is zeroed as the range is restricted to a
2105 * subset of the entire kernel_map VA, which could put the hint outside
2106 * the range, causing vm_map_store_find_space to fail.
2107 */
2108 *address = 0ull;
2109 /*
2110 * Ensure that range_id passed in by the caller is within meaningful
2111 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
2112 * to fail as the corresponding range is invalid. Range id larger than
2113 * KMEM_RANGE_ID_MAX will lead to an OOB access.
2114 */
2115 if ((range_id == KMEM_RANGE_ID_NONE) ||
2116 (range_id > KMEM_RANGE_ID_MAX)) {
2117 vm_map_range_invalid_panic(map, range_id);
2118 }
2119
2120 /*
2121 * Pointer ranges use kmem_locate_space to do allocations.
2122 *
2123 * Non pointer fronts look like [ Small | Large | Permanent ]
2124 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
2125 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
2126 * use the entire range.
2127 */
2128 if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2129 *is_ptr = true;
2130 } else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2131 effective_range = kmem_large_ranges[range_id];
2132 }
2133 }
2134 #if CONFIG_MAP_RANGES
2135 } else if (map->uses_user_ranges) {
2136 switch (range_id) {
2137 case UMEM_RANGE_ID_DEFAULT:
2138 effective_range = map->default_range;
2139 break;
2140 case UMEM_RANGE_ID_HEAP:
2141 effective_range = map->data_range;
2142 break;
2143 case UMEM_RANGE_ID_LARGE_FILE:
2144 if (map->large_file_range.min_address != map->large_file_range.max_address) {
2145 /* large file range is configured and should be used */
2146 effective_range = map->large_file_range;
2147 } else {
2148 /*
2149 * the user asking for this user range might not have the
2150 * permissions to use the large file range (i.e., it doesn't
2151 * hold the correct entitlement), so we give it the data range
2152 * instead
2153 */
2154 effective_range = map->data_range;
2155 }
2156 break;
2157 case UMEM_RANGE_ID_FIXED:
2158 /*
2159 * anywhere allocations with an address in "FIXED"
2160 * makes no sense, leave the range empty
2161 */
2162 break;
2163
2164 default:
2165 vm_map_range_invalid_panic(map, range_id);
2166 }
2167 #endif /* CONFIG_MAP_RANGES */
2168 } else {
2169 /*
2170 * If minimum is 0, bump it up by PAGE_SIZE. We want to limit
2171 * allocations of PAGEZERO to explicit requests since its
2172 * normal use is to catch dereferences of NULL and many
2173 * applications also treat pointers with a value of 0 as
2174 * special and suddenly having address 0 contain useable
2175 * memory would tend to confuse those applications.
2176 */
2177 effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2178 effective_range.max_address = map->max_offset;
2179 }
2180
2181 return effective_range;
2182 }
2183
/*
 * vm_map_locate_space_anywhere:
 *
 *	Find a hole of at least "size" (plus an optional leading guard
 *	page) in "map", honoring "mask" alignment, the range policy for
 *	the flags' range id, and placement flags (last-free / JIT /
 *	random address).  On success, *start_inout holds the chosen
 *	address and *entry_out (if non-NULL) the entry preceding the
 *	hole.  The map must be locked; if map->wait_for_space is set,
 *	the lock may be dropped and re-taken while waiting for room.
 */
kern_return_t
vm_map_locate_space_anywhere(
	vm_map_t map,
	vm_map_size_t size,
	vm_map_offset_t mask,
	vm_map_kernel_flags_t vmk_flags,
	vm_map_offset_t *start_inout,
	vm_map_entry_t *entry_out)
{
	struct mach_vm_range effective_range = {};
	vm_map_size_t guard_offset;
	vm_map_offset_t hint, limit;
	vm_map_entry_t entry;
	bool is_kmem_ptr_range = false;

	/*
	 * Only supported by vm_map_enter() with a fixed address.
	 */
	assert(!vmk_flags.vmf_fixed);
	assert(!vmk_flags.vmkf_beyond_max);

	if (__improbable(map->wait_for_space)) {
		/*
		 * support for "wait_for_space" is minimal,
		 * its only consumer is the ipc_kernel_copy_map.
		 */
		assert(!map->holelistenabled &&
		    !vmk_flags.vmkf_last_free &&
		    !vmk_flags.vmkf_keep_map_locked &&
		    !vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr &&
		    *start_inout <= map->min_offset);
	} else if (vmk_flags.vmkf_last_free) {
		assert(!vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr);
	}

	/* reserve one leading guard page out of the requested size */
	if (vmk_flags.vmkf_guard_before) {
		guard_offset = VM_MAP_PAGE_SIZE(map);
		assert(size > guard_offset);
		size -= guard_offset;
	} else {
		assert(size != 0);
		guard_offset = 0;
	}

	/*
	 * Validate range_id from flags and get associated range
	 */
	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
	    &is_kmem_ptr_range);

	/* kernel pointer ranges are allocated by the kmem layer instead */
	if (is_kmem_ptr_range) {
		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
		           vmk_flags.vmkf_last_free, start_inout, entry_out);
	}

#if XNU_TARGET_OS_OSX
	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		assert(map != kernel_map);
		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
	}
#endif /* XNU_TARGET_OS_OSX */

again:
	if (vmk_flags.vmkf_last_free) {
		/* top-down search: scan from the hint toward min_address */
		hint = *start_inout;

		if (hint == 0 || hint > effective_range.max_address) {
			hint = effective_range.max_address;
		}
		if (hint <= effective_range.min_address) {
			return KERN_NO_SPACE;
		}
		limit = effective_range.min_address;
	} else {
		hint = *start_inout;

		if (vmk_flags.vmkf_map_jit) {
			if (map->jit_entry_exists &&
			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
				return KERN_INVALID_ARGUMENT;
			}
			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
				vmk_flags.vmf_random_addr = true;
			}
		}

		if (vmk_flags.vmf_random_addr) {
			kern_return_t kr;

			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
			if (kr != KERN_SUCCESS) {
				return kr;
			}
		}
#if __x86_64__
		else if ((hint == 0 || hint == vm_map_min(map)) &&
		    !map->disable_vmentry_reuse &&
		    map->vmmap_high_start != 0) {
			hint = map->vmmap_high_start;
		}
#endif /* __x86_64__ */

		/* clamp the hint into the effective range */
		if (hint < effective_range.min_address) {
			hint = effective_range.min_address;
		}
		if (effective_range.max_address <= hint) {
			return KERN_NO_SPACE;
		}

		limit = effective_range.max_address;
	}
	entry = vm_map_store_find_space(map,
	    hint, limit, vmk_flags.vmkf_last_free,
	    guard_offset, size, mask,
	    start_inout);

	if (__improbable(entry == NULL)) {
		/*
		 * No hole found: either wait for space to be freed
		 * (dropping the map lock while blocked) and retry,
		 * or fail outright.
		 */
		if (map->wait_for_space &&
		    guard_offset + size <=
		    effective_range.max_address - effective_range.min_address) {
			assert_wait((event_t)map, THREAD_ABORTSAFE);
			vm_map_unlock(map);
			thread_block(THREAD_CONTINUE_NULL);
			vm_map_lock(map);
			goto again;
		}
		return KERN_NO_SPACE;
	}

	if (entry_out) {
		*entry_out = entry;
	}
	return KERN_SUCCESS;
}
2320
2321 /*!
2322 * @function vm_map_locate_space_fixed()
2323 *
2324 * @brief
2325 * Locate (no reservation) a range in the specified VM map at a fixed address.
2326 *
2327 * @param map the map to scan for memory, must be locked.
2328 * @param start the fixed address trying to be reserved
2329 * @param size the size of the allocation to make.
2330 * @param mask an alignment mask the allocation must respect,
2331 * @param vmk_flags the vm map kernel flags to influence this call.
2332 * vmk_flags.vmf_anywhere must not be set.
2333 * @param entry_out the entry right before the hole.
2334 * @param zap_list a zap list of entries to clean up after the call.
2335 *
2336 * @returns
2337 * - KERN_SUCCESS in case of success and no conflicting entry is found,
2338 * in which case entry_out is set to the entry before the hole.
2339 *
2340 * - KERN_MEMORY_PRESENT if a conflicting entry is found,
2341 * in which case entry_out is set the conflicting entry,
2342 * the callers MUST handle this error explicitly.
2343 *
2344 * - KERN_INVALID_ADDRESS if the specified @c start or @c size
2345 * would result in a mapping outside of the map.
2346 *
2347 * - KERN_NO_SPACE for various cases of unrecoverable failures.
2348 */
2349 static kern_return_t
vm_map_locate_space_fixed(vm_map_t map,vm_map_offset_t start,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * entry_out,vm_map_zap_t zap_list)2350 vm_map_locate_space_fixed(
2351 vm_map_t map,
2352 vm_map_offset_t start,
2353 vm_map_size_t size,
2354 vm_map_offset_t mask,
2355 vm_map_kernel_flags_t vmk_flags,
2356 vm_map_entry_t *entry_out,
2357 vm_map_zap_t zap_list)
2358 {
2359 vm_map_offset_t effective_min_offset, effective_max_offset;
2360 vm_map_entry_t entry;
2361 vm_map_offset_t end;
2362
2363 assert(vmk_flags.vmf_fixed);
2364
2365 effective_min_offset = map->min_offset;
2366 effective_max_offset = map->max_offset;
2367
2368 if (vmk_flags.vmkf_beyond_max) {
2369 /*
2370 * Allow an insertion beyond the map's max offset.
2371 */
2372 effective_max_offset = 0x00000000FFFFF000ULL;
2373 if (vm_map_is_64bit(map)) {
2374 effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2375 }
2376 #if XNU_TARGET_OS_OSX
2377 } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2378 effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2379 #endif /* XNU_TARGET_OS_OSX */
2380 }
2381
2382 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2383 !vmk_flags.vmf_overwrite &&
2384 map->pmap == kernel_pmap &&
2385 vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
2386 /*
2387 * Force realloc() to switch to a new allocation,
2388 * to prevent 4k-fragmented virtual ranges.
2389 */
2390 // DEBUG4K_ERROR("no realloc in place");
2391 return KERN_NO_SPACE;
2392 }
2393
2394 /*
2395 * Verify that:
2396 * the address doesn't itself violate
2397 * the mask requirement.
2398 */
2399
2400 if ((start & mask) != 0) {
2401 return KERN_NO_SPACE;
2402 }
2403
2404 #if CONFIG_MAP_RANGES
2405 if (map->uses_user_ranges) {
2406 struct mach_vm_range r;
2407
2408 vm_map_user_range_resolve(map, start, 1, &r);
2409 if (r.max_address == 0) {
2410 return KERN_INVALID_ADDRESS;
2411 }
2412 effective_min_offset = r.min_address;
2413 effective_max_offset = r.max_address;
2414 }
2415 #endif /* CONFIG_MAP_RANGES */
2416
2417 if ((startup_phase >= STARTUP_SUB_KMEM) && !vmk_flags.vmkf_submap &&
2418 (map == kernel_map)) {
2419 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2420 effective_min_offset = r->min_address;
2421 effective_max_offset = r->max_address;
2422 }
2423
2424 /*
2425 * ... the address is within bounds
2426 */
2427
2428 end = start + size;
2429
2430 if ((start < effective_min_offset) ||
2431 (end > effective_max_offset) ||
2432 (start >= end)) {
2433 return KERN_INVALID_ADDRESS;
2434 }
2435
2436 if (vmk_flags.vmf_overwrite) {
2437 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2438 kern_return_t remove_kr;
2439
2440 /*
2441 * Fixed mapping and "overwrite" flag: attempt to
2442 * remove all existing mappings in the specified
2443 * address range, saving them in our "zap_list".
2444 *
2445 * This avoids releasing the VM map lock in
2446 * vm_map_entry_delete() and allows atomicity
2447 * when we want to replace some mappings with a new one.
2448 * It also allows us to restore the old VM mappings if the
2449 * new mapping fails.
2450 */
2451 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2452
2453 if (vmk_flags.vmkf_overwrite_immutable) {
2454 /* we can overwrite immutable mappings */
2455 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2456 }
2457 if (vmk_flags.vmkf_remap_prot_copy) {
2458 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2459 }
2460 remove_kr = vm_map_delete(map, start, end, remove_flags,
2461 KMEM_GUARD_NONE, zap_list).kmr_return;
2462 if (remove_kr) {
2463 /* XXX FBDP restore zap_list? */
2464 return remove_kr;
2465 }
2466 }
2467
2468 /*
2469 * ... the starting address isn't allocated
2470 */
2471
2472 if (vm_map_lookup_entry(map, start, &entry)) {
2473 *entry_out = entry;
2474 return KERN_MEMORY_PRESENT;
2475 }
2476
2477 /*
2478 * ... the next region doesn't overlap the
2479 * end point.
2480 */
2481
2482 if ((entry->vme_next != vm_map_to_entry(map)) &&
2483 (entry->vme_next->vme_start < end)) {
2484 return KERN_NO_SPACE;
2485 }
2486
2487 *entry_out = entry;
2488 return KERN_SUCCESS;
2489 }
2490
2491 /*
2492 * Routine: vm_map_find_space
2493 * Purpose:
2494 * Allocate a range in the specified virtual address map,
2495 * returning the entry allocated for that range.
2496 * Used by kmem_alloc, etc.
2497 *
2498 * The map must be NOT be locked. It will be returned locked
2499 * on KERN_SUCCESS, unlocked on failure.
2500 *
2501 * If an entry is allocated, the object/offset fields
2502 * are initialized to zero.
2503 */
2504 kern_return_t
vm_map_find_space(vm_map_t map,vm_map_offset_t hint_address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_map_entry_t * o_entry)2505 vm_map_find_space(
2506 vm_map_t map,
2507 vm_map_offset_t hint_address,
2508 vm_map_size_t size,
2509 vm_map_offset_t mask,
2510 vm_map_kernel_flags_t vmk_flags,
2511 vm_map_entry_t *o_entry) /* OUT */
2512 {
2513 vm_map_entry_t new_entry, entry;
2514 kern_return_t kr;
2515
2516 if (size == 0) {
2517 return KERN_INVALID_ARGUMENT;
2518 }
2519
2520 new_entry = vm_map_entry_create(map);
2521 new_entry->use_pmap = true;
2522 new_entry->protection = VM_PROT_DEFAULT;
2523 new_entry->max_protection = VM_PROT_ALL;
2524
2525 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2526 new_entry->map_aligned = true;
2527 }
2528 if (vmk_flags.vmf_permanent) {
2529 new_entry->vme_permanent = true;
2530 }
2531
2532 vm_map_lock(map);
2533
2534 kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
2535 &hint_address, &entry);
2536 if (kr != KERN_SUCCESS) {
2537 vm_map_unlock(map);
2538 vm_map_entry_dispose(new_entry);
2539 return kr;
2540 }
2541 new_entry->vme_start = hint_address;
2542 new_entry->vme_end = hint_address + size;
2543
2544 /*
2545 * At this point,
2546 *
2547 * - new_entry's "vme_start" and "vme_end" should define
2548 * the endpoints of the available new range,
2549 *
2550 * - and "entry" should refer to the region before
2551 * the new range,
2552 *
2553 * - and the map should still be locked.
2554 */
2555
2556 assert(page_aligned(new_entry->vme_start));
2557 assert(page_aligned(new_entry->vme_end));
2558 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2559 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2560
2561 /*
2562 * Insert the new entry into the list
2563 */
2564
2565 vm_map_store_entry_link(map, entry, new_entry,
2566 VM_MAP_KERNEL_FLAGS_NONE);
2567 map->size += size;
2568
2569 /*
2570 * Update the lookup hint
2571 */
2572 SAVE_HINT_MAP_WRITE(map, new_entry);
2573
2574 *o_entry = new_entry;
2575 return KERN_SUCCESS;
2576 }
2577
/* debug knobs for vm_map_pmap_enter(): trace each entered page / enable pre-population */
int vm_map_pmap_enter_print = FALSE;
int vm_map_pmap_enter_enable = FALSE;
2580
2581 /*
2582 * Routine: vm_map_pmap_enter [internal only]
2583 *
2584 * Description:
2585 * Force pages from the specified object to be entered into
2586 * the pmap at the specified address if they are present.
2587 * As soon as a page not found in the object the scan ends.
2588 *
2589 * Returns:
2590 * Nothing.
2591 *
2592 * In/out conditions:
2593 * The source map should not be locked on entry.
2594 */
2595 __unused static void
vm_map_pmap_enter(vm_map_t map,vm_map_offset_t addr,vm_map_offset_t end_addr,vm_object_t object,vm_object_offset_t offset,vm_prot_t protection)2596 vm_map_pmap_enter(
2597 vm_map_t map,
2598 vm_map_offset_t addr,
2599 vm_map_offset_t end_addr,
2600 vm_object_t object,
2601 vm_object_offset_t offset,
2602 vm_prot_t protection)
2603 {
2604 int type_of_fault;
2605 kern_return_t kr;
2606 uint8_t object_lock_type = 0;
2607 struct vm_object_fault_info fault_info = {};
2608
2609 if (map->pmap == 0) {
2610 return;
2611 }
2612
2613 assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2614
2615 while (addr < end_addr) {
2616 vm_page_t m;
2617
2618
2619 /*
2620 * TODO:
2621 * From vm_map_enter(), we come into this function without the map
2622 * lock held or the object lock held.
2623 * We haven't taken a reference on the object either.
2624 * We should do a proper lookup on the map to make sure
2625 * that things are sane before we go locking objects that
2626 * could have been deallocated from under us.
2627 */
2628
2629 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2630 vm_object_lock(object);
2631
2632 m = vm_page_lookup(object, offset);
2633
2634 if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
2635 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2636 vm_object_unlock(object);
2637 return;
2638 }
2639
2640 if (vm_map_pmap_enter_print) {
2641 printf("vm_map_pmap_enter:");
2642 printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2643 map, (unsigned long long)addr, object, (unsigned long long)offset);
2644 }
2645 type_of_fault = DBG_CACHE_HIT_FAULT;
2646 kr = vm_fault_enter(m, map->pmap,
2647 addr,
2648 PAGE_SIZE, 0,
2649 protection, protection,
2650 VM_PAGE_WIRED(m),
2651 FALSE, /* change_wiring */
2652 VM_KERN_MEMORY_NONE, /* tag - not wiring */
2653 &fault_info,
2654 NULL, /* need_retry */
2655 &type_of_fault,
2656 &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2657
2658 vm_object_unlock(object);
2659
2660 offset += PAGE_SIZE_64;
2661 addr += PAGE_SIZE;
2662 }
2663 }
2664
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 * vm_map_random_address_for_size:
 *
 *	Pick a random, page-aligned address within the map's effective
 *	range (per vm_map_get_range() policy) whose following hole is
 *	large enough for "size" bytes.  Retries up to
 *	MAX_TRIES_TO_GET_RANDOM_ADDRESS times before giving up with
 *	KERN_NO_SPACE.  On success, *address holds the chosen address.
 */
static kern_return_t
vm_map_random_address_for_size(
	vm_map_t map,
	vm_map_offset_t *address,
	vm_map_size_t size,
	vm_map_kernel_flags_t vmk_flags)
{
	kern_return_t kr = KERN_SUCCESS;
	int tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t vm_hole_size = 0;
	vm_map_size_t addr_space_size;
	bool is_kmem_ptr;
	struct mach_vm_range effective_range;

	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
	    &is_kmem_ptr);

	/* usable span, reduced so random_addr + size stays in range */
	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		/* early_random() until the random() subsystem is up */
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		/* candidate must fall in a hole big enough for "size" */
		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			if (next_entry == vm_map_to_entry(map)) {
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2738
2739 static boolean_t
vm_memory_malloc_no_cow(int alias)2740 vm_memory_malloc_no_cow(
2741 int alias)
2742 {
2743 uint64_t alias_mask;
2744
2745 if (!malloc_no_cow) {
2746 return FALSE;
2747 }
2748 if (alias > 63) {
2749 return FALSE;
2750 }
2751 alias_mask = 1ULL << alias;
2752 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2753 return TRUE;
2754 }
2755 return FALSE;
2756 }
2757
2758 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2759 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2760 /*
2761 * Routine: vm_map_enter
2762 *
2763 * Description:
2764 * Allocate a range in the specified virtual address map.
2765 * The resulting range will refer to memory defined by
2766 * the given memory object and offset into that object.
2767 *
2768 * Arguments are as defined in the vm_map call.
2769 */
2770 static unsigned int vm_map_enter_restore_successes = 0;
2771 static unsigned int vm_map_enter_restore_failures = 0;
2772 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2773 vm_map_enter(
2774 vm_map_t map,
2775 vm_map_offset_t *address, /* IN/OUT */
2776 vm_map_size_t size,
2777 vm_map_offset_t mask,
2778 vm_map_kernel_flags_t vmk_flags,
2779 vm_object_t object,
2780 vm_object_offset_t offset,
2781 boolean_t needs_copy,
2782 vm_prot_t cur_protection,
2783 vm_prot_t max_protection,
2784 vm_inherit_t inheritance)
2785 {
2786 vm_map_entry_t entry, new_entry;
2787 vm_map_offset_t start, tmp_start, tmp_offset;
2788 vm_map_offset_t end, tmp_end;
2789 vm_map_offset_t tmp2_start, tmp2_end;
2790 vm_map_offset_t step;
2791 kern_return_t result = KERN_SUCCESS;
2792 bool map_locked = FALSE;
2793 bool pmap_empty = TRUE;
2794 bool new_mapping_established = FALSE;
2795 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2796 const bool anywhere = !vmk_flags.vmf_fixed;
2797 const bool purgable = vmk_flags.vmf_purgeable;
2798 const bool no_cache = vmk_flags.vmf_no_cache;
2799 const bool is_submap = vmk_flags.vmkf_submap;
2800 const bool permanent = vmk_flags.vmf_permanent;
2801 const bool no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2802 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
2803 const bool iokit_acct = vmk_flags.vmkf_iokit_acct;
2804 const bool resilient_codesign = vmk_flags.vmf_resilient_codesign;
2805 const bool resilient_media = vmk_flags.vmf_resilient_media;
2806 const bool entry_for_tpro = vmk_flags.vmf_tpro;
2807 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
2808 const vm_tag_t alias = vmk_flags.vm_tag;
2809 vm_tag_t user_alias;
2810 kern_return_t kr;
2811 bool clear_map_aligned = FALSE;
2812 vm_map_size_t chunk_size = 0;
2813 vm_object_t caller_object;
2814 VM_MAP_ZAP_DECLARE(zap_old_list);
2815 VM_MAP_ZAP_DECLARE(zap_new_list);
2816
2817 caller_object = object;
2818
2819 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2820
2821 if (vmk_flags.vmf_4gb_chunk) {
2822 #if defined(__LP64__)
2823 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2824 #else /* __LP64__ */
2825 chunk_size = ANON_CHUNK_SIZE;
2826 #endif /* __LP64__ */
2827 } else {
2828 chunk_size = ANON_CHUNK_SIZE;
2829 }
2830
2831
2832
2833 if (superpage_size) {
2834 if (object != VM_OBJECT_NULL) {
2835 /* caller can't provide their own VM object */
2836 return KERN_INVALID_ARGUMENT;
2837 }
2838 switch (superpage_size) {
2839 /*
2840 * Note that the current implementation only supports
2841 * a single size for superpages, SUPERPAGE_SIZE, per
2842 * architecture. As soon as more sizes are supposed
2843 * to be supported, SUPERPAGE_SIZE has to be replaced
2844 * with a lookup of the size depending on superpage_size.
2845 */
2846 #ifdef __x86_64__
2847 case SUPERPAGE_SIZE_ANY:
2848 /* handle it like 2 MB and round up to page size */
2849 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2850 OS_FALLTHROUGH;
2851 case SUPERPAGE_SIZE_2MB:
2852 break;
2853 #endif
2854 default:
2855 return KERN_INVALID_ARGUMENT;
2856 }
2857 mask = SUPERPAGE_SIZE - 1;
2858 if (size & (SUPERPAGE_SIZE - 1)) {
2859 return KERN_INVALID_ARGUMENT;
2860 }
2861 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2862 }
2863
2864
2865 if ((cur_protection & VM_PROT_WRITE) &&
2866 (cur_protection & VM_PROT_EXECUTE) &&
2867 #if XNU_TARGET_OS_OSX
2868 map->pmap != kernel_pmap &&
2869 (cs_process_global_enforcement() ||
2870 (vmk_flags.vmkf_cs_enforcement_override
2871 ? vmk_flags.vmkf_cs_enforcement
2872 : (vm_map_cs_enforcement(map)
2873 #if __arm64__
2874 || !VM_MAP_IS_EXOTIC(map)
2875 #endif /* __arm64__ */
2876 ))) &&
2877 #endif /* XNU_TARGET_OS_OSX */
2878 #if CODE_SIGNING_MONITOR
2879 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2880 #endif
2881 (VM_MAP_POLICY_WX_FAIL(map) ||
2882 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2883 !entry_for_jit) {
2884 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2885
2886 DTRACE_VM3(cs_wx,
2887 uint64_t, 0,
2888 uint64_t, 0,
2889 vm_prot_t, cur_protection);
2890 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2891 proc_selfpid(),
2892 (get_bsdtask_info(current_task())
2893 ? proc_name_address(get_bsdtask_info(current_task()))
2894 : "?"),
2895 __FUNCTION__,
2896 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2897 cur_protection &= ~VM_PROT_EXECUTE;
2898 if (vm_protect_wx_fail) {
2899 return KERN_PROTECTION_FAILURE;
2900 }
2901 }
2902
2903 if (entry_for_jit
2904 && cur_protection != VM_PROT_ALL) {
2905 /*
2906 * Native macOS processes and all non-macOS processes are
2907 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2908 * the RWX requirement was not enforced, and thus, we must live
2909 * with our sins. We are now dealing with a JIT mapping without
2910 * RWX.
2911 *
2912 * We deal with these by letting the MAP_JIT stick in order
2913 * to avoid CS violations when these pages are mapped executable
2914 * down the line. In order to appease the page table monitor (you
2915 * know what I'm talking about), these pages will end up being
2916 * marked as XNU_USER_DEBUG, which will be allowed because we
2917 * don't enforce the code signing monitor on macOS systems. If
2918 * the user-space application ever changes permissions to RWX,
2919 * which they are allowed to since the mapping was originally
2920 * created with MAP_JIT, then they'll switch over to using the
2921 * XNU_USER_JIT type, and won't be allowed to downgrade any
2922 * more after that.
2923 *
2924 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
2925 * strictly disallowed.
2926 */
2927
2928 #if XNU_TARGET_OS_OSX
2929 /*
2930 * Continue to allow non-RWX JIT
2931 */
2932 #else
2933 /* non-macOS: reject JIT regions without RWX */
2934 DTRACE_VM3(cs_wx,
2935 uint64_t, 0,
2936 uint64_t, 0,
2937 vm_prot_t, cur_protection);
2938 printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
2939 proc_selfpid(),
2940 (get_bsdtask_info(current_task())
2941 ? proc_name_address(get_bsdtask_info(current_task()))
2942 : "?"),
2943 __FUNCTION__,
2944 cur_protection);
2945 return KERN_PROTECTION_FAILURE;
2946 #endif
2947 }
2948
2949 /*
2950 * If the task has requested executable lockdown,
2951 * deny any new executable mapping.
2952 */
2953 if (map->map_disallow_new_exec == TRUE) {
2954 if (cur_protection & VM_PROT_EXECUTE) {
2955 return KERN_PROTECTION_FAILURE;
2956 }
2957 }
2958
2959 if (resilient_codesign) {
2960 assert(!is_submap);
2961 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2962 if ((cur_protection | max_protection) & reject_prot) {
2963 return KERN_PROTECTION_FAILURE;
2964 }
2965 }
2966
2967 if (resilient_media) {
2968 assert(!is_submap);
2969 // assert(!needs_copy);
2970 if (object != VM_OBJECT_NULL &&
2971 !object->internal) {
2972 /*
2973 * This mapping is directly backed by an external
2974 * memory manager (e.g. a vnode pager for a file):
2975 * we would not have any safe place to inject
2976 * a zero-filled page if an actual page is not
2977 * available, without possibly impacting the actual
2978 * contents of the mapped object (e.g. the file),
2979 * so we can't provide any media resiliency here.
2980 */
2981 return KERN_INVALID_ARGUMENT;
2982 }
2983 }
2984
2985 if (entry_for_tpro) {
2986 /*
2987 * TPRO overrides the effective permissions of the region
2988 * and explicitly maps as RW. Ensure we have been passed
2989 * the expected permissions. We accept `cur_protections`
2990 * RO as that will be handled on fault.
2991 */
2992 if (!(max_protection & VM_PROT_READ) ||
2993 !(max_protection & VM_PROT_WRITE) ||
2994 !(cur_protection & VM_PROT_READ)) {
2995 return KERN_PROTECTION_FAILURE;
2996 }
2997
2998 /*
2999 * We can now downgrade the cur_protection to RO. This is a mild lie
3000 * to the VM layer. But TPRO will be responsible for toggling the
3001 * protections between RO/RW
3002 */
3003 cur_protection = VM_PROT_READ;
3004 }
3005
3006 if (is_submap) {
3007 vm_map_t submap;
3008 if (purgable) {
3009 /* submaps can not be purgeable */
3010 return KERN_INVALID_ARGUMENT;
3011 }
3012 if (object == VM_OBJECT_NULL) {
3013 /* submaps can not be created lazily */
3014 return KERN_INVALID_ARGUMENT;
3015 }
3016 submap = (vm_map_t) object;
3017 if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
3018 /* page size mismatch */
3019 return KERN_INVALID_ARGUMENT;
3020 }
3021 }
3022 if (vmk_flags.vmkf_already) {
3023 /*
3024 * VM_FLAGS_ALREADY says that it's OK if the same mapping
3025 * is already present. For it to be meaningul, the requested
3026 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
3027 * we shouldn't try and remove what was mapped there first
3028 * (!VM_FLAGS_OVERWRITE).
3029 */
3030 if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
3031 return KERN_INVALID_ARGUMENT;
3032 }
3033 }
3034
3035 if (size == 0 ||
3036 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
3037 *address = 0;
3038 return KERN_INVALID_ARGUMENT;
3039 }
3040
3041 if (map->pmap == kernel_pmap) {
3042 user_alias = VM_KERN_MEMORY_NONE;
3043 } else {
3044 user_alias = alias;
3045 }
3046
3047 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
3048 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
3049 }
3050
3051 #define RETURN(value) { result = value; goto BailOut; }
3052
3053 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
3054 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
3055 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
3056 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
3057 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
3058 }
3059
3060 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3061 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
3062 /*
3063 * In most cases, the caller rounds the size up to the
3064 * map's page size.
3065 * If we get a size that is explicitly not map-aligned here,
3066 * we'll have to respect the caller's wish and mark the
3067 * mapping as "not map-aligned" to avoid tripping the
3068 * map alignment checks later.
3069 */
3070 clear_map_aligned = TRUE;
3071 }
3072 if (!anywhere &&
3073 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3074 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
3075 /*
3076 * We've been asked to map at a fixed address and that
3077 * address is not aligned to the map's specific alignment.
3078 * The caller should know what it's doing (i.e. most likely
3079 * mapping some fragmented copy map, transferring memory from
3080 * a VM map with a different alignment), so clear map_aligned
3081 * for this new VM map entry and proceed.
3082 */
3083 clear_map_aligned = TRUE;
3084 }
3085
3086 /*
3087 * Only zero-fill objects are allowed to be purgable.
3088 * LP64todo - limit purgable objects to 32-bits for now
3089 */
3090 if (purgable &&
3091 (offset != 0 ||
3092 (object != VM_OBJECT_NULL &&
3093 (object->vo_size != size ||
3094 object->purgable == VM_PURGABLE_DENY))
3095 #if __LP64__
3096 || size > ANON_MAX_SIZE
3097 #endif
3098 )) {
3099 return KERN_INVALID_ARGUMENT;
3100 }
3101
3102 vm_map_lock(map);
3103 map_locked = TRUE;
3104
3105 if (anywhere) {
3106 result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
3107 address, &entry);
3108 start = *address;
3109 } else {
3110 start = *address;
3111 result = vm_map_locate_space_fixed(map, start, size, mask,
3112 vmk_flags, &entry, &zap_old_list);
3113 }
3114
3115 end = start + size;
3116
3117 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
3118
3119 /*
3120 * Check if what's already there is what we want.
3121 */
3122 if (result == KERN_MEMORY_PRESENT) {
3123 assert(!anywhere);
3124 if (!(vmk_flags.vmkf_already)) {
3125 RETURN(KERN_NO_SPACE);
3126 }
3127 tmp_start = start;
3128 tmp_offset = offset;
3129 if (entry->vme_start < start) {
3130 tmp_start -= start - entry->vme_start;
3131 tmp_offset -= start - entry->vme_start;
3132 }
3133 for (; entry->vme_start < end;
3134 entry = entry->vme_next) {
3135 /*
3136 * Check if the mapping's attributes
3137 * match the existing map entry.
3138 */
3139 if (entry == vm_map_to_entry(map) ||
3140 entry->vme_start != tmp_start ||
3141 entry->is_sub_map != is_submap ||
3142 VME_OFFSET(entry) != tmp_offset ||
3143 entry->needs_copy != needs_copy ||
3144 entry->protection != cur_protection ||
3145 entry->max_protection != max_protection ||
3146 entry->inheritance != inheritance ||
3147 entry->iokit_acct != iokit_acct ||
3148 VME_ALIAS(entry) != alias) {
3149 /* not the same mapping ! */
3150 RETURN(KERN_NO_SPACE);
3151 }
3152 /*
3153 * Check if the same object is being mapped.
3154 */
3155 if (is_submap) {
3156 if (VME_SUBMAP(entry) !=
3157 (vm_map_t) object) {
3158 /* not the same submap */
3159 RETURN(KERN_NO_SPACE);
3160 }
3161 } else {
3162 if (VME_OBJECT(entry) != object) {
3163 /* not the same VM object... */
3164 vm_object_t obj2;
3165
3166 obj2 = VME_OBJECT(entry);
3167 if ((obj2 == VM_OBJECT_NULL || obj2->internal) &&
3168 (object == VM_OBJECT_NULL || object->internal)) {
3169 /*
3170 * ... but both are
3171 * anonymous memory,
3172 * so equivalent.
3173 */
3174 } else {
3175 RETURN(KERN_NO_SPACE);
3176 }
3177 }
3178 }
3179
3180 tmp_offset += entry->vme_end - entry->vme_start;
3181 tmp_start += entry->vme_end - entry->vme_start;
3182 if (entry->vme_end >= end) {
3183 /* reached the end of our mapping */
3184 break;
3185 }
3186 }
3187 /* it all matches: let's use what's already there ! */
3188 RETURN(KERN_MEMORY_PRESENT);
3189 }
3190
3191 if (result != KERN_SUCCESS) {
3192 goto BailOut;
3193 }
3194
3195
3196 /*
3197 * At this point,
3198 * "start" and "end" should define the endpoints of the
3199 * available new range, and
3200 * "entry" should refer to the region before the new
3201 * range, and
3202 *
3203 * the map should be locked.
3204 */
3205
3206 /*
3207 * See whether we can avoid creating a new entry (and object) by
3208 * extending one of our neighbors. [So far, we only attempt to
3209 * extend from below.] Note that we can never extend/join
3210 * purgable objects because they need to remain distinct
3211 * entities in order to implement their "volatile object"
3212 * semantics.
3213 */
3214
3215 if (purgable ||
3216 entry_for_jit ||
3217 entry_for_tpro ||
3218 vm_memory_malloc_no_cow(user_alias)) {
3219 if (superpage_size) {
3220 /*
3221 * For "super page" allocations, we will allocate
3222 * special physically-contiguous VM objects later on,
3223 * so we should not have flags instructing us to create
3224 * a differently special VM object here.
3225 */
3226 RETURN(KERN_INVALID_ARGUMENT);
3227 }
3228
3229 if (object == VM_OBJECT_NULL) {
3230 assert(!superpage_size);
3231 object = vm_object_allocate(size);
3232 vm_object_lock(object);
3233 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3234 VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3235 if (malloc_no_cow_except_fork &&
3236 !purgable &&
3237 !entry_for_jit &&
3238 !entry_for_tpro &&
3239 vm_memory_malloc_no_cow(user_alias)) {
3240 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3241 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3242 }
3243 if (entry_for_jit) {
3244 object->vo_inherit_copy_none = true;
3245 }
3246 if (purgable) {
3247 task_t owner;
3248 VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3249 if (map->pmap == kernel_pmap) {
3250 /*
3251 * Purgeable mappings made in a kernel
3252 * map are "owned" by the kernel itself
3253 * rather than the current user task
3254 * because they're likely to be used by
3255 * more than this user task (see
3256 * execargs_purgeable_allocate(), for
3257 * example).
3258 */
3259 owner = kernel_task;
3260 } else {
3261 owner = current_task();
3262 }
3263 assert(object->vo_owner == NULL);
3264 assert(object->resident_page_count == 0);
3265 assert(object->wired_page_count == 0);
3266 vm_purgeable_nonvolatile_enqueue(object, owner);
3267 }
3268 vm_object_unlock(object);
3269 offset = (vm_object_offset_t)0;
3270 }
3271 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3272 /* no coalescing if address space uses sub-pages */
3273 } else if ((is_submap == FALSE) &&
3274 (object == VM_OBJECT_NULL) &&
3275 (entry != vm_map_to_entry(map)) &&
3276 (entry->vme_end == start) &&
3277 (!entry->is_shared) &&
3278 (!entry->is_sub_map) &&
3279 (!entry->in_transition) &&
3280 (!entry->needs_wakeup) &&
3281 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3282 (entry->protection == cur_protection) &&
3283 (entry->max_protection == max_protection) &&
3284 (entry->inheritance == inheritance) &&
3285 ((user_alias == VM_MEMORY_REALLOC) ||
3286 (VME_ALIAS(entry) == alias)) &&
3287 (entry->no_cache == no_cache) &&
3288 (entry->vme_permanent == permanent) &&
3289 /* no coalescing for immutable executable mappings */
3290 !((entry->protection & VM_PROT_EXECUTE) &&
3291 entry->vme_permanent) &&
3292 (!entry->superpage_size && !superpage_size) &&
3293 /*
3294 * No coalescing if not map-aligned, to avoid propagating
3295 * that condition any further than needed:
3296 */
3297 (!entry->map_aligned || !clear_map_aligned) &&
3298 (!entry->zero_wired_pages) &&
3299 (!entry->used_for_jit && !entry_for_jit) &&
3300 #if __arm64e__
3301 (!entry->used_for_tpro && !entry_for_tpro) &&
3302 #endif
3303 (!entry->csm_associated) &&
3304 (entry->iokit_acct == iokit_acct) &&
3305 (!entry->vme_resilient_codesign) &&
3306 (!entry->vme_resilient_media) &&
3307 (!entry->vme_atomic) &&
3308 (entry->vme_no_copy_on_read == no_copy_on_read) &&
3309
3310 ((entry->vme_end - entry->vme_start) + size <=
3311 (user_alias == VM_MEMORY_REALLOC ?
3312 ANON_CHUNK_SIZE :
3313 NO_COALESCE_LIMIT)) &&
3314
3315 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
3316 if (vm_object_coalesce(VME_OBJECT(entry),
3317 VM_OBJECT_NULL,
3318 VME_OFFSET(entry),
3319 (vm_object_offset_t) 0,
3320 (vm_map_size_t)(entry->vme_end - entry->vme_start),
3321 (vm_map_size_t)(end - entry->vme_end))) {
3322 /*
3323 * Coalesced the two objects - can extend
3324 * the previous map entry to include the
3325 * new range.
3326 */
3327 map->size += (end - entry->vme_end);
3328 assert(entry->vme_start < end);
3329 assert(VM_MAP_PAGE_ALIGNED(end,
3330 VM_MAP_PAGE_MASK(map)));
3331 if (__improbable(vm_debug_events)) {
3332 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3333 }
3334 entry->vme_end = end;
3335 if (map->holelistenabled) {
3336 vm_map_store_update_first_free(map, entry, TRUE);
3337 } else {
3338 vm_map_store_update_first_free(map, map->first_free, TRUE);
3339 }
3340 new_mapping_established = TRUE;
3341 RETURN(KERN_SUCCESS);
3342 }
3343 }
3344
3345 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3346 new_entry = NULL;
3347
3348 if (vmk_flags.vmkf_submap_adjust) {
3349 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3350 offset = start;
3351 }
3352
3353 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3354 tmp2_end = tmp2_start + step;
3355 /*
3356 * Create a new entry
3357 *
3358 * XXX FBDP
3359 * The reserved "page zero" in each process's address space can
3360 * be arbitrarily large. Splitting it into separate objects and
3361 * therefore different VM map entries serves no purpose and just
3362 * slows down operations on the VM map, so let's not split the
3363 * allocation into chunks if the max protection is NONE. That
3364 * memory should never be accessible, so it will never get to the
3365 * default pager.
3366 */
3367 tmp_start = tmp2_start;
3368 if (!is_submap &&
3369 object == VM_OBJECT_NULL &&
3370 size > chunk_size &&
3371 max_protection != VM_PROT_NONE &&
3372 superpage_size == 0) {
3373 tmp_end = tmp_start + chunk_size;
3374 } else {
3375 tmp_end = tmp2_end;
3376 }
3377 do {
3378 if (!is_submap &&
3379 object != VM_OBJECT_NULL &&
3380 object->internal &&
3381 offset + (tmp_end - tmp_start) > object->vo_size) {
3382 // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3383 DTRACE_VM5(vm_map_enter_overmap,
3384 vm_map_t, map,
3385 vm_map_address_t, tmp_start,
3386 vm_map_address_t, tmp_end,
3387 vm_object_offset_t, offset,
3388 vm_object_size_t, object->vo_size);
3389 }
3390 new_entry = vm_map_entry_insert(map,
3391 entry, tmp_start, tmp_end,
3392 object, offset, vmk_flags,
3393 needs_copy,
3394 cur_protection, max_protection,
3395 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3396 VM_INHERIT_NONE : inheritance),
3397 clear_map_aligned);
3398
3399 assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3400
3401 if (resilient_codesign) {
3402 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3403 if (!((cur_protection | max_protection) & reject_prot)) {
3404 new_entry->vme_resilient_codesign = TRUE;
3405 }
3406 }
3407
3408 if (resilient_media &&
3409 (object == VM_OBJECT_NULL ||
3410 object->internal)) {
3411 new_entry->vme_resilient_media = TRUE;
3412 }
3413
3414 assert(!new_entry->iokit_acct);
3415 if (!is_submap &&
3416 object != VM_OBJECT_NULL &&
3417 object->internal &&
3418 (object->purgable != VM_PURGABLE_DENY ||
3419 object->vo_ledger_tag)) {
3420 assert(new_entry->use_pmap);
3421 assert(!new_entry->iokit_acct);
3422 /*
3423 * Turn off pmap accounting since
3424 * purgeable (or tagged) objects have their
3425 * own ledgers.
3426 */
3427 new_entry->use_pmap = FALSE;
3428 } else if (!is_submap &&
3429 iokit_acct &&
3430 object != VM_OBJECT_NULL &&
3431 object->internal) {
3432 /* alternate accounting */
3433 assert(!new_entry->iokit_acct);
3434 assert(new_entry->use_pmap);
3435 new_entry->iokit_acct = TRUE;
3436 new_entry->use_pmap = FALSE;
3437 DTRACE_VM4(
3438 vm_map_iokit_mapped_region,
3439 vm_map_t, map,
3440 vm_map_offset_t, new_entry->vme_start,
3441 vm_map_offset_t, new_entry->vme_end,
3442 int, VME_ALIAS(new_entry));
3443 vm_map_iokit_mapped_region(
3444 map,
3445 (new_entry->vme_end -
3446 new_entry->vme_start));
3447 } else if (!is_submap) {
3448 assert(!new_entry->iokit_acct);
3449 assert(new_entry->use_pmap);
3450 }
3451
3452 if (is_submap) {
3453 vm_map_t submap;
3454 boolean_t submap_is_64bit;
3455 boolean_t use_pmap;
3456
3457 assert(new_entry->is_sub_map);
3458 assert(!new_entry->use_pmap);
3459 assert(!new_entry->iokit_acct);
3460 submap = (vm_map_t) object;
3461 submap_is_64bit = vm_map_is_64bit(submap);
3462 use_pmap = vmk_flags.vmkf_nested_pmap;
3463 #ifndef NO_NESTED_PMAP
3464 if (use_pmap && submap->pmap == NULL) {
3465 ledger_t ledger = map->pmap->ledger;
3466 /* we need a sub pmap to nest... */
3467 submap->pmap = pmap_create_options(ledger, 0,
3468 submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3469 if (submap->pmap == NULL) {
3470 /* let's proceed without nesting... */
3471 }
3472 #if defined(__arm64__)
3473 else {
3474 pmap_set_nested(submap->pmap);
3475 }
3476 #endif
3477 }
3478 if (use_pmap && submap->pmap != NULL) {
3479 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3480 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3481 kr = KERN_FAILURE;
3482 } else {
3483 kr = pmap_nest(map->pmap,
3484 submap->pmap,
3485 tmp_start,
3486 tmp_end - tmp_start);
3487 }
3488 if (kr != KERN_SUCCESS) {
3489 printf("vm_map_enter: "
3490 "pmap_nest(0x%llx,0x%llx) "
3491 "error 0x%x\n",
3492 (long long)tmp_start,
3493 (long long)tmp_end,
3494 kr);
3495 } else {
3496 /* we're now nested ! */
3497 new_entry->use_pmap = TRUE;
3498 pmap_empty = FALSE;
3499 }
3500 }
3501 #endif /* NO_NESTED_PMAP */
3502 }
3503 entry = new_entry;
3504
3505 if (superpage_size) {
3506 vm_page_t pages, m;
3507 vm_object_t sp_object;
3508 vm_object_offset_t sp_offset;
3509
3510 assert(object == VM_OBJECT_NULL);
3511 VME_OFFSET_SET(entry, 0);
3512
3513 /* allocate one superpage */
3514 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3515 if (kr != KERN_SUCCESS) {
3516 /* deallocate whole range... */
3517 new_mapping_established = TRUE;
3518 /* ... but only up to "tmp_end" */
3519 size -= end - tmp_end;
3520 RETURN(kr);
3521 }
3522
3523 /* create one vm_object per superpage */
3524 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3525 vm_object_lock(sp_object);
3526 sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3527 VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3528 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3529 VME_OBJECT_SET(entry, sp_object, false, 0);
3530 assert(entry->use_pmap);
3531
3532 /* enter the base pages into the object */
3533 for (sp_offset = 0;
3534 sp_offset < SUPERPAGE_SIZE;
3535 sp_offset += PAGE_SIZE) {
3536 m = pages;
3537 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3538 pages = NEXT_PAGE(m);
3539 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3540 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3541 }
3542 vm_object_unlock(sp_object);
3543 }
3544 } while (tmp_end != tmp2_end &&
3545 (tmp_start = tmp_end) &&
3546 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3547 tmp_end + chunk_size : tmp2_end));
3548 }
3549
3550 new_mapping_established = TRUE;
3551
3552 BailOut:
3553 assert(map_locked == TRUE);
3554
3555 /*
3556 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3557 * If we have identified and possibly established the new mapping(s),
3558 * make sure we did not go beyond the address space limit.
3559 */
3560 if (result == KERN_SUCCESS) {
3561 if (map->size_limit != RLIM_INFINITY &&
3562 map->size > map->size_limit) {
3563 /*
3564 * Establishing the requested mappings would exceed
3565 * the process's RLIMIT_AS limit: fail with
3566 * KERN_NO_SPACE.
3567 */
3568 result = KERN_NO_SPACE;
3569 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3570 proc_selfpid(),
3571 (get_bsdtask_info(current_task())
3572 ? proc_name_address(get_bsdtask_info(current_task()))
3573 : "?"),
3574 __FUNCTION__,
3575 (uint64_t) map->size,
3576 (uint64_t) map->size_limit);
3577 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3578 vm_map_size_t, map->size,
3579 uint64_t, map->size_limit);
3580 vm_map_enter_RLIMIT_AS_count++;
3581 } else if (map->data_limit != RLIM_INFINITY &&
3582 map->size > map->data_limit) {
3583 /*
3584 * Establishing the requested mappings would exceed
3585 * the process's RLIMIT_DATA limit: fail with
3586 * KERN_NO_SPACE.
3587 */
3588 result = KERN_NO_SPACE;
3589 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3590 proc_selfpid(),
3591 (get_bsdtask_info(current_task())
3592 ? proc_name_address(get_bsdtask_info(current_task()))
3593 : "?"),
3594 __FUNCTION__,
3595 (uint64_t) map->size,
3596 (uint64_t) map->data_limit);
3597 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3598 vm_map_size_t, map->size,
3599 uint64_t, map->data_limit);
3600 vm_map_enter_RLIMIT_DATA_count++;
3601 }
3602 }
3603
3604 if (result == KERN_SUCCESS) {
3605 vm_prot_t pager_prot;
3606 memory_object_t pager;
3607
3608 #if DEBUG
3609 if (pmap_empty &&
3610 !(vmk_flags.vmkf_no_pmap_check)) {
3611 assert(pmap_is_empty(map->pmap,
3612 *address,
3613 *address + size));
3614 }
3615 #endif /* DEBUG */
3616
3617 /*
3618 * For "named" VM objects, let the pager know that the
3619 * memory object is being mapped. Some pagers need to keep
3620 * track of this, to know when they can reclaim the memory
3621 * object, for example.
3622 * VM calls memory_object_map() for each mapping (specifying
3623 * the protection of each mapping) and calls
3624 * memory_object_last_unmap() when all the mappings are gone.
3625 */
3626 pager_prot = max_protection;
3627 if (needs_copy) {
3628 /*
3629 * Copy-On-Write mapping: won't modify
3630 * the memory object.
3631 */
3632 pager_prot &= ~VM_PROT_WRITE;
3633 }
3634 if (!is_submap &&
3635 object != VM_OBJECT_NULL &&
3636 object->named &&
3637 object->pager != MEMORY_OBJECT_NULL) {
3638 vm_object_lock(object);
3639 pager = object->pager;
3640 if (object->named &&
3641 pager != MEMORY_OBJECT_NULL) {
3642 assert(object->pager_ready);
3643 vm_object_mapping_wait(object, THREAD_UNINT);
3644 /* object might have lost its pager while waiting */
3645 pager = object->pager;
3646 if (object->named && pager != MEMORY_OBJECT_NULL) {
3647 vm_object_mapping_begin(object);
3648 vm_object_unlock(object);
3649
3650 kr = memory_object_map(pager, pager_prot);
3651 assert(kr == KERN_SUCCESS);
3652
3653 vm_object_lock(object);
3654 vm_object_mapping_end(object);
3655 }
3656 }
3657 vm_object_unlock(object);
3658 }
3659 }
3660
3661 assert(map_locked == TRUE);
3662
3663 if (new_mapping_established) {
3664 /*
3665 * If we release the map lock for any reason below,
3666 * another thread could deallocate our new mapping,
3667 * releasing the caller's reference on "caller_object",
3668 * which was transferred to the mapping.
3669 * If this was the only reference, the object could be
3670 * destroyed.
3671 *
3672 * We need to take an extra reference on "caller_object"
3673 * to keep it alive if we need to return the caller's
3674 * reference to the caller in case of failure.
3675 */
3676 if (is_submap) {
3677 vm_map_reference((vm_map_t)caller_object);
3678 } else {
3679 vm_object_reference(caller_object);
3680 }
3681 }
3682
3683 if (!keep_map_locked) {
3684 vm_map_unlock(map);
3685 map_locked = FALSE;
3686 entry = VM_MAP_ENTRY_NULL;
3687 new_entry = VM_MAP_ENTRY_NULL;
3688 }
3689
3690 /*
3691 * We can't hold the map lock if we enter this block.
3692 */
3693
3694 if (result == KERN_SUCCESS) {
3695 /* Wire down the new entry if the user
3696 * requested all new map entries be wired.
3697 */
3698 if ((map->wiring_required) || (superpage_size)) {
3699 assert(!keep_map_locked);
3700 pmap_empty = FALSE; /* pmap won't be empty */
3701 kr = vm_map_wire_nested(map, start, end,
3702 cur_protection, VM_KERN_MEMORY_MLOCK,
3703 TRUE, PMAP_NULL, 0, NULL);
3704 result = kr;
3705 }
3706
3707 }
3708
3709 if (result != KERN_SUCCESS) {
3710 if (new_mapping_established) {
3711 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3712
3713 /*
3714 * We have to get rid of the new mappings since we
3715 * won't make them available to the user.
3716 * Try and do that atomically, to minimize the risk
3717 * that someone else create new mappings that range.
3718 */
3719 if (!map_locked) {
3720 vm_map_lock(map);
3721 map_locked = TRUE;
3722 }
3723 remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3724 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3725 if (permanent) {
3726 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3727 }
3728 (void) vm_map_delete(map,
3729 *address, *address + size,
3730 remove_flags,
3731 KMEM_GUARD_NONE, &zap_new_list);
3732 }
3733
3734 if (vm_map_zap_first_entry(&zap_old_list)) {
3735 vm_map_entry_t entry1, entry2;
3736
3737 /*
3738 * The new mapping failed. Attempt to restore
3739 * the old mappings, saved in the "zap_old_map".
3740 */
3741 if (!map_locked) {
3742 vm_map_lock(map);
3743 map_locked = TRUE;
3744 }
3745
3746 /* first check if the coast is still clear */
3747 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3748 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3749
3750 if (vm_map_lookup_entry(map, start, &entry1) ||
3751 vm_map_lookup_entry(map, end, &entry2) ||
3752 entry1 != entry2) {
3753 /*
3754 * Part of that range has already been
3755 * re-mapped: we can't restore the old
3756 * mappings...
3757 */
3758 vm_map_enter_restore_failures++;
3759 } else {
3760 /*
3761 * Transfer the saved map entries from
3762 * "zap_old_map" to the original "map",
3763 * inserting them all after "entry1".
3764 */
3765 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3766 vm_map_size_t entry_size;
3767
3768 entry_size = (entry2->vme_end -
3769 entry2->vme_start);
3770 vm_map_store_entry_link(map, entry1, entry2,
3771 VM_MAP_KERNEL_FLAGS_NONE);
3772 map->size += entry_size;
3773 entry1 = entry2;
3774 }
3775 if (map->wiring_required) {
3776 /*
3777 * XXX TODO: we should rewire the
3778 * old pages here...
3779 */
3780 }
3781 vm_map_enter_restore_successes++;
3782 }
3783 }
3784 }
3785
3786 /*
3787 * The caller is responsible for releasing the lock if it requested to
3788 * keep the map locked.
3789 */
3790 if (map_locked && !keep_map_locked) {
3791 vm_map_unlock(map);
3792 }
3793
3794 vm_map_zap_dispose(&zap_old_list);
3795 vm_map_zap_dispose(&zap_new_list);
3796
3797 if (new_mapping_established) {
3798 /*
3799 * The caller had a reference on "caller_object" and we
3800 * transferred that reference to the mapping.
3801 * We also took an extra reference on "caller_object" to keep
3802 * it alive while the map was unlocked.
3803 */
3804 if (result == KERN_SUCCESS) {
3805 /*
3806 * On success, the caller's reference on the object gets
3807 * tranferred to the mapping.
3808 * Release our extra reference.
3809 */
3810 if (is_submap) {
3811 vm_map_deallocate((vm_map_t)caller_object);
3812 } else {
3813 vm_object_deallocate(caller_object);
3814 }
3815 } else {
3816 /*
3817 * On error, the caller expects to still have a
3818 * reference on the object it gave us.
3819 * Let's use our extra reference for that.
3820 */
3821 }
3822 }
3823
3824 return result;
3825
3826 #undef RETURN
3827 }
3828
3829 /*
3830 * Counters for the prefault optimization.
3831 */
/* number of pages processed by the prefault path — NOTE(review): presumably incremented per prefaulted page; confirm at use site */
int64_t vm_prefault_nb_pages = 0;
/* number of times the prefault optimization was abandoned — NOTE(review): confirm bail-out conditions at use site */
int64_t vm_prefault_nb_bailout = 0;
3834
3835 static kern_return_t
vm_map_enter_adjust_offset(vm_object_offset_t * obj_offs,vm_object_offset_t * obj_end,vm_object_offset_t quantity)3836 vm_map_enter_adjust_offset(
3837 vm_object_offset_t *obj_offs,
3838 vm_object_offset_t *obj_end,
3839 vm_object_offset_t quantity)
3840 {
3841 if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
3842 os_add_overflow(*obj_end, quantity, obj_end) ||
3843 vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
3844 return KERN_INVALID_ARGUMENT;
3845 }
3846
3847 return KERN_SUCCESS;
3848 }
3849
/*
 * Sanitize the caller-provided ("unsafe", *_ut / *_u) parameters of
 * vm_map_enter_mem_object() into trusted kernel values.
 *
 * Validates, in order: current/maximum protections (VM_PROT_IS_MASK is
 * permitted here — the caller strips it later), inheritance, alignment
 * mask, the address/size pair, and — when "port" is a valid memory
 * entry port — the object offset range.
 *
 * Out-parameters are only fully valid when KERN_SUCCESS is returned;
 * any other return is a sanitizer verdict that the caller is expected
 * to translate via vm_sanitize_get_kr().
 */
static __attribute__((always_inline, warn_unused_result))
kern_return_t
vm_map_enter_mem_object_sanitize(
	vm_map_t                target_map,
	vm_map_offset_ut        address_u,
	vm_map_size_ut          initial_size_u,
	vm_map_offset_ut        mask_u,
	vm_object_offset_ut     offset_u,
	vm_prot_ut              cur_protection_u,
	vm_prot_ut              max_protection_u,
	vm_inherit_ut           inheritance_u,
	vm_map_kernel_flags_t   vmk_flags,
	ipc_port_t              port,
	vm_map_address_t        *map_addr,
	vm_map_size_t           *map_size,
	vm_map_offset_t         *mask,
	vm_object_offset_t      *obj_offs,
	vm_object_offset_t      *obj_end,
	vm_object_size_t        *obj_size,
	vm_prot_t               *cur_protection,
	vm_prot_t               *max_protection,
	vm_inherit_t            *inheritance)
{
	kern_return_t result;

	/* protections first; VM_PROT_IS_MASK may legitimately be set */
	result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
	    VM_PROT_IS_MASK, cur_protection,
	    max_protection);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
	    inheritance);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, mask);
	if (__improbable(result != KERN_SUCCESS)) {
		return result;
	}

	if (vmk_flags.vmf_fixed) {
		vm_map_address_t map_end;

		/*
		 * Fixed mapping: address and size must be validated as a
		 * pair (the start may be re-aligned; a zero size fails).
		 */
		result = vm_sanitize_addr_size(address_u, initial_size_u,
		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
		    target_map,
		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_REALIGN_START,
		    map_addr, &map_end, map_size);
		if (__improbable(result != KERN_SUCCESS)) {
			return result;
		}
	} else {
		/*
		 * "Anywhere" mapping: the address is only a hint, so it is
		 * sanitized independently of the size.
		 */
		*map_addr = vm_sanitize_addr(target_map, address_u);
		result = vm_sanitize_size(0, initial_size_u,
		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
		if (__improbable(result != KERN_SUCCESS)) {
			return result;
		}
	}

	/* object size is the mapping size rounded to VM-object page size */
	*obj_size = vm_object_round_page(*map_size);
	if (__improbable(*obj_size == 0)) {
		/* rounding overflowed/wrapped to 0: reject */
		return KERN_INVALID_ARGUMENT;
	}

	if (IP_VALID(port)) {
		/*
		 * Mapping a memory entry / named object: validate the
		 * offset range within it.  Unaligned values are kept
		 * as-is (GET_UNALIGNED_VALUES) for later adjustment.
		 */
		result = vm_sanitize_addr_size(offset_u, *obj_size,
		    VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
		    PAGE_MASK,
		    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
		    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
		    obj_offs, obj_end, obj_size);
		if (__improbable(result != KERN_SUCCESS)) {
			return result;
		}
	} else {
		/* no backing port: anonymous memory starts at offset 0 */
		*obj_offs = 0;
		*obj_end = *obj_size;
	}

	return KERN_SUCCESS;
}
3937
3938 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_ut * address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_ut offset_u,boolean_t copy,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,upl_page_list_ptr_t page_list,unsigned int page_list_count)3939 vm_map_enter_mem_object(
3940 vm_map_t target_map,
3941 vm_map_offset_ut *address_u,
3942 vm_map_size_ut initial_size_u,
3943 vm_map_offset_ut mask_u,
3944 vm_map_kernel_flags_t vmk_flags,
3945 ipc_port_t port,
3946 vm_object_offset_ut offset_u,
3947 boolean_t copy,
3948 vm_prot_ut cur_protection_u,
3949 vm_prot_ut max_protection_u,
3950 vm_inherit_ut inheritance_u,
3951 upl_page_list_ptr_t page_list,
3952 unsigned int page_list_count)
3953 {
3954 vm_map_offset_t mask;
3955 vm_prot_t cur_protection;
3956 vm_prot_t max_protection;
3957 vm_inherit_t inheritance;
3958 vm_map_address_t map_addr, map_mask;
3959 vm_map_size_t map_size;
3960 vm_object_t object = VM_OBJECT_NULL;
3961 vm_object_offset_t obj_offs, obj_end;
3962 vm_object_size_t obj_size;
3963 kern_return_t result;
3964 boolean_t mask_cur_protection, mask_max_protection;
3965 boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
3966 vm_map_offset_t offset_in_mapping = 0;
3967
3968 if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
3969 /* XXX TODO4K prefaulting depends on page size... */
3970 try_prefault = FALSE;
3971 }
3972
3973 /*
3974 * Check arguments for validity
3975 */
3976 if ((target_map == VM_MAP_NULL) ||
3977 (try_prefault && (copy || !page_list))) {
3978 return KERN_INVALID_ARGUMENT;
3979 }
3980
3981 map_mask = vm_map_page_mask(target_map);
3982
3983 /*
3984 * Sanitize any input parameters that are addr/size/prot/inherit
3985 */
3986 result = vm_map_enter_mem_object_sanitize(
3987 target_map,
3988 *address_u,
3989 initial_size_u,
3990 mask_u,
3991 offset_u,
3992 cur_protection_u,
3993 max_protection_u,
3994 inheritance_u,
3995 vmk_flags,
3996 port,
3997 &map_addr,
3998 &map_size,
3999 &mask,
4000 &obj_offs,
4001 &obj_end,
4002 &obj_size,
4003 &cur_protection,
4004 &max_protection,
4005 &inheritance);
4006 if (__improbable(result != KERN_SUCCESS)) {
4007 return vm_sanitize_get_kr(result);
4008 }
4009
4010 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4011 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size);
4012
4013 mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4014 mask_max_protection = max_protection & VM_PROT_IS_MASK;
4015 cur_protection &= ~VM_PROT_IS_MASK;
4016 max_protection &= ~VM_PROT_IS_MASK;
4017
4018 #if __arm64__
4019 if (cur_protection & VM_PROT_EXECUTE) {
4020 cur_protection |= VM_PROT_READ;
4021 }
4022 #endif /* __arm64__ */
4023
4024 /*
4025 * Find the vm object (if any) corresponding to this port.
4026 */
4027 if (!IP_VALID(port)) {
4028 object = VM_OBJECT_NULL;
4029 copy = FALSE;
4030 } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4031 vm_named_entry_t named_entry;
4032 vm_object_size_t initial_size;
4033
4034 named_entry = mach_memory_entry_from_port(port);
4035
4036 if (vmk_flags.vmf_return_data_addr ||
4037 vmk_flags.vmf_return_4k_data_addr) {
4038 result = vm_map_enter_adjust_offset(&obj_offs,
4039 &obj_end, named_entry->data_offset);
4040 if (__improbable(result)) {
4041 return result;
4042 }
4043 }
4044
4045 /* a few checks to make sure user is obeying rules */
4046 if (mask_max_protection) {
4047 max_protection &= named_entry->protection;
4048 }
4049 if (mask_cur_protection) {
4050 cur_protection &= named_entry->protection;
4051 }
4052 if ((named_entry->protection & max_protection) !=
4053 max_protection) {
4054 return KERN_INVALID_RIGHT;
4055 }
4056 if ((named_entry->protection & cur_protection) !=
4057 cur_protection) {
4058 return KERN_INVALID_RIGHT;
4059 }
4060
4061 /*
4062 * unwrap is safe because we know obj_size is larger and doesn't
4063 * overflow
4064 */
4065 initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u);
4066 if (named_entry->size < obj_offs + initial_size) {
4067 return KERN_INVALID_ARGUMENT;
4068 }
4069
4070 /* for a vm_map_copy, we can only map it whole */
4071 if (named_entry->is_copy &&
4072 (obj_size != named_entry->size) &&
4073 (vm_map_round_page(obj_size, map_mask) == named_entry->size)) {
4074 /* XXX FBDP use the rounded size... */
4075 obj_end += named_entry->size - obj_size;
4076 obj_size = named_entry->size;
4077 }
4078
4079 if (named_entry->offset) {
4080 /*
4081 * the callers parameter offset is defined to be the
4082 * offset from beginning of named entry offset in object
4083 *
4084 * Because we checked above that
4085 * obj_offs + obj_size < named_entry_size
4086 * these overflow checks should be redundant...
4087 */
4088 result = vm_map_enter_adjust_offset(&obj_offs,
4089 &obj_end, named_entry->offset);
4090 if (__improbable(result)) {
4091 return result;
4092 }
4093 }
4094
4095 if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) {
4096 /*
4097 * Let's not map more than requested;
4098 * vm_map_enter() will handle this "not map-aligned"
4099 * case.
4100 */
4101 map_size = obj_size;
4102 }
4103
4104 named_entry_lock(named_entry);
4105
4106 // rdar://130307561 (Combine copy, object, and submap fields of vm_named_entry into an enum)
4107 assert(named_entry->is_copy || named_entry->is_object || named_entry->is_sub_map);
4108
4109 if (named_entry->is_sub_map) {
4110 vm_map_t submap;
4111
4112 assert(!named_entry->is_copy);
4113 assert(!named_entry->is_object);
4114
4115 if (vmk_flags.vmf_return_data_addr ||
4116 vmk_flags.vmf_return_4k_data_addr) {
4117 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4118 }
4119
4120 submap = named_entry->backing.map;
4121 vm_map_reference(submap);
4122 named_entry_unlock(named_entry);
4123
4124 vmk_flags.vmkf_submap = TRUE;
4125 result = vm_map_enter(target_map,
4126 &map_addr,
4127 map_size,
4128 mask,
4129 vmk_flags,
4130 (vm_object_t)(uintptr_t) submap,
4131 obj_offs,
4132 copy,
4133 cur_protection,
4134 max_protection,
4135 inheritance);
4136 if (result != KERN_SUCCESS) {
4137 vm_map_deallocate(submap);
4138 return result;
4139 }
4140 /*
4141 * No need to lock "submap" just to check its
4142 * "mapped" flag: that flag is never reset
4143 * once it's been set and if we race, we'll
4144 * just end up setting it twice, which is OK.
4145 */
4146 if (submap->mapped_in_other_pmaps == FALSE &&
4147 vm_map_pmap(submap) != PMAP_NULL &&
4148 vm_map_pmap(submap) !=
4149 vm_map_pmap(target_map)) {
4150 /*
4151 * This submap is being mapped in a map
4152 * that uses a different pmap.
4153 * Set its "mapped_in_other_pmaps" flag
4154 * to indicate that we now need to
4155 * remove mappings from all pmaps rather
4156 * than just the submap's pmap.
4157 */
4158 vm_map_lock(submap);
4159 submap->mapped_in_other_pmaps = TRUE;
4160 vm_map_unlock(submap);
4161 }
4162 goto out;
4163 }
4164
4165 if (named_entry->is_copy) {
4166 kern_return_t kr;
4167 vm_map_copy_t copy_map;
4168 vm_map_entry_t copy_entry;
4169 vm_map_offset_t copy_addr;
4170 vm_map_copy_t target_copy_map;
4171 vm_map_offset_t overmap_start, overmap_end;
4172 vm_map_offset_t trimmed_start;
4173 vm_map_size_t target_size;
4174
4175 assert(!named_entry->is_object);
4176 assert(!named_entry->is_sub_map);
4177
4178 if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4179 (VM_FLAGS_FIXED |
4180 VM_FLAGS_ANYWHERE |
4181 VM_FLAGS_OVERWRITE |
4182 VM_FLAGS_RETURN_4K_DATA_ADDR |
4183 VM_FLAGS_RETURN_DATA_ADDR))) {
4184 named_entry_unlock(named_entry);
4185 return KERN_INVALID_ARGUMENT;
4186 }
4187
4188 copy_map = named_entry->backing.copy;
4189 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4190 if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4191 /* unsupported type; should not happen */
4192 printf("vm_map_enter_mem_object: "
4193 "memory_entry->backing.copy "
4194 "unsupported type 0x%x\n",
4195 copy_map->type);
4196 named_entry_unlock(named_entry);
4197 return KERN_INVALID_ARGUMENT;
4198 }
4199
4200 if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4201 DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4202 }
4203
4204 if (vmk_flags.vmf_return_data_addr ||
4205 vmk_flags.vmf_return_4k_data_addr) {
4206 offset_in_mapping = obj_offs & map_mask;
4207 if (vmk_flags.vmf_return_4k_data_addr) {
4208 offset_in_mapping &= ~((signed)(0xFFF));
4209 }
4210 }
4211
4212 target_copy_map = VM_MAP_COPY_NULL;
4213 target_size = copy_map->size;
4214 overmap_start = 0;
4215 overmap_end = 0;
4216 trimmed_start = 0;
4217 if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4218 DEBUG4K_ADJUST("adjusting...\n");
4219 kr = vm_map_copy_adjust_to_target(
4220 copy_map,
4221 obj_offs,
4222 initial_size,
4223 target_map,
4224 copy,
4225 &target_copy_map,
4226 &overmap_start,
4227 &overmap_end,
4228 &trimmed_start);
4229 if (kr != KERN_SUCCESS) {
4230 named_entry_unlock(named_entry);
4231 return kr;
4232 }
4233 target_size = target_copy_map->size;
4234 } else {
4235 /*
4236 * Assert that the vm_map_copy is coming from the right
4237 * zone and hasn't been forged
4238 */
4239 vm_map_copy_require(copy_map);
4240 target_copy_map = copy_map;
4241 }
4242
4243 vm_map_kernel_flags_t rsv_flags = vmk_flags;
4244
4245 vm_map_kernel_flags_and_vmflags(&rsv_flags,
4246 (VM_FLAGS_FIXED |
4247 VM_FLAGS_ANYWHERE |
4248 VM_FLAGS_OVERWRITE |
4249 VM_FLAGS_RETURN_4K_DATA_ADDR |
4250 VM_FLAGS_RETURN_DATA_ADDR));
4251
4252 /* reserve a contiguous range */
4253 kr = vm_map_enter(target_map,
4254 &map_addr,
4255 vm_map_round_page(target_size, map_mask),
4256 mask,
4257 rsv_flags,
4258 VM_OBJECT_NULL,
4259 0,
4260 FALSE, /* copy */
4261 cur_protection,
4262 max_protection,
4263 inheritance);
4264 if (kr != KERN_SUCCESS) {
4265 DEBUG4K_ERROR("kr 0x%x\n", kr);
4266 if (target_copy_map != copy_map) {
4267 vm_map_copy_discard(target_copy_map);
4268 target_copy_map = VM_MAP_COPY_NULL;
4269 }
4270 named_entry_unlock(named_entry);
4271 return kr;
4272 }
4273
4274 copy_addr = map_addr;
4275
4276 for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4277 copy_entry != vm_map_copy_to_entry(target_copy_map);
4278 copy_entry = copy_entry->vme_next) {
4279 vm_map_t copy_submap = VM_MAP_NULL;
4280 vm_object_t copy_object = VM_OBJECT_NULL;
4281 vm_map_size_t copy_size;
4282 vm_object_offset_t copy_offset;
4283 boolean_t do_copy = false;
4284
4285 if (copy_entry->is_sub_map) {
4286 copy_submap = VME_SUBMAP(copy_entry);
4287 copy_object = (vm_object_t)copy_submap;
4288 } else {
4289 copy_object = VME_OBJECT(copy_entry);
4290 }
4291 copy_offset = VME_OFFSET(copy_entry);
4292 copy_size = (copy_entry->vme_end -
4293 copy_entry->vme_start);
4294
4295 /* sanity check */
4296 if ((copy_addr + copy_size) >
4297 (map_addr +
4298 overmap_start + overmap_end +
4299 named_entry->size /* XXX full size */)) {
4300 /* over-mapping too much !? */
4301 kr = KERN_INVALID_ARGUMENT;
4302 DEBUG4K_ERROR("kr 0x%x\n", kr);
4303 /* abort */
4304 break;
4305 }
4306
4307 /* take a reference on the object */
4308 if (copy_entry->is_sub_map) {
4309 vm_map_reference(copy_submap);
4310 } else {
4311 if (!copy &&
4312 copy_object != VM_OBJECT_NULL &&
4313 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4314 bool is_writable;
4315
4316 /*
4317 * We need to resolve our side of this
4318 * "symmetric" copy-on-write now; we
4319 * need a new object to map and share,
4320 * instead of the current one which
4321 * might still be shared with the
4322 * original mapping.
4323 *
4324 * Note: A "vm_map_copy_t" does not
4325 * have a lock but we're protected by
4326 * the named entry's lock here.
4327 */
4328 // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4329 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4330 assert(copy_object != VME_OBJECT(copy_entry));
4331 is_writable = false;
4332 if (copy_entry->protection & VM_PROT_WRITE) {
4333 is_writable = true;
4334 #if __arm64e__
4335 } else if (copy_entry->used_for_tpro) {
4336 is_writable = true;
4337 #endif /* __arm64e__ */
4338 }
4339 if (!copy_entry->needs_copy && is_writable) {
4340 vm_prot_t prot;
4341
4342 prot = copy_entry->protection & ~VM_PROT_WRITE;
4343 vm_object_pmap_protect(copy_object,
4344 copy_offset,
4345 copy_size,
4346 PMAP_NULL,
4347 PAGE_SIZE,
4348 0,
4349 prot);
4350 }
4351 copy_entry->needs_copy = FALSE;
4352 copy_entry->is_shared = TRUE;
4353 copy_object = VME_OBJECT(copy_entry);
4354 copy_offset = VME_OFFSET(copy_entry);
4355 vm_object_lock(copy_object);
4356 /* we're about to make a shared mapping of this object */
4357 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4358 VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4359 vm_object_unlock(copy_object);
4360 }
4361
4362 if (copy_object != VM_OBJECT_NULL &&
4363 copy_object->named &&
4364 copy_object->pager != MEMORY_OBJECT_NULL &&
4365 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4366 memory_object_t pager;
4367 vm_prot_t pager_prot;
4368
4369 /*
4370 * For "named" VM objects, let the pager know that the
4371 * memory object is being mapped. Some pagers need to keep
4372 * track of this, to know when they can reclaim the memory
4373 * object, for example.
4374 * VM calls memory_object_map() for each mapping (specifying
4375 * the protection of each mapping) and calls
4376 * memory_object_last_unmap() when all the mappings are gone.
4377 */
4378 pager_prot = max_protection;
4379 if (copy) {
4380 /*
4381 * Copy-On-Write mapping: won't modify the
4382 * memory object.
4383 */
4384 pager_prot &= ~VM_PROT_WRITE;
4385 }
4386 vm_object_lock(copy_object);
4387 pager = copy_object->pager;
4388 if (copy_object->named &&
4389 pager != MEMORY_OBJECT_NULL &&
4390 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4391 assert(copy_object->pager_ready);
4392 vm_object_mapping_wait(copy_object, THREAD_UNINT);
4393 /*
4394 * Object might have lost its pager
4395 * while waiting.
4396 */
4397 pager = copy_object->pager;
4398 if (copy_object->named &&
4399 pager != MEMORY_OBJECT_NULL) {
4400 vm_object_mapping_begin(copy_object);
4401 vm_object_unlock(copy_object);
4402
4403 kr = memory_object_map(pager, pager_prot);
4404 assert(kr == KERN_SUCCESS);
4405
4406 vm_object_lock(copy_object);
4407 vm_object_mapping_end(copy_object);
4408 }
4409 }
4410 vm_object_unlock(copy_object);
4411 }
4412
4413 /*
4414 * Perform the copy if requested
4415 */
4416
4417 if (copy && copy_object != VM_OBJECT_NULL) {
4418 vm_object_t new_object;
4419 vm_object_offset_t new_offset;
4420
4421 result = vm_object_copy_strategically(copy_object, copy_offset,
4422 copy_size,
4423 false, /* forking */
4424 &new_object, &new_offset,
4425 &do_copy);
4426
4427
4428 if (result == KERN_MEMORY_RESTART_COPY) {
4429 boolean_t success;
4430 boolean_t src_needs_copy;
4431
4432 /*
4433 * XXX
4434 * We currently ignore src_needs_copy.
4435 * This really is the issue of how to make
4436 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4437 * non-kernel users to use. Solution forthcoming.
4438 * In the meantime, since we don't allow non-kernel
4439 * memory managers to specify symmetric copy,
4440 * we won't run into problems here.
4441 */
4442 new_object = copy_object;
4443 new_offset = copy_offset;
4444 success = vm_object_copy_quickly(new_object,
4445 new_offset,
4446 copy_size,
4447 &src_needs_copy,
4448 &do_copy);
4449 assert(success);
4450 result = KERN_SUCCESS;
4451 }
4452 if (result != KERN_SUCCESS) {
4453 kr = result;
4454 break;
4455 }
4456
4457 copy_object = new_object;
4458 copy_offset = new_offset;
4459 /*
4460 * No extra object reference for the mapping:
4461 * the mapping should be the only thing keeping
4462 * this new object alive.
4463 */
4464 } else {
4465 /*
4466 * We already have the right object
4467 * to map.
4468 */
4469 copy_object = VME_OBJECT(copy_entry);
4470 /* take an extra ref for the mapping below */
4471 vm_object_reference(copy_object);
4472 }
4473 }
4474
4475 /*
4476 * If the caller does not want a specific
4477 * tag for this new mapping: use
4478 * the tag of the original mapping.
4479 */
4480 vm_map_kernel_flags_t vmk_remap_flags = {
4481 .vmkf_submap = copy_entry->is_sub_map,
4482 };
4483
4484 vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4485 vm_map_kernel_flags_vmflags(vmk_flags),
4486 vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4487
4488 /* over-map the object into destination */
4489 vmk_remap_flags.vmf_fixed = true;
4490 vmk_remap_flags.vmf_overwrite = true;
4491
4492 if (!copy && !copy_entry->is_sub_map) {
4493 /*
4494 * copy-on-write should have been
4495 * resolved at this point, or we would
4496 * end up sharing instead of copying.
4497 */
4498 assert(!copy_entry->needs_copy);
4499 }
4500 #if XNU_TARGET_OS_OSX
4501 if (copy_entry->used_for_jit) {
4502 vmk_remap_flags.vmkf_map_jit = TRUE;
4503 }
4504 #endif /* XNU_TARGET_OS_OSX */
4505
4506 kr = vm_map_enter(target_map,
4507 ©_addr,
4508 copy_size,
4509 (vm_map_offset_t) 0,
4510 vmk_remap_flags,
4511 copy_object,
4512 copy_offset,
4513 ((copy_object == NULL)
4514 ? FALSE
4515 : (copy || copy_entry->needs_copy)),
4516 cur_protection,
4517 max_protection,
4518 inheritance);
4519 if (kr != KERN_SUCCESS) {
4520 DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4521 if (copy_entry->is_sub_map) {
4522 vm_map_deallocate(copy_submap);
4523 } else {
4524 vm_object_deallocate(copy_object);
4525 }
4526 /* abort */
4527 break;
4528 }
4529
4530 /* next mapping */
4531 copy_addr += copy_size;
4532 }
4533
4534 named_entry_unlock(named_entry);
4535 if (target_copy_map != copy_map) {
4536 vm_map_copy_discard(target_copy_map);
4537 target_copy_map = VM_MAP_COPY_NULL;
4538 }
4539
4540 if (kr == KERN_SUCCESS) {
4541 if (overmap_start) {
4542 DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t)offset_in_mapping, (uint64_t)overmap_start, (uint64_t)(map_addr + offset_in_mapping + overmap_start));
4543 }
4544 offset_in_mapping += overmap_start;
4545 } else if (!vmk_flags.vmf_overwrite) {
4546 /* deallocate the contiguous range */
4547 vm_map_remove(target_map, map_addr,
4548 map_addr + map_size);
4549 }
4550 result = kr;
4551 goto out;
4552 }
4553
4554 if (named_entry->is_object) {
4555 unsigned int access;
4556 unsigned int wimg_mode;
4557
4558 assert(!named_entry->is_copy);
4559 assert(!named_entry->is_sub_map);
4560
4561 /* we are mapping a VM object */
4562
4563 access = named_entry->access;
4564
4565 if (vmk_flags.vmf_return_data_addr ||
4566 vmk_flags.vmf_return_4k_data_addr) {
4567 offset_in_mapping = obj_offs & map_mask;
4568 if (vmk_flags.vmf_return_4k_data_addr) {
4569 offset_in_mapping &= ~((signed)(0xFFF));
4570 }
4571 obj_offs -= offset_in_mapping;
4572 map_size = vm_map_round_page(initial_size +
4573 offset_in_mapping, map_mask);
4574 }
4575
4576 object = vm_named_entry_to_vm_object(named_entry);
4577 assert(object != VM_OBJECT_NULL);
4578 vm_object_lock(object);
4579 named_entry_unlock(named_entry);
4580
4581 vm_object_reference_locked(object);
4582
4583 wimg_mode = object->wimg_bits;
4584 vm_prot_to_wimg(access, &wimg_mode);
4585 if (object->wimg_bits != wimg_mode) {
4586 vm_object_change_wimg_mode(object, wimg_mode);
4587 }
4588
4589 vm_object_unlock(object);
4590 } else {
4591 panic("invalid VM named entry %p", named_entry);
4592 }
4593 } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4594 /*
4595 * JMM - This is temporary until we unify named entries
4596 * and raw memory objects.
4597 *
4598 * Detected fake ip_kotype for a memory object. In
4599 * this case, the port isn't really a port at all, but
4600 * instead is just a raw memory object.
4601 */
4602 if (vmk_flags.vmf_return_data_addr ||
4603 vmk_flags.vmf_return_4k_data_addr) {
4604 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4605 }
4606
4607 object = memory_object_to_vm_object((memory_object_t)port);
4608 if (object == VM_OBJECT_NULL) {
4609 return KERN_INVALID_OBJECT;
4610 }
4611 vm_object_reference(object);
4612
4613 /* wait for object (if any) to be ready */
4614 if (object != VM_OBJECT_NULL) {
4615 if (is_kernel_object(object)) {
4616 printf("Warning: Attempt to map kernel object"
4617 " by a non-private kernel entity\n");
4618 return KERN_INVALID_OBJECT;
4619 }
4620 if (!object->pager_ready) {
4621 vm_object_lock(object);
4622
4623 while (!object->pager_ready) {
4624 vm_object_sleep(object,
4625 VM_OBJECT_EVENT_PAGER_READY,
4626 THREAD_UNINT,
4627 LCK_SLEEP_EXCLUSIVE);
4628 }
4629 vm_object_unlock(object);
4630 }
4631 }
4632 } else {
4633 return KERN_INVALID_OBJECT;
4634 }
4635
4636 if (object != VM_OBJECT_NULL &&
4637 object->named &&
4638 object->pager != MEMORY_OBJECT_NULL &&
4639 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4640 memory_object_t pager;
4641 vm_prot_t pager_prot;
4642 kern_return_t kr;
4643
4644 /*
4645 * For "named" VM objects, let the pager know that the
4646 * memory object is being mapped. Some pagers need to keep
4647 * track of this, to know when they can reclaim the memory
4648 * object, for example.
4649 * VM calls memory_object_map() for each mapping (specifying
4650 * the protection of each mapping) and calls
4651 * memory_object_last_unmap() when all the mappings are gone.
4652 */
4653 pager_prot = max_protection;
4654 if (copy) {
4655 /*
4656 * Copy-On-Write mapping: won't modify the
4657 * memory object.
4658 */
4659 pager_prot &= ~VM_PROT_WRITE;
4660 }
4661 vm_object_lock(object);
4662 pager = object->pager;
4663 if (object->named &&
4664 pager != MEMORY_OBJECT_NULL &&
4665 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4666 assert(object->pager_ready);
4667 vm_object_mapping_wait(object, THREAD_UNINT);
4668 /* object might have lost its pager while waiting */
4669 pager = object->pager;
4670 if (object->named && pager != MEMORY_OBJECT_NULL) {
4671 vm_object_mapping_begin(object);
4672 vm_object_unlock(object);
4673
4674 kr = memory_object_map(pager, pager_prot);
4675 assert(kr == KERN_SUCCESS);
4676
4677 vm_object_lock(object);
4678 vm_object_mapping_end(object);
4679 }
4680 }
4681 vm_object_unlock(object);
4682 }
4683
4684 /*
4685 * Perform the copy if requested
4686 */
4687
4688 if (copy) {
4689 vm_object_t new_object;
4690 vm_object_offset_t new_offset;
4691
4692 result = vm_object_copy_strategically(object,
4693 obj_offs,
4694 map_size,
4695 false, /* forking */
4696 &new_object, &new_offset,
4697 ©);
4698
4699
4700 if (result == KERN_MEMORY_RESTART_COPY) {
4701 boolean_t success;
4702 boolean_t src_needs_copy;
4703
4704 /*
4705 * XXX
4706 * We currently ignore src_needs_copy.
4707 * This really is the issue of how to make
4708 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4709 * non-kernel users to use. Solution forthcoming.
4710 * In the meantime, since we don't allow non-kernel
4711 * memory managers to specify symmetric copy,
4712 * we won't run into problems here.
4713 */
4714 new_object = object;
4715 new_offset = obj_offs;
4716 success = vm_object_copy_quickly(new_object,
4717 new_offset,
4718 map_size,
4719 &src_needs_copy,
4720 ©);
4721 assert(success);
4722 result = KERN_SUCCESS;
4723 }
4724 /*
4725 * Throw away the reference to the
4726 * original object, as it won't be mapped.
4727 */
4728
4729 vm_object_deallocate(object);
4730
4731 if (result != KERN_SUCCESS) {
4732 return result;
4733 }
4734
4735 object = new_object;
4736 obj_offs = new_offset;
4737 }
4738
4739 /*
4740 * If non-kernel users want to try to prefault pages, the mapping and prefault
4741 * needs to be atomic.
4742 */
4743 kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4744 vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4745
4746 result = vm_map_enter(target_map,
4747 &map_addr, map_size,
4748 (vm_map_offset_t)mask,
4749 vmk_flags,
4750 object, obj_offs,
4751 copy,
4752 cur_protection, max_protection,
4753 inheritance);
4754 if (result != KERN_SUCCESS) {
4755 vm_object_deallocate(object);
4756 }
4757
4758 /*
4759 * Try to prefault, and do not forget to release the vm map lock.
4760 */
4761 if (result == KERN_SUCCESS && try_prefault) {
4762 mach_vm_address_t va = map_addr;
4763 kern_return_t kr = KERN_SUCCESS;
4764 unsigned int i = 0;
4765 int pmap_options;
4766
4767 pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4768 if (object->internal) {
4769 pmap_options |= PMAP_OPTIONS_INTERNAL;
4770 }
4771
4772 for (i = 0; i < page_list_count; ++i) {
4773 if (!UPL_VALID_PAGE(page_list, i)) {
4774 if (kernel_prefault) {
4775 assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4776 result = KERN_MEMORY_ERROR;
4777 break;
4778 }
4779 } else {
4780 /*
4781 * If this function call failed, we should stop
4782 * trying to optimize, other calls are likely
4783 * going to fail too.
4784 *
4785 * We are not gonna report an error for such
4786 * failure though. That's an optimization, not
4787 * something critical.
4788 */
4789 kr = pmap_enter_options(target_map->pmap,
4790 va, UPL_PHYS_PAGE(page_list, i),
4791 cur_protection, VM_PROT_NONE,
4792 0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER);
4793 if (kr != KERN_SUCCESS) {
4794 OSIncrementAtomic64(&vm_prefault_nb_bailout);
4795 if (kernel_prefault) {
4796 result = kr;
4797 }
4798 break;
4799 }
4800 OSIncrementAtomic64(&vm_prefault_nb_pages);
4801 }
4802
4803 /* Next virtual address */
4804 va += PAGE_SIZE;
4805 }
4806 if (vmk_flags.vmkf_keep_map_locked) {
4807 vm_map_unlock(target_map);
4808 }
4809 }
4810
4811 out:
4812 if (result == KERN_SUCCESS) {
4813 #if KASAN
4814 if (target_map->pmap == kernel_pmap) {
4815 kasan_notify_address(map_addr, map_size);
4816 }
4817 #endif
4818 *address_u = vm_sanitize_wrap_addr(map_addr + offset_in_mapping);
4819 }
4820 return result;
4821 }
4822
4823 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_ut * address,vm_map_size_ut initial_size,vm_map_offset_ut mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_ut offset,vm_prot_ut cur_protection,vm_prot_ut max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4824 vm_map_enter_mem_object_prefault(
4825 vm_map_t target_map,
4826 vm_map_offset_ut *address,
4827 vm_map_size_ut initial_size,
4828 vm_map_offset_ut mask,
4829 vm_map_kernel_flags_t vmk_flags,
4830 ipc_port_t port,
4831 vm_object_offset_ut offset,
4832 vm_prot_ut cur_protection,
4833 vm_prot_ut max_protection,
4834 upl_page_list_ptr_t page_list,
4835 unsigned int page_list_count)
4836 {
4837 /* range_id is set by vm_map_enter_mem_object */
4838 return vm_map_enter_mem_object(target_map,
4839 address,
4840 initial_size,
4841 mask,
4842 vmk_flags,
4843 port,
4844 offset,
4845 FALSE,
4846 cur_protection,
4847 max_protection,
4848 VM_INHERIT_DEFAULT,
4849 page_list,
4850 page_list_count);
4851 }
4852
/*
 * Sanitize the unsafe (caller-provided, "_u"-suffixed) parameters of
 * vm_map_enter_mem_object_control() into trusted values.
 *
 * On success returns KERN_SUCCESS and fills in every output pointer:
 * the map-space address and size, the alignment mask, the object-space
 * offset/end/size triple, both protections and the inheritance value.
 * On failure returns a sanitizer status that the caller must translate
 * with vm_sanitize_get_kr() before returning it to its own caller.
 */
static __attribute__((always_inline, warn_unused_result))
kern_return_t
vm_map_enter_mem_object_control_sanitize(
	vm_map_t                target_map,
	vm_map_offset_ut        address_u,
	vm_map_size_ut          initial_size_u,
	vm_map_offset_ut        mask_u,
	vm_object_offset_ut     offset_u,
	vm_prot_ut              cur_protection_u,
	vm_prot_ut              max_protection_u,
	vm_inherit_ut           inheritance_u,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_address_t        *map_addr,
	vm_map_size_t           *map_size,
	vm_map_offset_t         *mask,
	vm_object_offset_t      *obj_offs,
	vm_object_offset_t      *obj_end,
	vm_object_size_t        *obj_size,
	vm_prot_t               *cur_protection,
	vm_prot_t               *max_protection,
	vm_inherit_t            *inheritance)
{
	kern_return_t kr;

	/* validate the protections and that cur is a subset of max */
	kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
	    cur_protection, max_protection);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/* validate the inheritance value */
	kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL,
	    inheritance);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/* validate the caller's alignment mask */
	kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}
	/*
	 * Ensure arithmetic doesn't overflow in vm_object space (kernel
	 * pages).
	 * We keep unaligned values for now. The call we eventually make to
	 * vm_map_enter does guarantee that offset_u is page aligned for EITHER
	 * target_map pages or kernel pages. But this isn't enough to guarantee
	 * kernel space alignment.
	 */
	kr = vm_sanitize_addr_size(offset_u, initial_size_u,
	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK,
	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
	    VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
	    obj_offs, obj_end, obj_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/*
	 * There is no vm_sanitize_addr_size variant that also adjusts for
	 * a separate offset. Rather than create one for this one-off issue,
	 * we sanitize map_addr and map_size individually, relying on
	 * vm_sanitize_size to incorporate the offset. Then, we perform the
	 * overflow check manually below.
	 */
	*map_addr = vm_sanitize_addr(target_map, address_u);
	kr = vm_sanitize_size(offset_u, initial_size_u,
	    VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
	    VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/*
	 * Ensure arithmetic doesn't overflow in target_map space.
	 * The computation of map_size above accounts for the possibility that
	 * offset_u might be unaligned in target_map space.
	 * (Only meaningful for fixed-address mappings; VM_FLAGS_ANYWHERE
	 * ignores the incoming address.)
	 */
	if (vmk_flags.vmf_fixed) {
		vm_map_address_t map_end;

		if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) {
			return KERN_INVALID_ARGUMENT;
		}
	}

	return KERN_SUCCESS;
}
4941
/*
 * Map the VM object named by a memory object "control" port into
 * "target_map" at (or near) *address_u.
 *
 * Behaves like vm_map_enter_mem_object() but takes a
 * memory_object_control_t instead of an ipc_port_t.  Always operates in
 * vmf_return_data_addr mode: on success, *address_u is set to the start
 * of the mapping plus the sub-page offset of offset_u within a
 * target_map page.
 *
 * If "needs_copy" is TRUE the object is copied (via the object's copy
 * strategy) before mapping, so the mapping does not alias the original.
 *
 * Returns KERN_SUCCESS or a kern_return_t error; on failure the object
 * reference taken here is dropped.
 */
kern_return_t
vm_map_enter_mem_object_control(
	vm_map_t                target_map,
	vm_map_offset_ut        *address_u,
	vm_map_size_ut          initial_size_u,
	vm_map_offset_ut        mask_u,
	vm_map_kernel_flags_t   vmk_flags,
	memory_object_control_t control,
	vm_object_offset_ut     offset_u,
	boolean_t               needs_copy,
	vm_prot_ut              cur_protection_u,
	vm_prot_ut              max_protection_u,
	vm_inherit_ut           inheritance_u)
{
	vm_map_offset_t         mask;
	vm_prot_t               cur_protection;
	vm_prot_t               max_protection;
	vm_inherit_t            inheritance;
	vm_map_address_t        map_addr;
	vm_map_size_t           map_size;
	vm_object_t             object;
	vm_object_offset_t      obj_offs, obj_end;
	vm_object_size_t        obj_size;
	kern_return_t           result;
	memory_object_t         pager;
	vm_prot_t               pager_prot;
	kern_return_t           kr;

	/*
	 * Check arguments for validity
	 */
	if (target_map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * We only support vmf_return_data_addr-like behavior.
	 */
	vmk_flags.vmf_return_data_addr = true;

	/*
	 * Sanitize any input parameters that are addr/size/prot/inherit
	 */
	kr = vm_map_enter_mem_object_control_sanitize(target_map,
	    *address_u,
	    initial_size_u,
	    mask_u,
	    offset_u,
	    cur_protection_u,
	    max_protection_u,
	    inheritance_u,
	    vmk_flags,
	    &map_addr,
	    &map_size,
	    &mask,
	    &obj_offs,
	    &obj_end,
	    &obj_size,
	    &cur_protection,
	    &max_protection,
	    &inheritance);
	if (__improbable(kr != KERN_SUCCESS)) {
		/* translate the sanitizer status into a plain kern_return_t */
		return vm_sanitize_get_kr(kr);
	}

	object = memory_object_control_to_vm_object(control);

	if (object == VM_OBJECT_NULL) {
		return KERN_INVALID_OBJECT;
	}

	/* refuse to map the kernel object into any map */
	if (is_kernel_object(object)) {
		printf("Warning: Attempt to map kernel object"
		    " by a non-private kernel entity\n");
		return KERN_INVALID_OBJECT;
	}

	/* take a reference for the mapping (dropped on failure below) */
	vm_object_lock(object);
	os_ref_retain_locked_raw(&object->ref_count, &vm_object_refgrp);


	/*
	 * For "named" VM objects, let the pager know that the
	 * memory object is being mapped. Some pagers need to keep
	 * track of this, to know when they can reclaim the memory
	 * object, for example.
	 * VM calls memory_object_map() for each mapping (specifying
	 * the protection of each mapping) and calls
	 * memory_object_last_unmap() when all the mappings are gone.
	 */
	pager_prot = max_protection;
	if (needs_copy) {
		/* copy-on-write mapping: won't modify the memory object */
		pager_prot &= ~VM_PROT_WRITE;
	}
	pager = object->pager;
	if (object->named &&
	    pager != MEMORY_OBJECT_NULL &&
	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
		assert(object->pager_ready);
		vm_object_mapping_wait(object, THREAD_UNINT);
		/* object might have lost its pager while waiting */
		pager = object->pager;
		if (object->named && pager != MEMORY_OBJECT_NULL) {
			vm_object_mapping_begin(object);
			/* drop the object lock across the pager upcall */
			vm_object_unlock(object);

			kr = memory_object_map(pager, pager_prot);
			assert(kr == KERN_SUCCESS);

			vm_object_lock(object);
			vm_object_mapping_end(object);
		}
	}
	vm_object_unlock(object);

	/*
	 * Perform the copy if requested
	 */

	if (needs_copy) {
		vm_object_t             new_object;
		vm_object_offset_t      new_offset;

		result = vm_object_copy_strategically(object, obj_offs, obj_size,
		    false, /* forking */
		    &new_object, &new_offset,
		    &needs_copy);


		if (result == KERN_MEMORY_RESTART_COPY) {
			boolean_t success;
			boolean_t src_needs_copy;

			/*
			 * XXX
			 * We currently ignore src_needs_copy.
			 * This really is the issue of how to make
			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
			 * non-kernel users to use. Solution forthcoming.
			 * In the meantime, since we don't allow non-kernel
			 * memory managers to specify symmetric copy,
			 * we won't run into problems here.
			 */
			new_object = object;
			new_offset = obj_offs;
			success = vm_object_copy_quickly(new_object,
			    new_offset, obj_size,
			    &src_needs_copy,
			    &needs_copy);
			assert(success);
			result = KERN_SUCCESS;
		}
		/*
		 * Throw away the reference to the
		 * original object, as it won't be mapped.
		 */

		vm_object_deallocate(object);

		if (result != KERN_SUCCESS) {
			return result;
		}

		/* map the copy instead of the original object */
		object = new_object;
		obj_offs = new_offset;
	}

	result = vm_map_enter(target_map,
	    &map_addr, map_size,
	    (vm_map_offset_t)mask,
	    vmk_flags,
	    object,
	    obj_offs,
	    needs_copy,
	    cur_protection, max_protection,
	    inheritance);

	if (result == KERN_SUCCESS) {
		/*
		 * Report the "data address": start of the mapping plus the
		 * sub-page offset of the requested object offset.
		 */
		*address_u = vm_sanitize_wrap_addr(
			map_addr + (obj_offs & vm_map_page_mask(target_map)));
	} else {
		/* mapping failed: drop the reference taken above */
		vm_object_deallocate(object);
	}

	return result;
}
5128
5129
5130 /* Not used without nested pmaps */
5131 #ifndef NO_NESTED_PMAP
5132 /*
5133 * Clip and unnest a portion of a nested submap mapping.
5134 */
5135
5136
static void
vm_map_clip_unnest(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t start_unnest,
	vm_map_offset_t end_unnest)
{
	/* remember the caller's range for diagnostics after pmap adjustment */
	vm_map_offset_t old_start_unnest = start_unnest;
	vm_map_offset_t old_end_unnest = end_unnest;

	/* only meaningful for a nested (pmap-shared) submap entry */
	assert(entry->is_sub_map);
	assert(VME_SUBMAP(entry) != NULL);
	assert(entry->use_pmap);

	/*
	 * Query the platform for the optimal unnest range.
	 * DRK: There's some duplication of effort here, since
	 * callers may have adjusted the range to some extent.  This
	 * routine was introduced to support 1GiB subtree nesting
	 * for x86 platforms, which can also nest on 2MiB boundaries
	 * depending on size/alignment.
	 */
	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
		/* the pmap widened the range: log the mismatch for telemetry */
		assert(VME_SUBMAP(entry)->is_nested_map);
		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
		log_unnest_badness(map,
		    old_start_unnest,
		    old_end_unnest,
		    VME_SUBMAP(entry)->is_nested_map,
		    (entry->vme_start +
		    VME_SUBMAP(entry)->lowest_unnestable_start -
		    VME_OFFSET(entry)));
	}

	/* the unnest range must lie entirely within this entry */
	if (entry->vme_start > start_unnest ||
	    entry->vme_end < end_unnest) {
		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
		    "bad nested entry: start=0x%llx end=0x%llx\n",
		    (long long)start_unnest, (long long)end_unnest,
		    (long long)entry->vme_start, (long long)entry->vme_end);
	}

	/*
	 * Clip the entry so that [start_unnest, end_unnest) stands alone;
	 * each clip invalidates the map's first-free hint, so refresh it.
	 */
	if (start_unnest > entry->vme_start) {
		_vm_map_clip_start(&map->hdr,
		    entry,
		    start_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
	if (entry->vme_end > end_unnest) {
		_vm_map_clip_end(&map->hdr,
		    entry,
		    end_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}

	/* detach the nested pmap for the (now exactly clipped) entry */
	pmap_unnest(map->pmap,
	    entry->vme_start,
	    entry->vme_end - entry->vme_start);
	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
		/* clean up parent map/maps */
		vm_map_submap_pmap_clean(
			map, entry->vme_start,
			entry->vme_end,
			VME_SUBMAP(entry),
			VME_OFFSET(entry));
	}
	/* this entry no longer shares the submap's pmap */
	entry->use_pmap = FALSE;
	if ((map->pmap != kernel_pmap) &&
	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
	}
}
5217 #endif /* NO_NESTED_PMAP */
5218
/*
 * Panic helper for the clip routines: atomic VM map entries must never
 * be split, so any attempt to clip one is a fatal error.  Never returns.
 */
__abortlike
static void
__vm_map_clip_atomic_entry_panic(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t where)
{
	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end,
	    (uint64_t)where);
}
5232
5233 /*
5234 * vm_map_clip_start: [ internal use only ]
5235 *
5236 * Asserts that the given entry begins at or after
5237 * the specified address; if necessary,
5238 * it splits the entry into two.
5239 */
void
vm_map_clip_start(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t startaddr)
{
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map &&
	    entry->use_pmap &&
	    startaddr >= entry->vme_start) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure "startaddr" is no longer in a nested range
		 * before we clip.  Unnest only the minimum range the platform
		 * can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		/* round "startaddr" down/up to the platform's minimum nesting granule */
		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (startaddr > entry->vme_start) {
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * Physically-contiguous objects can't tolerate a split
			 * mapping: drop all translations and let faults rebuild.
			 */
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		if (entry->vme_atomic) {
			/* atomic entries must never be clipped */
			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
		}

		DTRACE_VM5(
			vm_map_clip_start,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, startaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_start(&map->hdr, entry, startaddr);
		/* clipping invalidates the first-free hint; refresh it */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5292
5293
/*
 * vm_map_copy_clip_start: [ internal use only ]
 *
 * Clip "entry" in a vm_map_copy's entry list so that it starts at
 * "startaddr"; no-op if the entry already starts at or after it.
 */
#define vm_map_copy_clip_start(copy, entry, startaddr) \
	MACRO_BEGIN \
	if ((startaddr) > (entry)->vme_start) \
	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
	MACRO_END
5299
5300 /*
5301 * This routine is called only when it is known that
5302 * the entry must be split.
5303 */
static void
_vm_map_clip_start(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         start)
{
	vm_map_entry_t  new_entry;

	/*
	 * Split off the front portion --
	 * note that we must insert the new
	 * entry BEFORE this one, so that
	 * this entry has the specified starting
	 * address.
	 */

	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* the new entry is a full copy of "entry", truncated to [vme_start, start) */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	new_entry->vme_end = start;
	assert(new_entry->vme_start < new_entry->vme_end);
	/* advance the original entry's object offset by the clipped-off amount */
	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
	if (__improbable(start >= entry->vme_end)) {
		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
	}
	assert(start < entry->vme_end);
	entry->vme_start = start;

#if VM_BTLOG_TAGS
	/* both halves now point at the same backtrace ref; take an extra hold */
	if (new_entry->vme_kernel_object) {
		btref_retain(new_entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */

	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);

	/* both halves reference the same submap/object: take an extra reference */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5351
5352
5353 /*
5354 * vm_map_clip_end: [ internal use only ]
5355 *
5356 * Asserts that the given entry ends at or before
5357 * the specified address; if necessary,
5358 * it splits the entry into two.
5359 */
void
vm_map_clip_end(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t endaddr)
{
	if (endaddr > entry->vme_end) {
		/*
		 * Within the scope of this clipping, limit "endaddr" to
		 * the end of this map entry...
		 */
		endaddr = entry->vme_end;
	}
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map && entry->use_pmap) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure the range between the start of this entry and
		 * the new "endaddr" is no longer nested before we clip.
		 * Unnest only the minimum range the platform can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = entry->vme_start;
		/* round "endaddr" up to the platform's minimum nesting granule */
		end_unnest =
		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
		    ~(pmap_shared_region_size_min(map->pmap) - 1);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (endaddr < entry->vme_end) {
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * Physically-contiguous objects can't tolerate a split
			 * mapping: drop all translations and let faults rebuild.
			 */
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		if (entry->vme_atomic) {
			/* atomic entries must never be clipped */
			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
		}
		DTRACE_VM5(
			vm_map_clip_end,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, endaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_end(&map->hdr, entry, endaddr);
		/* clipping invalidates the first-free hint; refresh it */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5418
5419
/*
 * vm_map_copy_clip_end: [ internal use only ]
 *
 * Clip "entry" in a vm_map_copy's entry list so that it ends at
 * "endaddr"; no-op if the entry already ends at or before it.
 */
#define vm_map_copy_clip_end(copy, entry, endaddr) \
	MACRO_BEGIN \
	if ((endaddr) < (entry)->vme_end) \
	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
	MACRO_END
5425
5426 /*
5427 * This routine is called only when it is known that
5428 * the entry must be split.
5429 */
static void
_vm_map_clip_end(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         end)
{
	vm_map_entry_t  new_entry;

	/*
	 * Create a new entry and insert it
	 * AFTER the specified entry
	 */

	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(end,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* the new entry is a full copy of "entry", covering [end, vme_end) */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	if (__improbable(end <= entry->vme_start)) {
		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
	}
	assert(entry->vme_start < end);
	new_entry->vme_start = entry->vme_end = end;
	/*
	 * Advance the new entry's object offset by the length kept in the
	 * original (entry->vme_start is still the original start here).
	 */
	VME_OFFSET_SET(new_entry,
	    VME_OFFSET(new_entry) + (end - entry->vme_start));
	assert(new_entry->vme_start < new_entry->vme_end);

#if VM_BTLOG_TAGS
	/* both halves now point at the same backtrace ref; take an extra hold */
	if (new_entry->vme_kernel_object) {
		btref_retain(new_entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */

	_vm_map_store_entry_link(map_header, entry, new_entry);

	/* both halves reference the same submap/object: take an extra reference */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5474
5475
5476 /*
5477 * VM_MAP_RANGE_CHECK: [ internal use only ]
5478 *
5479 * Asserts that the starting and ending region
5480 * addresses fall within the valid range of the map.
5481 */
5482 #define VM_MAP_RANGE_CHECK(map, start, end) \
5483 MACRO_BEGIN \
5484 if (start < vm_map_min(map)) \
5485 start = vm_map_min(map); \
5486 if (end > vm_map_max(map)) \
5487 end = vm_map_max(map); \
5488 if (start > end) \
5489 start = end; \
5490 MACRO_END
5491
5492 /*
5493 * vm_map_range_check: [ internal use only ]
5494 *
5495 * Check that the region defined by the specified start and
5496 * end addresses are wholly contained within a single map
 * entry or set of adjacent map entries of the specified map,
5498 * i.e. the specified region contains no unmapped space.
5499 * If any or all of the region is unmapped, FALSE is returned.
5500 * Otherwise, TRUE is returned and if the output argument 'entry'
5501 * is not NULL it points to the map entry containing the start
5502 * of the region.
5503 *
5504 * The map is locked for reading on entry and is left locked.
5505 */
5506 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5507 vm_map_range_check(
5508 vm_map_t map,
5509 vm_map_offset_t start,
5510 vm_map_offset_t end,
5511 vm_map_entry_t *entry)
5512 {
5513 vm_map_entry_t cur;
5514 vm_map_offset_t prev;
5515
5516 /*
5517 * Basic sanity checks first
5518 */
5519 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5520 return FALSE;
5521 }
5522
5523 /*
5524 * Check first if the region starts within a valid
5525 * mapping for the map.
5526 */
5527 if (!vm_map_lookup_entry(map, start, &cur)) {
5528 return FALSE;
5529 }
5530
5531 /*
5532 * Optimize for the case that the region is contained
5533 * in a single map entry.
5534 */
5535 if (entry != (vm_map_entry_t *) NULL) {
5536 *entry = cur;
5537 }
5538 if (end <= cur->vme_end) {
5539 return TRUE;
5540 }
5541
5542 /*
5543 * If the region is not wholly contained within a
5544 * single entry, walk the entries looking for holes.
5545 */
5546 prev = cur->vme_end;
5547 cur = cur->vme_next;
5548 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5549 if (end <= cur->vme_end) {
5550 return TRUE;
5551 }
5552 prev = cur->vme_end;
5553 cur = cur->vme_next;
5554 }
5555 return FALSE;
5556 }
5557
5558 static __attribute__((always_inline, warn_unused_result))
5559 kern_return_t
vm_map_protect_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut new_prot_u,vm_map_offset_t * start,vm_map_offset_t * end,vm_prot_t * new_prot)5560 vm_map_protect_sanitize(
5561 vm_map_t map,
5562 vm_map_offset_ut start_u,
5563 vm_map_offset_ut end_u,
5564 vm_prot_ut new_prot_u,
5565 vm_map_offset_t *start,
5566 vm_map_offset_t *end,
5567 vm_prot_t *new_prot)
5568 {
5569 kern_return_t kr;
5570 vm_map_size_t size;
5571
5572 kr = vm_sanitize_prot(new_prot_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5573 map, VM_PROT_COPY, new_prot);
5574 if (__improbable(kr != KERN_SUCCESS)) {
5575 return kr;
5576 }
5577
5578 kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5579 map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
5580 if (__improbable(kr != KERN_SUCCESS)) {
5581 return kr;
5582 }
5583
5584 return KERN_SUCCESS;
5585 }
5586
5587 /*
5588 * vm_map_protect:
5589 *
5590 * Sets the protection of the specified address
5591 * region in the target map. If "set_max" is
5592 * specified, the maximum protection is to be set;
5593 * otherwise, only the current protection is affected.
5594 */
kern_return_t
vm_map_protect(
	vm_map_t        map,
	vm_map_offset_ut start_u,
	vm_map_offset_ut end_u,
	boolean_t       set_max,
	vm_prot_ut      new_prot_u)
{
	vm_map_entry_t          current;
	vm_map_offset_t         prev;
	vm_map_entry_t          entry;
	vm_prot_t               new_prot;
	vm_prot_t               new_max;
	int                     pmap_options = 0;
	kern_return_t           kr;
	vm_map_offset_t         start, original_start;
	vm_map_offset_t         end;

	/* validate and unwrap the unsafe caller-provided arguments */
	kr = vm_map_protect_sanitize(map,
	    start_u,
	    end_u,
	    new_prot_u,
	    &start,
	    &end,
	    &new_prot);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}
	original_start = start;

	if (new_prot & VM_PROT_COPY) {
		vm_map_offset_t         new_start;
		vm_prot_t               cur_prot, max_prot;
		vm_map_kernel_flags_t   kflags;

		/* LP64todo - see below */
		if (start >= map->max_offset) {
			return KERN_INVALID_ADDRESS;
		}

		/* W^X policy: refuse write+exec up front where the map enforces it */
		if ((new_prot & VM_PROT_ALLEXEC) &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if XNU_TARGET_OS_OSX && __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* XNU_TARGET_OS_OSX && __arm64__ */
		    ) &&
		    VM_MAP_POLICY_WX_FAIL(map)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) start,
			    uint64_t, (uint64_t) end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
			    (uint64_t)start,
			    (uint64_t)end,
#else /* DEVELOPMENT || DEBUG */
			    (uint64_t)0,
			    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
			    new_prot);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * Let vm_map_remap_extract() know that it will need to:
		 * + make a copy of the mapping
		 * + add VM_PROT_WRITE to the max protections
		 * + remove any protections that are no longer allowed from the
		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
		 *   example).
		 * Note that "max_prot" is an IN/OUT parameter only for this
		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
		 * only.
		 */
		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
		cur_prot = VM_PROT_NONE;
		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
		kflags.vmkf_remap_prot_copy = true;
		kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
		new_start = start;
		/* remap the range in place as a copy-on-write copy of itself */
		kr = vm_map_remap(map,
		    vm_sanitize_wrap_addr_ref(&new_start),
		    end - start,
		    0, /* mask */
		    kflags,
		    map,
		    start,
		    TRUE, /* copy-on-write remapping! */
		    vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */
		    vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		/* the copy is done; proceed with a plain protection change */
		new_prot &= ~VM_PROT_COPY;
	}

	vm_map_lock(map);
restart_after_unlock:

	/* LP64todo - remove this check when vm_map_commpage64()
	 * no longer has to stuff in a map_entry for the commpage
	 * above the map's max_offset.
	 */
	if (start >= map->max_offset) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	while (1) {
		/*
		 * Lookup the entry.  If it doesn't start in a valid
		 * entry, return an error.
		 */
		if (!vm_map_lookup_entry(map, start, &entry)) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
			start = SUPERPAGE_ROUND_DOWN(start);
			continue;
		}
		break;
	}
	if (entry->superpage_size) {
		/* superpage entries can only be protected as a whole */
		end = SUPERPAGE_ROUND_UP(end);
	}

	/*
	 * Make a first pass to check for protection and address
	 * violations.
	 */

	current = entry;
	prev = current->vme_start;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		/*
		 * If there is a hole, return an error.
		 */
		if (current->vme_start != prev) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		new_max = current->max_protection;

#if defined(__x86_64__)
		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
		}
#elif CODE_SIGNING_MONITOR
		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
			new_max |= VM_PROT_EXECUTE;
		}
#endif
		/* the requested protection must be a subset of the entry's maximum */
		if ((new_prot & new_max) != new_prot) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

		if (current->used_for_jit &&
		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

#if __arm64e__
		/* Disallow protecting hw assisted TPRO mappings */
		if (current->used_for_tpro) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}
#endif /* __arm64e__ */


		/* W^X policy: strip exec (or fail outright) on write+exec requests */
		if ((new_prot & VM_PROT_WRITE) &&
		    (new_prot & VM_PROT_ALLEXEC) &&
#if XNU_TARGET_OS_OSX
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    ) &&
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
#endif
		    !(current->used_for_jit)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) current->vme_start,
			    uint64_t, (uint64_t) current->vme_end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
			    (uint64_t)current->vme_start,
			    (uint64_t)current->vme_end,
#else /* DEVELOPMENT || DEBUG */
			    (uint64_t)0,
			    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
			    new_prot);
			new_prot &= ~VM_PROT_ALLEXEC;
			if (VM_MAP_POLICY_WX_FAIL(map)) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		/*
		 * If the task has requested executable lockdown,
		 * deny both:
		 * - adding executable protections OR
		 * - adding write protections to an existing executable mapping.
		 */
		if (map->map_disallow_new_exec == TRUE) {
			if ((new_prot & VM_PROT_ALLEXEC) ||
			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		prev = current->vme_end;
		current = current->vme_next;
	}

#if __arm64__
	if (end > prev &&
	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
		vm_map_entry_t prev_entry;

		prev_entry = current->vme_prev;
		if (prev_entry != vm_map_to_entry(map) &&
		    !prev_entry->map_aligned &&
		    (vm_map_round_page(prev_entry->vme_end,
		    VM_MAP_PAGE_MASK(map))
		    == end)) {
			/*
			 * The last entry in our range is not "map-aligned"
			 * but it would have reached all the way to "end"
			 * if it had been map-aligned, so this is not really
			 * a hole in the range and we can proceed.
			 */
			prev = end;
		}
	}
#endif /* __arm64__ */

	/* the walked entries must cover the whole [start, end) range */
	if (end > prev) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Go back and fix up protections.
	 * Clip to start here if the range starts within
	 * the entry.
	 */

	current = entry;
	if (current != vm_map_to_entry(map)) {
		/* clip and unnest if necessary */
		vm_map_clip_start(map, current, start);
	}

	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		vm_prot_t       old_prot;

		if (current->in_transition) {
			wait_result_t   wait_result;
			vm_map_offset_t current_start;

			/*
			 * Another thread is wiring/unwiring this entry.
			 * Let the other thread know we are waiting.
			 */
			current_start = current->vme_start;
			current->needs_wakeup = true;
			/* wait for the other thread to be done */
			wait_result = vm_map_entry_wait(map, TH_UNINT);
			/*
			 * We unlocked the map, so anything could have changed in the
			 * range and we need to re-check from "current_start" to "end".
			 * Our entries might no longer be valid.
			 */
			current = NULL;
			entry = NULL;
			/*
			 * Re-lookup and re-clip "current_start".
			 * If it's no longer mapped, "current" points at the
			 * next entry (or the map's end sentinel).
			 */
			vm_map_lookup_entry_or_next(map, current_start, &current);
			if (current != vm_map_to_entry(map)) {
				vm_map_clip_start(map, current, current_start);
			}
			/* restart from this point */
			start = current_start;
			goto restart_after_unlock;
		}

		vm_map_clip_end(map, current, end);

#if DEVELOPMENT || DEBUG
		if (current->csm_associated && vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__,
			    (uint64_t)start,
			    (uint64_t)end,
			    new_prot,
			    map, current,
			    current->vme_start,
			    current->vme_end,
			    current->protection,
			    current->max_protection);
		}
#endif /* DEVELOPMENT || DEBUG */

		if (current->is_sub_map) {
			/* clipping did unnest if needed */
			assert(!current->use_pmap);
		}

		old_prot = current->protection;

		if (set_max) {
			current->max_protection = new_prot;
			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
			current->protection = (new_prot & old_prot);
		} else {
			current->protection = new_prot;
		}

#if CODE_SIGNING_MONITOR
		if (!current->vme_xnu_user_debug &&
		    /* a !csm_associated mapping becoming executable */
		    ((!current->csm_associated &&
		    !(old_prot & VM_PROT_EXECUTE) &&
		    (current->protection & VM_PROT_EXECUTE))
		    ||
		    /* a csm_associated mapping becoming writable */
		    (current->csm_associated &&
		    !(old_prot & VM_PROT_WRITE) &&
		    (current->protection & VM_PROT_WRITE)))) {
			/*
			 * This mapping has not already been marked as
			 * "user_debug" and it is either:
			 * 1. not code-signing-monitored and becoming executable
			 * 2. code-signing-monitored and becoming writable,
			 * so inform the CodeSigningMonitor and mark the
			 * mapping as "user_debug" if appropriate.
			 */
			vm_map_kernel_flags_t vmk_flags;
			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
			/* pretend it's a vm_protect(VM_PROT_COPY)... */
			vmk_flags.vmkf_remap_prot_copy = true;
			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
#if DEVELOPMENT || DEBUG
			if (vm_log_xnu_user_debug) {
				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
				    proc_selfpid(),
				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
				    __FUNCTION__, __LINE__,
				    map, current,
				    current->vme_start, current->vme_end,
				    old_prot, current->protection,
				    kr, current->vme_xnu_user_debug);
			}
#endif /* DEVELOPMENT || DEBUG */
		}
#endif /* CODE_SIGNING_MONITOR */

		/*
		 * Update physical map if necessary.
		 * If the request is to turn off write protection,
		 * we won't do it for real (in pmap).  This is because
		 * it would cause copy-on-write to fail.  We've already
		 * set the new protection in the map, so if a
		 * write-protect fault occurred, it will be fixed up
		 * properly, COW or not.
		 */
		if (current->protection != old_prot) {
			/* Look one level in we support nested pmaps */
			/* from mapped submaps which are direct entries */
			/* in our map */

			vm_prot_t prot;

			prot = current->protection;
			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
				prot &= ~VM_PROT_WRITE;
			} else {
				assert(!VME_OBJECT(current)->code_signed);
				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
				if (prot & VM_PROT_WRITE) {
					/*
					 * For write requests on the
					 * compressor, we will ask the
					 * pmap layer to prevent us from
					 * taking a write fault when we
					 * attempt to access the mapping
					 * next.
					 */
					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
				}
			}

			if (override_nx(map, VME_ALIAS(current)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}

#if DEVELOPMENT || DEBUG
			if (!(old_prot & VM_PROT_EXECUTE) &&
			    (prot & VM_PROT_EXECUTE) &&
			    panic_on_unsigned_execute &&
			    (proc_selfcsflags() & CS_KILL)) {
				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
			}
#endif /* DEVELOPMENT || DEBUG */

			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
				if (current->wired_count) {
					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
				}

				/* If the pmap layer cares about this
				 * protection type, force a fault for
				 * each page so that vm_fault will
				 * repopulate the page with the full
				 * set of protections.
				 */
				/*
				 * TODO: We don't seem to need this,
				 * but this is due to an internal
				 * implementation detail of
				 * pmap_protect.  Do we want to rely
				 * on this?
				 */
				prot = VM_PROT_NONE;
			}

			if (current->is_sub_map && current->use_pmap) {
				pmap_protect(VME_SUBMAP(current)->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot);
			} else {
				pmap_protect_options(map->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot,
				    pmap_options,
				    NULL);
			}
		}
		current = current->vme_next;
	}

	if (entry == VM_MAP_ENTRY_NULL) {
		/*
		 * Re-lookup the original start of our range.
		 * If it's no longer mapped, start with the next mapping.
		 */
		vm_map_lookup_entry_or_next(map, original_start, &entry);
	}
	/* coalesce adjacent entries that now carry identical attributes */
	current = entry;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start <= end)) {
		vm_map_simplify_entry(map, current);
		current = current->vme_next;
	}

	vm_map_unlock(map);
	return KERN_SUCCESS;
}
6090
6091 static __attribute__((always_inline, warn_unused_result))
6092 kern_return_t
vm_map_inherit_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_inherit_ut new_inheritance_u,vm_map_offset_t * start,vm_map_offset_t * end,vm_inherit_t * new_inheritance)6093 vm_map_inherit_sanitize(
6094 vm_map_t map,
6095 vm_map_offset_ut start_u,
6096 vm_map_offset_ut end_u,
6097 vm_inherit_ut new_inheritance_u,
6098 vm_map_offset_t *start,
6099 vm_map_offset_t *end,
6100 vm_inherit_t *new_inheritance)
6101 {
6102 kern_return_t kr;
6103 vm_map_size_t size;
6104
6105 kr = vm_sanitize_inherit(new_inheritance_u,
6106 VM_SANITIZE_CALLER_VM_MAP_INHERIT, new_inheritance);
6107 if (__improbable(kr != KERN_SUCCESS)) {
6108 return kr;
6109 }
6110
6111 kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT,
6112 map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
6113 if (__improbable(kr != KERN_SUCCESS)) {
6114 return kr;
6115 }
6116
6117 return KERN_SUCCESS;
6118 }
6119
6120 /*
6121 * vm_map_inherit:
6122 *
6123 * Sets the inheritance of the specified address
6124 * range in the target map. Inheritance
6125 * affects how the map will be shared with
6126 * child maps at the time of vm_map_fork.
6127 */
6128 kern_return_t
vm_map_inherit(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_inherit_ut new_inheritance_u)6129 vm_map_inherit(
6130 vm_map_t map,
6131 vm_map_offset_ut start_u,
6132 vm_map_offset_ut end_u,
6133 vm_inherit_ut new_inheritance_u)
6134 {
6135 vm_map_entry_t entry;
6136 vm_map_entry_t temp_entry;
6137 kern_return_t kr;
6138 vm_map_offset_t start;
6139 vm_map_offset_t end;
6140 vm_inherit_t new_inheritance;
6141
6142 kr = vm_map_inherit_sanitize(map,
6143 start_u,
6144 end_u,
6145 new_inheritance_u,
6146 &start,
6147 &end,
6148 &new_inheritance);
6149 if (__improbable(kr != KERN_SUCCESS)) {
6150 return vm_sanitize_get_kr(kr);
6151 }
6152
6153 vm_map_lock(map);
6154
6155 VM_MAP_RANGE_CHECK(map, start, end);
6156
6157 if (vm_map_lookup_entry(map, start, &temp_entry)) {
6158 entry = temp_entry;
6159 } else {
6160 temp_entry = temp_entry->vme_next;
6161 entry = temp_entry;
6162 }
6163
6164 /* first check entire range for submaps which can't support the */
6165 /* given inheritance. */
6166 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6167 if (entry->is_sub_map) {
6168 if (new_inheritance == VM_INHERIT_COPY) {
6169 vm_map_unlock(map);
6170 return KERN_INVALID_ARGUMENT;
6171 }
6172 }
6173
6174 entry = entry->vme_next;
6175 }
6176
6177 entry = temp_entry;
6178 if (entry != vm_map_to_entry(map)) {
6179 /* clip and unnest if necessary */
6180 vm_map_clip_start(map, entry, start);
6181 }
6182
6183 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6184 vm_map_clip_end(map, entry, end);
6185 if (entry->is_sub_map) {
6186 /* clip did unnest if needed */
6187 assert(!entry->use_pmap);
6188 }
6189
6190 entry->inheritance = new_inheritance;
6191
6192 entry = entry->vme_next;
6193 }
6194
6195 vm_map_unlock(map);
6196 return KERN_SUCCESS;
6197 }
6198
6199 /*
6200 * Update the accounting for the amount of wired memory in this map. If the user has
6201 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6202 */
6203
/*
 * Returns KERN_SUCCESS on success; KERN_RESOURCE_SHORTAGE when a user wire
 * request would exceed the per-task or global user wire limits; KERN_FAILURE
 * when a user-path wire count would overflow MAX_WIRE_COUNT.  The kernel
 * path panics instead of failing on wire count overflow.
 * Caller is expected to hold the map lock (this mutates entry and map
 * accounting fields without taking any lock itself).
 */
static kern_return_t
add_wire_counts(
	vm_map_t        map,
	vm_map_entry_t  entry,
	boolean_t       user_wire)
{
	vm_map_size_t   size;

	/* True when this entry currently holds no wire reference of either kind. */
	bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;

	if (user_wire) {
		unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;

		/*
		 * We're wiring memory at the request of the user. Check if this is the first time the user is wiring
		 * this map entry.
		 */

		if (entry->user_wired_count == 0) {
			size = entry->vme_end - entry->vme_start;

			/*
			 * Since this is the first time the user is wiring this map entry, check to see if we're
			 * exceeding the user wire limits. There is a per map limit which is the smaller of either
			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also
			 * a system-wide limit on the amount of memory all users can wire. If the user is over either
			 * limit, then we fail.
			 */

			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
				/*
				 * Distinguish which limit was hit purely for
				 * the telemetry counters (and optional panic).
				 */
				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
				} else {
					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
				}
				return KERN_RESOURCE_SHORTAGE;
			}

			/*
			 * The first time the user wires an entry, we also increment the wired_count and add this to
			 * the total that has been wired in the map.
			 */

			if (entry->wired_count >= MAX_WIRE_COUNT) {
				return KERN_FAILURE;
			}

			entry->wired_count++;
			map->user_wire_size += size;
		}

		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
			return KERN_FAILURE;
		}

		entry->user_wired_count++;
	} else {
		/*
		 * The kernel's wiring the memory. Just bump the count and continue.
		 */

		if (entry->wired_count >= MAX_WIRE_COUNT) {
			panic("vm_map_wire: too many wirings");
		}

		entry->wired_count++;
	}

	/* Record a backtrace ref on the transition from unwired to wired. */
	if (first_wire) {
		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
	}

	return KERN_SUCCESS;
}
6289
6290 /*
6291 * Update the memory wiring accounting now that the given map entry is being unwired.
6292 */
6293
6294 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6295 subtract_wire_counts(
6296 vm_map_t map,
6297 vm_map_entry_t entry,
6298 boolean_t user_wire)
6299 {
6300 if (user_wire) {
6301 /*
6302 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6303 */
6304
6305 if (entry->user_wired_count == 1) {
6306 /*
6307 * We're removing the last user wire reference. Decrement the wired_count and the total
6308 * user wired memory for this map.
6309 */
6310
6311 assert(entry->wired_count >= 1);
6312 entry->wired_count--;
6313 map->user_wire_size -= entry->vme_end - entry->vme_start;
6314 }
6315
6316 assert(entry->user_wired_count >= 1);
6317 entry->user_wired_count--;
6318 } else {
6319 /*
6320 * The kernel is unwiring the memory. Just update the count.
6321 */
6322
6323 assert(entry->wired_count >= 1);
6324 entry->wired_count--;
6325 }
6326
6327 vme_btref_consider_and_put(entry);
6328 }
6329
/* Counter of wire requests rejected because the range was executable (bumped in vm_map_wire_nested). */
int cs_executable_wire = 0;
6331
/*
 * vm_map_wire_nested:
 *
 * Wire down the address range [start, end) in "map": fault the pages in
 * and take wire references so they stay resident, recursing into any
 * submaps the range covers.
 *
 * caller_prot: access the caller needs to the wired pages.
 * tag:         allocation tag for wired-memory accounting.
 * user_wire:   TRUE for user (mlock-style) wiring: limit-checked,
 *              interruptible, counted in user_wired_count; FALSE for
 *              kernel wiring.
 * map_pmap/pmap_addr: non-NULL only on recursive (submap) calls; name
 *              the physical map and address the pages must be entered in.
 * physpage_p:  when non-NULL ("wire and extract"), the range must be
 *              exactly one map page and the wired page's physical page
 *              number is returned through it (0 on failure).
 *
 * Called with "map" unlocked; takes and drops the map lock internally
 * (and may drop/retake it around faults and submap recursion).  On
 * failure, wiring already performed for [start, s) is undone before
 * returning.
 */
static kern_return_t
vm_map_wire_nested(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_prot_t       caller_prot,
	vm_tag_t        tag,
	boolean_t       user_wire,
	pmap_t          map_pmap,
	vm_map_offset_t pmap_addr,
	ppnum_t         *physpage_p)
{
	vm_map_entry_t          entry;
	vm_prot_t               access_type;
	struct vm_map_entry     *first_entry, tmp_entry;
	vm_map_t                real_map;
	vm_map_offset_t         s, e;
	kern_return_t           rc;
	boolean_t               need_wakeup;
	boolean_t               main_map = FALSE;
	wait_interrupt_t        interruptible_state;
	thread_t                cur_thread;
	unsigned int            last_timestamp;
	vm_map_size_t           size;
	boolean_t               wire_and_extract;
	vm_prot_t               extra_prots;

	/*
	 * Extra protection bits for the vm_map_lookup_and_lock_object()
	 * call below: always request a copy, and by default refuse to copy
	 * executable mappings (to preserve code signing).  The executable
	 * restriction is lifted for the kernel pmap, for maps without CS
	 * enforcement, and for CSM-exempt address spaces.
	 */
	extra_prots = VM_PROT_COPY;
	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
#if XNU_TARGET_OS_OSX
	if (map->pmap == kernel_pmap ||
	    !vm_map_cs_enforcement(map)) {
		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
	}
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
	}
#endif /* CODE_SIGNING_MONITOR */

	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));

	wire_and_extract = FALSE;
	if (physpage_p != NULL) {
		/*
		 * The caller wants the physical page number of the
		 * wired page. We return only one physical page number
		 * so this works for only one page at a time.
		 *
		 * The only caller (vm_map_wire_and_extract)
		 * guarantees it.
		 */
		assert(end - start == VM_MAP_PAGE_SIZE(map));
		wire_and_extract = TRUE;
		*physpage_p = 0;
	}

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
	if (start == end) {
		/* We wired what the caller asked for, zero pages */
		return KERN_SUCCESS;
	}

	vm_map_lock(map);
	if (map_pmap == NULL) {
		/* top-level call, not a submap recursion */
		main_map = TRUE;
	}
	last_timestamp = map->timestamp;

	need_wakeup = FALSE;
	cur_thread = current_thread();

	s = start;
	rc = KERN_SUCCESS;

	if (vm_map_lookup_entry(map, s, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested submaps here !
		 */
	} else {
		/* Start address is not in map */
		rc = KERN_INVALID_ADDRESS;
		goto done;
	}

	while ((entry != vm_map_to_entry(map)) && (s < end)) {
		/*
		 * At this point, we have wired from "start" to "s".
		 * We still need to wire from "s" to "end".
		 *
		 * "entry" hasn't been clipped, so it could start before "s"
		 * and/or end after "end".
		 */

		/* "e" is how far we want to wire in this entry */
		e = entry->vme_end;
		if (e > end) {
			e = end;
		}

		/*
		 * If another thread is wiring/unwiring this entry then
		 * block after informing other thread to wake us up.
		 */
		if (entry->in_transition) {
			wait_result_t wait_result;

			/*
			 * We have not clipped the entry. Make sure that
			 * the start address is in range so that the lookup
			 * below will succeed.
			 * "s" is the current starting point: we've already
			 * wired from "start" to "s" and we still have
			 * to wire from "s" to "end".
			 */

			entry->needs_wakeup = TRUE;

			/*
			 * wake up anybody waiting on entries that we have
			 * already wired.
			 */
			if (need_wakeup) {
				vm_map_entry_wakeup(map);
				need_wakeup = FALSE;
			}
			/*
			 * User wiring is interruptible
			 */
			wait_result = vm_map_entry_wait(map,
			    (user_wire) ? THREAD_ABORTSAFE :
			    THREAD_UNINT);
			if (user_wire && wait_result == THREAD_INTERRUPTED) {
				/*
				 * undo the wirings we have done so far
				 * We do not clear the needs_wakeup flag,
				 * because we cannot tell if we were the
				 * only one waiting.
				 */
				rc = KERN_FAILURE;
				goto done;
			}

			/*
			 * Cannot avoid a lookup here. reset timestamp.
			 */
			last_timestamp = map->timestamp;

			/*
			 * The entry could have been clipped, look it up again.
			 * Worse that can happen is, it may not exist anymore.
			 */
			if (!vm_map_lookup_entry(map, s, &first_entry)) {
				/*
				 * User: undo everything upto the previous
				 * entry. let vm_map_unwire worry about
				 * checking the validity of the range.
				 */
				rc = KERN_FAILURE;
				goto done;
			}
			entry = first_entry;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_start;
			vm_map_offset_t local_end;
			pmap_t          pmap;

			if (wire_and_extract) {
				/*
				 * Wiring would result in copy-on-write
				 * which would not be compatible with
				 * the sharing we have with the original
				 * provider of this memory.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}

			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);

			/* translate the clipped range into submap addresses */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end;
			sub_end += VME_OFFSET(entry) - entry->vme_start;

			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				vm_object_t             object;
				vm_object_offset_t      offset;
				vm_prot_t               prot;
				boolean_t               wired;
				vm_map_entry_t          local_entry;
				vm_map_version_t        version;
				vm_map_t                lookup_map;

				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					/* ppc implementation requires that */
					/* submaps pmap address ranges line */
					/* up with parent map */
#ifdef notdef
					pmap_addr = sub_start;
#endif
					pmap_addr = s;
				} else {
					pmap = map->pmap;
					pmap_addr = s;
				}

				if (entry->wired_count) {
					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
						goto done;
					}

					/*
					 * The map was not unlocked:
					 * no need to goto re-lookup.
					 * Just go directly to next entry.
					 */
					entry = entry->vme_next;
					s = entry->vme_start;
					continue;
				}

				/* call vm_map_lookup_and_lock_object to */
				/* cause any needs copy to be */
				/* evaluated */
				local_start = entry->vme_start;
				lookup_map = map;
				vm_map_lock_write_to_read(map);
				rc = vm_map_lookup_and_lock_object(
					&lookup_map, local_start,
					(access_type | extra_prots),
					OBJECT_LOCK_EXCLUSIVE,
					&version, &object,
					&offset, &prot, &wired,
					NULL,
					&real_map, NULL);
				if (rc != KERN_SUCCESS) {
					/*
					 * Lookup failed: undo partial wiring
					 * and return (map read lock already
					 * dropped here).
					 */
					vm_map_unlock_read(lookup_map);
					assert(map_pmap == NULL);
					vm_map_unwire_nested(map, start,
					    s, user_wire, PMAP_NULL, 0);
					return rc;
				}
				vm_object_unlock(object);
				if (real_map != lookup_map) {
					vm_map_unlock(real_map);
				}
				vm_map_unlock_read(lookup_map);
				vm_map_lock(map);

				/* we unlocked, so must re-lookup */
				if (!vm_map_lookup_entry(map,
				    local_start,
				    &local_entry)) {
					rc = KERN_FAILURE;
					goto done;
				}

				/*
				 * entry could have been "simplified",
				 * so re-clip
				 */
				entry = local_entry;
				assert(s == local_start);
				vm_map_clip_start(map, entry, s);
				vm_map_clip_end(map, entry, end);
				/* re-compute "e" */
				e = entry->vme_end;
				if (e > end) {
					e = end;
				}

				/* did we have a change of type? */
				if (!entry->is_sub_map) {
					last_timestamp = map->timestamp;
					continue;
				}
			} else {
				local_start = entry->vme_start;
				pmap = map_pmap;
			}

			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
				goto done;
			}

			entry->in_transition = TRUE;

			/* recurse into the submap with the map unlocked */
			vm_map_unlock(map);
			rc = vm_map_wire_nested(VME_SUBMAP(entry),
			    sub_start, sub_end,
			    caller_prot, tag,
			    user_wire, pmap, pmap_addr,
			    NULL);
			vm_map_lock(map);

			/*
			 * Find the entry again. It could have been clipped
			 * after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, local_start,
			    &first_entry)) {
				panic("vm_map_wire: re-lookup failed");
			}
			entry = first_entry;

			assert(local_start == s);
			/* re-compute "e" */
			e = entry->vme_end;
			if (e > end) {
				e = end;
			}

			last_timestamp = map->timestamp;
			/*
			 * Clear in_transition on every entry covering the
			 * recursed range, waking waiters and rolling back
			 * wire counts if the recursion failed.
			 */
			while ((entry != vm_map_to_entry(map)) &&
			    (entry->vme_start < e)) {
				assert(entry->in_transition);
				entry->in_transition = FALSE;
				if (entry->needs_wakeup) {
					entry->needs_wakeup = FALSE;
					need_wakeup = TRUE;
				}
				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
					subtract_wire_counts(map, entry, user_wire);
				}
				entry = entry->vme_next;
			}
			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
				goto done;
			}

			/* no need to relookup again */
			s = entry->vme_start;
			continue;
		}

		/*
		 * If this entry is already wired then increment
		 * the appropriate wire reference count.
		 */
		if (entry->wired_count) {
			if ((entry->protection & access_type) != access_type) {
				/* found a protection problem */

				/*
				 * XXX FBDP
				 * We should always return an error
				 * in this case but since we didn't
				 * enforce it before, let's do
				 * it only for the new "wire_and_extract"
				 * code path for now...
				 */
				if (wire_and_extract) {
					rc = KERN_PROTECTION_FAILURE;
					goto done;
				}
			}

			/*
			 * entry is already wired down, get our reference
			 * after clipping to our range.
			 */
			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);

			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
				goto done;
			}

			if (wire_and_extract) {
				vm_object_t             object;
				vm_object_offset_t      offset;
				vm_page_t               m;

				/*
				 * We don't have to "wire" the page again
				 * bit we still have to "extract" its
				 * physical page number, after some sanity
				 * checks.
				 */
				assert((entry->vme_end - entry->vme_start)
				    == PAGE_SIZE);
				assert(!entry->needs_copy);
				assert(!entry->is_sub_map);
				assert(VME_OBJECT(entry));
				if (((entry->vme_end - entry->vme_start)
				    != PAGE_SIZE) ||
				    entry->needs_copy ||
				    entry->is_sub_map ||
				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
					rc = KERN_INVALID_ARGUMENT;
					goto done;
				}

				object = VME_OBJECT(entry);
				offset = VME_OFFSET(entry);
				/* need exclusive lock to update m->dirty */
				if (entry->protection & VM_PROT_WRITE) {
					vm_object_lock(object);
				} else {
					vm_object_lock_shared(object);
				}
				m = vm_page_lookup(object, offset);
				assert(m != VM_PAGE_NULL);
				assert(VM_PAGE_WIRED(m));
				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
					if (entry->protection & VM_PROT_WRITE) {
						vm_object_lock_assert_exclusive(
							object);
						m->vmp_dirty = TRUE;
					}
				} else {
					/* not already wired !? */
					*physpage_p = 0;
				}
				vm_object_unlock(object);
			}

			/* map was not unlocked: no need to relookup */
			entry = entry->vme_next;
			s = entry->vme_start;
			continue;
		}

		/*
		 * Unwired entry or wire request transmitted via submap
		 */

		/*
		 * Wiring would copy the pages to the shadow object.
		 * The shadow object would not be code-signed so
		 * attempting to execute code from these copied pages
		 * would trigger a code-signing violation.
		 */

		if ((entry->protection & VM_PROT_EXECUTE)
#if XNU_TARGET_OS_OSX
		    &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    )
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
		    &&
		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
#endif
		    ) {
#if MACH_ASSERT
			printf("pid %d[%s] wiring executable range from "
			    "0x%llx to 0x%llx: rejected to preserve "
			    "code-signing\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    (uint64_t) entry->vme_start,
			    (uint64_t) entry->vme_end);
#endif /* MACH_ASSERT */
			DTRACE_VM2(cs_executable_wire,
			    uint64_t, (uint64_t)entry->vme_start,
			    uint64_t, (uint64_t)entry->vme_end);
			cs_executable_wire++;
			rc = KERN_PROTECTION_FAILURE;
			goto done;
		}

		/*
		 * Perform actions of vm_map_lookup that need the write
		 * lock on the map: create a shadow object for a
		 * copy-on-write region, or an object for a zero-fill
		 * region.
		 */
		size = entry->vme_end - entry->vme_start;
		/*
		 * If wiring a copy-on-write page, we need to copy it now
		 * even if we're only (currently) requesting read access.
		 * This is aggressive, but once it's wired we can't move it.
		 */
		if (entry->needs_copy) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should not be "needs_copy"
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}

			VME_OBJECT_SHADOW(entry, size,
			    vm_map_always_shadow(map));
			entry->needs_copy = FALSE;
		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should already have an object.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}
			/* zero-fill region: give it a fresh backing object */
			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
			assert(entry->use_pmap);
		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should not be COPY_SYMMETRIC.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}
			/*
			 * Force an unrequested "copy-on-write" but only for
			 * the range we're wiring.
			 */
//			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);
			/* recompute "size" */
			size = entry->vme_end - entry->vme_start;
			/* make a shadow object */
			vm_object_t             orig_object;
			vm_object_offset_t      orig_offset;
			orig_object = VME_OBJECT(entry);
			orig_offset = VME_OFFSET(entry);
			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
			if (VME_OBJECT(entry) != orig_object) {
				/*
				 * This mapping has not been shared (or it would be
				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
				 * not been copied-on-write (or it would be marked
				 * as "needs_copy" and would have been handled above
				 * and also already write-protected).
				 * We still need to write-protect here to prevent
				 * other threads from modifying these pages while
				 * we're in the process of copying and wiring
				 * the copied pages.
				 * Since the mapping is neither shared nor COWed,
				 * we only need to write-protect the PTEs for this
				 * mapping.
				 */
				vm_object_pmap_protect(orig_object,
				    orig_offset,
				    size,
				    map->pmap,
				    VM_MAP_PAGE_SIZE(map),
				    entry->vme_start,
				    entry->protection & ~VM_PROT_WRITE);
			}
		}
		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			/*
			 * Make the object COPY_DELAY to get a stable object
			 * to wire.
			 * That should avoid creating long shadow chains while
			 * wiring/unwiring the same range repeatedly.
			 * That also prevents part of the object from being
			 * wired while another part is "needs_copy", which
			 * could result in conflicting rules wrt copy-on-write.
			 */
			vm_object_t     object;

			object = VME_OBJECT(entry);
			vm_object_lock(object);
			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
				    object, (uint64_t)object->vo_size,
				    entry,
				    (uint64_t)entry->vme_start,
				    (uint64_t)entry->vme_end,
				    (uint64_t)VME_OFFSET(entry),
				    (uint64_t)size);
				assertf(os_ref_get_count_raw(&object->ref_count) == 1,
				    "object %p ref_count %d\n",
				    object, os_ref_get_count_raw(&object->ref_count));
				assertf(!entry->needs_copy,
				    "entry %p\n", entry);
				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
			}
			vm_object_unlock(object);
		}

		vm_map_clip_start(map, entry, s);
		vm_map_clip_end(map, entry, end);

		/* re-compute "e" */
		e = entry->vme_end;
		if (e > end) {
			e = end;
		}

		/*
		 * Check for holes and protection mismatch.
		 * Holes: Next entry should be contiguous unless this
		 * is the end of the region.
		 * Protection: Access requested must be allowed, unless
		 * wiring is by protection class
		 */
		if ((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end))) {
			/* found a hole */
			rc = KERN_INVALID_ADDRESS;
			goto done;
		}
		if ((entry->protection & access_type) != access_type) {
			/* found a protection problem */
			rc = KERN_PROTECTION_FAILURE;
			goto done;
		}

		assert(entry->wired_count == 0 && entry->user_wired_count == 0);

		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
			goto done;
		}

		entry->in_transition = TRUE;

		/*
		 * This entry might get split once we unlock the map.
		 * In vm_fault_wire(), we need the current range as
		 * defined by this entry. In order for this to work
		 * along with a simultaneous clip operation, we make a
		 * temporary copy of this entry and use that for the
		 * wiring. Note that the underlying objects do not
		 * change during a clip.
		 */
		tmp_entry = *entry;

		/*
		 * The in_transition state guarantees that the entry
		 * (or entries for this range, if split occurred) will be
		 * there when the map lock is acquired for the second time.
		 */
		vm_map_unlock(map);

		/* Kernel wiring must not be interrupted mid-fault. */
		if (!user_wire && cur_thread != THREAD_NULL) {
			interruptible_state = thread_interrupt_level(THREAD_UNINT);
		} else {
			interruptible_state = THREAD_UNINT;
		}

		if (map_pmap) {
			rc = vm_fault_wire(map,
			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
			    physpage_p);
		} else {
			rc = vm_fault_wire(map,
			    &tmp_entry, caller_prot, tag, map->pmap,
			    tmp_entry.vme_start,
			    physpage_p);
		}

		if (!user_wire && cur_thread != THREAD_NULL) {
			thread_interrupt_level(interruptible_state);
		}

		vm_map_lock(map);

		/*
		 * If anyone else touched the map while we had it unlocked
		 * (timestamp advanced by more than our own unlock/lock),
		 * our entry pointer may be stale.
		 */
		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again. It could have been clipped
			 * after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				panic("vm_map_wire: re-lookup failed");
			}

			entry = first_entry;
		}

		last_timestamp = map->timestamp;

		/*
		 * Clear in_transition across the faulted range, waking
		 * waiters and rolling back wire counts on failure.
		 */
		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
				subtract_wire_counts(map, entry, user_wire);
			}
			entry = entry->vme_next;
		}

		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
			goto done;
		}

		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
		    (tmp_entry.vme_end != end) &&       /* AND, we are not at the end of the requested range */
		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
			/* found a "new" hole */
			s = tmp_entry.vme_end;
			rc = KERN_INVALID_ADDRESS;
			goto done;
		}

		s = entry->vme_start;
	} /* end while loop through map entries */

done:
	if (rc == KERN_SUCCESS) {
		/* repair any damage we may have made to the VM map */
		vm_map_simplify_range(map, start, end);
	}

	vm_map_unlock(map);

	/*
	 * wake up anybody waiting on entries we wired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}

	if (rc != KERN_SUCCESS) {
		/* undo what has been wired so far */
		vm_map_unwire_nested(map, start, s, user_wire,
		    map_pmap, pmap_addr);
		if (physpage_p) {
			*physpage_p = 0;
		}
	}

	return rc;
}
7082
7083 static __attribute__((always_inline, warn_unused_result))
7084 kern_return_t
vm_map_wire_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_size_t * size,vm_prot_t * prot)7085 vm_map_wire_sanitize(
7086 vm_map_t map,
7087 vm_map_offset_ut start_u,
7088 vm_map_offset_ut end_u,
7089 vm_prot_ut prot_u,
7090 vm_sanitize_caller_t vm_sanitize_caller,
7091 vm_map_offset_t *start,
7092 vm_map_offset_t *end,
7093 vm_map_size_t *size,
7094 vm_prot_t *prot)
7095 {
7096 kern_return_t kr;
7097
7098 kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7099 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7100 size);
7101 if (__improbable(kr != KERN_SUCCESS)) {
7102 return kr;
7103 }
7104
7105 kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot);
7106 if (__improbable(kr != KERN_SUCCESS)) {
7107 return kr;
7108 }
7109
7110 return KERN_SUCCESS;
7111 }
7112
7113 /*
7114 * Validation function for vm_map_wire_nested().
7115 */
7116 kern_return_t
vm_map_wire_impl(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_tag_t tag,boolean_t user_wire,ppnum_t * physpage_p,vm_sanitize_caller_t vm_sanitize_caller)7117 vm_map_wire_impl(
7118 vm_map_t map,
7119 vm_map_offset_ut start_u,
7120 vm_map_offset_ut end_u,
7121 vm_prot_ut prot_u,
7122 vm_tag_t tag,
7123 boolean_t user_wire,
7124 ppnum_t *physpage_p,
7125 vm_sanitize_caller_t vm_sanitize_caller)
7126 {
7127 vm_map_offset_t start, end;
7128 vm_map_size_t size;
7129 vm_prot_t prot;
7130 kern_return_t kr;
7131
7132 /*
7133 * Sanitize any input parameters that are addr/size/prot/inherit
7134 */
7135 kr = vm_map_wire_sanitize(map,
7136 start_u,
7137 end_u,
7138 prot_u,
7139 vm_sanitize_caller,
7140 &start,
7141 &end,
7142 &size,
7143 &prot);
7144 if (__improbable(kr != KERN_SUCCESS)) {
7145 if (physpage_p) {
7146 *physpage_p = 0;
7147 }
7148 return vm_sanitize_get_kr(kr);
7149 }
7150
7151 return vm_map_wire_nested(map, start, end, prot, tag, user_wire,
7152 PMAP_NULL, 0, physpage_p);
7153 }
7154
7155 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,boolean_t user_wire)7156 vm_map_wire_external(
7157 vm_map_t map,
7158 vm_map_offset_ut start_u,
7159 vm_map_offset_ut end_u,
7160 vm_prot_ut prot_u,
7161 boolean_t user_wire)
7162 {
7163 vm_tag_t tag = vm_tag_bt();
7164
7165 return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire);
7166 }
7167
7168 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_tag_t tag,boolean_t user_wire)7169 vm_map_wire_kernel(
7170 vm_map_t map,
7171 vm_map_offset_ut start_u,
7172 vm_map_offset_ut end_u,
7173 vm_prot_ut prot_u,
7174 vm_tag_t tag,
7175 boolean_t user_wire)
7176 {
7177 return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7178 user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7179 }
7180
7181 #if XNU_PLATFORM_MacOSX
7182
7183 kern_return_t
vm_map_wire_and_extract(vm_map_t map,vm_map_offset_ut start_u,vm_prot_ut prot_u,boolean_t user_wire,ppnum_t * physpage_p)7184 vm_map_wire_and_extract(
7185 vm_map_t map,
7186 vm_map_offset_ut start_u,
7187 vm_prot_ut prot_u,
7188 boolean_t user_wire,
7189 ppnum_t *physpage_p)
7190 {
7191 vm_tag_t tag = vm_tag_bt();
7192 vm_map_size_ut size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map));
7193 vm_map_offset_ut end_u = vm_sanitize_compute_ut_end(start_u, size_u);
7194
7195 return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7196 user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7197 }
7198
7199 #endif /* XNU_PLATFORM_MacOSX */
7200
/*
 *	vm_map_unwire_nested:
 *
 *	Unwire the range [start, end) of "map", recursing into submaps.
 *
 *	"user_wire" distinguishes user-initiated unwiring (errors are
 *	returned) from kernel unwiring (inconsistencies panic).
 *	"map_pmap"/"pmap_addr" are non-NULL/meaningful only on recursive
 *	calls, naming the physical map (and base address in it) that the
 *	pages were actually wired into.
 *
 *	Called and returns with "map" unlocked; drops and retakes the map
 *	lock internally around the pmap work, using the "in_transition"
 *	marker and the map timestamp to detect concurrent changes.
 */
static kern_return_t
vm_map_unwire_nested(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	boolean_t       user_wire,
	pmap_t          map_pmap,
	vm_map_offset_t pmap_addr)
{
	vm_map_entry_t  entry;
	struct vm_map_entry     *first_entry, tmp_entry;
	boolean_t       need_wakeup;
	boolean_t       main_map = FALSE;
	unsigned int    last_timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We unwired what the caller asked for: zero pages */
		return KERN_SUCCESS;
	}

	vm_map_lock(map);
	/* a NULL map_pmap means this is the top-level (non-recursive) call */
	if (map_pmap == NULL) {
		main_map = TRUE;
	}
	/* snapshot the timestamp to detect map changes while unlocked */
	last_timestamp = map->timestamp;

	if (vm_map_lookup_entry(map, start, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested sub maps here !
		 */
	} else {
		if (!user_wire) {
			panic("vm_map_unwire: start not found");
		}
		/* Start address is not in map. */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (entry->superpage_size) {
		/* superpages are always wired */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	need_wakeup = FALSE;
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->in_transition) {
			/*
			 * 1)
			 * Another thread is wiring down this entry. Note
			 * that if it is not for the other thread we would
			 * be unwiring an unwired entry.  This is not
			 * permitted.  If we wait, we will be unwiring memory
			 * we did not wire.
			 *
			 * 2)
			 * Another thread is unwiring this entry.  We did not
			 * have a reference to it, because if we did, this
			 * entry will not be getting unwired now.
			 */
			if (!user_wire) {
				/*
				 * XXX FBDP
				 * This could happen:  there could be some
				 * overlapping vslock/vsunlock operations
				 * going on.
				 * We should probably just wait and retry,
				 * but then we have to be careful that this
				 * entry could get "simplified" after
				 * "in_transition" gets unset and before
				 * we re-lookup the entry, so we would
				 * have to re-clip the entry to avoid
				 * re-unwiring what we have already unwired...
				 * See vm_map_wire_nested().
				 *
				 * Or we could just ignore "in_transition"
				 * here and proceed to decrement the wired
				 * count(s) on this entry.  That should be fine
				 * as long as "wired_count" doesn't drop all
				 * the way to 0 (and we should panic if THAT
				 * happens).
				 */
				panic("vm_map_unwire: in_transition entry");
			}

			entry = entry->vme_next;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;
			pmap_t          pmap;

			/* clip the entry to the requested range before recursing */
			vm_map_clip_start(map, entry, start);
			vm_map_clip_end(map, entry, end);

			/* translate the clipped range into submap coordinates */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end - entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				/*
				 * Top-level call: pick the pmap the submap's
				 * pages were wired into (the submap's own pmap
				 * if it is "nested", otherwise this map's).
				 */
				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					pmap_addr = sub_start;
				} else {
					pmap = map->pmap;
					pmap_addr = start;
				}
				if (entry->wired_count == 0 ||
				    (user_wire && entry->user_wired_count == 0)) {
					if (!user_wire) {
						panic("vm_map_unwire: entry is unwired");
					}
					entry = entry->vme_next;
					continue;
				}

				/*
				 * Check for holes
				 * Holes: Next entry should be contiguous unless
				 * this is the end of the region.
				 */
				if (((entry->vme_end < end) &&
				    ((entry->vme_next == vm_map_to_entry(map)) ||
				    (entry->vme_next->vme_start
				    > entry->vme_end)))) {
					if (!user_wire) {
						panic("vm_map_unwire: non-contiguous region");
					}
					/*
					 *  entry = entry->vme_next;
					 *  continue;
					 */
				}

				subtract_wire_counts(map, entry, user_wire);

				/* still wired by someone else: nothing more to do here */
				if (entry->wired_count != 0) {
					entry = entry->vme_next;
					continue;
				}

				entry->in_transition = TRUE;
				tmp_entry = *entry;/* see comment in vm_map_wire() */

				/*
				 * We can unlock the map now. The in_transition state
				 * guarantees existence of the entry.
				 */
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, pmap, pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;

				/*
				 * clear transition bit for all constituent entries
				 * that were in the original entry (saved in
				 * tmp_entry).  Also check for waiters.
				 */
				while ((entry != vm_map_to_entry(map)) &&
				    (entry->vme_start < tmp_entry.vme_end)) {
					assert(entry->in_transition);
					entry->in_transition = FALSE;
					if (entry->needs_wakeup) {
						entry->needs_wakeup = FALSE;
						need_wakeup = TRUE;
					}
					entry = entry->vme_next;
				}
				continue;
			} else {
				/*
				 * Recursive call: the wired pmap was already
				 * chosen by an outer frame; just recurse with
				 * it and re-validate the entry afterwards.
				 */
				tmp_entry = *entry;
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, map_pmap,
				    pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;
			}
		}


		/* non-submap entry: validate its wired state */
		if ((entry->wired_count == 0) ||
		    (user_wire && entry->user_wired_count == 0)) {
			if (!user_wire) {
				panic("vm_map_unwire: entry is unwired");
			}

			entry = entry->vme_next;
			continue;
		}

		assert(entry->wired_count > 0 &&
		    (!user_wire || entry->user_wired_count > 0));

		vm_map_clip_start(map, entry, start);
		vm_map_clip_end(map, entry, end);

		/*
		 * Check for holes
		 * Holes: Next entry should be contiguous unless
		 *        this is the end of the region.
		 */
		if (((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end)))) {
			if (!user_wire) {
				panic("vm_map_unwire: non-contiguous region");
			}
			entry = entry->vme_next;
			continue;
		}

		subtract_wire_counts(map, entry, user_wire);

		/* other wirings remain: leave the pages wired in the pmap */
		if (entry->wired_count != 0) {
			entry = entry->vme_next;
			continue;
		}

		if (entry->zero_wired_pages) {
			entry->zero_wired_pages = FALSE;
		}

		entry->in_transition = TRUE;
		tmp_entry = *entry;     /* see comment in vm_map_wire() */

		/*
		 * We can unlock the map now. The in_transition state
		 * guarantees existence of the entry.
		 */
		vm_map_unlock(map);
		if (map_pmap) {
			vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
			    pmap_addr, tmp_entry.vme_end);
		} else {
			vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
			    tmp_entry.vme_start, tmp_entry.vme_end);
		}
		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * or deleted after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				if (!user_wire) {
					panic("vm_map_unwire: re-lookup failed");
				}
				entry = first_entry->vme_next;
			} else {
				entry = first_entry;
			}
		}
		last_timestamp = map->timestamp;

		/*
		 * clear transition bit for all constituent entries that
		 * were in the original entry (saved in tmp_entry).  Also
		 * check for waiters.
		 */
		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			entry = entry->vme_next;
		}
	}

	/*
	 * We might have fragmented the address space when we wired this
	 * range of addresses.  Attempt to re-coalesce these VM map entries
	 * with their neighbors now that they're no longer wired.
	 * Under some circumstances, address space fragmentation can
	 * prevent VM object shadow chain collapsing, which can cause
	 * swap space leaks.
	 */
	vm_map_simplify_range(map, start, end);

	vm_map_unlock(map);
	/*
	 * wake up anybody waiting on entries that we have unwired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}
	return KERN_SUCCESS;
}
7539
7540 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t user_wire)7541 vm_map_unwire(
7542 vm_map_t map,
7543 vm_map_offset_ut start_u,
7544 vm_map_offset_ut end_u,
7545 boolean_t user_wire)
7546 {
7547 return vm_map_unwire_impl(map, start_u, end_u, user_wire,
7548 VM_SANITIZE_CALLER_VM_MAP_UNWIRE);
7549 }
7550
7551 static __attribute__((always_inline, warn_unused_result))
7552 kern_return_t
vm_map_unwire_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_size_t * size)7553 vm_map_unwire_sanitize(
7554 vm_map_t map,
7555 vm_map_offset_ut start_u,
7556 vm_map_offset_ut end_u,
7557 vm_sanitize_caller_t vm_sanitize_caller,
7558 vm_map_offset_t *start,
7559 vm_map_offset_t *end,
7560 vm_map_size_t *size)
7561 {
7562 return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7563 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7564 size);
7565 }
7566
7567 kern_return_t
vm_map_unwire_impl(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t user_wire,vm_sanitize_caller_t vm_sanitize_caller)7568 vm_map_unwire_impl(
7569 vm_map_t map,
7570 vm_map_offset_ut start_u,
7571 vm_map_offset_ut end_u,
7572 boolean_t user_wire,
7573 vm_sanitize_caller_t vm_sanitize_caller)
7574 {
7575 vm_map_offset_t start, end;
7576 vm_map_size_t size;
7577 kern_return_t kr;
7578
7579 /*
7580 * Sanitize any input parameters that are addr/size/prot/inherit
7581 */
7582 kr = vm_map_unwire_sanitize(
7583 map,
7584 start_u,
7585 end_u,
7586 vm_sanitize_caller,
7587 &start,
7588 &end,
7589 &size);
7590 if (__improbable(kr != KERN_SUCCESS)) {
7591 return vm_sanitize_get_kr(kr);
7592 }
7593
7594 return vm_map_unwire_nested(map, start, end,
7595 user_wire, (pmap_t)NULL, 0);
7596 }
7597
7598
7599 /*
7600 * vm_map_entry_zap: [ internal use only ]
7601 *
7602 * Remove the entry from the target map
7603 * and put it on a zap list.
7604 */
7605 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7606 vm_map_entry_zap(
7607 vm_map_t map,
7608 vm_map_entry_t entry,
7609 vm_map_zap_t zap)
7610 {
7611 vm_map_offset_t s, e;
7612
7613 s = entry->vme_start;
7614 e = entry->vme_end;
7615 assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7616 assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7617 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7618 assert(page_aligned(s));
7619 assert(page_aligned(e));
7620 }
7621 if (entry->map_aligned == TRUE) {
7622 assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7623 assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7624 }
7625 assert(entry->wired_count == 0);
7626 assert(entry->user_wired_count == 0);
7627 assert(!entry->vme_permanent);
7628
7629 vm_map_store_entry_unlink(map, entry, false);
7630 map->size -= e - s;
7631
7632 vm_map_zap_append(zap, entry);
7633 }
7634
/*
 *	vm_map_submap_pmap_clean:
 *
 *	Remove the physical mappings backing the range [start, end) of
 *	"map" where that range is implemented by "sub_map" at "offset".
 *	Recurses into submaps of the submap.
 *
 *	Takes the submap's lock for reading; the caller is expected to
 *	hold whatever locks protect "map" itself.
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t   remove_size;
	vm_map_entry_t  entry;

	/* translate the range into the submap's address space */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/*
		 * First entry overlaps "offset": trim the removal size on
		 * both sides to the portion inside [submap_start, submap_end).
		 */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: recurse one level down */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * The top map may be shared by several pmaps:
				 * scrub the mappings from the object side so
				 * every pmap that maps it is cleaned.
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				/* only our pmap maps it: remove directly */
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}

		entry = entry->vme_next;
	}

	/*
	 * Walk the remaining submap entries inside the range.  Entries in
	 * this loop start at or after "offset", so only the tail needs
	 * trimming against "submap_end".
	 */
	while ((entry != vm_map_to_entry(sub_map))
	    && (entry->vme_start < submap_end)) {
		remove_size = (entry->vme_end - entry->vme_start);
		if (submap_end < entry->vme_end) {
			remove_size -= entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* translate back to "map" coordinates before recursing */
			vm_map_submap_pmap_clean(
				sub_map,
				(start + entry->vme_start) - offset,
				((start + entry->vme_start) - offset) + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					VME_OFFSET(entry),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)((start + entry->vme_start)
				    - offset),
				    (addr64_t)(((start + entry->vme_start)
				    - offset) + remove_size));
			}
		}
		entry = entry->vme_next;
	}
	vm_map_unlock_read(sub_map);
	return;
}
7734
7735 /*
7736 * virt_memory_guard_ast:
7737 *
7738 * Handle the AST callout for a virtual memory guard.
7739 * raise an EXC_GUARD exception and terminate the task
7740 * if configured to do so.
7741 */
void
virt_memory_guard_ast(
	thread_t thread,
	mach_exception_data_type_t code,
	mach_exception_data_type_t subcode)
{
	task_t task = get_threadtask(thread);
	assert(task != kernel_task);
	assert(task == current_task());
	kern_return_t sync_exception_result;
	uint32_t behavior;

	behavior = task->task_exc_guard;

	/* Is delivery enabled */
	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
		return;
	}

	/*
	 * If only once, make sure we're that once: atomically clear the
	 * DELIVER bit with CAS, retrying if another thread changed
	 * task_exc_guard underneath us.  The winner proceeds to deliver;
	 * a loser that observes DELIVER already cleared bails out.
	 */
	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;

		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
			break;
		}
		behavior = task->task_exc_guard;
		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			return;
		}
	}

	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
	/* Raise exception synchronously and see if handler claimed it */
	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);

	if (fatal) {
		/*
		 * If Synchronous EXC_GUARD delivery was successful then
		 * kill the process and return, else kill the process
		 * and deliver the exception via EXC_CORPSE_NOTIFY.
		 */


		int flags = PX_DEBUG_NO_HONOR;
		exception_info_t info = {
			.os_reason = OS_REASON_GUARD,
			.exception_type = EXC_GUARD,
			.mx_code = code,
			.mx_subcode = subcode
		};

		/* handler claimed it: plain signal, no corpse delivery */
		if (sync_exception_result == KERN_SUCCESS) {
			flags |= PX_PSIGNAL;
		}
		exit_with_mach_exception(current_proc(), info, flags);
	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
		/*
		 * If the synchronous EXC_GUARD delivery was not successful,
		 * raise a simulated crash.
		 */
		if (sync_exception_result != KERN_SUCCESS) {
			task_violated_guard(code, subcode, NULL, FALSE);
		}
	}
}
7808
7809 /*
7810 * vm_map_guard_exception:
7811 *
7812 * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7813 *
7814 * Right now, we do this when we find nothing mapped, or a
7815 * gap in the mapping when a user address space deallocate
7816 * was requested. We report the address of the first gap found.
7817 */
7818 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7819 vm_map_guard_exception(
7820 vm_map_offset_t gap_start,
7821 unsigned reason)
7822 {
7823 mach_exception_code_t code = 0;
7824 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7825 unsigned int target = 0; /* should we pass in pid associated with map? */
7826 mach_exception_data_type_t subcode = (uint64_t)gap_start;
7827 boolean_t fatal = FALSE;
7828
7829 task_t task = current_task_early();
7830
7831 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7832 if (task == NULL || task == kernel_task) {
7833 return;
7834 }
7835
7836 EXC_GUARD_ENCODE_TYPE(code, guard_type);
7837 EXC_GUARD_ENCODE_FLAVOR(code, reason);
7838 EXC_GUARD_ENCODE_TARGET(code, target);
7839
7840 if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7841 fatal = TRUE;
7842 }
7843 thread_guard_violation(current_thread(), code, subcode, fatal);
7844 }
7845
7846 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7847 vm_map_delete_submap_recurse(
7848 vm_map_t submap,
7849 vm_map_offset_t submap_start,
7850 vm_map_offset_t submap_end)
7851 {
7852 vm_map_entry_t submap_entry;
7853
7854 /*
7855 * Verify that the submap does not contain any "permanent" entries
7856 * within the specified range. We permit TPRO ranges to be overwritten
7857 * as we only reach this path if TPRO const protection is disabled for a
7858 * given map.
7859 *
7860 * We do not care about gaps.
7861 */
7862
7863 vm_map_lock(submap);
7864
7865 if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7866 submap_entry = submap_entry->vme_next;
7867 }
7868
7869 for (;
7870 submap_entry != vm_map_to_entry(submap) &&
7871 submap_entry->vme_start < submap_end;
7872 submap_entry = submap_entry->vme_next) {
7873 if (submap_entry->vme_permanent
7874 #ifdef __arm64e__
7875 /* allow TPRO submap entries to be overwritten */
7876 && !submap_entry->used_for_tpro
7877 #endif
7878 ) {
7879 /* "permanent" entry -> fail */
7880 vm_map_unlock(submap);
7881 return KERN_PROTECTION_FAILURE;
7882 }
7883 }
7884 /* no "permanent" entries in the range -> success */
7885 vm_map_unlock(submap);
7886 return KERN_SUCCESS;
7887 }
7888
/* Abort: vm_map_delete() was given a start address not aligned to the map's page size. */
__abortlike
static void
__vm_map_delete_misaligned_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
}
7899
/* Abort: a kernel-map deletion failed where failure is not tolerated. */
__abortlike
static void
__vm_map_delete_failed_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	kern_return_t   kr)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
	    map, (uint64_t)start, (uint64_t)end, kr);
}
7911
/* Abort: a gap (no map entry) was found at "where" while deleting from a kernel map. */
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t        map,
	vm_map_offset_t where,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
7923
/* Abort: an attempt was made to delete a "permanent" entry from a kernel map. */
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_entry_t  entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
7938
/* Internal state flags threaded through vm_map_delete()'s main loop. */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE           = 0x0000,

	VMDS_FOUND_GAP      = 0x0001,   /* an unmapped hole was found in the range */
	VMDS_GAPS_OK        = 0x0002,   /* map is being torn down; gaps are tolerated */

	VMDS_KERNEL_PMAP    = 0x0004,   /* deleting from the kernel pmap (strict checks) */
	VMDS_NEEDS_LOOKUP   = 0x0008,   /* map lock was dropped; re-lookup the entry */
	VMDS_NEEDS_WAKEUP   = 0x0010,   /* waiters exist on in-transition entries */
	VMDS_KERNEL_KMEMPTR = 0x0020    /* range belongs to a kmem pointer range (slot validation) */
});
7950
7951 /*
7952 * vm_map_clamp_to_pmap(map, start, end)
7953 *
7954 * Modify *start and *end so they fall within the bounds of map->pmap.
7955 */
#if MACH_ASSERT
static void
vm_map_clamp_to_pmap(vm_map_t map, vm_map_address_t *start, vm_map_address_t *end)
{
	/*
	 * Clamp *start and *end into the address bounds covered by
	 * map->pmap, so debug-only checks don't probe outside it.
	 */
	vm_map_address_t lo;
	vm_map_address_t hi;

#if __x86_64__
	/* x86_64 struct pmap does not have min and max fields */
	if (map->pmap == kernel_pmap) {
		lo = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
		hi = VM_MAX_KERNEL_ADDRESS;
	} else {
		lo = VM_MAP_MIN_ADDRESS;
		hi = VM_MAP_MAX_ADDRESS;
	}
#else
	lo = map->pmap->min;
	hi = map->pmap->max;
#endif

	if (*start > hi) {
		*start = hi;
	} else if (*start < lo) {
		*start = lo;
	}
	if (*end > hi) {
		*end = hi;
	} else if (*end < lo) {
		*end = lo;
	}
}
#endif
7989
7990 int vm_log_map_delete_permanent_prot_none = 0;
7991 /*
7992 * vm_map_delete: [ internal use only ]
7993 *
7994 * Deallocates the given address range from the target map.
7995 * Removes all user wirings. Unwires one kernel wiring if
7996 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
7997 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
7998 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7999 *
8000 *
8001 * When the map is a kernel map, then any error in removing mappings
8002 * will lead to a panic so that clients do not have to repeat the panic
8003 * code at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
8004 * is also passed, then KERN_ABORTED will not lead to a panic.
8005 *
8006 * This routine is called with map locked and leaves map locked.
8007 */
8008 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)8009 vm_map_delete(
8010 vm_map_t map,
8011 vm_map_offset_t start,
8012 vm_map_offset_t end,
8013 vmr_flags_t flags,
8014 kmem_guard_t guard,
8015 vm_map_zap_t zap_list)
8016 {
8017 vm_map_entry_t entry, next;
8018 int interruptible;
8019 vm_map_offset_t gap_start = 0;
8020 vm_map_offset_t clear_in_transition_end = 0;
8021 __unused vm_map_offset_t save_start = start;
8022 __unused vm_map_offset_t save_end = end;
8023 vm_map_delete_state_t state = VMDS_NONE;
8024 kmem_return_t ret = { };
8025 vm_map_range_id_t range_id = 0;
8026 struct kmem_page_meta *meta = NULL;
8027 uint32_t size_idx, slot_idx;
8028 struct mach_vm_range slot;
8029
8030 if (vm_map_pmap(map) == kernel_pmap) {
8031 state |= VMDS_KERNEL_PMAP;
8032 range_id = kmem_addr_get_range(start, end - start);
8033 if (kmem_is_ptr_range(range_id)) {
8034 state |= VMDS_KERNEL_KMEMPTR;
8035 slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8036 &size_idx, &slot);
8037 }
8038 }
8039
8040 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8041 state |= VMDS_GAPS_OK;
8042 }
8043
8044 if (map->corpse_source &&
8045 !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8046 !map->terminated) {
8047 /*
8048 * The map is being used for corpses related diagnostics.
8049 * So skip any entry removal to avoid perturbing the map state.
8050 * The cleanup will happen in task_terminate_internal after the
8051 * call to task_port_no_senders.
8052 */
8053 goto out;
8054 }
8055
8056 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8057 THREAD_ABORTSAFE : THREAD_UNINT;
8058
8059 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8060 (start & VM_MAP_PAGE_MASK(map))) {
8061 __vm_map_delete_misaligned_panic(map, start, end);
8062 }
8063
8064 if ((state & VMDS_GAPS_OK) == 0) {
8065 /*
8066 * If the map isn't terminated then all deletions must have
8067 * no gaps, and be within the [min, max) of the map.
8068 *
8069 * We got here without VM_MAP_RANGE_CHECK() being called,
8070 * and hence must validate bounds manually.
8071 *
8072 * It is worth noting that because vm_deallocate() will
8073 * round_page() the deallocation size, it's possible for "end"
8074 * to be 0 here due to overflow. We hence must treat it as being
8075 * beyond vm_map_max(map).
8076 *
		 * Similarly, end < start means some wrap around happened,
8078 * which should cause an error or panic.
8079 */
8080 if (end == 0 || end > vm_map_max(map)) {
8081 state |= VMDS_FOUND_GAP;
8082 gap_start = vm_map_max(map);
8083 if (state & VMDS_KERNEL_PMAP) {
8084 __vm_map_delete_gap_panic(map,
8085 gap_start, start, end);
8086 }
8087 goto out;
8088 }
8089
8090 if (end < start) {
8091 if (state & VMDS_KERNEL_PMAP) {
8092 __vm_map_delete_gap_panic(map,
8093 vm_map_max(map), start, end);
8094 }
8095 ret.kmr_return = KERN_INVALID_ARGUMENT;
8096 goto out;
8097 }
8098
8099 if (start < vm_map_min(map)) {
8100 state |= VMDS_FOUND_GAP;
8101 gap_start = start;
8102 if (state & VMDS_KERNEL_PMAP) {
8103 __vm_map_delete_gap_panic(map,
8104 gap_start, start, end);
8105 }
8106 goto out;
8107 }
8108 } else {
8109 /*
8110 * If the map is terminated, we must accept start/end
8111 * being beyond the boundaries of the map as this is
8112 * how some of the mappings like commpage mappings
8113 * can be destroyed (they're outside of those bounds).
8114 *
8115 * end < start is still something we can't cope with,
8116 * so just bail.
8117 */
8118 if (end < start) {
8119 goto out;
8120 }
8121 }
8122
8123
8124 /*
8125 * Find the start of the region.
8126 *
8127 * If in a superpage, extend the range
8128 * to include the start of the mapping.
8129 */
8130 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8131 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8132 start = SUPERPAGE_ROUND_DOWN(start);
8133 } else {
8134 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8135 break;
8136 }
8137 }
8138
8139 if (entry->superpage_size) {
8140 end = SUPERPAGE_ROUND_UP(end);
8141 }
8142
8143 /*
8144 * Step through all entries in this region
8145 */
8146 for (vm_map_offset_t s = start; s < end;) {
8147 /*
8148 * At this point, we have deleted all the memory entries
8149 * in [start, s) and are proceeding with the [s, end) range.
8150 *
8151 * This loop might drop the map lock, and it is possible that
8152 * some memory was already reallocated within [start, s)
8153 * and we don't want to mess with those entries.
8154 *
8155 * Some of those entries could even have been re-assembled
8156 * with an entry after "s" (in vm_map_simplify_entry()), so
8157 * we may have to vm_map_clip_start() again.
8158 *
8159 * When clear_in_transition_end is set, the we had marked
8160 * [start, clear_in_transition_end) as "in_transition"
8161 * during a previous iteration and we need to clear it.
8162 */
8163
8164 /*
8165 * Step 1: If needed (because we dropped locks),
8166 * lookup the entry again.
8167 *
8168 * If we're coming back from unwiring (Step 5),
8169 * we also need to mark the entries as no longer
8170 * in transition after that.
8171 */
8172
8173 if (state & VMDS_NEEDS_LOOKUP) {
8174 state &= ~VMDS_NEEDS_LOOKUP;
8175
8176 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8177 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8178 }
8179
8180 if (state & VMDS_KERNEL_KMEMPTR) {
8181 kmem_validate_slot(s, meta, size_idx, slot_idx);
8182 }
8183 }
8184
8185 if (clear_in_transition_end) {
8186 for (vm_map_entry_t it = entry;
8187 it != vm_map_to_entry(map) &&
8188 it->vme_start < clear_in_transition_end;
8189 it = it->vme_next) {
8190 assert(it->in_transition);
8191 it->in_transition = FALSE;
8192 if (it->needs_wakeup) {
8193 it->needs_wakeup = FALSE;
8194 state |= VMDS_NEEDS_WAKEUP;
8195 }
8196 }
8197
8198 clear_in_transition_end = 0;
8199 }
8200
8201
8202 /*
8203 * Step 2: Perform various policy checks
8204 * before we do _anything_ to this entry.
8205 */
8206
8207 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8208 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8209 /*
8210 * Either we found a gap already,
8211 * or we are tearing down a map,
8212 * keep going.
8213 */
8214 } else if (state & VMDS_KERNEL_PMAP) {
8215 __vm_map_delete_gap_panic(map, s, start, end);
8216 } else if (s < end) {
8217 state |= VMDS_FOUND_GAP;
8218 gap_start = s;
8219 }
8220
8221 if (entry == vm_map_to_entry(map) ||
8222 end <= entry->vme_start) {
8223 break;
8224 }
8225
8226 s = entry->vme_start;
8227 }
8228
8229 if (state & VMDS_KERNEL_PMAP) {
8230 /*
8231 * In the kernel map and its submaps,
8232 * permanent entries never die, even
8233 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8234 */
8235 if (entry->vme_permanent) {
8236 __vm_map_delete_permanent_panic(map, start, end, entry);
8237 }
8238
8239 if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8240 end = entry->vme_end;
8241 flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8242 }
8243
8244 /*
8245 * In the kernel map and its submaps,
8246 * the removal of an atomic/guarded entry is strict.
8247 *
8248 * An atomic entry is processed only if it was
8249 * specifically targeted.
8250 *
8251 * We might have deleted non-atomic entries before
		 * we reach this point however...
8253 */
8254 kmem_entry_validate_guard(map, entry,
8255 start, end - start, guard);
8256 }
8257
8258 /*
8259 * Step 2.1: handle "permanent" and "submap" entries
8260 * *before* clipping to avoid triggering some unnecessary
8261 * un-nesting of the shared region.
8262 */
8263 if (entry->vme_permanent && entry->is_sub_map) {
8264 // printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8265 /*
8266 * Un-mapping a "permanent" mapping of a user-space
8267 * submap is not allowed unless...
8268 */
8269 if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8270 /*
8271 * a. explicitly requested by the kernel caller.
8272 */
8273 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8274 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8275 developer_mode_state()) {
8276 /*
8277 * b. we're in "developer" mode (for
8278 * breakpoints, dtrace probes, ...).
8279 */
8280 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8281 } else if (map->terminated) {
8282 /*
8283 * c. this is the final address space cleanup.
8284 */
8285 // printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8286 } else {
8287 vm_map_offset_t submap_start, submap_end;
8288 kern_return_t submap_kr;
8289
8290 /*
8291 * Check if there are any "permanent" mappings
8292 * in this range in the submap.
8293 */
8294 if (entry->in_transition) {
8295 /* can that even happen ? */
8296 goto in_transition;
8297 }
8298 /* compute the clipped range in the submap */
8299 submap_start = s - entry->vme_start;
8300 submap_start += VME_OFFSET(entry);
8301 submap_end = end - entry->vme_start;
8302 submap_end += VME_OFFSET(entry);
8303 submap_kr = vm_map_delete_submap_recurse(
8304 VME_SUBMAP(entry),
8305 submap_start,
8306 submap_end);
8307 if (submap_kr != KERN_SUCCESS) {
8308 /*
8309 * There are some "permanent" mappings
8310 * in the submap: we are not allowed
8311 * to remove this range.
8312 */
8313 printf("%d[%s] removing permanent submap entry "
8314 "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8315 proc_selfpid(),
8316 (get_bsdtask_info(current_task())
8317 ? proc_name_address(get_bsdtask_info(current_task()))
8318 : "?"), entry,
8319 (uint64_t)entry->vme_start,
8320 (uint64_t)entry->vme_end,
8321 entry->protection,
8322 entry->max_protection);
8323 DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8324 vm_map_entry_t, entry,
8325 vm_map_offset_t, entry->vme_start,
8326 vm_map_offset_t, entry->vme_end,
8327 vm_prot_t, entry->protection,
8328 vm_prot_t, entry->max_protection,
8329 int, VME_ALIAS(entry));
8330 ret.kmr_return = KERN_PROTECTION_FAILURE;
8331 goto out;
8332 }
8333 /* no permanent mappings: proceed */
8334 }
8335 }
8336
8337 /*
8338 * Step 3: Perform any clipping needed.
8339 *
8340 * After this, "entry" starts at "s", ends before "end"
8341 */
8342
8343 if (entry->vme_start < s) {
8344 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8345 entry->map_aligned &&
8346 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8347 /*
8348 * The entry will no longer be map-aligned
8349 * after clipping and the caller said it's OK.
8350 */
8351 entry->map_aligned = FALSE;
8352 }
8353 vm_map_clip_start(map, entry, s);
8354 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8355 }
8356
8357 if (end < entry->vme_end) {
8358 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8359 entry->map_aligned &&
8360 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8361 /*
8362 * The entry will no longer be map-aligned
8363 * after clipping and the caller said it's OK.
8364 */
8365 entry->map_aligned = FALSE;
8366 }
8367 vm_map_clip_end(map, entry, end);
8368 }
8369
8370 if (entry->vme_permanent && entry->is_sub_map) {
8371 /*
8372 * We already went through step 2.1 which did not deny
8373 * the removal of this "permanent" and "is_sub_map"
8374 * entry.
8375 * Now that we've clipped what we actually want to
8376 * delete, undo the "permanent" part to allow the
8377 * removal to proceed.
8378 */
8379 DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8380 vm_map_entry_t, entry,
8381 vm_map_offset_t, entry->vme_start,
8382 vm_map_offset_t, entry->vme_end,
8383 vm_prot_t, entry->protection,
8384 vm_prot_t, entry->max_protection,
8385 int, VME_ALIAS(entry));
8386 entry->vme_permanent = false;
8387 }
8388
8389 assert(s == entry->vme_start);
8390 assert(entry->vme_end <= end);
8391
8392
8393 /*
8394 * Step 4: If the entry is in flux, wait for this to resolve.
8395 */
8396
8397 if (entry->in_transition) {
8398 wait_result_t wait_result;
8399
8400 in_transition:
8401 /*
8402 * Another thread is wiring/unwiring this entry.
8403 * Let the other thread know we are waiting.
8404 */
8405
8406 entry->needs_wakeup = TRUE;
8407
8408 /*
8409 * wake up anybody waiting on entries that we have
8410 * already unwired/deleted.
8411 */
8412 if (state & VMDS_NEEDS_WAKEUP) {
8413 vm_map_entry_wakeup(map);
8414 state &= ~VMDS_NEEDS_WAKEUP;
8415 }
8416
8417 wait_result = vm_map_entry_wait(map, interruptible);
8418
8419 if (interruptible &&
8420 wait_result == THREAD_INTERRUPTED) {
8421 /*
8422 * We do not clear the needs_wakeup flag,
8423 * since we cannot tell if we were the only one.
8424 */
8425 ret.kmr_return = KERN_ABORTED;
8426 return ret;
8427 }
8428
8429 /*
8430 * The entry could have been clipped or it
8431 * may not exist anymore. Look it up again.
8432 */
8433 state |= VMDS_NEEDS_LOOKUP;
8434 continue;
8435 }
8436
8437
8438 /*
8439 * Step 5: Handle wiring
8440 */
8441
8442 if (entry->wired_count) {
8443 struct vm_map_entry tmp_entry;
8444 boolean_t user_wire;
8445 unsigned int last_timestamp;
8446
8447 user_wire = entry->user_wired_count > 0;
8448
8449 /*
8450 * Remove a kernel wiring if requested
8451 */
8452 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8453 entry->wired_count--;
8454 vme_btref_consider_and_put(entry);
8455 }
8456
8457 /*
8458 * Remove all user wirings for proper accounting
8459 */
8460 while (entry->user_wired_count) {
8461 subtract_wire_counts(map, entry, user_wire);
8462 }
8463
8464 /*
8465 * All our DMA I/O operations in IOKit are currently
8466 * done by wiring through the map entries of the task
8467 * requesting the I/O.
8468 *
8469 * Because of this, we must always wait for kernel wirings
8470 * to go away on the entries before deleting them.
8471 *
8472 * Any caller who wants to actually remove a kernel wiring
8473 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8474 * properly remove one wiring instead of blasting through
8475 * them all.
8476 */
8477 if (entry->wired_count != 0) {
8478 assert(map != kernel_map);
8479 /*
8480 * Cannot continue. Typical case is when
8481 * a user thread has physical io pending on
				 * this page. Either wait for the
8483 * kernel wiring to go away or return an
8484 * error.
8485 */
8486 wait_result_t wait_result;
8487
8488 entry->needs_wakeup = TRUE;
8489 wait_result = vm_map_entry_wait(map,
8490 interruptible);
8491
8492 if (interruptible &&
8493 wait_result == THREAD_INTERRUPTED) {
8494 /*
8495 * We do not clear the
8496 * needs_wakeup flag, since we
8497 * cannot tell if we were the
8498 * only one.
8499 */
8500 ret.kmr_return = KERN_ABORTED;
8501 return ret;
8502 }
8503
8504
8505 /*
8506 * The entry could have been clipped or
8507 * it may not exist anymore. Look it
8508 * up again.
8509 */
8510 state |= VMDS_NEEDS_LOOKUP;
8511 continue;
8512 }
8513
8514 /*
8515 * We can unlock the map now.
8516 *
8517 * The entry might be split once we unlock the map,
8518 * but we need the range as defined by this entry
8519 * to be stable. So we must make a local copy.
8520 *
8521 * The underlying objects do not change during clips,
			 * and the in_transition state guarantees existence
8523 * of the entry.
8524 */
8525 last_timestamp = map->timestamp;
8526 entry->in_transition = TRUE;
8527 tmp_entry = *entry;
8528 vm_map_unlock(map);
8529
8530 if (tmp_entry.is_sub_map) {
8531 vm_map_t sub_map;
8532 vm_map_offset_t sub_start, sub_end;
8533 pmap_t pmap;
8534 vm_map_offset_t pmap_addr;
8535
8536
8537 sub_map = VME_SUBMAP(&tmp_entry);
8538 sub_start = VME_OFFSET(&tmp_entry);
8539 sub_end = sub_start + (tmp_entry.vme_end -
8540 tmp_entry.vme_start);
8541 if (tmp_entry.use_pmap) {
8542 pmap = sub_map->pmap;
8543 pmap_addr = tmp_entry.vme_start;
8544 } else {
8545 pmap = map->pmap;
8546 pmap_addr = tmp_entry.vme_start;
8547 }
8548 (void) vm_map_unwire_nested(sub_map,
8549 sub_start, sub_end,
8550 user_wire,
8551 pmap, pmap_addr);
8552 } else {
8553 vm_map_offset_t entry_end = tmp_entry.vme_end;
8554 vm_map_offset_t max_end;
8555
8556 if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8557 max_end = end - VM_MAP_PAGE_SIZE(map);
8558 if (entry_end > max_end) {
8559 entry_end = max_end;
8560 }
8561 }
8562
8563 if (tmp_entry.vme_kernel_object) {
8564 pmap_protect_options(
8565 map->pmap,
8566 tmp_entry.vme_start,
8567 entry_end,
8568 VM_PROT_NONE,
8569 PMAP_OPTIONS_REMOVE,
8570 NULL);
8571 }
8572 vm_fault_unwire(map, &tmp_entry,
8573 tmp_entry.vme_kernel_object, map->pmap,
8574 tmp_entry.vme_start, entry_end);
8575 }
8576
8577 vm_map_lock(map);
8578
8579 /*
8580 * Unwiring happened, we can now go back to deleting
8581 * them (after we clear the in_transition bit for the range).
8582 */
8583 if (last_timestamp + 1 != map->timestamp) {
8584 state |= VMDS_NEEDS_LOOKUP;
8585 }
8586 clear_in_transition_end = tmp_entry.vme_end;
8587 continue;
8588 }
8589
8590 assert(entry->wired_count == 0);
8591 assert(entry->user_wired_count == 0);
8592
8593
8594 /*
8595 * Step 6: Entry is unwired and ready for us to delete !
8596 */
8597
8598 if (!entry->vme_permanent) {
8599 /*
8600 * Typical case: the entry really shouldn't be permanent
8601 */
8602 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8603 (entry->protection & VM_PROT_EXECUTE) &&
8604 developer_mode_state()) {
8605 /*
8606 * Allow debuggers to undo executable mappings
8607 * when developer mode is on.
8608 */
8609 #if 0
8610 printf("FBDP %d[%s] removing permanent executable entry "
8611 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8612 proc_selfpid(),
8613 (current_task()->bsd_info
8614 ? proc_name_address(current_task()->bsd_info)
8615 : "?"), entry,
8616 (uint64_t)entry->vme_start,
8617 (uint64_t)entry->vme_end,
8618 entry->protection,
8619 entry->max_protection);
8620 #endif
8621 entry->vme_permanent = FALSE;
8622 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8623 #if 0
8624 printf("FBDP %d[%s] removing permanent entry "
8625 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8626 proc_selfpid(),
8627 (current_task()->bsd_info
8628 ? proc_name_address(current_task()->bsd_info)
8629 : "?"), entry,
8630 (uint64_t)entry->vme_start,
8631 (uint64_t)entry->vme_end,
8632 entry->protection,
8633 entry->max_protection);
8634 #endif
8635 entry->vme_permanent = FALSE;
8636 #if CODE_SIGNING_MONITOR
8637 } else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8638 entry->vme_permanent = FALSE;
8639
8640 printf("%d[%s] %s(0x%llx,0x%llx): "
8641 "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8642 "prot 0x%x/0x%x\n",
8643 proc_selfpid(),
8644 (get_bsdtask_info(current_task())
8645 ? proc_name_address(get_bsdtask_info(current_task()))
8646 : "?"),
8647 __FUNCTION__,
8648 (uint64_t)start,
8649 (uint64_t)end,
8650 (uint64_t)entry->vme_start,
8651 (uint64_t)entry->vme_end,
8652 entry->protection,
8653 entry->max_protection);
8654 #endif
8655 } else {
8656 DTRACE_VM6(vm_map_delete_permanent,
8657 vm_map_entry_t, entry,
8658 vm_map_offset_t, entry->vme_start,
8659 vm_map_offset_t, entry->vme_end,
8660 vm_prot_t, entry->protection,
8661 vm_prot_t, entry->max_protection,
8662 int, VME_ALIAS(entry));
8663 }
8664
8665 if (entry->is_sub_map) {
8666 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8667 "map %p (%d) entry %p submap %p (%d)\n",
8668 map, VM_MAP_PAGE_SHIFT(map), entry,
8669 VME_SUBMAP(entry),
8670 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8671 if (entry->use_pmap) {
8672 #ifndef NO_NESTED_PMAP
8673 int pmap_flags;
8674
8675 if (map->terminated) {
8676 /*
8677 * This is the final cleanup of the
8678 * address space being terminated.
8679 * No new mappings are expected and
8680 * we don't really need to unnest the
8681 * shared region (and lose the "global"
8682 * pmap mappings, if applicable).
8683 *
8684 * Tell the pmap layer that we're
8685 * "clean" wrt nesting.
8686 */
8687 pmap_flags = PMAP_UNNEST_CLEAN;
8688 } else {
8689 /*
8690 * We're unmapping part of the nested
8691 * shared region, so we can't keep the
8692 * nested pmap.
8693 */
8694 pmap_flags = 0;
8695 }
8696 pmap_unnest_options(
8697 map->pmap,
8698 (addr64_t)entry->vme_start,
8699 entry->vme_end - entry->vme_start,
8700 pmap_flags);
8701 #endif /* NO_NESTED_PMAP */
8702 if (map->mapped_in_other_pmaps &&
8703 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8704 /* clean up parent map/maps */
8705 vm_map_submap_pmap_clean(
8706 map, entry->vme_start,
8707 entry->vme_end,
8708 VME_SUBMAP(entry),
8709 VME_OFFSET(entry));
8710 }
8711 } else {
8712 vm_map_submap_pmap_clean(
8713 map, entry->vme_start, entry->vme_end,
8714 VME_SUBMAP(entry),
8715 VME_OFFSET(entry));
8716 }
8717 } else if (entry->vme_kernel_object ||
8718 VME_OBJECT(entry) == compressor_object) {
8719 /*
8720 * nothing to do
8721 */
8722 } else if (map->mapped_in_other_pmaps &&
8723 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8724 vm_object_pmap_protect_options(
8725 VME_OBJECT(entry), VME_OFFSET(entry),
8726 entry->vme_end - entry->vme_start,
8727 PMAP_NULL,
8728 PAGE_SIZE,
8729 entry->vme_start,
8730 VM_PROT_NONE,
8731 PMAP_OPTIONS_REMOVE);
8732 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8733 (state & VMDS_KERNEL_PMAP)) {
8734 /* Remove translations associated
8735 * with this range unless the entry
8736 * does not have an object, or
8737 * it's the kernel map or a descendant
8738 * since the platform could potentially
8739 * create "backdoor" mappings invisible
8740 * to the VM. It is expected that
8741 * objectless, non-kernel ranges
8742 * do not have such VM invisible
8743 * translations.
8744 */
8745 vm_map_address_t remove_start = entry->vme_start;
8746 vm_map_address_t remove_end = entry->vme_end;
8747 #if MACH_ASSERT
8748 /*
8749 * Prevent panics in pmap_remove() from some vm test code
8750 * which uses virtual address ranges that pmap disallows.
8751 */
8752 if (thread_get_test_option(test_option_vm_map_clamp_pmap_remove)) {
8753 vm_map_clamp_to_pmap(map, &remove_start, &remove_end);
8754 }
8755 #endif /* MACH_ASSERT */
8756 pmap_remove(map->pmap, remove_start, remove_end);
8757 }
8758
8759 #if DEBUG
8760 /*
8761 * All pmap mappings for this map entry must have been
8762 * cleared by now.
8763 */
8764 assert(pmap_is_empty(map->pmap,
8765 entry->vme_start,
8766 entry->vme_end));
8767 #endif /* DEBUG */
8768
8769 if (entry->iokit_acct) {
8770 /* alternate accounting */
8771 DTRACE_VM4(vm_map_iokit_unmapped_region,
8772 vm_map_t, map,
8773 vm_map_offset_t, entry->vme_start,
8774 vm_map_offset_t, entry->vme_end,
8775 int, VME_ALIAS(entry));
8776 vm_map_iokit_unmapped_region(map,
8777 (entry->vme_end -
8778 entry->vme_start));
8779 entry->iokit_acct = FALSE;
8780 entry->use_pmap = FALSE;
8781 }
8782
8783 /* move "s" forward */
8784 s = entry->vme_end;
8785 next = entry->vme_next;
8786 if (!entry->map_aligned) {
8787 vm_map_offset_t rounded_s;
8788
8789 /*
8790 * Skip artificial gap due to mis-aligned entry
8791 * on devices with a page size smaller than the
8792 * map's page size (i.e. 16k task on a 4k device).
8793 */
8794 rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8795 if (next == vm_map_to_entry(map)) {
8796 s = rounded_s;
8797 } else if (s < rounded_s) {
8798 s = MIN(rounded_s, next->vme_start);
8799 }
8800 }
8801 ret.kmr_size += s - entry->vme_start;
8802
8803 if (entry->vme_permanent) {
8804 /*
8805 * A permanent entry can not be removed, so leave it
8806 * in place but remove all access permissions.
8807 */
8808 if (__improbable(vm_log_map_delete_permanent_prot_none)) {
8809 printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8810 __FUNCTION__, __LINE__,
8811 proc_selfpid(),
8812 (get_bsdtask_info(current_task())
8813 ? proc_name_address(get_bsdtask_info(current_task()))
8814 : "?"),
8815 map,
8816 entry,
8817 (uint64_t)entry->vme_start,
8818 (uint64_t)entry->vme_end,
8819 entry->is_sub_map,
8820 entry->protection,
8821 entry->max_protection);
8822 }
8823 DTRACE_VM6(vm_map_delete_permanent_prot_none,
8824 vm_map_entry_t, entry,
8825 vm_map_offset_t, entry->vme_start,
8826 vm_map_offset_t, entry->vme_end,
8827 vm_prot_t, entry->protection,
8828 vm_prot_t, entry->max_protection,
8829 int, VME_ALIAS(entry));
8830 entry->protection = VM_PROT_NONE;
8831 entry->max_protection = VM_PROT_NONE;
8832 #ifdef __arm64e__
8833 entry->used_for_tpro = FALSE;
8834 #endif
8835 } else {
8836 vm_map_entry_zap(map, entry, zap_list);
8837 }
8838
8839 entry = next;
8840 next = VM_MAP_ENTRY_NULL;
8841
8842 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8843 unsigned int last_timestamp = map->timestamp++;
8844
8845 if (lck_rw_lock_yield_exclusive(&map->lock,
8846 LCK_RW_YIELD_ANY_WAITER)) {
8847 if (last_timestamp != map->timestamp + 1) {
8848 state |= VMDS_NEEDS_LOOKUP;
8849 }
8850 } else {
8851 /* we didn't yield, undo our change */
8852 map->timestamp--;
8853 }
8854 }
8855 }
8856
8857 if (map->wait_for_space) {
8858 thread_wakeup((event_t) map);
8859 }
8860
8861 if (state & VMDS_NEEDS_WAKEUP) {
8862 vm_map_entry_wakeup(map);
8863 }
8864
8865 out:
8866 if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8867 __vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8868 }
8869
8870 if (state & VMDS_KERNEL_KMEMPTR) {
8871 kmem_free_space(start, end, range_id, &slot);
8872 }
8873
8874 if (state & VMDS_FOUND_GAP) {
8875 DTRACE_VM3(kern_vm_deallocate_gap,
8876 vm_map_offset_t, gap_start,
8877 vm_map_offset_t, save_start,
8878 vm_map_offset_t, save_end);
8879 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8880 ret.kmr_return = KERN_INVALID_VALUE;
8881 } else {
8882 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8883 }
8884 }
8885
8886 return ret;
8887 }
8888
8889 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8890 vm_map_remove_and_unlock(
8891 vm_map_t map,
8892 vm_map_offset_t start,
8893 vm_map_offset_t end,
8894 vmr_flags_t flags,
8895 kmem_guard_t guard)
8896 {
8897 kmem_return_t ret;
8898 VM_MAP_ZAP_DECLARE(zap);
8899
8900 ret = vm_map_delete(map, start, end, flags, guard, &zap);
8901 vm_map_unlock(map);
8902
8903 vm_map_zap_dispose(&zap);
8904
8905 return ret;
8906 }
8907
8908 /*
8909 * vm_map_remove_guard:
8910 *
8911 * Remove the given address range from the target map.
8912 * This is the exported form of vm_map_delete.
8913 */
8914 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8915 vm_map_remove_guard(
8916 vm_map_t map,
8917 vm_map_offset_t start,
8918 vm_map_offset_t end,
8919 vmr_flags_t flags,
8920 kmem_guard_t guard)
8921 {
8922 vm_map_lock(map);
8923 return vm_map_remove_and_unlock(map, start, end, flags, guard);
8924 }
8925
8926 /*
8927 * vm_map_terminate:
8928 *
8929 * Clean out a task's map.
8930 */
8931 kern_return_t
vm_map_terminate(vm_map_t map)8932 vm_map_terminate(
8933 vm_map_t map)
8934 {
8935 vm_map_lock(map);
8936 map->terminated = TRUE;
8937 vm_map_disable_hole_optimization(map);
8938 (void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8939 VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
8940 return KERN_SUCCESS;
8941 }
8942
8943 /*
8944 * Routine: vm_map_copy_allocate
8945 *
8946 * Description:
8947 * Allocates and initializes a map copy object.
8948 */
8949 static vm_map_copy_t
vm_map_copy_allocate(uint16_t type)8950 vm_map_copy_allocate(uint16_t type)
8951 {
8952 vm_map_copy_t new_copy;
8953
8954 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
8955 new_copy->type = type;
8956 if (type == VM_MAP_COPY_ENTRY_LIST) {
8957 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8958 vm_map_store_init(&new_copy->cpy_hdr);
8959 }
8960 return new_copy;
8961 }
8962
8963 /*
8964 * Routine: vm_map_copy_discard
8965 *
8966 * Description:
8967 * Dispose of a map copy object (returned by
8968 * vm_map_copyin).
8969 */
8970 void
vm_map_copy_discard(vm_map_copy_t copy)8971 vm_map_copy_discard(
8972 vm_map_copy_t copy)
8973 {
8974 if (copy == VM_MAP_COPY_NULL) {
8975 return;
8976 }
8977
8978 /*
8979 * Assert that the vm_map_copy is coming from the right
8980 * zone and hasn't been forged
8981 */
8982 vm_map_copy_require(copy);
8983
8984 switch (copy->type) {
8985 case VM_MAP_COPY_ENTRY_LIST:
8986 while (vm_map_copy_first_entry(copy) !=
8987 vm_map_copy_to_entry(copy)) {
8988 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
8989
8990 vm_map_copy_entry_unlink(copy, entry);
8991 if (entry->is_sub_map) {
8992 vm_map_deallocate(VME_SUBMAP(entry));
8993 } else {
8994 vm_object_deallocate(VME_OBJECT(entry));
8995 }
8996 vm_map_copy_entry_dispose(entry);
8997 }
8998 break;
8999 case VM_MAP_COPY_KERNEL_BUFFER:
9000
9001 /*
9002 * The vm_map_copy_t and possibly the data buffer were
9003 * allocated by a single call to kalloc_data(), i.e. the
9004 * vm_map_copy_t was not allocated out of the zone.
9005 */
9006 if (copy->size > msg_ool_size_small || copy->offset) {
9007 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9008 (long long)copy->size, (long long)copy->offset);
9009 }
9010 kfree_data(copy->cpy_kdata, copy->size);
9011 }
9012 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9013 }
9014
9015 #if XNU_PLATFORM_MacOSX
9016
9017 __exported
9018 extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy);
9019
9020 /*
9021 * Routine: vm_map_copy_copy
9022 *
9023 * Description:
9024 * Move the information in a map copy object to
9025 * a new map copy object, leaving the old one
9026 * empty.
9027 *
9028 * This is used by kernel routines that need
9029 * to look at out-of-line data (in copyin form)
9030 * before deciding whether to return SUCCESS.
9031 * If the routine returns FAILURE, the original
9032 * copy object will be deallocated; therefore,
9033 * these routines must make a copy of the copy
9034 * object and leave the original empty so that
9035 * deallocation will not fail.
9036 */
vm_map_copy_t
vm_map_copy_copy(
	vm_map_copy_t   copy)
{
	vm_map_copy_t   new_copy;

	/* nothing to transfer */
	if (copy == VM_MAP_COPY_NULL) {
		return VM_MAP_COPY_NULL;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 * Allocate a new copy object, and copy the information
	 * from the old one into it.
	 */

	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
#if __has_feature(ptrauth_calls)
	/*
	 * memcpy() above moved only the raw bits of cpy_kdata; store it
	 * again through the field so the pointer is valid at its new
	 * location.  NOTE(review): this presumably means cpy_kdata is an
	 * address-diversified signed pointer -- confirm against the
	 * struct vm_map_copy declaration.
	 */
	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		new_copy->cpy_kdata = copy->cpy_kdata;
	}
#endif

	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * The links in the entry chain must be
		 * changed to point to the new copy object:
		 * after the memcpy() the first entry's vme_prev and the
		 * last entry's vme_next still point at the old header.
		 */
		vm_map_copy_first_entry(copy)->vme_prev
		        = vm_map_copy_to_entry(new_copy);
		vm_map_copy_last_entry(copy)->vme_next
		        = vm_map_copy_to_entry(new_copy);
	}

	/*
	 * Change the old copy object into one that contains
	 * nothing to be deallocated: a zeroed KERNEL_BUFFER copy,
	 * which vm_map_copy_discard() tears down with a no-op
	 * kfree_data(NULL, 0) before freeing the header.
	 */
	bzero(copy, sizeof(struct vm_map_copy));
	copy->type = VM_MAP_COPY_KERNEL_BUFFER;

	/*
	 * Return the new object.
	 */
	return new_copy;
}
9089
9090 #endif /* XNU_PLATFORM_MacOSX */
9091
9092 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)9093 vm_map_entry_is_overwritable(
9094 vm_map_t dst_map __unused,
9095 vm_map_entry_t entry)
9096 {
9097 if (!(entry->protection & VM_PROT_WRITE)) {
9098 /* can't overwrite if not writable */
9099 return FALSE;
9100 }
9101 #if !__x86_64__
9102 if (entry->used_for_jit &&
9103 vm_map_cs_enforcement(dst_map) &&
9104 !dst_map->cs_debugged) {
9105 /*
9106 * Can't overwrite a JIT region while cs_enforced
9107 * and not cs_debugged.
9108 */
9109 return FALSE;
9110 }
9111
9112 #if __arm64e__
9113 /* Do not allow overwrite HW assisted TPRO entries */
9114 if (entry->used_for_tpro) {
9115 return FALSE;
9116 }
9117 #endif /* __arm64e__ */
9118
9119 if (entry->vme_permanent) {
9120 if (entry->is_sub_map) {
9121 /*
9122 * We can't tell if the submap contains "permanent"
9123 * entries within the range targeted by the caller.
9124 * The caller will have to check for that with
9125 * vm_map_overwrite_submap_recurse() for example.
9126 */
9127 } else {
9128 /*
9129 * Do not allow overwriting of a "permanent"
9130 * entry.
9131 */
9132 DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9133 vm_map_entry_t, entry,
9134 vm_map_offset_t, entry->vme_start,
9135 vm_map_offset_t, entry->vme_end,
9136 vm_prot_t, entry->protection,
9137 vm_prot_t, entry->max_protection,
9138 int, VME_ALIAS(entry));
9139 return FALSE;
9140 }
9141 }
9142 #endif /* !__x86_64__ */
9143
9144 if (entry->is_sub_map) {
9145 /* remember not to assume every entry has a VM object... */
9146 }
9147
9148 return TRUE;
9149 }
9150
/*
 * Check that the range [dst_addr, dst_addr + dst_size) of "dst_map"
 * can be overwritten: every entry must be writable, overwritable
 * (see vm_map_entry_is_overwritable()) and the range must be a
 * contiguous region with no holes.  Recurses into submaps.
 *
 * Returns KERN_SUCCESS when the whole range is overwritable,
 * KERN_INVALID_ADDRESS on a hole or unmapped address,
 * KERN_PROTECTION_FAILURE for non-writable/non-overwritable entries,
 * and KERN_FAILURE for a non-internal or true-shared object found
 * after a submap was encountered.
 *
 * Takes and drops the dst_map lock internally; called (and returns)
 * with the map unlocked.
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t   dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   result;
	boolean_t       encountered_sub_map = FALSE;



	/*
	 * Verify that the destination is all writeable
	 * initially. We have to trunc the destination
	 * address and round the copy size or we'll end up
	 * splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

start_pass_1:
	/* restart point: the lock was dropped, anything may have changed */
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	/* walk the entries covering [dst_addr, dst_end) */
	for (entry = tmp_entry;;) {
		vm_map_entry_t  next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			encountered_sub_map = TRUE;
			sub_start = VME_OFFSET(entry);

			/* clip the recursion range to this entry / dst_end */
			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			vm_map_unlock(dst_map);

			/* check the corresponding range inside the submap */
			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			if (result != KERN_SUCCESS) {
				return result;
			}
			/*
			 * NOTE(review): entry->vme_end is read here after the
			 * map lock was dropped above, so the entry could have
			 * been clipped concurrently -- confirm this is
			 * intentional (local_end holds the pre-unlock value).
			 */
			if (dst_end <= entry->vme_end) {
				return KERN_SUCCESS;
			}
			vm_map_lock(dst_map);
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}
		assert(!entry->is_sub_map);

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * If the entry is in transition, we must wait
		 * for it to exit that state. Anything could happen
		 * when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

		/*
		 * our range is contained completely within this map entry
		 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
		/*
		 * check that range specified is contiguous region
		 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * Check for permanent objects in the destination.
		 * Fail only when a submap was encountered along the way:
		 * a non-internal or true-shared object then makes the
		 * range unsuitable for wholesale replacement.
		 */
		assert(!entry->is_sub_map);
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}


		entry = next;
	}/* for */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
9305
9306 /*
9307 * Routine: vm_map_copy_overwrite
9308 *
9309 * Description:
9310 * Copy the memory described by the map copy
9311 * object (copy; returned by vm_map_copyin) onto
9312 * the specified destination region (dst_map, dst_addr).
9313 * The destination must be writeable.
9314 *
9315 * Unlike vm_map_copyout, this routine actually
9316 * writes over previously-mapped memory. If the
9317 * previous mapping was to a permanent (user-supplied)
9318 * memory object, it is preserved.
9319 *
9320 * The attributes (protection and inheritance) of the
9321 * destination region are preserved.
9322 *
9323 * If successful, consumes the copy object.
9324 * Otherwise, the caller is responsible for it.
9325 *
9326 * Implementation notes:
9327 * To overwrite aligned temporary virtual memory, it is
9328 * sufficient to remove the previous mapping and insert
9329 * the new copy. This replacement is done either on
9330 * the whole region (if no permanent virtual memory
9331 * objects are embedded in the destination region) or
9332 * in individual map entries.
9333 *
 * To overwrite permanent virtual memory, it is necessary
9335 * to copy each page, as the external memory management
9336 * interface currently does not provide any optimizations.
9337 *
9338 * Unaligned memory also has to be copied. It is possible
9339 * to use 'vm_trickery' to copy the aligned data. This is
9340 * not done but not hard to implement.
9341 *
9342 * Once a page of permanent memory has been overwritten,
9343 * it is impossible to interrupt this function; otherwise,
9344 * the call would be neither atomic nor location-independent.
9345 * The kernel-state portion of a user thread must be
9346 * interruptible.
9347 *
9348 * It may be expensive to forward all requests that might
9349 * overwrite permanent memory (vm_write, vm_copy) to
9350 * uninterruptible kernel threads. This routine may be
9351 * called by interruptible threads; however, success is
9352 * not guaranteed -- if the request cannot be performed
9353 * atomically and interruptibly, an error indication is
9354 * returned.
9355 *
9356 * Callers of this function must call vm_map_copy_require on
9357 * previously created vm_map_copy_t or pass a newly created
9358 * one to ensure that it hasn't been forged.
9359 */
9360 static kern_return_t
vm_map_copy_overwrite_nested(
	vm_map_t                dst_map,
	vm_map_address_t        dst_addr,
	vm_map_copy_t           copy,
	boolean_t               interruptible,
	pmap_t                  pmap,
	boolean_t               discard_on_success)
{
	vm_map_offset_t         dst_end;
	vm_map_entry_t          tmp_entry;
	vm_map_entry_t          entry;
	kern_return_t           kr;
	boolean_t               aligned = TRUE;
	boolean_t               contains_permanent_objects = FALSE;
	boolean_t               encountered_sub_map = FALSE;
	vm_map_offset_t         base_addr;
	vm_map_size_t           copy_size;
	vm_map_size_t           total_size;
	uint16_t                copy_page_shift;

	/*
	 * Overwrite the destination range [dst_addr, dst_addr + copy->size)
	 * in "dst_map" with the contents of the entry-list "copy" object.
	 * Pass 1 validates that the whole destination is writable and
	 * overwritable (recursing into non-COW submaps); pass 2 performs
	 * the overwrite, carving "copy" into pieces whenever a submap
	 * boundary forces a recursive call.  On success the copy object is
	 * consumed iff "discard_on_success" is TRUE.
	 */

	/*
	 * Check for special kernel buffer allocated
	 * by new_ipc_kmsg_copyin.
	 */

	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		kr = vm_map_copyout_kernel_buffer(
			dst_map, &dst_addr,
			copy, copy->size, TRUE, discard_on_success);
		return kr;
	}

	/*
	 * Only works for entry lists at the moment.  Will
	 * support page lists later.
	 */

	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);

	if (copy->size == 0) {
		/* nothing to overwrite; still honor the discard contract */
		if (discard_on_success) {
			vm_map_copy_discard(copy);
		}
		return KERN_SUCCESS;
	}

	copy_page_shift = copy->cpy_hdr.page_shift;

	/*
	 * Verify that the destination is all writeable
	 * initially.  We have to trunc the destination
	 * address and round the copy size or we'll end up
	 * splitting entries in strange ways.
	 */

	if (!VM_MAP_PAGE_ALIGNED(copy->size,
	    VM_MAP_PAGE_MASK(dst_map)) ||
	    !VM_MAP_PAGE_ALIGNED(copy->offset,
	    VM_MAP_PAGE_MASK(dst_map)) ||
	    !VM_MAP_PAGE_ALIGNED(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)) ||
	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
		aligned = FALSE;
		dst_end = vm_map_round_page(dst_addr + copy->size,
		    VM_MAP_PAGE_MASK(dst_map));
	} else {
		dst_end = dst_addr + copy->size;
	}

	vm_map_lock(dst_map);

	/* LP64todo - remove this check when vm_map_commpage64()
	 * no longer has to stuff in a map_entry for the commpage
	 * above the map's max_offset.
	 */
	if (dst_addr >= dst_map->max_offset) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}
	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	for (entry = tmp_entry;;) {
		vm_map_entry_t  next = entry->vme_next;

		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			local_end = entry->vme_end;
			if (!(entry->needs_copy)) {
				/* if needs_copy we are a COW submap */
				/* in such a case we just replace so */
				/* there is no need for the follow-  */
				/* ing check.                        */
				encountered_sub_map = TRUE;
				sub_start = VME_OFFSET(entry);

				if (entry->vme_end < dst_end) {
					sub_end = entry->vme_end;
				} else {
					sub_end = dst_end;
				}
				/* translate map addresses to submap offsets */
				sub_end -= entry->vme_start;
				sub_end += VME_OFFSET(entry);
				vm_map_unlock(dst_map);

				kr = vm_map_overwrite_submap_recurse(
					VME_SUBMAP(entry),
					sub_start,
					sub_end - sub_start);
				if (kr != KERN_SUCCESS) {
					return kr;
				}
				vm_map_lock(dst_map);
			}

			if (dst_end <= entry->vme_end) {
				goto start_overwrite;
			}
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			next = entry->vme_next;
		}
		assert(!entry->is_sub_map);

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * If the entry is in transition, we must wait
		 * for it to exit that state.  Anything could happen
		 * when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

		/*
		 * our range is contained completely within this map entry
		 */
		if (dst_end <= entry->vme_end) {
			break;
		}
		/*
		 * check that range specified is contiguous region
		 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}


		/*
		 * Check for permanent objects in the destination.
		 */
		assert(!entry->is_sub_map);
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			contains_permanent_objects = TRUE;
		}

		entry = next;
	}/* for */

start_overwrite:
	/*
	 * If there are permanent objects in the destination, then
	 * the copy cannot be interrupted.
	 */

	if (interruptible && contains_permanent_objects) {
		vm_map_unlock(dst_map);
		return KERN_FAILURE;    /* XXX */
	}

	/*
	 *
	 * Make a second pass, overwriting the data
	 * At the beginning of each loop iteration,
	 * the next entry to be overwritten is "tmp_entry"
	 * (initially, the value returned from the lookup above),
	 * and the starting address expected in that entry
	 * is "start".
	 */

	total_size = copy->size;
	if (encountered_sub_map) {
		copy_size = 0;
		/* re-calculate tmp_entry since we've had the map */
		/* unlocked */
		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}
	} else {
		copy_size = copy->size;
	}

	base_addr = dst_addr;
	while (TRUE) {
		/* deconstruct the copy object and do in parts */
		/* only in sub_map, interruptable case */
		vm_map_entry_t  copy_entry;
		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
		int             nentries;
		int             remaining_entries = 0;
		vm_map_offset_t new_offset = 0;

		for (entry = tmp_entry; copy_size == 0;) {
			vm_map_entry_t  next;

			next = entry->vme_next;

			/* tmp_entry and base address are moved along */
			/* each time we encounter a sub-map.  Otherwise */
			/* entry can outpace tmp_entry, and the copy_size */
			/* may reflect the distance between them */
			/* if the current entry is found to be in transition */
			/* we will start over at the beginning or the last */
			/* encounter of a submap as dictated by base_addr */
			/* we will zero copy_size accordingly. */
			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				if (!vm_map_lookup_entry(dst_map, base_addr,
				    &tmp_entry)) {
					vm_map_unlock(dst_map);
					return KERN_INVALID_ADDRESS;
				}
				copy_size = 0;
				entry = tmp_entry;
				continue;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;
				vm_map_offset_t local_end;

				if (entry->needs_copy) {
					/* if this is a COW submap */
					/* just back the range with a */
					/* anonymous entry */
					assert(!entry->vme_permanent);
					if (entry->vme_end < dst_end) {
						sub_end = entry->vme_end;
					} else {
						sub_end = dst_end;
					}
					if (entry->vme_start < base_addr) {
						sub_start = base_addr;
					} else {
						sub_start = entry->vme_start;
					}
					vm_map_clip_end(
						dst_map, entry, sub_end);
					vm_map_clip_start(
						dst_map, entry, sub_start);
					assert(!entry->use_pmap);
					assert(!entry->iokit_acct);
					entry->use_pmap = TRUE;
					/* drop the submap, leave an anonymous entry */
					vm_map_deallocate(VME_SUBMAP(entry));
					assert(!entry->vme_permanent);
					VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
					VME_OFFSET_SET(entry, 0);
					entry->is_shared = FALSE;
					entry->needs_copy = FALSE;
					entry->protection = VM_PROT_DEFAULT;
					entry->max_protection = VM_PROT_ALL;
					entry->wired_count = 0;
					entry->user_wired_count = 0;
					if (entry->inheritance
					    == VM_INHERIT_SHARE) {
						entry->inheritance = VM_INHERIT_COPY;
					}
					continue;
				}
				/* first take care of any non-sub_map */
				/* entries to send */
				if (base_addr < entry->vme_start) {
					/* stuff to send */
					copy_size =
					    entry->vme_start - base_addr;
					break;
				}
				sub_start = VME_OFFSET(entry);

				if (entry->vme_end < dst_end) {
					sub_end = entry->vme_end;
				} else {
					sub_end = dst_end;
				}
				sub_end -= entry->vme_start;
				sub_end += VME_OFFSET(entry);
				local_end = entry->vme_end;
				vm_map_unlock(dst_map);
				copy_size = sub_end - sub_start;

				/* adjust the copy object */
				/*
				 * Truncate "copy" to the first copy_size bytes;
				 * the tail (next_copy .. previous_prev) is
				 * spliced back in after the recursive call.
				 */
				if (total_size > copy_size) {
					vm_map_size_t   local_size = 0;
					vm_map_size_t   entry_size;

					nentries = 1;
					new_offset = copy->offset;
					copy_entry = vm_map_copy_first_entry(copy);
					while (copy_entry !=
					    vm_map_copy_to_entry(copy)) {
						entry_size = copy_entry->vme_end -
						    copy_entry->vme_start;
						if ((local_size < copy_size) &&
						    ((local_size + entry_size)
						    >= copy_size)) {
							vm_map_copy_clip_end(copy,
							    copy_entry,
							    copy_entry->vme_start +
							    (copy_size - local_size));
							entry_size = copy_entry->vme_end -
							    copy_entry->vme_start;
							local_size += entry_size;
							new_offset += entry_size;
						}
						if (local_size >= copy_size) {
							next_copy = copy_entry->vme_next;
							copy_entry->vme_next =
							    vm_map_copy_to_entry(copy);
							previous_prev =
							    copy->cpy_hdr.links.prev;
							copy->cpy_hdr.links.prev = copy_entry;
							copy->size = copy_size;
							remaining_entries =
							    copy->cpy_hdr.nentries;
							remaining_entries -= nentries;
							copy->cpy_hdr.nentries = nentries;
							break;
						} else {
							local_size += entry_size;
							new_offset += entry_size;
							nentries++;
						}
						copy_entry = copy_entry->vme_next;
					}
				}

				/* recurse into the submap with the proper pmap */
				if ((entry->use_pmap) && (pmap == NULL)) {
					kr = vm_map_copy_overwrite_nested(
						VME_SUBMAP(entry),
						sub_start,
						copy,
						interruptible,
						VME_SUBMAP(entry)->pmap,
						TRUE);
				} else if (pmap != NULL) {
					kr = vm_map_copy_overwrite_nested(
						VME_SUBMAP(entry),
						sub_start,
						copy,
						interruptible, pmap,
						TRUE);
				} else {
					kr = vm_map_copy_overwrite_nested(
						VME_SUBMAP(entry),
						sub_start,
						copy,
						interruptible,
						dst_map->pmap,
						TRUE);
				}
				if (kr != KERN_SUCCESS) {
					/* re-attach the severed tail before failing */
					if (next_copy != NULL) {
						copy->cpy_hdr.nentries +=
						    remaining_entries;
						copy->cpy_hdr.links.prev->vme_next =
						    next_copy;
						copy->cpy_hdr.links.prev
						        = previous_prev;
						copy->size = total_size;
					}
					return kr;
				}
				if (dst_end <= local_end) {
					return KERN_SUCCESS;
				}
				/* otherwise copy no longer exists, it was */
				/* destroyed after successful copy_overwrite */
				copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
				copy->offset = new_offset;
				copy->cpy_hdr.page_shift = copy_page_shift;

				total_size -= copy_size;
				copy_size = 0;
				/* put back remainder of copy in container */
				if (next_copy != NULL) {
					copy->cpy_hdr.nentries = remaining_entries;
					copy->cpy_hdr.links.next = next_copy;
					copy->cpy_hdr.links.prev = previous_prev;
					copy->size = total_size;
					next_copy->vme_prev =
					    vm_map_copy_to_entry(copy);
					next_copy = NULL;
				}
				base_addr = local_end;
				vm_map_lock(dst_map);
				if (!vm_map_lookup_entry(dst_map,
				    local_end, &tmp_entry)) {
					vm_map_unlock(dst_map);
					return KERN_INVALID_ADDRESS;
				}
				entry = tmp_entry;
				continue;
			}
			assert(!entry->is_sub_map);

			if (dst_end <= entry->vme_end) {
				copy_size = dst_end - base_addr;
				break;
			}

			if ((next == vm_map_to_entry(dst_map)) ||
			    (next->vme_start != entry->vme_end)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}

			entry = next;
		}/* for */

		next_copy = NULL;
		nentries = 1;

		/* adjust the copy object */
		/* same truncation as above, for the non-submap chunk */
		if (total_size > copy_size) {
			vm_map_size_t   local_size = 0;
			vm_map_size_t   entry_size;

			new_offset = copy->offset;
			copy_entry = vm_map_copy_first_entry(copy);
			while (copy_entry != vm_map_copy_to_entry(copy)) {
				entry_size = copy_entry->vme_end -
				    copy_entry->vme_start;
				if ((local_size < copy_size) &&
				    ((local_size + entry_size)
				    >= copy_size)) {
					vm_map_copy_clip_end(copy, copy_entry,
					    copy_entry->vme_start +
					    (copy_size - local_size));
					entry_size = copy_entry->vme_end -
					    copy_entry->vme_start;
					local_size += entry_size;
					new_offset += entry_size;
				}
				if (local_size >= copy_size) {
					next_copy = copy_entry->vme_next;
					copy_entry->vme_next =
					    vm_map_copy_to_entry(copy);
					previous_prev =
					    copy->cpy_hdr.links.prev;
					copy->cpy_hdr.links.prev = copy_entry;
					copy->size = copy_size;
					remaining_entries =
					    copy->cpy_hdr.nentries;
					remaining_entries -= nentries;
					copy->cpy_hdr.nentries = nentries;
					break;
				} else {
					local_size += entry_size;
					new_offset += entry_size;
					nentries++;
				}
				copy_entry = copy_entry->vme_next;
			}
		}

		if (aligned) {
			pmap_t  local_pmap;

			if (pmap) {
				local_pmap = pmap;
			} else {
				local_pmap = dst_map->pmap;
			}

			if ((kr = vm_map_copy_overwrite_aligned(
				    dst_map, tmp_entry, copy,
				    base_addr, local_pmap)) != KERN_SUCCESS) {
				if (next_copy != NULL) {
					copy->cpy_hdr.nentries +=
					    remaining_entries;
					copy->cpy_hdr.links.prev->vme_next =
					    next_copy;
					copy->cpy_hdr.links.prev =
					    previous_prev;
					copy->size += copy_size;
				}
				return kr;
			}
			vm_map_unlock(dst_map);
		} else {
			/*
			 * Performance gain:
			 *
			 * if the copy and dst address are misaligned but the same
			 * offset within the page we can copy_not_aligned the
			 * misaligned parts and copy aligned the rest.  If they are
			 * aligned but len is unaligned we simply need to copy
			 * the end bit unaligned.  We'll need to split the misaligned
			 * bits of the region in this case !
			 */
			/* ALWAYS UNLOCKS THE dst_map MAP */
			kr = vm_map_copy_overwrite_unaligned(
				dst_map,
				tmp_entry,
				copy,
				base_addr,
				discard_on_success);
			if (kr != KERN_SUCCESS) {
				if (next_copy != NULL) {
					copy->cpy_hdr.nentries +=
					    remaining_entries;
					copy->cpy_hdr.links.prev->vme_next =
					    next_copy;
					copy->cpy_hdr.links.prev =
					    previous_prev;
					copy->size += copy_size;
				}
				return kr;
			}
		}
		total_size -= copy_size;
		if (total_size == 0) {
			break;
		}
		base_addr += copy_size;
		copy_size = 0;
		copy->offset = new_offset;
		/* splice the remaining entries back into "copy" */
		if (next_copy != NULL) {
			copy->cpy_hdr.nentries = remaining_entries;
			copy->cpy_hdr.links.next = next_copy;
			copy->cpy_hdr.links.prev = previous_prev;
			next_copy->vme_prev = vm_map_copy_to_entry(copy);
			copy->size = total_size;
		}
		vm_map_lock(dst_map);
		while (TRUE) {
			if (!vm_map_lookup_entry(dst_map,
			    base_addr, &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			if (tmp_entry->in_transition) {
				/*
				 * NOTE(review): this sets needs_wakeup on
				 * "entry" (the previous entry) rather than
				 * "tmp_entry" — looks suspicious; confirm
				 * against upstream before changing.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);
			} else {
				break;
			}
		}
		vm_map_clip_start(dst_map,
		    tmp_entry,
		    vm_map_trunc_page(base_addr,
		    VM_MAP_PAGE_MASK(dst_map)));

		entry = tmp_entry;
	} /* while */

	/*
	 * Throw away the vm_map_copy object
	 */
	if (discard_on_success) {
		vm_map_copy_discard(copy);
	}

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite */
9975
9976 static __attribute__((always_inline, warn_unused_result))
9977 kern_return_t
vm_map_copy_addr_size_sanitize(vm_map_t map,vm_map_offset_ut addr_u,vm_map_size_ut size_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * addr,vm_map_offset_t * end,vm_map_size_t * size)9978 vm_map_copy_addr_size_sanitize(
9979 vm_map_t map,
9980 vm_map_offset_ut addr_u,
9981 vm_map_size_ut size_u,
9982 vm_sanitize_caller_t vm_sanitize_caller,
9983 vm_map_offset_t *addr,
9984 vm_map_offset_t *end,
9985 vm_map_size_t *size)
9986 {
9987 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
9988 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES;
9989
9990
9991 return vm_sanitize_addr_size(addr_u, size_u,
9992 vm_sanitize_caller, map,
9993 flags,
9994 addr, end, size);
9995 }
9996
kern_return_t
vm_map_copy_overwrite(
	vm_map_t        dst_map,
	vm_map_offset_ut dst_addr_u,
	vm_map_copy_t   copy,
	vm_map_size_ut  copy_size_u,
	boolean_t       interruptible)
{
	vm_map_offset_t dst_addr, dst_end;
	vm_map_size_t   copy_size;
	vm_map_size_t   head_size, tail_size;
	vm_map_copy_t   head_copy, tail_copy;
	vm_map_offset_t head_addr, tail_addr;
	vm_map_entry_t  entry;
	kern_return_t   kr;
	vm_map_offset_t effective_page_mask, effective_page_size;
	uint16_t        copy_page_shift;

	/*
	 * Top-level overwrite entry point: sanitizes the untrusted
	 * address/size, then — when profitable — splits "copy" into an
	 * unaligned head, an aligned middle, and an unaligned tail so the
	 * bulk of the work can use the fast page-aligned path.  Falls back
	 * to a single vm_map_copy_overwrite_nested() call ("blunt_copy")
	 * whenever splitting is not possible or not worthwhile.
	 */

	head_size = 0;
	tail_size = 0;
	head_copy = NULL;
	tail_copy = NULL;
	head_addr = 0;
	tail_addr = 0;

	/*
	 * Check for null copy object.
	 */
	if (copy == VM_MAP_COPY_NULL) {
		return KERN_SUCCESS;
	}

	/*
	 * Sanitize any input parameters that are addr/size/prot/inherit
	 */
	kr = vm_map_copy_addr_size_sanitize(
		dst_map,
		dst_addr_u,
		copy_size_u,
		VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE,
		&dst_addr,
		&dst_end,
		&copy_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	if (interruptible ||
	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * We can't split the "copy" map if we're interruptible
		 * or if we don't have a "copy" map...
		 */
blunt_copy:
		kr = vm_map_copy_overwrite_nested(dst_map,
		    dst_addr,
		    copy,
		    interruptible,
		    (pmap_t) NULL,
		    TRUE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
		}
		return kr;
	}

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
	if (copy_page_shift < PAGE_SHIFT ||
	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		goto blunt_copy;
	}

	/*
	 * NOTE(review): this branch can never be taken — the check above
	 * already jumped to blunt_copy when VM_MAP_PAGE_SHIFT(dst_map) <
	 * PAGE_SHIFT — so only the "else" arm is reachable.  Confirm
	 * against upstream before simplifying.
	 */
	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
	} else {
		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
		    effective_page_mask);
	}
	effective_page_size = effective_page_mask + 1;

	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
		/*
		 * Too small to bother with optimizing...
		 */
		goto blunt_copy;
	}

	if ((dst_addr & effective_page_mask) !=
	    (copy->offset & effective_page_mask)) {
		/*
		 * Incompatible mis-alignment of source and destination...
		 */
		goto blunt_copy;
	}

	/*
	 * Proper alignment or identical mis-alignment at the beginning.
	 * Let's try and do a small unaligned copy first (if needed)
	 * and then an aligned copy for the rest.
	 */
	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
		head_addr = dst_addr;
		head_size = (effective_page_size -
		    (copy->offset & effective_page_mask));
		head_size = MIN(head_size, copy_size);
	}
	if (!vm_map_page_aligned(copy->offset + copy_size,
	    effective_page_mask)) {
		/*
		 * Mis-alignment at the end.
		 * Do an aligned copy up to the last page and
		 * then an unaligned copy for the remaining bytes.
		 */
		tail_size = ((copy->offset + copy_size) &
		    effective_page_mask);
		tail_size = MIN(tail_size, copy_size);
		tail_addr = dst_addr + copy_size - tail_size;
		assert(tail_addr >= head_addr + head_size);
	}
	assert(head_size + tail_size <= copy_size);

	if (head_size + tail_size == copy_size) {
		/*
		 * It's all unaligned, no optimization possible...
		 */
		goto blunt_copy;
	}

	/*
	 * Can't optimize if there are any submaps in the
	 * destination due to the way we free the "copy" map
	 * progressively in vm_map_copy_overwrite_nested()
	 * in that case.
	 */
	vm_map_lock_read(dst_map);
	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
		vm_map_unlock_read(dst_map);
		goto blunt_copy;
	}
	for (;
	    (entry != vm_map_to_entry(dst_map) &&
	    entry->vme_start < dst_addr + copy_size);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			vm_map_unlock_read(dst_map);
			goto blunt_copy;
		}
	}
	vm_map_unlock_read(dst_map);

	if (head_size) {
		/*
		 * Unaligned copy of the first "head_size" bytes, to reach
		 * a page boundary.
		 */

		/*
		 * Extract "head_copy" out of "copy".
		 */
		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		head_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		head_copy->cpy_hdr.page_shift = copy_page_shift;

		entry = vm_map_copy_first_entry(copy);
		if (entry->vme_end < copy->offset + head_size) {
			head_size = entry->vme_end - copy->offset;
		}

		head_copy->offset = copy->offset;
		head_copy->size = head_size;
		copy->offset += head_size;
		copy->size -= head_size;
		copy_size -= head_size;
		assert(copy_size > 0);

		vm_map_copy_clip_end(copy, entry, copy->offset);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(head_copy,
		    vm_map_copy_to_entry(head_copy),
		    entry);

		/*
		 * Do the unaligned copy.
		 */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    head_addr,
		    head_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr != KERN_SUCCESS) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
			goto done;
		}
	}

	if (tail_size) {
		/*
		 * Extract "tail_copy" out of "copy".
		 */
		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		tail_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		tail_copy->cpy_hdr.page_shift = copy_page_shift;

		tail_copy->offset = copy->offset + copy_size - tail_size;
		tail_copy->size = tail_size;

		copy->size -= tail_size;
		copy_size -= tail_size;
		assert(copy_size > 0);

		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(tail_copy,
		    vm_map_copy_last_entry(tail_copy),
		    entry);
	}

	/*
	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
	 * we want to avoid TOCTOU issues w.r.t copy->size but
	 * we don't need to change vm_map_copy_overwrite_nested()
	 * and all other vm_map_copy_overwrite variants.
	 *
	 * So we assign the original copy_size that was passed into
	 * this routine back to copy.
	 *
	 * This use of local 'copy_size' passed into this routine is
	 * to try and protect against TOCTOU attacks where the kernel
	 * has been exploited. We don't expect this to be an issue
	 * during normal system operation.
	 */
	assertf(copy->size == copy_size,
	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
	copy->size = copy_size;

	/*
	 * Copy most (or possibly all) of the data.
	 */
	kr = vm_map_copy_overwrite_nested(dst_map,
	    dst_addr + head_size,
	    copy,
	    interruptible,
	    (pmap_t) NULL,
	    FALSE);
	if (kr != KERN_SUCCESS) {
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
		goto done;
	}

	if (tail_size) {
		kr = vm_map_copy_overwrite_nested(dst_map,
		    tail_addr,
		    tail_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
		}
	}

done:
	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
	if (kr == KERN_SUCCESS) {
		/*
		 * Discard all the copy maps.
		 */
		if (head_copy) {
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		vm_map_copy_discard(copy);
		if (tail_copy) {
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	} else {
		/*
		 * Re-assemble the original copy map.
		 */
		if (head_copy) {
			entry = vm_map_copy_first_entry(head_copy);
			vm_map_copy_entry_unlink(head_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_to_entry(copy),
			    entry);
			copy->offset -= head_size;
			copy->size += head_size;
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		if (tail_copy) {
			entry = vm_map_copy_last_entry(tail_copy);
			vm_map_copy_entry_unlink(tail_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    entry);
			copy->size += tail_size;
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	}
	return kr;
}
10313
10314
10315 /*
10316 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
10317 *
 * Description:
10319 * Physically copy unaligned data
10320 *
10321 * Implementation:
10322 * Unaligned parts of pages have to be physically copied. We use
 * a modified form of vm_fault_copy (which understands non-aligned
10324 * page offsets and sizes) to do the copy. We attempt to copy as
 * much memory in one go as possible, however vm_fault_copy copies
10326 * within 1 memory object so we have to find the smaller of "amount left"
10327 * "source object data size" and "target object data size". With
10328 * unaligned data we don't need to split regions, therefore the source
10329 * (copy) object should be one map entry, the target range may be split
10330 * over multiple map entries however. In any event we are pessimistic
10331 * about these assumptions.
10332 *
10333 * Callers of this function must call vm_map_copy_require on
10334 * previously created vm_map_copy_t or pass a newly created
10335 * one to ensure that it hasn't been forged.
10336 *
10337 * Assumptions:
10338 * dst_map is locked on entry and is return locked on success,
10339 * unlocked on error.
10340 */
10341
static kern_return_t
vm_map_copy_overwrite_unaligned(
	vm_map_t        dst_map,
	vm_map_entry_t  entry,
	vm_map_copy_t   copy,
	vm_map_offset_t start,
	boolean_t       discard_on_success)
{
	vm_map_entry_t          copy_entry;
	vm_map_entry_t          copy_entry_next;
	vm_map_version_t        version;
	vm_object_t             dst_object;
	vm_object_offset_t      dst_offset;
	vm_object_offset_t      src_offset;
	vm_object_offset_t      entry_offset;
	vm_map_offset_t         entry_end;
	vm_map_size_t           src_size,
	    dst_size,
	    copy_size,
	    amount_left;
	kern_return_t           kr = KERN_SUCCESS;

	/*
	 * Physically copy "copy" into dst_map at "start" when page
	 * alignment cannot be assumed.  Each iteration copies the largest
	 * run bounded by both the current destination entry and the
	 * current source copy entry, via vm_fault_copy().  dst_map is
	 * locked on entry; returns locked on success, unlocked on error.
	 */

	copy_entry = vm_map_copy_first_entry(copy);

	vm_map_lock_write_to_read(dst_map);

	/* offset of copy->offset within its first page */
	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
	amount_left = copy->size;
	/*
	 * unaligned so we never clipped this entry, we need the offset into
	 * the vm_object not just the data.
	 */
	while (amount_left > 0) {
		if (entry == vm_map_to_entry(dst_map)) {
			vm_map_unlock_read(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/* "start" must be within the current map entry */
		assert((start >= entry->vme_start) && (start < entry->vme_end));

		/*
		 * Check protection again
		 */
		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}
		if (entry->is_sub_map) {
			/* not implemented... */
			vm_map_unlock_read(dst_map);
			return KERN_INVALID_ARGUMENT;
		}
		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}
		/*
		 * If the entry is in transition, we must wait
		 * for it to exit that state.  Anything could happen
		 * when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto RetryLookup;
		}

		dst_offset = start - entry->vme_start;

		dst_size = entry->vme_end - start;

		src_size = copy_entry->vme_end -
		    (copy_entry->vme_start + src_offset);

		if (dst_size < src_size) {
			/*
			 * we can only copy dst_size bytes before
			 * we have to get the next destination entry
			 */
			copy_size = dst_size;
		} else {
			/*
			 * we can only copy src_size bytes before
			 * we have to get the next source copy entry
			 */
			copy_size = src_size;
		}

		if (copy_size > amount_left) {
			copy_size = amount_left;
		}
		/*
		 * Entry needs copy, create a shadow shadow object for
		 * Copy on write region.
		 */
		assert(!entry->is_sub_map);
		if (entry->needs_copy) {
			if (vm_map_lock_read_to_write(dst_map)) {
				/* upgrade failed: lock dropped, must re-lookup */
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			VME_OBJECT_SHADOW(entry,
			    (vm_map_size_t)(entry->vme_end
			    - entry->vme_start),
			    vm_map_always_shadow(dst_map));
			entry->needs_copy = FALSE;
			vm_map_lock_write_to_read(dst_map);
		}
		dst_object = VME_OBJECT(entry);
		/*
		 * unlike with the virtual (aligned) copy we're going
		 * to fault on it therefore we need a target object.
		 */
		if (dst_object == VM_OBJECT_NULL) {
			if (vm_map_lock_read_to_write(dst_map)) {
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			dst_object = vm_object_allocate((vm_map_size_t)
			    entry->vme_end - entry->vme_start);
			VME_OBJECT_SET(entry, dst_object, false, 0);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);
			vm_map_lock_write_to_read(dst_map);
		}
		/*
		 * Take an object reference and unlock map. The "entry" may
		 * disappear or change when the map is unlocked.
		 */
		vm_object_reference(dst_object);
		version.main_timestamp = dst_map->timestamp;
		entry_offset = VME_OFFSET(entry);
		entry_end = entry->vme_end;
		vm_map_unlock_read(dst_map);
		/*
		 * Copy as much as possible in one pass
		 */
		kr = vm_fault_copy(
			VME_OBJECT(copy_entry),
			VME_OFFSET(copy_entry) + src_offset,
			&copy_size,
			dst_object,
			entry_offset + dst_offset,
			dst_map,
			&version,
			THREAD_UNINT );

		/* vm_fault_copy updates copy_size to what was actually copied */
		start += copy_size;
		src_offset += copy_size;
		amount_left -= copy_size;
		/*
		 * Release the object reference
		 */
		vm_object_deallocate(dst_object);
		/*
		 * If a hard error occurred, return it now
		 */
		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
		    || amount_left == 0) {
			/*
			 * all done with this copy entry, dispose.
			 */
			copy_entry_next = copy_entry->vme_next;

			if (discard_on_success) {
				vm_map_copy_entry_unlink(copy, copy_entry);
				assert(!copy_entry->is_sub_map);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
			    amount_left) {
				/*
				 * not finished copying but run out of source
				 */
				return KERN_INVALID_ADDRESS;
			}

			copy_entry = copy_entry_next;

			src_offset = 0;
		}

		if (amount_left == 0) {
			return KERN_SUCCESS;
		}

		vm_map_lock_read(dst_map);
		if (version.main_timestamp == dst_map->timestamp) {
			if (start == entry_end) {
				/*
				 * destination region is split.  Use the version
				 * information to avoid a lookup in the normal
				 * case.
				 */
				entry = entry->vme_next;
				/*
				 * should be contiguous. Fail if we encounter
				 * a hole in the destination.
				 */
				if (start != entry->vme_start) {
					vm_map_unlock_read(dst_map);
					return KERN_INVALID_ADDRESS;
				}
			}
		} else {
			/*
			 * Map version check failed.
			 * we must lookup the entry because somebody
			 * might have changed the map behind our backs.
			 */
RetryLookup:
			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
				vm_map_unlock_read(dst_map);
				return KERN_INVALID_ADDRESS;
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_unaligned */
10574
10575 /*
10576 * Routine: vm_map_copy_overwrite_aligned [internal use only]
10577 *
10578 * Description:
10579 * Does all the vm_trickery possible for whole pages.
10580 *
10581 * Implementation:
10582 *
10583 * If there are no permanent objects in the destination,
10584 * and the source and destination map entry zones match,
10585 * and the destination map entry is not shared,
10586 * then the map entries can be deleted and replaced
10587 * with those from the copy. The following code is the
10588 * basic idea of what to do, but there are lots of annoying
10589 * little details about getting protection and inheritance
10590 * right. Should add protection, inheritance, and sharing checks
10591 * to the above pass and make sure that no wiring is involved.
10592 *
10593 * Callers of this function must call vm_map_copy_require on
10594 * previously created vm_map_copy_t or pass a newly created
10595 * one to ensure that it hasn't been forged.
10596 */
10597
/*
 * Telemetry counters for vm_map_copy_overwrite_aligned(): each one counts
 * how often the optimized entry-substitution path is abandoned in favor of
 * the slow (physical) copy path, broken down by reason.
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;  /* source object is external (non-anonymous) */
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0; /* source object true_share or non-SYMMETRIC copy strategy */
int vm_map_copy_overwrite_aligned_src_large = 0;         /* small copy out of a very large source object */
10601
10602 static kern_return_t
vm_map_copy_overwrite_aligned(vm_map_t dst_map,vm_map_entry_t tmp_entry,vm_map_copy_t copy,vm_map_offset_t start,__unused pmap_t pmap)10603 vm_map_copy_overwrite_aligned(
10604 vm_map_t dst_map,
10605 vm_map_entry_t tmp_entry,
10606 vm_map_copy_t copy,
10607 vm_map_offset_t start,
10608 __unused pmap_t pmap)
10609 {
10610 vm_object_t object;
10611 vm_map_entry_t copy_entry;
10612 vm_map_size_t copy_size;
10613 vm_map_size_t size;
10614 vm_map_entry_t entry;
10615
10616 while ((copy_entry = vm_map_copy_first_entry(copy))
10617 != vm_map_copy_to_entry(copy)) {
10618 copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10619
10620 entry = tmp_entry;
10621
10622 if (entry->is_sub_map) {
10623 /* unnested when clipped earlier */
10624 assert(!entry->use_pmap);
10625 }
10626 if (entry == vm_map_to_entry(dst_map)) {
10627 vm_map_unlock(dst_map);
10628 return KERN_INVALID_ADDRESS;
10629 }
10630 size = (entry->vme_end - entry->vme_start);
10631 /*
10632 * Make sure that no holes popped up in the
10633 * address map, and that the protection is
10634 * still valid, in case the map was unlocked
10635 * earlier.
10636 */
10637
10638 if ((entry->vme_start != start) || ((entry->is_sub_map)
10639 && !entry->needs_copy)) {
10640 vm_map_unlock(dst_map);
10641 return KERN_INVALID_ADDRESS;
10642 }
10643 assert(entry != vm_map_to_entry(dst_map));
10644
10645 /*
10646 * Check protection again
10647 */
10648
10649 if (!(entry->protection & VM_PROT_WRITE)) {
10650 vm_map_unlock(dst_map);
10651 return KERN_PROTECTION_FAILURE;
10652 }
10653
10654 if (entry->is_sub_map) {
10655 /* not properly implemented */
10656 vm_map_unlock(dst_map);
10657 return KERN_PROTECTION_FAILURE;
10658 }
10659
10660 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10661 vm_map_unlock(dst_map);
10662 return KERN_PROTECTION_FAILURE;
10663 }
10664
10665 /*
10666 * If the entry is in transition, we must wait
10667 * for it to exit that state. Anything could happen
10668 * when we unlock the map, so start over.
10669 */
10670 if (entry->in_transition) {
10671 /*
10672 * Say that we are waiting, and wait for entry.
10673 */
10674 entry->needs_wakeup = TRUE;
10675 vm_map_entry_wait(dst_map, THREAD_UNINT);
10676
10677 goto RetryLookup;
10678 }
10679
10680 /*
10681 * Adjust to source size first
10682 */
10683
10684 if (copy_size < size) {
10685 if (entry->map_aligned &&
10686 !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10687 VM_MAP_PAGE_MASK(dst_map))) {
10688 /* no longer map-aligned */
10689 entry->map_aligned = FALSE;
10690 }
10691 vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10692 size = copy_size;
10693 }
10694
10695 /*
10696 * Adjust to destination size
10697 */
10698
10699 if (size < copy_size) {
10700 vm_map_copy_clip_end(copy, copy_entry,
10701 copy_entry->vme_start + size);
10702 copy_size = size;
10703 }
10704
10705 assert((entry->vme_end - entry->vme_start) == size);
10706 assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10707 assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10708
10709 /*
10710 * If the destination contains temporary unshared memory,
10711 * we can perform the copy by throwing it away and
10712 * installing the source data.
10713 *
10714 * Exceptions for mappings with special semantics:
10715 * + "permanent" entries,
10716 * + JIT regions,
10717 * + TPRO regions,
10718 * + pmap-specific protection policies,
10719 * + VM objects with COPY_NONE copy strategy.
10720 */
10721
10722 object = VME_OBJECT(entry);
10723 if ((!entry->is_shared &&
10724 !entry->vme_permanent &&
10725 !entry->used_for_jit &&
10726 #if __arm64e__
10727 !entry->used_for_tpro &&
10728 #endif /* __arm64e__ */
10729 !(entry->protection & VM_PROT_EXECUTE) &&
10730 !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10731 ((object == VM_OBJECT_NULL) ||
10732 (object->internal &&
10733 !object->true_share &&
10734 object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10735 entry->needs_copy) {
10736 vm_object_t old_object = VME_OBJECT(entry);
10737 vm_object_offset_t old_offset = VME_OFFSET(entry);
10738 vm_object_offset_t offset;
10739
10740 assert(!entry->is_sub_map);
10741 /*
10742 * Ensure that the source and destination aren't
10743 * identical
10744 */
10745 if (old_object == VME_OBJECT(copy_entry) &&
10746 old_offset == VME_OFFSET(copy_entry)) {
10747 vm_map_copy_entry_unlink(copy, copy_entry);
10748 vm_map_copy_entry_dispose(copy_entry);
10749
10750 if (old_object != VM_OBJECT_NULL) {
10751 vm_object_deallocate(old_object);
10752 }
10753
10754 start = tmp_entry->vme_end;
10755 tmp_entry = tmp_entry->vme_next;
10756 continue;
10757 }
10758
10759 #if XNU_TARGET_OS_OSX
10760 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10761 #define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */
10762 if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10763 VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10764 copy_size <= __TRADEOFF1_COPY_SIZE) {
10765 /*
10766 * Virtual vs. Physical copy tradeoff #1.
10767 *
10768 * Copying only a few pages out of a large
10769 * object: do a physical copy instead of
10770 * a virtual copy, to avoid possibly keeping
10771 * the entire large object alive because of
10772 * those few copy-on-write pages.
10773 */
10774 vm_map_copy_overwrite_aligned_src_large++;
10775 goto slow_copy;
10776 }
10777 #endif /* XNU_TARGET_OS_OSX */
10778
10779 if ((dst_map->pmap != kernel_pmap) &&
10780 (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10781 (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10782 vm_object_t new_object, new_shadow;
10783
10784 /*
10785 * We're about to map something over a mapping
10786 * established by malloc()...
10787 */
10788 new_object = VME_OBJECT(copy_entry);
10789 if (new_object != VM_OBJECT_NULL) {
10790 vm_object_lock_shared(new_object);
10791 }
10792 while (new_object != VM_OBJECT_NULL &&
10793 #if XNU_TARGET_OS_OSX
10794 !new_object->true_share &&
10795 new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10796 #endif /* XNU_TARGET_OS_OSX */
10797 new_object->internal) {
10798 new_shadow = new_object->shadow;
10799 if (new_shadow == VM_OBJECT_NULL) {
10800 break;
10801 }
10802 vm_object_lock_shared(new_shadow);
10803 vm_object_unlock(new_object);
10804 new_object = new_shadow;
10805 }
10806 if (new_object != VM_OBJECT_NULL) {
10807 if (!new_object->internal) {
10808 /*
10809 * The new mapping is backed
10810 * by an external object. We
10811 * don't want malloc'ed memory
10812 * to be replaced with such a
10813 * non-anonymous mapping, so
10814 * let's go off the optimized
10815 * path...
10816 */
10817 vm_map_copy_overwrite_aligned_src_not_internal++;
10818 vm_object_unlock(new_object);
10819 goto slow_copy;
10820 }
10821 #if XNU_TARGET_OS_OSX
10822 if (new_object->true_share ||
10823 new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10824 /*
10825 * Same if there's a "true_share"
10826 * object in the shadow chain, or
10827 * an object with a non-default
10828 * (SYMMETRIC) copy strategy.
10829 */
10830 vm_map_copy_overwrite_aligned_src_not_symmetric++;
10831 vm_object_unlock(new_object);
10832 goto slow_copy;
10833 }
10834 #endif /* XNU_TARGET_OS_OSX */
10835 vm_object_unlock(new_object);
10836 }
10837 /*
10838 * The new mapping is still backed by
10839 * anonymous (internal) memory, so it's
10840 * OK to substitute it for the original
10841 * malloc() mapping.
10842 */
10843 }
10844
10845 if (old_object != VM_OBJECT_NULL) {
10846 assert(!entry->vme_permanent);
10847 if (entry->is_sub_map) {
10848 if (entry->use_pmap) {
10849 #ifndef NO_NESTED_PMAP
10850 pmap_unnest(dst_map->pmap,
10851 (addr64_t)entry->vme_start,
10852 entry->vme_end - entry->vme_start);
10853 #endif /* NO_NESTED_PMAP */
10854 if (dst_map->mapped_in_other_pmaps) {
10855 /* clean up parent */
10856 /* map/maps */
10857 vm_map_submap_pmap_clean(
10858 dst_map, entry->vme_start,
10859 entry->vme_end,
10860 VME_SUBMAP(entry),
10861 VME_OFFSET(entry));
10862 }
10863 } else {
10864 vm_map_submap_pmap_clean(
10865 dst_map, entry->vme_start,
10866 entry->vme_end,
10867 VME_SUBMAP(entry),
10868 VME_OFFSET(entry));
10869 }
10870 vm_map_deallocate(VME_SUBMAP(entry));
10871 } else {
10872 if (dst_map->mapped_in_other_pmaps) {
10873 vm_object_pmap_protect_options(
10874 VME_OBJECT(entry),
10875 VME_OFFSET(entry),
10876 entry->vme_end
10877 - entry->vme_start,
10878 PMAP_NULL,
10879 PAGE_SIZE,
10880 entry->vme_start,
10881 VM_PROT_NONE,
10882 PMAP_OPTIONS_REMOVE);
10883 } else {
10884 pmap_remove_options(
10885 dst_map->pmap,
10886 (addr64_t)(entry->vme_start),
10887 (addr64_t)(entry->vme_end),
10888 PMAP_OPTIONS_REMOVE);
10889 }
10890 vm_object_deallocate(old_object);
10891 }
10892 }
10893
10894 if (entry->iokit_acct) {
10895 /* keep using iokit accounting */
10896 entry->use_pmap = FALSE;
10897 } else {
10898 /* use pmap accounting */
10899 entry->use_pmap = TRUE;
10900 }
10901 assert(!entry->vme_permanent);
10902 VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10903 object = VME_OBJECT(entry);
10904 entry->needs_copy = copy_entry->needs_copy;
10905 entry->wired_count = 0;
10906 entry->user_wired_count = 0;
10907 offset = VME_OFFSET(copy_entry);
10908 VME_OFFSET_SET(entry, offset);
10909
10910 vm_map_copy_entry_unlink(copy, copy_entry);
10911 vm_map_copy_entry_dispose(copy_entry);
10912
10913 /*
10914 * we could try to push pages into the pmap at this point, BUT
10915 * this optimization only saved on average 2 us per page if ALL
10916 * the pages in the source were currently mapped
10917 * and ALL the pages in the dest were touched, if there were fewer
10918 * than 2/3 of the pages touched, this optimization actually cost more cycles
10919 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
10920 */
10921
10922 /*
10923 * Set up for the next iteration. The map
10924 * has not been unlocked, so the next
10925 * address should be at the end of this
10926 * entry, and the next map entry should be
10927 * the one following it.
10928 */
10929
10930 start = tmp_entry->vme_end;
10931 tmp_entry = tmp_entry->vme_next;
10932 } else {
10933 vm_map_version_t version;
10934 vm_object_t dst_object;
10935 vm_object_offset_t dst_offset;
10936 kern_return_t r;
10937
10938 slow_copy:
10939 if (entry->needs_copy) {
10940 VME_OBJECT_SHADOW(entry,
10941 (entry->vme_end -
10942 entry->vme_start),
10943 vm_map_always_shadow(dst_map));
10944 entry->needs_copy = FALSE;
10945 }
10946
10947 dst_object = VME_OBJECT(entry);
10948 dst_offset = VME_OFFSET(entry);
10949
10950 /*
10951 * Take an object reference, and record
10952 * the map version information so that the
10953 * map can be safely unlocked.
10954 */
10955
10956 if (dst_object == VM_OBJECT_NULL) {
10957 /*
10958 * We would usually have just taken the
10959 * optimized path above if the destination
10960 * object has not been allocated yet. But we
10961 * now disable that optimization if the copy
10962 * entry's object is not backed by anonymous
10963 * memory to avoid replacing malloc'ed
10964 * (i.e. re-usable) anonymous memory with a
10965 * not-so-anonymous mapping.
10966 * So we have to handle this case here and
10967 * allocate a new VM object for this map entry.
10968 */
10969 dst_object = vm_object_allocate(
10970 entry->vme_end - entry->vme_start);
10971 dst_offset = 0;
10972 VME_OBJECT_SET(entry, dst_object, false, 0);
10973 VME_OFFSET_SET(entry, dst_offset);
10974 assert(entry->use_pmap);
10975 }
10976
10977 vm_object_reference(dst_object);
10978
10979 /* account for unlock bumping up timestamp */
10980 version.main_timestamp = dst_map->timestamp + 1;
10981
10982 vm_map_unlock(dst_map);
10983
10984 /*
10985 * Copy as much as possible in one pass
10986 */
10987
10988 copy_size = size;
10989 r = vm_fault_copy(
10990 VME_OBJECT(copy_entry),
10991 VME_OFFSET(copy_entry),
10992 ©_size,
10993 dst_object,
10994 dst_offset,
10995 dst_map,
10996 &version,
10997 THREAD_UNINT );
10998
10999 /*
11000 * Release the object reference
11001 */
11002
11003 vm_object_deallocate(dst_object);
11004
11005 /*
11006 * If a hard error occurred, return it now
11007 */
11008
11009 if (r != KERN_SUCCESS) {
11010 return r;
11011 }
11012
11013 if (copy_size != 0) {
11014 /*
11015 * Dispose of the copied region
11016 */
11017
11018 vm_map_copy_clip_end(copy, copy_entry,
11019 copy_entry->vme_start + copy_size);
11020 vm_map_copy_entry_unlink(copy, copy_entry);
11021 vm_object_deallocate(VME_OBJECT(copy_entry));
11022 vm_map_copy_entry_dispose(copy_entry);
11023 }
11024
11025 /*
11026 * Pick up in the destination map where we left off.
11027 *
11028 * Use the version information to avoid a lookup
11029 * in the normal case.
11030 */
11031
11032 start += copy_size;
11033 vm_map_lock(dst_map);
11034 if (version.main_timestamp == dst_map->timestamp &&
11035 copy_size != 0) {
11036 /* We can safely use saved tmp_entry value */
11037
11038 if (tmp_entry->map_aligned &&
11039 !VM_MAP_PAGE_ALIGNED(
11040 start,
11041 VM_MAP_PAGE_MASK(dst_map))) {
11042 /* no longer map-aligned */
11043 tmp_entry->map_aligned = FALSE;
11044 }
11045 vm_map_clip_end(dst_map, tmp_entry, start);
11046 tmp_entry = tmp_entry->vme_next;
11047 } else {
11048 /* Must do lookup of tmp_entry */
11049
11050 RetryLookup:
11051 if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11052 vm_map_unlock(dst_map);
11053 return KERN_INVALID_ADDRESS;
11054 }
11055 if (tmp_entry->map_aligned &&
11056 !VM_MAP_PAGE_ALIGNED(
11057 start,
11058 VM_MAP_PAGE_MASK(dst_map))) {
11059 /* no longer map-aligned */
11060 tmp_entry->map_aligned = FALSE;
11061 }
11062 vm_map_clip_start(dst_map, tmp_entry, start);
11063 }
11064 }
11065 }/* while */
11066
11067 return KERN_SUCCESS;
11068 }/* vm_map_copy_overwrite_aligned */
11069
11070 /*
11071 * Routine: vm_map_copyin_kernel_buffer [internal use only]
11072 *
11073 * Description:
11074 * Copy in data to a kernel buffer from space in the
11075 * source map. The original space may be optionally
11076 * deallocated.
11077 *
11078 * If successful, returns a new copy object.
11079 */
11080 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11081 vm_map_copyin_kernel_buffer(
11082 vm_map_t src_map,
11083 vm_map_offset_t src_addr,
11084 vm_map_size_t len,
11085 boolean_t src_destroy,
11086 vm_map_copy_t *copy_result)
11087 {
11088 kern_return_t kr;
11089 vm_map_copy_t copy;
11090 void *kdata;
11091
11092 if (len > msg_ool_size_small) {
11093 return KERN_INVALID_ARGUMENT;
11094 }
11095
11096 kdata = kalloc_data(len, Z_WAITOK);
11097 if (kdata == NULL) {
11098 return KERN_RESOURCE_SHORTAGE;
11099 }
11100 kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11101 if (kr != KERN_SUCCESS) {
11102 kfree_data(kdata, len);
11103 return kr;
11104 }
11105
11106 copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11107 copy->cpy_kdata = kdata;
11108 copy->size = len;
11109 copy->offset = 0;
11110
11111 if (src_destroy) {
11112 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11113
11114 if (src_map == kernel_map) {
11115 flags |= VM_MAP_REMOVE_KUNWIRE;
11116 }
11117
11118 (void)vm_map_remove_guard(src_map,
11119 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11120 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11121 flags, KMEM_GUARD_NONE);
11122 }
11123
11124 *copy_result = copy;
11125 return KERN_SUCCESS;
11126 }
11127
11128 /*
11129 * Routine: vm_map_copyout_kernel_buffer [internal use only]
11130 *
11131 * Description:
11132 * Copy out data from a kernel buffer into space in the
11133 * destination map. The space may be otpionally dynamically
11134 * allocated.
11135 *
11136 * If successful, consumes the copy object.
11137 * Otherwise, the caller is responsible for it.
11138 *
11139 * Callers of this function must call vm_map_copy_require on
11140 * previously created vm_map_copy_t or pass a newly created
11141 * one to ensure that it hasn't been forged.
11142 */
/* Counts copyout() failures seen while transferring a kernel-buffer copy to another map. */
static int vm_map_copyout_kernel_buffer_failures = 0;
static kern_return_t
vm_map_copyout_kernel_buffer(
	vm_map_t                map,
	vm_map_address_t        *addr,  /* IN/OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               overwrite,
	boolean_t               consume_on_success)
{
	kern_return_t kr = KERN_SUCCESS;
	thread_t thread = current_thread();

	/* caller must pass the copy object's own size */
	assert(copy->size == copy_size);

	/*
	 * check for corrupted vm_map_copy structure
	 * (kernel-buffer copies are always <= msg_ool_size_small with offset 0)
	 */
	if (copy_size > msg_ool_size_small || copy->offset) {
		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
		    (long long)copy->size, (long long)copy->offset);
	}

	if (!overwrite) {
		/*
		 * Allocate space in the target map for the data
		 */
		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

		if (map == kernel_map) {
			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
		}

		*addr = 0;
		kr = vm_map_enter(map,
		    addr,
		    vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(map)),
		    (vm_map_offset_t) 0,
		    vmk_flags,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0,
		    FALSE,
		    VM_PROT_DEFAULT,
		    VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
#if KASAN
		if (map->pmap == kernel_pmap) {
			kasan_notify_address(*addr, copy->size);
		}
#endif
	}

	/*
	 * Copyout the data from the kernel buffer to the target map.
	 */
	if (thread->map == map) {
		/*
		 * If the target map is the current map, just do
		 * the copy.
		 */
		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_t oldmap;

		/*
		 * If the target map is another map, assume the
		 * target's address space identity for the duration
		 * of the copy.
		 */
		vm_map_reference(map);
		oldmap = vm_map_switch(map);

		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			vm_map_copyout_kernel_buffer_failures++;
			kr = KERN_INVALID_ADDRESS;
		}

		/* restore the original address space identity */
		(void) vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}

	if (kr != KERN_SUCCESS) {
		/* the copy failed, clean up */
		if (!overwrite) {
			/*
			 * Deallocate the space we allocated in the target map.
			 */
			(void) vm_map_remove(map,
			    vm_map_trunc_page(*addr,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page((*addr +
			    vm_map_round_page(copy_size,
			    VM_MAP_PAGE_MASK(map))),
			    VM_MAP_PAGE_MASK(map)));
			*addr = 0;
		}
	} else {
		/* copy was successful, discard the copy structure */
		if (consume_on_success) {
			kfree_data(copy->cpy_kdata, copy_size);
			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
		}
	}

	return kr;
}
11257
11258 /*
11259 * Routine: vm_map_copy_insert [internal use only]
11260 *
11261 * Description:
11262 * Link a copy chain ("copy") into a map at the
11263 * specified location (after "where").
11264 *
11265 * Callers of this function must call vm_map_copy_require on
11266 * previously created vm_map_copy_t or pass a newly created
11267 * one to ensure that it hasn't been forged.
11268 * Side effects:
11269 * The copy chain is destroyed.
11270 */
11271 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)11272 vm_map_copy_insert(
11273 vm_map_t map,
11274 vm_map_entry_t after_where,
11275 vm_map_copy_t copy)
11276 {
11277 vm_map_entry_t entry;
11278
11279 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11280 entry = vm_map_copy_first_entry(copy);
11281 vm_map_copy_entry_unlink(copy, entry);
11282 vm_map_store_entry_link(map, after_where, entry,
11283 VM_MAP_KERNEL_FLAGS_NONE);
11284 after_where = entry;
11285 }
11286 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11287 }
11288
11289 /*
11290 * Callers of this function must call vm_map_copy_require on
11291 * previously created vm_map_copy_t or pass a newly created
11292 * one to ensure that it hasn't been forged.
11293 */
11294 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11295 vm_map_copy_remap(
11296 vm_map_t map,
11297 vm_map_entry_t where,
11298 vm_map_copy_t copy,
11299 vm_map_offset_t adjustment,
11300 vm_prot_t cur_prot,
11301 vm_prot_t max_prot,
11302 vm_inherit_t inheritance)
11303 {
11304 vm_map_entry_t copy_entry, new_entry;
11305
11306 for (copy_entry = vm_map_copy_first_entry(copy);
11307 copy_entry != vm_map_copy_to_entry(copy);
11308 copy_entry = copy_entry->vme_next) {
11309 /* get a new VM map entry for the map */
11310 new_entry = vm_map_entry_create(map);
11311 /* copy the "copy entry" to the new entry */
11312 vm_map_entry_copy(map, new_entry, copy_entry);
11313 /* adjust "start" and "end" */
11314 new_entry->vme_start += adjustment;
11315 new_entry->vme_end += adjustment;
11316 /* clear some attributes */
11317 new_entry->inheritance = inheritance;
11318 new_entry->protection = cur_prot;
11319 new_entry->max_protection = max_prot;
11320 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11321 /* take an extra reference on the entry's "object" */
11322 if (new_entry->is_sub_map) {
11323 assert(!new_entry->use_pmap); /* not nested */
11324 vm_map_reference(VME_SUBMAP(new_entry));
11325 } else {
11326 vm_object_reference(VME_OBJECT(new_entry));
11327 }
11328 /* insert the new entry in the map */
11329 vm_map_store_entry_link(map, where, new_entry,
11330 VM_MAP_KERNEL_FLAGS_NONE);
11331 /* continue inserting the "copy entries" after the new entry */
11332 where = new_entry;
11333 }
11334 }
11335
11336
11337 /*
11338 * Returns true if *size matches (or is in the range of) copy->size.
11339 * Upon returning true, the *size field is updated with the actual size of the
11340 * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11341 */
11342 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11343 vm_map_copy_validate_size(
11344 vm_map_t dst_map,
11345 vm_map_copy_t copy,
11346 vm_map_size_t *size)
11347 {
11348 if (copy == VM_MAP_COPY_NULL) {
11349 return FALSE;
11350 }
11351
11352 /*
11353 * Assert that the vm_map_copy is coming from the right
11354 * zone and hasn't been forged
11355 */
11356 vm_map_copy_require(copy);
11357
11358 vm_map_size_t copy_sz = copy->size;
11359 vm_map_size_t sz = *size;
11360 switch (copy->type) {
11361 case VM_MAP_COPY_KERNEL_BUFFER:
11362 if (sz == copy_sz) {
11363 return TRUE;
11364 }
11365 break;
11366 case VM_MAP_COPY_ENTRY_LIST:
11367 /*
11368 * potential page-size rounding prevents us from exactly
11369 * validating this flavor of vm_map_copy, but we can at least
11370 * assert that it's within a range.
11371 */
11372 if (copy_sz >= sz &&
11373 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11374 *size = copy_sz;
11375 return TRUE;
11376 }
11377 break;
11378 default:
11379 break;
11380 }
11381 return FALSE;
11382 }
11383
11384 static kern_return_t
vm_map_copyout_internal(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_ut copy_size_u,boolean_t consume_on_success,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)11385 vm_map_copyout_internal(
11386 vm_map_t dst_map,
11387 vm_map_address_t *dst_addr, /* OUT */
11388 vm_map_copy_t copy,
11389 vm_map_size_ut copy_size_u,
11390 boolean_t consume_on_success,
11391 vm_prot_t cur_protection,
11392 vm_prot_t max_protection,
11393 vm_inherit_t inheritance)
11394 {
11395 vm_map_size_t size, copy_size;
11396 vm_map_size_t adjustment;
11397 vm_map_offset_t start;
11398 vm_object_offset_t vm_copy_start;
11399 vm_map_entry_t last;
11400 vm_map_entry_t entry;
11401 vm_map_copy_t original_copy;
11402 kern_return_t kr;
11403 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11404
11405 /*
11406 * Check for null copy object.
11407 */
11408
11409 if (copy == VM_MAP_COPY_NULL) {
11410 *dst_addr = 0;
11411 return KERN_SUCCESS;
11412 }
11413
11414 /*
11415 * Assert that the vm_map_copy is coming from the right
11416 * zone and hasn't been forged
11417 */
11418 vm_map_copy_require(copy);
11419
11420 if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) {
11421 *dst_addr = 0;
11422 ktriage_record(thread_tid(current_thread()),
11423 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11424 KDBG_TRIAGE_RESERVED,
11425 KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR),
11426 KERN_FAILURE /* arg */);
11427 return KERN_FAILURE;
11428 }
11429 copy_size = copy->size;
11430
11431 /*
11432 * Check for special kernel buffer allocated
11433 * by new_ipc_kmsg_copyin.
11434 */
11435
11436 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11437 kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11438 copy, copy_size, FALSE,
11439 consume_on_success);
11440 if (kr) {
11441 ktriage_record(thread_tid(current_thread()),
11442 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11443 KDBG_TRIAGE_RESERVED,
11444 KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11445 }
11446 return kr;
11447 }
11448
11449 original_copy = copy;
11450 if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11451 vm_map_copy_t target_copy;
11452 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11453
11454 target_copy = VM_MAP_COPY_NULL;
11455 DEBUG4K_ADJUST("adjusting...\n");
11456 kr = vm_map_copy_adjust_to_target(
11457 copy,
11458 0, /* offset */
11459 copy->size, /* size */
11460 dst_map,
11461 TRUE, /* copy */
11462 &target_copy,
11463 &overmap_start,
11464 &overmap_end,
11465 &trimmed_start);
11466 if (kr != KERN_SUCCESS) {
11467 DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11468 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11469 return kr;
11470 }
11471 DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11472 if (target_copy != copy) {
11473 copy = target_copy;
11474 }
11475 copy_size = copy->size;
11476 }
11477
11478 /*
11479 * Find space for the data
11480 */
11481
11482 vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11483 VM_MAP_COPY_PAGE_MASK(copy));
11484 size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11485 VM_MAP_COPY_PAGE_MASK(copy))
11486 - vm_copy_start;
11487
11488 vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size);
11489
11490 vm_map_lock(dst_map);
11491 kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags,
11492 &start, &last);
11493 if (kr != KERN_SUCCESS) {
11494 vm_map_unlock(dst_map);
11495 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11496 return kr;
11497 }
11498
11499 adjustment = start - vm_copy_start;
11500 if (!consume_on_success) {
11501 /*
11502 * We're not allowed to consume "copy", so we'll have to
11503 * copy its map entries into the destination map below.
11504 * No need to re-allocate map entries from the correct
11505 * (pageable or not) zone, since we'll get new map entries
11506 * during the transfer.
11507 * We'll also adjust the map entries's "start" and "end"
11508 * during the transfer, to keep "copy"'s entries consistent
11509 * with its "offset".
11510 */
11511 goto after_adjustments;
11512 }
11513
11514 /*
11515 * Since we're going to just drop the map
11516 * entries from the copy into the destination
11517 * map, they must come from the same pool.
11518 */
11519
11520 if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11521 /*
11522 * Mismatches occur when dealing with the default
11523 * pager.
11524 */
11525 vm_map_entry_t next, new;
11526
11527 /*
11528 * Find the zone that the copies were allocated from
11529 */
11530
11531 entry = vm_map_copy_first_entry(copy);
11532
11533 /*
11534 * Reinitialize the copy so that vm_map_copy_entry_link
11535 * will work.
11536 */
11537 vm_map_store_copy_reset(copy, entry);
11538 copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11539
11540 /*
11541 * Copy each entry.
11542 */
11543 while (entry != vm_map_copy_to_entry(copy)) {
11544 new = vm_map_copy_entry_create(copy);
11545 vm_map_entry_copy_full(new, entry);
11546 new->vme_no_copy_on_read = FALSE;
11547 assert(!new->iokit_acct);
11548 if (new->is_sub_map) {
11549 /* clr address space specifics */
11550 new->use_pmap = FALSE;
11551 }
11552 vm_map_copy_entry_link(copy,
11553 vm_map_copy_last_entry(copy),
11554 new);
11555 next = entry->vme_next;
11556 vm_map_entry_dispose(entry);
11557 entry = next;
11558 }
11559 }
11560
11561 /*
11562 * Adjust the addresses in the copy chain, and
11563 * reset the region attributes.
11564 */
11565
11566 for (entry = vm_map_copy_first_entry(copy);
11567 entry != vm_map_copy_to_entry(copy);
11568 entry = entry->vme_next) {
11569 if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11570 /*
11571 * We're injecting this copy entry into a map that
11572 * has the standard page alignment, so clear
11573 * "map_aligned" (which might have been inherited
11574 * from the original map entry).
11575 */
11576 entry->map_aligned = FALSE;
11577 }
11578
11579 entry->vme_start += adjustment;
11580 entry->vme_end += adjustment;
11581
11582 if (entry->map_aligned) {
11583 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11584 VM_MAP_PAGE_MASK(dst_map)));
11585 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11586 VM_MAP_PAGE_MASK(dst_map)));
11587 }
11588
11589 entry->inheritance = VM_INHERIT_DEFAULT;
11590 entry->protection = VM_PROT_DEFAULT;
11591 entry->max_protection = VM_PROT_ALL;
11592 entry->behavior = VM_BEHAVIOR_DEFAULT;
11593
11594 /*
11595 * If the entry is now wired,
11596 * map the pages into the destination map.
11597 */
11598 if (entry->wired_count != 0) {
11599 vm_map_offset_t va;
11600 vm_object_offset_t offset;
11601 vm_object_t object;
11602 vm_prot_t prot;
11603 int type_of_fault;
11604 uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11605
11606 /* TODO4K would need to use actual page size */
11607 assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11608
11609 object = VME_OBJECT(entry);
11610 offset = VME_OFFSET(entry);
11611 va = entry->vme_start;
11612
11613 pmap_pageable(dst_map->pmap,
11614 entry->vme_start,
11615 entry->vme_end,
11616 TRUE);
11617
11618 while (va < entry->vme_end) {
11619 vm_page_t m;
11620 struct vm_object_fault_info fault_info = {};
11621
11622 /*
11623 * Look up the page in the object.
11624 * Assert that the page will be found in the
11625 * top object:
11626 * either
11627 * the object was newly created by
11628 * vm_object_copy_slowly, and has
11629 * copies of all of the pages from
11630 * the source object
11631 * or
11632 * the object was moved from the old
11633 * map entry; because the old map
11634 * entry was wired, all of the pages
11635 * were in the top-level object.
11636 * (XXX not true if we wire pages for
11637 * reading)
11638 */
11639 vm_object_lock(object);
11640
11641 m = vm_page_lookup(object, offset);
11642 if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11643 m->vmp_absent) {
11644 panic("vm_map_copyout: wiring %p", m);
11645 }
11646
11647 prot = entry->protection;
11648
11649 if (override_nx(dst_map, VME_ALIAS(entry)) &&
11650 prot) {
11651 prot |= VM_PROT_EXECUTE;
11652 }
11653
11654 type_of_fault = DBG_CACHE_HIT_FAULT;
11655
11656 fault_info.user_tag = VME_ALIAS(entry);
11657 fault_info.pmap_options = 0;
11658 if (entry->iokit_acct ||
11659 (!entry->is_sub_map && !entry->use_pmap)) {
11660 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11661 }
11662 if (entry->vme_xnu_user_debug &&
11663 !VM_PAGE_OBJECT(m)->code_signed) {
11664 /*
11665 * Modified code-signed executable
11666 * region: this page does not belong
11667 * to a code-signed VM object, so it
11668 * must have been copied and should
11669 * therefore be typed XNU_USER_DEBUG
11670 * rather than XNU_USER_EXEC.
11671 */
11672 fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11673 }
11674
11675 vm_fault_enter(m,
11676 dst_map->pmap,
11677 va,
11678 PAGE_SIZE, 0,
11679 prot,
11680 prot,
11681 VM_PAGE_WIRED(m),
11682 FALSE, /* change_wiring */
11683 VM_KERN_MEMORY_NONE, /* tag - not wiring */
11684 &fault_info,
11685 NULL, /* need_retry */
11686 &type_of_fault,
11687 &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/
11688
11689 vm_object_unlock(object);
11690
11691 offset += PAGE_SIZE_64;
11692 va += PAGE_SIZE;
11693 }
11694 }
11695 }
11696
11697 after_adjustments:
11698
11699 /*
11700 * Correct the page alignment for the result
11701 */
11702
11703 *dst_addr = start + (copy->offset - vm_copy_start);
11704
11705 #if KASAN
11706 kasan_notify_address(*dst_addr, size);
11707 #endif
11708
11709 /*
11710 * Update the hints and the map size
11711 */
11712
11713 if (consume_on_success) {
11714 SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11715 } else {
11716 SAVE_HINT_MAP_WRITE(dst_map, last);
11717 }
11718
11719 dst_map->size += size;
11720
11721 /*
11722 * Link in the copy
11723 */
11724
11725 if (consume_on_success) {
11726 vm_map_copy_insert(dst_map, last, copy);
11727 if (copy != original_copy) {
11728 vm_map_copy_discard(original_copy);
11729 original_copy = VM_MAP_COPY_NULL;
11730 }
11731 } else {
11732 vm_map_copy_remap(dst_map, last, copy, adjustment,
11733 cur_protection, max_protection,
11734 inheritance);
11735 if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11736 vm_map_copy_discard(copy);
11737 copy = original_copy;
11738 }
11739 }
11740
11741
11742 vm_map_unlock(dst_map);
11743
11744 /*
11745 * XXX If wiring_required, call vm_map_pageable
11746 */
11747
11748 return KERN_SUCCESS;
11749 }
11750
11751 /*
11752 * Routine: vm_map_copyout_size
11753 *
11754 * Description:
11755 * Copy out a copy chain ("copy") into newly-allocated
11756 * space in the destination map. Uses a prevalidated
11757 * size for the copy object (vm_map_copy_validate_size).
11758 *
11759 * If successful, consumes the copy object.
11760 * Otherwise, the caller is responsible for it.
11761 */
11762 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_ut copy_size)11763 vm_map_copyout_size(
11764 vm_map_t dst_map,
11765 vm_map_address_t *dst_addr, /* OUT */
11766 vm_map_copy_t copy,
11767 vm_map_size_ut copy_size)
11768 {
11769 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11770 TRUE, /* consume_on_success */
11771 VM_PROT_DEFAULT,
11772 VM_PROT_ALL,
11773 VM_INHERIT_DEFAULT);
11774 }
11775
11776 /*
11777 * Routine: vm_map_copyout
11778 *
11779 * Description:
11780 * Copy out a copy chain ("copy") into newly-allocated
11781 * space in the destination map.
11782 *
11783 * If successful, consumes the copy object.
11784 * Otherwise, the caller is responsible for it.
11785 */
11786 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11787 vm_map_copyout(
11788 vm_map_t dst_map,
11789 vm_map_address_t *dst_addr, /* OUT */
11790 vm_map_copy_t copy)
11791 {
11792 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11793 TRUE, /* consume_on_success */
11794 VM_PROT_DEFAULT,
11795 VM_PROT_ALL,
11796 VM_INHERIT_DEFAULT);
11797 }
11798
11799 /*
11800 * Routine: vm_map_copyin
11801 *
11802 * Description:
11803 * see vm_map_copyin_common. Exported via Unsupported.exports.
11804 *
11805 */
11806 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_ut src_addr,vm_map_size_ut len,boolean_t src_destroy,vm_map_copy_t * copy_result)11807 vm_map_copyin(
11808 vm_map_t src_map,
11809 vm_map_address_ut src_addr,
11810 vm_map_size_ut len,
11811 boolean_t src_destroy,
11812 vm_map_copy_t *copy_result) /* OUT */
11813 {
11814 return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11815 FALSE, copy_result, FALSE);
11816 }
11817
11818 /*
11819 * Routine: vm_map_copyin_common
11820 *
11821 * Description:
11822 * Copy the specified region (src_addr, len) from the
11823 * source address space (src_map), possibly removing
11824 * the region from the source address space (src_destroy).
11825 *
11826 * Returns:
11827 * A vm_map_copy_t object (copy_result), suitable for
11828 * insertion into another address space (using vm_map_copyout),
11829 * copying over another address space region (using
11830 * vm_map_copy_overwrite). If the copy is unused, it
11831 * should be destroyed (using vm_map_copy_discard).
11832 *
11833 * In/out conditions:
11834 * The source map should not be locked on entry.
11835 */
11836
/*
 * Stack record used while descending into submaps during a copy-in:
 * one is pushed per submap level so the traversal can later pop back
 * up and resume in the enclosing map.
 */
typedef struct submap_map {
	vm_map_t        parent_map;     /* map containing the submap entry */
	vm_map_offset_t base_start;     /* start of the range in parent_map */
	vm_map_offset_t base_end;       /* end of the range in parent_map */
	vm_map_size_t   base_len;       /* portion of the range covered at this level */
	struct submap_map *next;        /* next (shallower) level on the stack */
} submap_map_t;
11844
11845 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_ut src_addr,vm_map_size_ut len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11846 vm_map_copyin_common(
11847 vm_map_t src_map,
11848 vm_map_address_ut src_addr,
11849 vm_map_size_ut len,
11850 boolean_t src_destroy,
11851 __unused boolean_t src_volatile,
11852 vm_map_copy_t *copy_result, /* OUT */
11853 boolean_t use_maxprot)
11854 {
11855 int flags;
11856
11857 flags = 0;
11858 if (src_destroy) {
11859 flags |= VM_MAP_COPYIN_SRC_DESTROY;
11860 }
11861 if (use_maxprot) {
11862 flags |= VM_MAP_COPYIN_USE_MAXPROT;
11863 }
11864 return vm_map_copyin_internal(src_map,
11865 src_addr,
11866 len,
11867 flags,
11868 copy_result);
11869 }
11870
11871 static __attribute__((always_inline, warn_unused_result))
11872 kern_return_t
vm_map_copyin_sanitize(vm_map_t src_map,vm_map_address_ut src_addr_u,vm_map_size_ut len_u,vm_map_offset_t * src_start,vm_map_offset_t * src_end,vm_map_size_t * len,vm_map_offset_t * src_addr_unaligned)11873 vm_map_copyin_sanitize(
11874 vm_map_t src_map,
11875 vm_map_address_ut src_addr_u,
11876 vm_map_size_ut len_u,
11877 vm_map_offset_t *src_start,
11878 vm_map_offset_t *src_end,
11879 vm_map_size_t *len,
11880 vm_map_offset_t *src_addr_unaligned)
11881 {
11882 kern_return_t kr;
11883 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS |
11884 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES;
11885
11886 if (src_map->pmap == kernel_pmap) {
11887 flags |= VM_SANITIZE_FLAGS_CANONICALIZE;
11888 }
11889
11890
11891 kr = vm_sanitize_addr_size(src_addr_u, len_u,
11892 VM_SANITIZE_CALLER_VM_MAP_COPYIN,
11893 src_map,
11894 flags,
11895 src_start, src_end, len);
11896 if (__improbable(kr != KERN_SUCCESS)) {
11897 return kr;
11898 }
11899
11900 /*
11901 * Compute (page aligned) start and end of region
11902 */
11903 *src_addr_unaligned = *src_start; /* remember unaligned value */
11904 *src_start = vm_map_trunc_page(*src_addr_unaligned,
11905 VM_MAP_PAGE_MASK(src_map));
11906 *src_end = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map));
11907 return KERN_SUCCESS;
11908 }
11909
11910 kern_return_t
vm_map_copyin_internal(vm_map_t src_map,vm_map_address_ut src_addr_u,vm_map_size_ut len_u,int flags,vm_map_copy_t * copy_result)11911 vm_map_copyin_internal(
11912 vm_map_t src_map,
11913 vm_map_address_ut src_addr_u,
11914 vm_map_size_ut len_u,
11915 int flags,
11916 vm_map_copy_t *copy_result) /* OUT */
11917 {
11918 vm_map_entry_t tmp_entry; /* Result of last map lookup --
11919 * in multi-level lookup, this
11920 * entry contains the actual
11921 * vm_object/offset.
11922 */
11923 vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */
11924
11925 vm_map_offset_t src_start; /* Start of current entry --
11926 * where copy is taking place now
11927 */
11928 vm_map_offset_t src_end; /* End of entire region to be
11929 * copied */
11930 vm_map_offset_t src_addr_unaligned;
11931 vm_map_offset_t src_base;
11932 vm_map_size_t len;
11933 vm_map_t base_map = src_map;
11934 boolean_t map_share = FALSE;
11935 submap_map_t *parent_maps = NULL;
11936
11937 vm_map_copy_t copy; /* Resulting copy */
11938 vm_map_address_t copy_addr;
11939 vm_map_size_t copy_size;
11940 boolean_t src_destroy;
11941 boolean_t use_maxprot;
11942 boolean_t preserve_purgeable;
11943 boolean_t entry_was_shared;
11944 vm_map_entry_t saved_src_entry;
11945 kern_return_t kr;
11946
11947
11948 if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11949 return KERN_INVALID_ARGUMENT;
11950 }
11951
11952 /*
11953 * Check for copies of zero bytes.
11954 */
11955 if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) {
11956 *copy_result = VM_MAP_COPY_NULL;
11957 return KERN_SUCCESS;
11958 }
11959
11960 /*
11961 * Sanitize any input parameters that are addr/size/prot/inherit
11962 */
11963 kr = vm_map_copyin_sanitize(
11964 src_map,
11965 src_addr_u,
11966 len_u,
11967 &src_start,
11968 &src_end,
11969 &len,
11970 &src_addr_unaligned);
11971 if (__improbable(kr != KERN_SUCCESS)) {
11972 return vm_sanitize_get_kr(kr);
11973 }
11974
11975 src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11976 use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11977 preserve_purgeable =
11978 (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11979
11980 /*
11981 * If the copy is sufficiently small, use a kernel buffer instead
11982 * of making a virtual copy. The theory being that the cost of
11983 * setting up VM (and taking C-O-W faults) dominates the copy costs
11984 * for small regions.
11985 */
11986 if ((len <= msg_ool_size_small) &&
11987 !use_maxprot &&
11988 !preserve_purgeable &&
11989 !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11990 /*
11991 * Since the "msg_ool_size_small" threshold was increased and
11992 * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11993 * address space limits, we revert to doing a virtual copy if the
11994 * copied range goes beyond those limits. Otherwise, mach_vm_read()
11995 * of the commpage would now fail when it used to work.
11996 */
11997 (src_start >= vm_map_min(src_map) &&
11998 src_start < vm_map_max(src_map) &&
11999 src_end >= vm_map_min(src_map) &&
12000 src_end < vm_map_max(src_map))) {
12001 return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len,
12002 src_destroy, copy_result);
12003 }
12004
12005 /*
12006 * Allocate a header element for the list.
12007 *
12008 * Use the start and end in the header to
12009 * remember the endpoints prior to rounding.
12010 */
12011
12012 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12013 copy->cpy_hdr.entries_pageable = TRUE;
12014 copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12015 copy->offset = src_addr_unaligned;
12016 copy->size = len;
12017
12018 new_entry = vm_map_copy_entry_create(copy);
12019
12020 #define RETURN(x) \
12021 MACRO_BEGIN \
12022 vm_map_unlock(src_map); \
12023 if(src_map != base_map) \
12024 vm_map_deallocate(src_map); \
12025 if (new_entry != VM_MAP_ENTRY_NULL) \
12026 vm_map_copy_entry_dispose(new_entry); \
12027 vm_map_copy_discard(copy); \
12028 { \
12029 submap_map_t *_ptr; \
12030 \
12031 for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12032 parent_maps=parent_maps->next; \
12033 if (_ptr->parent_map != base_map) \
12034 vm_map_deallocate(_ptr->parent_map); \
12035 kfree_type(submap_map_t, _ptr); \
12036 } \
12037 } \
12038 MACRO_RETURN(x); \
12039 MACRO_END
12040
12041 /*
12042 * Find the beginning of the region.
12043 */
12044
12045 vm_map_lock(src_map);
12046
12047 /*
12048 * Lookup the original "src_addr_unaligned" rather than the truncated
12049 * "src_start", in case "src_start" falls in a non-map-aligned
12050 * map entry *before* the map entry that contains "src_addr_unaligned"...
12051 */
12052 if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) {
12053 RETURN(KERN_INVALID_ADDRESS);
12054 }
12055 if (!tmp_entry->is_sub_map) {
12056 /*
12057 * ... but clip to the map-rounded "src_start" rather than
12058 * "src_addr_unaligned" to preserve map-alignment. We'll adjust the
12059 * first copy entry at the end, if needed.
12060 */
12061 vm_map_clip_start(src_map, tmp_entry, src_start);
12062 }
12063 if (src_start < tmp_entry->vme_start) {
12064 /*
12065 * Move "src_start" up to the start of the
12066 * first map entry to copy.
12067 */
12068 src_start = tmp_entry->vme_start;
12069 }
12070 /* set for later submap fix-up */
12071 copy_addr = src_start;
12072
12073 /*
12074 * Go through entries until we get to the end.
12075 */
12076
12077 while (TRUE) {
12078 vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */
12079 vm_map_size_t src_size; /* Size of source
12080 * map entry (in both
12081 * maps)
12082 */
12083
12084 vm_object_t src_object; /* Object to copy */
12085 vm_object_offset_t src_offset;
12086
12087 vm_object_t new_copy_object;/* vm_object_copy_* result */
12088
12089 boolean_t src_needs_copy; /* Should source map
12090 * be made read-only
12091 * for copy-on-write?
12092 */
12093
12094 boolean_t new_entry_needs_copy; /* Will new entry be COW? */
12095
12096 boolean_t was_wired; /* Was source wired? */
12097 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
12098 vm_map_version_t version; /* Version before locks
12099 * dropped to make copy
12100 */
12101 kern_return_t result; /* Return value from
12102 * copy_strategically.
12103 */
12104 while (tmp_entry->is_sub_map) {
12105 vm_map_size_t submap_len;
12106 submap_map_t *ptr;
12107
12108 ptr = kalloc_type(submap_map_t, Z_WAITOK);
12109 ptr->next = parent_maps;
12110 parent_maps = ptr;
12111 ptr->parent_map = src_map;
12112 ptr->base_start = src_start;
12113 ptr->base_end = src_end;
12114 submap_len = tmp_entry->vme_end - src_start;
12115 if (submap_len > (src_end - src_start)) {
12116 submap_len = src_end - src_start;
12117 }
12118 ptr->base_len = submap_len;
12119
12120 src_start -= tmp_entry->vme_start;
12121 src_start += VME_OFFSET(tmp_entry);
12122 src_end = src_start + submap_len;
12123 src_map = VME_SUBMAP(tmp_entry);
12124 vm_map_lock(src_map);
12125 /* keep an outstanding reference for all maps in */
12126 /* the parents tree except the base map */
12127 vm_map_reference(src_map);
12128 vm_map_unlock(ptr->parent_map);
12129 if (!vm_map_lookup_entry(
12130 src_map, src_start, &tmp_entry)) {
12131 RETURN(KERN_INVALID_ADDRESS);
12132 }
12133 map_share = TRUE;
12134 if (!tmp_entry->is_sub_map) {
12135 vm_map_clip_start(src_map, tmp_entry, src_start);
12136 }
12137 src_entry = tmp_entry;
12138 }
12139 /* we are now in the lowest level submap... */
12140
12141 if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12142 (VME_OBJECT(tmp_entry)->phys_contiguous)) {
			/* This is not supported for now. In future */
12144 /* we will need to detect the phys_contig */
12145 /* condition and then upgrade copy_slowly */
12146 /* to do physical copy from the device mem */
12147 /* based object. We can piggy-back off of */
12148 /* the was wired boolean to set-up the */
12149 /* proper handling */
12150 RETURN(KERN_PROTECTION_FAILURE);
12151 }
12152 /*
12153 * Create a new address map entry to hold the result.
12154 * Fill in the fields from the appropriate source entries.
12155 * We must unlock the source map to do this if we need
12156 * to allocate a map entry.
12157 */
12158 if (new_entry == VM_MAP_ENTRY_NULL) {
12159 version.main_timestamp = src_map->timestamp;
12160 vm_map_unlock(src_map);
12161
12162 new_entry = vm_map_copy_entry_create(copy);
12163
12164 vm_map_lock(src_map);
12165 if ((version.main_timestamp + 1) != src_map->timestamp) {
12166 if (!vm_map_lookup_entry(src_map, src_start,
12167 &tmp_entry)) {
12168 RETURN(KERN_INVALID_ADDRESS);
12169 }
12170 if (!tmp_entry->is_sub_map) {
12171 vm_map_clip_start(src_map, tmp_entry, src_start);
12172 }
12173 continue; /* restart w/ new tmp_entry */
12174 }
12175 }
12176
12177 /*
12178 * Verify that the region can be read.
12179 */
12180 if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12181 !use_maxprot) ||
12182 (src_entry->max_protection & VM_PROT_READ) == 0) {
12183 RETURN(KERN_PROTECTION_FAILURE);
12184 }
12185
12186 /*
12187 * Clip against the endpoints of the entire region.
12188 */
12189
12190 vm_map_clip_end(src_map, src_entry, src_end);
12191
12192 src_size = src_entry->vme_end - src_start;
12193 src_object = VME_OBJECT(src_entry);
12194 src_offset = VME_OFFSET(src_entry);
12195 was_wired = (src_entry->wired_count != 0);
12196
12197 vm_map_entry_copy(src_map, new_entry, src_entry);
12198 if (new_entry->is_sub_map) {
12199 /* clr address space specifics */
12200 new_entry->use_pmap = FALSE;
12201 } else {
12202 /*
12203 * We're dealing with a copy-on-write operation,
12204 * so the resulting mapping should not inherit the
12205 * original mapping's accounting settings.
12206 * "iokit_acct" should have been cleared in
12207 * vm_map_entry_copy().
12208 * "use_pmap" should be reset to its default (TRUE)
12209 * so that the new mapping gets accounted for in
12210 * the task's memory footprint.
12211 */
12212 assert(!new_entry->iokit_acct);
12213 new_entry->use_pmap = TRUE;
12214 }
12215
12216 /*
12217 * Attempt non-blocking copy-on-write optimizations.
12218 */
12219
12220 /*
12221 * If we are destroying the source, and the object
12222 * is internal, we could move the object reference
12223 * from the source to the copy. The copy is
12224 * copy-on-write only if the source is.
12225 * We make another reference to the object, because
12226 * destroying the source entry will deallocate it.
12227 *
12228 * This memory transfer has to be atomic, (to prevent
12229 * the VM object from being shared or copied while
12230 * it's being moved here), so we could only do this
12231 * if we won't have to unlock the VM map until the
12232 * original mapping has been fully removed.
12233 */
12234
12235 RestartCopy:
12236 if ((src_object == VM_OBJECT_NULL ||
12237 (!was_wired && !map_share && !tmp_entry->is_shared
12238 && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12239 vm_object_copy_quickly(
12240 VME_OBJECT(new_entry),
12241 src_offset,
12242 src_size,
12243 &src_needs_copy,
12244 &new_entry_needs_copy)) {
12245 new_entry->needs_copy = new_entry_needs_copy;
12246
12247 /*
12248 * Handle copy-on-write obligations
12249 */
12250
12251 if (src_needs_copy && !tmp_entry->needs_copy) {
12252 vm_prot_t prot;
12253
12254 prot = src_entry->protection & ~VM_PROT_WRITE;
12255
12256 if (override_nx(src_map, VME_ALIAS(src_entry))
12257 && prot) {
12258 prot |= VM_PROT_EXECUTE;
12259 }
12260
12261 vm_object_pmap_protect(
12262 src_object,
12263 src_offset,
12264 src_size,
12265 (src_entry->is_shared ?
12266 PMAP_NULL
12267 : src_map->pmap),
12268 VM_MAP_PAGE_SIZE(src_map),
12269 src_entry->vme_start,
12270 prot);
12271
12272 assert(tmp_entry->wired_count == 0);
12273 tmp_entry->needs_copy = TRUE;
12274 }
12275
12276 /*
12277 * The map has never been unlocked, so it's safe
12278 * to move to the next entry rather than doing
12279 * another lookup.
12280 */
12281
12282 goto CopySuccessful;
12283 }
12284
12285 entry_was_shared = tmp_entry->is_shared;
12286
12287 /*
12288 * Take an object reference, so that we may
12289 * release the map lock(s).
12290 */
12291
12292 assert(src_object != VM_OBJECT_NULL);
12293 vm_object_reference(src_object);
12294
12295 /*
12296 * Record the timestamp for later verification.
12297 * Unlock the map.
12298 */
12299
12300 version.main_timestamp = src_map->timestamp;
12301 vm_map_unlock(src_map); /* Increments timestamp once! */
12302 saved_src_entry = src_entry;
12303 tmp_entry = VM_MAP_ENTRY_NULL;
12304 src_entry = VM_MAP_ENTRY_NULL;
12305
12306 /*
12307 * Perform the copy
12308 */
12309
12310 if (was_wired ||
12311 (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12312 !(flags & VM_MAP_COPYIN_FORK)) ||
12313 (debug4k_no_cow_copyin &&
12314 VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12315 CopySlowly:
12316 vm_object_lock(src_object);
12317 result = vm_object_copy_slowly(
12318 src_object,
12319 src_offset,
12320 src_size,
12321 THREAD_UNINT,
12322 &new_copy_object);
12323 /* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12324 saved_used_for_jit = new_entry->used_for_jit;
12325 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12326 new_entry->used_for_jit = saved_used_for_jit;
12327 VME_OFFSET_SET(new_entry,
12328 src_offset - vm_object_trunc_page(src_offset));
12329 new_entry->needs_copy = FALSE;
12330 } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12331 (entry_was_shared || map_share)) {
12332 vm_object_t new_object;
12333
12334 vm_object_lock_shared(src_object);
12335 new_object = vm_object_copy_delayed(
12336 src_object,
12337 src_offset,
12338 src_size,
12339 TRUE);
12340 if (new_object == VM_OBJECT_NULL) {
12341 goto CopySlowly;
12342 }
12343
12344 VME_OBJECT_SET(new_entry, new_object, false, 0);
12345 assert(new_entry->wired_count == 0);
12346 new_entry->needs_copy = TRUE;
12347 assert(!new_entry->iokit_acct);
12348 assert(new_object->purgable == VM_PURGABLE_DENY);
12349 assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12350 result = KERN_SUCCESS;
12351 } else {
12352 vm_object_offset_t new_offset;
12353 new_offset = VME_OFFSET(new_entry);
12354 result = vm_object_copy_strategically(src_object,
12355 src_offset,
12356 src_size,
12357 (flags & VM_MAP_COPYIN_FORK),
12358 &new_copy_object,
12359 &new_offset,
12360 &new_entry_needs_copy);
12361 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12362 saved_used_for_jit = new_entry->used_for_jit;
12363 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12364 new_entry->used_for_jit = saved_used_for_jit;
12365 if (new_offset != VME_OFFSET(new_entry)) {
12366 VME_OFFSET_SET(new_entry, new_offset);
12367 }
12368
12369 new_entry->needs_copy = new_entry_needs_copy;
12370 }
12371
12372 if (result == KERN_SUCCESS &&
12373 ((preserve_purgeable &&
12374 src_object->purgable != VM_PURGABLE_DENY) ||
12375 new_entry->used_for_jit)) {
12376 /*
12377 * Purgeable objects should be COPY_NONE, true share;
			 * this should be propagated to the copy.
12379 *
12380 * Also force mappings the pmap specially protects to
12381 * be COPY_NONE; trying to COW these mappings would
12382 * change the effective protections, which could have
12383 * side effects if the pmap layer relies on the
12384 * specified protections.
12385 */
12386
12387 vm_object_t new_object;
12388
12389 new_object = VME_OBJECT(new_entry);
12390 assert(new_object != src_object);
12391 vm_object_lock(new_object);
12392 assert(os_ref_get_count_raw(&new_object->ref_count) == 1);
12393 assert(new_object->shadow == VM_OBJECT_NULL);
12394 assert(new_object->vo_copy == VM_OBJECT_NULL);
12395 assert(new_object->vo_owner == NULL);
12396
12397 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12398
12399 if (preserve_purgeable &&
12400 src_object->purgable != VM_PURGABLE_DENY) {
12401 VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12402
12403 /* start as non-volatile with no owner... */
12404 VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12405 vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12406 /* ... and move to src_object's purgeable state */
12407 if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12408 int state;
12409 state = src_object->purgable;
12410 vm_object_purgable_control(
12411 new_object,
12412 VM_PURGABLE_SET_STATE_FROM_KERNEL,
12413 &state);
12414 }
12415 /* no pmap accounting for purgeable objects */
12416 new_entry->use_pmap = FALSE;
12417 }
12418
12419 vm_object_unlock(new_object);
12420 new_object = VM_OBJECT_NULL;
12421 }
12422
12423 /*
12424 * Throw away the extra reference
12425 */
12426
12427 vm_object_deallocate(src_object);
12428
12429 if (result != KERN_SUCCESS &&
12430 result != KERN_MEMORY_RESTART_COPY) {
12431 vm_map_lock(src_map);
12432 RETURN(result);
12433 }
12434
12435 /*
12436 * Verify that the map has not substantially
12437 * changed while the copy was being made.
12438 */
12439
12440 vm_map_lock(src_map);
12441
12442 if ((version.main_timestamp + 1) == src_map->timestamp) {
12443 /* src_map hasn't changed: src_entry is still valid */
12444 src_entry = saved_src_entry;
12445 goto VerificationSuccessful;
12446 }
12447
12448 /*
12449 * Simple version comparison failed.
12450 *
12451 * Retry the lookup and verify that the
12452 * same object/offset are still present.
12453 *
12454 * [Note: a memory manager that colludes with
12455 * the calling task can detect that we have
12456 * cheated. While the map was unlocked, the
12457 * mapping could have been changed and restored.]
12458 */
12459
12460 if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12461 if (result != KERN_MEMORY_RESTART_COPY) {
12462 vm_object_deallocate(VME_OBJECT(new_entry));
12463 VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12464 /* reset accounting state */
12465 new_entry->iokit_acct = FALSE;
12466 new_entry->use_pmap = TRUE;
12467 }
12468 RETURN(KERN_INVALID_ADDRESS);
12469 }
12470
12471 src_entry = tmp_entry;
12472 vm_map_clip_start(src_map, src_entry, src_start);
12473
12474 if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12475 !use_maxprot) ||
12476 ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12477 goto VerificationFailed;
12478 }
12479
12480 if (src_entry->vme_end < new_entry->vme_end) {
12481 /*
12482 * This entry might have been shortened
12483 * (vm_map_clip_end) or been replaced with
12484 * an entry that ends closer to "src_start"
12485 * than before.
12486 * Adjust "new_entry" accordingly; copying
12487 * less memory would be correct but we also
12488 * redo the copy (see below) if the new entry
12489 * no longer points at the same object/offset.
12490 */
12491 assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12492 VM_MAP_COPY_PAGE_MASK(copy)));
12493 new_entry->vme_end = src_entry->vme_end;
12494 src_size = new_entry->vme_end - src_start;
12495 } else if (src_entry->vme_end > new_entry->vme_end) {
12496 /*
12497 * This entry might have been extended
12498 * (vm_map_entry_simplify() or coalesce)
12499 * or been replaced with an entry that ends farther
12500 * from "src_start" than before.
12501 *
12502 * We've called vm_object_copy_*() only on
12503 * the previous <start:end> range, so we can't
12504 * just extend new_entry. We have to re-do
12505 * the copy based on the new entry as if it was
12506 * pointing at a different object/offset (see
12507 * "Verification failed" below).
12508 */
12509 }
12510
12511 if ((VME_OBJECT(src_entry) != src_object) ||
12512 (VME_OFFSET(src_entry) != src_offset) ||
12513 (src_entry->vme_end > new_entry->vme_end)) {
12514 /*
12515 * Verification failed.
12516 *
12517 * Start over with this top-level entry.
12518 */
12519
12520 VerificationFailed: ;
12521
12522 vm_object_deallocate(VME_OBJECT(new_entry));
12523 tmp_entry = src_entry;
12524 continue;
12525 }
12526
12527 /*
12528 * Verification succeeded.
12529 */
12530
12531 VerificationSuccessful:;
12532
12533 if (result == KERN_MEMORY_RESTART_COPY) {
12534 goto RestartCopy;
12535 }
12536
12537 /*
12538 * Copy succeeded.
12539 */
12540
12541 CopySuccessful: ;
12542
12543 /*
12544 * Link in the new copy entry.
12545 */
12546
12547 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12548 new_entry);
12549
12550 /*
12551 * Determine whether the entire region
12552 * has been copied.
12553 */
12554 src_base = src_start;
12555 src_start = new_entry->vme_end;
12556 new_entry = VM_MAP_ENTRY_NULL;
12557 while ((src_start >= src_end) && (src_end != 0)) {
12558 submap_map_t *ptr;
12559
12560 if (src_map == base_map) {
12561 /* back to the top */
12562 break;
12563 }
12564
12565 ptr = parent_maps;
12566 assert(ptr != NULL);
12567 parent_maps = parent_maps->next;
12568
12569 /* fix up the damage we did in that submap */
12570 vm_map_simplify_range(src_map,
12571 src_base,
12572 src_end);
12573
12574 vm_map_unlock(src_map);
12575 vm_map_deallocate(src_map);
12576 vm_map_lock(ptr->parent_map);
12577 src_map = ptr->parent_map;
12578 src_base = ptr->base_start;
12579 src_start = ptr->base_start + ptr->base_len;
12580 src_end = ptr->base_end;
12581 if (!vm_map_lookup_entry(src_map,
12582 src_start,
12583 &tmp_entry) &&
12584 (src_end > src_start)) {
12585 RETURN(KERN_INVALID_ADDRESS);
12586 }
12587 kfree_type(submap_map_t, ptr);
12588 if (parent_maps == NULL) {
12589 map_share = FALSE;
12590 }
12591 src_entry = tmp_entry->vme_prev;
12592 }
12593
12594 if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12595 (src_start >= src_addr_unaligned + len) &&
12596 (src_addr_unaligned + len != 0)) {
12597 /*
12598 * Stop copying now, even though we haven't reached
12599 * "src_end". We'll adjust the end of the last copy
12600 * entry at the end, if needed.
12601 *
			 * If src_map's alignment is different from the
12603 * system's page-alignment, there could be
12604 * extra non-map-aligned map entries between
12605 * the original (non-rounded) "src_addr_unaligned + len"
12606 * and the rounded "src_end".
12607 * We do not want to copy those map entries since
12608 * they're not part of the copied range.
12609 */
12610 break;
12611 }
12612
12613 if ((src_start >= src_end) && (src_end != 0)) {
12614 break;
12615 }
12616
12617 /*
12618 * Verify that there are no gaps in the region
12619 */
12620
12621 tmp_entry = src_entry->vme_next;
12622 if ((tmp_entry->vme_start != src_start) ||
12623 (tmp_entry == vm_map_to_entry(src_map))) {
12624 RETURN(KERN_INVALID_ADDRESS);
12625 }
12626 }
12627
12628 /*
12629 * If the source should be destroyed, do it now, since the
12630 * copy was successful.
12631 */
12632 if (src_destroy) {
12633 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12634
12635 if (src_map == kernel_map) {
12636 remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12637 }
12638 (void)vm_map_remove_and_unlock(src_map,
12639 vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)),
12640 src_end,
12641 remove_flags,
12642 KMEM_GUARD_NONE);
12643 } else {
12644 /* fix up the damage we did in the base map */
12645 vm_map_simplify_range(
12646 src_map,
12647 vm_map_trunc_page(src_addr_unaligned,
12648 VM_MAP_PAGE_MASK(src_map)),
12649 vm_map_round_page(src_end,
12650 VM_MAP_PAGE_MASK(src_map)));
12651 vm_map_unlock(src_map);
12652 }
12653
12654 tmp_entry = VM_MAP_ENTRY_NULL;
12655
12656 if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12657 VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12658 vm_map_offset_t original_start, original_offset, original_end;
12659
12660 assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12661
12662 /* adjust alignment of first copy_entry's "vme_start" */
12663 tmp_entry = vm_map_copy_first_entry(copy);
12664 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12665 vm_map_offset_t adjustment;
12666
12667 original_start = tmp_entry->vme_start;
12668 original_offset = VME_OFFSET(tmp_entry);
12669
12670 /* map-align the start of the first copy entry... */
12671 adjustment = (tmp_entry->vme_start -
12672 vm_map_trunc_page(
12673 tmp_entry->vme_start,
12674 VM_MAP_PAGE_MASK(src_map)));
12675 tmp_entry->vme_start -= adjustment;
12676 VME_OFFSET_SET(tmp_entry,
12677 VME_OFFSET(tmp_entry) - adjustment);
12678 copy_addr -= adjustment;
12679 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12680 /* ... adjust for mis-aligned start of copy range */
12681 adjustment =
12682 (vm_map_trunc_page(copy->offset,
12683 PAGE_MASK) -
12684 vm_map_trunc_page(copy->offset,
12685 VM_MAP_PAGE_MASK(src_map)));
12686 if (adjustment) {
12687 assert(page_aligned(adjustment));
12688 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12689 tmp_entry->vme_start += adjustment;
12690 VME_OFFSET_SET(tmp_entry,
12691 (VME_OFFSET(tmp_entry) +
12692 adjustment));
12693 copy_addr += adjustment;
12694 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12695 }
12696
12697 /*
12698 * Assert that the adjustments haven't exposed
12699 * more than was originally copied...
12700 */
12701 assert(tmp_entry->vme_start >= original_start);
12702 assert(VME_OFFSET(tmp_entry) >= original_offset);
12703 /*
12704 * ... and that it did not adjust outside of a
12705 * a single 16K page.
12706 */
12707 assert(vm_map_trunc_page(tmp_entry->vme_start,
12708 VM_MAP_PAGE_MASK(src_map)) ==
12709 vm_map_trunc_page(original_start,
12710 VM_MAP_PAGE_MASK(src_map)));
12711 }
12712
12713 /* adjust alignment of last copy_entry's "vme_end" */
12714 tmp_entry = vm_map_copy_last_entry(copy);
12715 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12716 vm_map_offset_t adjustment;
12717
12718 original_end = tmp_entry->vme_end;
12719
12720 /* map-align the end of the last copy entry... */
12721 tmp_entry->vme_end =
12722 vm_map_round_page(tmp_entry->vme_end,
12723 VM_MAP_PAGE_MASK(src_map));
12724 /* ... adjust for mis-aligned end of copy range */
12725 adjustment =
12726 (vm_map_round_page((copy->offset +
12727 copy->size),
12728 VM_MAP_PAGE_MASK(src_map)) -
12729 vm_map_round_page((copy->offset +
12730 copy->size),
12731 PAGE_MASK));
12732 if (adjustment) {
12733 assert(page_aligned(adjustment));
12734 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12735 tmp_entry->vme_end -= adjustment;
12736 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12737 }
12738
12739 /*
12740 * Assert that the adjustments haven't exposed
12741 * more than was originally copied...
12742 */
12743 assert(tmp_entry->vme_end <= original_end);
12744 /*
12745 * ... and that it did not adjust outside of a
12746 * a single 16K page.
12747 */
12748 assert(vm_map_round_page(tmp_entry->vme_end,
12749 VM_MAP_PAGE_MASK(src_map)) ==
12750 vm_map_round_page(original_end,
12751 VM_MAP_PAGE_MASK(src_map)));
12752 }
12753 }
12754
12755 /* Fix-up start and end points in copy. This is necessary */
12756 /* when the various entries in the copy object were picked */
12757 /* up from different sub-maps */
12758
12759 tmp_entry = vm_map_copy_first_entry(copy);
12760 copy_size = 0; /* compute actual size */
12761 while (tmp_entry != vm_map_copy_to_entry(copy)) {
12762 assert(VM_MAP_PAGE_ALIGNED(
12763 copy_addr + (tmp_entry->vme_end -
12764 tmp_entry->vme_start),
12765 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12766 assert(VM_MAP_PAGE_ALIGNED(
12767 copy_addr,
12768 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12769
12770 /*
12771 * The copy_entries will be injected directly into the
12772 * destination map and might not be "map aligned" there...
12773 */
12774 tmp_entry->map_aligned = FALSE;
12775
12776 tmp_entry->vme_end = copy_addr +
12777 (tmp_entry->vme_end - tmp_entry->vme_start);
12778 tmp_entry->vme_start = copy_addr;
12779 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12780 copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12781 copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12782 tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12783 }
12784
12785 if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12786 copy_size < copy->size) {
12787 /*
12788 * The actual size of the VM map copy is smaller than what
12789 * was requested by the caller. This must be because some
12790 * PAGE_SIZE-sized pages are missing at the end of the last
12791 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12792 * The caller might not have been aware of those missing
12793 * pages and might not want to be aware of it, which is
12794 * fine as long as they don't try to access (and crash on)
12795 * those missing pages.
12796 * Let's adjust the size of the "copy", to avoid failing
12797 * in vm_map_copyout() or vm_map_copy_overwrite().
12798 */
12799 assert(vm_map_round_page(copy_size,
12800 VM_MAP_PAGE_MASK(src_map)) ==
12801 vm_map_round_page(copy->size,
12802 VM_MAP_PAGE_MASK(src_map)));
12803 copy->size = copy_size;
12804 }
12805
12806 *copy_result = copy;
12807 return KERN_SUCCESS;
12808
12809 #undef RETURN
12810 }
12811
12812 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)12813 vm_map_copy_extract(
12814 vm_map_t src_map,
12815 vm_map_address_t src_addr,
12816 vm_map_size_t len,
12817 boolean_t do_copy,
12818 vm_map_copy_t *copy_result, /* OUT */
12819 vm_prot_t *cur_prot, /* IN/OUT */
12820 vm_prot_t *max_prot, /* IN/OUT */
12821 vm_inherit_t inheritance,
12822 vm_map_kernel_flags_t vmk_flags)
12823 {
12824 vm_map_copy_t copy;
12825 kern_return_t kr;
12826 vm_prot_t required_cur_prot, required_max_prot;
12827
12828 /*
12829 * Check for copies of zero bytes.
12830 */
12831
12832 if (len == 0) {
12833 *copy_result = VM_MAP_COPY_NULL;
12834 return KERN_SUCCESS;
12835 }
12836
12837 /*
12838 * Check that the end address doesn't overflow
12839 */
12840 if (src_addr + len < src_addr) {
12841 return KERN_INVALID_ADDRESS;
12842 }
12843 if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12844 return KERN_INVALID_ADDRESS;
12845 }
12846
12847 if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12848 DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12849 }
12850
12851 required_cur_prot = *cur_prot;
12852 required_max_prot = *max_prot;
12853
12854 /*
12855 * Allocate a header element for the list.
12856 *
12857 * Use the start and end in the header to
12858 * remember the endpoints prior to rounding.
12859 */
12860
12861 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12862 copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12863 copy->offset = 0;
12864 copy->size = len;
12865
12866 kr = vm_map_remap_extract(src_map,
12867 src_addr,
12868 len,
12869 do_copy, /* copy */
12870 copy,
12871 cur_prot, /* IN/OUT */
12872 max_prot, /* IN/OUT */
12873 inheritance,
12874 vmk_flags);
12875 if (kr != KERN_SUCCESS) {
12876 vm_map_copy_discard(copy);
12877 if ((kr == KERN_INVALID_ADDRESS ||
12878 kr == KERN_INVALID_ARGUMENT) &&
12879 src_map->terminated) {
12880 /* tell the caller that this address space is gone */
12881 kr = KERN_TERMINATED;
12882 }
12883 return kr;
12884 }
12885 if (required_cur_prot != VM_PROT_NONE) {
12886 assert((*cur_prot & required_cur_prot) == required_cur_prot);
12887 assert((*max_prot & required_max_prot) == required_max_prot);
12888 }
12889
12890 *copy_result = copy;
12891 return KERN_SUCCESS;
12892 }
12893
/*
 * vm_map_fork_share:
 *
 * Handle a VM_INHERIT_SHARE entry during fork: clone "old_entry"
 * from "old_map" into a new entry linked at the end of "new_map",
 * with both entries sharing the same backing object (or submap),
 * and mark both entries is_shared.
 *
 * Called from vm_map_fork() with both maps locked by the caller.
 * May create a shadow object and downgrade write access on the
 * parent's mapping to preserve copy-on-write semantics.
 */
static void
vm_map_fork_share(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_object_t     object;
	vm_map_entry_t  new_entry;

	/*
	 * New sharing code.  New map entry
	 * references original object.  Internal
	 * objects use asynchronous copy algorithm for
	 * future copies.  First make sure we have
	 * the right object.  If we need a shadow,
	 * or someone else already has one, then
	 * make a new shadow and share it.
	 */

	/* "object" is only meaningful (and only read) for non-submap entries */
	if (!old_entry->is_sub_map) {
		object = VME_OBJECT(old_entry);
	}

	if (old_entry->is_sub_map) {
		assert(old_entry->wired_count == 0);
#ifndef NO_NESTED_PMAP
#if !PMAP_FORK_NEST
		/* nest the submap's pmap into the child's pmap too */
		if (old_entry->use_pmap) {
			kern_return_t   result;

			result = pmap_nest(new_map->pmap,
			    (VME_SUBMAP(old_entry))->pmap,
			    (addr64_t)old_entry->vme_start,
			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
			if (result) {
				panic("vm_map_fork_share: pmap_nest failed!");
			}
		}
#endif /* !PMAP_FORK_NEST */
#endif /* NO_NESTED_PMAP */
	} else if (object == VM_OBJECT_NULL) {
		/* no backing object yet: create one covering the whole entry */
		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
		    old_entry->vme_start));
		VME_OFFSET_SET(old_entry, 0);
		VME_OBJECT_SET(old_entry, object, false, 0);
		old_entry->use_pmap = TRUE;
//		assert(!old_entry->needs_copy);
	} else if (object->copy_strategy !=
	    MEMORY_OBJECT_COPY_SYMMETRIC) {
		/*
		 * We are already using an asymmetric
		 * copy, and therefore we already have
		 * the right object.
		 */

		assert(!old_entry->needs_copy);
	} else if (old_entry->needs_copy ||     /* case 1 */
	    object->shadowed ||                 /* case 2 */
	    (!object->true_share &&             /* case 3 */
	    !old_entry->is_shared &&
	    (object->vo_size >
	    (vm_map_size_t)(old_entry->vme_end -
	    old_entry->vme_start)))) {
		bool is_writable;

		/*
		 * We need to create a shadow.
		 * There are three cases here.
		 * In the first case, we need to
		 * complete a deferred symmetrical
		 * copy that we participated in.
		 * In the second and third cases,
		 * we need to create the shadow so
		 * that changes that we make to the
		 * object do not interfere with
		 * any symmetrical copies which
		 * have occured (case 2) or which
		 * might occur (case 3).
		 *
		 * The first case is when we had
		 * deferred shadow object creation
		 * via the entry->needs_copy mechanism.
		 * This mechanism only works when
		 * only one entry points to the source
		 * object, and we are about to create
		 * a second entry pointing to the
		 * same object. The problem is that
		 * there is no way of mapping from
		 * an object to the entries pointing
		 * to it. (Deferred shadow creation
		 * works with one entry because occurs
		 * at fault time, and we walk from the
		 * entry to the object when handling
		 * the fault.)
		 *
		 * The second case is when the object
		 * to be shared has already been copied
		 * with a symmetric copy, but we point
		 * directly to the object without
		 * needs_copy set in our entry. (This
		 * can happen because different ranges
		 * of an object can be pointed to by
		 * different entries. In particular,
		 * a single entry pointing to an object
		 * can be split by a call to vm_inherit,
		 * which, combined with task_create, can
		 * result in the different entries
		 * having different needs_copy values.)
		 * The shadowed flag in the object allows
		 * us to detect this case. The problem
		 * with this case is that if this object
		 * has or will have shadows, then we
		 * must not perform an asymmetric copy
		 * of this object, since such a copy
		 * allows the object to be changed, which
		 * will break the previous symmetrical
		 * copies (which rely upon the object
		 * not changing). In a sense, the shadowed
		 * flag says "don't change this object".
		 * We fix this by creating a shadow
		 * object for this object, and sharing
		 * that. This works because we are free
		 * to change the shadow object (and thus
		 * to use an asymmetric copy strategy);
		 * this is also semantically correct,
		 * since this object is temporary, and
		 * therefore a copy of the object is
		 * as good as the object itself. (This
		 * is not true for permanent objects,
		 * since the pager needs to see changes,
		 * which won't happen if the changes
		 * are made to a copy.)
		 *
		 * The third case is when the object
		 * to be shared has parts sticking
		 * outside of the entry we're working
		 * with, and thus may in the future
		 * be subject to a symmetrical copy.
		 * (This is a preemptive version of
		 * case 2.)
		 */
		VME_OBJECT_SHADOW(old_entry,
		    (vm_map_size_t) (old_entry->vme_end -
		    old_entry->vme_start),
		    vm_map_always_shadow(old_map));

		/*
		 * If we're making a shadow for other than
		 * copy on write reasons, then we have
		 * to remove write permission.
		 */

		is_writable = false;
		if (old_entry->protection & VM_PROT_WRITE) {
			is_writable = true;
#if __arm64e__
		/* entries used for TPRO are treated as writable here */
		} else if (old_entry->used_for_tpro) {
			is_writable = true;
#endif /* __arm64e__ */
		}
		if (!old_entry->needs_copy && is_writable) {
			vm_prot_t prot;

			/* this path cannot handle pmap-enforced protection policies */
			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
				    __FUNCTION__, old_map, old_map->pmap,
				    old_entry,
				    (uint64_t)old_entry->vme_start,
				    (uint64_t)old_entry->vme_end,
				    old_entry->protection);
			}

			prot = old_entry->protection & ~VM_PROT_WRITE;

			if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
				panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
				    __FUNCTION__, old_map, old_map->pmap,
				    old_entry,
				    (uint64_t)old_entry->vme_start,
				    (uint64_t)old_entry->vme_end,
				    prot);
			}

			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}


			if (old_map->mapped_in_other_pmaps) {
				/* map is visible in other pmaps: protect through the object */
				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					PMAP_NULL,
					PAGE_SIZE,
					old_entry->vme_start,
					prot);
			} else {
				pmap_protect(old_map->pmap,
				    old_entry->vme_start,
				    old_entry->vme_end,
				    prot);
			}
		}

		old_entry->needs_copy = FALSE;
		object = VME_OBJECT(old_entry);
	}


	/*
	 * If object was using a symmetric copy strategy,
	 * change its copy strategy to the default
	 * asymmetric copy strategy, which is copy_delay
	 * in the non-norma case and copy_call in the
	 * norma case. Bump the reference count for the
	 * new entry.
	 */

	if (old_entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(old_entry));
	} else {
		vm_object_lock(object);
		vm_object_reference_locked(object);
		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
		vm_object_unlock(object);
	}

	/*
	 * Clone the entry, using object ref from above.
	 * Mark both entries as shared.
	 */

	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
	vm_map_entry_copy(old_map, new_entry, old_entry);
	old_entry->is_shared = TRUE;
	new_entry->is_shared = TRUE;

	/*
	 * We're dealing with a shared mapping, so the resulting mapping
	 * should inherit some of the original mapping's accounting settings.
	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
	 * "use_pmap" should stay the same as before (if it hasn't been reset
	 * to TRUE when we cleared "iokit_acct").
	 */
	assert(!new_entry->iokit_acct);

	/*
	 * If old entry's inheritence is VM_INHERIT_NONE,
	 * the new entry is for corpse fork, remove the
	 * write permission from the new entry.
	 */
	if (old_entry->inheritance == VM_INHERIT_NONE) {
		new_entry->protection &= ~VM_PROT_WRITE;
		new_entry->max_protection &= ~VM_PROT_WRITE;
	}

	/*
	 * Insert the entry into the new map -- we
	 * know we're inserting at the end of the new
	 * map.
	 */

	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);

	/*
	 * Update the physical map
	 */

	if (old_entry->is_sub_map) {
		/* Bill Angell pmap support goes here */
	} else {
		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
		    old_entry->vme_end - old_entry->vme_start,
		    old_entry->vme_start);
	}
}
13175
/*
 * vm_map_fork_copy:
 *
 * Handle a VM_INHERIT_COPY entry during fork when the quick
 * copy path failed: copy the entry's range out of "old_map"
 * (via vm_map_copyin_internal with USE_MAXPROT) and insert the
 * resulting copy entries into "new_map".
 *
 * Called and returns with "old_map" locked, but the lock is
 * dropped and reacquired around the copyin, so the map may have
 * changed in between.
 *
 * On return, *old_entry_p is advanced to the entry where the
 * caller's traversal of old_map should resume.
 *
 * Returns TRUE if the range was copied, FALSE if the copyin
 * failed (in which case the range is skipped).
 */
static boolean_t
vm_map_fork_copy(
	vm_map_t        old_map,
	vm_map_entry_t  *old_entry_p,
	vm_map_t        new_map,
	int             vm_map_copyin_flags)
{
	vm_map_entry_t old_entry = *old_entry_p;
	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
	vm_map_offset_t start = old_entry->vme_start;
	vm_map_copy_t copy;
	vm_map_entry_t last = vm_map_last_entry(new_map);

	vm_map_unlock(old_map);
	/*
	 * Use maxprot version of copyin because we
	 * care about whether this memory can ever
	 * be accessed, not just whether it's accessible
	 * right now.
	 */
	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
	if (vm_map_copyin_internal(old_map, start, entry_size,
	    vm_map_copyin_flags, &copy)
	    != KERN_SUCCESS) {
		/*
		 * The map might have changed while it
		 * was unlocked, check it again.  Skip
		 * any blank space or permanently
		 * unreadable region.
		 */
		vm_map_lock(old_map);
		if (!vm_map_lookup_entry(old_map, start, &last) ||
		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
			last = last->vme_next;
		}
		*old_entry_p = last;

		/*
		 * XXX	For some error returns, want to
		 * XXX	skip to the next element.  Note
		 *	that INVALID_ADDRESS and
		 *	PROTECTION_FAILURE are handled above.
		 */

		return FALSE;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 * Insert the copy into the new map
	 */
	vm_map_copy_insert(new_map, last, copy);

	/*
	 * Pick up the traversal at the end of
	 * the copied region.
	 */

	vm_map_lock(old_map);
	start += entry_size;
	if (!vm_map_lookup_entry(old_map, start, &last)) {
		last = last->vme_next;
	} else {
		if (last->vme_start == start) {
			/*
			 * No need to clip here and we don't
			 * want to cause any unnecessary
			 * unnesting...
			 */
		} else {
			vm_map_clip_start(old_map, last, start);
		}
	}
	*old_entry_p = last;

	return TRUE;
}
13258
13259 #if PMAP_FORK_NEST
13260 #define PMAP_FORK_NEST_DEBUG 0
13261 static inline void
vm_map_fork_unnest(pmap_t new_pmap,vm_map_offset_t pre_nested_start,vm_map_offset_t pre_nested_end,vm_map_offset_t start,vm_map_offset_t end)13262 vm_map_fork_unnest(
13263 pmap_t new_pmap,
13264 vm_map_offset_t pre_nested_start,
13265 vm_map_offset_t pre_nested_end,
13266 vm_map_offset_t start,
13267 vm_map_offset_t end)
13268 {
13269 kern_return_t kr;
13270 vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13271
13272 assertf(pre_nested_start <= pre_nested_end,
13273 "pre_nested start 0x%llx end 0x%llx",
13274 (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13275 assertf(start <= end,
13276 "start 0x%llx end 0x%llx",
13277 (uint64_t) start, (uint64_t)end);
13278
13279 if (pre_nested_start == pre_nested_end) {
13280 /* nothing was pre-nested: done */
13281 return;
13282 }
13283 if (end <= pre_nested_start) {
13284 /* fully before pre-nested range: done */
13285 return;
13286 }
13287 if (start >= pre_nested_end) {
13288 /* fully after pre-nested range: done */
13289 return;
13290 }
13291 /* ignore parts of range outside of pre_nested range */
13292 if (start < pre_nested_start) {
13293 start = pre_nested_start;
13294 }
13295 if (end > pre_nested_end) {
13296 end = pre_nested_end;
13297 }
13298 nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13299 start_unnest = start & ~nesting_mask;
13300 end_unnest = (end + nesting_mask) & ~nesting_mask;
13301 kr = pmap_unnest(new_pmap,
13302 (addr64_t)start_unnest,
13303 (uint64_t)(end_unnest - start_unnest));
13304 #if PMAP_FORK_NEST_DEBUG
13305 printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13306 #endif /* PMAP_FORK_NEST_DEBUG */
13307 assertf(kr == KERN_SUCCESS,
13308 "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13309 (uint64_t)start, (uint64_t)end, new_pmap,
13310 (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13311 kr);
13312 }
13313 #endif /* PMAP_FORK_NEST */
13314
13315 void
vm_map_inherit_limits(vm_map_t new_map,const struct _vm_map * old_map)13316 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13317 {
13318 new_map->size_limit = old_map->size_limit;
13319 new_map->data_limit = old_map->data_limit;
13320 new_map->user_wire_limit = old_map->user_wire_limit;
13321 new_map->reserved_regions = old_map->reserved_regions;
13322 }
13323
13324 /*
13325 * vm_map_fork:
13326 *
13327 * Create and return a new map based on the old
13328 * map, according to the inheritance values on the
13329 * regions in that map and the options.
13330 *
13331 * The source map must not be locked.
13332 */
13333 vm_map_t
vm_map_fork(ledger_t ledger,vm_map_t old_map,int options)13334 vm_map_fork(
13335 ledger_t ledger,
13336 vm_map_t old_map,
13337 int options)
13338 {
13339 pmap_t new_pmap;
13340 vm_map_t new_map;
13341 vm_map_entry_t old_entry;
13342 vm_map_size_t new_size = 0, entry_size;
13343 vm_map_entry_t new_entry;
13344 boolean_t src_needs_copy;
13345 boolean_t new_entry_needs_copy;
13346 boolean_t pmap_is64bit;
13347 int vm_map_copyin_flags;
13348 vm_inherit_t old_entry_inheritance;
13349 int map_create_options;
13350 kern_return_t footprint_collect_kr;
13351
13352 if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13353 VM_MAP_FORK_PRESERVE_PURGEABLE |
13354 VM_MAP_FORK_CORPSE_FOOTPRINT |
13355 VM_MAP_FORK_SHARE_IF_OWNED)) {
13356 /* unsupported option */
13357 return VM_MAP_NULL;
13358 }
13359
13360 pmap_is64bit =
13361 #if defined(__i386__) || defined(__x86_64__)
13362 old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13363 #elif defined(__arm64__)
13364 old_map->pmap->is_64bit;
13365 #else
13366 #error Unknown architecture.
13367 #endif
13368
13369 unsigned int pmap_flags = 0;
13370 pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13371 #if defined(HAS_APPLE_PAC)
13372 pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13373 #endif
13374 #if CONFIG_ROSETTA
13375 pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13376 #endif
13377 #if PMAP_CREATE_FORCE_4K_PAGES
13378 if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13379 PAGE_SIZE != FOURK_PAGE_SIZE) {
13380 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13381 }
13382 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13383 new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13384 if (new_pmap == NULL) {
13385 return VM_MAP_NULL;
13386 }
13387
13388 vm_map_reference(old_map);
13389 vm_map_lock(old_map);
13390
13391 map_create_options = 0;
13392 if (old_map->hdr.entries_pageable) {
13393 map_create_options |= VM_MAP_CREATE_PAGEABLE;
13394 }
13395 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13396 map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13397 footprint_collect_kr = KERN_SUCCESS;
13398 }
13399 new_map = vm_map_create_options(new_pmap,
13400 old_map->min_offset,
13401 old_map->max_offset,
13402 map_create_options);
13403
13404 /* inherit cs_enforcement */
13405 vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13406
13407 vm_map_lock(new_map);
13408 vm_commit_pagezero_status(new_map);
13409 /* inherit the parent map's page size */
13410 vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13411
13412 /* inherit the parent rlimits */
13413 vm_map_inherit_limits(new_map, old_map);
13414
13415 #if CONFIG_MAP_RANGES
13416 /* inherit the parent map's VM ranges */
13417 vm_map_range_fork(new_map, old_map);
13418 #endif
13419
13420 #if CODE_SIGNING_MONITOR
13421 /* Prepare the monitor for the fork */
13422 csm_fork_prepare(old_map->pmap, new_pmap);
13423 #endif
13424
13425 #if PMAP_FORK_NEST
13426 /*
13427 * Pre-nest the shared region's pmap.
13428 */
13429 vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13430 pmap_fork_nest(old_map->pmap, new_pmap,
13431 &pre_nested_start, &pre_nested_end);
13432 #if PMAP_FORK_NEST_DEBUG
13433 printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13434 #endif /* PMAP_FORK_NEST_DEBUG */
13435 #endif /* PMAP_FORK_NEST */
13436
13437 for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13438 /*
13439 * Abort any corpse collection if the system is shutting down.
13440 */
13441 if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13442 get_system_inshutdown()) {
13443 #if PMAP_FORK_NEST
13444 new_entry = vm_map_last_entry(new_map);
13445 if (new_entry == vm_map_to_entry(new_map)) {
13446 /* unnest all that was pre-nested */
13447 vm_map_fork_unnest(new_pmap,
13448 pre_nested_start, pre_nested_end,
13449 vm_map_min(new_map), vm_map_max(new_map));
13450 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13451 /* unnest hole at the end, if pre-nested */
13452 vm_map_fork_unnest(new_pmap,
13453 pre_nested_start, pre_nested_end,
13454 new_entry->vme_end, vm_map_max(new_map));
13455 }
13456 #endif /* PMAP_FORK_NEST */
13457 vm_map_corpse_footprint_collect_done(new_map);
13458 vm_map_unlock(new_map);
13459 vm_map_unlock(old_map);
13460 vm_map_deallocate(new_map);
13461 vm_map_deallocate(old_map);
13462 printf("Aborting corpse map due to system shutdown\n");
13463 return VM_MAP_NULL;
13464 }
13465
13466 entry_size = old_entry->vme_end - old_entry->vme_start;
13467
13468 #if PMAP_FORK_NEST
13469 /*
13470 * Undo any unnecessary pre-nesting.
13471 */
13472 vm_map_offset_t prev_end;
13473 if (old_entry == vm_map_first_entry(old_map)) {
13474 prev_end = vm_map_min(old_map);
13475 } else {
13476 prev_end = old_entry->vme_prev->vme_end;
13477 }
13478 if (prev_end < old_entry->vme_start) {
13479 /* unnest hole before this entry, if pre-nested */
13480 vm_map_fork_unnest(new_pmap,
13481 pre_nested_start, pre_nested_end,
13482 prev_end, old_entry->vme_start);
13483 }
13484 if (old_entry->is_sub_map && old_entry->use_pmap) {
13485 /* keep this entry nested in the child */
13486 #if PMAP_FORK_NEST_DEBUG
13487 printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13488 #endif /* PMAP_FORK_NEST_DEBUG */
13489 } else {
13490 /* undo nesting for this entry, if pre-nested */
13491 vm_map_fork_unnest(new_pmap,
13492 pre_nested_start, pre_nested_end,
13493 old_entry->vme_start, old_entry->vme_end);
13494 }
13495 #endif /* PMAP_FORK_NEST */
13496
13497 old_entry_inheritance = old_entry->inheritance;
13498 /*
13499 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
13500 * share VM_INHERIT_NONE entries that are not backed by a
13501 * device pager.
13502 */
13503 if (old_entry_inheritance == VM_INHERIT_NONE &&
13504 (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13505 (old_entry->protection & VM_PROT_READ) &&
13506 !(!old_entry->is_sub_map &&
13507 VME_OBJECT(old_entry) != NULL &&
13508 VME_OBJECT(old_entry)->pager != NULL &&
13509 is_device_pager_ops(
13510 VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13511 old_entry_inheritance = VM_INHERIT_SHARE;
13512 }
13513 if (old_entry_inheritance == VM_INHERIT_COPY &&
13514 (options & VM_MAP_FORK_SHARE_IF_OWNED) &&
13515 !old_entry->is_sub_map &&
13516 VME_OBJECT(old_entry) != VM_OBJECT_NULL) {
13517 vm_object_t object;
13518 task_t owner;
13519 object = VME_OBJECT(old_entry);
13520 owner = VM_OBJECT_OWNER(object);
13521 if (owner != TASK_NULL &&
13522 owner->map == old_map) {
13523 /*
13524 * This mapping points at a VM object owned
13525 * by the task being forked.
13526 * Some tools reporting memory accounting
13527 * info rely on the object ID, so share this
13528 * mapping instead of copying, to make the
13529 * corpse look exactly like the original
13530 * task in that respect.
13531 */
13532 assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
13533 old_entry_inheritance = VM_INHERIT_SHARE;
13534 }
13535 }
13536
13537 if (old_entry_inheritance != VM_INHERIT_NONE &&
13538 (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13539 footprint_collect_kr == KERN_SUCCESS) {
13540 /*
13541 * The corpse won't have old_map->pmap to query
13542 * footprint information, so collect that data now
13543 * and store it in new_map->vmmap_corpse_footprint
13544 * for later autopsy.
13545 */
13546 footprint_collect_kr =
13547 vm_map_corpse_footprint_collect(old_map,
13548 old_entry,
13549 new_map);
13550 }
13551
13552 switch (old_entry_inheritance) {
13553 case VM_INHERIT_NONE:
13554 break;
13555
13556 case VM_INHERIT_SHARE:
13557 vm_map_fork_share(old_map, old_entry, new_map);
13558 new_size += entry_size;
13559 break;
13560
13561 case VM_INHERIT_COPY:
13562
13563 /*
13564 * Inline the copy_quickly case;
13565 * upon failure, fall back on call
13566 * to vm_map_fork_copy.
13567 */
13568
13569 if (old_entry->is_sub_map) {
13570 break;
13571 }
13572 if ((old_entry->wired_count != 0) ||
13573 ((VME_OBJECT(old_entry) != NULL) &&
13574 (VME_OBJECT(old_entry)->true_share))) {
13575 goto slow_vm_map_fork_copy;
13576 }
13577
13578 new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13579 vm_map_entry_copy(old_map, new_entry, old_entry);
13580 if (old_entry->vme_permanent) {
13581 /* inherit "permanent" on fork() */
13582 new_entry->vme_permanent = TRUE;
13583 }
13584
13585 if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13586 new_map->jit_entry_exists = TRUE;
13587 }
13588
13589 if (new_entry->is_sub_map) {
13590 /* clear address space specifics */
13591 new_entry->use_pmap = FALSE;
13592 } else {
13593 /*
13594 * We're dealing with a copy-on-write operation,
13595 * so the resulting mapping should not inherit
13596 * the original mapping's accounting settings.
13597 * "iokit_acct" should have been cleared in
13598 * vm_map_entry_copy().
13599 * "use_pmap" should be reset to its default
13600 * (TRUE) so that the new mapping gets
13601 * accounted for in the task's memory footprint.
13602 */
13603 assert(!new_entry->iokit_acct);
13604 new_entry->use_pmap = TRUE;
13605 }
13606
13607 if (!vm_object_copy_quickly(
13608 VME_OBJECT(new_entry),
13609 VME_OFFSET(old_entry),
13610 (old_entry->vme_end -
13611 old_entry->vme_start),
13612 &src_needs_copy,
13613 &new_entry_needs_copy)) {
13614 vm_map_entry_dispose(new_entry);
13615 goto slow_vm_map_fork_copy;
13616 }
13617
13618 /*
13619 * Handle copy-on-write obligations
13620 */
13621
13622 if (src_needs_copy && !old_entry->needs_copy) {
13623 vm_prot_t prot;
13624
13625 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13626 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13627 __FUNCTION__,
13628 old_map, old_map->pmap, old_entry,
13629 (uint64_t)old_entry->vme_start,
13630 (uint64_t)old_entry->vme_end,
13631 old_entry->protection);
13632 }
13633
13634 prot = old_entry->protection & ~VM_PROT_WRITE;
13635
13636 if (override_nx(old_map, VME_ALIAS(old_entry))
13637 && prot) {
13638 prot |= VM_PROT_EXECUTE;
13639 }
13640
13641 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13642 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13643 __FUNCTION__,
13644 old_map, old_map->pmap, old_entry,
13645 (uint64_t)old_entry->vme_start,
13646 (uint64_t)old_entry->vme_end,
13647 prot);
13648 }
13649
13650 vm_object_pmap_protect(
13651 VME_OBJECT(old_entry),
13652 VME_OFFSET(old_entry),
13653 (old_entry->vme_end -
13654 old_entry->vme_start),
13655 ((old_entry->is_shared
13656 || old_map->mapped_in_other_pmaps)
13657 ? PMAP_NULL :
13658 old_map->pmap),
13659 VM_MAP_PAGE_SIZE(old_map),
13660 old_entry->vme_start,
13661 prot);
13662
13663 assert(old_entry->wired_count == 0);
13664 old_entry->needs_copy = TRUE;
13665 }
13666 new_entry->needs_copy = new_entry_needs_copy;
13667
13668 /*
13669 * Insert the entry at the end
13670 * of the map.
13671 */
13672
13673 vm_map_store_entry_link(new_map,
13674 vm_map_last_entry(new_map),
13675 new_entry,
13676 VM_MAP_KERNEL_FLAGS_NONE);
13677 new_size += entry_size;
13678 break;
13679
13680 slow_vm_map_fork_copy:
13681 vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13682 if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13683 vm_map_copyin_flags |=
13684 VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13685 }
13686 if (vm_map_fork_copy(old_map,
13687 &old_entry,
13688 new_map,
13689 vm_map_copyin_flags)) {
13690 new_size += entry_size;
13691 }
13692 continue;
13693 }
13694 old_entry = old_entry->vme_next;
13695 }
13696
13697 #if PMAP_FORK_NEST
13698 new_entry = vm_map_last_entry(new_map);
13699 if (new_entry == vm_map_to_entry(new_map)) {
13700 /* unnest all that was pre-nested */
13701 vm_map_fork_unnest(new_pmap,
13702 pre_nested_start, pre_nested_end,
13703 vm_map_min(new_map), vm_map_max(new_map));
13704 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13705 /* unnest hole at the end, if pre-nested */
13706 vm_map_fork_unnest(new_pmap,
13707 pre_nested_start, pre_nested_end,
13708 new_entry->vme_end, vm_map_max(new_map));
13709 }
13710 #endif /* PMAP_FORK_NEST */
13711
13712 #if defined(__arm64__)
13713 pmap_insert_commpage(new_map->pmap);
13714 #endif /* __arm64__ */
13715
13716 new_map->size = new_size;
13717
13718 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13719 vm_map_corpse_footprint_collect_done(new_map);
13720 }
13721
13722 /* Propagate JIT entitlement for the pmap layer. */
13723 if (pmap_get_jit_entitled(old_map->pmap)) {
13724 /* Tell the pmap that it supports JIT. */
13725 pmap_set_jit_entitled(new_map->pmap);
13726 }
13727
13728 /* Propagate TPRO settings for the pmap layer */
13729 if (pmap_get_tpro(old_map->pmap)) {
13730 /* Tell the pmap that it supports TPRO */
13731 pmap_set_tpro(new_map->pmap);
13732 }
13733
13734
13735 vm_map_unlock(new_map);
13736 vm_map_unlock(old_map);
13737 vm_map_deallocate(old_map);
13738
13739 return new_map;
13740 }
13741
13742 /*
13743 * vm_map_exec:
13744 *
13745 * Setup the "new_map" with the proper execution environment according
13746 * to the type of executable (platform, 64bit, chroot environment).
13747 * Map the comm page and shared region, etc...
13748 */
13749 kern_return_t
vm_map_exec(vm_map_t new_map,task_t task,boolean_t is64bit,void * fsroot,cpu_type_t cpu,cpu_subtype_t cpu_subtype,boolean_t reslide,boolean_t is_driverkit,uint32_t rsr_version)13750 vm_map_exec(
13751 vm_map_t new_map,
13752 task_t task,
13753 boolean_t is64bit,
13754 void *fsroot,
13755 cpu_type_t cpu,
13756 cpu_subtype_t cpu_subtype,
13757 boolean_t reslide,
13758 boolean_t is_driverkit,
13759 uint32_t rsr_version)
13760 {
13761 SHARED_REGION_TRACE_DEBUG(
13762 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13763 (void *)VM_KERNEL_ADDRPERM(current_task()),
13764 (void *)VM_KERNEL_ADDRPERM(new_map),
13765 (void *)VM_KERNEL_ADDRPERM(task),
13766 (void *)VM_KERNEL_ADDRPERM(fsroot),
13767 cpu,
13768 cpu_subtype));
13769 (void) vm_commpage_enter(new_map, task, is64bit);
13770
13771 (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13772
13773 SHARED_REGION_TRACE_DEBUG(
13774 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13775 (void *)VM_KERNEL_ADDRPERM(current_task()),
13776 (void *)VM_KERNEL_ADDRPERM(new_map),
13777 (void *)VM_KERNEL_ADDRPERM(task),
13778 (void *)VM_KERNEL_ADDRPERM(fsroot),
13779 cpu,
13780 cpu_subtype));
13781
13782 /*
13783 * Some devices have region(s) of memory that shouldn't get allocated by
13784 * user processes. The following code creates dummy vm_map_entry_t's for each
13785 * of the regions that needs to be reserved to prevent any allocations in
13786 * those regions.
13787 */
13788 kern_return_t kr = KERN_FAILURE;
13789 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13790 vmk_flags.vmkf_beyond_max = true;
13791
13792 const struct vm_reserved_region *regions = NULL;
13793 size_t num_regions = ml_get_vm_reserved_regions(is64bit, ®ions);
13794 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13795
13796 for (size_t i = 0; i < num_regions; ++i) {
13797 vm_map_offset_t address = regions[i].vmrr_addr;
13798
13799 kr = vm_map_enter(
13800 new_map,
13801 &address,
13802 regions[i].vmrr_size,
13803 (vm_map_offset_t)0,
13804 vmk_flags,
13805 VM_OBJECT_NULL,
13806 (vm_object_offset_t)0,
13807 FALSE,
13808 VM_PROT_NONE,
13809 VM_PROT_NONE,
13810 VM_INHERIT_COPY);
13811
13812 if (kr != KERN_SUCCESS) {
13813 panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13814 }
13815 }
13816
13817 new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13818
13819 return KERN_SUCCESS;
13820 }
13821
/*
 * Statistics for the copy-on-write handling of submap entries in
 * vm_map_lookup_and_lock_object() below.  Each of the three strategies
 * that routine can take when it must break sharing with a submap's
 * backing object ("copy slowly", "copy strategically", "shadow") has a
 * set of counters:
 *   _count   - number of times the strategy completed,
 *   _size    - cumulative bytes processed by the strategy,
 *   _max     - largest single copy handled by the strategy,
 *   _restart - times the lookup had to be retried after the map changed
 *              underneath the copy (slowly/strategically only),
 *   _error   - times the copy itself failed (slowly/strategically only).
 */
uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13835 /*
13836 * vm_map_lookup_and_lock_object:
13837 *
13838 * Finds the VM object, offset, and
13839 * protection for a given virtual address in the
13840 * specified map, assuming a page fault of the
13841 * type specified.
13842 *
13843 * Returns the (object, offset, protection) for
13844 * this address, whether it is wired down, and whether
13845 * this map has the only reference to the data in question.
13846 * In order to later verify this lookup, a "version"
13847 * is returned.
13848 * If contended != NULL, *contended will be set to
13849 * true iff the thread had to spin or block to acquire
13850 * an exclusive lock.
13851 *
13852 * The map MUST be locked by the caller and WILL be
13853 * locked on exit. In order to guarantee the
13854 * existence of the returned object, it is returned
13855 * locked.
13856 *
13857 * If a lookup is requested with "write protection"
13858 * specified, the map may be changed to perform virtual
13859 * copying operations, although the data referenced will
13860 * remain the same.
13861 */
13862 kern_return_t
vm_map_lookup_and_lock_object(vm_map_t * var_map,vm_map_offset_t vaddr,vm_prot_t fault_type,int object_lock_type,vm_map_version_t * out_version,vm_object_t * object,vm_object_offset_t * offset,vm_prot_t * out_prot,boolean_t * wired,vm_object_fault_info_t fault_info,vm_map_t * real_map,bool * contended)13863 vm_map_lookup_and_lock_object(
13864 vm_map_t *var_map, /* IN/OUT */
13865 vm_map_offset_t vaddr,
13866 vm_prot_t fault_type,
13867 int object_lock_type,
13868 vm_map_version_t *out_version, /* OUT */
13869 vm_object_t *object, /* OUT */
13870 vm_object_offset_t *offset, /* OUT */
13871 vm_prot_t *out_prot, /* OUT */
13872 boolean_t *wired, /* OUT */
13873 vm_object_fault_info_t fault_info, /* OUT */
13874 vm_map_t *real_map, /* OUT */
13875 bool *contended) /* OUT */
13876 {
13877 vm_map_entry_t entry;
13878 vm_map_t map = *var_map;
13879 vm_map_t old_map = *var_map;
13880 vm_map_t cow_sub_map_parent = VM_MAP_NULL;
13881 vm_map_offset_t cow_parent_vaddr = 0;
13882 vm_map_offset_t old_start = 0;
13883 vm_map_offset_t old_end = 0;
13884 vm_prot_t prot;
13885 boolean_t mask_protections;
13886 boolean_t force_copy;
13887 boolean_t no_force_copy_if_executable;
13888 boolean_t submap_needed_copy;
13889 vm_prot_t original_fault_type;
13890 vm_map_size_t fault_page_mask;
13891
13892 /*
13893 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13894 * as a mask against the mapping's actual protections, not as an
13895 * absolute value.
13896 */
13897 mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13898 force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13899 no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13900 fault_type &= VM_PROT_ALL;
13901 original_fault_type = fault_type;
13902 if (contended) {
13903 *contended = false;
13904 }
13905
13906 *real_map = map;
13907
13908 fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13909 vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13910
13911 RetryLookup:
13912 fault_type = original_fault_type;
13913
13914 /*
13915 * If the map has an interesting hint, try it before calling
13916 * full blown lookup routine.
13917 */
13918 entry = map->hint;
13919
13920 if ((entry == vm_map_to_entry(map)) ||
13921 (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13922 vm_map_entry_t tmp_entry;
13923
13924 /*
13925 * Entry was either not a valid hint, or the vaddr
13926 * was not contained in the entry, so do a full lookup.
13927 */
13928 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13929 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13930 vm_map_unlock(cow_sub_map_parent);
13931 }
13932 if ((*real_map != map)
13933 && (*real_map != cow_sub_map_parent)) {
13934 vm_map_unlock(*real_map);
13935 }
13936 return KERN_INVALID_ADDRESS;
13937 }
13938
13939 entry = tmp_entry;
13940 }
13941 if (map == old_map) {
13942 old_start = entry->vme_start;
13943 old_end = entry->vme_end;
13944 }
13945
13946 /*
13947 * Handle submaps. Drop lock on upper map, submap is
13948 * returned locked.
13949 */
13950
13951 submap_needed_copy = FALSE;
13952 submap_recurse:
13953 if (entry->is_sub_map) {
13954 vm_map_offset_t local_vaddr;
13955 vm_map_offset_t end_delta;
13956 vm_map_offset_t start_delta;
13957 vm_map_offset_t top_entry_saved_start;
13958 vm_object_offset_t top_entry_saved_offset;
13959 vm_map_entry_t submap_entry, saved_submap_entry;
13960 vm_object_offset_t submap_entry_offset;
13961 vm_object_size_t submap_entry_size;
13962 vm_prot_t subentry_protection;
13963 vm_prot_t subentry_max_protection;
13964 boolean_t subentry_no_copy_on_read;
13965 boolean_t subentry_permanent;
13966 boolean_t subentry_csm_associated;
13967 #if __arm64e__
13968 boolean_t subentry_used_for_tpro;
13969 #endif /* __arm64e__ */
13970 boolean_t mapped_needs_copy = FALSE;
13971 vm_map_version_t version;
13972
13973 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13974 "map %p (%d) entry %p submap %p (%d)\n",
13975 map, VM_MAP_PAGE_SHIFT(map), entry,
13976 VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13977
13978 local_vaddr = vaddr;
13979 top_entry_saved_start = entry->vme_start;
13980 top_entry_saved_offset = VME_OFFSET(entry);
13981
13982 if ((entry->use_pmap &&
13983 !((fault_type & VM_PROT_WRITE) ||
13984 force_copy))) {
13985 /* if real_map equals map we unlock below */
13986 if ((*real_map != map) &&
13987 (*real_map != cow_sub_map_parent)) {
13988 vm_map_unlock(*real_map);
13989 }
13990 *real_map = VME_SUBMAP(entry);
13991 }
13992
13993 if (entry->needs_copy &&
13994 ((fault_type & VM_PROT_WRITE) ||
13995 force_copy)) {
13996 if (!mapped_needs_copy) {
13997 if (vm_map_lock_read_to_write(map)) {
13998 vm_map_lock_read(map);
13999 *real_map = map;
14000 goto RetryLookup;
14001 }
14002 vm_map_lock_read(VME_SUBMAP(entry));
14003 *var_map = VME_SUBMAP(entry);
14004 cow_sub_map_parent = map;
14005 /* reset base to map before cow object */
14006 /* this is the map which will accept */
14007 /* the new cow object */
14008 old_start = entry->vme_start;
14009 old_end = entry->vme_end;
14010 cow_parent_vaddr = vaddr;
14011 mapped_needs_copy = TRUE;
14012 } else {
14013 vm_map_lock_read(VME_SUBMAP(entry));
14014 *var_map = VME_SUBMAP(entry);
14015 if ((cow_sub_map_parent != map) &&
14016 (*real_map != map)) {
14017 vm_map_unlock(map);
14018 }
14019 }
14020 } else {
14021 if (entry->needs_copy) {
14022 submap_needed_copy = TRUE;
14023 }
14024 vm_map_lock_read(VME_SUBMAP(entry));
14025 *var_map = VME_SUBMAP(entry);
14026 /* leave map locked if it is a target */
14027 /* cow sub_map above otherwise, just */
14028 /* follow the maps down to the object */
14029 /* here we unlock knowing we are not */
14030 /* revisiting the map. */
14031 if ((*real_map != map) && (map != cow_sub_map_parent)) {
14032 vm_map_unlock_read(map);
14033 }
14034 }
14035
14036 entry = NULL;
14037 map = *var_map;
14038
14039 /* calculate the offset in the submap for vaddr */
14040 local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14041 assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14042 "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14043 (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
14044
14045 RetrySubMap:
14046 if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14047 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14048 vm_map_unlock(cow_sub_map_parent);
14049 }
14050 if ((*real_map != map)
14051 && (*real_map != cow_sub_map_parent)) {
14052 vm_map_unlock(*real_map);
14053 }
14054 *real_map = map;
14055 return KERN_INVALID_ADDRESS;
14056 }
14057
14058 /* find the attenuated shadow of the underlying object */
14059 /* on our target map */
14060
14061 /* in english the submap object may extend beyond the */
14062 /* region mapped by the entry or, may only fill a portion */
14063 /* of it. For our purposes, we only care if the object */
14064 /* doesn't fill. In this case the area which will */
14065 /* ultimately be clipped in the top map will only need */
14066 /* to be as big as the portion of the underlying entry */
14067 /* which is mapped */
14068 start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14069 submap_entry->vme_start - top_entry_saved_offset : 0;
14070
14071 end_delta =
14072 (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14073 submap_entry->vme_end ?
14074 0 : (top_entry_saved_offset +
14075 (old_end - old_start))
14076 - submap_entry->vme_end;
14077
14078 old_start += start_delta;
14079 old_end -= end_delta;
14080
14081 if (submap_entry->is_sub_map) {
14082 entry = submap_entry;
14083 vaddr = local_vaddr;
14084 goto submap_recurse;
14085 }
14086
14087 if (((fault_type & VM_PROT_WRITE) ||
14088 force_copy)
14089 && cow_sub_map_parent) {
14090 vm_object_t sub_object, copy_object;
14091 vm_object_offset_t copy_offset;
14092 vm_map_offset_t local_start;
14093 vm_map_offset_t local_end;
14094 boolean_t object_copied = FALSE;
14095 vm_object_offset_t object_copied_offset = 0;
14096 boolean_t object_copied_needs_copy = FALSE;
14097 kern_return_t kr = KERN_SUCCESS;
14098
14099 if (vm_map_lock_read_to_write(map)) {
14100 vm_map_lock_read(map);
14101 old_start -= start_delta;
14102 old_end += end_delta;
14103 goto RetrySubMap;
14104 }
14105
14106
14107 sub_object = VME_OBJECT(submap_entry);
14108 if (sub_object == VM_OBJECT_NULL) {
14109 sub_object =
14110 vm_object_allocate(
14111 (vm_map_size_t)
14112 (submap_entry->vme_end -
14113 submap_entry->vme_start));
14114 VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14115 VME_OFFSET_SET(submap_entry, 0);
14116 assert(!submap_entry->is_sub_map);
14117 assert(submap_entry->use_pmap);
14118 }
14119 local_start = local_vaddr -
14120 (cow_parent_vaddr - old_start);
14121 local_end = local_vaddr +
14122 (old_end - cow_parent_vaddr);
14123 vm_map_clip_start(map, submap_entry, local_start);
14124 vm_map_clip_end(map, submap_entry, local_end);
14125 if (submap_entry->is_sub_map) {
14126 /* unnesting was done when clipping */
14127 assert(!submap_entry->use_pmap);
14128 }
14129
14130 /* This is the COW case, lets connect */
14131 /* an entry in our space to the underlying */
14132 /* object in the submap, bypassing the */
14133 /* submap. */
14134 submap_entry_offset = VME_OFFSET(submap_entry);
14135 submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14136
14137 if ((submap_entry->wired_count != 0 ||
14138 sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14139 (submap_entry->protection & VM_PROT_EXECUTE) &&
14140 no_force_copy_if_executable) {
14141 // printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14142 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14143 vm_map_unlock(cow_sub_map_parent);
14144 }
14145 if ((*real_map != map)
14146 && (*real_map != cow_sub_map_parent)) {
14147 vm_map_unlock(*real_map);
14148 }
14149 *real_map = map;
14150 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14151 vm_map_lock_write_to_read(map);
14152 kr = KERN_PROTECTION_FAILURE;
14153 DTRACE_VM4(submap_no_copy_executable,
14154 vm_map_t, map,
14155 vm_object_offset_t, submap_entry_offset,
14156 vm_object_size_t, submap_entry_size,
14157 int, kr);
14158 return kr;
14159 }
14160
14161 if (submap_entry->wired_count != 0) {
14162 vm_object_reference(sub_object);
14163
14164 assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14165 "submap_entry %p offset 0x%llx\n",
14166 submap_entry, VME_OFFSET(submap_entry));
14167
14168 DTRACE_VM6(submap_copy_slowly,
14169 vm_map_t, cow_sub_map_parent,
14170 vm_map_offset_t, vaddr,
14171 vm_map_t, map,
14172 vm_object_size_t, submap_entry_size,
14173 int, submap_entry->wired_count,
14174 int, sub_object->copy_strategy);
14175
14176 saved_submap_entry = submap_entry;
14177 version.main_timestamp = map->timestamp;
14178 vm_map_unlock(map); /* Increments timestamp by 1 */
14179 submap_entry = VM_MAP_ENTRY_NULL;
14180
14181 vm_object_lock(sub_object);
14182 kr = vm_object_copy_slowly(sub_object,
14183 submap_entry_offset,
14184 submap_entry_size,
14185 FALSE,
14186 ©_object);
14187 object_copied = TRUE;
14188 object_copied_offset = 0;
14189 /* 4k: account for extra offset in physical page */
14190 object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14191 object_copied_needs_copy = FALSE;
14192 vm_object_deallocate(sub_object);
14193
14194 vm_map_lock(map);
14195
14196 if (kr != KERN_SUCCESS &&
14197 kr != KERN_MEMORY_RESTART_COPY) {
14198 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14199 vm_map_unlock(cow_sub_map_parent);
14200 }
14201 if ((*real_map != map)
14202 && (*real_map != cow_sub_map_parent)) {
14203 vm_map_unlock(*real_map);
14204 }
14205 *real_map = map;
14206 vm_object_deallocate(copy_object);
14207 copy_object = VM_OBJECT_NULL;
14208 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14209 vm_map_lock_write_to_read(map);
14210 DTRACE_VM4(submap_copy_error_slowly,
14211 vm_object_t, sub_object,
14212 vm_object_offset_t, submap_entry_offset,
14213 vm_object_size_t, submap_entry_size,
14214 int, kr);
14215 vm_map_lookup_and_lock_object_copy_slowly_error++;
14216 return kr;
14217 }
14218
14219 if ((kr == KERN_SUCCESS) &&
14220 (version.main_timestamp + 1) == map->timestamp) {
14221 submap_entry = saved_submap_entry;
14222 } else {
14223 saved_submap_entry = NULL;
14224 old_start -= start_delta;
14225 old_end += end_delta;
14226 vm_object_deallocate(copy_object);
14227 copy_object = VM_OBJECT_NULL;
14228 vm_map_lock_write_to_read(map);
14229 vm_map_lookup_and_lock_object_copy_slowly_restart++;
14230 goto RetrySubMap;
14231 }
14232 vm_map_lookup_and_lock_object_copy_slowly_count++;
14233 vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14234 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14235 vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14236 }
14237 } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14238 submap_entry_offset = VME_OFFSET(submap_entry);
14239 copy_object = VM_OBJECT_NULL;
14240 object_copied_offset = submap_entry_offset;
14241 object_copied_needs_copy = FALSE;
14242 DTRACE_VM6(submap_copy_strategically,
14243 vm_map_t, cow_sub_map_parent,
14244 vm_map_offset_t, vaddr,
14245 vm_map_t, map,
14246 vm_object_size_t, submap_entry_size,
14247 int, submap_entry->wired_count,
14248 int, sub_object->copy_strategy);
14249 kr = vm_object_copy_strategically(
14250 sub_object,
14251 submap_entry_offset,
14252 submap_entry->vme_end - submap_entry->vme_start,
14253 false, /* forking */
14254 ©_object,
14255 &object_copied_offset,
14256 &object_copied_needs_copy);
14257 if (kr == KERN_MEMORY_RESTART_COPY) {
14258 old_start -= start_delta;
14259 old_end += end_delta;
14260 vm_object_deallocate(copy_object);
14261 copy_object = VM_OBJECT_NULL;
14262 vm_map_lock_write_to_read(map);
14263 vm_map_lookup_and_lock_object_copy_strategically_restart++;
14264 goto RetrySubMap;
14265 }
14266 if (kr != KERN_SUCCESS) {
14267 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14268 vm_map_unlock(cow_sub_map_parent);
14269 }
14270 if ((*real_map != map)
14271 && (*real_map != cow_sub_map_parent)) {
14272 vm_map_unlock(*real_map);
14273 }
14274 *real_map = map;
14275 vm_object_deallocate(copy_object);
14276 copy_object = VM_OBJECT_NULL;
14277 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14278 vm_map_lock_write_to_read(map);
14279 DTRACE_VM4(submap_copy_error_strategically,
14280 vm_object_t, sub_object,
14281 vm_object_offset_t, submap_entry_offset,
14282 vm_object_size_t, submap_entry_size,
14283 int, kr);
14284 vm_map_lookup_and_lock_object_copy_strategically_error++;
14285 return kr;
14286 }
14287 assert(copy_object != VM_OBJECT_NULL);
14288 assert(copy_object != sub_object);
14289 object_copied = TRUE;
14290 vm_map_lookup_and_lock_object_copy_strategically_count++;
14291 vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14292 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14293 vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14294 }
14295 } else {
14296 /* set up shadow object */
14297 object_copied = FALSE;
14298 copy_object = sub_object;
14299 vm_object_lock(sub_object);
14300 vm_object_reference_locked(sub_object);
14301 VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14302 vm_object_unlock(sub_object);
14303
14304 assert(submap_entry->wired_count == 0);
14305 submap_entry->needs_copy = TRUE;
14306
14307 prot = submap_entry->protection;
14308 if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14309 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14310 __FUNCTION__,
14311 map, map->pmap, submap_entry,
14312 (uint64_t)submap_entry->vme_start,
14313 (uint64_t)submap_entry->vme_end,
14314 prot);
14315 }
14316 prot = prot & ~VM_PROT_WRITE;
14317 if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14318 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14319 __FUNCTION__,
14320 map, map->pmap, submap_entry,
14321 (uint64_t)submap_entry->vme_start,
14322 (uint64_t)submap_entry->vme_end,
14323 prot);
14324 }
14325
14326 if (override_nx(old_map,
14327 VME_ALIAS(submap_entry))
14328 && prot) {
14329 prot |= VM_PROT_EXECUTE;
14330 }
14331
14332 vm_object_pmap_protect(
14333 sub_object,
14334 VME_OFFSET(submap_entry),
14335 submap_entry->vme_end -
14336 submap_entry->vme_start,
14337 (submap_entry->is_shared
14338 || map->mapped_in_other_pmaps) ?
14339 PMAP_NULL : map->pmap,
14340 VM_MAP_PAGE_SIZE(map),
14341 submap_entry->vme_start,
14342 prot);
14343 vm_map_lookup_and_lock_object_copy_shadow_count++;
14344 vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14345 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14346 vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14347 }
14348 }
14349
14350 /*
14351 * Adjust the fault offset to the submap entry.
14352 */
14353 copy_offset = (local_vaddr -
14354 submap_entry->vme_start +
14355 VME_OFFSET(submap_entry));
14356
/* This works differently than the */
14358 /* normal submap case. We go back */
14359 /* to the parent of the cow map and*/
14360 /* clip out the target portion of */
14361 /* the sub_map, substituting the */
14362 /* new copy object, */
14363
14364 subentry_protection = submap_entry->protection;
14365 subentry_max_protection = submap_entry->max_protection;
14366 subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14367 subentry_permanent = submap_entry->vme_permanent;
14368 subentry_csm_associated = submap_entry->csm_associated;
14369 #if __arm64e__
14370 subentry_used_for_tpro = submap_entry->used_for_tpro;
14371 #endif // __arm64e__
14372 vm_map_unlock(map);
14373 submap_entry = NULL; /* not valid after map unlock */
14374
14375 local_start = old_start;
14376 local_end = old_end;
14377 map = cow_sub_map_parent;
14378 *var_map = cow_sub_map_parent;
14379 vaddr = cow_parent_vaddr;
14380 cow_sub_map_parent = NULL;
14381
14382 if (!vm_map_lookup_entry(map,
14383 vaddr, &entry)) {
14384 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14385 vm_map_unlock(cow_sub_map_parent);
14386 }
14387 if ((*real_map != map)
14388 && (*real_map != cow_sub_map_parent)) {
14389 vm_map_unlock(*real_map);
14390 }
14391 *real_map = map;
14392 vm_object_deallocate(
14393 copy_object);
14394 copy_object = VM_OBJECT_NULL;
14395 vm_map_lock_write_to_read(map);
14396 DTRACE_VM4(submap_lookup_post_unlock,
14397 uint64_t, (uint64_t)entry->vme_start,
14398 uint64_t, (uint64_t)entry->vme_end,
14399 vm_map_offset_t, vaddr,
14400 int, object_copied);
14401 return KERN_INVALID_ADDRESS;
14402 }
14403
14404 /* clip out the portion of space */
14405 /* mapped by the sub map which */
14406 /* corresponds to the underlying */
14407 /* object */
14408
14409 /*
14410 * Clip (and unnest) the smallest nested chunk
14411 * possible around the faulting address...
14412 */
14413 local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14414 local_end = local_start + pmap_shared_region_size_min(map->pmap);
14415 /*
14416 * ... but don't go beyond the "old_start" to "old_end"
14417 * range, to avoid spanning over another VM region
14418 * with a possibly different VM object and/or offset.
14419 */
14420 if (local_start < old_start) {
14421 local_start = old_start;
14422 }
14423 if (local_end > old_end) {
14424 local_end = old_end;
14425 }
14426 /*
14427 * Adjust copy_offset to the start of the range.
14428 */
14429 copy_offset -= (vaddr - local_start);
14430
14431 vm_map_clip_start(map, entry, local_start);
14432 vm_map_clip_end(map, entry, local_end);
14433 if (entry->is_sub_map) {
14434 /* unnesting was done when clipping */
14435 assert(!entry->use_pmap);
14436 }
14437
14438 /* substitute copy object for */
14439 /* shared map entry */
14440 vm_map_deallocate(VME_SUBMAP(entry));
14441 assert(!entry->iokit_acct);
14442 entry->use_pmap = TRUE;
14443 VME_OBJECT_SET(entry, copy_object, false, 0);
14444
14445 /* propagate the submap entry's protections */
14446 if (entry->protection != VM_PROT_READ) {
14447 /*
14448 * Someone has already altered the top entry's
14449 * protections via vm_protect(VM_PROT_COPY).
14450 * Respect these new values and ignore the
14451 * submap entry's protections.
14452 */
14453 } else {
14454 /*
14455 * Regular copy-on-write: propagate the submap
14456 * entry's protections to the top map entry.
14457 */
14458 entry->protection |= subentry_protection;
14459 }
14460 entry->max_protection |= subentry_max_protection;
14461 /* propagate some attributes from subentry */
14462 entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14463 entry->vme_permanent = subentry_permanent;
14464 entry->csm_associated = subentry_csm_associated;
14465 #if __arm64e__
14466 /* propagate TPRO iff the destination map has TPRO enabled */
14467 if (subentry_used_for_tpro) {
14468 if (vm_map_tpro(map)) {
14469 entry->used_for_tpro = subentry_used_for_tpro;
14470 } else {
14471 /* "permanent" came from being TPRO */
14472 entry->vme_permanent = FALSE;
14473 }
14474 }
#endif /* __arm64e__ */
14476 if ((entry->protection & VM_PROT_WRITE) &&
14477 (entry->protection & VM_PROT_EXECUTE) &&
14478 #if XNU_TARGET_OS_OSX
14479 map->pmap != kernel_pmap &&
14480 (vm_map_cs_enforcement(map)
14481 #if __arm64__
14482 || !VM_MAP_IS_EXOTIC(map)
14483 #endif /* __arm64__ */
14484 ) &&
14485 #endif /* XNU_TARGET_OS_OSX */
14486 #if CODE_SIGNING_MONITOR
14487 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14488 #endif
14489 !(entry->used_for_jit) &&
14490 VM_MAP_POLICY_WX_STRIP_X(map)) {
14491 DTRACE_VM3(cs_wx,
14492 uint64_t, (uint64_t)entry->vme_start,
14493 uint64_t, (uint64_t)entry->vme_end,
14494 vm_prot_t, entry->protection);
14495 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14496 proc_selfpid(),
14497 (get_bsdtask_info(current_task())
14498 ? proc_name_address(get_bsdtask_info(current_task()))
14499 : "?"),
14500 __FUNCTION__, __LINE__,
14501 #if DEVELOPMENT || DEBUG
14502 (uint64_t)entry->vme_start,
14503 (uint64_t)entry->vme_end,
14504 #else /* DEVELOPMENT || DEBUG */
14505 (uint64_t)0,
14506 (uint64_t)0,
14507 #endif /* DEVELOPMENT || DEBUG */
14508 entry->protection);
14509 entry->protection &= ~VM_PROT_EXECUTE;
14510 }
14511
14512 if (object_copied) {
14513 VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14514 entry->needs_copy = object_copied_needs_copy;
14515 entry->is_shared = FALSE;
14516 } else {
14517 assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14518 assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14519 assert(entry->wired_count == 0);
14520 VME_OFFSET_SET(entry, copy_offset);
14521 entry->needs_copy = TRUE;
14522 if (map != old_map) {
14523 entry->is_shared = TRUE;
14524 }
14525 }
14526 if (entry->inheritance == VM_INHERIT_SHARE) {
14527 entry->inheritance = VM_INHERIT_COPY;
14528 }
14529
14530 vm_map_lock_write_to_read(map);
14531 } else {
14532 if ((cow_sub_map_parent)
14533 && (cow_sub_map_parent != *real_map)
14534 && (cow_sub_map_parent != map)) {
14535 vm_map_unlock(cow_sub_map_parent);
14536 }
14537 entry = submap_entry;
14538 vaddr = local_vaddr;
14539 }
14540 }
14541
14542 /*
14543 * Check whether this task is allowed to have
14544 * this page.
14545 */
14546
14547 prot = entry->protection;
14548
14549 if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14550 /*
14551 * HACK -- if not a stack, then allow execution
14552 */
14553 prot |= VM_PROT_EXECUTE;
14554 }
14555
14556 #if __arm64e__
14557 /*
14558 * If the entry we're dealing with is TPRO and we have a write
14559 * fault, inject VM_PROT_WRITE into protections. This allows us
14560 * to maintain RO permissions when not marked as TPRO.
14561 */
14562 if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14563 prot |= VM_PROT_WRITE;
14564 }
14565 #endif /* __arm64e__ */
14566 if (mask_protections) {
14567 fault_type &= prot;
14568 if (fault_type == VM_PROT_NONE) {
14569 goto protection_failure;
14570 }
14571 }
14572 if (((fault_type & prot) != fault_type)
14573 #if __arm64__
14574 /* prefetch abort in execute-only page */
14575 && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14576 #elif defined(__x86_64__)
14577 /* Consider the UEXEC bit when handling an EXECUTE fault */
14578 && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14579 #endif
14580 ) {
14581 protection_failure:
14582 if (*real_map != map) {
14583 vm_map_unlock(*real_map);
14584 }
14585 *real_map = map;
14586
14587 if ((fault_type & VM_PROT_EXECUTE) && prot) {
14588 log_stack_execution_failure((addr64_t)vaddr, prot);
14589 }
14590
14591 DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14592 DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14593 /*
14594 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14595 *
14596 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14597 */
14598 return KERN_PROTECTION_FAILURE;
14599 }
14600
14601 /*
14602 * If this page is not pageable, we have to get
14603 * it for all possible accesses.
14604 */
14605
14606 *wired = (entry->wired_count != 0);
14607 if (*wired) {
14608 fault_type = prot;
14609 }
14610
14611 /*
14612 * If the entry was copy-on-write, we either ...
14613 */
14614
14615 if (entry->needs_copy) {
14616 /*
14617 * If we want to write the page, we may as well
14618 * handle that now since we've got the map locked.
14619 *
14620 * If we don't need to write the page, we just
14621 * demote the permissions allowed.
14622 */
14623
14624 if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14625 /*
14626 * Make a new object, and place it in the
14627 * object chain. Note that no new references
14628 * have appeared -- one just moved from the
14629 * map to the new object.
14630 */
14631
14632 if (vm_map_lock_read_to_write(map)) {
14633 vm_map_lock_read(map);
14634 goto RetryLookup;
14635 }
14636
14637 if (VME_OBJECT(entry)->shadowed == FALSE) {
14638 vm_object_lock(VME_OBJECT(entry));
14639 VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14640 vm_object_unlock(VME_OBJECT(entry));
14641 }
14642 VME_OBJECT_SHADOW(entry,
14643 (vm_map_size_t) (entry->vme_end -
14644 entry->vme_start),
14645 vm_map_always_shadow(map));
14646 entry->needs_copy = FALSE;
14647
14648 vm_map_lock_write_to_read(map);
14649 }
14650 if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14651 /*
14652 * We're attempting to read a copy-on-write
14653 * page -- don't allow writes.
14654 */
14655
14656 prot &= (~VM_PROT_WRITE);
14657 }
14658 }
14659
14660 if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14661 /*
14662 * We went through a "needs_copy" submap without triggering
14663 * a copy, so granting write access to the page would bypass
14664 * that submap's "needs_copy".
14665 */
14666 assert(!(fault_type & VM_PROT_WRITE));
14667 assert(!*wired);
14668 assert(!force_copy);
14669 // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14670 prot &= ~VM_PROT_WRITE;
14671 }
14672
14673 /*
14674 * Create an object if necessary.
14675 */
14676 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14677 if (vm_map_lock_read_to_write(map)) {
14678 vm_map_lock_read(map);
14679 goto RetryLookup;
14680 }
14681
14682 VME_OBJECT_SET(entry,
14683 vm_object_allocate(
14684 (vm_map_size_t)(entry->vme_end -
14685 entry->vme_start)), false, 0);
14686 VME_OFFSET_SET(entry, 0);
14687 assert(entry->use_pmap);
14688 vm_map_lock_write_to_read(map);
14689 }
14690
14691 /*
14692 * Return the object/offset from this entry. If the entry
14693 * was copy-on-write or empty, it has been fixed up. Also
14694 * return the protection.
14695 */
14696
14697 *offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14698 *object = VME_OBJECT(entry);
14699 *out_prot = prot;
14700 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14701
14702 if (fault_info) {
14703 /* ... the caller will change "interruptible" if needed */
14704 fault_info->user_tag = VME_ALIAS(entry);
14705 fault_info->pmap_options = 0;
14706 if (entry->iokit_acct ||
14707 (!entry->is_sub_map && !entry->use_pmap)) {
14708 fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14709 }
14710 if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) {
14711 fault_info->behavior = entry->behavior;
14712 }
14713 fault_info->lo_offset = VME_OFFSET(entry);
14714 fault_info->hi_offset =
14715 (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14716 fault_info->no_cache = entry->no_cache;
14717 fault_info->stealth = FALSE;
14718 fault_info->io_sync = FALSE;
14719 if (entry->used_for_jit ||
14720 #if CODE_SIGNING_MONITOR
14721 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14722 #endif
14723 entry->vme_resilient_codesign) {
14724 fault_info->cs_bypass = TRUE;
14725 } else {
14726 fault_info->cs_bypass = FALSE;
14727 }
14728 fault_info->csm_associated = FALSE;
14729 #if CODE_SIGNING_MONITOR
14730 if (entry->csm_associated) {
14731 /*
14732 * The pmap layer will validate this page
14733 * before allowing it to be executed from.
14734 */
14735 fault_info->csm_associated = TRUE;
14736 }
14737 #endif
14738 fault_info->mark_zf_absent = FALSE;
14739 fault_info->batch_pmap_op = FALSE;
14740 fault_info->resilient_media = entry->vme_resilient_media;
14741 fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14742 fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14743 #if __arm64e__
14744 fault_info->fi_used_for_tpro = entry->used_for_tpro;
14745 #else /* __arm64e__ */
14746 fault_info->fi_used_for_tpro = FALSE;
14747 #endif
14748 if (entry->translated_allow_execute) {
14749 fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14750 }
14751 }
14752
14753 /*
14754 * Lock the object to prevent it from disappearing
14755 */
14756 if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14757 if (contended == NULL) {
14758 vm_object_lock(*object);
14759 } else {
14760 *contended = vm_object_lock_check_contended(*object);
14761 }
14762 } else {
14763 vm_object_lock_shared(*object);
14764 }
14765
14766 /*
14767 * Save the version number
14768 */
14769
14770 out_version->main_timestamp = map->timestamp;
14771
14772 return KERN_SUCCESS;
14773 }
14774
14775
14776 /*
14777 * vm_map_verify:
14778 *
14779 * Verifies that the map in question has not changed
14780 * since the given version. The map has to be locked
14781 * ("shared" mode is fine) before calling this function
14782 * and it will be returned locked too.
14783 */
14784 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14785 vm_map_verify(
14786 vm_map_t map,
14787 vm_map_version_t *version) /* REF */
14788 {
14789 boolean_t result;
14790
14791 vm_map_lock_assert_held(map);
14792 result = (map->timestamp == version->main_timestamp);
14793
14794 return result;
14795 }
14796
14797 /*
14798 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14799 * Goes away after regular vm_region_recurse function migrates to
14800 * 64 bits
14801 * vm_region_recurse: A form of vm_region which follows the
14802 * submaps in a target map
14803 *
14804 */
14805
kern_return_t
vm_map_region_recurse_64(
	vm_map_t                 map,
	vm_map_offset_ut        *address_u,     /* IN/OUT */
	vm_map_size_ut          *size_u,        /* OUT */
	natural_t               *nesting_depth, /* IN/OUT */
	vm_region_submap_info_64_t submap_info, /* IN/OUT */
	mach_msg_type_number_t  *count)         /* IN/OUT */
{
	mach_msg_type_number_t  original_count;
	vm_region_extended_info_data_t extended;
	vm_map_entry_t          tmp_entry;
	vm_map_offset_t         user_address;
	unsigned int            user_max_depth;

	/*
	 * "curr_entry" is the VM map entry preceding or including the
	 * address we're looking for.
	 * "curr_map" is the map or sub-map containing "curr_entry".
	 * "curr_address" is the equivalent of the top map's "user_address"
	 * in the current map.
	 * "curr_offset" is the cumulated offset of "curr_map" in the
	 * target task's address space.
	 * "curr_depth" is the depth of "curr_map" in the chain of
	 * sub-maps.
	 *
	 * "curr_max_below" and "curr_max_above" limit the range (around
	 * "curr_address") we should take into account in the current (sub)map.
	 * They limit the range to what's visible through the map entries
	 * we've traversed from the top map to the current map.
	 *
	 */
	vm_map_entry_t          curr_entry;
	vm_map_address_t        curr_address;
	vm_map_offset_t         curr_offset;
	vm_map_t                curr_map;
	unsigned int            curr_depth;
	vm_map_offset_t         curr_max_below, curr_max_above;
	vm_map_offset_t         curr_skip;

	/*
	 * "next_" is the same as "curr_" but for the VM region immediately
	 * after the address we're looking for. We need to keep track of this
	 * too because we want to return info about that region if the
	 * address we're looking for is not mapped.
	 */
	vm_map_entry_t          next_entry;
	vm_map_offset_t         next_offset;
	vm_map_offset_t         next_address;
	vm_map_t                next_map;
	unsigned int            next_depth;
	vm_map_offset_t         next_max_below, next_max_above;
	vm_map_offset_t         next_skip;

	boolean_t               look_for_pages;
	vm_region_submap_short_info_64_t short_info;
	boolean_t               do_region_footprint;
	int                     effective_page_size, effective_page_shift;
	boolean_t               submap_needed_copy;

	if (map == VM_MAP_NULL) {
		/* no address space to work on */
		return KERN_INVALID_ARGUMENT;
	}

	user_address = vm_sanitize_addr(map, *address_u);

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);

	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
		/*
		 * "info" structure is not big enough and
		 * would overflow
		 */
		return KERN_INVALID_ARGUMENT;
	}

	do_region_footprint = task_self_region_footprint();
	original_count = *count;

	/*
	 * Pick the output flavor based on the caller-provided count:
	 * a short info structure for old callers, otherwise the full
	 * submap info at the highest version the count allows (V0/V1/V2).
	 */
	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
		look_for_pages = FALSE;
		short_info = (vm_region_submap_short_info_64_t) submap_info;
		submap_info = NULL;
	} else {
		look_for_pages = TRUE;
		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
		short_info = NULL;

		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
		}
	}

	user_max_depth = *nesting_depth;
	submap_needed_copy = FALSE;

	/* all map locking below is skipped when !not_in_kdp (debugger context) */
	if (not_in_kdp) {
		vm_map_lock_read(map);
	}

	/* (re)start the traversal from the top map at "user_address" */
recurse_again:
	curr_entry = NULL;
	curr_map = map;
	curr_address = user_address;
	curr_offset = 0;
	curr_skip = 0;
	curr_depth = 0;
	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
	curr_max_below = curr_address;

	next_entry = NULL;
	next_map = NULL;
	next_address = 0;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_above = (vm_map_offset_t) -1;
	next_max_below = (vm_map_offset_t) -1;

	/* walk down the chain of sub-maps towards "user_address" */
	for (;;) {
		if (vm_map_lookup_entry(curr_map,
		    curr_address,
		    &tmp_entry)) {
			/* tmp_entry contains the address we're looking for */
			curr_entry = tmp_entry;
		} else {
			vm_map_offset_t skip;
			/*
			 * The address is not mapped.  "tmp_entry" is the
			 * map entry preceding the address.  We want the next
			 * one, if it exists.
			 */
			curr_entry = tmp_entry->vme_next;

			if (curr_entry == vm_map_to_entry(curr_map) ||
			    (curr_entry->vme_start >=
			    curr_address + curr_max_above)) {
				/* no next entry at this level: stop looking */
				if (not_in_kdp) {
					vm_map_unlock_read(curr_map);
				}
				curr_entry = NULL;
				curr_map = NULL;
				curr_skip = 0;
				curr_offset = 0;
				curr_depth = 0;
				curr_max_above = 0;
				curr_max_below = 0;
				break;
			}

			/* adjust current address and offset */
			skip = curr_entry->vme_start - curr_address;
			curr_address = curr_entry->vme_start;
			curr_skip += skip;
			curr_offset += skip;
			curr_max_above -= skip;
			curr_max_below = 0;
		}

		/*
		 * Is the next entry at this level closer to the address (or
		 * deeper in the submap chain) than the one we had
		 * so far ?
		 */
		tmp_entry = curr_entry->vme_next;
		if (tmp_entry == vm_map_to_entry(curr_map)) {
			/* no next entry at this level */
		} else if (tmp_entry->vme_start >=
		    curr_address + curr_max_above) {
			/*
			 * tmp_entry is beyond the scope of what we mapped of
			 * this submap in the upper level: ignore it.
			 */
		} else if ((next_entry == NULL) ||
		    (tmp_entry->vme_start + curr_offset <=
		    next_entry->vme_start + next_offset)) {
			/*
			 * We didn't have a "next_entry" or this one is
			 * closer to the address we're looking for:
			 * use this "tmp_entry" as the new "next_entry".
			 */
			if (next_entry != NULL) {
				/* unlock the last "next_map" */
				if (next_map != curr_map && not_in_kdp) {
					vm_map_unlock_read(next_map);
				}
			}
			next_entry = tmp_entry;
			next_map = curr_map;
			next_depth = curr_depth;
			next_address = next_entry->vme_start;
			next_skip = curr_skip;
			next_skip += (next_address - curr_address);
			next_offset = curr_offset;
			next_offset += (next_address - curr_address);
			next_max_above = MIN(next_max_above, curr_max_above);
			next_max_above = MIN(next_max_above,
			    next_entry->vme_end - next_address);
			next_max_below = MIN(next_max_below, curr_max_below);
			next_max_below = MIN(next_max_below,
			    next_address - next_entry->vme_start);
		}

		/*
		 * "curr_max_{above,below}" allow us to keep track of the
		 * portion of the submap that is actually mapped at this level:
		 * the rest of that submap is irrelevant to us, since it's not
		 * mapped here.
		 * The relevant portion of the map starts at
		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
		 */
		curr_max_above = MIN(curr_max_above,
		    curr_entry->vme_end - curr_address);
		curr_max_below = MIN(curr_max_below,
		    curr_address - curr_entry->vme_start);

		if (!curr_entry->is_sub_map ||
		    curr_depth >= user_max_depth) {
			/*
			 * We hit a leaf map or we reached the maximum depth
			 * we could, so stop looking.  Keep the current map
			 * locked.
			 */
			break;
		}

		/*
		 * Get down to the next submap level.
		 */

		if (curr_entry->needs_copy) {
			/* everything below this is effectively copy-on-write */
			submap_needed_copy = TRUE;
		}

		/*
		 * Lock the next level and unlock the current level,
		 * unless we need to keep it locked to access the "next_entry"
		 * later.
		 */
		if (not_in_kdp) {
			vm_map_lock_read(VME_SUBMAP(curr_entry));
		}
		if (curr_map == next_map) {
			/* keep "next_map" locked in case we need it */
		} else {
			/* release this map */
			if (not_in_kdp) {
				vm_map_unlock_read(curr_map);
			}
		}

		/*
		 * Adjust the offset.  "curr_entry" maps the submap
		 * at relative address "curr_entry->vme_start" in the
		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
		 * bytes of the submap.
		 * "curr_offset" always represents the offset of a virtual
		 * address in the curr_map relative to the absolute address
		 * space (i.e. the top-level VM map).
		 */
		curr_offset +=
		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
		curr_address = user_address + curr_offset;
		/* switch to the submap */
		curr_map = VME_SUBMAP(curr_entry);
		curr_depth++;
		curr_entry = NULL;
	}

	// LP64todo: all the current tools are 32bit, obviously never worked for 64b
	// so probably should be a real 32b ID vs. ptr.
	// Current users just check for equality

	if (curr_entry == NULL) {
		/* no VM region contains the address... */

		/*
		 * NOTE(review): this path is only taken with next_entry == NULL,
		 * so no "next_map" read lock can still be held here.
		 */
		if (do_region_footprint && /* we want footprint numbers */
		    next_entry == NULL && /* & there are no more regions */
		    /* & we haven't already provided our fake region: */
		    user_address <= vm_map_last_entry(map)->vme_end) {
			ledger_amount_t ledger_resident, ledger_compressed;

			/*
			 * Add a fake memory region to account for
			 * purgeable and/or ledger-tagged memory that
			 * counts towards this task's memory footprint,
			 * i.e. the resident/compressed pages of non-volatile
			 * objects owned by that task.
			 */
			task_ledgers_footprint(map->pmap->ledger,
			    &ledger_resident,
			    &ledger_compressed);
			if (ledger_resident + ledger_compressed == 0) {
				/* no purgeable memory usage to report */
				return KERN_INVALID_ADDRESS;
			}
			/* fake region to show nonvolatile footprint */
			if (look_for_pages) {
				submap_info->protection = VM_PROT_DEFAULT;
				submap_info->max_protection = VM_PROT_DEFAULT;
				submap_info->inheritance = VM_INHERIT_DEFAULT;
				submap_info->offset = 0;
				submap_info->user_tag = -1;
				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
				submap_info->pages_shared_now_private = 0;
				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
				submap_info->pages_dirtied = submap_info->pages_resident;
				submap_info->ref_count = 1;
				submap_info->shadow_depth = 0;
				submap_info->external_pager = 0;
				submap_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					submap_info->share_mode = SM_COW;
				}
				submap_info->is_submap = 0;
				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				submap_info->user_wired_count = 0;
				submap_info->pages_reusable = 0;
			} else {
				short_info->user_tag = -1;
				short_info->offset = 0;
				short_info->protection = VM_PROT_DEFAULT;
				short_info->inheritance = VM_INHERIT_DEFAULT;
				short_info->max_protection = VM_PROT_DEFAULT;
				short_info->behavior = VM_BEHAVIOR_DEFAULT;
				short_info->user_wired_count = 0;
				short_info->is_submap = 0;
				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				short_info->external_pager = 0;
				short_info->shadow_depth = 0;
				short_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					short_info->share_mode = SM_COW;
				}
				short_info->ref_count = 1;
			}
			*nesting_depth = 0;
			*address_u = vm_sanitize_wrap_addr(vm_map_last_entry(map)->vme_end);
			*size_u = vm_sanitize_wrap_size(ledger_resident + ledger_compressed);
			return KERN_SUCCESS;
		}

		if (next_entry == NULL) {
			/* ... and no VM region follows it either */
			return KERN_INVALID_ADDRESS;
		}
		/* ... gather info about the next VM region */
		curr_entry = next_entry;
		curr_map = next_map;    /* still locked ... */
		curr_address = next_address;
		curr_skip = next_skip;
		curr_offset = next_offset;
		curr_depth = next_depth;
		curr_max_above = next_max_above;
		curr_max_below = next_max_below;
	} else {
		/* we won't need "next_entry" after all */
		if (next_entry != NULL) {
			/* release "next_map" */
			if (next_map != curr_map && not_in_kdp) {
				vm_map_unlock_read(next_map);
			}
		}
	}
	/* "next_*" state is dead from here on; reset it */
	next_entry = NULL;
	next_map = NULL;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_below = -1;
	next_max_above = -1;

	if (curr_entry->is_sub_map &&
	    curr_depth < user_max_depth) {
		/*
		 * We're not as deep as we could be: we must have
		 * gone back up after not finding anything mapped
		 * below the original top-level map entry's.
		 * Let's move "curr_address" forward and recurse again.
		 */
		user_address = curr_address;
		goto recurse_again;
	}

	*nesting_depth = curr_depth;
	*address_u = vm_sanitize_wrap_addr(
		user_address + curr_skip - curr_max_below);
	*size_u = vm_sanitize_wrap_size(curr_max_above + curr_max_below);

	/* fill in the entry-level attributes for the chosen flavor */
	if (look_for_pages) {
		submap_info->user_tag = VME_ALIAS(curr_entry);
		submap_info->offset = VME_OFFSET(curr_entry);
		submap_info->protection = curr_entry->protection;
		submap_info->inheritance = curr_entry->inheritance;
		submap_info->max_protection = curr_entry->max_protection;
		submap_info->behavior = curr_entry->behavior;
		submap_info->user_wired_count = curr_entry->user_wired_count;
		submap_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	} else {
		short_info->user_tag = VME_ALIAS(curr_entry);
		short_info->offset = VME_OFFSET(curr_entry);
		short_info->protection = curr_entry->protection;
		short_info->inheritance = curr_entry->inheritance;
		short_info->max_protection = curr_entry->max_protection;
		short_info->behavior = curr_entry->behavior;
		short_info->user_wired_count = curr_entry->user_wired_count;
		short_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	}

	extended.pages_resident = 0;
	extended.pages_swapped_out = 0;
	extended.pages_shared_now_private = 0;
	extended.pages_dirtied = 0;
	extended.pages_reusable = 0;
	extended.external_pager = 0;
	extended.shadow_depth = 0;
	extended.share_mode = SM_EMPTY;
	extended.ref_count = 0;

	if (not_in_kdp) {
		if (!curr_entry->is_sub_map) {
			vm_map_offset_t range_start, range_end;
			/* clip the walk to the visible portion of the entry */
			range_start = MAX((curr_address - curr_max_below),
			    curr_entry->vme_start);
			range_end = MIN((curr_address + curr_max_above),
			    curr_entry->vme_end);
			vm_map_region_walk(curr_map,
			    range_start,
			    curr_entry,
			    (VME_OFFSET(curr_entry) +
			    (range_start -
			    curr_entry->vme_start)),
			    range_end - range_start,
			    &extended,
			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
			if (submap_needed_copy) {
				extended.share_mode = SM_COW;
			}
		} else {
			if (curr_entry->use_pmap) {
				extended.share_mode = SM_TRUESHARED;
			} else {
				extended.share_mode = SM_PRIVATE;
			}
			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
		}
	}

	if (look_for_pages) {
		submap_info->pages_resident = extended.pages_resident;
		submap_info->pages_swapped_out = extended.pages_swapped_out;
		submap_info->pages_shared_now_private =
		    extended.pages_shared_now_private;
		submap_info->pages_dirtied = extended.pages_dirtied;
		submap_info->external_pager = extended.external_pager;
		submap_info->shadow_depth = extended.shadow_depth;
		submap_info->share_mode = extended.share_mode;
		submap_info->ref_count = extended.ref_count;

		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			submap_info->pages_reusable = extended.pages_reusable;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			if (curr_entry->is_sub_map) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
			} else if (VME_OBJECT(curr_entry)) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
			} else {
				submap_info->object_id_full = 0ull;
			}
		}
	} else {
		short_info->external_pager = extended.external_pager;
		short_info->shadow_depth = extended.shadow_depth;
		short_info->share_mode = extended.share_mode;
		short_info->ref_count = extended.ref_count;
	}

	if (not_in_kdp) {
		vm_map_unlock_read(curr_map);
	}

	return KERN_SUCCESS;
}
15309
15310 /*
15311 * vm_region:
15312 *
15313 * User call to obtain information about a region in
15314 * a task's address map. Currently, only one flavor is
15315 * supported.
15316 *
15317 * XXX The reserved and behavior fields cannot be filled
15318 * in until the vm merge from the IK is completed, and
15319 * vm_reserve is implemented.
15320 */
15321
15322 kern_return_t
vm_map_region(vm_map_t map,vm_map_offset_ut * address_u,vm_map_size_ut * size_u,vm_region_flavor_t flavor,vm_region_info_t info,mach_msg_type_number_t * count,mach_port_t * object_name)15323 vm_map_region(
15324 vm_map_t map,
15325 vm_map_offset_ut *address_u, /* IN/OUT */
15326 vm_map_size_ut *size_u, /* OUT */
15327 vm_region_flavor_t flavor, /* IN */
15328 vm_region_info_t info, /* OUT */
15329 mach_msg_type_number_t *count, /* IN/OUT */
15330 mach_port_t *object_name) /* OUT */
15331 {
15332 vm_map_entry_t tmp_entry;
15333 vm_map_entry_t entry;
15334 vm_map_offset_t start;
15335
15336 if (map == VM_MAP_NULL) {
15337 return KERN_INVALID_ARGUMENT;
15338 }
15339
15340 start = vm_sanitize_addr(map, *address_u);
15341
15342 switch (flavor) {
15343 case VM_REGION_BASIC_INFO:
15344 /* legacy for old 32-bit objects info */
15345 {
15346 vm_region_basic_info_t basic;
15347
15348 if (*count < VM_REGION_BASIC_INFO_COUNT) {
15349 return KERN_INVALID_ARGUMENT;
15350 }
15351
15352 basic = (vm_region_basic_info_t) info;
15353 *count = VM_REGION_BASIC_INFO_COUNT;
15354
15355 vm_map_lock_read(map);
15356
15357 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15358 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15359 vm_map_unlock_read(map);
15360 return KERN_INVALID_ADDRESS;
15361 }
15362 } else {
15363 entry = tmp_entry;
15364 }
15365
15366 start = entry->vme_start;
15367
15368 basic->offset = (uint32_t)VME_OFFSET(entry);
15369 basic->protection = entry->protection;
15370 basic->inheritance = entry->inheritance;
15371 basic->max_protection = entry->max_protection;
15372 basic->behavior = entry->behavior;
15373 basic->user_wired_count = entry->user_wired_count;
15374 basic->reserved = entry->is_sub_map;
15375
15376 *address_u = vm_sanitize_wrap_addr(start);
15377 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15378
15379 if (object_name) {
15380 *object_name = IP_NULL;
15381 }
15382 if (entry->is_sub_map) {
15383 basic->shared = FALSE;
15384 } else {
15385 basic->shared = entry->is_shared;
15386 }
15387
15388 vm_map_unlock_read(map);
15389 return KERN_SUCCESS;
15390 }
15391
15392 case VM_REGION_BASIC_INFO_64:
15393 {
15394 vm_region_basic_info_64_t basic;
15395
15396 if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15397 return KERN_INVALID_ARGUMENT;
15398 }
15399
15400 basic = (vm_region_basic_info_64_t) info;
15401 *count = VM_REGION_BASIC_INFO_COUNT_64;
15402
15403 vm_map_lock_read(map);
15404
15405 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15406 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15407 vm_map_unlock_read(map);
15408 return KERN_INVALID_ADDRESS;
15409 }
15410 } else {
15411 entry = tmp_entry;
15412 }
15413
15414 start = entry->vme_start;
15415
15416 basic->offset = VME_OFFSET(entry);
15417 basic->protection = entry->protection;
15418 basic->inheritance = entry->inheritance;
15419 basic->max_protection = entry->max_protection;
15420 basic->behavior = entry->behavior;
15421 basic->user_wired_count = entry->user_wired_count;
15422 basic->reserved = entry->is_sub_map;
15423
15424 *address_u = vm_sanitize_wrap_addr(start);
15425 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15426
15427 if (object_name) {
15428 *object_name = IP_NULL;
15429 }
15430 if (entry->is_sub_map) {
15431 basic->shared = FALSE;
15432 } else {
15433 basic->shared = entry->is_shared;
15434 }
15435
15436 vm_map_unlock_read(map);
15437 return KERN_SUCCESS;
15438 }
15439 case VM_REGION_EXTENDED_INFO:
15440 if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15441 return KERN_INVALID_ARGUMENT;
15442 }
15443 OS_FALLTHROUGH;
15444 case VM_REGION_EXTENDED_INFO__legacy:
15445 {
15446 vm_region_extended_info_t extended;
15447 mach_msg_type_number_t original_count;
15448 int effective_page_size, effective_page_shift;
15449
15450 if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15451 return KERN_INVALID_ARGUMENT;
15452 }
15453
15454 extended = (vm_region_extended_info_t) info;
15455
15456 effective_page_shift = vm_self_region_page_shift(map);
15457 effective_page_size = (1 << effective_page_shift);
15458
15459 vm_map_lock_read(map);
15460
15461 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15462 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15463 vm_map_unlock_read(map);
15464 return KERN_INVALID_ADDRESS;
15465 }
15466 } else {
15467 entry = tmp_entry;
15468 }
15469 start = entry->vme_start;
15470
15471 extended->protection = entry->protection;
15472 extended->user_tag = VME_ALIAS(entry);
15473 extended->pages_resident = 0;
15474 extended->pages_swapped_out = 0;
15475 extended->pages_shared_now_private = 0;
15476 extended->pages_dirtied = 0;
15477 extended->external_pager = 0;
15478 extended->shadow_depth = 0;
15479
15480 original_count = *count;
15481 if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15482 *count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15483 } else {
15484 extended->pages_reusable = 0;
15485 *count = VM_REGION_EXTENDED_INFO_COUNT;
15486 }
15487
15488 vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15489
15490 if (object_name) {
15491 *object_name = IP_NULL;
15492 }
15493
15494 *address_u = vm_sanitize_wrap_addr(start);
15495 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15496
15497 vm_map_unlock_read(map);
15498 return KERN_SUCCESS;
15499 }
15500 case VM_REGION_TOP_INFO:
15501 {
15502 vm_region_top_info_t top;
15503
15504 if (*count < VM_REGION_TOP_INFO_COUNT) {
15505 return KERN_INVALID_ARGUMENT;
15506 }
15507
15508 top = (vm_region_top_info_t) info;
15509 *count = VM_REGION_TOP_INFO_COUNT;
15510
15511 vm_map_lock_read(map);
15512
15513 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15514 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15515 vm_map_unlock_read(map);
15516 return KERN_INVALID_ADDRESS;
15517 }
15518 } else {
15519 entry = tmp_entry;
15520 }
15521 start = entry->vme_start;
15522
15523 top->private_pages_resident = 0;
15524 top->shared_pages_resident = 0;
15525
15526 vm_map_region_top_walk(entry, top);
15527
15528 if (object_name) {
15529 *object_name = IP_NULL;
15530 }
15531
15532 *address_u = vm_sanitize_wrap_addr(start);
15533 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15534
15535 vm_map_unlock_read(map);
15536 return KERN_SUCCESS;
15537 }
15538 default:
15539 return KERN_INVALID_ARGUMENT;
15540 }
15541 }
15542
15543 #define OBJ_RESIDENT_COUNT(obj, entry_size) \
15544 MIN((entry_size), \
15545 ((obj)->all_reusable ? \
15546 (obj)->wired_page_count : \
15547 (obj)->resident_page_count - (obj)->reusable_page_count))
15548
15549 void
vm_map_region_top_walk(vm_map_entry_t entry,vm_region_top_info_t top)15550 vm_map_region_top_walk(
15551 vm_map_entry_t entry,
15552 vm_region_top_info_t top)
15553 {
15554 if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15555 top->share_mode = SM_EMPTY;
15556 top->ref_count = 0;
15557 top->obj_id = 0;
15558 return;
15559 }
15560
15561 {
15562 struct vm_object *obj, *tmp_obj;
15563 int ref_count;
15564 uint32_t entry_size;
15565
15566 entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15567
15568 obj = VME_OBJECT(entry);
15569
15570 vm_object_lock(obj);
15571
15572 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15573 obj->paging_in_progress) {
15574 ref_count--;
15575 }
15576
15577 assert(obj->reusable_page_count <= obj->resident_page_count);
15578 if (obj->shadow) {
15579 if (ref_count == 1) {
15580 top->private_pages_resident =
15581 OBJ_RESIDENT_COUNT(obj, entry_size);
15582 } else {
15583 top->shared_pages_resident =
15584 OBJ_RESIDENT_COUNT(obj, entry_size);
15585 }
15586 top->ref_count = ref_count;
15587 top->share_mode = SM_COW;
15588
15589 while ((tmp_obj = obj->shadow)) {
15590 vm_object_lock(tmp_obj);
15591 vm_object_unlock(obj);
15592 obj = tmp_obj;
15593
15594 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15595 obj->paging_in_progress) {
15596 ref_count--;
15597 }
15598
15599 assert(obj->reusable_page_count <= obj->resident_page_count);
15600 top->shared_pages_resident +=
15601 OBJ_RESIDENT_COUNT(obj, entry_size);
15602 top->ref_count += ref_count - 1;
15603 }
15604 } else {
15605 if (entry->superpage_size) {
15606 top->share_mode = SM_LARGE_PAGE;
15607 top->shared_pages_resident = 0;
15608 top->private_pages_resident = entry_size;
15609 } else if (entry->needs_copy) {
15610 top->share_mode = SM_COW;
15611 top->shared_pages_resident =
15612 OBJ_RESIDENT_COUNT(obj, entry_size);
15613 } else {
15614 if (ref_count == 1 ||
15615 (ref_count == 2 && obj->named)) {
15616 top->share_mode = SM_PRIVATE;
15617 top->private_pages_resident =
15618 OBJ_RESIDENT_COUNT(obj,
15619 entry_size);
15620 } else {
15621 top->share_mode = SM_SHARED;
15622 top->shared_pages_resident =
15623 OBJ_RESIDENT_COUNT(obj,
15624 entry_size);
15625 }
15626 }
15627 top->ref_count = ref_count;
15628 }
15629
15630 vm_object_unlock(obj);
15631
15632 /* XXX K64: obj_id will be truncated */
15633 top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15634 }
15635 }
15636
15637 void
vm_map_region_walk(vm_map_t map,vm_map_offset_t va,vm_map_entry_t entry,vm_object_offset_t offset,vm_object_size_t range,vm_region_extended_info_t extended,boolean_t look_for_pages,mach_msg_type_number_t count)15638 vm_map_region_walk(
15639 vm_map_t map,
15640 vm_map_offset_t va,
15641 vm_map_entry_t entry,
15642 vm_object_offset_t offset,
15643 vm_object_size_t range,
15644 vm_region_extended_info_t extended,
15645 boolean_t look_for_pages,
15646 mach_msg_type_number_t count)
15647 {
15648 struct vm_object *obj, *tmp_obj;
15649 vm_map_offset_t last_offset;
15650 int i;
15651 int ref_count;
15652 struct vm_object *shadow_object;
15653 unsigned short shadow_depth;
15654 boolean_t do_region_footprint;
15655 int effective_page_size, effective_page_shift;
15656 vm_map_offset_t effective_page_mask;
15657
15658 do_region_footprint = task_self_region_footprint();
15659
15660 if ((entry->is_sub_map) ||
15661 (VME_OBJECT(entry) == 0) ||
15662 (VME_OBJECT(entry)->phys_contiguous &&
15663 !entry->superpage_size)) {
15664 extended->share_mode = SM_EMPTY;
15665 extended->ref_count = 0;
15666 return;
15667 }
15668
15669 if (entry->superpage_size) {
15670 extended->shadow_depth = 0;
15671 extended->share_mode = SM_LARGE_PAGE;
15672 extended->ref_count = 1;
15673 extended->external_pager = 0;
15674
15675 /* TODO4K: Superpage in 4k mode? */
15676 extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15677 extended->shadow_depth = 0;
15678 return;
15679 }
15680
15681 effective_page_shift = vm_self_region_page_shift(map);
15682 effective_page_size = (1 << effective_page_shift);
15683 effective_page_mask = effective_page_size - 1;
15684
15685 offset = vm_map_trunc_page(offset, effective_page_mask);
15686
15687 obj = VME_OBJECT(entry);
15688
15689 vm_object_lock(obj);
15690
15691 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15692 obj->paging_in_progress) {
15693 ref_count--;
15694 }
15695
15696 if (look_for_pages) {
15697 for (last_offset = offset + range;
15698 offset < last_offset;
15699 offset += effective_page_size, va += effective_page_size) {
15700 if (do_region_footprint) {
15701 int disp;
15702
15703 disp = 0;
15704 if (map->has_corpse_footprint) {
15705 /*
15706 * Query the page info data we saved
15707 * while forking the corpse.
15708 */
15709 vm_map_corpse_footprint_query_page_info(
15710 map,
15711 va,
15712 &disp);
15713 } else {
15714 /*
15715 * Query the pmap.
15716 */
15717 vm_map_footprint_query_page_info(
15718 map,
15719 entry,
15720 va,
15721 &disp);
15722 }
15723 if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15724 extended->pages_resident++;
15725 }
15726 if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15727 extended->pages_reusable++;
15728 }
15729 if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15730 extended->pages_dirtied++;
15731 }
15732 if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15733 extended->pages_swapped_out++;
15734 }
15735 continue;
15736 }
15737
15738 vm_map_region_look_for_page(map, va, obj,
15739 vm_object_trunc_page(offset), ref_count,
15740 0, extended, count);
15741 }
15742
15743 if (do_region_footprint) {
15744 goto collect_object_info;
15745 }
15746 } else {
15747 collect_object_info:
15748 shadow_object = obj->shadow;
15749 shadow_depth = 0;
15750
15751 if (!(obj->internal)) {
15752 extended->external_pager = 1;
15753 }
15754
15755 if (shadow_object != VM_OBJECT_NULL) {
15756 vm_object_lock(shadow_object);
15757 for (;
15758 shadow_object != VM_OBJECT_NULL;
15759 shadow_depth++) {
15760 vm_object_t next_shadow;
15761
15762 if (!(shadow_object->internal)) {
15763 extended->external_pager = 1;
15764 }
15765
15766 next_shadow = shadow_object->shadow;
15767 if (next_shadow) {
15768 vm_object_lock(next_shadow);
15769 }
15770 vm_object_unlock(shadow_object);
15771 shadow_object = next_shadow;
15772 }
15773 }
15774 extended->shadow_depth = shadow_depth;
15775 }
15776
15777 if (extended->shadow_depth || entry->needs_copy) {
15778 extended->share_mode = SM_COW;
15779 } else {
15780 if (ref_count == 1) {
15781 extended->share_mode = SM_PRIVATE;
15782 } else {
15783 if (obj->true_share) {
15784 extended->share_mode = SM_TRUESHARED;
15785 } else {
15786 extended->share_mode = SM_SHARED;
15787 }
15788 }
15789 }
15790 extended->ref_count = ref_count - extended->shadow_depth;
15791
15792 for (i = 0; i < extended->shadow_depth; i++) {
15793 if ((tmp_obj = obj->shadow) == 0) {
15794 break;
15795 }
15796 vm_object_lock(tmp_obj);
15797 vm_object_unlock(obj);
15798
15799 if ((ref_count = os_ref_get_count_raw(&tmp_obj->ref_count)) > 1 &&
15800 tmp_obj->paging_in_progress) {
15801 ref_count--;
15802 }
15803
15804 extended->ref_count += ref_count;
15805 obj = tmp_obj;
15806 }
15807 vm_object_unlock(obj);
15808
15809 if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15810 extended->share_mode = SM_PRIVATE;
15811 } else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) {
15812 vm_map_entry_t cur;
15813 vm_map_entry_t last;
15814 int my_refs;
15815
15816 obj = VME_OBJECT(entry);
15817 last = vm_map_to_entry(map);
15818 my_refs = 0;
15819
15820 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15821 obj->paging_in_progress) {
15822 ref_count--;
15823 }
15824 for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15825 if (vm_map_region_has_obj_ref(cur, obj)) {
15826 my_refs++;
15827 }
15828 }
15829
15830 if (my_refs == ref_count) {
15831 extended->share_mode = SM_PRIVATE_ALIASED;
15832 } else if (my_refs > 1) {
15833 extended->share_mode = SM_SHARED_ALIASED;
15834 }
15835 }
15836 }
15837
15838
15839 /* object is locked on entry and locked on return */
15840
15841
15842 static void
vm_map_region_look_for_page(__unused vm_map_t map,__unused vm_map_offset_t va,vm_object_t object,vm_object_offset_t offset,int max_refcnt,unsigned short depth,vm_region_extended_info_t extended,mach_msg_type_number_t count)15843 vm_map_region_look_for_page(
15844 __unused vm_map_t map,
15845 __unused vm_map_offset_t va,
15846 vm_object_t object,
15847 vm_object_offset_t offset,
15848 int max_refcnt,
15849 unsigned short depth,
15850 vm_region_extended_info_t extended,
15851 mach_msg_type_number_t count)
15852 {
15853 vm_page_t p;
15854 vm_object_t shadow;
15855 int ref_count;
15856 vm_object_t caller_object;
15857
15858 shadow = object->shadow;
15859 caller_object = object;
15860
15861
15862 while (TRUE) {
15863 if (!(object->internal)) {
15864 extended->external_pager = 1;
15865 }
15866
15867 if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
15868 if (shadow && (max_refcnt == 1)) {
15869 extended->pages_shared_now_private++;
15870 }
15871
15872 if (!p->vmp_fictitious &&
15873 (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
15874 extended->pages_dirtied++;
15875 } else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
15876 if (p->vmp_reusable || object->all_reusable) {
15877 extended->pages_reusable++;
15878 }
15879 }
15880
15881 extended->pages_resident++;
15882
15883 if (object != caller_object) {
15884 vm_object_unlock(object);
15885 }
15886
15887 return;
15888 }
15889 if (object->internal &&
15890 object->alive &&
15891 !object->terminating &&
15892 object->pager_ready) {
15893 if (vm_object_compressor_pager_state_get(object, offset)
15894 == VM_EXTERNAL_STATE_EXISTS) {
15895 /* the pager has that page */
15896 extended->pages_swapped_out++;
15897 if (object != caller_object) {
15898 vm_object_unlock(object);
15899 }
15900 return;
15901 }
15902 }
15903
15904 if (shadow) {
15905 vm_object_lock(shadow);
15906 if ((ref_count = os_ref_get_count_raw(&shadow->ref_count)) > 1 &&
15907 shadow->paging_in_progress) {
15908 ref_count--;
15909 }
15910
15911 if (++depth > extended->shadow_depth) {
15912 extended->shadow_depth = depth;
15913 }
15914
15915 if (ref_count > max_refcnt) {
15916 max_refcnt = ref_count;
15917 }
15918
15919 if (object != caller_object) {
15920 vm_object_unlock(object);
15921 }
15922
15923 offset = offset + object->vo_shadow_offset;
15924 object = shadow;
15925 shadow = object->shadow;
15926 continue;
15927 }
15928 if (object != caller_object) {
15929 vm_object_unlock(object);
15930 }
15931 break;
15932 }
15933 }
15934
15935 static inline boolean_t
vm_map_region_has_obj_ref(vm_map_entry_t entry,vm_object_t object)15936 vm_map_region_has_obj_ref(
15937 vm_map_entry_t entry,
15938 vm_object_t object)
15939 {
15940 vm_object_t cur_obj;
15941 vm_object_t shadow_obj;
15942
15943 if (entry->is_sub_map) {
15944 return FALSE;
15945 }
15946
15947 cur_obj = VME_OBJECT(entry);
15948 if (cur_obj == VM_OBJECT_NULL) {
15949 return FALSE;
15950 } else if (cur_obj == object) {
15951 return TRUE;
15952 }
15953
15954 /*
15955 * Avoid locks for first shadow check, otherwise diagnostic tools will
15956 * spend most of their time obtaining locks in this function when analyzing
15957 * processes with many VM entries which may commonly have no shadow chain.
15958 *
15959 * This is acceptable because:
15960 * - Shadow's fields are not accessed outside of its lock
15961 * - Objects are unlikely to be modified due to:
15962 * - Many diagnostic tools suspend the task
15963 * - VM map is locked
15964 * - The rare incorrect return from this function turns a guess into a
15965 * slightly worse guess
15966 * - Entire shadow chain is not locked as a whole, so can still change
15967 * while traversing, resulting in incorrect guess even with locking
15968 */
15969 shadow_obj = cur_obj->shadow;
15970 if (shadow_obj == VM_OBJECT_NULL) {
15971 return FALSE;
15972 } else if (shadow_obj == object) {
15973 return TRUE;
15974 }
15975
15976 vm_object_lock(cur_obj);
15977
15978 while ((shadow_obj = cur_obj->shadow)) {
15979 /* check if object was found before grabbing a lock */
15980 if (shadow_obj == object) {
15981 vm_object_unlock(cur_obj);
15982 return TRUE;
15983 }
15984
15985 vm_object_lock(shadow_obj);
15986 vm_object_unlock(cur_obj);
15987 cur_obj = shadow_obj;
15988 }
15989
15990 /* exhausted the shadow chain */
15991 vm_object_unlock(cur_obj);
15992 return FALSE;
15993 }
15994
15995
15996 /*
15997 * Routine: vm_map_simplify
15998 *
15999 * Description:
16000 * Attempt to simplify the map representation in
16001 * the vicinity of the given starting address.
16002 * Note:
16003 * This routine is intended primarily to keep the
16004 * kernel maps more compact -- they generally don't
16005 * benefit from the "expand a map entry" technology
16006 * at allocation time because the adjacent entry
16007 * is often wired down.
16008 */
16009 void
vm_map_simplify_entry(vm_map_t map,vm_map_entry_t this_entry)16010 vm_map_simplify_entry(
16011 vm_map_t map,
16012 vm_map_entry_t this_entry)
16013 {
16014 vm_map_entry_t prev_entry;
16015
16016 prev_entry = this_entry->vme_prev;
16017
16018 if ((this_entry != vm_map_to_entry(map)) &&
16019 (prev_entry != vm_map_to_entry(map)) &&
16020
16021 (prev_entry->vme_end == this_entry->vme_start) &&
16022
16023 (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16024 (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16025 (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16026 ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16027 prev_entry->vme_start))
16028 == VME_OFFSET(this_entry)) &&
16029
16030 (prev_entry->behavior == this_entry->behavior) &&
16031 (prev_entry->needs_copy == this_entry->needs_copy) &&
16032 (prev_entry->protection == this_entry->protection) &&
16033 (prev_entry->max_protection == this_entry->max_protection) &&
16034 (prev_entry->inheritance == this_entry->inheritance) &&
16035 (prev_entry->use_pmap == this_entry->use_pmap) &&
16036 (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16037 (prev_entry->no_cache == this_entry->no_cache) &&
16038 (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16039 (prev_entry->map_aligned == this_entry->map_aligned) &&
16040 (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16041 (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16042 #if __arm64e__
16043 (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16044 #endif
16045 (prev_entry->csm_associated == this_entry->csm_associated) &&
16046 (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16047 (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16048 (prev_entry->vme_resilient_codesign ==
16049 this_entry->vme_resilient_codesign) &&
16050 (prev_entry->vme_resilient_media ==
16051 this_entry->vme_resilient_media) &&
16052 (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16053 (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16054
16055 (prev_entry->wired_count == this_entry->wired_count) &&
16056 (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16057
16058 ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16059 (prev_entry->in_transition == FALSE) &&
16060 (this_entry->in_transition == FALSE) &&
16061 (prev_entry->needs_wakeup == FALSE) &&
16062 (this_entry->needs_wakeup == FALSE) &&
16063 (prev_entry->is_shared == this_entry->is_shared) &&
16064 (prev_entry->superpage_size == FALSE) &&
16065 (this_entry->superpage_size == FALSE)
16066 ) {
16067 if (prev_entry->vme_permanent) {
16068 assert(this_entry->vme_permanent);
16069 prev_entry->vme_permanent = false;
16070 }
16071 vm_map_store_entry_unlink(map, prev_entry, true);
16072 assert(prev_entry->vme_start < this_entry->vme_end);
16073 if (prev_entry->map_aligned) {
16074 assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16075 VM_MAP_PAGE_MASK(map)));
16076 }
16077 this_entry->vme_start = prev_entry->vme_start;
16078 VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16079
16080 if (map->holelistenabled) {
16081 vm_map_store_update_first_free(map, this_entry, TRUE);
16082 }
16083
16084 if (prev_entry->is_sub_map) {
16085 vm_map_deallocate(VME_SUBMAP(prev_entry));
16086 } else {
16087 vm_object_deallocate(VME_OBJECT(prev_entry));
16088 }
16089 vm_map_entry_dispose(prev_entry);
16090 SAVE_HINT_MAP_WRITE(map, this_entry);
16091 }
16092 }
16093
16094 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)16095 vm_map_simplify(
16096 vm_map_t map,
16097 vm_map_offset_t start)
16098 {
16099 vm_map_entry_t this_entry;
16100
16101 vm_map_lock(map);
16102 if (vm_map_lookup_entry(map, start, &this_entry)) {
16103 vm_map_simplify_entry(map, this_entry);
16104 vm_map_simplify_entry(map, this_entry->vme_next);
16105 }
16106 vm_map_unlock(map);
16107 }
16108
16109 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16110 vm_map_simplify_range(
16111 vm_map_t map,
16112 vm_map_offset_t start,
16113 vm_map_offset_t end)
16114 {
16115 vm_map_entry_t entry;
16116
16117 /*
16118 * The map should be locked (for "write") by the caller.
16119 */
16120
16121 if (start >= end) {
16122 /* invalid address range */
16123 return;
16124 }
16125
16126 start = vm_map_trunc_page(start,
16127 VM_MAP_PAGE_MASK(map));
16128 end = vm_map_round_page(end,
16129 VM_MAP_PAGE_MASK(map));
16130
16131 if (!vm_map_lookup_entry(map, start, &entry)) {
16132 /* "start" is not mapped and "entry" ends before "start" */
16133 if (entry == vm_map_to_entry(map)) {
16134 /* start with first entry in the map */
16135 entry = vm_map_first_entry(map);
16136 } else {
16137 /* start with next entry */
16138 entry = entry->vme_next;
16139 }
16140 }
16141
16142 while (entry != vm_map_to_entry(map) &&
16143 entry->vme_start <= end) {
16144 /* try and coalesce "entry" with its previous entry */
16145 vm_map_simplify_entry(map, entry);
16146 entry = entry->vme_next;
16147 }
16148 }
16149
16150 static __attribute__((always_inline, warn_unused_result))
16151 kern_return_t
vm_map_machine_attribute_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,mach_vm_offset_t * start,mach_vm_offset_t * end,vm_map_size_t * size)16152 vm_map_machine_attribute_sanitize(
16153 vm_map_t map,
16154 vm_map_offset_ut start_u,
16155 vm_map_offset_ut end_u,
16156 mach_vm_offset_t *start,
16157 mach_vm_offset_t *end,
16158 vm_map_size_t *size)
16159 {
16160 return vm_sanitize_addr_end(start_u, end_u,
16161 VM_SANITIZE_CALLER_VM_MAP_MACHINE_ATTRIBUTE, map,
16162 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
16163 size);
16164 }
16165
16166
16167 /*
16168 * Routine: vm_map_machine_attribute
16169 * Purpose:
16170 * Provide machine-specific attributes to mappings,
16171 * such as cachability etc. for machines that provide
16172 * them. NUMA architectures and machines with big/strange
16173 * caches will use this.
16174 * Note:
16175 * Responsibilities for locking and checking are handled here,
16176 * everything else in the pmap module. If any non-volatile
16177 * information must be kept, the pmap module should handle
16178 * it itself. [This assumes that attributes do not
16179 * need to be inherited, which seems ok to me]
16180 */
16181 kern_return_t
vm_map_machine_attribute(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_machine_attribute_t attribute,vm_machine_attribute_val_t * value)16182 vm_map_machine_attribute(
16183 vm_map_t map,
16184 vm_map_offset_ut start_u,
16185 vm_map_offset_ut end_u,
16186 vm_machine_attribute_t attribute,
16187 vm_machine_attribute_val_t *value) /* IN/OUT */
16188 {
16189 mach_vm_offset_t start, end;
16190 vm_map_size_t sync_size;
16191 kern_return_t ret;
16192 vm_map_entry_t entry;
16193
16194 ret = vm_map_machine_attribute_sanitize(map,
16195 start_u,
16196 end_u,
16197 &start,
16198 &end,
16199 &sync_size);
16200 if (__improbable(ret != KERN_SUCCESS)) {
16201 return vm_sanitize_get_kr(ret);
16202 }
16203
16204 if (start < vm_map_min(map) || end > vm_map_max(map)) {
16205 return KERN_INVALID_ADDRESS;
16206 }
16207
16208 vm_map_lock(map);
16209
16210 if (attribute != MATTR_CACHE) {
16211 /* If we don't have to find physical addresses, we */
16212 /* don't have to do an explicit traversal here. */
16213 ret = pmap_attribute(map->pmap, start, end - start,
16214 attribute, value);
16215 vm_map_unlock(map);
16216 return ret;
16217 }
16218
16219 ret = KERN_SUCCESS; /* Assume it all worked */
16220
16221 while (sync_size) {
16222 if (vm_map_lookup_entry(map, start, &entry)) {
16223 vm_map_size_t sub_size;
16224 if ((entry->vme_end - start) > sync_size) {
16225 sub_size = sync_size;
16226 sync_size = 0;
16227 } else {
16228 sub_size = entry->vme_end - start;
16229 sync_size -= sub_size;
16230 }
16231 if (entry->is_sub_map) {
16232 vm_map_offset_t sub_start;
16233 vm_map_offset_t sub_end;
16234
16235 sub_start = (start - entry->vme_start)
16236 + VME_OFFSET(entry);
16237 sub_end = sub_start + sub_size;
16238 vm_map_machine_attribute(
16239 VME_SUBMAP(entry),
16240 sub_start,
16241 sub_end,
16242 attribute, value);
16243 } else if (VME_OBJECT(entry)) {
16244 vm_page_t m;
16245 vm_object_t object;
16246 vm_object_t base_object;
16247 vm_object_t last_object;
16248 vm_object_offset_t offset;
16249 vm_object_offset_t base_offset;
16250 vm_map_size_t range;
16251 range = sub_size;
16252 offset = (start - entry->vme_start)
16253 + VME_OFFSET(entry);
16254 offset = vm_object_trunc_page(offset);
16255 base_offset = offset;
16256 object = VME_OBJECT(entry);
16257 base_object = object;
16258 last_object = NULL;
16259
16260 vm_object_lock(object);
16261
16262 while (range) {
16263 m = vm_page_lookup(
16264 object, offset);
16265
16266 if (m && !m->vmp_fictitious) {
16267 ret =
16268 pmap_attribute_cache_sync(
16269 VM_PAGE_GET_PHYS_PAGE(m),
16270 PAGE_SIZE,
16271 attribute, value);
16272 } else if (object->shadow) {
16273 offset = offset + object->vo_shadow_offset;
16274 last_object = object;
16275 object = object->shadow;
16276 vm_object_lock(last_object->shadow);
16277 vm_object_unlock(last_object);
16278 continue;
16279 }
16280 if (range < PAGE_SIZE) {
16281 range = 0;
16282 } else {
16283 range -= PAGE_SIZE;
16284 }
16285
16286 if (base_object != object) {
16287 vm_object_unlock(object);
16288 vm_object_lock(base_object);
16289 object = base_object;
16290 }
16291 /* Bump to the next page */
16292 base_offset += PAGE_SIZE;
16293 offset = base_offset;
16294 }
16295 vm_object_unlock(object);
16296 }
16297 start += sub_size;
16298 } else {
16299 vm_map_unlock(map);
16300 return KERN_FAILURE;
16301 }
16302 }
16303
16304 vm_map_unlock(map);
16305
16306 return ret;
16307 }
16308
16309 /*
16310 * vm_map_behavior_set:
16311 *
16312 * Sets the paging reference behavior of the specified address
16313 * range in the target map. Paging reference behavior affects
16314 * how pagein operations resulting from faults on the map will be
16315 * clustered.
16316 */
16317 kern_return_t
vm_map_behavior_set(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_behavior_t new_behavior)16318 vm_map_behavior_set(
16319 vm_map_t map,
16320 vm_map_offset_t start,
16321 vm_map_offset_t end,
16322 vm_behavior_t new_behavior)
16323 {
16324 vm_map_entry_t entry;
16325 vm_map_entry_t temp_entry;
16326
16327 if (start > end ||
16328 start < vm_map_min(map) ||
16329 end > vm_map_max(map)) {
16330 return KERN_NO_SPACE;
16331 }
16332 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16333 return KERN_INVALID_ADDRESS;
16334 }
16335
16336 switch (new_behavior) {
16337 /*
16338 * This first block of behaviors all set a persistent state on the specified
16339 * memory range. All we have to do here is to record the desired behavior
16340 * in the vm_map_entry_t's.
16341 */
16342
16343 case VM_BEHAVIOR_DEFAULT:
16344 case VM_BEHAVIOR_RANDOM:
16345 case VM_BEHAVIOR_SEQUENTIAL:
16346 case VM_BEHAVIOR_RSEQNTL:
16347 case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16348 vm_map_lock(map);
16349
16350 /*
16351 * The entire address range must be valid for the map.
16352 * Note that vm_map_range_check() does a
16353 * vm_map_lookup_entry() internally and returns the
16354 * entry containing the start of the address range if
16355 * the entire range is valid.
16356 */
16357 if (vm_map_range_check(map, start, end, &temp_entry)) {
16358 entry = temp_entry;
16359 vm_map_clip_start(map, entry, start);
16360 } else {
16361 vm_map_unlock(map);
16362 return KERN_INVALID_ADDRESS;
16363 }
16364
16365 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16366 vm_map_clip_end(map, entry, end);
16367 if (entry->is_sub_map) {
16368 assert(!entry->use_pmap);
16369 }
16370
16371 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16372 entry->zero_wired_pages = TRUE;
16373 } else {
16374 entry->behavior = new_behavior;
16375 }
16376 entry = entry->vme_next;
16377 }
16378
16379 vm_map_unlock(map);
16380 break;
16381
16382 /*
16383 * The rest of these are different from the above in that they cause
16384 * an immediate action to take place as opposed to setting a behavior that
16385 * affects future actions.
16386 */
16387
16388 case VM_BEHAVIOR_WILLNEED:
16389 return vm_map_willneed(map, start, end);
16390
16391 case VM_BEHAVIOR_DONTNEED:
16392 return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16393
16394 case VM_BEHAVIOR_FREE:
16395 return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16396
16397 case VM_BEHAVIOR_REUSABLE:
16398 return vm_map_reusable_pages(map, start, end);
16399
16400 case VM_BEHAVIOR_REUSE:
16401 return vm_map_reuse_pages(map, start, end);
16402
16403 case VM_BEHAVIOR_CAN_REUSE:
16404 return vm_map_can_reuse(map, start, end);
16405
16406 #if MACH_ASSERT
16407 case VM_BEHAVIOR_PAGEOUT:
16408 return vm_map_pageout(map, start, end);
16409 #endif /* MACH_ASSERT */
16410
16411 case VM_BEHAVIOR_ZERO:
16412 return vm_map_zero(map, start, end);
16413
16414 default:
16415 return KERN_INVALID_ARGUMENT;
16416 }
16417
16418 return KERN_SUCCESS;
16419 }
16420
16421
16422 /*
16423 * Internals for madvise(MADV_WILLNEED) system call.
16424 *
16425 * The implementation is to do:-
16426 * a) read-ahead if the mapping corresponds to a mapped regular file
16427 * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
16428 */
16429
16430
16431 static kern_return_t
vm_map_willneed(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16432 vm_map_willneed(
16433 vm_map_t map,
16434 vm_map_offset_t start,
16435 vm_map_offset_t end
16436 )
16437 {
16438 vm_map_entry_t entry;
16439 vm_object_t object;
16440 memory_object_t pager;
16441 struct vm_object_fault_info fault_info = {};
16442 kern_return_t kr;
16443 vm_object_size_t len;
16444 vm_object_offset_t offset;
16445
16446 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START,
16447 task_pid(current_task()), start, end);
16448 fault_info.interruptible = THREAD_UNINT; /* ignored value */
16449 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
16450 fault_info.stealth = TRUE;
16451
16452 /*
16453 * The MADV_WILLNEED operation doesn't require any changes to the
16454 * vm_map_entry_t's, so the read lock is sufficient.
16455 */
16456
16457 vm_map_lock_read(map);
16458
16459 /*
16460 * The madvise semantics require that the address range be fully
16461 * allocated with no holes. Otherwise, we're required to return
16462 * an error.
16463 */
16464
16465 if (!vm_map_range_check(map, start, end, &entry)) {
16466 vm_map_unlock_read(map);
16467 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16468 task_pid(current_task()), start, KERN_INVALID_ADDRESS);
16469 return KERN_INVALID_ADDRESS;
16470 }
16471
16472 /*
16473 * Examine each vm_map_entry_t in the range.
16474 */
16475 for (; entry != vm_map_to_entry(map) && start < end;) {
16476 /*
16477 * The first time through, the start address could be anywhere
16478 * within the vm_map_entry we found. So adjust the offset to
16479 * correspond. After that, the offset will always be zero to
16480 * correspond to the beginning of the current vm_map_entry.
16481 */
16482 offset = (start - entry->vme_start) + VME_OFFSET(entry);
16483
16484 /*
16485 * Set the length so we don't go beyond the end of the
16486 * map_entry or beyond the end of the range we were given.
16487 * This range could span also multiple map entries all of which
16488 * map different files, so make sure we only do the right amount
16489 * of I/O for each object. Note that it's possible for there
16490 * to be multiple map entries all referring to the same object
16491 * but with different page permissions, but it's not worth
16492 * trying to optimize that case.
16493 */
16494 len = MIN(entry->vme_end - start, end - start);
16495
16496 if ((vm_size_t) len != len) {
16497 /* 32-bit overflow */
16498 len = (vm_size_t) (0 - PAGE_SIZE);
16499 }
16500 fault_info.cluster_size = (vm_size_t) len;
16501 fault_info.lo_offset = offset;
16502 fault_info.hi_offset = offset + len;
16503 fault_info.user_tag = VME_ALIAS(entry);
16504 fault_info.pmap_options = 0;
16505 if (entry->iokit_acct ||
16506 (!entry->is_sub_map && !entry->use_pmap)) {
16507 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
16508 }
16509 fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;
16510
16511 /*
16512 * If the entry is a submap OR there's no read permission
16513 * to this mapping, then just skip it.
16514 */
16515 if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
16516 entry = entry->vme_next;
16517 start = entry->vme_start;
16518 continue;
16519 }
16520
16521 object = VME_OBJECT(entry);
16522
16523 if (object == NULL ||
16524 (object && object->internal)) {
16525 /*
16526 * Memory range backed by anonymous memory.
16527 */
16528 vm_size_t region_size = 0, effective_page_size = 0;
16529 vm_map_offset_t addr = 0, effective_page_mask = 0;
16530
16531 region_size = len;
16532 addr = start;
16533
16534 effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
16535 effective_page_size = effective_page_mask + 1;
16536
16537 vm_map_unlock_read(map);
16538
16539 while (region_size) {
16540 vm_pre_fault(
16541 vm_map_trunc_page(addr, effective_page_mask),
16542 VM_PROT_READ | VM_PROT_WRITE);
16543
16544 region_size -= effective_page_size;
16545 addr += effective_page_size;
16546 }
16547 } else {
16548 /*
16549 * Find the file object backing this map entry. If there is
16550 * none, then we simply ignore the "will need" advice for this
16551 * entry and go on to the next one.
16552 */
16553 if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
16554 entry = entry->vme_next;
16555 start = entry->vme_start;
16556 continue;
16557 }
16558
16559 vm_object_paging_begin(object);
16560 pager = object->pager;
16561 vm_object_unlock(object);
16562
16563 /*
16564 * The data_request() could take a long time, so let's
16565 * release the map lock to avoid blocking other threads.
16566 */
16567 vm_map_unlock_read(map);
16568
16569 /*
16570 * Get the data from the object asynchronously.
16571 *
16572 * Note that memory_object_data_request() places limits on the
16573 * amount of I/O it will do. Regardless of the len we
16574 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
16575 * silently truncates the len to that size. This isn't
16576 * necessarily bad since madvise shouldn't really be used to
16577 * page in unlimited amounts of data. Other Unix variants
16578 * limit the willneed case as well. If this turns out to be an
16579 * issue for developers, then we can always adjust the policy
16580 * here and still be backwards compatible since this is all
16581 * just "advice".
16582 */
16583 kr = memory_object_data_request(
16584 pager,
16585 vm_object_trunc_page(offset) + object->paging_offset,
16586 0, /* ignored */
16587 VM_PROT_READ,
16588 (memory_object_fault_info_t)&fault_info);
16589
16590 vm_object_lock(object);
16591 vm_object_paging_end(object);
16592 vm_object_unlock(object);
16593
16594 /*
16595 * If we couldn't do the I/O for some reason, just give up on
16596 * the madvise. We still return success to the user since
16597 * madvise isn't supposed to fail when the advice can't be
16598 * taken.
16599 */
16600
16601 if (kr != KERN_SUCCESS) {
16602 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16603 task_pid(current_task()), start, kr);
16604 return KERN_SUCCESS;
16605 }
16606 }
16607
16608 start += len;
16609 if (start >= end) {
16610 /* done */
16611 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16612 task_pid(current_task()), start, KERN_SUCCESS);
16613 return KERN_SUCCESS;
16614 }
16615
16616 /* look up next entry */
16617 vm_map_lock_read(map);
16618 if (!vm_map_lookup_entry(map, start, &entry)) {
16619 /*
16620 * There's a new hole in the address range.
16621 */
16622 vm_map_unlock_read(map);
16623 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16624 task_pid(current_task()), start, KERN_INVALID_ADDRESS);
16625 return KERN_INVALID_ADDRESS;
16626 }
16627 }
16628
16629 vm_map_unlock_read(map);
16630 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16631 task_pid(current_task()), start, KERN_SUCCESS);
16632 return KERN_SUCCESS;
16633 }
16634
16635 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16636 vm_map_entry_is_reusable(
16637 vm_map_entry_t entry)
16638 {
16639 /* Only user map entries */
16640
16641 vm_object_t object;
16642
16643 if (entry->is_sub_map) {
16644 return FALSE;
16645 }
16646
16647 switch (VME_ALIAS(entry)) {
16648 case VM_MEMORY_MALLOC:
16649 case VM_MEMORY_MALLOC_SMALL:
16650 case VM_MEMORY_MALLOC_LARGE:
16651 case VM_MEMORY_REALLOC:
16652 case VM_MEMORY_MALLOC_TINY:
16653 case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16654 case VM_MEMORY_MALLOC_LARGE_REUSED:
16655 /*
16656 * This is a malloc() memory region: check if it's still
16657 * in its original state and can be re-used for more
16658 * malloc() allocations.
16659 */
16660 break;
16661 default:
16662 /*
16663 * Not a malloc() memory region: let the caller decide if
16664 * it's re-usable.
16665 */
16666 return TRUE;
16667 }
16668
16669 if (/*entry->is_shared ||*/
16670 entry->is_sub_map ||
16671 entry->in_transition ||
16672 entry->protection != VM_PROT_DEFAULT ||
16673 entry->max_protection != VM_PROT_ALL ||
16674 entry->inheritance != VM_INHERIT_DEFAULT ||
16675 entry->no_cache ||
16676 entry->vme_permanent ||
16677 entry->superpage_size != FALSE ||
16678 entry->zero_wired_pages ||
16679 entry->wired_count != 0 ||
16680 entry->user_wired_count != 0) {
16681 return FALSE;
16682 }
16683
16684 object = VME_OBJECT(entry);
16685 if (object == VM_OBJECT_NULL) {
16686 return TRUE;
16687 }
16688 if (
16689 #if 0
16690 /*
16691 * Let's proceed even if the VM object is potentially
16692 * shared.
16693 * We check for this later when processing the actual
16694 * VM pages, so the contents will be safe if shared.
16695 *
16696 * But we can still mark this memory region as "reusable" to
16697 * acknowledge that the caller did let us know that the memory
16698 * could be re-used and should not be penalized for holding
16699 * on to it. This allows its "resident size" to not include
16700 * the reusable range.
16701 */
16702 object->ref_count == 1 &&
16703 #endif
16704 object->vo_copy == VM_OBJECT_NULL &&
16705 object->shadow == VM_OBJECT_NULL &&
16706 object->internal &&
16707 object->purgable == VM_PURGABLE_DENY &&
16708 object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16709 !object->code_signed) {
16710 return TRUE;
16711 }
16712 return FALSE;
16713 }
16714
/*
 * Routine:	vm_map_reuse_pages
 *
 * Description: Backs the MADV_REUSE advice: walks every map entry
 *	overlapping [start, end) and, for each backing VM object, marks
 *	the corresponding pages as re-used (no longer "reusable") via
 *	vm_object_reuse_pages().  The range must be fully mapped with no
 *	holes and every entry must pass vm_map_entry_is_reusable(),
 *	otherwise KERN_INVALID_ADDRESS is returned and the failure
 *	counter is bumped.
 */
static kern_return_t
vm_map_reuse_pages(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_object_t object;
	vm_object_offset_t start_offset, end_offset;

	/*
	 * The MADV_REUSE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a
		 * portion of a native page.
		 */
		return KERN_SUCCESS;
	}

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap); /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes. Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reuse_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reuse_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found. So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
		} else {
			start_offset = 0;
		}
		/* clip the span to this entry, then rebase both bounds
		 * into the backing object's offset space */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object != VM_OBJECT_NULL) {
			vm_object_lock(object);
			vm_object_reuse_pages(object, start_offset, end_offset,
			    TRUE);
			vm_object_unlock(object);
		}

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reuse_pages_success++;
	return KERN_SUCCESS;
}
16807
16808
/*
 * Routine:	vm_map_reusable_pages
 *
 * Description: Backs the MADV_REUSABLE advice: walks every map entry
 *	overlapping [start, end) and deactivates the resident pages of
 *	each backing object (vm_object_deactivate_pages() with
 *	"reusable_pages" set), so their contents may be reclaimed.
 *	Requires the range to be fully mapped, every entry to pass
 *	vm_map_entry_is_reusable(), and every entry to be writable
 *	(or JIT/TPRO); otherwise returns KERN_INVALID_ADDRESS or
 *	KERN_PROTECTION_FAILURE respectively.
 */
static kern_return_t
vm_map_reusable_pages(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_object_t object;
	vm_object_offset_t start_offset, end_offset;
	vm_map_offset_t pmap_offset;

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a portion
		 * of a native page.
		 */
		return KERN_SUCCESS;
	}

	/*
	 * The MADV_REUSABLE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap); /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes. Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reusable_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/* 1 -> deactivate and allow discard; -1 -> skip (shared) */
		int kill_pages = 0;
		boolean_t reusable_no_write = FALSE;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
#if __arm64e__
		    && !entry->used_for_tpro
#endif
		    ) {
			/* not writable: can't discard contents */
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_nonwritable++;
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found. So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
			pmap_offset = start;
		} else {
			start_offset = 0;
			pmap_offset = entry->vme_start;
		}
		/* clip to this entry and rebase into the object's offsets */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing resident to deactivate */
			continue;
		}

		if (entry->protection & VM_PROT_EXECUTE) {
			/*
			 * Executable mappings might be write-protected by
			 * hardware, so do not attempt to write to these pages.
			 */
			reusable_no_write = TRUE;
		}

		if (entry->vme_xnu_user_debug) {
			/*
			 * User debug pages might be write-protected by hardware,
			 * so do not attempt to write to these pages.
			 */
			reusable_no_write = TRUE;
		}

		vm_object_lock(object);
		if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
		    object->vo_copy == VM_OBJECT_NULL)) &&
		    object->shadow == VM_OBJECT_NULL &&
		    /*
		     * "iokit_acct" entries are billed for their virtual size
		     * (rather than for their resident pages only), so they
		     * wouldn't benefit from making pages reusable, and it
		     * would be hard to keep track of pages that are both
		     * "iokit_acct" and "reusable" in the pmap stats and
		     * ledgers.
		     */
		    !(entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap))) {
			if (os_ref_get_count_raw(&object->ref_count) != 1) {
				vm_page_stats_reusable.reusable_shared++;
			}
			kill_pages = 1;
		} else {
			kill_pages = -1;
		}
		if (kill_pages != -1) {
			vm_object_deactivate_pages(object,
			    start_offset,
			    end_offset - start_offset,
			    kill_pages,
			    TRUE /*reusable_pages*/,
			    reusable_no_write,
			    map->pmap,
			    pmap_offset);
		} else {
			/* object shared through a copy/shadow chain: only
			 * account for it, don't touch the pages */
			vm_page_stats_reusable.reusable_pages_shared++;
			DTRACE_VM4(vm_map_reusable_pages_shared,
			    unsigned int, VME_ALIAS(entry),
			    vm_map_t, map,
			    vm_map_entry_t, entry,
			    vm_object_t, object);
		}
		vm_object_unlock(object);

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reusable_pages_success++;
	return KERN_SUCCESS;
}
16974
16975
16976 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16977 vm_map_can_reuse(
16978 vm_map_t map,
16979 vm_map_offset_t start,
16980 vm_map_offset_t end)
16981 {
16982 vm_map_entry_t entry;
16983
16984 /*
16985 * The MADV_REUSABLE operation doesn't require any changes to the
16986 * vm_map_entry_t's, so the read lock is sufficient.
16987 */
16988
16989 vm_map_lock_read(map);
16990 assert(map->pmap != kernel_pmap); /* protect alias access */
16991
16992 /*
16993 * The madvise semantics require that the address range be fully
16994 * allocated with no holes. Otherwise, we're required to return
16995 * an error.
16996 */
16997
16998 if (!vm_map_range_check(map, start, end, &entry)) {
16999 vm_map_unlock_read(map);
17000 vm_page_stats_reusable.can_reuse_failure++;
17001 return KERN_INVALID_ADDRESS;
17002 }
17003
17004 /*
17005 * Examine each vm_map_entry_t in the range.
17006 */
17007 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17008 entry = entry->vme_next) {
17009 /*
17010 * Sanity check on the VM map entry.
17011 */
17012 if (!vm_map_entry_is_reusable(entry)) {
17013 vm_map_unlock_read(map);
17014 vm_page_stats_reusable.can_reuse_failure++;
17015 return KERN_INVALID_ADDRESS;
17016 }
17017 }
17018
17019 vm_map_unlock_read(map);
17020 vm_page_stats_reusable.can_reuse_success++;
17021 return KERN_SUCCESS;
17022 }
17023
17024
17025 #if MACH_ASSERT
/*
 * Routine:	vm_map_pageout
 *
 * Description: Debug-only (MACH_ASSERT) support for MADV_PAGEOUT:
 *	walks the entries overlapping [start, end) and asks for the
 *	pages of each internal backing object to be evicted via
 *	vm_object_pageout().  Non-internal (e.g. file-backed) objects
 *	and null objects are skipped.  The range must be fully mapped
 *	with no holes.
 */
static kern_return_t
vm_map_pageout(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t entry;

	/*
	 * The MADV_PAGEOUT operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes. Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		vm_object_t object;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (entry->is_sub_map) {
			/*
			 * Descend one level into the submap and page out
			 * the object backing the submap range.
			 *
			 * NOTE(review): only the first entry returned by
			 * vm_map_range_check() on the submap is paged out,
			 * not every entry in [submap_start, submap_end) —
			 * TODO confirm this is intentional (debug-only path).
			 */
			vm_map_t submap;
			vm_map_offset_t submap_start;
			vm_map_offset_t submap_end;
			vm_map_entry_t submap_entry;

			submap = VME_SUBMAP(entry);
			submap_start = VME_OFFSET(entry);
			submap_end = submap_start + (entry->vme_end -
			    entry->vme_start);

			vm_map_lock_read(submap);

			if (!vm_map_range_check(submap,
			    submap_start,
			    submap_end,
			    &submap_entry)) {
				vm_map_unlock_read(submap);
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}

			if (submap_entry->is_sub_map) {
				/* nested submap: don't recurse further */
				vm_map_unlock_read(submap);
				continue;
			}

			object = VME_OBJECT(submap_entry);
			if (object == VM_OBJECT_NULL || !object->internal) {
				/* not anonymous memory: nothing to page out */
				vm_map_unlock_read(submap);
				continue;
			}

			vm_object_pageout(object);

			vm_map_unlock_read(submap);
			submap = VM_MAP_NULL;
			submap_entry = VM_MAP_ENTRY_NULL;
			continue;
		}

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL || !object->internal) {
			/* not anonymous memory: nothing to page out */
			continue;
		}

		vm_object_pageout(object);
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
17114 #endif /* MACH_ASSERT */
17115
17116 /*
17117 * This function determines if the zero operation can be run on the
17118 * respective entry. Additional checks on the object are in
17119 * vm_object_zero_preflight.
17120 */
17121 static kern_return_t
vm_map_zero_entry_preflight(vm_map_entry_t entry)17122 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17123 {
17124 /*
17125 * Zeroing is restricted to writable non-executable entries and non-JIT
17126 * regions.
17127 */
17128 if (!(entry->protection & VM_PROT_WRITE) ||
17129 (entry->protection & VM_PROT_EXECUTE) ||
17130 entry->used_for_jit ||
17131 entry->vme_xnu_user_debug) {
17132 return KERN_PROTECTION_FAILURE;
17133 }
17134
17135 /*
17136 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17137 * allowed for submaps.
17138 */
17139 if (entry->needs_copy || entry->is_sub_map) {
17140 return KERN_NO_ACCESS;
17141 }
17142
17143 return KERN_SUCCESS;
17144 }
17145
17146 /*
17147 * This function translates entry's start and end to offsets in the object
17148 */
17149 static void
vm_map_get_bounds_in_object(vm_map_entry_t entry,vm_map_offset_t start,vm_map_offset_t end,vm_map_offset_t * start_offset,vm_map_offset_t * end_offset)17150 vm_map_get_bounds_in_object(
17151 vm_map_entry_t entry,
17152 vm_map_offset_t start,
17153 vm_map_offset_t end,
17154 vm_map_offset_t *start_offset,
17155 vm_map_offset_t *end_offset)
17156 {
17157 if (entry->vme_start < start) {
17158 *start_offset = start - entry->vme_start;
17159 } else {
17160 *start_offset = 0;
17161 }
17162 *end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17163 *start_offset += VME_OFFSET(entry);
17164 *end_offset += VME_OFFSET(entry);
17165 }
17166
/*
 * This function iterates through the entries in the requested range
 * and zeroes any resident pages in the corresponding objects. Compressed
 * pages are dropped instead of being faulted in and zeroed.
 *
 * Returns KERN_SUCCESS when the whole range was processed,
 * KERN_INVALID_ADDRESS if the range has (or develops) a hole,
 * KERN_NO_ACCESS / KERN_PROTECTION_FAILURE from the preflight checks,
 * or any error from vm_object_zero().
 */
static kern_return_t
vm_map_zero(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_map_offset_t cur = start;	/* next unprocessed address */
	kern_return_t ret;

	/*
	 * This operation isn't supported where the map page size is less than
	 * the hardware page size. Caller will need to handle error and
	 * explicitly zero memory if needed.
	 */
	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		return KERN_NO_ACCESS;
	}

	/*
	 * The MADV_ZERO operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */
	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap); /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes. Otherwise, we're required to return
	 * an error. This check needs to be redone if the map has changed.
	 */
	if (!vm_map_range_check(map, cur, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
		vm_map_offset_t cur_offset;
		vm_map_offset_t end_offset;
		/* snapshot to detect map changes across the unlocked
		 * vm_object_zero() call below */
		unsigned int last_timestamp = map->timestamp;
		vm_object_t object = VME_OBJECT(entry);

		ret = vm_map_zero_entry_preflight(entry);
		if (ret != KERN_SUCCESS) {
			vm_map_unlock_read(map);
			return ret;
		}

		if (object == VM_OBJECT_NULL) {
			/* no backing object: nothing resident to zero */
			entry = entry->vme_next;
			continue;
		}

		vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
		vm_object_lock(object);
		/*
		 * Take a reference on the object as vm_object_zero will drop the object
		 * lock when it encounters a busy page.
		 */
		vm_object_reference_locked(object);
		vm_map_unlock_read(map);

		ret = vm_object_zero(object, cur_offset, end_offset);
		vm_object_unlock(object);
		vm_object_deallocate(object);
		if (ret != KERN_SUCCESS) {
			return ret;
		}
		/*
		 * Update cur as vm_object_zero has succeeded.
		 */
		cur += (end_offset - cur_offset);
		if (cur == end) {
			return KERN_SUCCESS;
		}

		/*
		 * If the map timestamp has changed, restart by relooking up cur in the
		 * map
		 */
		vm_map_lock_read(map);
		if (last_timestamp != map->timestamp) {
			/*
			 * Relookup cur in the map
			 */
			if (!vm_map_range_check(map, cur, end, &entry)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
			continue;
		}
		/*
		 * If the map hasn't changed proceed with the next entry
		 */
		entry = entry->vme_next;
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
17275
17276
/*
 * Routine:	vm_map_entry_insert
 *
 * Description: This routine inserts a new vm_entry in a locked map.
 *
 *	The new entry covers [start, end) and is linked after
 *	"insp_entry".  "object" is either a VM object or, when
 *	vmk_flags.vmkf_submap is set, a vm_map_t being inserted as a
 *	submap.  Protections, inheritance and the various vmk_flags
 *	are copied onto the new entry.  The map must be locked
 *	exclusively by the caller.
 *
 * Returns:	the newly created and linked vm_map_entry_t.
 */
static vm_map_entry_t
vm_map_entry_insert(
	vm_map_t map,
	vm_map_entry_t insp_entry,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_object_t object,
	vm_object_offset_t offset,
	vm_map_kernel_flags_t vmk_flags,
	boolean_t needs_copy,
	vm_prot_t cur_protection,
	vm_prot_t max_protection,
	vm_inherit_t inheritance,
	boolean_t clear_map_aligned)
{
	vm_map_entry_t new_entry;
	boolean_t map_aligned = FALSE;

	assert(insp_entry != (vm_map_entry_t)0);
	vm_map_lock_assert_exclusive(map);

	/* size + offset must not overflow the object offset space */
	__assert_only vm_object_offset_t end_offset = 0;
	assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);

	/*
	 * Decide whether the entry can be marked "map_aligned":
	 * only relevant when the map's page size differs from the
	 * kernel page size, and only if both bounds actually are
	 * map-page aligned (unless the caller asked to clear it).
	 */
	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		map_aligned = TRUE;
	}
	if (clear_map_aligned &&
	    (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
	    !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
		map_aligned = FALSE;
	}
	if (map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
		assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
	} else {
		assert(page_aligned(start));
		assert(page_aligned(end));
	}
	assert(start < end);

	new_entry = vm_map_entry_create(map);

	new_entry->vme_start = start;
	new_entry->vme_end = end;

	if (vmk_flags.vmkf_submap) {
		/* "object" actually carries the submap in this case */
		new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
		VME_SUBMAP_SET(new_entry, (vm_map_t)object);
	} else {
		VME_OBJECT_SET(new_entry, object, false, 0);
	}
	VME_OFFSET_SET(new_entry, offset);
	VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);

	new_entry->map_aligned = map_aligned;
	new_entry->needs_copy = needs_copy;
	new_entry->inheritance = inheritance;
	new_entry->protection = cur_protection;
	new_entry->max_protection = max_protection;
	/*
	 * submap: "use_pmap" means "nested".
	 * default: false.
	 *
	 * object: "use_pmap" means "use pmap accounting" for footprint.
	 * default: true.
	 */
	new_entry->use_pmap = !vmk_flags.vmkf_submap;
	new_entry->no_cache = vmk_flags.vmf_no_cache;
	new_entry->vme_permanent = vmk_flags.vmf_permanent;
	new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
	new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
	new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);

	if (vmk_flags.vmkf_map_jit) {
		/* only one JIT entry per map unless policy allows more */
		if (!(map->jit_entry_exists) ||
		    VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
			new_entry->used_for_jit = TRUE;
			map->jit_entry_exists = TRUE;
		}
	}

	/*
	 * Insert the new entry into the list.
	 */

	vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
	map->size += end - start;

	/*
	 * Update the free space hint and the lookup hint.
	 */

	SAVE_HINT_MAP_WRITE(map, new_entry);
	return new_entry;
}
17378
17379 /*
17380 * Routine: vm_map_remap_extract
17381 *
17382 * Description: This routine returns a vm_entry list from a map.
17383 */
17384 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,vm_map_copy_t map_copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)17385 vm_map_remap_extract(
17386 vm_map_t map,
17387 vm_map_offset_t addr,
17388 vm_map_size_t size,
17389 boolean_t copy,
17390 vm_map_copy_t map_copy,
17391 vm_prot_t *cur_protection, /* IN/OUT */
17392 vm_prot_t *max_protection, /* IN/OUT */
17393 /* What, no behavior? */
17394 vm_inherit_t inheritance,
17395 vm_map_kernel_flags_t vmk_flags)
17396 {
17397 struct vm_map_header *map_header = &map_copy->cpy_hdr;
17398 kern_return_t result;
17399 vm_map_size_t mapped_size;
17400 vm_map_size_t tmp_size;
17401 vm_map_entry_t src_entry; /* result of last map lookup */
17402 vm_map_entry_t new_entry;
17403 vm_object_offset_t offset;
17404 vm_map_offset_t map_address;
17405 vm_map_offset_t src_start; /* start of entry to map */
17406 vm_map_offset_t src_end; /* end of region to be mapped */
17407 vm_object_t object;
17408 vm_map_version_t version;
17409 boolean_t src_needs_copy;
17410 boolean_t new_entry_needs_copy;
17411 vm_map_entry_t saved_src_entry;
17412 boolean_t src_entry_was_wired;
17413 vm_prot_t max_prot_for_prot_copy;
17414 vm_map_offset_t effective_page_mask;
17415 bool pageable, same_map;
17416 boolean_t vm_remap_legacy;
17417 vm_prot_t required_cur_prot, required_max_prot;
17418 vm_object_t new_copy_object; /* vm_object_copy_* result */
17419 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
17420
17421 pageable = vmk_flags.vmkf_copy_pageable;
17422 same_map = vmk_flags.vmkf_copy_same_map;
17423
17424 effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17425
17426 assert(map != VM_MAP_NULL);
17427 assert(size != 0);
17428 assert(size == vm_map_round_page(size, effective_page_mask));
17429 assert(inheritance == VM_INHERIT_NONE ||
17430 inheritance == VM_INHERIT_COPY ||
17431 inheritance == VM_INHERIT_SHARE);
17432 assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17433 assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17434 assert((*cur_protection & *max_protection) == *cur_protection);
17435
17436 /*
17437 * Compute start and end of region.
17438 */
17439 src_start = vm_map_trunc_page(addr, effective_page_mask);
17440 src_end = vm_map_round_page(src_start + size, effective_page_mask);
17441
17442 /*
17443 * Initialize map_header.
17444 */
17445 map_header->nentries = 0;
17446 map_header->entries_pageable = pageable;
17447 // map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17448 map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17449 map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17450 vm_map_store_init(map_header);
17451
17452 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17453 /*
17454 * Special case for vm_map_protect(VM_PROT_COPY):
17455 * we want to set the new mappings' max protection to the
17456 * specified *max_protection...
17457 */
17458 max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17459 /* ... but we want to use the vm_remap() legacy mode */
17460 vmk_flags.vmkf_remap_legacy_mode = true;
17461 *max_protection = VM_PROT_NONE;
17462 *cur_protection = VM_PROT_NONE;
17463 } else {
17464 max_prot_for_prot_copy = VM_PROT_NONE;
17465 }
17466
17467 if (vmk_flags.vmkf_remap_legacy_mode) {
17468 /*
17469 * vm_remap() legacy mode:
17470 * Extract all memory regions in the specified range and
17471 * collect the strictest set of protections allowed on the
17472 * entire range, so the caller knows what they can do with
17473 * the remapped range.
17474 * We start with VM_PROT_ALL and we'll remove the protections
17475 * missing from each memory region.
17476 */
17477 vm_remap_legacy = TRUE;
17478 *cur_protection = VM_PROT_ALL;
17479 *max_protection = VM_PROT_ALL;
17480 required_cur_prot = VM_PROT_NONE;
17481 required_max_prot = VM_PROT_NONE;
17482 } else {
17483 /*
17484 * vm_remap_new() mode:
17485 * Extract all memory regions in the specified range and
17486 * ensure that they have at least the protections specified
17487 * by the caller via *cur_protection and *max_protection.
17488 * The resulting mapping should have these protections.
17489 */
17490 vm_remap_legacy = FALSE;
17491 if (copy) {
17492 required_cur_prot = VM_PROT_NONE;
17493 required_max_prot = VM_PROT_READ;
17494 } else {
17495 required_cur_prot = *cur_protection;
17496 required_max_prot = *max_protection;
17497 }
17498 }
17499
17500 map_address = 0;
17501 mapped_size = 0;
17502 result = KERN_SUCCESS;
17503
17504 /*
17505 * The specified source virtual space might correspond to
17506 * multiple map entries, need to loop on them.
17507 */
17508 vm_map_lock(map);
17509
17510 if (map->pmap == kernel_pmap) {
17511 map_copy->is_kernel_range = true;
17512 map_copy->orig_range = kmem_addr_get_range(addr, size);
17513 #if CONFIG_MAP_RANGES
17514 } else if (map->uses_user_ranges) {
17515 map_copy->is_user_range = true;
17516 map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17517 #endif /* CONFIG_MAP_RANGES */
17518 }
17519
17520 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17521 /*
17522 * This address space uses sub-pages so the range might
17523 * not be re-mappable in an address space with larger
17524 * pages. Re-assemble any broken-up VM map entries to
17525 * improve our chances of making it work.
17526 */
17527 vm_map_simplify_range(map, src_start, src_end);
17528 }
17529 while (mapped_size != size) {
17530 vm_map_size_t entry_size;
17531
17532 /*
17533 * Find the beginning of the region.
17534 */
17535 if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17536 result = KERN_INVALID_ADDRESS;
17537 break;
17538 }
17539
17540 if (src_start < src_entry->vme_start ||
17541 (mapped_size && src_start != src_entry->vme_start)) {
17542 result = KERN_INVALID_ADDRESS;
17543 break;
17544 }
17545
17546 tmp_size = size - mapped_size;
17547 if (src_end > src_entry->vme_end) {
17548 tmp_size -= (src_end - src_entry->vme_end);
17549 }
17550
17551 entry_size = (vm_map_size_t)(src_entry->vme_end -
17552 src_entry->vme_start);
17553
17554 if (src_entry->is_sub_map &&
17555 vmk_flags.vmkf_copy_single_object) {
17556 vm_map_t submap;
17557 vm_map_offset_t submap_start;
17558 vm_map_size_t submap_size;
17559 boolean_t submap_needs_copy;
17560
17561 /*
17562 * No check for "required protection" on "src_entry"
17563 * because the protections that matter are the ones
17564 * on the submap's VM map entry, which will be checked
17565 * during the call to vm_map_remap_extract() below.
17566 */
17567 object = VM_OBJECT_NULL;
17568
17569 submap_size = src_entry->vme_end - src_start;
17570 if (submap_size > size) {
17571 submap_size = size;
17572 }
17573 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17574 submap = VME_SUBMAP(src_entry);
17575 if (copy) {
17576 /*
17577 * The caller wants a copy-on-write re-mapping,
17578 * so let's extract from the submap accordingly.
17579 */
17580 submap_needs_copy = TRUE;
17581 } else if (src_entry->needs_copy) {
17582 /*
17583 * The caller wants a shared re-mapping but the
17584 * submap is mapped with "needs_copy", so its
17585 * contents can't be shared as is. Extract the
17586 * contents of the submap as "copy-on-write".
17587 * The re-mapping won't be shared with the
17588 * original mapping but this is equivalent to
17589 * what happened with the original "remap from
17590 * submap" code.
17591 * The shared region is mapped "needs_copy", for
17592 * example.
17593 */
17594 submap_needs_copy = TRUE;
17595 } else {
17596 /*
17597 * The caller wants a shared re-mapping and
17598 * this mapping can be shared (no "needs_copy"),
17599 * so let's extract from the submap accordingly.
17600 * Kernel submaps are mapped without
17601 * "needs_copy", for example.
17602 */
17603 submap_needs_copy = FALSE;
17604 }
17605 vm_map_reference(submap);
17606 vm_map_unlock(map);
17607 src_entry = NULL;
17608 if (vm_remap_legacy) {
17609 *cur_protection = VM_PROT_NONE;
17610 *max_protection = VM_PROT_NONE;
17611 }
17612
17613 DTRACE_VM7(remap_submap_recurse,
17614 vm_map_t, map,
17615 vm_map_offset_t, addr,
17616 vm_map_size_t, size,
17617 boolean_t, copy,
17618 vm_map_offset_t, submap_start,
17619 vm_map_size_t, submap_size,
17620 boolean_t, submap_needs_copy);
17621
17622 result = vm_map_remap_extract(submap,
17623 submap_start,
17624 submap_size,
17625 submap_needs_copy,
17626 map_copy,
17627 cur_protection,
17628 max_protection,
17629 inheritance,
17630 vmk_flags);
17631 vm_map_deallocate(submap);
17632
17633 if (result == KERN_SUCCESS &&
17634 submap_needs_copy &&
17635 !copy) {
17636 /*
17637 * We were asked for a "shared"
17638 * re-mapping but had to ask for a
17639 * "copy-on-write" remapping of the
17640 * submap's mapping to honor the
17641 * submap's "needs_copy".
17642 * We now need to resolve that
17643 * pending "copy-on-write" to
17644 * get something we can share.
17645 */
17646 vm_map_entry_t copy_entry;
17647 vm_object_offset_t copy_offset;
17648 vm_map_size_t copy_size;
17649 vm_object_t copy_object;
17650 copy_entry = vm_map_copy_first_entry(map_copy);
17651 copy_size = copy_entry->vme_end - copy_entry->vme_start;
17652 copy_object = VME_OBJECT(copy_entry);
17653 copy_offset = VME_OFFSET(copy_entry);
17654 if (copy_object == VM_OBJECT_NULL) {
17655 assert(copy_offset == 0);
17656 assert(!copy_entry->needs_copy);
17657 if (copy_entry->max_protection == VM_PROT_NONE) {
17658 assert(copy_entry->protection == VM_PROT_NONE);
17659 /* nothing to share */
17660 } else {
17661 assert(copy_offset == 0);
17662 copy_object = vm_object_allocate(copy_size);
17663 VME_OFFSET_SET(copy_entry, 0);
17664 VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17665 assert(copy_entry->use_pmap);
17666 }
17667 } else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17668 /* already shareable */
17669 assert(!copy_entry->needs_copy);
17670 } else if (copy_entry->needs_copy ||
17671 copy_object->shadowed ||
17672 (copy_object->internal &&
17673 !copy_object->true_share &&
17674 !copy_entry->is_shared &&
17675 copy_object->vo_size > copy_size)) {
17676 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17677 assert(copy_entry->use_pmap);
17678 if (copy_entry->needs_copy) {
17679 /* already write-protected */
17680 } else {
17681 vm_prot_t prot;
17682 prot = copy_entry->protection & ~VM_PROT_WRITE;
17683 vm_object_pmap_protect(copy_object,
17684 copy_offset,
17685 copy_size,
17686 PMAP_NULL,
17687 PAGE_SIZE,
17688 0,
17689 prot);
17690 }
17691 copy_entry->needs_copy = FALSE;
17692 }
17693 copy_object = VME_OBJECT(copy_entry);
17694 copy_offset = VME_OFFSET(copy_entry);
17695 if (copy_object &&
17696 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17697 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17698 copy_object->true_share = TRUE;
17699 }
17700 }
17701
17702 return result;
17703 }
17704
17705 if (src_entry->is_sub_map) {
17706 /* protections for submap mapping are irrelevant here */
17707 } else if (((src_entry->protection & required_cur_prot) !=
17708 required_cur_prot) ||
17709 ((src_entry->max_protection & required_max_prot) !=
17710 required_max_prot)) {
17711 if (vmk_flags.vmkf_copy_single_object &&
17712 mapped_size != 0) {
17713 /*
17714 * Single object extraction.
17715 * We can't extract more with the required
17716 * protection but we've extracted some, so
17717 * stop there and declare success.
17718 * The caller should check the size of
17719 * the copy entry we've extracted.
17720 */
17721 result = KERN_SUCCESS;
17722 } else {
				/*
				 * VM range extraction.
				 * Required protection is not available
				 * for this part of the range: fail.
				 */
17728 result = KERN_PROTECTION_FAILURE;
17729 }
17730 break;
17731 }
17732
17733 if (src_entry->is_sub_map) {
17734 vm_map_t submap;
17735 vm_map_offset_t submap_start;
17736 vm_map_size_t submap_size;
17737 vm_map_copy_t submap_copy;
17738 vm_prot_t submap_curprot, submap_maxprot;
17739 boolean_t submap_needs_copy;
17740
17741 /*
17742 * No check for "required protection" on "src_entry"
17743 * because the protections that matter are the ones
17744 * on the submap's VM map entry, which will be checked
17745 * during the call to vm_map_copy_extract() below.
17746 */
17747 object = VM_OBJECT_NULL;
17748 submap_copy = VM_MAP_COPY_NULL;
17749
17750 /* find equivalent range in the submap */
17751 submap = VME_SUBMAP(src_entry);
17752 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17753 submap_size = tmp_size;
17754 if (copy) {
17755 /*
17756 * The caller wants a copy-on-write re-mapping,
17757 * so let's extract from the submap accordingly.
17758 */
17759 submap_needs_copy = TRUE;
17760 } else if (src_entry->needs_copy) {
17761 /*
17762 * The caller wants a shared re-mapping but the
17763 * submap is mapped with "needs_copy", so its
17764 * contents can't be shared as is. Extract the
17765 * contents of the submap as "copy-on-write".
17766 * The re-mapping won't be shared with the
17767 * original mapping but this is equivalent to
17768 * what happened with the original "remap from
17769 * submap" code.
17770 * The shared region is mapped "needs_copy", for
17771 * example.
17772 */
17773 submap_needs_copy = TRUE;
17774 } else {
17775 /*
17776 * The caller wants a shared re-mapping and
17777 * this mapping can be shared (no "needs_copy"),
17778 * so let's extract from the submap accordingly.
17779 * Kernel submaps are mapped without
17780 * "needs_copy", for example.
17781 */
17782 submap_needs_copy = FALSE;
17783 }
17784 /* extra ref to keep submap alive */
17785 vm_map_reference(submap);
17786
17787 DTRACE_VM7(remap_submap_recurse,
17788 vm_map_t, map,
17789 vm_map_offset_t, addr,
17790 vm_map_size_t, size,
17791 boolean_t, copy,
17792 vm_map_offset_t, submap_start,
17793 vm_map_size_t, submap_size,
17794 boolean_t, submap_needs_copy);
17795
17796 /*
17797 * The map can be safely unlocked since we
17798 * already hold a reference on the submap.
17799 *
17800 * No timestamp since we don't care if the map
17801 * gets modified while we're down in the submap.
17802 * We'll resume the extraction at src_start + tmp_size
17803 * anyway.
17804 */
17805 vm_map_unlock(map);
17806 src_entry = NULL; /* not valid once map is unlocked */
17807
17808 if (vm_remap_legacy) {
17809 submap_curprot = VM_PROT_NONE;
17810 submap_maxprot = VM_PROT_NONE;
17811 if (max_prot_for_prot_copy) {
17812 submap_maxprot = max_prot_for_prot_copy;
17813 }
17814 } else {
17815 assert(!max_prot_for_prot_copy);
17816 submap_curprot = *cur_protection;
17817 submap_maxprot = *max_protection;
17818 }
17819 result = vm_map_copy_extract(submap,
17820 submap_start,
17821 submap_size,
17822 submap_needs_copy,
17823 &submap_copy,
17824 &submap_curprot,
17825 &submap_maxprot,
17826 inheritance,
17827 vmk_flags);
17828
17829 /* release extra ref on submap */
17830 vm_map_deallocate(submap);
17831 submap = VM_MAP_NULL;
17832
17833 if (result != KERN_SUCCESS) {
17834 vm_map_lock(map);
17835 break;
17836 }
17837
17838 /* transfer submap_copy entries to map_header */
17839 while (vm_map_copy_first_entry(submap_copy) !=
17840 vm_map_copy_to_entry(submap_copy)) {
17841 vm_map_entry_t copy_entry;
17842 vm_map_size_t copy_entry_size;
17843
17844 copy_entry = vm_map_copy_first_entry(submap_copy);
17845
17846 /*
17847 * Prevent kernel_object from being exposed to
17848 * user space.
17849 */
17850 if (__improbable(copy_entry->vme_kernel_object)) {
17851 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17852 proc_selfpid(),
17853 (get_bsdtask_info(current_task())
17854 ? proc_name_address(get_bsdtask_info(current_task()))
17855 : "?"));
17856 DTRACE_VM(extract_kernel_only);
17857 result = KERN_INVALID_RIGHT;
17858 vm_map_copy_discard(submap_copy);
17859 submap_copy = VM_MAP_COPY_NULL;
17860 vm_map_lock(map);
17861 break;
17862 }
17863
17864 vm_map_copy_entry_unlink(submap_copy, copy_entry);
17865 copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17866 copy_entry->vme_start = map_address;
17867 copy_entry->vme_end = map_address + copy_entry_size;
17868 map_address += copy_entry_size;
17869 mapped_size += copy_entry_size;
17870 src_start += copy_entry_size;
17871 assert(src_start <= src_end);
17872 _vm_map_store_entry_link(map_header,
17873 map_header->links.prev,
17874 copy_entry);
17875 }
17876 /* done with submap_copy */
17877 vm_map_copy_discard(submap_copy);
17878
17879 if (vm_remap_legacy) {
17880 *cur_protection &= submap_curprot;
17881 *max_protection &= submap_maxprot;
17882 }
17883
17884 /* re-acquire the map lock and continue to next entry */
17885 vm_map_lock(map);
17886 continue;
17887 } else {
17888 object = VME_OBJECT(src_entry);
17889
17890 /*
17891 * Prevent kernel_object from being exposed to
17892 * user space.
17893 */
17894 if (__improbable(is_kernel_object(object))) {
17895 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17896 proc_selfpid(),
17897 (get_bsdtask_info(current_task())
17898 ? proc_name_address(get_bsdtask_info(current_task()))
17899 : "?"));
17900 DTRACE_VM(extract_kernel_only);
17901 result = KERN_INVALID_RIGHT;
17902 break;
17903 }
17904
17905 if (src_entry->iokit_acct) {
17906 /*
17907 * This entry uses "IOKit accounting".
17908 */
17909 } else if (object != VM_OBJECT_NULL &&
17910 object->internal &&
17911 (object->purgable != VM_PURGABLE_DENY ||
17912 object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17913 /*
17914 * Purgeable objects have their own accounting:
17915 * no pmap accounting for them.
17916 */
17917 assertf(!src_entry->use_pmap,
17918 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17919 map,
17920 src_entry,
17921 (uint64_t)src_entry->vme_start,
17922 (uint64_t)src_entry->vme_end,
17923 src_entry->protection,
17924 src_entry->max_protection,
17925 VME_ALIAS(src_entry));
17926 } else {
17927 /*
17928 * Not IOKit or purgeable:
17929 * must be accounted by pmap stats.
17930 */
17931 assertf(src_entry->use_pmap,
17932 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17933 map,
17934 src_entry,
17935 (uint64_t)src_entry->vme_start,
17936 (uint64_t)src_entry->vme_end,
17937 src_entry->protection,
17938 src_entry->max_protection,
17939 VME_ALIAS(src_entry));
17940 }
17941
17942 if (object == VM_OBJECT_NULL) {
17943 assert(!src_entry->needs_copy);
17944 if (src_entry->max_protection == VM_PROT_NONE) {
17945 assert(src_entry->protection == VM_PROT_NONE);
17946 /*
17947 * No VM object and no permissions:
17948 * this must be a reserved range with
17949 * nothing to share or copy.
17950 * There could also be all sorts of
17951 * pmap shenanigans within that reserved
17952 * range, so let's just copy the map
17953 * entry as is to remap a similar
17954 * reserved range.
17955 */
17956 offset = 0; /* no object => no offset */
17957 goto copy_src_entry;
17958 }
17959 object = vm_object_allocate(entry_size);
17960 VME_OFFSET_SET(src_entry, 0);
17961 VME_OBJECT_SET(src_entry, object, false, 0);
17962 assert(src_entry->use_pmap);
17963 assert(!map->mapped_in_other_pmaps);
17964 } else if (src_entry->wired_count ||
17965 object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17966 /*
17967 * A wired memory region should not have
17968 * any pending copy-on-write and needs to
17969 * keep pointing at the VM object that
17970 * contains the wired pages.
17971 * If we're sharing this memory (copy=false),
17972 * we'll share this VM object.
17973 * If we're copying this memory (copy=true),
17974 * we'll call vm_object_copy_slowly() below
17975 * and use the new VM object for the remapping.
17976 *
17977 * Or, we are already using an asymmetric
17978 * copy, and therefore we already have
17979 * the right object.
17980 */
17981 assert(!src_entry->needs_copy);
17982 } else if (src_entry->needs_copy || object->shadowed ||
17983 (object->internal && !object->true_share &&
17984 !src_entry->is_shared &&
17985 object->vo_size > entry_size)) {
17986 bool is_writable;
17987
17988 VME_OBJECT_SHADOW(src_entry, entry_size,
17989 vm_map_always_shadow(map));
17990 assert(src_entry->use_pmap);
17991
17992 is_writable = false;
17993 if (src_entry->protection & VM_PROT_WRITE) {
17994 is_writable = true;
17995 #if __arm64e__
17996 } else if (src_entry->used_for_tpro) {
17997 is_writable = true;
17998 #endif /* __arm64e__ */
17999 }
18000 if (!src_entry->needs_copy && is_writable) {
18001 vm_prot_t prot;
18002
18003 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18004 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18005 __FUNCTION__,
18006 map, map->pmap,
18007 src_entry,
18008 (uint64_t)src_entry->vme_start,
18009 (uint64_t)src_entry->vme_end,
18010 src_entry->protection);
18011 }
18012
18013 prot = src_entry->protection & ~VM_PROT_WRITE;
18014
18015 if (override_nx(map,
18016 VME_ALIAS(src_entry))
18017 && prot) {
18018 prot |= VM_PROT_EXECUTE;
18019 }
18020
18021 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18022 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18023 __FUNCTION__,
18024 map, map->pmap,
18025 src_entry,
18026 (uint64_t)src_entry->vme_start,
18027 (uint64_t)src_entry->vme_end,
18028 prot);
18029 }
18030
18031 if (map->mapped_in_other_pmaps) {
18032 vm_object_pmap_protect(
18033 VME_OBJECT(src_entry),
18034 VME_OFFSET(src_entry),
18035 entry_size,
18036 PMAP_NULL,
18037 PAGE_SIZE,
18038 src_entry->vme_start,
18039 prot);
18040 #if MACH_ASSERT
18041 } else if (__improbable(map->pmap == PMAP_NULL)) {
18042 /*
18043 * Some VM tests (in vm_tests.c)
18044 * sometimes want to use a VM
18045 * map without a pmap.
18046 * Otherwise, this should never
18047 * happen.
18048 */
18049 if (!thread_get_test_option(test_option_vm_map_allow_null_pmap)) {
18050 panic("null pmap");
18051 }
18052 #endif /* MACH_ASSERT */
18053 } else {
18054 pmap_protect(vm_map_pmap(map),
18055 src_entry->vme_start,
18056 src_entry->vme_end,
18057 prot);
18058 }
18059 }
18060
18061 object = VME_OBJECT(src_entry);
18062 src_entry->needs_copy = FALSE;
18063 }
18064
18065
18066 vm_object_lock(object);
18067 vm_object_reference_locked(object); /* object ref. for new entry */
18068 assert(!src_entry->needs_copy);
18069 if (object->copy_strategy ==
18070 MEMORY_OBJECT_COPY_SYMMETRIC) {
18071 /*
18072 * If we want to share this object (copy==0),
18073 * it needs to be COPY_DELAY.
18074 * If we want to copy this object (copy==1),
18075 * we can't just set "needs_copy" on our side
18076 * and expect the other side to do the same
18077 * (symmetrically), so we can't let the object
18078 * stay COPY_SYMMETRIC.
18079 * So we always switch from COPY_SYMMETRIC to
18080 * COPY_DELAY.
18081 */
18082 object->copy_strategy =
18083 MEMORY_OBJECT_COPY_DELAY;
18084 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18085 }
18086 vm_object_unlock(object);
18087 }
18088
18089 offset = (VME_OFFSET(src_entry) +
18090 (src_start - src_entry->vme_start));
18091
18092 copy_src_entry:
18093 new_entry = _vm_map_entry_create(map_header);
18094 vm_map_entry_copy(map, new_entry, src_entry);
18095 if (new_entry->is_sub_map) {
18096 /* clr address space specifics */
18097 new_entry->use_pmap = FALSE;
18098 } else if (copy) {
18099 /*
18100 * We're dealing with a copy-on-write operation,
18101 * so the resulting mapping should not inherit the
18102 * original mapping's accounting settings.
18103 * "use_pmap" should be reset to its default (TRUE)
18104 * so that the new mapping gets accounted for in
18105 * the task's memory footprint.
18106 */
18107 new_entry->use_pmap = TRUE;
18108 }
18109 /* "iokit_acct" was cleared in vm_map_entry_copy() */
18110 assert(!new_entry->iokit_acct);
18111
18112 new_entry->map_aligned = FALSE;
18113
18114 new_entry->vme_start = map_address;
18115 new_entry->vme_end = map_address + tmp_size;
18116 assert(new_entry->vme_start < new_entry->vme_end);
18117 if (copy && vmk_flags.vmkf_remap_prot_copy) {
18118 /* security: keep "permanent" and "csm_associated" */
18119 new_entry->vme_permanent = src_entry->vme_permanent;
18120 new_entry->csm_associated = src_entry->csm_associated;
18121 /*
18122 * Remapping for vm_map_protect(VM_PROT_COPY)
18123 * to convert a read-only mapping into a
18124 * copy-on-write version of itself but
18125 * with write access:
18126 * keep the original inheritance but let's not
18127 * add VM_PROT_WRITE to the max protection yet
18128 * since we want to do more security checks against
18129 * the target map.
18130 */
18131 new_entry->inheritance = src_entry->inheritance;
18132 new_entry->protection &= max_prot_for_prot_copy;
18133
18134 #ifdef __arm64e__
18135 /*
18136 * Remapping for vm_map_protect(VM_PROT_COPY) to remap a TPRO
18137 * region to be explicitly writable without TPRO is only permitted
18138 * if TPRO enforcement has been overridden.
18139 *
18140 * In this case we ensure any entries reset the TPRO state
18141 * and we permit the region to be downgraded from permanent.
18142 */
18143 if (new_entry->used_for_tpro) {
18144 if (vmk_flags.vmkf_tpro_enforcement_override) {
18145 new_entry->used_for_tpro = FALSE;
18146 new_entry->vme_permanent = FALSE;
18147 } else {
18148 result = KERN_PROTECTION_FAILURE;
18149 vm_object_deallocate(object);
18150 vm_map_entry_dispose(new_entry);
18151 new_entry = VM_MAP_ENTRY_NULL;
18152 break;
18153 }
18154 }
18155 #endif
18156 } else {
18157 new_entry->inheritance = inheritance;
18158 if (!vm_remap_legacy) {
18159 new_entry->protection = *cur_protection;
18160 new_entry->max_protection = *max_protection;
18161 }
18162 }
18163
18164 VME_OFFSET_SET(new_entry, offset);
18165
18166 /*
18167 * The new region has to be copied now if required.
18168 */
18169 RestartCopy:
18170 if (!copy) {
18171 if (src_entry->used_for_jit == TRUE) {
18172 if (same_map) {
18173 } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18174 /*
18175 * Cannot allow an entry describing a JIT
18176 * region to be shared across address spaces.
18177 */
18178 result = KERN_INVALID_ARGUMENT;
18179 vm_object_deallocate(object);
18180 vm_map_entry_dispose(new_entry);
18181 new_entry = VM_MAP_ENTRY_NULL;
18182 break;
18183 }
18184 }
18185
18186 if (!src_entry->is_sub_map &&
18187 VME_OBJECT(src_entry) == VM_OBJECT_NULL) {
18188 /* no accessible memory; nothing to share */
18189 assert(src_entry->protection == VM_PROT_NONE);
18190 assert(src_entry->max_protection == VM_PROT_NONE);
18191 src_entry->is_shared = FALSE;
18192 } else {
18193 src_entry->is_shared = TRUE;
18194 }
18195 if (!new_entry->is_sub_map &&
18196 VME_OBJECT(new_entry) == VM_OBJECT_NULL) {
18197 /* no accessible memory; nothing to share */
18198 assert(new_entry->protection == VM_PROT_NONE);
18199 assert(new_entry->max_protection == VM_PROT_NONE);
18200 new_entry->is_shared = FALSE;
18201 } else {
18202 new_entry->is_shared = TRUE;
18203 }
18204 if (!(new_entry->is_sub_map)) {
18205 new_entry->needs_copy = FALSE;
18206 }
18207 } else if (src_entry->is_sub_map) {
18208 /* make this a COW sub_map if not already */
18209 assert(new_entry->wired_count == 0);
18210 new_entry->needs_copy = TRUE;
18211 object = VM_OBJECT_NULL;
18212 } else if (src_entry->wired_count == 0 &&
18213 !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18214 vm_object_copy_quickly(VME_OBJECT(new_entry),
18215 VME_OFFSET(new_entry),
18216 (new_entry->vme_end -
18217 new_entry->vme_start),
18218 &src_needs_copy,
18219 &new_entry_needs_copy)) {
18220 new_entry->needs_copy = new_entry_needs_copy;
18221 new_entry->is_shared = FALSE;
18222 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18223
18224 /*
18225 * Handle copy_on_write semantics.
18226 */
18227 if (src_needs_copy && !src_entry->needs_copy) {
18228 vm_prot_t prot;
18229
18230 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18231 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18232 __FUNCTION__,
18233 map, map->pmap, src_entry,
18234 (uint64_t)src_entry->vme_start,
18235 (uint64_t)src_entry->vme_end,
18236 src_entry->protection);
18237 }
18238
18239 prot = src_entry->protection & ~VM_PROT_WRITE;
18240
18241 if (override_nx(map,
18242 VME_ALIAS(src_entry))
18243 && prot) {
18244 prot |= VM_PROT_EXECUTE;
18245 }
18246
18247 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18248 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18249 __FUNCTION__,
18250 map, map->pmap, src_entry,
18251 (uint64_t)src_entry->vme_start,
18252 (uint64_t)src_entry->vme_end,
18253 prot);
18254 }
18255
18256 vm_object_pmap_protect(object,
18257 offset,
18258 entry_size,
18259 ((src_entry->is_shared
18260 || map->mapped_in_other_pmaps) ?
18261 PMAP_NULL : map->pmap),
18262 VM_MAP_PAGE_SIZE(map),
18263 src_entry->vme_start,
18264 prot);
18265
18266 assert(src_entry->wired_count == 0);
18267 src_entry->needs_copy = TRUE;
18268 }
18269 /*
18270 * Throw away the old object reference of the new entry.
18271 */
18272 vm_object_deallocate(object);
18273 } else {
18274 new_entry->is_shared = FALSE;
18275 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18276
18277 src_entry_was_wired = (src_entry->wired_count > 0);
18278 saved_src_entry = src_entry;
18279 src_entry = VM_MAP_ENTRY_NULL;
18280
18281 /*
18282 * The map can be safely unlocked since we
18283 * already hold a reference on the object.
18284 *
18285 * Record the timestamp of the map for later
18286 * verification, and unlock the map.
18287 */
18288 version.main_timestamp = map->timestamp;
18289 vm_map_unlock(map); /* Increments timestamp once! */
18290
18291 /*
18292 * Perform the copy.
18293 */
18294 if (src_entry_was_wired > 0 ||
18295 (debug4k_no_cow_copyin &&
18296 VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18297 vm_object_lock(object);
18298 result = vm_object_copy_slowly(
18299 object,
18300 offset,
18301 (new_entry->vme_end -
18302 new_entry->vme_start),
18303 THREAD_UNINT,
18304 &new_copy_object);
18305 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18306 saved_used_for_jit = new_entry->used_for_jit;
18307 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18308 new_entry->used_for_jit = saved_used_for_jit;
18309 VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18310 new_entry->needs_copy = FALSE;
18311 } else {
18312 vm_object_offset_t new_offset;
18313
18314 new_offset = VME_OFFSET(new_entry);
18315 result = vm_object_copy_strategically(
18316 object,
18317 offset,
18318 (new_entry->vme_end -
18319 new_entry->vme_start),
18320 false, /* forking */
18321 &new_copy_object,
18322 &new_offset,
18323 &new_entry_needs_copy);
18324 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18325 saved_used_for_jit = new_entry->used_for_jit;
18326 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18327 new_entry->used_for_jit = saved_used_for_jit;
18328 if (new_offset != VME_OFFSET(new_entry)) {
18329 VME_OFFSET_SET(new_entry, new_offset);
18330 }
18331
18332 new_entry->needs_copy = new_entry_needs_copy;
18333 }
18334
18335 /*
18336 * Throw away the old object reference of the new entry.
18337 */
18338 vm_object_deallocate(object);
18339
18340 if (result != KERN_SUCCESS &&
18341 result != KERN_MEMORY_RESTART_COPY) {
18342 vm_map_entry_dispose(new_entry);
18343 vm_map_lock(map);
18344 break;
18345 }
18346
18347 /*
18348 * Verify that the map has not substantially
18349 * changed while the copy was being made.
18350 */
18351
18352 vm_map_lock(map);
18353 if (version.main_timestamp + 1 != map->timestamp) {
18354 /*
18355 * Simple version comparison failed.
18356 *
18357 * Retry the lookup and verify that the
18358 * same object/offset are still present.
18359 */
18360 saved_src_entry = VM_MAP_ENTRY_NULL;
18361 vm_object_deallocate(VME_OBJECT(new_entry));
18362 vm_map_entry_dispose(new_entry);
18363 if (result == KERN_MEMORY_RESTART_COPY) {
18364 result = KERN_SUCCESS;
18365 }
18366 continue;
18367 }
18368 /* map hasn't changed: src_entry is still valid */
18369 src_entry = saved_src_entry;
18370 saved_src_entry = VM_MAP_ENTRY_NULL;
18371
18372 if (result == KERN_MEMORY_RESTART_COPY) {
18373 vm_object_reference(object);
18374 goto RestartCopy;
18375 }
18376 }
18377
18378 _vm_map_store_entry_link(map_header,
18379 map_header->links.prev, new_entry);
18380
18381 /* protections for submap mapping are irrelevant here */
18382 if (vm_remap_legacy && !src_entry->is_sub_map) {
18383 *cur_protection &= src_entry->protection;
18384 *max_protection &= src_entry->max_protection;
18385 }
18386
18387 map_address += tmp_size;
18388 mapped_size += tmp_size;
18389 src_start += tmp_size;
18390
18391 if (vmk_flags.vmkf_copy_single_object) {
18392 if (mapped_size != size) {
18393 DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18394 map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18395 if (src_entry->vme_next != vm_map_to_entry(map) &&
18396 src_entry->vme_next->vme_object_value ==
18397 src_entry->vme_object_value) {
18398 /* XXX TODO4K */
18399 DEBUG4K_ERROR("could have extended copy to next entry...\n");
18400 }
18401 }
18402 break;
18403 }
18404 } /* end while */
18405
18406 vm_map_unlock(map);
18407 if (result != KERN_SUCCESS) {
18408 /*
18409 * Free all allocated elements.
18410 */
18411 for (src_entry = map_header->links.next;
18412 src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18413 src_entry = new_entry) {
18414 new_entry = src_entry->vme_next;
18415 _vm_map_store_entry_unlink(map_header, src_entry, false);
18416 if (src_entry->is_sub_map) {
18417 vm_map_deallocate(VME_SUBMAP(src_entry));
18418 } else {
18419 vm_object_deallocate(VME_OBJECT(src_entry));
18420 }
18421 vm_map_entry_dispose(src_entry);
18422 }
18423 }
18424 return result;
18425 }
18426
18427 bool
vm_map_is_exotic(vm_map_t map)18428 vm_map_is_exotic(
18429 vm_map_t map)
18430 {
18431 return VM_MAP_IS_EXOTIC(map);
18432 }
18433
18434 bool
vm_map_is_alien(vm_map_t map)18435 vm_map_is_alien(
18436 vm_map_t map)
18437 {
18438 return VM_MAP_IS_ALIEN(map);
18439 }
18440
18441 #if XNU_TARGET_OS_OSX
/*
 * Mark "map" as an "alien" address space.
 * The map lock is taken so the flag update is serialized with any
 * concurrent map mutations; the flag is only ever set here, never
 * cleared.
 * NOTE(review): the precise semantics of "is_alien" are defined by
 * the consumers of the flag, elsewhere in the VM layer — confirm
 * before relying on them.
 */
void
vm_map_mark_alien(
	vm_map_t map)
{
	vm_map_lock(map);
	map->is_alien = true;
	vm_map_unlock(map);
}
18450
/*
 * Set the "single_jit" policy flag on "map".
 * The map lock is taken so the flag update is serialized with any
 * concurrent map mutations; the flag is only ever set here, never
 * cleared.
 * NOTE(review): the flag presumably restricts the map to a single
 * JIT region; its enforcement lives with the flag's consumers,
 * elsewhere in the VM layer — confirm before relying on it.
 */
void
vm_map_single_jit(
	vm_map_t map)
{
	vm_map_lock(map);
	map->single_jit = true;
	vm_map_unlock(map);
}
18459 #endif /* XNU_TARGET_OS_OSX */
18460
18461
18462
18463 /*
18464 * Callers of this function must call vm_map_copy_require on
18465 * previously created vm_map_copy_t or pass a newly created
18466 * one to ensure that it hasn't been forged.
18467 */
18468 static kern_return_t
vm_map_copy_to_physcopy(vm_map_copy_t copy_map,vm_map_t target_map)18469 vm_map_copy_to_physcopy(
18470 vm_map_copy_t copy_map,
18471 vm_map_t target_map)
18472 {
18473 vm_map_size_t size;
18474 vm_map_entry_t entry;
18475 vm_map_entry_t new_entry;
18476 vm_object_t new_object;
18477 unsigned int pmap_flags;
18478 pmap_t new_pmap;
18479 vm_map_t new_map;
18480 vm_map_address_t src_start, src_end, src_cur;
18481 vm_map_address_t dst_start, dst_end, dst_cur;
18482 kern_return_t kr;
18483 void *kbuf;
18484
18485 /*
18486 * Perform the equivalent of vm_allocate() and memcpy().
18487 * Replace the mappings in "copy_map" with the newly allocated mapping.
18488 */
18489 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18490
18491 assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18492
18493 /* create a new pmap to map "copy_map" */
18494 pmap_flags = 0;
18495 assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18496 #if PMAP_CREATE_FORCE_4K_PAGES
18497 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18498 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18499 pmap_flags |= PMAP_CREATE_64BIT;
18500 new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18501 if (new_pmap == NULL) {
18502 return KERN_RESOURCE_SHORTAGE;
18503 }
18504
18505 /* allocate new VM object */
18506 size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18507 new_object = vm_object_allocate(size);
18508 assert(new_object);
18509
18510 /* allocate new VM map entry */
18511 new_entry = vm_map_copy_entry_create(copy_map);
18512 assert(new_entry);
18513
18514 /* finish initializing new VM map entry */
18515 new_entry->protection = VM_PROT_DEFAULT;
18516 new_entry->max_protection = VM_PROT_DEFAULT;
18517 new_entry->use_pmap = TRUE;
18518
18519 /* make new VM map entry point to new VM object */
18520 new_entry->vme_start = 0;
18521 new_entry->vme_end = size;
18522 VME_OBJECT_SET(new_entry, new_object, false, 0);
18523 VME_OFFSET_SET(new_entry, 0);
18524
18525 /* create a new pageable VM map to map "copy_map" */
18526 new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18527 VM_MAP_CREATE_PAGEABLE);
18528 assert(new_map);
18529 vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18530
18531 /* map "copy_map" in the new VM map */
18532 src_start = 0;
18533 kr = vm_map_copyout_internal(
18534 new_map,
18535 &src_start,
18536 copy_map,
18537 copy_map->size,
18538 FALSE, /* consume_on_success */
18539 VM_PROT_DEFAULT,
18540 VM_PROT_DEFAULT,
18541 VM_INHERIT_DEFAULT);
18542 assert(kr == KERN_SUCCESS);
18543 src_end = src_start + copy_map->size;
18544
18545 /* map "new_object" in the new VM map */
18546 vm_object_reference(new_object);
18547 dst_start = 0;
18548 kr = vm_map_enter(new_map,
18549 &dst_start,
18550 size,
18551 0, /* mask */
18552 VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18553 new_object,
18554 0, /* offset */
18555 FALSE, /* needs copy */
18556 VM_PROT_DEFAULT,
18557 VM_PROT_DEFAULT,
18558 VM_INHERIT_DEFAULT);
18559 assert(kr == KERN_SUCCESS);
18560 dst_end = dst_start + size;
18561
18562 /* get a kernel buffer */
18563 kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18564
18565 /* physically copy "copy_map" mappings to new VM object */
18566 for (src_cur = src_start, dst_cur = dst_start;
18567 src_cur < src_end;
18568 src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18569 vm_size_t bytes;
18570
18571 bytes = PAGE_SIZE;
18572 if (src_cur + PAGE_SIZE > src_end) {
18573 /* partial copy for last page */
18574 bytes = src_end - src_cur;
18575 assert(bytes > 0 && bytes < PAGE_SIZE);
18576 /* rest of dst page should be zero-filled */
18577 }
18578 /* get bytes from src mapping */
18579 kr = copyinmap(new_map, src_cur, kbuf, bytes);
18580 if (kr != KERN_SUCCESS) {
18581 DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18582 }
18583 /* put bytes in dst mapping */
18584 assert(dst_cur < dst_end);
18585 assert(dst_cur + bytes <= dst_end);
18586 kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18587 if (kr != KERN_SUCCESS) {
18588 DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18589 }
18590 }
18591
18592 /* free kernel buffer */
18593 kfree_data(kbuf, PAGE_SIZE);
18594
18595 /* destroy new map */
18596 vm_map_destroy(new_map);
18597 new_map = VM_MAP_NULL;
18598
18599 /* dispose of the old map entries in "copy_map" */
18600 while (vm_map_copy_first_entry(copy_map) !=
18601 vm_map_copy_to_entry(copy_map)) {
18602 entry = vm_map_copy_first_entry(copy_map);
18603 vm_map_copy_entry_unlink(copy_map, entry);
18604 if (entry->is_sub_map) {
18605 vm_map_deallocate(VME_SUBMAP(entry));
18606 } else {
18607 vm_object_deallocate(VME_OBJECT(entry));
18608 }
18609 vm_map_copy_entry_dispose(entry);
18610 }
18611
18612 /* change "copy_map"'s page_size to match "target_map" */
18613 copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18614 copy_map->offset = 0;
18615 copy_map->size = size;
18616
18617 /* insert new map entry in "copy_map" */
18618 assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18619 vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18620
18621 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18622 return KERN_SUCCESS;
18623 }
18624
18625 void
18626 vm_map_copy_adjust_get_target_copy_map(
18627 vm_map_copy_t copy_map,
18628 vm_map_copy_t *target_copy_map_p);
18629 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)18630 vm_map_copy_adjust_get_target_copy_map(
18631 vm_map_copy_t copy_map,
18632 vm_map_copy_t *target_copy_map_p)
18633 {
18634 vm_map_copy_t target_copy_map;
18635 vm_map_entry_t entry, target_entry;
18636
18637 if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18638 /* the caller already has a "target_copy_map": use it */
18639 return;
18640 }
18641
18642 /* the caller wants us to create a new copy of "copy_map" */
18643 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18644 target_copy_map = vm_map_copy_allocate(copy_map->type);
18645 target_copy_map->offset = copy_map->offset;
18646 target_copy_map->size = copy_map->size;
18647 target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18648 for (entry = vm_map_copy_first_entry(copy_map);
18649 entry != vm_map_copy_to_entry(copy_map);
18650 entry = entry->vme_next) {
18651 target_entry = vm_map_copy_entry_create(target_copy_map);
18652 vm_map_entry_copy_full(target_entry, entry);
18653 if (target_entry->is_sub_map) {
18654 vm_map_reference(VME_SUBMAP(target_entry));
18655 } else {
18656 vm_object_reference(VME_OBJECT(target_entry));
18657 }
18658 vm_map_copy_entry_link(
18659 target_copy_map,
18660 vm_map_copy_last_entry(target_copy_map),
18661 target_entry);
18662 }
18663 entry = VM_MAP_ENTRY_NULL;
18664 *target_copy_map_p = target_copy_map;
18665 }
18666
18667 /*
18668 * Callers of this function must call vm_map_copy_require on
18669 * previously created vm_map_copy_t or pass a newly created
18670 * one to ensure that it hasn't been forged.
18671 */
18672 static void
vm_map_copy_trim(vm_map_copy_t copy_map,uint16_t new_page_shift,vm_map_offset_t trim_start,vm_map_offset_t trim_end)18673 vm_map_copy_trim(
18674 vm_map_copy_t copy_map,
18675 uint16_t new_page_shift,
18676 vm_map_offset_t trim_start,
18677 vm_map_offset_t trim_end)
18678 {
18679 uint16_t copy_page_shift;
18680 vm_map_entry_t entry, next_entry;
18681
18682 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18683 assert(copy_map->cpy_hdr.nentries > 0);
18684
18685 trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18686 trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18687
18688 /* use the new page_shift to do the clipping */
18689 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18690 copy_map->cpy_hdr.page_shift = new_page_shift;
18691
18692 for (entry = vm_map_copy_first_entry(copy_map);
18693 entry != vm_map_copy_to_entry(copy_map);
18694 entry = next_entry) {
18695 next_entry = entry->vme_next;
18696 if (entry->vme_end <= trim_start) {
18697 /* entry fully before trim range: skip */
18698 continue;
18699 }
18700 if (entry->vme_start >= trim_end) {
18701 /* entry fully after trim range: done */
18702 break;
18703 }
18704 /* clip entry if needed */
18705 vm_map_copy_clip_start(copy_map, entry, trim_start);
18706 vm_map_copy_clip_end(copy_map, entry, trim_end);
18707 /* dispose of entry */
18708 copy_map->size -= entry->vme_end - entry->vme_start;
18709 vm_map_copy_entry_unlink(copy_map, entry);
18710 if (entry->is_sub_map) {
18711 vm_map_deallocate(VME_SUBMAP(entry));
18712 } else {
18713 vm_object_deallocate(VME_OBJECT(entry));
18714 }
18715 vm_map_copy_entry_dispose(entry);
18716 entry = VM_MAP_ENTRY_NULL;
18717 }
18718
18719 /* restore copy_map's original page_shift */
18720 copy_map->cpy_hdr.page_shift = copy_page_shift;
18721 }
18722
18723 /*
18724 * Make any necessary adjustments to "copy_map" to allow it to be
18725 * mapped into "target_map".
18726 * If no changes were necessary, "target_copy_map" points to the
18727 * untouched "copy_map".
18728 * If changes are necessary, changes will be made to "target_copy_map".
18729 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18730 * copy the original "copy_map" to it before applying the changes.
18731 * The caller should discard "target_copy_map" if it's not the same as
18732 * the original "copy_map".
18733 */
18734 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
kern_return_t
vm_map_copy_adjust_to_target(
	vm_map_copy_t           src_copy_map,
	vm_map_offset_ut        offset_u,
	vm_map_size_ut          size_u,
	vm_map_t                target_map,
	boolean_t               copy,
	vm_map_copy_t           *target_copy_map_p,
	vm_map_offset_t         *overmap_start_p,
	vm_map_offset_t         *overmap_end_p,
	vm_map_offset_t         *trimmed_start_p)
{
	vm_map_copy_t           copy_map, target_copy_map;
	vm_map_size_t           target_size;
	vm_map_size_t           src_copy_map_size;
	vm_map_size_t           overmap_start, overmap_end;
	int                     misalignments;
	vm_map_entry_t          entry, target_entry;
	vm_map_offset_t         addr_adjustment;
	vm_map_offset_t         new_start, new_end;
	int                     copy_page_mask, target_page_mask;
	uint16_t                copy_page_shift, target_page_shift;
	vm_map_offset_t         trimmed_end;
	vm_map_size_t           map_size;
	kern_return_t           kr;

	/*
	 * Sanitize any input parameters that are addr/size/prot/inherit
	 */
	kr = vm_map_copy_addr_size_sanitize(
		target_map,
		offset_u,
		size_u,
		VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE,
		&new_start,
		&new_end,
		&map_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(src_copy_map);
	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);

	/*
	 * Start working with "src_copy_map" but we'll switch
	 * to "target_copy_map" as soon as we start making adjustments.
	 */
	copy_map = src_copy_map;
	src_copy_map_size = src_copy_map->size;

	/* page geometry of the source copy and of the destination map */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p);

	target_copy_map = *target_copy_map_p;
	if (target_copy_map != VM_MAP_COPY_NULL) {
		/* caller-supplied copy must also come from the right zone */
		vm_map_copy_require(target_copy_map);
	}

	/* requested range must fit within the source copy */
	if (new_end > copy_map->size) {
		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u));
		return KERN_INVALID_ARGUMENT;
	}

	/* trim the end */
	trimmed_end = 0;
	new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask);
	if (new_end < copy_map->size) {
		trimmed_end = src_copy_map_size - new_end;
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		/* all further adjustments happen on "target_copy_map" */
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    new_end, copy_map->size);
	}

	/* trim the start */
	new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask);
	if (new_start != 0) {
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    0, new_start);
	}
	*trimmed_start_p = new_start;

	/* target_size starts with what's left after trimming */
	target_size = copy_map->size;
	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);

	/* check for misalignments but don't adjust yet */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	if (copy_page_shift < target_page_shift) {
		/*
		 * Remapping from 4K to 16K: check the VM object alignments
		 * throughout the range.
		 * If the start and end of the range are mis-aligned, we can
		 * over-map to re-align, and adjust the "overmap" start/end
		 * and "target_size" of the range accordingly.
		 * If there is any mis-alignment within the range:
		 *     if "copy":
		 *         we can do immediate-copy instead of copy-on-write,
		 *     else:
		 *         no way to remap and share; fail.
		 */
		for (entry = vm_map_copy_first_entry(copy_map);
		    entry != vm_map_copy_to_entry(copy_map);
		    entry = entry->vme_next) {
			vm_object_offset_t object_offset_start, object_offset_end;

			object_offset_start = VME_OFFSET(entry);
			object_offset_end = object_offset_start;
			object_offset_end += entry->vme_end - entry->vme_start;
			if (object_offset_start & target_page_mask) {
				/* only the first entry of a shared mapping may over-map */
				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
					overmap_start++;
				} else {
					misalignments++;
				}
			}
			if (object_offset_end & target_page_mask) {
				/* only the last entry of a shared mapping may over-map */
				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
					overmap_end++;
				} else {
					misalignments++;
				}
			}
		}
	}
	entry = VM_MAP_ENTRY_NULL;

	/* decide how to deal with misalignments */
	assert(overmap_start <= 1);
	assert(overmap_end <= 1);
	if (!overmap_start && !overmap_end && !misalignments) {
		/* copy_map is properly aligned for target_map ... */
		if (*trimmed_start_p) {
			/* ... but we trimmed it, so still need to adjust */
		} else {
			/* ... and we didn't trim anything: we're done */
			if (target_copy_map == VM_MAP_COPY_NULL) {
				target_copy_map = copy_map;
			}
			*target_copy_map_p = target_copy_map;
			*overmap_start_p = 0;
			*overmap_end_p = 0;
			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
			return KERN_SUCCESS;
		}
	} else if (misalignments && !copy) {
		/* can't "share" if misaligned */
		DEBUG4K_ADJUST("unsupported sharing\n");
#if MACH_ASSERT
		if (debug4k_panic_on_misaligned_sharing) {
			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
		}
#endif /* MACH_ASSERT */
		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
		return KERN_NOT_SUPPORTED;
	} else {
		/* can't virtual-copy if misaligned (but can physical-copy) */
		DEBUG4K_ADJUST("mis-aligned copying\n");
	}

	/* get a "target_copy_map" if needed and switch to it */
	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
	copy_map = target_copy_map;

	if (misalignments && copy) {
		vm_map_size_t target_copy_map_size;

		/*
		 * Can't do copy-on-write with misaligned mappings.
		 * Replace the mappings with a physical copy of the original
		 * mappings' contents.
		 */
		target_copy_map_size = target_copy_map->size;
		kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		*target_copy_map_p = target_copy_map;
		*overmap_start_p = 0;
		/* physcopy may have grown the copy to target-page rounding */
		*overmap_end_p = target_copy_map->size - target_copy_map_size;
		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
		return KERN_SUCCESS;
	}

	/* apply the adjustments */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	/* remove copy_map->offset, so that everything starts at offset 0 */
	addr_adjustment = copy_map->offset;
	/* also remove whatever we trimmed from the start */
	addr_adjustment += *trimmed_start_p;
	for (target_entry = vm_map_copy_first_entry(target_copy_map);
	    target_entry != vm_map_copy_to_entry(target_copy_map);
	    target_entry = target_entry->vme_next) {
		vm_object_offset_t object_offset_start, object_offset_end;

		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
		object_offset_start = VME_OFFSET(target_entry);
		if (object_offset_start & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
				/*
				 * start of 1st entry is mis-aligned:
				 * re-adjust by over-mapping.
				 */
				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
				/* pull the object offset back to a target-page boundary */
				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				/* interior misalignment only tolerated when copying */
				assert(copy);
			}
		}

		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
			/* first entry: grow the total size instead of its start */
			target_size += overmap_start;
		} else {
			/* later entries: shift start to account for the over-map */
			target_entry->vme_start += overmap_start;
		}
		target_entry->vme_end += overmap_start;

		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
		if (object_offset_end & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
				/*
				 * end of last entry is mis-aligned: re-adjust by over-mapping.
				 */
				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
				target_entry->vme_end += overmap_end;
				target_size += overmap_end;
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}
		/* rebase the entry so that the copy starts at address 0 */
		target_entry->vme_start -= addr_adjustment;
		target_entry->vme_end -= addr_adjustment;
		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
	}

	/* publish the adjusted geometry on the copy itself */
	target_copy_map->size = target_size;
	target_copy_map->offset += overmap_start;
	target_copy_map->offset -= addr_adjustment;
	target_copy_map->cpy_hdr.page_shift = target_page_shift;

//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));

	*target_copy_map_p = target_copy_map;
	*overmap_start_p = overmap_start;
	*overmap_end_p = overmap_end;

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
	return KERN_SUCCESS;
}
19019
19020 kern_return_t
vm_map_range_physical_size(vm_map_t map,vm_map_address_t start,mach_vm_size_t size,mach_vm_size_t * phys_size)19021 vm_map_range_physical_size(
19022 vm_map_t map,
19023 vm_map_address_t start,
19024 mach_vm_size_t size,
19025 mach_vm_size_t * phys_size)
19026 {
19027 kern_return_t kr;
19028 vm_map_copy_t copy_map, target_copy_map;
19029 vm_map_offset_t adjusted_start, adjusted_end;
19030 vm_map_size_t adjusted_size;
19031 vm_prot_t cur_prot, max_prot;
19032 vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
19033 vm_map_kernel_flags_t vmk_flags;
19034
19035 if (size == 0) {
19036 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
19037 *phys_size = 0;
19038 return KERN_SUCCESS;
19039 }
19040
19041 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
19042 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
19043 if (__improbable(os_add_overflow(start, size, &end) ||
19044 adjusted_end <= adjusted_start)) {
19045 /* wraparound */
19046 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
19047 *phys_size = 0;
19048 return KERN_INVALID_ARGUMENT;
19049 }
19050 if (__improbable(vm_map_range_overflows(map, start, size))) {
19051 *phys_size = 0;
19052 return KERN_INVALID_ADDRESS;
19053 }
19054 assert(adjusted_end > adjusted_start);
19055 adjusted_size = adjusted_end - adjusted_start;
19056 *phys_size = adjusted_size;
19057 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
19058 return KERN_SUCCESS;
19059 }
19060 if (start == 0) {
19061 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
19062 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
19063 if (__improbable(adjusted_end <= adjusted_start)) {
19064 /* wraparound */
19065 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
19066 *phys_size = 0;
19067 return KERN_INVALID_ARGUMENT;
19068 }
19069 assert(adjusted_end > adjusted_start);
19070 adjusted_size = adjusted_end - adjusted_start;
19071 *phys_size = adjusted_size;
19072 return KERN_SUCCESS;
19073 }
19074
19075 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
19076 vmk_flags.vmkf_copy_pageable = TRUE;
19077 vmk_flags.vmkf_copy_same_map = TRUE;
19078 assert(adjusted_size != 0);
19079 cur_prot = VM_PROT_NONE; /* legacy mode */
19080 max_prot = VM_PROT_NONE; /* legacy mode */
19081 vmk_flags.vmkf_remap_legacy_mode = true;
19082 kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
19083 FALSE /* copy */,
19084 ©_map,
19085 &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
19086 vmk_flags);
19087 if (kr != KERN_SUCCESS) {
19088 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19089 //assert(0);
19090 *phys_size = 0;
19091 return kr;
19092 }
19093 assert(copy_map != VM_MAP_COPY_NULL);
19094 target_copy_map = copy_map;
19095 DEBUG4K_ADJUST("adjusting...\n");
19096 kr = vm_map_copy_adjust_to_target(
19097 copy_map,
19098 start - adjusted_start, /* offset */
19099 size, /* size */
19100 kernel_map,
19101 FALSE, /* copy */
19102 &target_copy_map,
19103 &overmap_start,
19104 &overmap_end,
19105 &trimmed_start);
19106 if (kr == KERN_SUCCESS) {
19107 if (target_copy_map->size != *phys_size) {
19108 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19109 }
19110 *phys_size = target_copy_map->size;
19111 } else {
19112 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19113 //assert(0);
19114 *phys_size = 0;
19115 }
19116 vm_map_copy_discard(copy_map);
19117 copy_map = VM_MAP_COPY_NULL;
19118
19119 return kr;
19120 }
19121
19122 static __attribute__((always_inline, warn_unused_result))
19123 kern_return_t
vm_map_remap_sanitize(vm_map_t src_map,vm_map_t target_map,vm_map_address_ut address_u,vm_map_size_ut size_u,vm_map_offset_ut mask_u,vm_map_offset_ut memory_address_u,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,vm_map_kernel_flags_t vmk_flags,vm_map_address_t * target_addr,vm_map_address_t * mask,vm_map_offset_t * memory_address,vm_map_offset_t * memory_end,vm_map_size_t * memory_size,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t * inheritance)19124 vm_map_remap_sanitize(
19125 vm_map_t src_map,
19126 vm_map_t target_map,
19127 vm_map_address_ut address_u,
19128 vm_map_size_ut size_u,
19129 vm_map_offset_ut mask_u,
19130 vm_map_offset_ut memory_address_u,
19131 vm_prot_ut cur_protection_u,
19132 vm_prot_ut max_protection_u,
19133 vm_inherit_ut inheritance_u,
19134 vm_map_kernel_flags_t vmk_flags,
19135 vm_map_address_t *target_addr,
19136 vm_map_address_t *mask,
19137 vm_map_offset_t *memory_address,
19138 vm_map_offset_t *memory_end,
19139 vm_map_size_t *memory_size,
19140 vm_prot_t *cur_protection,
19141 vm_prot_t *max_protection,
19142 vm_inherit_t *inheritance)
19143 {
19144 kern_return_t result;
19145 vm_sanitize_flags_t vm_sanitize_flags;
19146
19147 result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP,
19148 inheritance);
19149 if (__improbable(result != KERN_SUCCESS)) {
19150 return result;
19151 }
19152
19153 result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
19154 VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map,
19155 cur_protection, max_protection);
19156 if (__improbable(result != KERN_SUCCESS)) {
19157 return result;
19158 }
19159
19160 result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask);
19161 if (__improbable(result != KERN_SUCCESS)) {
19162 return result;
19163 }
19164
19165 /*
19166 * If the user is requesting that we return the address of the
19167 * first byte of the data (rather than the base of the page),
19168 * then we use different rounding semantics: specifically,
19169 * we assume that (memory_address, size) describes a region
19170 * all of whose pages we must cover, rather than a base to be truncated
19171 * down and a size to be added to that base. So we figure out
19172 * the highest page that the requested region includes and make
19173 * sure that the size will cover it.
19174 *
19175 * The key example we're worried about it is of the form:
19176 *
19177 * memory_address = 0x1ff0, size = 0x20
19178 *
19179 * With the old semantics, we round down the memory_address to 0x1000
19180 * and round up the size to 0x1000, resulting in our covering *only*
19181 * page 0x1000. With the new semantics, we'd realize that the region covers
19182 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
19183 * 0x1000 and page 0x2000 in the region we remap.
19184 *
19185 * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics.
19186 */
19187 vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS;
19188 if (!vmk_flags.vmf_return_data_addr) {
19189 vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START;
19190 }
19191
19192 result = vm_sanitize_addr_size(memory_address_u, size_u,
19193 VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map,
19194 vm_sanitize_flags, memory_address, memory_end,
19195 memory_size);
19196 if (__improbable(result != KERN_SUCCESS)) {
19197 return result;
19198 }
19199
19200 *target_addr = vm_sanitize_addr(target_map, address_u);
19201 return KERN_SUCCESS;
19202 }
19203
19204 /*
19205 * Routine: vm_remap
19206 *
19207 * Map portion of a task's address space.
19208 * Mapped region must not overlap more than
19209 * one vm memory object. Protections and
19210 * inheritance attributes remain the same
19211 * as in the original task and are out parameters.
19212 * Source and Target task can be identical
19213 * Other attributes are identical as for vm_map()
19214 */
19215 kern_return_t
vm_map_remap(vm_map_t target_map,vm_map_address_ut * address_u,vm_map_size_ut size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,vm_map_t src_map,vm_map_offset_ut memory_address_u,boolean_t copy,vm_prot_ut * cur_protection_u,vm_prot_ut * max_protection_u,vm_inherit_ut inheritance_u)19216 vm_map_remap(
19217 vm_map_t target_map,
19218 vm_map_address_ut *address_u,
19219 vm_map_size_ut size_u,
19220 vm_map_offset_ut mask_u,
19221 vm_map_kernel_flags_t vmk_flags,
19222 vm_map_t src_map,
19223 vm_map_offset_ut memory_address_u,
19224 boolean_t copy,
19225 vm_prot_ut *cur_protection_u, /* IN/OUT */
19226 vm_prot_ut *max_protection_u, /* IN/OUT */
19227 vm_inherit_ut inheritance_u)
19228 {
19229 vm_map_address_t target_addr, mask;
19230 vm_map_size_t target_size;
19231 vm_map_offset_t memory_address, memory_end;
19232 vm_map_size_t memory_size;
19233 vm_prot_t cur_protection, max_protection;
19234 vm_inherit_t inheritance;
19235 kern_return_t result;
19236 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
19237 vm_map_copy_t copy_map;
19238 vm_map_offset_t offset_in_mapping;
19239 vm_map_size_t src_page_mask, target_page_mask;
19240 vm_map_size_t initial_size;
19241 VM_MAP_ZAP_DECLARE(zap_list);
19242
19243 if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) {
19244 return KERN_INVALID_ARGUMENT;
19245 }
19246 src_page_mask = VM_MAP_PAGE_MASK(src_map);
19247 target_page_mask = VM_MAP_PAGE_MASK(target_map);
19248
19249 if (src_page_mask != target_page_mask) {
19250 if (copy) {
19251 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19252 } else {
19253 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19254 }
19255 }
19256
19257 /*
19258 * Sanitize any input parameters that are addr/size/prot/inherit
19259 */
19260 result = vm_map_remap_sanitize(src_map,
19261 target_map,
19262 *address_u,
19263 size_u,
19264 mask_u,
19265 memory_address_u,
19266 *cur_protection_u,
19267 *max_protection_u,
19268 inheritance_u,
19269 vmk_flags,
19270 &target_addr,
19271 &mask,
19272 &memory_address,
19273 &memory_end,
19274 &memory_size,
19275 &cur_protection,
19276 &max_protection,
19277 &inheritance);
19278 if (__improbable(result != KERN_SUCCESS)) {
19279 return vm_sanitize_get_kr(result);
19280 }
19281
19282 if (vmk_flags.vmf_return_data_addr) {
19283 /*
19284 * This is safe to unwrap now that the quantities
19285 * have been validated and rounded up normally.
19286 */
19287 offset_in_mapping = vm_sanitize_offset_in_page(src_map,
19288 memory_address_u);
19289 initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u);
19290 } else {
19291 /*
19292 * IMPORTANT:
19293 * This legacy code path is broken: for the range mentioned
19294 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
19295 * two 4k pages, it yields [ memory_address = 0x1000,
19296 * size = 0x1000 ], which covers only the first 4k page.
19297 * BUT some code unfortunately depends on this bug, so we
19298 * can't fix it without breaking something.
19299 * New code should get automatically opted in the new
19300 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
19301 */
19302 offset_in_mapping = 0;
19303 initial_size = memory_size;
19304 }
19305
19306 if (vmk_flags.vmf_resilient_media) {
19307 /* must be copy-on-write to be "media resilient" */
19308 if (!copy) {
19309 return KERN_INVALID_ARGUMENT;
19310 }
19311 }
19312
19313 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19314 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19315
19316 assert(memory_size != 0);
19317 result = vm_map_copy_extract(src_map,
19318 memory_address,
19319 memory_size,
19320 copy, ©_map,
19321 &cur_protection, /* IN/OUT */
19322 &max_protection, /* IN/OUT */
19323 inheritance,
19324 vmk_flags);
19325 if (result != KERN_SUCCESS) {
19326 return result;
19327 }
19328 assert(copy_map != VM_MAP_COPY_NULL);
19329
19330 /*
19331 * Handle the policy for vm map ranges
19332 *
19333 * If the maps differ, the target_map policy applies like for vm_map()
19334 * For same mapping remaps, we preserve the range.
19335 */
19336 if (vmk_flags.vmkf_copy_same_map) {
19337 vmk_flags.vmkf_range_id = copy_map->orig_range;
19338 } else {
19339 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size);
19340 }
19341
19342 target_size = memory_size;
19343 if (src_page_mask != target_page_mask) {
19344 vm_map_copy_t target_copy_map;
19345 vm_map_offset_t overmap_start = 0;
19346 vm_map_offset_t overmap_end = 0;
19347 vm_map_offset_t trimmed_start = 0;
19348
19349 target_copy_map = copy_map; /* can modify "copy_map" itself */
19350 DEBUG4K_ADJUST("adjusting...\n");
19351 result = vm_map_copy_adjust_to_target(
19352 copy_map,
19353 offset_in_mapping, /* offset */
19354 initial_size,
19355 target_map,
19356 copy,
19357 &target_copy_map,
19358 &overmap_start,
19359 &overmap_end,
19360 &trimmed_start);
19361 if (result != KERN_SUCCESS) {
19362 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19363 vm_map_copy_discard(copy_map);
19364 return result;
19365 }
19366 if (trimmed_start == 0) {
19367 /* nothing trimmed: no adjustment needed */
19368 } else if (trimmed_start >= offset_in_mapping) {
19369 /* trimmed more than offset_in_mapping: nothing left */
19370 assert(overmap_start == 0);
19371 assert(overmap_end == 0);
19372 offset_in_mapping = 0;
19373 } else {
19374 /* trimmed some of offset_in_mapping: adjust */
19375 assert(overmap_start == 0);
19376 assert(overmap_end == 0);
19377 offset_in_mapping -= trimmed_start;
19378 }
19379 offset_in_mapping += overmap_start;
19380 target_size = target_copy_map->size;
19381 }
19382
19383 /*
19384 * Allocate/check a range of free virtual address
19385 * space for the target
19386 */
19387 target_size = vm_map_round_page(target_size, target_page_mask);
19388
19389 if (target_size == 0) {
19390 vm_map_copy_discard(copy_map);
19391 return KERN_INVALID_ARGUMENT;
19392 }
19393
19394 vm_map_lock(target_map);
19395
19396 if (!vmk_flags.vmf_fixed) {
19397 result = vm_map_locate_space_anywhere(target_map, target_size,
19398 mask, vmk_flags, &target_addr, &insp_entry);
19399 } else {
19400 /*
19401 * vm_map_locate_space_fixed will reject overflowing
19402 * target_addr + target_size values
19403 */
19404 result = vm_map_locate_space_fixed(target_map, target_addr,
19405 target_size, mask, vmk_flags, &insp_entry, &zap_list);
19406
19407 if (result == KERN_MEMORY_PRESENT) {
19408 assert(!vmk_flags.vmkf_already);
19409 insp_entry = VM_MAP_ENTRY_NULL;
19410 result = KERN_NO_SPACE;
19411 }
19412 }
19413
19414 if (result == KERN_SUCCESS) {
19415 while (vm_map_copy_first_entry(copy_map) !=
19416 vm_map_copy_to_entry(copy_map)) {
19417 vm_map_entry_t entry = vm_map_copy_first_entry(copy_map);
19418
19419 vm_map_copy_entry_unlink(copy_map, entry);
19420
19421 if (vmk_flags.vmkf_remap_prot_copy) {
19422 /*
19423 * This vm_map_remap() is for a
19424 * vm_protect(VM_PROT_COPY), so the caller
19425 * expects to be allowed to add write access
19426 * to this new mapping. This is done by
19427 * adding VM_PROT_WRITE to each entry's
19428 * max_protection... unless some security
19429 * settings disallow it.
19430 */
19431 bool allow_write = false;
19432 if (entry->vme_permanent) {
19433 /* immutable mapping... */
19434 if ((entry->max_protection & VM_PROT_EXECUTE) &&
19435 developer_mode_state()) {
19436 /*
19437 * ... but executable and
19438 * possibly being debugged,
19439 * so let's allow it to become
19440 * writable, for breakpoints
19441 * and dtrace probes, for
19442 * example.
19443 */
19444 allow_write = true;
19445 } else {
19446 printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19447 proc_selfpid(),
19448 (get_bsdtask_info(current_task())
19449 ? proc_name_address(get_bsdtask_info(current_task()))
19450 : "?"),
19451 (uint64_t)memory_address,
19452 (uint64_t)memory_size,
19453 entry->protection,
19454 entry->max_protection,
19455 developer_mode_state());
19456 DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19457 vm_map_entry_t, entry,
19458 vm_map_offset_t, entry->vme_start,
19459 vm_map_offset_t, entry->vme_end,
19460 vm_prot_t, entry->protection,
19461 vm_prot_t, entry->max_protection,
19462 int, VME_ALIAS(entry));
19463 }
19464 } else {
19465 allow_write = true;
19466 }
19467
19468 /*
19469 * VM_PROT_COPY: allow this mapping to become
19470 * writable, unless it was "permanent".
19471 */
19472 if (allow_write) {
19473 entry->max_protection |= VM_PROT_WRITE;
19474 }
19475 }
19476 if (vmk_flags.vmf_resilient_codesign) {
19477 /* no codesigning -> read-only access */
19478 entry->max_protection = VM_PROT_READ;
19479 entry->protection = VM_PROT_READ;
19480 entry->vme_resilient_codesign = TRUE;
19481 }
19482 entry->vme_start += target_addr;
19483 entry->vme_end += target_addr;
19484 assert(!entry->map_aligned);
19485 if (vmk_flags.vmf_resilient_media &&
19486 !entry->is_sub_map &&
19487 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19488 VME_OBJECT(entry)->internal)) {
19489 entry->vme_resilient_media = TRUE;
19490 }
19491 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19492 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19493 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19494 vm_map_store_entry_link(target_map, insp_entry, entry,
19495 vmk_flags);
19496 insp_entry = entry;
19497 }
19498 }
19499
19500 if (vmk_flags.vmf_resilient_codesign) {
19501 cur_protection = VM_PROT_READ;
19502 max_protection = VM_PROT_READ;
19503 }
19504
19505 if (result == KERN_SUCCESS) {
19506 target_map->size += target_size;
19507 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19508 }
19509 vm_map_unlock(target_map);
19510
19511 vm_map_zap_dispose(&zap_list);
19512
19513 if (result == KERN_SUCCESS && target_map->wiring_required) {
19514 result = vm_map_wire_nested(target_map, target_addr,
19515 target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK,
19516 TRUE, PMAP_NULL, 0, NULL);
19517 }
19518
19519 if (result == KERN_SUCCESS) {
19520 #if KASAN
19521 if (target_map->pmap == kernel_pmap) {
19522 kasan_notify_address(target_addr, target_size);
19523 }
19524 #endif
19525 /*
19526 * If requested, return the address of the data pointed to by the
19527 * request, rather than the base of the resulting page.
19528 */
19529 if (vmk_flags.vmf_return_data_addr) {
19530 target_addr += offset_in_mapping;
19531 }
19532
19533 /*
19534 * Update OUT parameters.
19535 */
19536 *address_u = vm_sanitize_wrap_addr(target_addr);
19537
19538 *cur_protection_u = vm_sanitize_wrap_prot(cur_protection);
19539 *max_protection_u = vm_sanitize_wrap_prot(max_protection);
19540 }
19541
19542 if (src_page_mask != target_page_mask) {
19543 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result);
19544 }
19545 vm_map_copy_discard(copy_map);
19546 copy_map = VM_MAP_COPY_NULL;
19547
19548 return result;
19549 }
19550
19551 /*
19552 * vm_map_switch:
19553 *
19554 * Set the address map for the current thread to the specified map
19555 */
19556
19557 vm_map_t
vm_map_switch(vm_map_t map)19558 vm_map_switch(
19559 vm_map_t map)
19560 {
19561 thread_t thread = current_thread();
19562 vm_map_t oldmap = thread->map;
19563
19564
19565 /*
19566 * Deactivate the current map and activate the requested map
19567 */
19568 mp_disable_preemption();
19569 PMAP_SWITCH_USER(thread, map, cpu_number());
19570 mp_enable_preemption();
19571 return oldmap;
19572 }
19573
19574 static __attribute__((always_inline, warn_unused_result))
19575 kern_return_t
vm_map_rw_user_sanitize(vm_map_t map,vm_map_address_ut addr_u,vm_size_ut size_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_address_t * addr,vm_map_address_t * end,vm_map_size_t * size)19576 vm_map_rw_user_sanitize(
19577 vm_map_t map,
19578 vm_map_address_ut addr_u,
19579 vm_size_ut size_u,
19580 vm_sanitize_caller_t vm_sanitize_caller,
19581 vm_map_address_t *addr,
19582 vm_map_address_t *end,
19583 vm_map_size_t *size)
19584 {
19585 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
19586 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES;
19587
19588
19589 return vm_sanitize_addr_size(addr_u, size_u,
19590 vm_sanitize_caller, map,
19591 flags,
19592 addr, end, size);
19593 }
19594
19595 /*
19596 * Routine: vm_map_write_user
19597 *
19598 * Description:
19599 * Copy out data from a kernel space into space in the
19600 * destination map. The space must already exist in the
19601 * destination map.
19602 * NOTE: This routine should only be called by threads
19603 * which can block on a page fault. i.e. kernel mode user
19604 * threads.
19605 *
19606 */
19607 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_ut dst_addr_u,vm_size_ut size_u)19608 vm_map_write_user(
19609 vm_map_t map,
19610 void *src_p,
19611 vm_map_address_ut dst_addr_u,
19612 vm_size_ut size_u)
19613 {
19614 kern_return_t kr;
19615 vm_map_address_t dst_addr, dst_end;
19616 vm_map_size_t size;
19617
19618 /*
19619 * src_p isn't validated: [src_p, src_p + size_u)
19620 * is trusted kernel input.
19621 *
19622 * dst_addr_u and size_u are untrusted and need to be sanitized.
19623 */
19624 kr = vm_map_rw_user_sanitize(map,
19625 dst_addr_u,
19626 size_u,
19627 VM_SANITIZE_CALLER_VM_MAP_WRITE_USER,
19628 &dst_addr,
19629 &dst_end,
19630 &size);
19631 if (__improbable(kr != KERN_SUCCESS)) {
19632 return vm_sanitize_get_kr(kr);
19633 }
19634
19635 if (current_map() == map) {
19636 if (copyout(src_p, dst_addr, size)) {
19637 kr = KERN_INVALID_ADDRESS;
19638 }
19639 } else {
19640 vm_map_t oldmap;
19641
19642 /* take on the identity of the target map while doing */
19643 /* the transfer */
19644
19645 vm_map_reference(map);
19646 oldmap = vm_map_switch(map);
19647 if (copyout(src_p, dst_addr, size)) {
19648 kr = KERN_INVALID_ADDRESS;
19649 }
19650 vm_map_switch(oldmap);
19651 vm_map_deallocate(map);
19652 }
19653 return kr;
19654 }
19655
19656 /*
19657 * Routine: vm_map_read_user
19658 *
19659 * Description:
19660 * Copy in data from a user space source map into the
19661 * kernel map. The space must already exist in the
19662 * kernel map.
19663 * NOTE: This routine should only be called by threads
19664 * which can block on a page fault. i.e. kernel mode user
19665 * threads.
19666 *
19667 */
19668 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_ut src_addr_u,void * dst_p,vm_size_ut size_u)19669 vm_map_read_user(
19670 vm_map_t map,
19671 vm_map_address_ut src_addr_u,
19672 void *dst_p,
19673 vm_size_ut size_u)
19674 {
19675 kern_return_t kr;
19676 vm_map_address_t src_addr, src_end;
19677 vm_map_size_t size;
19678
19679 /*
19680 * dst_p isn't validated: [dst_p, dst_p + size_u)
19681 * is trusted kernel input.
19682 *
19683 * src_addr_u and size_u are untrusted and need to be sanitized.
19684 */
19685 kr = vm_map_rw_user_sanitize(map,
19686 src_addr_u,
19687 size_u,
19688 VM_SANITIZE_CALLER_VM_MAP_READ_USER,
19689 &src_addr,
19690 &src_end,
19691 &size);
19692 if (__improbable(kr != KERN_SUCCESS)) {
19693 return vm_sanitize_get_kr(kr);
19694 }
19695
19696 if (current_map() == map) {
19697 if (copyin(src_addr, dst_p, size)) {
19698 kr = KERN_INVALID_ADDRESS;
19699 }
19700 } else {
19701 vm_map_t oldmap;
19702
19703 /* take on the identity of the target map while doing */
19704 /* the transfer */
19705
19706 vm_map_reference(map);
19707 oldmap = vm_map_switch(map);
19708 if (copyin(src_addr, dst_p, size)) {
19709 kr = KERN_INVALID_ADDRESS;
19710 }
19711 vm_map_switch(oldmap);
19712 vm_map_deallocate(map);
19713 }
19714 return kr;
19715 }
19716
19717
19718 static __attribute__((always_inline, warn_unused_result))
19719 kern_return_t
vm_map_check_protection_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut protection_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_prot_t * protection)19720 vm_map_check_protection_sanitize(
19721 vm_map_t map,
19722 vm_map_offset_ut start_u,
19723 vm_map_offset_ut end_u,
19724 vm_prot_ut protection_u,
19725 vm_sanitize_caller_t vm_sanitize_caller,
19726 vm_map_offset_t *start,
19727 vm_map_offset_t *end,
19728 vm_prot_t *protection)
19729 {
19730 kern_return_t kr;
19731 vm_map_size_t size;
19732
19733 kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
19734 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start, end,
19735 &size);
19736 if (__improbable(kr != KERN_SUCCESS)) {
19737 return kr;
19738 }
19739
19740 /*
19741 * Given that the protection is used only for comparisons below
19742 * no sanitization is being applied on it.
19743 */
19744 *protection = VM_SANITIZE_UNSAFE_UNWRAP(protection_u);
19745
19746 return KERN_SUCCESS;
19747 }
19748
19749 /*
19750 * vm_map_check_protection:
19751 *
19752 * Assert that the target map allows the specified
19753 * privilege on the entire address region given.
19754 * The entire region must be allocated.
19755 */
19756 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut protection_u,vm_sanitize_caller_t vm_sanitize_caller)19757 vm_map_check_protection(
19758 vm_map_t map,
19759 vm_map_offset_ut start_u,
19760 vm_map_offset_ut end_u,
19761 vm_prot_ut protection_u,
19762 vm_sanitize_caller_t vm_sanitize_caller)
19763 {
19764 vm_map_entry_t entry;
19765 vm_map_entry_t tmp_entry;
19766 vm_map_offset_t start;
19767 vm_map_offset_t end;
19768 vm_prot_t protection;
19769 kern_return_t kr;
19770
19771 kr = vm_map_check_protection_sanitize(map,
19772 start_u,
19773 end_u,
19774 protection_u,
19775 vm_sanitize_caller,
19776 &start,
19777 &end,
19778 &protection);
19779 if (__improbable(kr != KERN_SUCCESS)) {
19780 kr = vm_sanitize_get_kr(kr);
19781 if (kr == KERN_SUCCESS) {
19782 return true;
19783 }
19784 return false;
19785 }
19786
19787 vm_map_lock(map);
19788
19789 if (start < vm_map_min(map) || end > vm_map_max(map)) {
19790 vm_map_unlock(map);
19791 return false;
19792 }
19793
19794 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19795 vm_map_unlock(map);
19796 return false;
19797 }
19798
19799 entry = tmp_entry;
19800
19801 while (start < end) {
19802 if (entry == vm_map_to_entry(map)) {
19803 vm_map_unlock(map);
19804 return false;
19805 }
19806
19807 /*
19808 * No holes allowed!
19809 */
19810
19811 if (start < entry->vme_start) {
19812 vm_map_unlock(map);
19813 return false;
19814 }
19815
19816 /*
19817 * Check protection associated with entry.
19818 */
19819
19820 if ((entry->protection & protection) != protection) {
19821 vm_map_unlock(map);
19822 return false;
19823 }
19824
19825 /* go to next entry */
19826
19827 start = entry->vme_end;
19828 entry = entry->vme_next;
19829 }
19830 vm_map_unlock(map);
19831 return true;
19832 }
19833
19834 kern_return_t
vm_map_purgable_control(vm_map_t map,vm_map_offset_ut address_u,vm_purgable_t control,int * state)19835 vm_map_purgable_control(
19836 vm_map_t map,
19837 vm_map_offset_ut address_u,
19838 vm_purgable_t control,
19839 int *state)
19840 {
19841 vm_map_offset_t address;
19842 vm_map_entry_t entry;
19843 vm_object_t object;
19844 kern_return_t kr;
19845 boolean_t was_nonvolatile;
19846
19847 /*
19848 * Vet all the input parameters and current type and state of the
19849 * underlaying object. Return with an error if anything is amiss.
19850 */
19851 if (map == VM_MAP_NULL) {
19852 return KERN_INVALID_ARGUMENT;
19853 }
19854
19855 if (control != VM_PURGABLE_SET_STATE &&
19856 control != VM_PURGABLE_GET_STATE &&
19857 control != VM_PURGABLE_PURGE_ALL &&
19858 control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
19859 return KERN_INVALID_ARGUMENT;
19860 }
19861
19862 if (control == VM_PURGABLE_PURGE_ALL) {
19863 vm_purgeable_object_purge_all();
19864 return KERN_SUCCESS;
19865 }
19866
19867 if ((control == VM_PURGABLE_SET_STATE ||
19868 control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
19869 (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
19870 ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
19871 return KERN_INVALID_ARGUMENT;
19872 }
19873
19874 address = vm_sanitize_addr(map, address_u);
19875
19876 vm_map_lock_read(map);
19877
19878 if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
19879 /*
19880 * Must pass a valid non-submap address.
19881 */
19882 vm_map_unlock_read(map);
19883 return KERN_INVALID_ADDRESS;
19884 }
19885
19886 if ((entry->protection & VM_PROT_WRITE) == 0 &&
19887 control != VM_PURGABLE_GET_STATE) {
19888 /*
19889 * Can't apply purgable controls to something you can't write.
19890 */
19891 vm_map_unlock_read(map);
19892 return KERN_PROTECTION_FAILURE;
19893 }
19894
19895 object = VME_OBJECT(entry);
19896 if (object == VM_OBJECT_NULL ||
19897 object->purgable == VM_PURGABLE_DENY) {
19898 /*
19899 * Object must already be present and be purgeable.
19900 */
19901 vm_map_unlock_read(map);
19902 return KERN_INVALID_ARGUMENT;
19903 }
19904
19905 vm_object_lock(object);
19906
19907 #if 00
19908 if (VME_OFFSET(entry) != 0 ||
19909 entry->vme_end - entry->vme_start != object->vo_size) {
19910 /*
19911 * Can only apply purgable controls to the whole (existing)
19912 * object at once.
19913 */
19914 vm_map_unlock_read(map);
19915 vm_object_unlock(object);
19916 return KERN_INVALID_ARGUMENT;
19917 }
19918 #endif
19919
19920 assert(!entry->is_sub_map);
19921 assert(!entry->use_pmap); /* purgeable has its own accounting */
19922
19923 vm_map_unlock_read(map);
19924
19925 was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
19926
19927 kr = vm_object_purgable_control(object, control, state);
19928
19929 if (was_nonvolatile &&
19930 object->purgable != VM_PURGABLE_NONVOLATILE &&
19931 map->pmap == kernel_pmap) {
19932 #if DEBUG
19933 object->vo_purgeable_volatilizer = kernel_task;
19934 #endif /* DEBUG */
19935 }
19936
19937 vm_object_unlock(object);
19938
19939 return kr;
19940 }
19941
19942 void
vm_map_footprint_query_page_info(vm_map_t map,vm_map_entry_t map_entry,vm_map_offset_t curr_s_offset,int * disposition_p)19943 vm_map_footprint_query_page_info(
19944 vm_map_t map,
19945 vm_map_entry_t map_entry,
19946 vm_map_offset_t curr_s_offset,
19947 int *disposition_p)
19948 {
19949 int pmap_disp;
19950 vm_object_t object = VM_OBJECT_NULL;
19951 int disposition;
19952 int effective_page_size;
19953
19954 vm_map_lock_assert_held(map);
19955 assert(!map->has_corpse_footprint);
19956 assert(curr_s_offset >= map_entry->vme_start);
19957 assert(curr_s_offset < map_entry->vme_end);
19958
19959 if (map_entry->is_sub_map) {
19960 if (!map_entry->use_pmap) {
19961 /* nested pmap: no footprint */
19962 *disposition_p = 0;
19963 return;
19964 }
19965 } else {
19966 object = VME_OBJECT(map_entry);
19967 if (object == VM_OBJECT_NULL) {
19968 /* nothing mapped here: no need to ask */
19969 *disposition_p = 0;
19970 return;
19971 }
19972 }
19973
19974 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
19975
19976 pmap_disp = 0;
19977
19978 /*
19979 * Query the pmap.
19980 */
19981 pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
19982
19983 /*
19984 * Compute this page's disposition.
19985 */
19986 disposition = 0;
19987
19988 /* deal with "alternate accounting" first */
19989 if (!map_entry->is_sub_map &&
19990 object->vo_no_footprint) {
19991 /* does not count in footprint */
19992 // assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19993 } else if (!map_entry->is_sub_map &&
19994 !object->internal &&
19995 object->vo_ledger_tag &&
19996 VM_OBJECT_OWNER(object) != NULL &&
19997 VM_OBJECT_OWNER(object)->map == map) {
19998 /* owned external object: wired pages count in footprint */
19999 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20000 if ((((curr_s_offset
20001 - map_entry->vme_start
20002 + VME_OFFSET(map_entry))
20003 / effective_page_size) <
20004 object->wired_page_count)) {
20005 /*
20006 * External object owned by this task: report the first
20007 * "#wired" pages as "resident" (to show that they
20008 * contribute to the footprint) but not "dirty"
20009 * (to avoid double-counting with the fake "owned"
20010 * region we'll report at the end of the address space
20011 * to account for all (mapped or not) owned memory
20012 * owned by this task.
20013 */
20014 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20015 }
20016 } else if (!map_entry->is_sub_map &&
20017 object->internal &&
20018 (object->purgable == VM_PURGABLE_NONVOLATILE ||
20019 (object->purgable == VM_PURGABLE_DENY &&
20020 object->vo_ledger_tag)) &&
20021 VM_OBJECT_OWNER(object) != NULL &&
20022 VM_OBJECT_OWNER(object)->map == map) {
20023 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20024 if ((((curr_s_offset
20025 - map_entry->vme_start
20026 + VME_OFFSET(map_entry))
20027 / effective_page_size) <
20028 (object->resident_page_count +
20029 vm_compressor_pager_get_count(object->pager)))) {
20030 /*
20031 * Non-volatile purgeable object owned
20032 * by this task: report the first
20033 * "#resident + #compressed" pages as
20034 * "resident" (to show that they
20035 * contribute to the footprint) but not
20036 * "dirty" (to avoid double-counting
20037 * with the fake "non-volatile" region
20038 * we'll report at the end of the
20039 * address space to account for all
20040 * (mapped or not) non-volatile memory
20041 * owned by this task.
20042 */
20043 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20044 }
20045 } else if (!map_entry->is_sub_map &&
20046 object->internal &&
20047 (object->purgable == VM_PURGABLE_VOLATILE ||
20048 object->purgable == VM_PURGABLE_EMPTY) &&
20049 VM_OBJECT_OWNER(object) != NULL &&
20050 VM_OBJECT_OWNER(object)->map == map) {
20051 if (object->internal) {
20052 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20053 }
20054 if ((((curr_s_offset
20055 - map_entry->vme_start
20056 + VME_OFFSET(map_entry))
20057 / effective_page_size) <
20058 object->wired_page_count)) {
20059 /*
20060 * Volatile|empty purgeable object owned
20061 * by this task: report the first
20062 * "#wired" pages as "resident" (to
20063 * show that they contribute to the
20064 * footprint) but not "dirty" (to avoid
20065 * double-counting with the fake
20066 * "non-volatile" region we'll report
20067 * at the end of the address space to
20068 * account for all (mapped or not)
20069 * non-volatile memory owned by this
20070 * task.
20071 */
20072 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20073 }
20074 } else if (!map_entry->is_sub_map &&
20075 map_entry->iokit_acct &&
20076 object->internal &&
20077 object->purgable == VM_PURGABLE_DENY) {
20078 /*
20079 * Non-purgeable IOKit memory: phys_footprint
20080 * includes the entire virtual mapping.
20081 */
20082 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20083 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20084 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20085 } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
20086 PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
20087 /* alternate accounting */
20088 #if __arm64__ && (DEVELOPMENT || DEBUG)
20089 if (map->pmap->footprint_was_suspended) {
20090 /*
20091 * The assertion below can fail if dyld
20092 * suspended footprint accounting
20093 * while doing some adjustments to
20094 * this page; the mapping would say
20095 * "use pmap accounting" but the page
20096 * would be marked "alternate
20097 * accounting".
20098 */
20099 } else
20100 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
20101 {
20102 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20103 }
20104 disposition = 0;
20105 } else {
20106 if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
20107 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20108 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20109 disposition |= VM_PAGE_QUERY_PAGE_REF;
20110 if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
20111 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20112 } else {
20113 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20114 }
20115 if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
20116 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20117 }
20118 } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
20119 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20120 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20121 }
20122 }
20123
20124 *disposition_p = disposition;
20125 }
20126
20127 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_ut offset_u,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)20128 vm_map_page_info(
20129 vm_map_t map,
20130 vm_map_offset_ut offset_u,
20131 vm_page_info_flavor_t flavor,
20132 vm_page_info_t info,
20133 mach_msg_type_number_t *count)
20134 {
20135 return vm_map_page_range_info_internal(map,
20136 offset_u, /* start of range */
20137 vm_sanitize_compute_ut_end(offset_u, 1), /* this will get rounded in the call to the page boundary */
20138 (int)-1, /* effective_page_shift: unspecified */
20139 flavor,
20140 info,
20141 count);
20142 }
20143
20144 static __attribute__((always_inline, warn_unused_result))
20145 kern_return_t
vm_map_page_range_info_sanitize(vm_map_t map,vm_map_offset_ut start_offset_u,vm_map_offset_ut end_offset_u,vm_map_offset_t effective_page_mask,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_offset_t * offset_in_page)20146 vm_map_page_range_info_sanitize(
20147 vm_map_t map,
20148 vm_map_offset_ut start_offset_u,
20149 vm_map_offset_ut end_offset_u,
20150 vm_map_offset_t effective_page_mask,
20151 vm_map_offset_t *start,
20152 vm_map_offset_t *end,
20153 vm_map_offset_t *offset_in_page)
20154 {
20155 kern_return_t retval;
20156 vm_map_size_t size;
20157
20158 /*
20159 * Perform validation against map's mask but don't align start/end,
20160 * as we need for those to be aligned wrt effective_page_mask
20161 */
20162 retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20163 VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, map,
20164 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
20165 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, start,
20166 end, &size);
20167 if (retval != KERN_SUCCESS) {
20168 return retval;
20169 }
20170
20171 retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20172 VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, effective_page_mask,
20173 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start,
20174 end, &size);
20175 if (retval != KERN_SUCCESS) {
20176 return retval;
20177 }
20178
20179 *offset_in_page = vm_sanitize_offset_in_page(effective_page_mask,
20180 start_offset_u);
20181
20182 return KERN_SUCCESS;
20183 }
20184
20185 kern_return_t
vm_map_page_range_info_internal(vm_map_t map,vm_map_offset_ut start_offset_u,vm_map_offset_ut end_offset_u,int effective_page_shift,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)20186 vm_map_page_range_info_internal(
20187 vm_map_t map,
20188 vm_map_offset_ut start_offset_u,
20189 vm_map_offset_ut end_offset_u,
20190 int effective_page_shift,
20191 vm_page_info_flavor_t flavor,
20192 vm_page_info_t info,
20193 mach_msg_type_number_t *count)
20194 {
20195 vm_map_entry_t map_entry = VM_MAP_ENTRY_NULL;
20196 vm_object_t object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20197 vm_page_t m = VM_PAGE_NULL;
20198 kern_return_t retval = KERN_SUCCESS;
20199 int disposition = 0;
20200 int ref_count = 0;
20201 int depth = 0, info_idx = 0;
20202 vm_page_info_basic_t basic_info = 0;
20203 vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20204 vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20205 boolean_t do_region_footprint;
20206 ledger_amount_t ledger_resident, ledger_compressed;
20207 int effective_page_size;
20208 vm_map_offset_t effective_page_mask;
20209
20210 switch (flavor) {
20211 case VM_PAGE_INFO_BASIC:
20212 if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20213 /*
20214 * The "vm_page_info_basic_data" structure was not
20215 * properly padded, so allow the size to be off by
20216 * one to maintain backwards binary compatibility...
20217 */
20218 if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20219 return KERN_INVALID_ARGUMENT;
20220 }
20221 }
20222 break;
20223 default:
20224 return KERN_INVALID_ARGUMENT;
20225 }
20226
20227 if (effective_page_shift == -1) {
20228 effective_page_shift = vm_self_region_page_shift_safely(map);
20229 if (effective_page_shift == -1) {
20230 return KERN_INVALID_ARGUMENT;
20231 }
20232 }
20233 effective_page_size = (1 << effective_page_shift);
20234 effective_page_mask = effective_page_size - 1;
20235
20236
20237 retval = vm_map_page_range_info_sanitize(map,
20238 start_offset_u,
20239 end_offset_u,
20240 effective_page_mask,
20241 &start,
20242 &end,
20243 &offset_in_page);
20244 if (retval != KERN_SUCCESS) {
20245 return vm_sanitize_get_kr(retval);
20246 }
20247
20248 assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20249
20250 do_region_footprint = task_self_region_footprint();
20251 disposition = 0;
20252 ref_count = 0;
20253 depth = 0;
20254 info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20255
20256 vm_map_lock_read(map);
20257
20258 task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20259
20260 for (curr_s_offset = start; curr_s_offset < end;) {
20261 /*
20262 * New lookup needs reset of these variables.
20263 */
20264 curr_object = object = VM_OBJECT_NULL;
20265 offset_in_object = 0;
20266 ref_count = 0;
20267 depth = 0;
20268
20269 if (do_region_footprint &&
20270 curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20271 /*
20272 * Request for "footprint" info about a page beyond
20273 * the end of address space: this must be for
20274 * the fake region vm_map_region_recurse_64()
20275 * reported to account for non-volatile purgeable
20276 * memory owned by this task.
20277 */
20278 disposition = 0;
20279
20280 if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20281 (unsigned) ledger_compressed) {
20282 /*
20283 * We haven't reported all the "non-volatile
20284 * compressed" pages yet, so report this fake
20285 * page as "compressed".
20286 */
20287 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20288 } else {
20289 /*
20290 * We've reported all the non-volatile
20291 * compressed page but not all the non-volatile
20292 * pages , so report this fake page as
20293 * "resident dirty".
20294 */
20295 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20296 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20297 disposition |= VM_PAGE_QUERY_PAGE_REF;
20298 }
20299 switch (flavor) {
20300 case VM_PAGE_INFO_BASIC:
20301 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20302 basic_info->disposition = disposition;
20303 basic_info->ref_count = 1;
20304 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20305 basic_info->offset = 0;
20306 basic_info->depth = 0;
20307
20308 info_idx++;
20309 break;
20310 }
20311 curr_s_offset += effective_page_size;
20312 continue;
20313 }
20314
20315 /*
20316 * First, find the map entry covering "curr_s_offset", going down
20317 * submaps if necessary.
20318 */
20319 if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20320 /* no entry -> no object -> no page */
20321
20322 if (curr_s_offset < vm_map_min(map)) {
20323 /*
20324 * Illegal address that falls below map min.
20325 */
20326 curr_e_offset = MIN(end, vm_map_min(map));
20327 } else if (curr_s_offset >= vm_map_max(map)) {
20328 /*
20329 * Illegal address that falls on/after map max.
20330 */
20331 curr_e_offset = end;
20332 } else if (map_entry == vm_map_to_entry(map)) {
20333 /*
20334 * Hit a hole.
20335 */
20336 if (map_entry->vme_next == vm_map_to_entry(map)) {
20337 /*
20338 * Empty map.
20339 */
20340 curr_e_offset = MIN(map->max_offset, end);
20341 } else {
20342 /*
20343 * Hole at start of the map.
20344 */
20345 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20346 }
20347 } else {
20348 if (map_entry->vme_next == vm_map_to_entry(map)) {
20349 /*
20350 * Hole at the end of the map.
20351 */
20352 curr_e_offset = MIN(map->max_offset, end);
20353 } else {
20354 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20355 }
20356 }
20357
20358 assert(curr_e_offset >= curr_s_offset);
20359
20360 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20361
20362 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20363
20364 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20365
20366 curr_s_offset = curr_e_offset;
20367
20368 info_idx += num_pages;
20369
20370 continue;
20371 }
20372
20373 /* compute offset from this map entry's start */
20374 offset_in_object = curr_s_offset - map_entry->vme_start;
20375
20376 /* compute offset into this map entry's object (or submap) */
20377 offset_in_object += VME_OFFSET(map_entry);
20378
20379 if (map_entry->is_sub_map) {
20380 vm_map_t sub_map = VM_MAP_NULL;
20381 vm_page_info_t submap_info = 0;
20382 vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20383
20384 range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20385
20386 submap_s_offset = offset_in_object;
20387 submap_e_offset = submap_s_offset + range_len;
20388
20389 sub_map = VME_SUBMAP(map_entry);
20390
20391 vm_map_reference(sub_map);
20392 vm_map_unlock_read(map);
20393
20394 submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20395
20396 assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20397 "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20398
20399 retval = vm_map_page_range_info_internal(sub_map,
20400 submap_s_offset,
20401 submap_e_offset,
20402 effective_page_shift,
20403 VM_PAGE_INFO_BASIC,
20404 (vm_page_info_t) submap_info,
20405 count);
20406
20407 assert(retval == KERN_SUCCESS);
20408
20409 vm_map_lock_read(map);
20410 vm_map_deallocate(sub_map);
20411
20412 /* Move the "info" index by the number of pages we inspected.*/
20413 info_idx += range_len >> effective_page_shift;
20414
20415 /* Move our current offset by the size of the range we inspected.*/
20416 curr_s_offset += range_len;
20417
20418 continue;
20419 }
20420
20421 object = VME_OBJECT(map_entry);
20422
20423 if (object == VM_OBJECT_NULL) {
20424 /*
20425 * We don't have an object here and, hence,
20426 * no pages to inspect. We'll fill up the
20427 * info structure appropriately.
20428 */
20429
20430 curr_e_offset = MIN(map_entry->vme_end, end);
20431
20432 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20433
20434 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20435
20436 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20437
20438 curr_s_offset = curr_e_offset;
20439
20440 info_idx += num_pages;
20441
20442 continue;
20443 }
20444
20445 if (do_region_footprint) {
20446 disposition = 0;
20447 if (map->has_corpse_footprint) {
20448 /*
20449 * Query the page info data we saved
20450 * while forking the corpse.
20451 */
20452 vm_map_corpse_footprint_query_page_info(
20453 map,
20454 curr_s_offset,
20455 &disposition);
20456 } else {
20457 /*
20458 * Query the live pmap for footprint info
20459 * about this page.
20460 */
20461 vm_map_footprint_query_page_info(
20462 map,
20463 map_entry,
20464 curr_s_offset,
20465 &disposition);
20466 }
20467 switch (flavor) {
20468 case VM_PAGE_INFO_BASIC:
20469 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20470 basic_info->disposition = disposition;
20471 basic_info->ref_count = 1;
20472 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20473 basic_info->offset = 0;
20474 basic_info->depth = 0;
20475
20476 info_idx++;
20477 break;
20478 }
20479 curr_s_offset += effective_page_size;
20480 continue;
20481 }
20482
20483 vm_object_reference(object);
20484 /*
20485 * Shared mode -- so we can allow other readers
20486 * to grab the lock too.
20487 */
20488 vm_object_lock_shared(object);
20489
20490 curr_e_offset = MIN(map_entry->vme_end, end);
20491
20492 vm_map_unlock_read(map);
20493
20494 map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20495
20496 curr_object = object;
20497
20498 for (; curr_s_offset < curr_e_offset;) {
20499 if (object == curr_object) {
20500 /* account for our object reference above. */
20501 ref_count = os_ref_get_count_raw(&curr_object->ref_count) - 1;
20502 } else {
20503 ref_count = os_ref_get_count_raw(&curr_object->ref_count);
20504 }
20505
20506 curr_offset_in_object = offset_in_object;
20507
20508 for (;;) {
20509 m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20510
20511 if (m != VM_PAGE_NULL) {
20512 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20513 break;
20514 } else {
20515 if (curr_object->internal &&
20516 curr_object->alive &&
20517 !curr_object->terminating &&
20518 curr_object->pager_ready) {
20519 if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object))
20520 == VM_EXTERNAL_STATE_EXISTS) {
20521 /* the pager has that page */
20522 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20523 break;
20524 }
20525 }
20526
20527 /*
20528 * Go down the VM object shadow chain until we find the page
20529 * we're looking for.
20530 */
20531
20532 if (curr_object->shadow != VM_OBJECT_NULL) {
20533 vm_object_t shadow = VM_OBJECT_NULL;
20534
20535 curr_offset_in_object += curr_object->vo_shadow_offset;
20536 shadow = curr_object->shadow;
20537
20538 vm_object_lock_shared(shadow);
20539 vm_object_unlock(curr_object);
20540
20541 curr_object = shadow;
20542 depth++;
20543 continue;
20544 } else {
20545 break;
20546 }
20547 }
20548 }
20549
20550 /* The ref_count is not strictly accurate, it measures the number */
20551 /* of entities holding a ref on the object, they may not be mapping */
20552 /* the object or may not be mapping the section holding the */
20553 /* target page but its still a ball park number and though an over- */
20554 /* count, it picks up the copy-on-write cases */
20555
20556 /* We could also get a picture of page sharing from pmap_attributes */
20557 /* but this would under count as only faulted-in mappings would */
20558 /* show up. */
20559
20560 if ((curr_object == object) && curr_object->shadow) {
20561 disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20562 }
20563
20564 if (!curr_object->internal) {
20565 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20566 }
20567
20568 if (m != VM_PAGE_NULL) {
20569 if (m->vmp_fictitious) {
20570 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20571 } else {
20572 if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20573 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20574 }
20575
20576 if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20577 disposition |= VM_PAGE_QUERY_PAGE_REF;
20578 }
20579
20580 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20581 disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20582 }
20583
20584 /*
20585 * XXX TODO4K:
20586 * when this routine deals with 4k
20587 * pages, check the appropriate CS bit
20588 * here.
20589 */
20590 if (m->vmp_cs_validated) {
20591 disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20592 }
20593 if (m->vmp_cs_tainted) {
20594 disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20595 }
20596 if (m->vmp_cs_nx) {
20597 disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20598 }
20599 if (m->vmp_reusable || curr_object->all_reusable) {
20600 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20601 }
20602 }
20603 }
20604
20605 switch (flavor) {
20606 case VM_PAGE_INFO_BASIC:
20607 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20608 basic_info->disposition = disposition;
20609 basic_info->ref_count = ref_count;
20610 basic_info->object_id = (vm_object_id_t) (uintptr_t)
20611 VM_KERNEL_ADDRHASH(curr_object);
20612 basic_info->offset =
20613 (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20614 basic_info->depth = depth;
20615
20616 info_idx++;
20617 break;
20618 }
20619
20620 disposition = 0;
20621 offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20622
20623 /*
20624 * Move to next offset in the range and in our object.
20625 */
20626 curr_s_offset += effective_page_size;
20627 offset_in_object += effective_page_size;
20628 curr_offset_in_object = offset_in_object;
20629
20630 if (curr_object != object) {
20631 vm_object_unlock(curr_object);
20632
20633 curr_object = object;
20634
20635 vm_object_lock_shared(curr_object);
20636 } else {
20637 vm_object_lock_yield_shared(curr_object);
20638 }
20639 }
20640
20641 vm_object_unlock(curr_object);
20642 vm_object_deallocate(curr_object);
20643
20644 vm_map_lock_read(map);
20645 }
20646
20647 vm_map_unlock_read(map);
20648 return retval;
20649 }
20650
20651 static __attribute__((always_inline, warn_unused_result))
20652 kern_return_t
vm_map_msync_sanitize(vm_map_t map,vm_map_address_ut address_u,vm_map_size_ut size_u,vm_object_offset_t * address,vm_map_size_t * size)20653 vm_map_msync_sanitize(
20654 vm_map_t map,
20655 vm_map_address_ut address_u,
20656 vm_map_size_ut size_u,
20657 vm_object_offset_t *address,
20658 vm_map_size_t *size)
20659 {
20660 vm_object_offset_t end;
20661
20662 return vm_sanitize_addr_size(address_u, size_u,
20663 VM_SANITIZE_CALLER_VM_MAP_MSYNC,
20664 map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS,
20665 address, &end, size);
20666 }
20667
20668 /*
20669 * vm_map_msync
20670 *
20671 * Synchronises the memory range specified with its backing store
20672 * image by either flushing or cleaning the contents to the appropriate
20673 * memory manager engaging in a memory object synchronize dialog with
20674 * the manager. The client doesn't return until the manager issues
20675 * m_o_s_completed message. MIG Magically converts user task parameter
20676 * to the task's address map.
20677 *
20678 * interpretation of sync_flags
20679 * VM_SYNC_INVALIDATE - discard pages, only return precious
20680 * pages to manager.
20681 *
20682 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20683 * - discard pages, write dirty or precious
20684 * pages back to memory manager.
20685 *
20686 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20687 * - write dirty or precious pages back to
20688 * the memory manager.
20689 *
20690 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
20691 * is a hole in the region, and we would
20692 * have returned KERN_SUCCESS, return
20693 * KERN_INVALID_ADDRESS instead.
20694 *
20695 * NOTE
20696 * The memory object attributes have not yet been implemented, this
20697 * function will have to deal with the invalidate attribute
20698 *
20699 * RETURNS
20700 * KERN_INVALID_TASK Bad task parameter
20701 * KERN_INVALID_ARGUMENT both sync and async were specified.
20702 * KERN_SUCCESS The usual.
20703 * KERN_INVALID_ADDRESS There was a hole in the region.
20704 */
20705
20706 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_ut address_u,vm_map_size_ut size_u,vm_sync_t sync_flags)20707 vm_map_msync(
20708 vm_map_t map,
20709 vm_map_address_ut address_u,
20710 vm_map_size_ut size_u,
20711 vm_sync_t sync_flags)
20712 {
20713 vm_map_entry_t entry;
20714 vm_map_size_t size, amount_left;
20715 vm_object_offset_t address, offset;
20716 vm_object_offset_t start_offset, end_offset;
20717 boolean_t do_sync_req;
20718 boolean_t had_hole = FALSE;
20719 vm_map_offset_t pmap_offset;
20720 kern_return_t kr;
20721
20722 if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20723 (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20724 return KERN_INVALID_ARGUMENT;
20725 }
20726
20727 if (map == VM_MAP_NULL) {
20728 return KERN_INVALID_TASK;
20729 }
20730
20731 kr = vm_map_msync_sanitize(map,
20732 address_u,
20733 size_u,
20734 &address,
20735 &size);
20736 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20737 DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20738 }
20739 if (__improbable(kr != KERN_SUCCESS)) {
20740 return vm_sanitize_get_kr(kr);
20741 }
20742
20743 amount_left = size;
20744
20745 while (amount_left > 0) {
20746 vm_object_size_t flush_size;
20747 vm_object_t object;
20748
20749 vm_map_lock(map);
20750 if (!vm_map_lookup_entry(map,
20751 address,
20752 &entry)) {
20753 vm_map_size_t skip;
20754
20755 /*
20756 * hole in the address map.
20757 */
20758 had_hole = TRUE;
20759
20760 if (sync_flags & VM_SYNC_KILLPAGES) {
20761 /*
20762 * For VM_SYNC_KILLPAGES, there should be
20763 * no holes in the range, since we couldn't
20764 * prevent someone else from allocating in
20765 * that hole and we wouldn't want to "kill"
20766 * their pages.
20767 */
20768 vm_map_unlock(map);
20769 break;
20770 }
20771
20772 /*
20773 * Check for empty map.
20774 */
20775 if (entry == vm_map_to_entry(map) &&
20776 entry->vme_next == entry) {
20777 vm_map_unlock(map);
20778 break;
20779 }
20780 /*
20781 * Check that we don't wrap and that
20782 * we have at least one real map entry.
20783 */
20784 if ((map->hdr.nentries == 0) ||
20785 (entry->vme_next->vme_start < address)) {
20786 vm_map_unlock(map);
20787 break;
20788 }
20789 /*
20790 * Move up to the next entry if needed
20791 */
20792 skip = (entry->vme_next->vme_start - address);
20793 if (skip >= amount_left) {
20794 amount_left = 0;
20795 } else {
20796 amount_left -= skip;
20797 }
20798 address = entry->vme_next->vme_start;
20799 vm_map_unlock(map);
20800 continue;
20801 }
20802
20803 offset = address - entry->vme_start;
20804 pmap_offset = address;
20805
20806 /*
20807 * do we have more to flush than is contained in this
20808 * entry ?
20809 */
20810 if (amount_left + entry->vme_start + offset > entry->vme_end) {
20811 flush_size = entry->vme_end -
20812 (entry->vme_start + offset);
20813 } else {
20814 flush_size = amount_left;
20815 }
20816 amount_left -= flush_size;
20817 address += flush_size;
20818
20819 if (entry->is_sub_map == TRUE) {
20820 vm_map_t local_map;
20821 vm_map_offset_t local_offset;
20822
20823 local_map = VME_SUBMAP(entry);
20824 local_offset = VME_OFFSET(entry);
20825 vm_map_reference(local_map);
20826 vm_map_unlock(map);
20827 if (vm_map_msync(
20828 local_map,
20829 local_offset,
20830 flush_size,
20831 sync_flags) == KERN_INVALID_ADDRESS) {
20832 had_hole = TRUE;
20833 }
20834 vm_map_deallocate(local_map);
20835 continue;
20836 }
20837 object = VME_OBJECT(entry);
20838
20839 /*
20840 * We can't sync this object if the object has not been
20841 * created yet
20842 */
20843 if (object == VM_OBJECT_NULL) {
20844 vm_map_unlock(map);
20845 continue;
20846 }
20847 offset += VME_OFFSET(entry);
20848
20849 vm_object_lock(object);
20850
20851 if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20852 int kill_pages = 0;
20853
20854 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20855 /*
20856 * This is a destructive operation and so we
20857 * err on the side of limiting the range of
20858 * the operation.
20859 */
20860 start_offset = vm_object_round_page(offset);
20861 end_offset = vm_object_trunc_page(offset + flush_size);
20862
20863 if (end_offset <= start_offset) {
20864 vm_object_unlock(object);
20865 vm_map_unlock(map);
20866 continue;
20867 }
20868
20869 pmap_offset += start_offset - offset;
20870 } else {
20871 start_offset = offset;
20872 end_offset = offset + flush_size;
20873 }
20874
20875 if (sync_flags & VM_SYNC_KILLPAGES) {
20876 if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
20877 ((object->copy_strategy !=
20878 MEMORY_OBJECT_COPY_SYMMETRIC) &&
20879 (object->vo_copy == VM_OBJECT_NULL))) &&
20880 (object->shadow == VM_OBJECT_NULL)) {
20881 if (os_ref_get_count_raw(&object->ref_count) != 1) {
20882 vm_page_stats_reusable.free_shared++;
20883 }
20884 kill_pages = 1;
20885 } else {
20886 kill_pages = -1;
20887 }
20888 }
20889 if (kill_pages != -1) {
20890 vm_object_deactivate_pages(
20891 object,
20892 start_offset,
20893 (vm_object_size_t) (end_offset - start_offset),
20894 kill_pages,
20895 FALSE, /* reusable_pages */
20896 FALSE, /* reusable_no_write */
20897 map->pmap,
20898 pmap_offset);
20899 }
20900 vm_object_unlock(object);
20901 vm_map_unlock(map);
20902 continue;
20903 }
20904 /*
20905 * We can't sync this object if there isn't a pager.
20906 * Don't bother to sync internal objects, since there can't
20907 * be any "permanent" storage for these objects anyway.
20908 */
20909 if ((object->pager == MEMORY_OBJECT_NULL) ||
20910 (object->internal) || (object->private)) {
20911 vm_object_unlock(object);
20912 vm_map_unlock(map);
20913 continue;
20914 }
20915 /*
20916 * keep reference on the object until syncing is done
20917 */
20918 vm_object_reference_locked(object);
20919 vm_object_unlock(object);
20920
20921 vm_map_unlock(map);
20922
20923 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20924 start_offset = vm_object_trunc_page(offset);
20925 end_offset = vm_object_round_page(offset + flush_size);
20926 } else {
20927 start_offset = offset;
20928 end_offset = offset + flush_size;
20929 }
20930
20931 do_sync_req = vm_object_sync(object,
20932 start_offset,
20933 (end_offset - start_offset),
20934 sync_flags & VM_SYNC_INVALIDATE,
20935 ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20936 (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20937 sync_flags & VM_SYNC_SYNCHRONOUS);
20938
20939 if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20940 /*
20941 * clear out the clustering and read-ahead hints
20942 */
20943 vm_object_lock(object);
20944
20945 object->pages_created = 0;
20946 object->pages_used = 0;
20947 object->sequential = 0;
20948 object->last_alloc = 0;
20949
20950 vm_object_unlock(object);
20951 }
20952 vm_object_deallocate(object);
20953 } /* while */
20954
20955 /* for proper msync() behaviour */
20956 if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20957 return KERN_INVALID_ADDRESS;
20958 }
20959
20960 return KERN_SUCCESS;
20961 }/* vm_msync */
20962
/*
 * Associate "object" with "named_entry" by wrapping it in a one-entry
 * vm_map_copy covering [offset, offset + size).  The named entry must
 * not already be backed by anything (not a submap, not a copy, not an
 * object).  On return the entry is marked "is_object" and inherits
 * "internal" from the object.  Takes no additional reference on
 * "object" itself.
 */
void
vm_named_entry_associate_vm_object(
	vm_named_entry_t        named_entry,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_object_size_t        size,
	vm_prot_t               prot)
{
	vm_map_copy_t copy;
	vm_map_entry_t copy_entry;

	/* the named entry must be pristine: no backing store of any kind */
	assert(!named_entry->is_sub_map);
	assert(!named_entry->is_copy);
	assert(!named_entry->is_object);
	assert(!named_entry->internal);
	assert(named_entry->backing.copy == VM_MAP_COPY_NULL);

	/* build a single-entry copy list describing the requested range */
	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
	copy->offset = offset;
	copy->size = size;
	copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;

	copy_entry = vm_map_copy_entry_create(copy);
	copy_entry->protection = prot;
	copy_entry->max_protection = prot;
	copy_entry->use_pmap = TRUE;
	/* entry bounds are the requested range rounded out to page edges */
	copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
	copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
	VME_OBJECT_SET(copy_entry, object, false, 0);
	VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
	vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);

	/* hand the copy to the named entry and flag its new backing kind */
	named_entry->backing.copy = copy;
	named_entry->is_object = TRUE;
	if (object->internal) {
		named_entry->internal = TRUE;
	}

	DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
	    named_entry, copy, object, offset, size, prot);
}
21004
/*
 * Return the VM object backing "named_entry".  The entry must be
 * object-backed ("is_object"), i.e. its backing vm_map_copy holds
 * exactly one entry, whose object is returned.  No reference is taken
 * on the returned object; the caller must arrange its own lifetime
 * guarantees.
 */
vm_object_t
vm_named_entry_to_vm_object(
	vm_named_entry_t        named_entry)
{
	vm_map_copy_t copy;
	vm_map_entry_t copy_entry;
	vm_object_t object;

	/* must be an object-backed entry, nothing else */
	assert(!named_entry->is_sub_map);
	assert(!named_entry->is_copy);
	assert(named_entry->is_object);
	copy = named_entry->backing.copy;
	assert(copy != VM_MAP_COPY_NULL);
	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);
	assert(copy->cpy_hdr.nentries == 1);
	copy_entry = vm_map_copy_first_entry(copy);
	object = VME_OBJECT(copy_entry);

	DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);

	return object;
}
21031
21032 /*
21033 * Routine: convert_port_entry_to_map
21034 * Purpose:
21035 * Convert from a port specifying an entry or a task
21036 * to a map. Doesn't consume the port ref; produces a map ref,
 * which may be null. Unlike convert_port_to_map, the
 * port may be either a task port or a named-entry port.
21039 * Conditions:
21040 * Nothing locked.
21041 */
21042
21043 vm_map_t
convert_port_entry_to_map(ipc_port_t port)21044 convert_port_entry_to_map(
21045 ipc_port_t port)
21046 {
21047 vm_map_t map = VM_MAP_NULL;
21048 vm_named_entry_t named_entry;
21049
21050 if (!IP_VALID(port)) {
21051 return VM_MAP_NULL;
21052 }
21053
21054 if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
21055 return convert_port_to_map(port);
21056 }
21057
21058 named_entry = mach_memory_entry_from_port(port);
21059
21060 if ((named_entry->is_sub_map) &&
21061 (named_entry->protection & VM_PROT_WRITE)) {
21062 map = named_entry->backing.map;
21063 if (map->pmap != PMAP_NULL) {
21064 if (map->pmap == kernel_pmap) {
21065 panic("userspace has access "
21066 "to a kernel map %p", map);
21067 }
21068 pmap_require(map->pmap);
21069 }
21070 vm_map_reference(map);
21071 }
21072
21073 return map;
21074 }
21075
21076 /*
21077 * Export routines to other components for the things we access locally through
21078 * macros.
21079 */
21080 #undef current_map
21081 vm_map_t
current_map(void)21082 current_map(void)
21083 {
21084 return current_map_fast();
21085 }
21086
21087 /*
21088 * vm_map_reference:
21089 *
21090 * Takes a reference on the specified map.
21091 */
21092 void
vm_map_reference(vm_map_t map)21093 vm_map_reference(
21094 vm_map_t map)
21095 {
21096 if (__probable(map != VM_MAP_NULL)) {
21097 vm_map_require(map);
21098 os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
21099 }
21100 }
21101
21102 /*
21103 * vm_map_deallocate:
21104 *
21105 * Removes a reference from the specified map,
21106 * destroying it if no references remain.
21107 * The map should not be locked.
21108 */
21109 void
vm_map_deallocate(vm_map_t map)21110 vm_map_deallocate(
21111 vm_map_t map)
21112 {
21113 if (__probable(map != VM_MAP_NULL)) {
21114 vm_map_require(map);
21115 if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
21116 vm_map_destroy(map);
21117 }
21118 }
21119 }
21120
21121 void
vm_map_inspect_deallocate(vm_map_inspect_t map)21122 vm_map_inspect_deallocate(
21123 vm_map_inspect_t map)
21124 {
21125 vm_map_deallocate((vm_map_t)map);
21126 }
21127
21128 void
vm_map_read_deallocate(vm_map_read_t map)21129 vm_map_read_deallocate(
21130 vm_map_read_t map)
21131 {
21132 vm_map_deallocate((vm_map_t)map);
21133 }
21134
21135
21136 void
vm_map_disable_NX(vm_map_t map)21137 vm_map_disable_NX(vm_map_t map)
21138 {
21139 if (map == NULL) {
21140 return;
21141 }
21142 if (map->pmap == NULL) {
21143 return;
21144 }
21145
21146 pmap_disable_NX(map->pmap);
21147 }
21148
21149 void
vm_map_disallow_data_exec(vm_map_t map)21150 vm_map_disallow_data_exec(vm_map_t map)
21151 {
21152 if (map == NULL) {
21153 return;
21154 }
21155
21156 map->map_disallow_data_exec = TRUE;
21157 }
21158
21159 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21160 * more descriptive.
21161 */
21162 void
vm_map_set_32bit(vm_map_t map)21163 vm_map_set_32bit(vm_map_t map)
21164 {
21165 #if defined(__arm64__)
21166 map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21167 #else
21168 map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21169 #endif
21170 }
21171
21172
21173 void
vm_map_set_64bit(vm_map_t map)21174 vm_map_set_64bit(vm_map_t map)
21175 {
21176 #if defined(__arm64__)
21177 map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21178 #else
21179 map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21180 #endif
21181 }
21182
21183 /*
21184 * Expand the maximum size of an existing map to 64GB.
21185 */
21186 void
vm_map_set_jumbo(vm_map_t map)21187 vm_map_set_jumbo(vm_map_t map)
21188 {
21189 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21190 vm_map_set_max_addr(map, ~0, false);
21191 #else /* arm64 */
21192 (void) map;
21193 #endif
21194 }
21195
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
/*
 * Expand the maximum size of an existing map to the maximum supported.
 */
void
vm_map_set_extra_jumbo(vm_map_t map)
{
#if !defined(__arm64__) || XNU_TARGET_OS_OSX
	(void)map;
#else
	vm_map_set_max_addr(map, ~0, true);
#endif
}
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21210
21211 /*
21212 * This map has a JIT entitlement
21213 */
21214 void
vm_map_set_jit_entitled(vm_map_t map)21215 vm_map_set_jit_entitled(vm_map_t map)
21216 {
21217 #if defined (__arm64__)
21218 pmap_set_jit_entitled(map->pmap);
21219 #else /* arm64 */
21220 (void) map;
21221 #endif
21222 }
21223
21224 /*
21225 * Get status of this maps TPRO flag
21226 */
21227 boolean_t
vm_map_tpro(vm_map_t map)21228 vm_map_tpro(vm_map_t map)
21229 {
21230 #if defined (__arm64e__)
21231 return pmap_get_tpro(map->pmap);
21232 #else /* arm64e */
21233 (void) map;
21234 return FALSE;
21235 #endif
21236 }
21237
21238 /*
21239 * This map has TPRO enabled
21240 */
21241 void
vm_map_set_tpro(vm_map_t map)21242 vm_map_set_tpro(vm_map_t map)
21243 {
21244 #if defined (__arm64e__)
21245 pmap_set_tpro(map->pmap);
21246 #else /* arm64e */
21247 (void) map;
21248 #endif
21249 }
21250
21251 /*
21252 * Does this map have TPRO enforcement enabled
21253 */
21254 boolean_t
vm_map_tpro_enforcement(vm_map_t map)21255 vm_map_tpro_enforcement(vm_map_t map)
21256 {
21257 return map->tpro_enforcement;
21258 }
21259
21260 /*
21261 * Set TPRO enforcement for this map
21262 */
21263 void
vm_map_set_tpro_enforcement(vm_map_t map)21264 vm_map_set_tpro_enforcement(vm_map_t map)
21265 {
21266 if (vm_map_tpro(map)) {
21267 vm_map_lock(map);
21268 map->tpro_enforcement = TRUE;
21269 vm_map_unlock(map);
21270 }
21271 }
21272
21273 /*
21274 * Enable TPRO on the requested region
21275 *
21276 * Note:
21277 * This routine is primarily intended to be called during/soon after map
21278 * creation before the associated task has been released to run. It is only
21279 * currently safe when we have no resident pages.
21280 */
21281 boolean_t
vm_map_set_tpro_range(__unused vm_map_t map,__unused vm_map_address_t start,__unused vm_map_address_t end)21282 vm_map_set_tpro_range(
21283 __unused vm_map_t map,
21284 __unused vm_map_address_t start,
21285 __unused vm_map_address_t end)
21286 {
21287 return TRUE;
21288 }
21289
21290 /*
21291 * Expand the maximum size of an existing map.
21292 */
21293 void
vm_map_set_max_addr(vm_map_t map,vm_map_offset_t new_max_offset,__unused bool extra_jumbo)21294 vm_map_set_max_addr(
21295 vm_map_t map,
21296 vm_map_offset_t new_max_offset,
21297 __unused bool extra_jumbo)
21298 {
21299 #if defined(__arm64__)
21300 vm_map_offset_t max_supported_offset;
21301 vm_map_offset_t old_max_offset;
21302 unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO;
21303
21304 vm_map_lock(map);
21305
21306 old_max_offset = map->max_offset;
21307 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21308 if (extra_jumbo) {
21309 option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO;
21310 }
21311 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21312 max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option);
21313
21314 new_max_offset = trunc_page(new_max_offset);
21315
21316 /* The address space cannot be shrunk using this routine. */
21317 if (old_max_offset >= new_max_offset) {
21318 vm_map_unlock(map);
21319 return;
21320 }
21321
21322 if (max_supported_offset < new_max_offset) {
21323 new_max_offset = max_supported_offset;
21324 }
21325
21326 map->max_offset = new_max_offset;
21327
21328 /*
21329 * Disable the following chunk of code that extends the "holes" list
21330 * to accomodate a larger VM map.
21331 * In `vm_map_create_options()`, we now set the end of the "holes" list to
21332 * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms.
21333 * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process
21334 * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS.
21335 * The "holes" list does not need to be adjusted.
21336 */
21337 #if 0
21338 if (map->holelistenabled) {
21339 if (map->holes_list->prev->vme_end == old_max_offset) {
21340 /*
21341 * There is already a hole at the end of the map; simply make it bigger.
21342 */
21343 map->holes_list->prev->vme_end = map->max_offset;
21344 } else {
21345 /*
21346 * There is no hole at the end, so we need to create a new hole
21347 * for the new empty space we're creating.
21348 */
21349 struct vm_map_links *new_hole;
21350
21351 new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
21352 new_hole->start = old_max_offset;
21353 new_hole->end = map->max_offset;
21354 new_hole->prev = map->holes_list->prev;
21355 new_hole->next = (struct vm_map_entry *)map->holes_list;
21356 map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21357 map->holes_list->prev = (struct vm_map_entry *)new_hole;
21358 }
21359 }
21360 #endif
21361
21362 vm_map_unlock(map);
21363 #else
21364 (void)map;
21365 (void)new_max_offset;
21366 #endif
21367 }
21368
21369 vm_map_offset_t
vm_compute_max_offset(boolean_t is64)21370 vm_compute_max_offset(boolean_t is64)
21371 {
21372 #if defined(__arm64__)
21373 return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21374 #else
21375 return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21376 #endif
21377 }
21378
21379 void
vm_map_get_max_aslr_slide_section(vm_map_t map __unused,int64_t * max_sections,int64_t * section_size)21380 vm_map_get_max_aslr_slide_section(
21381 vm_map_t map __unused,
21382 int64_t *max_sections,
21383 int64_t *section_size)
21384 {
21385 #if defined(__arm64__)
21386 *max_sections = 3;
21387 *section_size = ARM_TT_TWIG_SIZE;
21388 #else
21389 *max_sections = 1;
21390 *section_size = 0;
21391 #endif
21392 }
21393
21394 uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)21395 vm_map_get_max_aslr_slide_pages(vm_map_t map)
21396 {
21397 #if defined(__arm64__)
21398 /* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21399 * limited embedded address space; this is also meant to minimize pmap
21400 * memory usage on 16KB page systems.
21401 */
21402 return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21403 #else
21404 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21405 #endif
21406 }
21407
21408 uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)21409 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21410 {
21411 #if defined(__arm64__)
21412 /* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21413 * of independent entropy on 16KB page systems.
21414 */
21415 return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21416 #else
21417 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21418 #endif
21419 }
21420
21421 boolean_t
vm_map_is_64bit(vm_map_t map)21422 vm_map_is_64bit(
21423 vm_map_t map)
21424 {
21425 return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21426 }
21427
21428 boolean_t
vm_map_has_hard_pagezero(vm_map_t map,vm_map_offset_t pagezero_size)21429 vm_map_has_hard_pagezero(
21430 vm_map_t map,
21431 vm_map_offset_t pagezero_size)
21432 {
21433 /*
21434 * XXX FBDP
21435 * We should lock the VM map (for read) here but we can get away
21436 * with it for now because there can't really be any race condition:
21437 * the VM map's min_offset is changed only when the VM map is created
21438 * and when the zero page is established (when the binary gets loaded),
21439 * and this routine gets called only when the task terminates and the
21440 * VM map is being torn down, and when a new map is created via
21441 * load_machfile()/execve().
21442 */
21443 return map->min_offset >= pagezero_size;
21444 }
21445
/*
 * Raise a VM map's maximum offset.
 */
21449 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)21450 vm_map_raise_max_offset(
21451 vm_map_t map,
21452 vm_map_offset_t new_max_offset)
21453 {
21454 kern_return_t ret;
21455
21456 vm_map_lock(map);
21457 ret = KERN_INVALID_ADDRESS;
21458
21459 if (new_max_offset >= map->max_offset) {
21460 if (!vm_map_is_64bit(map)) {
21461 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21462 map->max_offset = new_max_offset;
21463 ret = KERN_SUCCESS;
21464 }
21465 } else {
21466 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21467 map->max_offset = new_max_offset;
21468 ret = KERN_SUCCESS;
21469 }
21470 }
21471 }
21472
21473 vm_map_unlock(map);
21474 return ret;
21475 }
21476
21477
/*
 * Raise a VM map's minimum offset.
 * To strictly enforce "page zero" reservation.
 */
kern_return_t
vm_map_raise_min_offset(
	vm_map_t map,
	vm_map_offset_t new_min_offset)
{
	vm_map_entry_t first_entry;

	/* round up to this map's page size before validating the range */
	new_min_offset = vm_map_round_page(new_min_offset,
	    VM_MAP_PAGE_MASK(map));

	vm_map_lock(map);

	if (new_min_offset < map->min_offset) {
		/*
		 * Can't move min_offset backwards, as that would expose
		 * a part of the address space that was previously, and for
		 * possibly good reasons, inaccessible.
		 */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}
	if (new_min_offset >= map->max_offset) {
		/* can't go beyond the end of the address space */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	first_entry = vm_map_first_entry(map);
	if (first_entry != vm_map_to_entry(map) &&
	    first_entry->vme_start < new_min_offset) {
		/*
		 * Some memory was already allocated below the new
		 * minimum offset. It's too late to change it now...
		 */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	map->min_offset = new_min_offset;

	/* keep the "holes" bookkeeping in sync with the new floor */
	if (map->holelistenabled) {
		assert(map->holes_list);
		map->holes_list->start = new_min_offset;
		assert(new_min_offset < map->holes_list->end);
	}

	vm_map_unlock(map);

	return KERN_SUCCESS;
}
21532
21533 /*
21534 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21535 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21536 * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21537 * have to reach over to the BSD data structures.
21538 */
21539
21540 uint64_t vm_map_set_size_limit_count = 0;
21541 kern_return_t
vm_map_set_size_limit(vm_map_t map,uint64_t new_size_limit)21542 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21543 {
21544 kern_return_t kr;
21545
21546 vm_map_lock(map);
21547 if (new_size_limit < map->size) {
21548 /* new limit should not be lower than its current size */
21549 DTRACE_VM2(vm_map_set_size_limit_fail,
21550 vm_map_size_t, map->size,
21551 uint64_t, new_size_limit);
21552 kr = KERN_FAILURE;
21553 } else if (new_size_limit == map->size_limit) {
21554 /* no change */
21555 kr = KERN_SUCCESS;
21556 } else {
21557 /* set new limit */
21558 DTRACE_VM2(vm_map_set_size_limit,
21559 vm_map_size_t, map->size,
21560 uint64_t, new_size_limit);
21561 if (new_size_limit != RLIM_INFINITY) {
21562 vm_map_set_size_limit_count++;
21563 }
21564 map->size_limit = new_size_limit;
21565 kr = KERN_SUCCESS;
21566 }
21567 vm_map_unlock(map);
21568 return kr;
21569 }
21570
21571 uint64_t vm_map_set_data_limit_count = 0;
21572 kern_return_t
vm_map_set_data_limit(vm_map_t map,uint64_t new_data_limit)21573 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21574 {
21575 kern_return_t kr;
21576
21577 vm_map_lock(map);
21578 if (new_data_limit < map->size) {
21579 /* new limit should not be lower than its current size */
21580 DTRACE_VM2(vm_map_set_data_limit_fail,
21581 vm_map_size_t, map->size,
21582 uint64_t, new_data_limit);
21583 kr = KERN_FAILURE;
21584 } else if (new_data_limit == map->data_limit) {
21585 /* no change */
21586 kr = KERN_SUCCESS;
21587 } else {
21588 /* set new limit */
21589 DTRACE_VM2(vm_map_set_data_limit,
21590 vm_map_size_t, map->size,
21591 uint64_t, new_data_limit);
21592 if (new_data_limit != RLIM_INFINITY) {
21593 vm_map_set_data_limit_count++;
21594 }
21595 map->data_limit = new_data_limit;
21596 kr = KERN_SUCCESS;
21597 }
21598 vm_map_unlock(map);
21599 return kr;
21600 }
21601
/* Set the per-map user wired-memory limit (mirror of RLIMIT_MEMLOCK). */
void
vm_map_set_user_wire_limit(vm_map_t map,
    vm_size_t limit)
{
	vm_map_lock(map);
	map->user_wire_limit = limit;
	vm_map_unlock(map);
}
21610
21611
/* Set the map's switch_protect flag under the map lock. */
void
vm_map_switch_protect(vm_map_t map,
    boolean_t val)
{
	vm_map_lock(map);
	map->switch_protect = val;
	vm_map_unlock(map);
}
21620
21621 extern int cs_process_enforcement_enable;
21622 boolean_t
vm_map_cs_enforcement(vm_map_t map)21623 vm_map_cs_enforcement(
21624 vm_map_t map)
21625 {
21626 if (cs_process_enforcement_enable) {
21627 return TRUE;
21628 }
21629 return map->cs_enforcement;
21630 }
21631
21632 kern_return_t
vm_map_cs_wx_enable(__unused vm_map_t map)21633 vm_map_cs_wx_enable(
21634 __unused vm_map_t map)
21635 {
21636 #if CODE_SIGNING_MONITOR
21637 kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21638 if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21639 return KERN_SUCCESS;
21640 }
21641 return ret;
21642 #else
21643 /* The VM manages WX memory entirely on its own */
21644 return KERN_SUCCESS;
21645 #endif
21646 }
21647
/*
 * Ask the code-signing monitor (when present) whether a JIT region is
 * allowed in this map; trivially allowed when no monitor is configured.
 */
kern_return_t
vm_map_csm_allow_jit(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	return csm_allow_jit_region(vm_map_pmap(map));
#else
	/* No code signing monitor to enforce JIT policy */
	return KERN_SUCCESS;
#endif
}
21659
/* Set the map's cs_debugged flag under the map lock. */
void
vm_map_cs_debugged_set(
	vm_map_t map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_debugged = val;
	vm_map_unlock(map);
}
21669
/*
 * Set the map's cs_enforcement flag, keeping the pmap's copy of the
 * flag in sync, all under the map lock.
 */
void
vm_map_cs_enforcement_set(
	vm_map_t map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_enforcement = val;
	pmap_set_vm_map_cs_enforced(map->pmap, val);
	vm_map_unlock(map);
}
21680
/*
 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
 * bump both counters.
 */
void
vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
{
	pmap_t pmap = vm_map_pmap(map);

	ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
	ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
}
21694
/*
 * Inverse of vm_map_iokit_mapped_region(): an IOKit mapping was removed,
 * so debit both the iokit_mapped and composite phys_footprint ledgers.
 */
void
vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
{
	pmap_t pmap = vm_map_pmap(map);

	ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
	ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
}
21703
/* Add (generate) code signature for memory range */
#if CONFIG_DYNAMIC_CODE_SIGNING
kern_return_t
vm_map_sign(vm_map_t map,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlying object. Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/* reject ranges whose start/size arithmetic would wrap */
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Take the object lock before dropping the map lock, then walk the
	 * object's pages without the map lock held.
	 */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* should we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
#endif
21803
/*
 * Walk the map and delete entries whose internal VM object is referenced
 * only by this mapping (ref_count == 1), accumulating counts of the
 * resident and compressed pages being reclaimed into the out-parameters.
 * NOTE(review): the out-parameters are accumulated into, not zeroed here —
 * callers presumably initialize them; confirm at call sites.
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t next_entry;
	kern_return_t kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		/* capture the successor first, since "entry" may be deleted below */
		next_entry = entry->vme_next;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (os_ref_get_count_raw(&VME_OBJECT(entry)->ref_count) == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	vm_map_unlock(map);

	/* dispose of the zapped entries after dropping the map lock */
	vm_map_zap_dispose(&zap_list);

	return kr;
}
21838
21839
21840 #if DEVELOPMENT || DEBUG
21841
/*
 * Remove all pmap mappings for this map (optionally un-nesting shared
 * submap regions first so only this task's pmap is affected).
 * Returns the number of map pages that were resident, per the
 * phys_mem ledger balance sampled before the disconnect.
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by
				 * this task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	vm_map_lock_read(map);

	/* sample the resident byte count before disconnecting anything */
	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		/* skip entries with no object, or with physically-contiguous objects */
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		if (entry->is_sub_map) {
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	/* convert the sampled byte count into map-sized pages */
	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
21891
/*
 * Test hook: inject an error into the compressor pager backing the page
 * at "vaddr". Returns KERN_MEMORY_ERROR when no object backs the address,
 * KERN_MEMORY_PRESENT when the object has no pager (page not compressed),
 * otherwise the result of vm_compressor_pager_inject_error().
 */
kern_return_t
vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_object_t object = NULL;
	vm_object_offset_t offset;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_version_t version;
	vm_map_t real_map;
	int result = KERN_FAILURE;

	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
	vm_map_lock(map);

	/* lookup may descend into submaps; "map" can be updated and "real_map" locked */
	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
	    NULL, &real_map, NULL);
	if (object == NULL) {
		result = KERN_MEMORY_ERROR;
	} else if (object->pager) {
		result = vm_compressor_pager_inject_error(object->pager,
		    offset);
	} else {
		result = KERN_MEMORY_PRESENT;
	}

	if (object != NULL) {
		vm_object_unlock(object);
	}

	/* drop the extra lock taken when the lookup crossed into a submap */
	if (real_map != map) {
		vm_map_unlock(real_map);
	}
	vm_map_unlock(map);

	return result;
}
21929
21930 /* iterate over map entries. Call the first argument block for the number of entries and the second for every entry
21931 * returns: KERN_SUCCESS if iteration completed ok,
21932 * error code if callback returned an error
21933 * KERN_FAILURE if there was a race of adding/removing entries during the iteration and the number of entries
21934 * iterated is different from the number in the first call
21935 */
21936 static kern_return_t
21937 vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries),
21938 kern_return_t (^entry_handler)(void* entry))
21939 {
21940 vm_map_lock_assert_held(map);
21941 int nentries = map->hdr.nentries;
21942 kern_return_t error = count_handler(nentries);
21943 if (error) {
21944 return error;
21945 }
21946
21947 /* iterate until we loop back to the map, see get_vmmap_entries() */
21948 vm_map_entry_t entry = vm_map_first_entry(map);
21949 int count = 0;
21950 while (entry != vm_map_to_entry(map)) {
21951 error = entry_handler(entry);
21952 if (error != KERN_SUCCESS) {
21953 return error;
21954 }
21955 entry = entry->vme_next;
21956 ++count;
21957 if (count > nentries) {
21958 /* nentries and entries iteration don't agree on how many entries there are, shouldn't really happen */
21959 return KERN_FAILURE;
21960 }
21961 }
21962 if (count < nentries) {
21963 return KERN_FAILURE;
21964 }
21965 return KERN_SUCCESS;
21966 }
21967
21968 kern_return_t
21969 vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries),
21970 kern_return_t (^entry_handler)(void* entry))
21971 {
21972 vm_map_lock_read(map);
21973 kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler);
21974 vm_map_unlock_read(map);
21975 return error;
21976 }
21977
21978 /*
21979 * Dump info about the entry into the given buffer.
21980 * return true on success, false if there was not enough space in the give buffer
21981 * argument size in: bytes free in the given buffer, out: bytes written
21982 */
21983 kern_return_t
vm_map_dump_entry_and_compressor_pager(void * pentry,char * buf,size_t * size)21984 vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size)
21985 {
21986 size_t insize = *size;
21987 kern_return_t kr;
21988 size_t offset = 0;
21989
21990 *size = 0;
21991 if (sizeof(struct vm_map_entry_info) > insize) {
21992 return KERN_NO_SPACE;
21993 }
21994
21995 vm_map_entry_t entry = (vm_map_entry_t)pentry;
21996 struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf;
21997 out_entry->vmei_start = entry->vme_start;
21998 out_entry->vmei_end = entry->vme_end;
21999 out_entry->vmei_alias = VME_ALIAS(entry);
22000 out_entry->vmei_offset = VME_OFFSET(entry);
22001 out_entry->vmei_is_sub_map = entry->is_sub_map;
22002 out_entry->vmei_protection = entry->protection;
22003 offset += sizeof(struct vm_map_entry_info);
22004
22005 out_entry->vmei_slot_mapping_count = 0;
22006 out_entry->vmei_is_compressor_pager = false;
22007 *size = offset;
22008 if (out_entry->vmei_is_sub_map) {
22009 return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet
22010 }
22011 /* have a vm_object? */
22012 vm_object_t object = VME_OBJECT(entry);
22013 if (object == VM_OBJECT_NULL || !object->internal) {
22014 return KERN_SUCCESS;
22015 }
22016 /* objects has a pager? */
22017 memory_object_t pager = object->pager;
22018 if (pager != MEMORY_OBJECT_NULL) {
22019 return KERN_SUCCESS;
22020 }
22021 bool is_compressor = false;
22022 unsigned int slot_mapping_count = 0;
22023 size_t pager_info_size = insize - offset;
22024 kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count);
22025 if (kr != KERN_SUCCESS) {
22026 /* didn't have enough space for everything we want to write, caller needs to retry */
22027 return kr;
22028 }
22029 offset += pager_info_size;
22030 /* if we got here, is_compressor should be true due to the object->internal check above, so this assignment
22031 * is just for sanity sake */
22032 out_entry->vmei_is_compressor_pager = is_compressor;
22033 out_entry->vmei_slot_mapping_count = slot_mapping_count;
22034 *size = offset;
22035 return KERN_SUCCESS;
22036 }
22037
22038
22039 #endif
22040
22041
22042 #if CONFIG_FREEZE
22043
22044
/* Freezer state shared with the compressor layer (defined elsewhere). */
extern struct freezer_context freezer_context_global;
/* Uptime of the last freezer yield; reset at the start of each freeze pass. */
AbsoluteTime c_freezer_last_yield_ts = 0;

/* Freeze policy thresholds maintained by memorystatus (BSD side). */
extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
22050
/*
 * Freeze a task's map: push eligible dirty pages into the compressor,
 * up to "dirty_budget" pages.
 *
 * With freezer swap active this runs in two passes over the map:
 * an evaluation pass that only tallies private vs. shared dirty pages
 * (to reject tasks with too much shared memory or a poor private/shared
 * ratio), then — unless "eval_only" — a second pass that purges this
 * task's eligible purgeable memory and compresses the rest.
 * Without swap, only the compression pass runs ("eval_only" is invalid).
 * On failure, "*freezer_error_code" describes the reason.
 */
kern_return_t
vm_map_freeze(
	task_t task,
	unsigned int *purgeable_count,
	unsigned int *wired_count,
	unsigned int *clean_count,
	unsigned int *dirty_count,
	unsigned int dirty_budget,
	unsigned int *shared_count,
	int *freezer_error_code,
	boolean_t eval_only)
{
	vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
	kern_return_t kr = KERN_SUCCESS;
	boolean_t evaluation_phase = TRUE;
	vm_object_t cur_shared_object = NULL;
	int cur_shared_obj_ref_cnt = 0;
	unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;

	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;

	/*
	 * We need the exclusive lock here so that we can
	 * block any page faults or lookups while we are
	 * in the middle of freezing this vm map.
	 */
	vm_map_t map = task->map;

	vm_map_lock(map);

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	/* bail out early if the compressor/swap can't absorb more pages */
	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
		if (vm_compressor_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
		}

		if (vm_swap_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
		}

		kr = KERN_NO_SPACE;
		goto done;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
		/*
		 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
again:

	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t src_object;

		if (entry2->is_sub_map) {
			continue;
		}

		/* only internal (anonymous), non-contiguous objects are freezable */
		src_object = VME_OBJECT(entry2);
		if (!src_object ||
		    src_object->phys_contiguous ||
		    !src_object->internal) {
			continue;
		}

		/* If eligible, scan the entry, moving eligible pages over to our parent object */

		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			/*
			 * We skip purgeable objects during evaluation phase only.
			 * If we decide to freeze this process, we'll explicitly
			 * purge these objects before we go around again with
			 * 'evaluation_phase' set to FALSE.
			 */

			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
				/*
				 * We want to purge objects that may not belong to this task but are mapped
				 * in this task alone. Since we already purged this task's purgeable memory
				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
				 * on this task's purgeable objects. Hence the check for only volatile objects.
				 */
				if (evaluation_phase ||
				    src_object->purgable != VM_PURGABLE_VOLATILE ||
				    os_ref_get_count_raw(&src_object->ref_count) != 1) {
					continue;
				}
				vm_object_lock(src_object);
				/* re-check under the object lock before purging */
				if (src_object->purgable == VM_PURGABLE_VOLATILE &&
				    os_ref_get_count_raw(&src_object->ref_count) == 1) {
					purgeable_q_t old_queue;

					/* object should be on a purgeable queue */
					assert(src_object->objq.next != NULL &&
					    src_object->objq.prev != NULL);
					/* move object from its volatile queue to the nonvolatile queue */
					old_queue = vm_purgeable_object_remove(src_object);
					assert(old_queue);
					if (src_object->purgeable_when_ripe) {
						/* remove a token from that volatile queue */
						vm_page_lock_queues();
						vm_purgeable_token_delete_first(old_queue);
						vm_page_unlock_queues();
					}
					/* purge the object */
					vm_object_purge(src_object, 0);
				}
				vm_object_unlock(src_object);
				continue;
			}

			/*
			 * Pages belonging to this object could be swapped to disk.
			 * Make sure it's not a shared object because we could end
			 * up just bringing it back in again.
			 *
			 * We try to optimize somewhat by checking for objects that are mapped
			 * more than once within our own map. But we don't do full searches,
			 * we just look at the entries following our current entry.
			 */

			if (os_ref_get_count_raw(&src_object->ref_count) > 1) {
				if (src_object != cur_shared_object) {
					/* first sighting: count it as shared for now */
					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					dirty_shared_count += obj_pages_snapshot;

					cur_shared_object = src_object;
					cur_shared_obj_ref_cnt = 1;
					continue;
				} else {
					cur_shared_obj_ref_cnt++;
					if (os_ref_get_count_raw(&src_object->ref_count) == cur_shared_obj_ref_cnt) {
						/*
						 * Fall through to below and treat this object as private.
						 * So deduct its pages from our shared total and add it to the
						 * private total.
						 */

						dirty_shared_count -= obj_pages_snapshot;
						dirty_private_count += obj_pages_snapshot;
					} else {
						continue;
					}
				}
			}


			if (os_ref_get_count_raw(&src_object->ref_count) == 1) {
				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
			}

			if (evaluation_phase == TRUE) {
				/* evaluation pass only tallies; no compression yet */
				continue;
			}
		}

		/* compression pass: push this object's pages to the compressor */
		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
		*wired_count += src_object->wired_page_count;

		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
			if (vm_compressor_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
			}

			if (vm_swap_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
			}

			kr = KERN_NO_SPACE;
			break;
		}
		if (paged_out_count >= dirty_budget) {
			break;
		}
		dirty_budget -= paged_out_count;
	}

	/* report shared memory in MB */
	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		/* evaluation passed: reset counters and do the real freeze pass */
		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		/* purge this task's own purgeable memory before compressing */
		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}
22289
22290 #endif
22291
22292 /*
22293 * vm_map_entry_should_cow_for_true_share:
22294 *
22295 * Determines if the map entry should be clipped and setup for copy-on-write
22296 * to avoid applying "true_share" to a large VM object when only a subset is
22297 * targeted.
22298 *
22299 * For now, we target only the map entries created for the Objective C
22300 * Garbage Collector, which initially have the following properties:
22301 * - alias == VM_MEMORY_MALLOC
22302 * - wired_count == 0
22303 * - !needs_copy
22304 * and a VM object with:
22305 * - internal
22306 * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22307 * - !true_share
22308 * - vo_size == ANON_CHUNK_SIZE
22309 *
22310 * Only non-kernel map entries.
22311 */
22312 boolean_t
vm_map_entry_should_cow_for_true_share(vm_map_entry_t entry)22313 vm_map_entry_should_cow_for_true_share(
22314 vm_map_entry_t entry)
22315 {
22316 vm_object_t object;
22317
22318 if (entry->is_sub_map) {
22319 /* entry does not point at a VM object */
22320 return FALSE;
22321 }
22322
22323 if (entry->needs_copy) {
22324 /* already set for copy_on_write: done! */
22325 return FALSE;
22326 }
22327
22328 if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22329 VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22330 /* not a malloc heap or Obj-C Garbage Collector heap */
22331 return FALSE;
22332 }
22333
22334 if (entry->wired_count) {
22335 /* wired: can't change the map entry... */
22336 vm_counters.should_cow_but_wired++;
22337 return FALSE;
22338 }
22339
22340 object = VME_OBJECT(entry);
22341
22342 if (object == VM_OBJECT_NULL) {
22343 /* no object yet... */
22344 return FALSE;
22345 }
22346
22347 if (!object->internal) {
22348 /* not an internal object */
22349 return FALSE;
22350 }
22351
22352 if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22353 /* not the default copy strategy */
22354 return FALSE;
22355 }
22356
22357 if (object->true_share) {
22358 /* already true_share: too late to avoid it */
22359 return FALSE;
22360 }
22361
22362 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22363 object->vo_size != ANON_CHUNK_SIZE) {
22364 /* ... not an object created for the ObjC Garbage Collector */
22365 return FALSE;
22366 }
22367
22368 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22369 object->vo_size != 2048 * 4096) {
22370 /* ... not a "MALLOC_SMALL" heap */
22371 return FALSE;
22372 }
22373
22374 /*
22375 * All the criteria match: we have a large object being targeted for "true_share".
22376 * To limit the adverse side-effects linked with "true_share", tell the caller to
22377 * try and avoid setting up the entire object for "true_share" by clipping the
22378 * targeted range and setting it up for copy-on-write.
22379 */
22380 return TRUE;
22381 }
22382
/* number of address ranges rejected by vm_map_range_overflows() */
uint64_t vm_map_range_overflows_count = 0;
/* tunable: when set, log each rejected range (see vm_map_range_overflows()) */
TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
22385 bool
vm_map_range_overflows(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size)22386 vm_map_range_overflows(
22387 vm_map_t map,
22388 vm_map_offset_t addr,
22389 vm_map_size_t size)
22390 {
22391 vm_map_offset_t start, end, sum;
22392 vm_map_offset_t pgmask;
22393
22394 if (size == 0) {
22395 /* empty range -> no overflow */
22396 return false;
22397 }
22398 pgmask = vm_map_page_mask(map);
22399 start = vm_map_trunc_page_mask(addr, pgmask);
22400 end = vm_map_round_page_mask(addr + size, pgmask);
22401 if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22402 vm_map_range_overflows_count++;
22403 if (vm_map_range_overflows_log) {
22404 printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22405 proc_selfpid(),
22406 proc_best_name(current_proc()),
22407 (uint64_t)addr,
22408 (uint64_t)size,
22409 (uint64_t)pgmask);
22410 }
22411 DTRACE_VM4(vm_map_range_overflows,
22412 vm_map_t, map,
22413 uint32_t, pgmask,
22414 uint64_t, (uint64_t)addr,
22415 uint64_t, (uint64_t)size);
22416 return true;
22417 }
22418 return false;
22419 }
22420
22421 vm_map_offset_t
vm_map_round_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)22422 vm_map_round_page_mask(
22423 vm_map_offset_t offset,
22424 vm_map_offset_t mask)
22425 {
22426 return VM_MAP_ROUND_PAGE(offset, mask);
22427 }
22428
22429 vm_map_offset_t
vm_map_trunc_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)22430 vm_map_trunc_page_mask(
22431 vm_map_offset_t offset,
22432 vm_map_offset_t mask)
22433 {
22434 return VM_MAP_TRUNC_PAGE(offset, mask);
22435 }
22436
22437 boolean_t
vm_map_page_aligned(vm_map_offset_t offset,vm_map_offset_t mask)22438 vm_map_page_aligned(
22439 vm_map_offset_t offset,
22440 vm_map_offset_t mask)
22441 {
22442 return ((offset) & mask) == 0;
22443 }
22444
22445 int
vm_map_page_shift(vm_map_t map)22446 vm_map_page_shift(
22447 vm_map_t map)
22448 {
22449 return VM_MAP_PAGE_SHIFT(map);
22450 }
22451
22452 int
vm_map_page_size(vm_map_t map)22453 vm_map_page_size(
22454 vm_map_t map)
22455 {
22456 return VM_MAP_PAGE_SIZE(map);
22457 }
22458
22459 vm_map_offset_t
vm_map_page_mask(vm_map_t map)22460 vm_map_page_mask(
22461 vm_map_t map)
22462 {
22463 return VM_MAP_PAGE_MASK(map);
22464 }
22465
22466 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)22467 vm_map_set_page_shift(
22468 vm_map_t map,
22469 int pageshift)
22470 {
22471 if (map->hdr.nentries != 0) {
22472 /* too late to change page size */
22473 return KERN_FAILURE;
22474 }
22475
22476 map->hdr.page_shift = (uint16_t)pageshift;
22477
22478 return KERN_SUCCESS;
22479 }
22480
/*
 * vm_map_query_volatile:
 *
 * Walk "map" (which must be locked by the caller) and total up the
 * virtual, resident, compressed, pmap-resident and pmap-compressed
 * sizes of the writable purgeable objects it maps that are currently
 * VM_PURGABLE_VOLATILE or VM_PURGABLE_EMPTY.
 * All results are returned in bytes.
 */
kern_return_t
vm_map_query_volatile(
	vm_map_t map,
	mach_vm_size_t *volatile_virtual_size_p,
	mach_vm_size_t *volatile_resident_size_p,
	mach_vm_size_t *volatile_compressed_size_p,
	mach_vm_size_t *volatile_pmap_size_p,
	mach_vm_size_t *volatile_compressed_pmap_size_p)
{
	mach_vm_size_t volatile_virtual_size;
	mach_vm_size_t volatile_resident_count;
	mach_vm_size_t volatile_compressed_count;
	mach_vm_size_t volatile_pmap_count;
	mach_vm_size_t volatile_compressed_pmap_count;
	mach_vm_size_t resident_count;
	vm_map_entry_t entry;
	vm_object_t object;

	/* map should be locked by caller */

	volatile_virtual_size = 0;
	volatile_resident_count = 0;
	volatile_compressed_count = 0;
	volatile_pmap_count = 0;
	volatile_compressed_pmap_count = 0;

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes;

		/* skip entries that can't map a volatile purgeable object */
		if (entry->is_sub_map) {
			continue;
		}
		if (!(entry->protection & VM_PROT_WRITE)) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}
		if (object->purgable != VM_PURGABLE_VOLATILE &&
		    object->purgable != VM_PURGABLE_EMPTY) {
			continue;
		}
		if (VME_OFFSET(entry)) {
			/*
			 * If the map entry has been split and the object now
			 * appears several times in the VM map, we don't want
			 * to count the object's resident_page_count more than
			 * once. We count it only for the first one, starting
			 * at offset 0 and ignore the other VM map entries.
			 */
			continue;
		}
		resident_count = object->resident_page_count;
		/*
		 * NOTE(review): VME_OFFSET(entry) is always 0 here (any
		 * entry with a non-zero offset was skipped just above),
		 * so this adjustment is effectively a no-op kept for
		 * robustness.
		 */
		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
			resident_count = 0;
		} else {
			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
		}

		volatile_virtual_size += entry->vme_end - entry->vme_start;
		volatile_resident_count += resident_count;
		if (object->pager) {
			/* count pages held by the compressor for this object */
			volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
		}
		/* ask the pmap what it actually has mapped for this range */
		pmap_compressed_bytes = 0;
		pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
		volatile_compressed_pmap_count += (pmap_compressed_bytes
		    / PAGE_SIZE);
	}

	/* map is still locked on return */

	/* convert page counts to byte sizes for the caller */
	*volatile_virtual_size_p = volatile_virtual_size;
	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;

	return KERN_SUCCESS;
}
22570
22571 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)22572 vm_map_sizes(vm_map_t map,
22573 vm_map_size_t * psize,
22574 vm_map_size_t * pfree,
22575 vm_map_size_t * plargest_free)
22576 {
22577 vm_map_entry_t entry;
22578 vm_map_offset_t prev;
22579 vm_map_size_t free, total_free, largest_free;
22580 boolean_t end;
22581
22582 if (!map) {
22583 *psize = *pfree = *plargest_free = 0;
22584 return;
22585 }
22586 total_free = largest_free = 0;
22587
22588 vm_map_lock_read(map);
22589 if (psize) {
22590 *psize = map->max_offset - map->min_offset;
22591 }
22592
22593 prev = map->min_offset;
22594 for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22595 end = (entry == vm_map_to_entry(map));
22596
22597 if (end) {
22598 free = entry->vme_end - prev;
22599 } else {
22600 free = entry->vme_start - prev;
22601 }
22602
22603 total_free += free;
22604 if (free > largest_free) {
22605 largest_free = free;
22606 }
22607
22608 if (end) {
22609 break;
22610 }
22611 prev = entry->vme_end;
22612 }
22613 vm_map_unlock_read(map);
22614 if (pfree) {
22615 *pfree = total_free;
22616 }
22617 if (plargest_free) {
22618 *plargest_free = largest_free;
22619 }
22620 }
22621
22622 #if VM_SCAN_FOR_SHADOW_CHAIN
/*
 * vm_map_shadow_max:
 *
 * Returns the length of the longest shadow chain among all the VM
 * objects mapped by "map" (0 for a NULL map or a map with no objects).
 * Takes the map's read lock for the duration of the scan.
 */
int
vm_map_shadow_max(
	vm_map_t map)
{
	int shadows, shadows_max;
	vm_map_entry_t entry;
	vm_object_t object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			/* submaps have no shadow chain of their own */
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		/*
		 * Walk to the bottom of the shadow chain, counting levels.
		 * Hand-over-hand shared locking: lock the next object
		 * before dropping the current one, so the chain can't be
		 * torn down underneath us.
		 */
		vm_object_lock_shared(object);
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
22667 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22668
/*
 * vm_commit_pagezero_status:
 * Pass the map's lowest mapped address to the pmap layer so it can
 * adjust its handling of the page-zero range — see
 * pmap_advise_pagezero_range() for the exact semantics.
 */
void
vm_commit_pagezero_status(vm_map_t lmap)
{
	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
}
22674
22675 #if __x86_64__
/*
 * vm_map_set_high_start:
 * Simple setter: record the requested "high start" address for this map
 * (x86_64 only; consumed by the address allocation paths — see users of
 * vmmap_high_start).
 */
void
vm_map_set_high_start(
	vm_map_t map,
	vm_map_offset_t high_start)
{
	map->vmmap_high_start = high_start;
}
22683 #endif /* __x86_64__ */
22684
22685 #if CODE_SIGNING_MONITOR
22686
/*
 * vm_map_entry_cs_associate:
 *
 * Tell the code-signing monitor about an executable (or about-to-be-
 * debugged) mapping:
 *  - remap-for-copy entries are registered as "debug" regions,
 *  - MAP_JIT entries are registered as JIT regions,
 *  - otherwise the code-signed vnode backing the mapping is located
 *    (walking shadow chains and pager backing objects) and its code
 *    signature blobs are associated with the mapping.
 * On success the entry is marked csm_associated (so vm_fault() can skip
 * its own code-signing validation); on a real failure, execute
 * permissions are stripped from the entry (unless the caller is
 * overwriting an immutable mapping).
 * The map must be locked exclusively by the caller.
 */
kern_return_t
vm_map_entry_cs_associate(
	vm_map_t map,
	vm_map_entry_t entry,
	vm_map_kernel_flags_t vmk_flags)
{
	vm_object_t cs_object, cs_shadow, backing_object;
	vm_object_offset_t cs_offset, backing_offset;
	void *cs_blobs;
	struct vnode *cs_vnode;
	kern_return_t cs_ret;

	if (map->pmap == NULL ||
	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
		/* nothing for the code-signing monitor to track */
		return KERN_SUCCESS;
	}

	if (!(entry->protection & VM_PROT_EXECUTE)) {
		/*
		 * This memory region is not executable, so the code-signing
		 * monitor would usually not care about it...
		 */
		if (vmk_flags.vmkf_remap_prot_copy &&
		    (entry->max_protection & VM_PROT_EXECUTE)) {
			/*
			 * ... except if the memory region is being remapped
			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
			 * which is what a debugger or dtrace would be doing
			 * to prepare to modify an executable page to insert
			 * a breakpoint or activate a probe.
			 * In that case, fall through so that we can mark
			 * this region as being "debugged" and no longer
			 * strictly code-signed.
			 */
		} else {
			/*
			 * Really not executable, so no need to tell the
			 * code-signing monitor.
			 */
			return KERN_SUCCESS;
		}
	}

	vm_map_lock_assert_exclusive(map);

	/*
	 * Check for a debug association mapping before we check for used_for_jit. This
	 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
	 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
	 * since they are mapped with RW or RX permissions, which the page table monitor
	 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
	 * they will be mapped as USER_EXEC, and that will cause another page table monitor
	 * violation when those USER_EXEC pages are mapped as RW.
	 *
	 * Since these pages switch between RW and RX through mprotect, they mimic what
	 * we expect a debugger to do. As the code signing monitor does not enforce mappings
	 * on macOS systems, this works in our favor here and allows us to continue to
	 * support these legacy-programmed applications without sacrificing security on
	 * the page table or the code signing monitor. We don't need to explicitly check
	 * for entry_for_jit here and the mapping permissions. If the initial mapping is
	 * created with RX, then the application must map it as RW in order to first write
	 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
	 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
	 * Similarly, if the mapping was created as RW, and then switched to RX,
	 * vm_map_protect will again mark the entry as a copy, and both these cases
	 * lead to this if-statement being entered.
	 *
	 * For more information: rdar://115313336.
	 */
	if (vmk_flags.vmkf_remap_prot_copy) {
		cs_ret = csm_associate_debug_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);

		/*
		 * csm_associate_debug_region returns not supported when the code signing
		 * monitor is disabled. This is intentional, since cs_ret is checked towards
		 * the end of the function, and if it is not supported, then we still want the
		 * VM to perform code-signing enforcement on this entry. That said, if we don't
		 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
		 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
		 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
		 * cases, which will cause a violation when attempted to be mapped as writable).
		 */
		if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
			entry->vme_xnu_user_debug = TRUE;
		}
#if DEVELOPMENT || DEBUG
		if (vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug=%d cs_ret %d\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
			    __FUNCTION__, __LINE__,
			    map, entry,
			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
			    entry->vme_xnu_user_debug,
			    cs_ret);
		}
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}

	if (entry->used_for_jit) {
		cs_ret = csm_associate_jit_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		goto done;
	}

	cs_object = VME_OBJECT(entry);
	vm_object_lock_shared(cs_object);
	cs_offset = VME_OFFSET(entry);

	/* find the VM object backed by the code-signed vnode */
	for (;;) {
		/*
		 * go to the bottom of cs_object's shadow chain
		 * (hand-over-hand shared locking: lock the shadow before
		 * dropping the current object, accumulating the offset)
		 */
		for (;
		    cs_object->shadow != VM_OBJECT_NULL;
		    cs_object = cs_shadow) {
			cs_shadow = cs_object->shadow;
			cs_offset += cs_object->vo_shadow_offset;
			vm_object_lock_shared(cs_shadow);
			vm_object_unlock(cs_object);
		}
		if (cs_object->internal ||
		    cs_object->pager == MEMORY_OBJECT_NULL) {
			/* anonymous memory or no pager: no signatures to find */
			vm_object_unlock(cs_object);
			return KERN_SUCCESS;
		}

		cs_offset += cs_object->paging_offset;

		/*
		 * cs_object could be backed by a:
		 * vnode_pager
		 * apple_protect_pager
		 * shared_region_pager
		 * fourk_pager (multiple backing objects -> fail?)
		 * ask the pager if it has a backing VM object
		 */
		if (!memory_object_backing_object(cs_object->pager,
		    cs_offset,
		    &backing_object,
		    &backing_offset)) {
			/* no backing object: cs_object is it */
			break;
		}

		/* look down the backing object's shadow chain */
		vm_object_lock_shared(backing_object);
		vm_object_unlock(cs_object);
		cs_object = backing_object;
		cs_offset = backing_offset;
	}

	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
	if (cs_vnode == NULL) {
		/* no vnode, no code signatures to associate */
		cs_ret = KERN_SUCCESS;
	} else {
		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
		    &cs_blobs);
		assert(cs_ret == KERN_SUCCESS);
		cs_ret = cs_associate_blob_with_mapping(map->pmap,
		    entry->vme_start,
		    (entry->vme_end - entry->vme_start),
		    cs_offset,
		    cs_blobs);
	}
	vm_object_unlock(cs_object);
	cs_object = VM_OBJECT_NULL;

done:
	if (cs_ret == KERN_SUCCESS) {
		DTRACE_VM2(vm_map_entry_cs_associate_success,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end);
		if (vm_map_executable_immutable) {
			/*
			 * Prevent this executable
			 * mapping from being unmapped
			 * or modified.
			 */
			entry->vme_permanent = TRUE;
		}
		/*
		 * pmap says it will validate the
		 * code-signing validity of pages
		 * faulted in via this mapping, so
		 * this map entry should be marked so
		 * that vm_fault() bypasses code-signing
		 * validation for faults coming through
		 * this mapping.
		 */
		entry->csm_associated = TRUE;
	} else if (cs_ret == KERN_NOT_SUPPORTED) {
		/*
		 * pmap won't check the code-signing
		 * validity of pages faulted in via
		 * this mapping, so VM should keep
		 * doing it.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_off,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
	} else {
		/*
		 * A real error: do not allow
		 * execution in this mapping.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_failure,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
		if (vmk_flags.vmkf_overwrite_immutable) {
			/*
			 * We can get here when we remap an apple_protect pager
			 * on top of an already cs_associated executable mapping
			 * with the same code signatures, so we don't want to
			 * lose VM_PROT_EXECUTE in that case...
			 */
		} else {
			entry->protection &= ~VM_PROT_ALLEXEC;
			entry->max_protection &= ~VM_PROT_ALLEXEC;
		}
	}

	return cs_ret;
}
22921
22922 #endif /* CODE_SIGNING_MONITOR */
22923
22924 inline bool
vm_map_is_corpse_source(vm_map_t map)22925 vm_map_is_corpse_source(vm_map_t map)
22926 {
22927 bool status = false;
22928 if (map) {
22929 vm_map_lock_read(map);
22930 status = map->corpse_source;
22931 vm_map_unlock_read(map);
22932 }
22933 return status;
22934 }
22935
22936 inline void
vm_map_set_corpse_source(vm_map_t map)22937 vm_map_set_corpse_source(vm_map_t map)
22938 {
22939 if (map) {
22940 vm_map_lock(map);
22941 map->corpse_source = true;
22942 vm_map_unlock(map);
22943 }
22944 }
22945
22946 inline void
vm_map_unset_corpse_source(vm_map_t map)22947 vm_map_unset_corpse_source(vm_map_t map)
22948 {
22949 if (map) {
22950 vm_map_lock(map);
22951 map->corpse_source = false;
22952 vm_map_unlock(map);
22953 }
22954 }
22955 /*
22956 * FORKED CORPSE FOOTPRINT
22957 *
22958 * A forked corpse gets a copy of the original VM map but its pmap is mostly
22959 * empty since it never ran and never got to fault in any pages.
22960 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
22961 * a forked corpse would therefore return very little information.
22962 *
22963 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
22964 * to vm_map_fork() to collect footprint information from the original VM map
22965 * and its pmap, and store it in the forked corpse's VM map. That information
22966 * is stored in place of the VM map's "hole list" since we'll never need to
22967 * lookup for holes in the corpse's map.
22968 *
22969 * The corpse's footprint info looks like this:
22970 *
22971 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
22972 * as follows:
22973 * +---------------------------------------+
22974 * header-> | cf_size |
22975 * +-------------------+-------------------+
22976 * | cf_last_region | cf_last_zeroes |
22977 * +-------------------+-------------------+
22978 * region1-> | cfr_vaddr |
22979 * +-------------------+-------------------+
22980 * | cfr_num_pages | d0 | d1 | d2 | d3 |
22981 * +---------------------------------------+
22982 * | d4 | d5 | ... |
22983 * +---------------------------------------+
22984 * | ... |
22985 * +-------------------+-------------------+
22986 * | dy | dz | na | na | cfr_vaddr... | <-region2
22987 * +-------------------+-------------------+
22988 * | cfr_vaddr (ctd) | cfr_num_pages |
22989 * +---------------------------------------+
22990 * | d0 | d1 ... |
22991 * +---------------------------------------+
22992 * ...
22993 * +---------------------------------------+
22994 * last region-> | cfr_vaddr |
22995 * +---------------------------------------+
 * | cfr_num_pages | d0 | d1 | d2 | d3 |
22997 * +---------------------------------------+
22998 * ...
22999 * +---------------------------------------+
23000 * | dx | dy | dz | na | na | na | na | na |
23001 * +---------------------------------------+
23002 *
23003 * where:
23004 * cf_size: total size of the buffer (rounded to page size)
23005 * cf_last_region: offset in the buffer of the last "region" sub-header
23006 * cf_last_zeroes: number of trailing "zero" dispositions at the end
23007 * of last region
23008 * cfr_vaddr: virtual address of the start of the covered "region"
23009 * cfr_num_pages: number of pages in the covered "region"
23010 * d*: disposition of the page at that virtual address
23011 * Regions in the buffer are word-aligned.
23012 *
23013 * We estimate the size of the buffer based on the number of memory regions
23014 * and the virtual size of the address space. While copying each memory region
23015 * during vm_map_fork(), we also collect the footprint info for that region
 * and store it in the buffer, packing it as much as possible (coalescing
 * contiguous memory regions to avoid having too many region headers, and
 * avoiding long streaks of "zero" page dispositions by splitting footprint
 * "regions"), so the number of regions in the footprint buffer might not match
 * the number of memory regions in the address space.
23021 *
23022 * We also have to copy the original task's "nonvolatile" ledgers since that's
23023 * part of the footprint and will need to be reported to any tool asking for
23024 * the footprint information of the forked corpse.
23025 */
23026
/*
 * Global statistics for corpse footprint collection.
 * vm_map_corpse_footprint_no_buf counts buffer allocation failures
 * (see vm_map_corpse_footprint_collect()); the others are presumably
 * updated by the collection/teardown paths elsewhere in this file.
 */
uint64_t vm_map_corpse_footprint_count = 0;
uint64_t vm_map_corpse_footprint_size_avg = 0;
uint64_t vm_map_corpse_footprint_size_max = 0;
uint64_t vm_map_corpse_footprint_full = 0;
uint64_t vm_map_corpse_footprint_no_buf = 0;
23032
/*
 * Header at the start of a corpse footprint buffer.
 * See the "FORKED CORPSE FOOTPRINT" layout description above.
 */
struct vm_map_corpse_footprint_header {
	vm_size_t cf_size; /* allocated buffer size */
	uint32_t cf_last_region; /* offset of last region in buffer */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
/* one-byte packed page disposition (see vm_page_disposition_to_cf_disp()) */
typedef uint8_t cf_disp_t;
/*
 * Variable-length "region" record in the footprint buffer: a run of
 * virtual pages starting at cfr_vaddr, followed by one cf_disp_t per page.
 */
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr; /* region start virtual address */
	uint32_t cfr_num_pages; /* number of pages in this "region" */
	cf_disp_t cfr_disposition[0]; /* disposition of each page */
} __attribute__((packed));
23052
23053 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)23054 vm_page_disposition_to_cf_disp(
23055 int disposition)
23056 {
23057 assert(sizeof(cf_disp_t) == 1);
23058 /* relocate bits that don't fit in a "uint8_t" */
23059 if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
23060 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
23061 }
23062 /* cast gets rid of extra bits */
23063 return (cf_disp_t) disposition;
23064 }
23065
23066 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)23067 vm_page_cf_disp_to_disposition(
23068 cf_disp_t cf_disp)
23069 {
23070 int disposition;
23071
23072 assert(sizeof(cf_disp_t) == 1);
23073 disposition = (int) cf_disp;
23074 /* move relocated bits back in place */
23075 if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
23076 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
23077 disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
23078 }
23079 return disposition;
23080 }
23081
23082 /*
23083 * vm_map_corpse_footprint_new_region:
23084 * closes the current footprint "region" and creates a new one
23085 *
23086 * Returns NULL if there's not enough space in the buffer for a new region.
23087 */
/*
 * vm_map_corpse_footprint_new_region:
 * Closes the current (last) footprint "region" in the buffer — trimming
 * its trailing "zero" dispositions — and opens a new, empty one right
 * after it (word-aligned).  If the trimmed region becomes empty it is
 * simply reused.  Returns NULL if the buffer has no room for a new
 * region header; the caller must initialize the returned region.
 */
static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(
	struct vm_map_corpse_footprint_header *footprint_header)
{
	uintptr_t footprint_edge;
	uint32_t new_region_offset;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;

	/* first byte past the end of the footprint buffer */
	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);
	/* locate the current last region in the buffer */
	footprint_region = ((struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region));
	assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
	    footprint_edge);

	/* get rid of trailing zeroes in the last region */
	assert(footprint_region->cfr_num_pages >=
	    footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -=
	    footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* reuse this region if it's now empty */
	if (footprint_region->cfr_num_pages == 0) {
		return footprint_region;
	}

	/* compute offset of new region: header + dispositions, word-aligned */
	new_region_offset = footprint_header->cf_last_region;
	new_region_offset += sizeof(*footprint_region);
	new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
	new_region_offset = roundup(new_region_offset, sizeof(int));

	/* check if we're going over the edge */
	if (((uintptr_t)footprint_header +
	    new_region_offset +
	    sizeof(*footprint_region)) >=
	    footprint_edge) {
		/* over the edge: no new region */
		return NULL;
	}

	/* adjust offset of last region in header */
	footprint_header->cf_last_region = new_region_offset;

	new_footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);
	new_footprint_region->cfr_vaddr = 0;
	new_footprint_region->cfr_num_pages = 0;
	/* caller needs to initialize new region */

	return new_footprint_region;
}
23144
23145 /*
23146 * vm_map_corpse_footprint_collect:
23147 * collect footprint information for "old_entry" in "old_map" and
23148 * stores it in "new_map"'s vmmap_footprint_info.
23149 */
23150 kern_return_t
vm_map_corpse_footprint_collect(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)23151 vm_map_corpse_footprint_collect(
23152 vm_map_t old_map,
23153 vm_map_entry_t old_entry,
23154 vm_map_t new_map)
23155 {
23156 vm_map_offset_t va;
23157 kern_return_t kr;
23158 struct vm_map_corpse_footprint_header *footprint_header;
23159 struct vm_map_corpse_footprint_region *footprint_region;
23160 struct vm_map_corpse_footprint_region *new_footprint_region;
23161 cf_disp_t *next_disp_p;
23162 uintptr_t footprint_edge;
23163 uint32_t num_pages_tmp;
23164 int effective_page_size;
23165
23166 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
23167
23168 va = old_entry->vme_start;
23169
23170 vm_map_lock_assert_exclusive(old_map);
23171 vm_map_lock_assert_exclusive(new_map);
23172
23173 assert(new_map->has_corpse_footprint);
23174 assert(!old_map->has_corpse_footprint);
23175 if (!new_map->has_corpse_footprint ||
23176 old_map->has_corpse_footprint) {
23177 /*
23178 * This can only transfer footprint info from a
23179 * map with a live pmap to a map with a corpse footprint.
23180 */
23181 return KERN_NOT_SUPPORTED;
23182 }
23183
23184 if (new_map->vmmap_corpse_footprint == NULL) {
23185 vm_offset_t buf;
23186 vm_size_t buf_size;
23187
23188 buf = 0;
23189 buf_size = (sizeof(*footprint_header) +
23190 (old_map->hdr.nentries
23191 *
23192 (sizeof(*footprint_region) +
23193 +3)) /* potential alignment for each region */
23194 +
23195 ((old_map->size / effective_page_size)
23196 *
23197 sizeof(cf_disp_t))); /* disposition for each page */
23198 // printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
23199 buf_size = round_page(buf_size);
23200
23201 /* limit buffer to 1 page to validate overflow detection */
23202 // buf_size = PAGE_SIZE;
23203
23204 /* limit size to a somewhat sane amount */
23205 #if XNU_TARGET_OS_OSX
23206 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */
23207 #else /* XNU_TARGET_OS_OSX */
23208 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */
23209 #endif /* XNU_TARGET_OS_OSX */
23210 if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
23211 buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
23212 }
23213
23214 /*
23215 * Allocate the pageable buffer (with a trailing guard page).
23216 * It will be zero-filled on demand.
23217 */
23218 kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
23219 KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
23220 VM_KERN_MEMORY_DIAG);
23221 if (kr != KERN_SUCCESS) {
23222 vm_map_corpse_footprint_no_buf++;
23223 return kr;
23224 }
23225
23226 /* initialize header and 1st region */
23227 footprint_header = (struct vm_map_corpse_footprint_header *)buf;
23228 new_map->vmmap_corpse_footprint = footprint_header;
23229
23230 footprint_header->cf_size = buf_size;
23231 footprint_header->cf_last_region =
23232 sizeof(*footprint_header);
23233 footprint_header->cf_last_zeroes = 0;
23234
23235 footprint_region = (struct vm_map_corpse_footprint_region *)
23236 ((char *)footprint_header +
23237 footprint_header->cf_last_region);
23238 footprint_region->cfr_vaddr = 0;
23239 footprint_region->cfr_num_pages = 0;
23240 } else {
23241 /* retrieve header and last region */
23242 footprint_header = (struct vm_map_corpse_footprint_header *)
23243 new_map->vmmap_corpse_footprint;
23244 footprint_region = (struct vm_map_corpse_footprint_region *)
23245 ((char *)footprint_header +
23246 footprint_header->cf_last_region);
23247 }
23248 footprint_edge = ((uintptr_t)footprint_header +
23249 footprint_header->cf_size);
23250
23251 if ((footprint_region->cfr_vaddr +
23252 (((vm_map_offset_t)footprint_region->cfr_num_pages) *
23253 effective_page_size))
23254 != old_entry->vme_start) {
23255 uint64_t num_pages_delta, num_pages_delta_size;
23256 uint32_t region_offset_delta_size;
23257
23258 /*
23259 * Not the next contiguous virtual address:
23260 * start a new region or store "zero" dispositions for
23261 * the missing pages?
23262 */
23263 /* size of gap in actual page dispositions */
23264 num_pages_delta = ((old_entry->vme_start -
23265 footprint_region->cfr_vaddr) / effective_page_size)
23266 - footprint_region->cfr_num_pages;
23267 num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
23268 /* size of gap as a new footprint region header */
23269 region_offset_delta_size =
23270 (sizeof(*footprint_region) +
23271 roundup(((footprint_region->cfr_num_pages -
23272 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
23273 sizeof(int)) -
23274 ((footprint_region->cfr_num_pages -
23275 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
23276 // printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
23277 if (region_offset_delta_size < num_pages_delta_size ||
23278 os_add3_overflow(footprint_region->cfr_num_pages,
23279 (uint32_t) num_pages_delta,
23280 1,
23281 &num_pages_tmp)) {
23282 /*
23283 * Storing data for this gap would take more space
23284 * than inserting a new footprint region header:
23285 * let's start a new region and save space. If it's a
23286 * tie, let's avoid using a new region, since that
23287 * would require more region hops to find the right
23288 * range during lookups.
23289 *
23290 * If the current region's cfr_num_pages would overflow
23291 * if we added "zero" page dispositions for the gap,
23292 * no choice but to start a new region.
23293 */
23294 // printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
23295 new_footprint_region =
23296 vm_map_corpse_footprint_new_region(footprint_header);
23297 /* check that we're not going over the edge */
23298 if (new_footprint_region == NULL) {
23299 goto over_the_edge;
23300 }
23301 footprint_region = new_footprint_region;
23302 /* initialize new region as empty */
23303 footprint_region->cfr_vaddr = old_entry->vme_start;
23304 footprint_region->cfr_num_pages = 0;
23305 } else {
23306 /*
23307 * Store "zero" page dispositions for the missing
23308 * pages.
23309 */
23310 // printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
23311 for (; num_pages_delta > 0; num_pages_delta--) {
23312 next_disp_p = (cf_disp_t *)
23313 ((uintptr_t) footprint_region +
23314 sizeof(*footprint_region));
23315 next_disp_p += footprint_region->cfr_num_pages;
23316 /* check that we're not going over the edge */
23317 if ((uintptr_t)next_disp_p >= footprint_edge) {
23318 goto over_the_edge;
23319 }
23320 /* store "zero" disposition for this gap page */
23321 footprint_region->cfr_num_pages++;
23322 *next_disp_p = (cf_disp_t) 0;
23323 footprint_header->cf_last_zeroes++;
23324 }
23325 }
23326 }
23327
23328 for (va = old_entry->vme_start;
23329 va < old_entry->vme_end;
23330 va += effective_page_size) {
23331 int disposition;
23332 cf_disp_t cf_disp;
23333
23334 vm_map_footprint_query_page_info(old_map,
23335 old_entry,
23336 va,
23337 &disposition);
23338 cf_disp = vm_page_disposition_to_cf_disp(disposition);
23339
23340 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
23341
23342 if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
23343 /*
23344 * Ignore "zero" dispositions at start of
23345 * region: just move start of region.
23346 */
23347 footprint_region->cfr_vaddr += effective_page_size;
23348 continue;
23349 }
23350
23351 /* would region's cfr_num_pages overflow? */
23352 if (os_add_overflow(footprint_region->cfr_num_pages, 1,
23353 &num_pages_tmp)) {
23354 /* overflow: create a new region */
23355 new_footprint_region =
23356 vm_map_corpse_footprint_new_region(
23357 footprint_header);
23358 if (new_footprint_region == NULL) {
23359 goto over_the_edge;
23360 }
23361 footprint_region = new_footprint_region;
23362 footprint_region->cfr_vaddr = va;
23363 footprint_region->cfr_num_pages = 0;
23364 }
23365
23366 next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
23367 sizeof(*footprint_region));
23368 next_disp_p += footprint_region->cfr_num_pages;
23369 /* check that we're not going over the edge */
23370 if ((uintptr_t)next_disp_p >= footprint_edge) {
23371 goto over_the_edge;
23372 }
		/* store this disposition */
23374 *next_disp_p = cf_disp;
23375 footprint_region->cfr_num_pages++;
23376
23377 if (cf_disp != 0) {
23378 /* non-zero disp: break the current zero streak */
23379 footprint_header->cf_last_zeroes = 0;
23380 /* done */
23381 continue;
23382 }
23383
23384 /* zero disp: add to the current streak of zeroes */
23385 footprint_header->cf_last_zeroes++;
23386 if ((footprint_header->cf_last_zeroes +
23387 roundup(((footprint_region->cfr_num_pages -
23388 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23389 (sizeof(int) - 1),
23390 sizeof(int))) <
23391 (sizeof(*footprint_header))) {
23392 /*
23393 * There are not enough trailing "zero" dispositions
23394 * (+ the extra padding we would need for the previous
23395 * region); creating a new region would not save space
23396 * at this point, so let's keep this "zero" disposition
23397 * in this region and reconsider later.
23398 */
23399 continue;
23400 }
23401 /*
23402 * Create a new region to avoid having too many consecutive
23403 * "zero" dispositions.
23404 */
23405 new_footprint_region =
23406 vm_map_corpse_footprint_new_region(footprint_header);
23407 if (new_footprint_region == NULL) {
23408 goto over_the_edge;
23409 }
23410 footprint_region = new_footprint_region;
23411 /* initialize the new region as empty ... */
23412 footprint_region->cfr_num_pages = 0;
23413 /* ... and skip this "zero" disp */
23414 footprint_region->cfr_vaddr = va + effective_page_size;
23415 }
23416
23417 return KERN_SUCCESS;
23418
23419 over_the_edge:
23420 // printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23421 vm_map_corpse_footprint_full++;
23422 return KERN_RESOURCE_SHORTAGE;
23423 }
23424
/*
 * vm_map_corpse_footprint_collect_done:
 *	completes the footprint collection by getting rid of any remaining
 *	trailing "zero" dispositions and trimming the unused part of the
 *	kernel buffer
 */
void
vm_map_corpse_footprint_collect_done(
	vm_map_t new_map)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	vm_size_t buf_size, actual_size;
	kern_return_t kr;

	/* nothing to finalize if the map never got a footprint buffer */
	assert(new_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    new_map->vmmap_corpse_footprint == NULL) {
		return;
	}

	footprint_header = (struct vm_map_corpse_footprint_header *)
	    new_map->vmmap_corpse_footprint;
	buf_size = footprint_header->cf_size;

	/* locate the last region via its offset from the header */
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);

	/* get rid of trailing zeroes in last region */
	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/*
	 * Bytes actually used: everything up to the last region's header,
	 * plus that header and its (now trimmed) page dispositions.
	 */
	actual_size = (vm_size_t)(footprint_header->cf_last_region +
	    sizeof(*footprint_region) +
	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));

	// printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
	/* update global footprint-size statistics (rolling average, count, max) */
	vm_map_corpse_footprint_size_avg =
	    (((vm_map_corpse_footprint_size_avg *
	    vm_map_corpse_footprint_count) +
	    actual_size) /
	    (vm_map_corpse_footprint_count + 1));
	vm_map_corpse_footprint_count++;
	if (actual_size > vm_map_corpse_footprint_size_max) {
		vm_map_corpse_footprint_size_max = actual_size;
	}

	actual_size = round_page(actual_size);
	if (buf_size > actual_size) {
		/*
		 * Release [actual_size + PAGE_SIZE, buf_size + PAGE_SIZE):
		 * the page right after "actual_size" is kept so it can
		 * become the new trailing guard page below.
		 */
		kr = vm_deallocate(kernel_map,
		    vm_sanitize_wrap_addr((vm_address_t)footprint_header +
		    actual_size + PAGE_SIZE),                /* trailing guard page */
		    vm_sanitize_wrap_size(buf_size - actual_size));
		assertf(kr == KERN_SUCCESS,
		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
		/* make the kept page inaccessible: it is the new guard page */
		kr = vm_protect(kernel_map,
		    (vm_address_t)footprint_header + actual_size,
		    PAGE_SIZE,
		    FALSE,             /* set_maximum */
		    vm_sanitize_wrap_prot(VM_PROT_NONE));
		assertf(kr == KERN_SUCCESS,
		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
	}

	/* record the trimmed size (excludes the guard page) */
	footprint_header->cf_size = actual_size;
}
23501
/*
 * vm_map_corpse_footprint_query_page_info:
 *	retrieves the disposition of the page at virtual address "vaddr"
 *	in the forked corpse's VM map
 *
 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
 *
 *	Returns KERN_INVALID_ARGUMENT if "map" has no footprint data,
 *	KERN_SUCCESS otherwise (with *disposition_p == 0 when "va" falls
 *	outside any collected region).
 */
kern_return_t
vm_map_corpse_footprint_query_page_info(
	vm_map_t map,
	vm_map_offset_t va,
	int *disposition_p)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	uint32_t footprint_region_offset;
	vm_map_offset_t region_start, region_end;
	int disp_idx;
	kern_return_t kr;
	int effective_page_size;
	cf_disp_t cf_disp;

	if (!map->has_corpse_footprint) {
		*disposition_p = 0;
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	footprint_header = map->vmmap_corpse_footprint;
	if (footprint_header == NULL) {
		*disposition_p = 0;
		// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/* start looking at the hint ("cf_hint_region") */
	footprint_region_offset = footprint_header->cf_hint_region;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

lookup_again:
	/* sanitize the hint: it may be stale or racily updated by others */
	if (footprint_region_offset < sizeof(*footprint_header)) {
		/* hint too low: start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	if (footprint_region_offset > footprint_header->cf_last_region) {
		/* hint too high: re-start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header + footprint_region_offset);
	region_start = footprint_region->cfr_vaddr;
	region_end = (region_start +
	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
	    effective_page_size));
	if (va < region_start &&
	    footprint_region_offset != sizeof(*footprint_header)) {
		/* our range starts before the hint region */

		/* reset the hint (in a racy way...) */
		footprint_header->cf_hint_region = sizeof(*footprint_header);
		/* lookup "va" again from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
		goto lookup_again;
	}

	/* walk regions forward until one covers "va" or we run out */
	while (va >= region_end) {
		if (footprint_region_offset >= footprint_header->cf_last_region) {
			break;
		}
		/* skip the region's header */
		footprint_region_offset += sizeof(*footprint_region);
		/* skip the region's page dispositions */
		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
		/* align to next word boundary */
		footprint_region_offset =
		    roundup(footprint_region_offset,
		    sizeof(int));
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + footprint_region_offset);
		region_start = footprint_region->cfr_vaddr;
		region_end = (region_start +
		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
		    effective_page_size));
	}
	if (va < region_start || va >= region_end) {
		/* page not found */
		*disposition_p = 0;
		// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_SUCCESS;
		goto done;
	}

	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
	footprint_header->cf_hint_region = footprint_region_offset;

	/* get page disposition for "va" in this region */
	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
	cf_disp = footprint_region->cfr_disposition[disp_idx];
	/* expand the compact disposition to the full vm_page disposition */
	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
	kr = KERN_SUCCESS;
done:
	// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
	DTRACE_VM4(footprint_query_page_info,
	    vm_map_t, map,
	    vm_map_offset_t, va,
	    int, *disposition_p,
	    kern_return_t, kr);

	return kr;
}
23615
23616 void
vm_map_corpse_footprint_destroy(vm_map_t map)23617 vm_map_corpse_footprint_destroy(
23618 vm_map_t map)
23619 {
23620 if (map->has_corpse_footprint &&
23621 map->vmmap_corpse_footprint != 0) {
23622 struct vm_map_corpse_footprint_header *footprint_header;
23623 vm_size_t buf_size;
23624 kern_return_t kr;
23625
23626 footprint_header = map->vmmap_corpse_footprint;
23627 buf_size = footprint_header->cf_size;
23628 kr = vm_deallocate(kernel_map,
23629 vm_sanitize_wrap_addr((vm_offset_t) map->vmmap_corpse_footprint),
23630 vm_sanitize_wrap_size(buf_size + PAGE_SIZE)); /* trailing guard page */
23631 assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
23632 map->vmmap_corpse_footprint = 0;
23633 map->has_corpse_footprint = FALSE;
23634 }
23635 }
23636
23637 /*
23638 * vm_map_copy_footprint_ledgers:
23639 * copies any ledger that's relevant to the memory footprint of "old_task"
23640 * into the forked corpse's task ("new_task")
23641 */
23642 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)23643 vm_map_copy_footprint_ledgers(
23644 task_t old_task,
23645 task_t new_task)
23646 {
23647 vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23648 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23649 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23650 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23651 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23652 vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23653 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23654 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23655 vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23656 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23657 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23658 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23659 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23660 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23661 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23662 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23663 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23664 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23665 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23666 vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23667 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total);
23668 }
23669
23670 /*
23671 * vm_map_copy_ledger:
23672 * copy a single ledger from "old_task" to "new_task"
23673 */
23674 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)23675 vm_map_copy_ledger(
23676 task_t old_task,
23677 task_t new_task,
23678 int ledger_entry)
23679 {
23680 ledger_amount_t old_balance, new_balance, delta;
23681
23682 assert(new_task->map->has_corpse_footprint);
23683 if (!new_task->map->has_corpse_footprint) {
23684 return;
23685 }
23686
23687 /* turn off sanity checks for the ledger we're about to mess with */
23688 ledger_disable_panic_on_negative(new_task->ledger,
23689 ledger_entry);
23690
23691 /* adjust "new_task" to match "old_task" */
23692 ledger_get_balance(old_task->ledger,
23693 ledger_entry,
23694 &old_balance);
23695 ledger_get_balance(new_task->ledger,
23696 ledger_entry,
23697 &new_balance);
23698 if (new_balance == old_balance) {
23699 /* new == old: done */
23700 } else if (new_balance > old_balance) {
23701 /* new > old ==> new -= new - old */
23702 delta = new_balance - old_balance;
23703 ledger_debit(new_task->ledger,
23704 ledger_entry,
23705 delta);
23706 } else {
23707 /* new < old ==> new += old - new */
23708 delta = old_balance - new_balance;
23709 ledger_credit(new_task->ledger,
23710 ledger_entry,
23711 delta);
23712 }
23713 }
23714
23715 /*
23716 * vm_map_get_pmap:
23717 * returns the pmap associated with the vm_map
23718 */
23719 pmap_t
vm_map_get_pmap(vm_map_t map)23720 vm_map_get_pmap(vm_map_t map)
23721 {
23722 return vm_map_pmap(map);
23723 }
23724
/*
 * vm_map_get_phys_page:
 *	returns the physical page number backing "addr" in "map",
 *	or 0 if no resident page can be found.
 *
 *	Descends through submaps and object shadow chains; for
 *	phys_contiguous objects the page number is computed from the
 *	object's shadow offset (faulting the page in first if needed).
 */
ppnum_t
vm_map_get_phys_page(
	vm_map_t map,
	vm_offset_t addr)
{
	vm_object_offset_t offset;
	vm_object_t object;
	vm_map_offset_t map_offset;
	vm_map_entry_t entry;
	ppnum_t phys_page = 0;

	map_offset = vm_map_trunc_page(addr, PAGE_MASK);

	vm_map_lock(map);
	while (vm_map_lookup_entry(map, map_offset, &entry)) {
		if (entry->is_sub_map) {
			/* descend into the submap, hand-over-hand locking */
			vm_map_t old_map;
			vm_map_lock(VME_SUBMAP(entry));
			old_map = map;
			map = VME_SUBMAP(entry);
			map_offset = (VME_OFFSET(entry) +
			    (map_offset - entry->vme_start));
			vm_map_unlock(old_map);
			continue;
		}
		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			/* no backing object: no physical page */
			vm_map_unlock(map);
			return (ppnum_t) 0;
		}
		if (VME_OBJECT(entry)->phys_contiguous) {
			/* These are not standard pageable memory mappings */
			/* If they are not present in the object they will */
			/* have to be picked up from the pager through the */
			/* fault mechanism. */
			if (VME_OBJECT(entry)->vo_shadow_offset == 0) {
				/* need to call vm_fault */
				vm_map_unlock(map);
				vm_fault(map, map_offset, VM_PROT_NONE,
				    FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
				    THREAD_UNINT, NULL, 0);
				/* retry the lookup now that the fault populated it */
				vm_map_lock(map);
				continue;
			}
			/* page number derives directly from the shadow offset */
			offset = (VME_OFFSET(entry) +
			    (map_offset - entry->vme_start));
			phys_page = (ppnum_t)
			    ((VME_OBJECT(entry)->vo_shadow_offset
			    + offset) >> PAGE_SHIFT);
			break;
		}
		offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start));
		object = VME_OBJECT(entry);
		vm_object_lock(object);
		/* walk the shadow chain looking for a resident page */
		while (TRUE) {
			vm_page_t dst_page = vm_page_lookup(object, offset);
			if (dst_page == VM_PAGE_NULL) {
				if (object->shadow) {
					/* not here: move down to the shadow object */
					vm_object_t old_object;
					vm_object_lock(object->shadow);
					old_object = object;
					offset = offset + object->vo_shadow_offset;
					object = object->shadow;
					vm_object_unlock(old_object);
				} else {
					/* bottom of the chain: page not resident */
					vm_object_unlock(object);
					break;
				}
			} else {
				phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page));
				vm_object_unlock(object);
				break;
			}
		}
		break;
	}

	vm_map_unlock(map);
	return phys_page;
}
23804
#if CONFIG_MAP_RANGES
/* bitmap of VM tags whose allocations are routed to the "heap" (data) range */
static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
/* bitmap of VM tags whose allocations are routed to the large-file range */
static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)];

/* the user-visible range IDs must line up with the Mach VM range IDs */
static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23811
23812 /*
23813 * vm_map_range_map_init:
23814 * initializes the VM range ID map to enable index lookup
23815 * of user VM ranges based on VM tag from userspace.
23816 */
23817 static void
vm_map_range_map_init(void)23818 vm_map_range_map_init(void)
23819 {
23820 /*
23821 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
23822 * - the former is malloc metadata which should be kept separate
23823 * - the latter has its own ranges
23824 */
23825 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
23826 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
23827 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
23828 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
23829 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
23830 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
23831 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
23832 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
23833 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
23834 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
23835 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
23836 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
23837 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
23838 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
23839 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
23840 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
23841 bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR);
23842 bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE);
23843 }
23844
/*
 * vm_map_range_random_uniform:
 *	returns a range of "req_size" bytes (rounded up to the alignment
 *	implied by "offmask") placed at a random, aligned offset within
 *	[min_addr, max_addr).
 *
 *	NOTE(review): the modulo below assumes the aligned window is
 *	strictly larger than the aligned "req_size"; if they are equal,
 *	this divides by zero -- callers appear expected to guarantee
 *	slack (see the assert3u() in vm_map_range_configure()); confirm.
 */
static struct mach_vm_range
vm_map_range_random_uniform(
	vm_map_size_t req_size,
	vm_map_offset_t min_addr,
	vm_map_offset_t max_addr,
	vm_map_offset_t offmask)
{
	vm_map_offset_t random_addr;
	struct mach_vm_range alloc;

	/* round the size and both bounds to (offmask + 1) alignment */
	req_size = (req_size + offmask) & ~offmask;
	min_addr = (min_addr + offmask) & ~offmask;
	max_addr = max_addr & ~offmask;

	/* pick a random aligned offset leaving room for "req_size" bytes */
	read_random(&random_addr, sizeof(random_addr));
	random_addr %= (max_addr - req_size - min_addr);
	random_addr &= ~offmask;

	alloc.min_address = min_addr + random_addr;
	alloc.max_address = min_addr + random_addr + req_size;
	return alloc;
}
23867
/*
 * vm_map_range_offmask:
 *	returns the alignment mask (low bits set) to which user VM ranges
 *	should be aligned so that page-table pages are fully utilized,
 *	or 0 when range alignment (and hence the feature) is disabled.
 */
static vm_map_offset_t
vm_map_range_offmask(void)
{
	uint32_t pte_depth;

	/*
	 * PTE optimizations
	 *
	 *
	 * 16k pages systems
	 * ~~~~~~~~~~~~~~~~~
	 *
	 * A single L1 (sub-)page covers the address space.
	 * - L2 pages cover 64G,
	 * - L3 pages cover 32M.
	 *
	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
	 * As a result, we really only need to align the ranges to 32M to avoid
	 * partial L3 pages.
	 *
	 * On macOS, the usage of L2 pages will increase, so as a result we will
	 * want to align ranges to 64G in order to utilize them fully.
	 *
	 *
	 * 4k pages systems
	 * ~~~~~~~~~~~~~~~~
	 *
	 * A single L0 (sub-)page covers the address space.
	 * - L1 pages cover 512G,
	 * - L2 pages cover 1G,
	 * - L3 pages cover 2M.
	 *
	 * The long tail of processes on a system will tend to have a VA usage
	 * (ignoring the shared regions) in the 100s of MB order of magnitude.
	 * This is achievable with a single L1 and a few L2s without
	 * randomization.
	 *
	 * However once randomization is introduced, the system will immediately
	 * need several L1s and many more L2s. As a result:
	 *
	 * - on embedded devices, the cost of these extra pages isn't
	 *   sustainable, and we just disable the feature entirely,
	 *
	 * - on macOS we align ranges to a 512G boundary so that the extra L1
	 *   pages can be used to their full potential.
	 */

	/*
	 * note, this function assumes _non exotic mappings_
	 * which is why it uses the native kernel's PAGE_SHIFT.
	 */
#if XNU_PLATFORM_MacOSX
	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
#else /* !XNU_PLATFORM_MacOSX */
	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
#endif /* !XNU_PLATFORM_MacOSX */

	if (pte_depth == 0) {
		/* depth 0 means alignment (and user ranges) are disabled */
		return 0;
	}

	/* coverage of a PTE page at "pte_depth" levels above the leaf, minus 1 */
	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
}
23931
/*
 * vm_map_range_configure:
 *	configures the user vm_map ranges by increasing the maximum VA range of
 *	the map and carving out a range at the end of VA space (searching backwards
 *	in the newly expanded map).
 *
 *	Returns KERN_NOT_SUPPORTED when ranges don't apply to this map,
 *	KERN_NO_SPACE when the address space was already grown/modified,
 *	KERN_SUCCESS otherwise.
 */
kern_return_t
vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va)
{
	const vm_map_offset_t offmask = vm_map_range_offmask();
	struct mach_vm_range data_range;
	vm_map_offset_t default_end;
	kern_return_t kr;

	if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
		/*
		 * No point doing vm ranges in a 32bit address space.
		 */
		return KERN_NOT_SUPPORTED;
	}

	/* Should not be applying ranges to kernel map or kernel map submaps */
	assert(vm_map_pmap(map) != kernel_pmap);

#if XNU_PLATFORM_MacOSX

	/*
	 * on macOS, the address space is a massive 47 bits (128T),
	 * with several carve outs that processes can't use:
	 * - the shared region
	 * - the commpage region
	 * - the GPU carve out (if applicable)
	 *
	 * and when nano-malloc is in use it desires memory at the 96T mark.
	 *
	 * However, their location is architecture dependent:
	 * - On intel, the shared region and commpage are
	 *   at the very end of the usable address space (above +127T),
	 *   and there is no GPU carve out, and pthread wants to place
	 *   threads at the 112T mark (0x70T).
	 *
	 * - On arm64, these are in the same spot as on embedded devices:
	 *   o shared region:   [ 6G, 10G)  [ will likely grow over time ]
	 *   o commpage region: [63G, 64G)
	 *   o GPU carve out:   [64G, 448G)
	 *
	 * This is convenient because the mappings at the end of the address
	 * space (when they exist) are made by the kernel.
	 *
	 * The policy is to allocate a random 1T for the data heap
	 * in the end of the address-space in the:
	 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
	 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
	 */

	/* see NANOZONE_SIGNATURE in libmalloc */
#if __x86_64__
	default_end = 0x71ull << 40;
#else
	default_end = 0x61ull << 40;
#endif
	/* random 1T carve out for the data heap near the end of the VA space */
	data_range = vm_map_range_random_uniform(1ull << 40,
	    default_end, 0x7full << 40, offmask);

#else /* !XNU_PLATFORM_MacOSX */

	/*
	 * Embedded devices:
	 *
	 * The default VA Size scales with the device physical memory.
	 *
	 * Out of that:
	 * - the "zero" page typically uses 4G + some slide
	 * - the shared region uses SHARED_REGION_SIZE bytes (4G)
	 *
	 * Without the use of jumbo or any adjustment to the address space,
	 * a default VM map typically looks like this:
	 *
	 *      0G -->╒════════════╕
	 *            │  pagezero  │
	 *            │  + slide   │
	 *     ~4G -->╞════════════╡<-- vm_map_min(map)
	 *            │            │
	 *      6G -->├────────────┤
	 *            │   shared   │
	 *            │   region   │
	 *     10G -->├────────────┤
	 *            │            │
	 *  max_va -->├────────────┤<-- vm_map_max(map)
	 *            │            │
	 *            ╎    jumbo   ╎
	 *            ╎            ╎
	 *            │            │
	 *     63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
	 *            │  commpage  │
	 *     64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
	 *            │            │
	 *            ╎    GPU     ╎
	 *            ╎  carveout  ╎
	 *            │            │
	 *    448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
	 *            │            │
	 *            ╎            ╎
	 *            ╎            ╎
	 *            │            │
	 *    512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
	 *
	 * When this drawing was made, "max_va" was smaller than
	 * ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
	 * 12G of address space for the zero-page, slide, files,
	 * binaries, heap ...
	 *
	 * We will want to make a "heap/data" carve out inside
	 * the jumbo range of half of that usable space, assuming
	 * that this is less than a forth of the jumbo range.
	 *
	 * The assert below intends to catch when max_va grows
	 * too large for this heuristic.
	 */

	vm_map_lock_read(map);
	default_end = vm_map_max(map);
	vm_map_unlock_read(map);

	/*
	 * Check that we're not already jumbo'd,
	 * or our address space was somehow modified.
	 *
	 * If so we cannot guarantee that we can set up the ranges
	 * safely without interfering with the existing map.
	 */
	if (default_end > vm_compute_max_offset(true)) {
		return KERN_NO_SPACE;
	}

	if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
		/*
		 * an override boot-arg was set, disable user-ranges
		 *
		 * XXX: this is problematic because it means these boot-args
		 *      no longer test the behavior changing the value
		 *      of ARM64_MAX_OFFSET_DEVICE_* would have.
		 */
		return KERN_NOT_SUPPORTED;
	}

	/* expand the default VM space to 64GB */
	vm_map_set_jumbo(map);

	/* see "heuristic" comment above: ensure the jumbo range has slack */
	assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
	data_range = vm_map_range_random_uniform(GiB(10),
	    default_end + PAGE_SIZE, vm_map_max(map), offmask);

#endif /* !XNU_PLATFORM_MacOSX */

	/*
	 * Poke holes so that ASAN or people listing regions
	 * do not think this space is free.
	 */

	/* hole between the default range's end and the data range's start */
	if (default_end != data_range.min_address) {
		kr = vm_map_enter(map, &default_end,
		    data_range.min_address - default_end,
		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
		assert(kr == KERN_SUCCESS);
	}

	/* hole between the data range's end and the end of the map */
	if (data_range.max_address != vm_map_max(map)) {
		vm_map_entry_t entry;
		vm_size_t size;

		/*
		 * Extend the end of the hole to the next VM entry or the end of the map,
		 * whichever comes first.
		 */
		vm_map_lock_read(map);
		vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
		if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) {
			size = vm_map_max(map) - data_range.max_address;
		} else {
			size = entry->vme_start - data_range.max_address;
		}
		vm_map_unlock_read(map);

		kr = vm_map_enter(map, &data_range.max_address, size,
		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
		assert(kr == KERN_SUCCESS);
	}

#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	if (needs_extra_jumbo_va) {
		/* This will grow the address space to MACH_VM_MAX_ADDRESS */
		vm_map_set_extra_jumbo(map);
	}
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */

	/* publish the configured ranges under the map lock */
	vm_map_lock(map);
	map->default_range.min_address = vm_map_min(map);
	map->default_range.max_address = default_end;
	map->data_range = data_range;
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	/* If process has "extra jumbo" entitlement, enable large file range */
	if (needs_extra_jumbo_va) {
		map->large_file_range = vm_map_range_random_uniform(TiB(1),
		    MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask);
	}
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
	map->uses_user_ranges = true;
	vm_map_unlock(map);

	return KERN_SUCCESS;
}
24146
24147 /*
24148 * vm_map_range_fork:
24149 * clones the array of ranges from old_map to new_map in support
24150 * of a VM map fork.
24151 */
24152 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)24153 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
24154 {
24155 if (!old_map->uses_user_ranges) {
24156 /* nothing to do */
24157 return;
24158 }
24159
24160 new_map->default_range = old_map->default_range;
24161 new_map->data_range = old_map->data_range;
24162
24163 if (old_map->extra_ranges_count) {
24164 vm_map_user_range_t otable, ntable;
24165 uint16_t count;
24166
24167 otable = old_map->extra_ranges;
24168 count = old_map->extra_ranges_count;
24169 ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
24170 Z_WAITOK | Z_ZERO | Z_NOFAIL);
24171 memcpy(ntable, otable,
24172 count * sizeof(struct vm_map_user_range));
24173
24174 new_map->extra_ranges_count = count;
24175 new_map->extra_ranges = ntable;
24176 }
24177
24178 new_map->uses_user_ranges = true;
24179 }
24180
24181 /*
24182 * vm_map_get_user_range:
24183 * copy the VM user range for the given VM map and range ID.
24184 */
24185 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)24186 vm_map_get_user_range(
24187 vm_map_t map,
24188 vm_map_range_id_t range_id,
24189 mach_vm_range_t range)
24190 {
24191 if (map == NULL || !map->uses_user_ranges || range == NULL) {
24192 return KERN_INVALID_ARGUMENT;
24193 }
24194
24195 switch (range_id) {
24196 case UMEM_RANGE_ID_DEFAULT:
24197 *range = map->default_range;
24198 return KERN_SUCCESS;
24199
24200 case UMEM_RANGE_ID_HEAP:
24201 *range = map->data_range;
24202 return KERN_SUCCESS;
24203
24204 case UMEM_RANGE_ID_LARGE_FILE:
24205 /*
24206 * Because this function tells a user-space process about the user
24207 * ranges in its VM map, this case communicates whether the large file
24208 * range is in use. Note that this is different from how the large file
24209 * range ID is handled in `vm_map_get_range()`: there, we "resolve" the
24210 * VA policy and return either the large file range or data range,
24211 * depending on whether the large file range is enabled.
24212 */
24213 if (map->large_file_range.min_address != map->large_file_range.max_address) {
24214 /* large file range is configured and should be used */
24215 *range = map->large_file_range;
24216 } else {
24217 return KERN_INVALID_ARGUMENT;
24218 }
24219 return KERN_SUCCESS;
24220
24221 default:
24222 return KERN_INVALID_ARGUMENT;
24223 }
24224 }
24225
24226 static vm_map_range_id_t
vm_map_user_range_resolve(vm_map_t map,mach_vm_address_t addr,mach_vm_size_t size,mach_vm_range_t range)24227 vm_map_user_range_resolve(
24228 vm_map_t map,
24229 mach_vm_address_t addr,
24230 mach_vm_size_t size,
24231 mach_vm_range_t range)
24232 {
24233 struct mach_vm_range tmp;
24234
24235 vm_map_lock_assert_held(map);
24236
24237 static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24238 static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24239
24240 if (mach_vm_range_contains(&map->default_range, addr, size)) {
24241 if (range) {
24242 *range = map->default_range;
24243 }
24244 return UMEM_RANGE_ID_DEFAULT;
24245 }
24246
24247 if (mach_vm_range_contains(&map->data_range, addr, size)) {
24248 if (range) {
24249 *range = map->data_range;
24250 }
24251 return UMEM_RANGE_ID_HEAP;
24252 }
24253
24254 if (mach_vm_range_contains(&map->large_file_range, addr, size)) {
24255 if (range) {
24256 *range = map->large_file_range;
24257 }
24258 return UMEM_RANGE_ID_LARGE_FILE;
24259 }
24260
24261 for (size_t i = 0; i < map->extra_ranges_count; i++) {
24262 vm_map_user_range_t r = &map->extra_ranges[i];
24263
24264 tmp.min_address = r->vmur_min_address;
24265 tmp.max_address = r->vmur_max_address;
24266
24267 if (mach_vm_range_contains(&tmp, addr, size)) {
24268 if (range) {
24269 *range = tmp;
24270 }
24271 return r->vmur_range_id;
24272 }
24273 }
24274
24275 if (range) {
24276 range->min_address = range->max_address = 0;
24277 }
24278 return UMEM_RANGE_ID_DEFAULT;
24279 }
24280 #endif /* CONFIG_MAP_RANGES */
24281
/*
 * vm_map_kernel_flags_update_range_id:
 *	choose a placement range ID for a mapping request that did not
 *	specify one explicitly.
 *
 *	Kernel map: an unset range ID falls back to KMEM_RANGE_ID_DATA.
 *	User maps (CONFIG_MAP_RANGES only): a known VM tag, or a mapping
 *	of at least VM_LARGE_FILE_THRESHOLD bytes, steers the request
 *	into the large-file or heap ranges.
 *
 *	`size` is marked __unused because it is only read when
 *	CONFIG_MAP_RANGES is configured.
 */
void
vm_map_kernel_flags_update_range_id(
	vm_map_kernel_flags_t *vmkf,
	vm_map_t map,
	__unused vm_map_size_t size)
{
	if (map == kernel_map) {
		/* kernel allocations with no explicit range go to the data range */
		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
		}
#if CONFIG_MAP_RANGES
	/* note: the #if straddles the if/else chain's braces on purpose */
	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) {
		if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag)
		    || size >= VM_LARGE_FILE_THRESHOLD) {
			/*
			 * if the map doesn't have the large file range configured,
			 * the range will get resolved to the heap range in `vm_map_get_range`
			 */
			vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE;
		} else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
			vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
		}
#endif /* CONFIG_MAP_RANGES */
	}
}
24308
24309 /*
24310 * vm_map_entry_has_device_pager:
24311 * Check if the vm map entry specified by the virtual address has a device pager.
24312 * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24313 */
24314 boolean_t
vm_map_entry_has_device_pager(vm_map_t map,vm_map_offset_t vaddr)24315 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24316 {
24317 vm_map_entry_t entry;
24318 vm_object_t object;
24319 boolean_t result;
24320
24321 if (map == NULL) {
24322 return FALSE;
24323 }
24324
24325 vm_map_lock(map);
24326 while (TRUE) {
24327 if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24328 result = FALSE;
24329 break;
24330 }
24331 if (entry->is_sub_map) {
24332 // Check the submap
24333 vm_map_t submap = VME_SUBMAP(entry);
24334 assert(submap != NULL);
24335 vm_map_lock(submap);
24336 vm_map_unlock(map);
24337 map = submap;
24338 continue;
24339 }
24340 object = VME_OBJECT(entry);
24341 if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24342 result = TRUE;
24343 break;
24344 }
24345 result = FALSE;
24346 break;
24347 }
24348
24349 vm_map_unlock(map);
24350 return result;
24351 }
24352
24353
24354 #if MACH_ASSERT
24355
/*
 * Controls for vm_map_pmap_check_ledgers(): whether an imbalanced ledger
 * panics, and how many pages of drift to tolerate before panicking.
 * (Presumably set via boot-args — confirm where these are initialized.)
 */
extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

/*
 * LEDGER_DRIFT(x) declares the drift statistics kept for ledger entry x:
 * counts of pmaps seen over/under balance at teardown, the cumulative
 * drift in each direction, and the single worst drift observed.
 */
#define LEDGER_DRIFT(__LEDGER) \
	int __LEDGER##_over; \
	ledger_amount_t __LEDGER##_over_total; \
	ledger_amount_t __LEDGER##_over_max; \
	int __LEDGER##_under; \
	ledger_amount_t __LEDGER##_under_total; \
	ledger_amount_t __LEDGER##_under_max

/*
 * Global (MACH_ASSERT-only) accumulator of ledger drift across all pmaps
 * checked at destruction time; updated by vm_map_pmap_check_ledgers().
 */
struct {
	uint64_t num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint_total);
} pmap_ledgers_drift;
24405
/*
 * vm_map_pmap_check_ledgers:
 *	audit every VM-related ledger entry of a dying pmap.  Any nonzero
 *	balance is printed, folded into the global pmap_ledgers_drift
 *	statistics, and — depending on the entry's panic-on-negative flag
 *	and the pmap_ledgers_panic{,_leeway} controls — may panic.
 *
 *	pid/procname identify the owning process for the log messages.
 */
void
vm_map_pmap_check_ledgers(
	pmap_t pmap,
	ledger_t ledger,
	int pid,
	char *procname)
{
	ledger_amount_t bal;
	boolean_t do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * Check one ledger entry: fetch its balance and panic-on-negative flag,
 * decide whether the imbalance warrants a panic (either the flag is set,
 * or the drift exceeds pmap_ledgers_panic_leeway pages in either
 * direction), log it, and accumulate over/under drift statistics.
 * Reads/writes `bal` and `do_panic` from the enclosing scope.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER) \
MACRO_BEGIN \
	int panic_on_negative = TRUE; \
	ledger_get_balance(ledger, \
	    task_ledgers.__LEDGER, \
	    &bal); \
	ledger_get_panic_on_negative(ledger, \
	    task_ledgers.__LEDGER, \
	    &panic_on_negative); \
	if (bal != 0) { \
		if (panic_on_negative || \
		    (pmap_ledgers_panic && \
		    pmap_ledgers_panic_leeway > 0 && \
		    (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \
		    bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
			do_panic = TRUE; \
		} \
		printf("LEDGER BALANCE proc %d (%s) " \
		    "\"%s\" = %lld\n", \
		    pid, procname, #__LEDGER, bal); \
		if (bal > 0) { \
			pmap_ledgers_drift.__LEDGER##_over++; \
			pmap_ledgers_drift.__LEDGER##_over_total += bal; \
			if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
				pmap_ledgers_drift.__LEDGER##_over_max = bal; \
			} \
		} else if (bal < 0) { \
			pmap_ledgers_drift.__LEDGER##_under++; \
			pmap_ledgers_drift.__LEDGER##_under_total += bal; \
			if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
				pmap_ledgers_drift.__LEDGER##_under_max = bal; \
			} \
		} \
	} \
MACRO_END

	/* one check per entry tracked in pmap_ledgers_drift */
	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint_total);

	if (do_panic) {
		/* pmap_ledgers_panic selects panic vs. log-only on imbalance */
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
24501
24502 void
vm_map_pmap_set_process(vm_map_t map,int pid,char * procname)24503 vm_map_pmap_set_process(
24504 vm_map_t map,
24505 int pid,
24506 char *procname)
24507 {
24508 pmap_set_process(vm_map_pmap(map), pid, procname);
24509 }
24510
24511 #endif /* MACH_ASSERT */
24512