1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_map.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Virtual memory mapping module.
64 */
65
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68
69 #include <vm/vm_options.h>
70
71 #include <libkern/OSAtomic.h>
72
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm_server.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90
91 #include <vm/cpm_internal.h>
92 #include <vm/memory_types.h>
93 #include <vm/vm_compressor_xnu.h>
94 #include <vm/vm_compressor_pager_internal.h>
95 #include <vm/vm_init_xnu.h>
96 #include <vm/vm_fault_internal.h>
97 #include <vm/vm_map_internal.h>
98 #include <vm/vm_object_internal.h>
99 #include <vm/vm_page_internal.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/pmap.h>
102 #include <vm/vm_kern_internal.h>
103 #include <ipc/ipc_port.h>
104 #include <kern/sched_prim.h>
105 #include <kern/misc_protos.h>
106
107 #include <mach/vm_map_server.h>
108 #include <mach/mach_host_server.h>
109 #include <vm/vm_memtag.h>
110 #include <vm/vm_protos_internal.h>
111 #include <vm/vm_purgeable_internal.h>
112
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_shared_region_internal.h>
115 #include <vm/vm_map_store_internal.h>
116 #include <vm/vm_memory_entry_xnu.h>
117 #include <vm/memory_object_internal.h>
118 #include <vm/vm_memory_entry.h>
119 #include <vm/vm_sanitize_internal.h>
120 #if DEVELOPMENT || DEBUG
121 #include <vm/vm_compressor_info.h>
122 #endif /* DEVELOPMENT || DEBUG */
123 #include <san/kasan.h>
124
125 #include <sys/resource.h>
126 #include <sys/random.h>
127 #include <sys/codesign.h>
128 #include <sys/code_signing.h>
129 #include <sys/mman.h>
130 #include <sys/reboot.h>
131 #include <sys/kdebug_triage.h>
132 #include <sys/reason.h>
133
134 #include <libkern/section_keywords.h>
135
136 #if DEVELOPMENT || DEBUG
137 extern int proc_selfcsflags(void);
138 int vm_log_xnu_user_debug = 0;
139 int panic_on_unsigned_execute = 0;
140 int panic_on_mlock_failure = 0;
141 #endif /* DEVELOPMENT || DEBUG */
142
143 #if DEVELOPMENT || DEBUG
144 int debug4k_filter = 0;
145 char debug4k_proc_name[1024] = "";
146 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
147 int debug4k_panic_on_misaligned_sharing = 0;
148 const char *debug4k_category_name[] = {
149 "error", /* 0 */
150 "life", /* 1 */
151 "load", /* 2 */
152 "fault", /* 3 */
153 "copy", /* 4 */
154 "share", /* 5 */
155 "adjust", /* 6 */
156 "pmap", /* 7 */
157 "mementry", /* 8 */
158 "iokit", /* 9 */
159 "upl", /* 10 */
160 "exc", /* 11 */
161 "vfs" /* 12 */
162 };
163 #endif /* DEVELOPMENT || DEBUG */
164 int debug4k_no_cow_copyin = 0;
165
166
167 #if __arm64__
168 extern const int fourk_binary_compatibility_unsafe;
169 #endif /* __arm64__ */
170 extern int proc_selfpid(void);
171 extern char *proc_name_address(void *p);
172 extern const char *proc_best_name(struct proc *p);
173
174 #if VM_MAP_DEBUG_APPLE_PROTECT
175 int vm_map_debug_apple_protect = 0;
176 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
177 #if VM_MAP_DEBUG_FOURK
178 int vm_map_debug_fourk = 0;
179 #endif /* VM_MAP_DEBUG_FOURK */
180
181 #if DEBUG || DEVELOPMENT
182 static TUNABLE(bool, vm_map_executable_immutable,
183 "vm_map_executable_immutable", true);
184 #else
185 #define vm_map_executable_immutable true
186 #endif
187
188 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
189
190 extern u_int32_t random(void); /* from <libkern/libkern.h> */
191 /* Internal prototypes
192 */
193
194 typedef struct vm_map_zap {
195 vm_map_entry_t vmz_head;
196 vm_map_entry_t *vmz_tail;
197 } *vm_map_zap_t;
198
199 #define VM_MAP_ZAP_DECLARE(zap) \
200 struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
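/*
 * Illustrative (assumed) usage pattern: callers stack-allocate a zap list
 * with VM_MAP_ZAP_DECLARE() and hand it to vm_map_delete(), which chains
 * the entries it removes onto the list so they can be freed after the
 * map lock is dropped. The exact guard argument and the final free step
 * are elided here:
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *	(void)vm_map_delete(map, start, end, flags, guard, &zap);
 *	... unlock the map, then dispose of the zapped entries ...
 */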
201
202 extern kern_return_t vm_map_wire_external(
203 vm_map_t map,
204 vm_map_offset_ut start_u,
205 vm_map_offset_ut end_u,
206 vm_prot_ut prot_u,
207 boolean_t user_wire) __exported;
208
209 #if XNU_PLATFORM_MacOSX
210 extern /* exported via Private.<arch>.MacOSX.exports on macOS */
211 #else
212 static
213 #endif
214 kern_return_t vm_map_copyin_common(
215 vm_map_t src_map,
216 vm_map_address_ut src_addr,
217 vm_map_size_ut len,
218 boolean_t src_destroy,
219 boolean_t src_volatile,
220 vm_map_copy_t *copy_result, /* OUT */
221 boolean_t use_maxprot);
222
223 static vm_map_entry_t vm_map_entry_insert(
224 vm_map_t map,
225 vm_map_entry_t insp_entry,
226 vm_map_offset_t start,
227 vm_map_offset_t end,
228 vm_object_t object,
229 vm_object_offset_t offset,
230 vm_map_kernel_flags_t vmk_flags,
231 boolean_t needs_copy,
232 vm_prot_t cur_protection,
233 vm_prot_t max_protection,
234 vm_inherit_t inheritance,
235 boolean_t clear_map_aligned);
236
237 static void vm_map_simplify_range(
238 vm_map_t map,
239 vm_map_offset_t start,
240 vm_map_offset_t end); /* forward */
241
242 static boolean_t vm_map_range_check(
243 vm_map_t map,
244 vm_map_offset_t start,
245 vm_map_offset_t end,
246 vm_map_entry_t *entry);
247
248 static void vm_map_submap_pmap_clean(
249 vm_map_t map,
250 vm_map_offset_t start,
251 vm_map_offset_t end,
252 vm_map_t sub_map,
253 vm_map_offset_t offset);
254
255 static void vm_map_pmap_enter(
256 vm_map_t map,
257 vm_map_offset_t addr,
258 vm_map_offset_t end_addr,
259 vm_object_t object,
260 vm_object_offset_t offset,
261 vm_prot_t protection);
262
263 static void _vm_map_clip_end(
264 struct vm_map_header *map_header,
265 vm_map_entry_t entry,
266 vm_map_offset_t end);
267
268 static void _vm_map_clip_start(
269 struct vm_map_header *map_header,
270 vm_map_entry_t entry,
271 vm_map_offset_t start);
272
273 static kmem_return_t vm_map_delete(
274 vm_map_t map,
275 vm_map_offset_t start,
276 vm_map_offset_t end,
277 vmr_flags_t flags,
278 kmem_guard_t guard,
279 vm_map_zap_t zap);
280
281 static void vm_map_copy_insert(
282 vm_map_t map,
283 vm_map_entry_t after_where,
284 vm_map_copy_t copy);
285
286 static kern_return_t vm_map_copy_overwrite_unaligned(
287 vm_map_t dst_map,
288 vm_map_entry_t entry,
289 vm_map_copy_t copy,
290 vm_map_address_t start,
291 boolean_t discard_on_success);
292
293 static kern_return_t vm_map_copy_overwrite_aligned(
294 vm_map_t dst_map,
295 vm_map_entry_t tmp_entry,
296 vm_map_copy_t copy,
297 vm_map_offset_t start,
298 pmap_t pmap);
299
300 static kern_return_t vm_map_copyin_kernel_buffer(
301 vm_map_t src_map,
302 vm_map_address_t src_addr,
303 vm_map_size_t len,
304 boolean_t src_destroy,
305 vm_map_copy_t *copy_result); /* OUT */
306
307 static kern_return_t vm_map_copyout_kernel_buffer(
308 vm_map_t map,
309 vm_map_address_t *addr, /* IN/OUT */
310 vm_map_copy_t copy,
311 vm_map_size_t copy_size,
312 boolean_t overwrite,
313 boolean_t consume_on_success);
314
315 static void vm_map_fork_share(
316 vm_map_t old_map,
317 vm_map_entry_t old_entry,
318 vm_map_t new_map);
319
320 static boolean_t vm_map_fork_copy(
321 vm_map_t old_map,
322 vm_map_entry_t *old_entry_p,
323 vm_map_t new_map,
324 int vm_map_copyin_flags);
325
326 static kern_return_t vm_map_wire_nested(
327 vm_map_t map,
328 vm_map_offset_t start,
329 vm_map_offset_t end,
330 vm_prot_t caller_prot,
331 vm_tag_t tag,
332 boolean_t user_wire,
333 pmap_t map_pmap,
334 vm_map_offset_t pmap_addr,
335 ppnum_t *physpage_p);
336
337 static kern_return_t vm_map_unwire_nested(
338 vm_map_t map,
339 vm_map_offset_t start,
340 vm_map_offset_t end,
341 boolean_t user_wire,
342 pmap_t map_pmap,
343 vm_map_offset_t pmap_addr);
344
345 static kern_return_t vm_map_overwrite_submap_recurse(
346 vm_map_t dst_map,
347 vm_map_offset_t dst_addr,
348 vm_map_size_t dst_size);
349
350 static kern_return_t vm_map_copy_overwrite_nested(
351 vm_map_t dst_map,
352 vm_map_offset_t dst_addr,
353 vm_map_copy_t copy,
354 boolean_t interruptible,
355 pmap_t pmap,
356 boolean_t discard_on_success);
357
358 static kern_return_t vm_map_remap_extract(
359 vm_map_t map,
360 vm_map_offset_t addr,
361 vm_map_size_t size,
362 boolean_t copy,
363 vm_map_copy_t map_copy,
364 vm_prot_t *cur_protection,
365 vm_prot_t *max_protection,
366 vm_inherit_t inheritance,
367 vm_map_kernel_flags_t vmk_flags);
368
369 static void vm_map_region_look_for_page(
370 vm_map_t map,
371 vm_map_offset_t va,
372 vm_object_t object,
373 vm_object_offset_t offset,
374 int max_refcnt,
375 unsigned short depth,
376 vm_region_extended_info_t extended,
377 mach_msg_type_number_t count);
378
379 static boolean_t vm_map_region_has_obj_ref(
380 vm_map_entry_t entry,
381 vm_object_t object);
382
383
384 static kern_return_t vm_map_willneed(
385 vm_map_t map,
386 vm_map_offset_t start,
387 vm_map_offset_t end);
388
389 static kern_return_t vm_map_reuse_pages(
390 vm_map_t map,
391 vm_map_offset_t start,
392 vm_map_offset_t end);
393
394 static kern_return_t vm_map_reusable_pages(
395 vm_map_t map,
396 vm_map_offset_t start,
397 vm_map_offset_t end);
398
399 static kern_return_t vm_map_can_reuse(
400 vm_map_t map,
401 vm_map_offset_t start,
402 vm_map_offset_t end);
403
404 static kern_return_t vm_map_zero(
405 vm_map_t map,
406 vm_map_offset_t start,
407 vm_map_offset_t end);
408
409 static kern_return_t vm_map_random_address_for_size(
410 vm_map_t map,
411 vm_map_offset_t *address,
412 vm_map_size_t size,
413 vm_map_kernel_flags_t vmk_flags);
414
415
416 #if CONFIG_MAP_RANGES
417
418 static vm_map_range_id_t vm_map_user_range_resolve(
419 vm_map_t map,
420 mach_vm_address_t addr,
421 mach_vm_address_t size,
422 mach_vm_range_t range);
423
424 #endif /* CONFIG_MAP_RANGES */
425 #if MACH_ASSERT
426 static kern_return_t vm_map_pageout(
427 vm_map_t map,
428 vm_map_offset_t start,
429 vm_map_offset_t end);
430 #endif /* MACH_ASSERT */
431
432 kern_return_t vm_map_corpse_footprint_collect(
433 vm_map_t old_map,
434 vm_map_entry_t old_entry,
435 vm_map_t new_map);
436 void vm_map_corpse_footprint_collect_done(
437 vm_map_t new_map);
438 void vm_map_corpse_footprint_destroy(
439 vm_map_t map);
440 kern_return_t vm_map_corpse_footprint_query_page_info(
441 vm_map_t map,
442 vm_map_offset_t va,
443 int *disposition_p);
444 void vm_map_footprint_query_page_info(
445 vm_map_t map,
446 vm_map_entry_t map_entry,
447 vm_map_offset_t curr_s_offset,
448 int *disposition_p);
449
450 #if CONFIG_MAP_RANGES
451 static void vm_map_range_map_init(void);
452 #endif /* CONFIG_MAP_RANGES */
453
454 pid_t find_largest_process_vm_map_entries(void);
455
456 __attribute__((always_inline))
457 int
458 vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)
459 {
460 int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
461
462 /* in vmk flags the meaning of fixed/anywhere is inverted */
463 return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
464 }
465
466 __attribute__((always_inline, overloadable))
467 void
468 vm_map_kernel_flags_set_vmflags(
469 vm_map_kernel_flags_t *vmk_flags,
470 int vm_flags,
471 vm_tag_t vm_tag)
472 {
473 vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
474 vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
475 vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK);
476 vmk_flags->vm_tag = vm_tag;
477 }
478
479 __attribute__((always_inline, overloadable))
480 void
481 vm_map_kernel_flags_set_vmflags(
482 vm_map_kernel_flags_t *vmk_flags,
483 int vm_flags_and_tag)
484 {
485 vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
486 vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
487 vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK);
488 VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag);
489 }
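/*
 * Illustrative sketch (not part of the build): the fixed/anywhere sense is
 * inverted between the external VM_FLAGS_* encoding and vmf_fixed, so a
 * round trip through the setter and getter preserves the caller's flags:
 *
 *	vm_map_kernel_flags_t vmk = VM_MAP_KERNEL_FLAGS_NONE;
 *	vm_map_kernel_flags_set_vmflags(&vmk, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_NONE);
 *	assert(!vmk.vmf_fixed);
 *	assert(vm_map_kernel_flags_vmflags(vmk) == VM_FLAGS_ANYWHERE);
 */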
490
491 __attribute__((always_inline))
492 void
493 vm_map_kernel_flags_and_vmflags(
494 vm_map_kernel_flags_t *vmk_flags,
495 int vm_flags_mask)
496 {
497 /* this function doesn't handle the inverted FIXED/ANYWHERE */
498 assert(vm_flags_mask & VM_FLAGS_ANYWHERE);
499 vmk_flags->__vm_flags &= vm_flags_mask;
500 }
501
502 __attribute__((always_inline))
503 bool
504 vm_map_kernel_flags_check_vm_and_kflags(
505 vm_map_kernel_flags_t vmk_flags,
506 int vm_flags_mask)
507 {
508 return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0;
509 }
510
511 bool
512 vm_map_kernel_flags_check_vmflags(
513 vm_map_kernel_flags_t vmk_flags,
514 int vm_flags_mask)
515 {
516 int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
517
518 /* Note: up to 16 still has good calling conventions */
519 static_assert(sizeof(vm_map_kernel_flags_t) == 8);
520
521 #if DEBUG || DEVELOPMENT
522 /*
523 * All of this compiles to nothing if all checks pass.
524 */
525 #define check(field, value) ({ \
526 vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
527 fl.__vm_flags = (value); \
528 fl.field = 0; \
529 assert(fl.__vm_flags == 0); \
530 })
531
532 /* bits 0-7 */
533 check(vmf_fixed, VM_FLAGS_ANYWHERE); // kind of a lie this is inverted
534 check(vmf_purgeable, VM_FLAGS_PURGABLE);
535 check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
536 check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
537 check(vmf_no_cache, VM_FLAGS_NO_CACHE);
538 check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
539 check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
540 check(vmf_permanent, VM_FLAGS_PERMANENT);
541
542 /* bits 8-15 */
543 check(vmf_tpro, VM_FLAGS_TPRO);
544 check(vmf_overwrite, VM_FLAGS_OVERWRITE);
545
546 /* bits 16-23 */
547 check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
548 check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
549 check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);
550
551 {
552 vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;
553
554 /* check user tags will never clip */
555 fl.vm_tag = VM_MEMORY_COUNT - 1;
556 assert(fl.vm_tag == VM_MEMORY_COUNT - 1);
557
558 /* check kernel tags will never clip */
559 fl.vm_tag = VM_MAX_TAG_VALUE - 1;
560 assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
561 }
562
563
564 #undef check
565 #endif /* DEBUG || DEVELOPMENT */
566
567 return (vmflags & ~vm_flags_mask) == 0;
568 }
569
570 /*
571 * Macros to copy a vm_map_entry. We must be careful to correctly
572 * manage the wired page count. vm_map_entry_copy() creates a new
573 * map entry to the same memory - the wired count in the new entry
574 * must be set to zero. vm_map_entry_copy_full() creates a new
575 * entry that is identical to the old entry. This preserves the
576 * wire count; it's used for map splitting and zone changing in
577 * vm_map_copyout.
578 */
579
580 static inline void
581 vm_map_entry_copy_csm_assoc(
582 vm_map_t map __unused,
583 vm_map_entry_t new __unused,
584 vm_map_entry_t old __unused)
585 {
586 #if CODE_SIGNING_MONITOR
587 /* when code signing monitor is enabled, we want to reset on copy */
588 new->csm_associated = FALSE;
589 #else
590 /* when code signing monitor is not enabled, assert as a sanity check */
591 assert(new->csm_associated == FALSE);
592 #endif
593 #if DEVELOPMENT || DEBUG
594 if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
595 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug\n",
596 proc_selfpid(),
597 (get_bsdtask_info(current_task())
598 ? proc_name_address(get_bsdtask_info(current_task()))
599 : "?"),
600 __FUNCTION__, __LINE__,
601 map, new, new->vme_start, new->vme_end);
602 }
603 #endif /* DEVELOPMENT || DEBUG */
604 #if XNU_TARGET_OS_OSX
605 /*
606 * On macOS, entries with "vme_xnu_user_debug" can be copied during fork()
607 * and we want the child's entry to keep its "vme_xnu_user_debug" to avoid
608 * triggering CSM assertions when the child accesses its mapping.
609 */
610 #else /* XNU_TARGET_OS_OSX */
611 new->vme_xnu_user_debug = FALSE;
612 #endif /* XNU_TARGET_OS_OSX */
613 }
614
615 /*
616 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
617 * But for security reasons on some platforms, we don't want the
618 * new mapping to be "used for jit", so we reset the flag here.
619 */
620 static inline void
621 vm_map_entry_copy_code_signing(
622 vm_map_t map,
623 vm_map_entry_t new,
624 vm_map_entry_t old __unused)
625 {
626 if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
627 assert(new->used_for_jit == old->used_for_jit);
628 } else {
629 if (old->used_for_jit) {
630 DTRACE_VM3(cs_wx,
631 uint64_t, new->vme_start,
632 uint64_t, new->vme_end,
633 vm_prot_t, new->protection);
634 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
635 proc_selfpid(),
636 (get_bsdtask_info(current_task())
637 ? proc_name_address(get_bsdtask_info(current_task()))
638 : "?"),
639 __FUNCTION__,
640 "removing execute access");
641 new->protection &= ~VM_PROT_EXECUTE;
642 new->max_protection &= ~VM_PROT_EXECUTE;
643 }
644 new->used_for_jit = FALSE;
645 }
646 }
647
648 static inline void
649 vm_map_entry_copy_full(
650 vm_map_entry_t new,
651 vm_map_entry_t old)
652 {
653 #if MAP_ENTRY_CREATION_DEBUG
654 btref_put(new->vme_creation_bt);
655 btref_retain(old->vme_creation_bt);
656 #endif
657 #if MAP_ENTRY_INSERTION_DEBUG
658 btref_put(new->vme_insertion_bt);
659 btref_retain(old->vme_insertion_bt);
660 #endif
661 #if VM_BTLOG_TAGS
662 /* Discard the btref that might be in the new entry */
663 if (new->vme_kernel_object) {
664 btref_put(new->vme_tag_btref);
665 }
666 /* Retain the btref in the old entry to account for its copy */
667 if (old->vme_kernel_object) {
668 btref_retain(old->vme_tag_btref);
669 }
670 #endif /* VM_BTLOG_TAGS */
671 *new = *old;
672 }
673
674 static inline void
675 vm_map_entry_copy(
676 vm_map_t map,
677 vm_map_entry_t new,
678 vm_map_entry_t old)
679 {
680 vm_map_entry_copy_full(new, old);
681
682 new->is_shared = FALSE;
683 new->needs_wakeup = FALSE;
684 new->in_transition = FALSE;
685 new->wired_count = 0;
686 new->user_wired_count = 0;
687 new->vme_permanent = FALSE;
688 vm_map_entry_copy_code_signing(map, new, old);
689 vm_map_entry_copy_csm_assoc(map, new, old);
690 if (new->iokit_acct) {
691 assertf(!new->use_pmap, "old %p new %p\n", old, new);
692 new->iokit_acct = FALSE;
693 new->use_pmap = TRUE;
694 }
695 new->vme_resilient_codesign = FALSE;
696 new->vme_resilient_media = FALSE;
697 new->vme_atomic = FALSE;
698 new->vme_no_copy_on_read = FALSE;
699 }
700
701 /*
702 * Normal lock_read_to_write() returns FALSE/0 on failure.
703 * These functions evaluate to zero on success and non-zero value on failure.
704 */
705 __attribute__((always_inline))
706 int
707 vm_map_lock_read_to_write(vm_map_t map)
708 {
709 if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
710 DTRACE_VM(vm_map_lock_upgrade);
711 return 0;
712 }
713 return 1;
714 }
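/*
 * Typical (illustrative) use of the upgrade path: on failure the shared
 * lock has already been dropped by lck_rw_lock_shared_to_exclusive(), so
 * the caller must re-take the lock and re-validate any lookup it did:
 *
 *	if (vm_map_lock_read_to_write(map)) {
 *		vm_map_lock(map);
 *		... re-lookup the entry before continuing ...
 *	}
 */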
715
716 __attribute__((always_inline))
717 boolean_t
718 vm_map_try_lock(vm_map_t map)
719 {
720 if (lck_rw_try_lock_exclusive(&(map)->lock)) {
721 DTRACE_VM(vm_map_lock_w);
722 return TRUE;
723 }
724 return FALSE;
725 }
726
727 __attribute__((always_inline))
728 boolean_t
729 vm_map_try_lock_read(vm_map_t map)
730 {
731 if (lck_rw_try_lock_shared(&(map)->lock)) {
732 DTRACE_VM(vm_map_lock_r);
733 return TRUE;
734 }
735 return FALSE;
736 }
737
738 /*!
739 * @function kdp_vm_map_is_acquired_exclusive
740 *
741 * @abstract
742 * Checks if vm map is acquired exclusive.
743 *
744 * @discussion
745 * NOT SAFE: To be used only by kernel debugger.
746 *
747 * @param map map to check
748 *
749 * @returns TRUE if the map is acquired exclusively.
750 */
751 boolean_t
752 kdp_vm_map_is_acquired_exclusive(vm_map_t map)
753 {
754 return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
755 }
756
757 /*
758 * Routines to get the page size the caller should
759 * use while inspecting the target address space.
760 * Use the "_safely" variant if the caller is dealing with a user-provided
761 * array whose size depends on the page size, to avoid any overflow or
762 * underflow of a user-allocated buffer.
763 */
764 int
765 vm_self_region_page_shift_safely(
766 vm_map_t target_map)
767 {
768 int effective_page_shift = 0;
769
770 if (PAGE_SIZE == (4096)) {
771 /* x86_64 and 4k watches: always use 4k */
772 return PAGE_SHIFT;
773 }
774 /* did caller provide an explicit page size for this thread to use? */
775 effective_page_shift = thread_self_region_page_shift();
776 if (effective_page_shift) {
777 /* use the explicitly-provided page size */
778 return effective_page_shift;
779 }
780 /* no explicit page size: use the caller's page size... */
781 effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
782 if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
783 /* page size match: safe to use */
784 return effective_page_shift;
785 }
786 /* page size mismatch */
787 return -1;
788 }
789 int
790 vm_self_region_page_shift(
791 vm_map_t target_map)
792 {
793 int effective_page_shift;
794
795 effective_page_shift = vm_self_region_page_shift_safely(target_map);
796 if (effective_page_shift == -1) {
797 /* no safe value but OK to guess for caller */
798 effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
799 VM_MAP_PAGE_SHIFT(target_map));
800 }
801 return effective_page_shift;
802 }
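/*
 * Illustrative sketch: a caller sizing a user-provided per-page buffer
 * should prefer the "_safely" variant and fail cleanly on a page size
 * mismatch instead of guessing:
 *
 *	int shift = vm_self_region_page_shift_safely(target_map);
 *	if (shift == -1) {
 *		return KERN_INVALID_ARGUMENT;
 *	}
 *	pages = (unsigned int)(size >> shift);
 */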
803
804
805 /*
806 * Decide if we want to allow processes to execute from their data or stack areas.
807 * override_nx() returns true if we do. Data/stack execution can be enabled independently
808 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
809 * or allow_stack_exec to enable data execution for that type of data area for that particular
810 * ABI (or both by or'ing the flags together). These are initialized in the architecture
811 * specific pmap files since the default behavior varies according to architecture. The
812 * main reason it varies is because of the need to provide binary compatibility with old
813 * applications that were written before these restrictions came into being. In the old
814 * days, an app could execute anything it could read, but this has slowly been tightened
815 * up over time. The default behavior is:
816 *
817 * 32-bit PPC apps may execute from both stack and data areas
818 * 32-bit Intel apps may execute from data areas but not stack
819 * 64-bit PPC/Intel apps may not execute from either data or stack
820 *
821 * An application on any architecture may override these defaults by explicitly
822 * adding PROT_EXEC permission to the page in question with the mprotect(2)
823 * system call. This code here just determines what happens when an app tries to
824 * execute from a page that lacks execute permission.
825 *
826 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
827 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
828 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
829 * execution from data areas for a particular binary even if the arch normally permits it. As
830 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
831 * to support some complicated use cases, notably browsers with out-of-process plugins that
832 * are not all NX-safe.
833 */
834
835 extern int allow_data_exec, allow_stack_exec;
836
837 int
838 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
839 {
840 int current_abi;
841
842 if (map->pmap == kernel_pmap) {
843 return FALSE;
844 }
845
846 /*
847 * Determine if the app is running in 32 or 64 bit mode.
848 */
849
850 if (vm_map_is_64bit(map)) {
851 current_abi = VM_ABI_64;
852 } else {
853 current_abi = VM_ABI_32;
854 }
855
856 /*
857 * Determine if we should allow the execution based on whether it's a
858 * stack or data area and the current architecture.
859 */
860
861 if (user_tag == VM_MEMORY_STACK) {
862 return allow_stack_exec & current_abi;
863 }
864
865 return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
866 }
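/*
 * Hypothetical caller sketch (for illustration only): a fault path would
 * deny an instruction fetch from a non-executable page unless the NX
 * override applies to this map and memory tag:
 *
 *	if ((fault_type & VM_PROT_EXECUTE) &&
 *	    !(entry->protection & VM_PROT_EXECUTE) &&
 *	    !override_nx(map, VME_ALIAS(entry))) {
 *		return KERN_PROTECTION_FAILURE;
 *	}
 */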
867
868
869 /*
870 * Virtual memory maps provide for the mapping, protection,
871 * and sharing of virtual memory objects. In addition,
872 * this module provides for an efficient virtual copy of
873 * memory from one map to another.
874 *
875 * Synchronization is required prior to most operations.
876 *
877 * Maps consist of an ordered doubly-linked list of simple
878 * entries; a single hint is used to speed up lookups.
879 *
880 * Sharing maps have been deleted from this version of Mach.
881 * All shared objects are now mapped directly into the respective
882 * maps. This requires a change in the copy on write strategy;
883 * the asymmetric (delayed) strategy is used for shared temporary
884 * objects instead of the symmetric (shadow) strategy. All maps
885 * are now "top level" maps (either task map, kernel map or submap
886 * of the kernel map).
887 *
888 * Since portions of maps are specified by start/end addresses,
889 * which may not align with existing map entries, all
890 * routines merely "clip" entries to these start/end values.
891 * [That is, an entry is split into two, bordering at a
892 * start or end value.] Note that these clippings may not
893 * always be necessary (as the two resulting entries are then
894 * not changed); however, the clipping is done for convenience.
895 * No attempt is currently made to "glue back together" two
896 * abutting entries.
897 *
898 * The symmetric (shadow) copy strategy implements virtual copy
899 * by copying VM object references from one map to
900 * another, and then marking both regions as copy-on-write.
901 * It is important to note that only one writeable reference
902 * to a VM object region exists in any map when this strategy
903 * is used -- this means that shadow object creation can be
904 * delayed until a write operation occurs. The asymmetric (delayed)
905 * strategy allows multiple maps to have writeable references to
906 * the same region of a vm object, and hence cannot delay creating
907 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
908 * Copying of permanent objects is completely different; see
909 * vm_object_copy_strategically() in vm_object.c.
910 */
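/*
 * Clipping, illustrated (assumed caller pattern): to operate on just
 * [start, end) when an existing entry extends beyond that range, callers
 * look the entry up and clip it to the range of interest, as done in
 * vm_map_apple_protected() below:
 *
 *	if (vm_map_lookup_entry(map, start, &entry)) {
 *		vm_map_clip_start(map, entry, start);
 *		vm_map_clip_end(map, entry, end);
 *		... "entry" now begins at "start" and ends no later than "end" ...
 *	}
 */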
911
912 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
913
914 #define VM_MAP_ZONE_NAME "maps"
915 #define VM_MAP_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
916
917 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
918 #define VM_MAP_ENTRY_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
919
920 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
921 #define VM_MAP_HOLES_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
922
923 /*
924 * Asserts that a vm_map_copy object is coming from the
925 * vm_map_copy_zone to ensure that it isn't a fake constructed
926 * anywhere else.
927 */
928 void
929 vm_map_copy_require(struct vm_map_copy *copy)
930 {
931 zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
932 }
933
934 /*
935 * vm_map_require:
936 *
937 * Ensures that the argument is memory allocated from the genuine
938 * vm map zone. (See zone_id_require_allow_foreign).
939 */
940 void
941 vm_map_require(vm_map_t map)
942 {
943 zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
944 }
945
946 #define VM_MAP_EARLY_COUNT_MAX 16
947 static __startup_data vm_offset_t map_data;
948 static __startup_data vm_size_t map_data_size;
949 static __startup_data vm_offset_t kentry_data;
950 static __startup_data vm_size_t kentry_data_size;
951 static __startup_data vm_offset_t map_holes_data;
952 static __startup_data vm_size_t map_holes_data_size;
953 static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
954 static __startup_data uint32_t early_map_count;
955
956 #if XNU_TARGET_OS_OSX
957 #define NO_COALESCE_LIMIT ((1024 * 128) - 1)
958 #else /* XNU_TARGET_OS_OSX */
959 #define NO_COALESCE_LIMIT 0
960 #endif /* XNU_TARGET_OS_OSX */
961
962 /* Skip acquiring locks if we're in the midst of a kernel core dump */
963 unsigned int not_in_kdp = 1;
964
965 unsigned int vm_map_set_cache_attr_count = 0;
966
967 kern_return_t
968 vm_map_set_cache_attr(
969 vm_map_t map,
970 vm_map_offset_t va)
971 {
972 vm_map_entry_t map_entry;
973 vm_object_t object;
974 kern_return_t kr = KERN_SUCCESS;
975
976 vm_map_lock_read(map);
977
978 if (!vm_map_lookup_entry(map, va, &map_entry) ||
979 map_entry->is_sub_map) {
980 /*
981 * that memory is not properly mapped
982 */
983 kr = KERN_INVALID_ARGUMENT;
984 goto done;
985 }
986 object = VME_OBJECT(map_entry);
987
988 if (object == VM_OBJECT_NULL) {
989 /*
990 * there should be a VM object here at this point
991 */
992 kr = KERN_INVALID_ARGUMENT;
993 goto done;
994 }
995 vm_object_lock(object);
996 object->set_cache_attr = TRUE;
997 vm_object_unlock(object);
998
999 vm_map_set_cache_attr_count++;
1000 done:
1001 vm_map_unlock_read(map);
1002
1003 return kr;
1004 }
1005
1006
1007 #if CONFIG_CODE_DECRYPTION
1008 /*
1009 * vm_map_apple_protected:
1010 * This remaps the requested part of the object with an object backed by
1011 * the decrypting pager.
1012 * crypt_info contains entry points and session data for the crypt module.
1013 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
1014 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
1015 */
1016 kern_return_t
1017 vm_map_apple_protected(
1018 vm_map_t map,
1019 vm_map_offset_t start,
1020 vm_map_offset_t end,
1021 vm_object_offset_t crypto_backing_offset,
1022 struct pager_crypt_info *crypt_info,
1023 uint32_t cryptid)
1024 {
1025 boolean_t map_locked;
1026 kern_return_t kr;
1027 vm_map_entry_t map_entry;
1028 struct vm_map_entry tmp_entry;
1029 memory_object_t unprotected_mem_obj;
1030 vm_object_t protected_object;
1031 vm_map_offset_t map_addr;
1032 vm_map_offset_t start_aligned, end_aligned;
1033 vm_object_offset_t crypto_start, crypto_end;
1034 boolean_t cache_pager;
1035
1036 map_locked = FALSE;
1037 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1038
1039 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
1040 return KERN_INVALID_ADDRESS;
1041 }
1042 start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
1043 end_aligned = vm_map_round_page(end, PAGE_MASK_64);
1044 start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
1045 end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
1046
1047 #if __arm64__
1048 /*
1049 * "start" and "end" might be 4K-aligned but not 16K-aligned,
1050 * so we might have to loop and establish up to 3 mappings:
1051 *
1052 * + the first 16K-page, which might overlap with the previous
1053 * 4K-aligned mapping,
1054 * + the center,
1055 * + the last 16K-page, which might overlap with the next
1056 * 4K-aligned mapping.
1057 * Each of these mapping might be backed by a vnode pager (if
1058 * properly page-aligned) or a "fourk_pager", itself backed by a
1059 * vnode pager (if 4K-aligned but not page-aligned).
1060 */
1061 #endif /* __arm64__ */
1062
1063 map_addr = start_aligned;
1064 for (map_addr = start_aligned;
1065 map_addr < end;
1066 map_addr = tmp_entry.vme_end) {
1067 vm_map_lock(map);
1068 map_locked = TRUE;
1069
1070 /* lookup the protected VM object */
1071 if (!vm_map_lookup_entry(map,
1072 map_addr,
1073 &map_entry) ||
1074 map_entry->is_sub_map ||
1075 VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
1076 /* that memory is not properly mapped */
1077 kr = KERN_INVALID_ARGUMENT;
1078 goto done;
1079 }
1080
1081 /* ensure mapped memory is mapped as executable,
1082 * except for model decryption flow */
1083 if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
1084 !(map_entry->protection & VM_PROT_EXECUTE)) {
1085 kr = KERN_INVALID_ARGUMENT;
1086 goto done;
1087 }
1088
1089 /* get the protected object to be decrypted */
1090 protected_object = VME_OBJECT(map_entry);
1091 if (protected_object == VM_OBJECT_NULL) {
1092 /* there should be a VM object here at this point */
1093 kr = KERN_INVALID_ARGUMENT;
1094 goto done;
1095 }
1096 /* ensure protected object stays alive while map is unlocked */
1097 vm_object_reference(protected_object);
1098
1099 /* limit the map entry to the area we want to cover */
1100 vm_map_clip_start(map, map_entry, start_aligned);
1101 vm_map_clip_end(map, map_entry, end_aligned);
1102
1103 tmp_entry = *map_entry;
1104 map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
1105 vm_map_unlock(map);
1106 map_locked = FALSE;
1107
1108 /*
1109 * This map entry might be only partially encrypted
1110 * (if not fully "page-aligned").
1111 */
1112 crypto_start = 0;
1113 crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
1114 if (tmp_entry.vme_start < start) {
1115 if (tmp_entry.vme_start != start_aligned) {
1116 kr = KERN_INVALID_ADDRESS;
1117 vm_object_deallocate(protected_object);
1118 goto done;
1119 }
1120 crypto_start += (start - tmp_entry.vme_start);
1121 }
1122 if (tmp_entry.vme_end > end) {
1123 if (tmp_entry.vme_end != end_aligned) {
1124 kr = KERN_INVALID_ADDRESS;
1125 vm_object_deallocate(protected_object);
1126 goto done;
1127 }
1128 crypto_end -= (tmp_entry.vme_end - end);
1129 }
1130
1131 /*
1132 * This "extra backing offset" is needed to get the decryption
1133 * routine to use the right key. It adjusts for the possibly
1134 * relative offset of an interposed "4K" pager...
1135 */
1136 if (crypto_backing_offset == (vm_object_offset_t) -1) {
1137 crypto_backing_offset = VME_OFFSET(&tmp_entry);
1138 }
1139
1140 cache_pager = TRUE;
1141 #if XNU_TARGET_OS_OSX
1142 if (vm_map_is_alien(map)) {
1143 cache_pager = FALSE;
1144 }
1145 #endif /* XNU_TARGET_OS_OSX */
1146
1147 /*
1148 * Lookup (and create if necessary) the protected memory object
1149 * matching that VM object.
1150 * If successful, this also grabs a reference on the memory object,
1151 * to guarantee that it doesn't go away before we get a chance to map
1152 * it.
1153 */
1154 unprotected_mem_obj = apple_protect_pager_setup(
1155 protected_object,
1156 VME_OFFSET(&tmp_entry),
1157 crypto_backing_offset,
1158 crypt_info,
1159 crypto_start,
1160 crypto_end,
1161 cache_pager);
1162
1163 /* release extra ref on protected object */
1164 vm_object_deallocate(protected_object);
1165
1166 if (unprotected_mem_obj == NULL) {
1167 kr = KERN_FAILURE;
1168 goto done;
1169 }
1170
1171 /* can overwrite an immutable mapping */
1172 vm_map_kernel_flags_t vmk_flags = {
1173 .vmf_fixed = true,
1174 .vmf_overwrite = true,
1175 .vmkf_overwrite_immutable = true,
1176 };
1177 /* make the new mapping as "permanent" as the one it replaces */
1178 vmk_flags.vmf_permanent = tmp_entry.vme_permanent;
1179
1180 /* map this memory object in place of the current one */
1181 map_addr = tmp_entry.vme_start;
1182 kr = mach_vm_map_kernel(map,
1183 vm_sanitize_wrap_addr_ref(&map_addr),
1184 (tmp_entry.vme_end -
1185 tmp_entry.vme_start),
1186 (mach_vm_offset_t) 0,
1187 vmk_flags,
1188 (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1189 0,
1190 TRUE,
1191 tmp_entry.protection,
1192 tmp_entry.max_protection,
1193 tmp_entry.inheritance);
1194 assertf(kr == KERN_SUCCESS,
1195 "kr = 0x%x\n", kr);
1196 assertf(map_addr == tmp_entry.vme_start,
1197 "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1198 (uint64_t)map_addr,
1199 (uint64_t) tmp_entry.vme_start,
1200 &tmp_entry);
1201
1202 #if VM_MAP_DEBUG_APPLE_PROTECT
1203 if (vm_map_debug_apple_protect) {
1204 printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1205 " backing:[object:%p,offset:0x%llx,"
1206 "crypto_backing_offset:0x%llx,"
1207 "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1208 map,
1209 (uint64_t) map_addr,
1210 (uint64_t) (map_addr + (tmp_entry.vme_end -
1211 tmp_entry.vme_start)),
1212 unprotected_mem_obj,
1213 protected_object,
1214 VME_OFFSET(&tmp_entry),
1215 crypto_backing_offset,
1216 crypto_start,
1217 crypto_end);
1218 }
1219 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1220
1221 /*
1222 * Release the reference obtained by
1223 * apple_protect_pager_setup().
1224 * The mapping (if it succeeded) is now holding a reference on
1225 * the memory object.
1226 */
1227 memory_object_deallocate(unprotected_mem_obj);
1228 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1229
1230 /* continue with next map entry */
1231 crypto_backing_offset += (tmp_entry.vme_end -
1232 tmp_entry.vme_start);
1233 crypto_backing_offset -= crypto_start;
1234 }
1235 kr = KERN_SUCCESS;
1236
1237 done:
1238 if (map_locked) {
1239 vm_map_unlock(map);
1240 }
1241 return kr;
1242 }
1243 #endif /* CONFIG_CODE_DECRYPTION */
1244
1245
1246 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1247 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1248 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1249
1250 #if XNU_TARGET_OS_OSX
1251 #define MALLOC_NO_COW_DEFAULT 1
1252 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
1253 #else /* XNU_TARGET_OS_OSX */
1254 #define MALLOC_NO_COW_DEFAULT 1
1255 #define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
1256 #endif /* XNU_TARGET_OS_OSX */
1257 TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
1258 TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
1259 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1260 #if DEBUG
1261 int vm_check_map_sanity = 0;
1262 #endif
1263
1264 /*
1265 * vm_map_init:
1266 *
1267 * Initialize the vm_map module. Must be called before
1268 * any other vm_map routines.
1269 *
1270 * Map and entry structures are allocated from zones -- we must
1271 * initialize those zones.
1272 *
1273 * There are three zones of interest:
1274 *
1275 * vm_map_zone: used to allocate maps.
1276 * vm_map_entry_zone: used to allocate map entries.
1277 *
1278 * LP32:
1279 * vm_map_entry_reserved_zone: fallback zone for kernel map entries
1280 *
1281 * The kernel allocates map entries from a special zone that is initially
1282 * "crammed" with memory. It would be difficult (perhaps impossible) for
1283 * the kernel to allocate more memory to an entry zone when it became
1284 * empty since the very act of allocating memory implies the creation
1285 * of a new entry.
1286 */
1287 __startup_func
1288 void
1289 vm_map_init(void)
1290 {
1291
1292 #if MACH_ASSERT
1293 PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1294 sizeof(debug4k_filter));
1295 #endif /* MACH_ASSERT */
1296
1297 zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1298 VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1299
1300 /*
1301 * Don't quarantine because we always need elements available
1302 * Disallow GC on this zone... to aid the GC.
1303 */
1304 zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1305 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1306 ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1307 z->z_elems_rsv = (uint16_t)(32 *
1308 (ml_early_cpu_max_number() + 1));
1309 });
1310
1311 zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1312 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1313 ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1314 z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1315 });
1316
1317 zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1318 ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1319
1320 /*
1321 * Add the stolen memory to zones, adjust zone size and stolen counts.
1322 */
1323 zone_cram_early(vm_map_zone, map_data, map_data_size);
1324 zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1325 zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1326 printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1327 zone_count_free(vm_map_zone),
1328 zone_count_free(vm_map_entry_zone),
1329 zone_count_free(vm_map_holes_zone));
1330
1331 /*
1332 * Since these are covered by zones, remove them from stolen page accounting.
1333 */
1334 VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1335
1336 #if VM_MAP_DEBUG_APPLE_PROTECT
1337 PE_parse_boot_argn("vm_map_debug_apple_protect",
1338 &vm_map_debug_apple_protect,
1339 sizeof(vm_map_debug_apple_protect));
1340 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1341 #if VM_MAP_DEBUG_APPLE_FOURK
1342 PE_parse_boot_argn("vm_map_debug_fourk",
1343 &vm_map_debug_fourk,
1344 sizeof(vm_map_debug_fourk));
1345 #endif /* VM_MAP_DEBUG_FOURK */
1346
1347 if (malloc_no_cow) {
1348 vm_memory_malloc_no_cow_mask = 0ULL;
1349 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1350 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1351 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1352 #if XNU_TARGET_OS_OSX
1353 /*
1354 * On macOS, keep copy-on-write for MALLOC_LARGE because
1355 * realloc() may use vm_copy() to transfer the old contents
1356 * to the new location.
1357 */
1358 #else /* XNU_TARGET_OS_OSX */
1359 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1360 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1361 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1362 #endif /* XNU_TARGET_OS_OSX */
1363 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1364 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1365 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1366 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1367 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1368 PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1369 &vm_memory_malloc_no_cow_mask,
1370 sizeof(vm_memory_malloc_no_cow_mask));
1371 }
1372
1373 #if CONFIG_MAP_RANGES
1374 vm_map_range_map_init();
1375 #endif /* CONFIG_MAP_RANGES */
1376
1377 #if DEBUG
1378 PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1379 if (vm_check_map_sanity) {
1380 kprintf("VM sanity checking enabled\n");
1381 } else {
1382 kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1383 }
1384 #endif /* DEBUG */
1385
1386 #if DEVELOPMENT || DEBUG
1387 PE_parse_boot_argn("panic_on_unsigned_execute",
1388 &panic_on_unsigned_execute,
1389 sizeof(panic_on_unsigned_execute));
1390 PE_parse_boot_argn("panic_on_mlock_failure",
1391 &panic_on_mlock_failure,
1392 sizeof(panic_on_mlock_failure));
1393 #endif /* DEVELOPMENT || DEBUG */
1394 }
1395
1396 __startup_func
1397 static void
1398 vm_map_steal_memory(void)
1399 {
1400 /*
1401 * We need to reserve enough memory to support bootstrapping VM maps
1402 * and the zone subsystem.
1403 *
1404 * The VM Maps that need to function before zones can support them
1405 * are the ones registered with vm_map_will_allocate_early_map(),
1406 * which are:
1407 * - the kernel map
1408 * - the various submaps used by zones (pgz, meta, ...)
1409 *
1410 * We also need enough entries and holes to support them
1411 * until zone_metadata_init() is called, which is when
1412 * the zone allocator becomes capable of expanding dynamically.
1413 *
1414 * We need:
1415 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1416 * - To allow for 3-4 entries per map, but the kernel map
1417 * needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1418 * to describe the submaps, so double it (and make it 8x too)
1419 * - To allow for holes between entries,
1420 * hence needs the same budget as entries
1421 */
1422 map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1423 sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1424 VM_MAP_EARLY_COUNT_MAX);
1425
1426 kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1427 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1428 8 * VM_MAP_EARLY_COUNT_MAX);
1429
1430 map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1431 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1432 8 * VM_MAP_EARLY_COUNT_MAX);
1433
1434 /*
1435 * Steal a contiguous range of memory so that a simple range check
1436 * can validate early addresses being freed/crammed to these
1437 * zones
1438 */
1439 map_data = zone_early_mem_init(map_data_size + kentry_data_size +
1440 map_holes_data_size);
1441 kentry_data = map_data + map_data_size;
1442 map_holes_data = kentry_data + kentry_data_size;
1443 }
1444 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1445
1446 __startup_func
1447 static void
1448 vm_kernel_boostraped(void)
1449 {
1450 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1451 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1452 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1453
1454 printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1455 zone_count_free(vm_map_zone),
1456 zone_count_free(vm_map_entry_zone),
1457 zone_count_free(vm_map_holes_zone));
1458 }
1459 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1460
1461 void
1462 vm_map_disable_hole_optimization(vm_map_t map)
1463 {
1464 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1465
1466 if (map->holelistenabled) {
1467 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1468
1469 while (hole_entry != NULL) {
1470 next_hole_entry = hole_entry->vme_next;
1471
1472 hole_entry->vme_next = NULL;
1473 hole_entry->vme_prev = NULL;
1474 zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1475
1476 if (next_hole_entry == head_entry) {
1477 hole_entry = NULL;
1478 } else {
1479 hole_entry = next_hole_entry;
1480 }
1481 }
1482
1483 map->holes_list = NULL;
1484 map->holelistenabled = FALSE;
1485
1486 map->first_free = vm_map_first_entry(map);
1487 SAVE_HINT_HOLE_WRITE(map, NULL);
1488 }
1489 }
1490
1491 boolean_t
1492 vm_kernel_map_is_kernel(vm_map_t map)
1493 {
1494 return map->pmap == kernel_pmap;
1495 }
1496
1497 /*
1498 * vm_map_create:
1499 *
1500 * Creates and returns a new empty VM map with
1501 * the given physical map structure, and having
1502 * the given lower and upper address bounds.
1503 */
1504
1505 extern vm_map_t vm_map_create_external(
1506 pmap_t pmap,
1507 vm_map_offset_t min_off,
1508 vm_map_offset_t max_off,
1509 boolean_t pageable);
1510
1511 vm_map_t
1512 vm_map_create_external(
1513 pmap_t pmap,
1514 vm_map_offset_t min,
1515 vm_map_offset_t max,
1516 boolean_t pageable)
1517 {
1518 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1519
1520 if (pageable) {
1521 options |= VM_MAP_CREATE_PAGEABLE;
1522 }
1523 return vm_map_create_options(pmap, min, max, options);
1524 }
1525
1526 __startup_func
1527 void
1528 vm_map_will_allocate_early_map(vm_map_t *owner)
1529 {
1530 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1531 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1532 }
1533
1534 early_map_owners[early_map_count++] = owner;
1535 }
1536
1537 __startup_func
1538 void
1539 vm_map_relocate_early_maps(vm_offset_t delta)
1540 {
1541 for (uint32_t i = 0; i < early_map_count; i++) {
1542 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1543
1544 *early_map_owners[i] = (vm_map_t)(addr + delta);
1545 }
1546
1547 early_map_count = ~0u;
1548 }
1549
1550 /*
1551 * Routine: vm_map_relocate_early_elem
1552 *
1553 * Purpose:
1554 * Early zone elements are allocated in a temporary part
1555 * of the address space.
1556 *
1557 * Once the zones live in their final place, the early
1558 * VM maps, map entries and map holes need to be relocated.
1559 *
1560 * It involves rewriting any vm_map_t, vm_map_entry_t or
1561 * pointers to vm_map_links. Other pointers to other types
1562 * are fine.
1563 *
1564 * Fortunately, pointers to those types are self-contained
1565 * in those zones, _except_ for pointers to VM maps,
1566 * which are tracked during early boot and fixed with
1567 * vm_map_relocate_early_maps().
1568 */
1569 __startup_func
1570 void
1571 vm_map_relocate_early_elem(
1572 uint32_t zone_id,
1573 vm_offset_t new_addr,
1574 vm_offset_t delta)
1575 {
1576 #define relocate(type_t, field) ({ \
1577 typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
1578 if (*__field) { \
1579 *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
1580 } \
1581 })
1582
1583 switch (zone_id) {
1584 case ZONE_ID_VM_MAP:
1585 case ZONE_ID_VM_MAP_ENTRY:
1586 case ZONE_ID_VM_MAP_HOLES:
1587 break;
1588
1589 default:
1590 panic("Unexpected zone ID %d", zone_id);
1591 }
1592
1593 if (zone_id == ZONE_ID_VM_MAP) {
1594 relocate(vm_map_t, hdr.links.prev);
1595 relocate(vm_map_t, hdr.links.next);
1596 ((vm_map_t)new_addr)->pmap = kernel_pmap;
1597 #ifdef VM_MAP_STORE_USE_RB
1598 relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1599 #endif /* VM_MAP_STORE_USE_RB */
1600 relocate(vm_map_t, hint);
1601 relocate(vm_map_t, hole_hint);
1602 relocate(vm_map_t, first_free);
1603 return;
1604 }
1605
1606 relocate(struct vm_map_links *, prev);
1607 relocate(struct vm_map_links *, next);
1608
1609 if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1610 #ifdef VM_MAP_STORE_USE_RB
1611 relocate(vm_map_entry_t, store.entry.rbe_left);
1612 relocate(vm_map_entry_t, store.entry.rbe_right);
1613 relocate(vm_map_entry_t, store.entry.rbe_parent);
1614 #endif /* VM_MAP_STORE_USE_RB */
1615 if (((vm_map_entry_t)new_addr)->is_sub_map) {
1616 /* no object to relocate because we haven't made any */
1617 ((vm_map_entry_t)new_addr)->vme_submap +=
1618 delta >> VME_SUBMAP_SHIFT;
1619 }
1620 #if MAP_ENTRY_CREATION_DEBUG
1621 relocate(vm_map_entry_t, vme_creation_maphdr);
1622 #endif /* MAP_ENTRY_CREATION_DEBUG */
1623 }
1624
1625 #undef relocate
1626 }
1627
1628 vm_map_t
1629 vm_map_create_options(
1630 pmap_t pmap,
1631 vm_map_offset_t min,
1632 vm_map_offset_t max,
1633 vm_map_create_options_t options)
1634 {
1635 vm_map_t result;
1636
1637 #if DEBUG || DEVELOPMENT
1638 if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1639 if (early_map_count != ~0u && early_map_count !=
1640 zone_count_allocated(vm_map_zone) + 1) {
1641 panic("allocating %dth early map, owner not known",
1642 zone_count_allocated(vm_map_zone) + 1);
1643 }
1644 if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1645 panic("allocating %dth early map for non kernel pmap",
1646 early_map_count);
1647 }
1648 }
1649 #endif /* DEBUG || DEVELOPMENT */
1650
1651 result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1652
1653 vm_map_store_init(&result->hdr);
1654 result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1655 vm_map_set_page_shift(result, PAGE_SHIFT);
1656
1657 result->size_limit = RLIM_INFINITY; /* default unlimited */
1658 result->data_limit = RLIM_INFINITY; /* default unlimited */
1659 result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */
1660 os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1661 result->pmap = pmap;
1662 result->min_offset = min;
1663 result->max_offset = max;
1664 result->first_free = vm_map_to_entry(result);
1665 result->hint = vm_map_to_entry(result);
1666
1667 if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1668 assert(pmap == kernel_pmap);
1669 result->never_faults = true;
1670 }
1671
1672 /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1673 if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1674 result->has_corpse_footprint = true;
1675 } else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1676 struct vm_map_links *hole_entry;
1677
1678 hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
1679 hole_entry->start = min;
1680 /*
1681 * Holes can be used to track ranges all the way up to
1682 * MACH_VM_MAX_ADDRESS or more (e.g. kernel map).
1683 */
1684 hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1685 result->holes_list = result->hole_hint = hole_entry;
1686 hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1687 result->holelistenabled = true;
1688 }
1689
1690 vm_map_lock_init(result);
1691
1692 return result;
1693 }
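/*
 * Illustrative use (assumed caller context): creating a pageable map for
 * a user pmap over a given address range, with default options otherwise:
 *
 *	map = vm_map_create_options(pmap, min_off, max_off,
 *	    VM_MAP_CREATE_PAGEABLE);
 */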
1694
1695 /*
1696 * Adjusts a submap that was made by kmem_suballoc()
1697 * before it knew where it would be mapped,
1698 * so that it has the right min/max offsets.
1699 *
1700 * We do not need to hold any locks:
1701 * only the caller knows about this map,
1702 * and it is not published on any entry yet.
1703 */
1704 static void
1705 vm_map_adjust_offsets(
1706 vm_map_t map,
1707 vm_map_offset_t min_off,
1708 vm_map_offset_t max_off)
1709 {
1710 assert(map->min_offset == 0);
1711 assert(map->max_offset == max_off - min_off);
1712 assert(map->hdr.nentries == 0);
1713 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1714
1715 map->min_offset = min_off;
1716 map->max_offset = max_off;
1717
1718 if (map->holelistenabled) {
1719 struct vm_map_links *hole = map->holes_list;
1720
1721 hole->start = min_off;
1722 #if defined(__arm64__)
1723 hole->end = max_off;
1724 #else
1725 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1726 #endif
1727 }
1728 }
1729
1730
1731 vm_map_size_t
1732 vm_map_adjusted_size(vm_map_t map)
1733 {
1734 const struct vm_reserved_region *regions = NULL;
1735 size_t num_regions = 0;
1736 mach_vm_size_t reserved_size = 0, map_size = 0;
1737
1738 if (map == NULL || (map->size == 0)) {
1739 return 0;
1740 }
1741
1742 map_size = map->size;
1743
1744 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1745 /*
1746 * Either there are no special reserved regions, or the map is not
1747 * exotic, or the task is terminating and these special regions
1748 * might have already been deallocated.
1749 */
1750 return map_size;
1751 }
1752
1753 num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1754 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1755
1756 while (num_regions) {
1757 reserved_size += regions[--num_regions].vmrr_size;
1758 }
1759
1760 /*
1761 * There are a few places where the map is being switched out due to
1762 * 'termination' without that bit being set (e.g. exec and corpse purging).
1763 * In those cases, we could have the map's regions being deallocated on
1764 * a core while some accounting process is trying to get the map's size.
1765 * So this assert can't be enabled until all those places are uniform
1766 * in their use of the 'map->terminated' bit.
1767 *
1768 * assert(map_size >= reserved_size);
1769 */
1770
1771 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1772 }
1773
1774 /*
1775 * vm_map_entry_create: [ internal use only ]
1776 *
1777 * Allocates a VM map entry for insertion in the
1778 * given map (or map copy). No fields are filled.
1779 *
1780 * The VM entry will be zero initialized, except for:
1781 * - behavior set to VM_BEHAVIOR_DEFAULT
1782 * - inheritance set to VM_INHERIT_DEFAULT
1783 */
1784 #define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)
1785
1786 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
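/*
 * Both macros above funnel into _vm_map_entry_create(): map entries and
 * map-copy entries come from the same ZONE_ID_VM_MAP_ENTRY allocator.
 */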
1787
1788 static vm_map_entry_t
1789 _vm_map_entry_create(
1790 struct vm_map_header *map_header __unused)
1791 {
1792 vm_map_entry_t entry = NULL;
1793
1794 entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1795
1796 /*
1797 * Help the compiler with what we know to be true,
1798 * so that the subsequent bitfield initializations have good codegen.
1799 *
1800 * See rdar://87041299
1801 */
1802 __builtin_assume(entry->vme_object_value == 0);
1803 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1804 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1805
1806 static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1807 "VME_ALIAS_MASK covers tags");
1808
1809 static_assert(VM_BEHAVIOR_DEFAULT == 0,
1810 "can skip zeroing of the behavior field");
1811 entry->inheritance = VM_INHERIT_DEFAULT;
1812
1813 #if MAP_ENTRY_CREATION_DEBUG
1814 entry->vme_creation_maphdr = map_header;
1815 entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1816 BTREF_GET_NOWAIT);
1817 #endif
1818 return entry;
1819 }
1820
1821 /*
1822 * vm_map_entry_dispose: [ internal use only ]
1823 *
1824 * Inverse of vm_map_entry_create.
1825 *
1826 * The write map lock is held, so there is no need to
1827 * do anything special to ensure the correctness
1828 * of the stores.
1829 */
1830 static void
1831 vm_map_entry_dispose(
1832 vm_map_entry_t entry)
1833 {
1834 #if VM_BTLOG_TAGS
1835 if (entry->vme_kernel_object) {
1836 btref_put(entry->vme_tag_btref);
1837 }
1838 #endif /* VM_BTLOG_TAGS */
1839 #if MAP_ENTRY_CREATION_DEBUG
1840 btref_put(entry->vme_creation_bt);
1841 #endif
1842 #if MAP_ENTRY_INSERTION_DEBUG
1843 btref_put(entry->vme_insertion_bt);
1844 #endif
1845 zfree(vm_map_entry_zone, entry);
1846 }
1847
1848 #define vm_map_copy_entry_dispose(copy_entry) \
1849 vm_map_entry_dispose(copy_entry)
1850
1851 static vm_map_entry_t
1852 vm_map_zap_first_entry(
1853 vm_map_zap_t list)
1854 {
1855 return list->vmz_head;
1856 }
1857
1858 static vm_map_entry_t
1859 vm_map_zap_last_entry(
1860 vm_map_zap_t list)
1861 {
1862 assert(vm_map_zap_first_entry(list));
1863 return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1864 }
1865
1866 static void
1867 vm_map_zap_append(
1868 vm_map_zap_t list,
1869 vm_map_entry_t entry)
1870 {
1871 entry->vme_next = VM_MAP_ENTRY_NULL;
1872 *list->vmz_tail = entry;
1873 list->vmz_tail = &entry->vme_next;
1874 }
1875
1876 static vm_map_entry_t
1877 vm_map_zap_pop(
1878 vm_map_zap_t list)
1879 {
1880 vm_map_entry_t head = list->vmz_head;
1881
1882 if (head != VM_MAP_ENTRY_NULL &&
1883 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1884 list->vmz_tail = &list->vmz_head;
1885 }
1886
1887 return head;
1888 }
1889
1890 static void
1891 vm_map_zap_dispose(
1892 vm_map_zap_t list)
1893 {
1894 vm_map_entry_t entry;
1895
1896 while ((entry = vm_map_zap_pop(list))) {
1897 if (entry->is_sub_map) {
1898 vm_map_deallocate(VME_SUBMAP(entry));
1899 } else {
1900 vm_object_deallocate(VME_OBJECT(entry));
1901 }
1902
1903 vm_map_entry_dispose(entry);
1904 }
1905 }
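/*
 * The "zap" list above is a minimal singly-linked queue used to collect
 * entries pulled out of a map while its lock is held, so that the
 * objects and submaps they reference can be dropped after the map has
 * been unlocked.  The usual shape of a caller (see vm_map_destroy()
 * below) is, roughly:
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, KMEM_GUARD_NONE, &zap);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap);
 */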
1906
1907 #if MACH_ASSERT
1908 static boolean_t first_free_check = FALSE;
1909 boolean_t
1910 first_free_is_valid(
1911 vm_map_t map)
1912 {
1913 if (!first_free_check) {
1914 return TRUE;
1915 }
1916
1917 return first_free_is_valid_store( map );
1918 }
1919 #endif /* MACH_ASSERT */
1920
1921
1922 #define vm_map_copy_entry_link(copy, after_where, entry) \
1923 _vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1924
1925 #define vm_map_copy_entry_unlink(copy, entry) \
1926 _vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1927
1928 /*
1929 * vm_map_destroy:
1930 *
1931 * Actually destroy a map.
1932 */
1933 void
1934 vm_map_destroy(
1935 vm_map_t map)
1936 {
1937 /* final cleanup: this is not allowed to fail */
1938 vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1939
1940 VM_MAP_ZAP_DECLARE(zap);
1941
1942 vm_map_lock(map);
1943
1944 map->terminated = true;
1945 /* clean up regular map entries */
1946 (void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
1947 KMEM_GUARD_NONE, &zap);
1948 /* clean up leftover special mappings (commpage, GPU carveout, etc...) */
1949 (void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
1950 KMEM_GUARD_NONE, &zap);
1951
1952 vm_map_disable_hole_optimization(map);
1953 vm_map_corpse_footprint_destroy(map);
1954
1955 vm_map_unlock(map);
1956
1957 vm_map_zap_dispose(&zap);
1958
1959 assert(map->hdr.nentries == 0);
1960
1961 if (map->pmap) {
1962 pmap_destroy(map->pmap);
1963 }
1964
1965 lck_rw_destroy(&map->lock, &vm_map_lck_grp);
1966
1967 #if CONFIG_MAP_RANGES
1968 kfree_data(map->extra_ranges,
1969 map->extra_ranges_count * sizeof(struct vm_map_user_range));
1970 #endif
1971
1972 zfree_id(ZONE_ID_VM_MAP, map);
1973 }
1974
1975 /*
1976 * Returns pid of the task with the largest number of VM map entries.
1977 * Used in the zone-map-exhaustion jetsam path.
1978 */
1979 pid_t
1980 find_largest_process_vm_map_entries(void)
1981 {
1982 pid_t victim_pid = -1;
1983 int max_vm_map_entries = 0;
1984 task_t task = TASK_NULL;
1985 queue_head_t *task_list = &tasks;
1986
1987 lck_mtx_lock(&tasks_threads_lock);
1988 queue_iterate(task_list, task, task_t, tasks) {
1989 if (task == kernel_task || !task->active) {
1990 continue;
1991 }
1992
1993 vm_map_t task_map = task->map;
1994 if (task_map != VM_MAP_NULL) {
1995 int task_vm_map_entries = task_map->hdr.nentries;
1996 if (task_vm_map_entries > max_vm_map_entries) {
1997 max_vm_map_entries = task_vm_map_entries;
1998 victim_pid = pid_from_task(task);
1999 }
2000 }
2001 }
2002 lck_mtx_unlock(&tasks_threads_lock);
2003
2004 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
2005 return victim_pid;
2006 }
2007
2008
2009 /*
2010 * vm_map_lookup_entry: [ internal use only ]
2011 *
2012 * Calls into the vm map store layer to find the map
2013 * entry containing (or immediately preceding) the
2014 * specified address in the given map; the entry is returned
2015 * in the "entry" parameter. The boolean
2016 * result indicates whether the address is
2017 * actually contained in the map.
2018 */
2019 boolean_t
2020 vm_map_lookup_entry(
2021 vm_map_t map,
2022 vm_map_offset_t address,
2023 vm_map_entry_t *entry) /* OUT */
2024 {
2025 bool result = false;
2026
2027 #if CONFIG_KERNEL_TAGGING
2028 if (VM_KERNEL_ADDRESS(address)) {
2029 address = vm_memtag_canonicalize_address(address);
2030 }
2031 #endif /* CONFIG_KERNEL_TAGGING */
2032
2033 #if CONFIG_PROB_GZALLOC
2034 if (map->pmap == kernel_pmap) {
2035 assertf(!pgz_owned(address),
2036 "it is the responsibility of callers to unguard PGZ addresses");
2037 }
2038 #endif /* CONFIG_PROB_GZALLOC */
2039 result = vm_map_store_lookup_entry( map, address, entry );
2040
2041 return result;
2042 }
2043
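/*
 * vm_map_lookup_entry_or_next:
 *
 * Like vm_map_lookup_entry(), but when the address is not contained in
 * any entry, "*entry" is advanced to the next entry (the first one
 * starting above the address) instead of being left at the preceding
 * one.  The boolean result still reports whether the address was
 * contained.
 */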
2044 boolean_t
2045 vm_map_lookup_entry_or_next(
2046 vm_map_t map,
2047 vm_map_offset_t address,
2048 vm_map_entry_t *entry) /* OUT */
2049 {
2050 if (vm_map_lookup_entry(map, address, entry)) {
2051 return true;
2052 }
2053
2054 *entry = (*entry)->vme_next;
2055 return false;
2056 }
2057
2058 #if CONFIG_PROB_GZALLOC
2059 boolean_t
2060 vm_map_lookup_entry_allow_pgz(
2061 vm_map_t map,
2062 vm_map_offset_t address,
2063 vm_map_entry_t *entry) /* OUT */
2064 {
2065 #if CONFIG_KERNEL_TAGGING
2066 if (VM_KERNEL_ADDRESS(address)) {
2067 address = vm_memtag_canonicalize_address(address);
2068 }
2069 #endif /* CONFIG_KERNEL_TAGGING */
2070
2071 return vm_map_store_lookup_entry( map, address, entry );
2072 }
2073 #endif /* CONFIG_PROB_GZALLOC */
2074
2075 /*
2076 * Routine: vm_map_range_invalid_panic
2077 * Purpose:
2078 * Panic on detection of an invalid range id.
2079 */
2080 __abortlike
2081 static void
2082 vm_map_range_invalid_panic(
2083 vm_map_t map,
2084 vm_map_range_id_t range_id)
2085 {
2086 panic("invalid range ID (%u) for map %p", range_id, map);
2087 }
2088
2089 /*
2090 * Routine: vm_map_get_range
2091 * Purpose:
2092 * Adjust bounds based on security policy.
2093 */
2094 static struct mach_vm_range
2095 vm_map_get_range(
2096 vm_map_t map,
2097 vm_map_address_t *address,
2098 vm_map_kernel_flags_t *vmk_flags,
2099 vm_map_size_t size,
2100 bool *is_ptr)
2101 {
2102 struct mach_vm_range effective_range = {};
2103 vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
2104
2105 if (map == kernel_map) {
2106 effective_range = kmem_ranges[range_id];
2107
2108 if (startup_phase >= STARTUP_SUB_KMEM) {
2109 /*
2110 * The hint provided by the caller is zeroed because the range is
2111 * restricted to a subset of the entire kernel_map VA, which could put
2112 * the hint outside the range and cause vm_map_store_find_space to fail.
2113 */
2114 *address = 0ull;
2115 /*
2116 * Ensure that range_id passed in by the caller is within meaningful
2117 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
2118 * to fail as the corresponding range is invalid. Range id larger than
2119 * KMEM_RANGE_ID_MAX will lead to an OOB access.
2120 */
2121 if ((range_id == KMEM_RANGE_ID_NONE) ||
2122 (range_id > KMEM_RANGE_ID_MAX)) {
2123 vm_map_range_invalid_panic(map, range_id);
2124 }
2125
2126 /*
2127 * Pointer ranges use kmem_locate_space to do allocations.
2128 *
2129 * Non pointer fronts look like [ Small | Large | Permanent ]
2130 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
2131 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
2132 * use the entire range.
2133 */
2134 if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2135 *is_ptr = true;
2136 } else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2137 effective_range = kmem_large_ranges[range_id];
2138 }
2139 }
2140 #if CONFIG_MAP_RANGES
2141 } else if (map->uses_user_ranges) {
2142 switch (range_id) {
2143 case UMEM_RANGE_ID_DEFAULT:
2144 effective_range = map->default_range;
2145 break;
2146 case UMEM_RANGE_ID_HEAP:
2147 effective_range = map->data_range;
2148 break;
2149 case UMEM_RANGE_ID_LARGE_FILE:
2150 if (map->large_file_range.min_address != map->large_file_range.max_address) {
2151 /* large file range is configured and should be used */
2152 effective_range = map->large_file_range;
2153 } else {
2154 /*
2155 * the user asking for this user range might not have the
2156 * permissions to use the large file range (i.e., it doesn't
2157 * hold the correct entitlement), so we give it the data range
2158 * instead
2159 */
2160 effective_range = map->data_range;
2161 }
2162 break;
2163 case UMEM_RANGE_ID_FIXED:
2164 /*
2165 * "anywhere" allocations with an address in the "FIXED"
2166 * range make no sense, so leave the range empty
2167 */
2168 break;
2169
2170 default:
2171 vm_map_range_invalid_panic(map, range_id);
2172 }
2173 #endif /* CONFIG_MAP_RANGES */
2174 } else {
2175 /*
2176 * If the minimum is 0, bump it up by PAGE_SIZE. We want to limit
2177 * allocations of PAGEZERO to explicit requests, since its
2178 * normal use is to catch dereferences of NULL. Many
2179 * applications also treat pointers with a value of 0 as
2180 * special, and suddenly having address 0 contain usable
2181 * memory would tend to confuse those applications.
2182 */
2183 effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2184 effective_range.max_address = map->max_offset;
2185 }
2186
2187 return effective_range;
2188 }
2189
2190 kern_return_t
2191 vm_map_locate_space_anywhere(
2192 vm_map_t map,
2193 vm_map_size_t size,
2194 vm_map_offset_t mask,
2195 vm_map_kernel_flags_t vmk_flags,
2196 vm_map_offset_t *start_inout,
2197 vm_map_entry_t *entry_out)
2198 {
2199 struct mach_vm_range effective_range = {};
2200 vm_map_size_t guard_offset;
2201 vm_map_offset_t hint, limit;
2202 vm_map_entry_t entry;
2203 bool is_kmem_ptr_range = false;
2204
2205 /*
2206 * Only supported by vm_map_enter() with a fixed address.
2207 */
2208 assert(!vmk_flags.vmf_fixed);
2209 assert(!vmk_flags.vmkf_beyond_max);
2210
2211 if (__improbable(map->wait_for_space)) {
2212 /*
2213 * support for "wait_for_space" is minimal;
2214 * its only consumer is the ipc_kernel_copy_map.
2215 */
2216 assert(!map->holelistenabled &&
2217 !vmk_flags.vmkf_last_free &&
2218 !vmk_flags.vmkf_keep_map_locked &&
2219 !vmk_flags.vmkf_map_jit &&
2220 !vmk_flags.vmf_random_addr &&
2221 *start_inout <= map->min_offset);
2222 } else if (vmk_flags.vmkf_last_free) {
2223 assert(!vmk_flags.vmkf_map_jit &&
2224 !vmk_flags.vmf_random_addr);
2225 }
2226
2227 if (vmk_flags.vmkf_guard_before) {
2228 guard_offset = VM_MAP_PAGE_SIZE(map);
2229 assert(size > guard_offset);
2230 size -= guard_offset;
2231 } else {
2232 assert(size != 0);
2233 guard_offset = 0;
2234 }
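/*
 * At this point "size" is the payload being placed and "guard_offset"
 * is the extra page requested via vmkf_guard_before, if any; both are
 * handed separately to vm_map_store_find_space() below.
 */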
2235
2236 /*
2237 * Validate range_id from flags and get associated range
2238 */
2239 effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2240 &is_kmem_ptr_range);
2241
2242 if (is_kmem_ptr_range) {
2243 return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2244 vmk_flags.vmkf_last_free, start_inout, entry_out);
2245 }
2246
2247 #if XNU_TARGET_OS_OSX
2248 if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2249 assert(map != kernel_map);
2250 effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2251 }
2252 #endif /* XNU_TARGET_OS_OSX */
2253
2254 again:
2255 if (vmk_flags.vmkf_last_free) {
2256 hint = *start_inout;
2257
2258 if (hint == 0 || hint > effective_range.max_address) {
2259 hint = effective_range.max_address;
2260 }
2261 if (hint <= effective_range.min_address) {
2262 return KERN_NO_SPACE;
2263 }
2264 limit = effective_range.min_address;
2265 } else {
2266 hint = *start_inout;
2267
2268 if (vmk_flags.vmkf_map_jit) {
2269 if (map->jit_entry_exists &&
2270 !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2271 return KERN_INVALID_ARGUMENT;
2272 }
2273 if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2274 vmk_flags.vmf_random_addr = true;
2275 }
2276 }
2277
2278 if (vmk_flags.vmf_random_addr) {
2279 kern_return_t kr;
2280
2281 kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2282 if (kr != KERN_SUCCESS) {
2283 return kr;
2284 }
2285 }
2286 #if __x86_64__
2287 else if ((hint == 0 || hint == vm_map_min(map)) &&
2288 !map->disable_vmentry_reuse &&
2289 map->vmmap_high_start != 0) {
2290 hint = map->vmmap_high_start;
2291 }
2292 #endif /* __x86_64__ */
2293
2294 if (hint < effective_range.min_address) {
2295 hint = effective_range.min_address;
2296 }
2297 if (effective_range.max_address <= hint) {
2298 return KERN_NO_SPACE;
2299 }
2300
2301 limit = effective_range.max_address;
2302 }
2303 entry = vm_map_store_find_space(map,
2304 hint, limit, vmk_flags.vmkf_last_free,
2305 guard_offset, size, mask,
2306 start_inout);
2307
2308 if (__improbable(entry == NULL)) {
2309 if (map->wait_for_space &&
2310 guard_offset + size <=
2311 effective_range.max_address - effective_range.min_address) {
2312 assert_wait((event_t)map, THREAD_ABORTSAFE);
2313 vm_map_unlock(map);
2314 thread_block(THREAD_CONTINUE_NULL);
2315 vm_map_lock(map);
2316 goto again;
2317 }
2318 return KERN_NO_SPACE;
2319 }
2320
2321 if (entry_out) {
2322 *entry_out = entry;
2323 }
2324 return KERN_SUCCESS;
2325 }
2326
2327 /*!
2328 * @function vm_map_locate_space_fixed()
2329 *
2330 * @brief
2331 * Locate (no reservation) a range in the specified VM map at a fixed address.
2332 *
2333 * @param map the map to scan for memory, must be locked.
2334 * @param start the fixed address trying to be reserved
2335 * @param size the size of the allocation to make.
2336 * @param mask an alignment mask the allocation must respect.
2337 * @param vmk_flags the vm map kernel flags to influence this call.
2338 * vmk_flags.vmf_anywhere must not be set.
2339 * @param entry_out the entry right before the hole.
2340 * @param zap_list a zap list of entries to clean up after the call.
2341 *
2342 * @returns
2343 * - KERN_SUCCESS in case of success and no conflicting entry is found,
2344 * in which case entry_out is set to the entry before the hole.
2345 *
2346 * - KERN_MEMORY_PRESENT if a conflicting entry is found,
2347 * in which case entry_out is set to the conflicting entry;
2348 * the callers MUST handle this error explicitly.
2349 *
2350 * - KERN_INVALID_ADDRESS if the specified @c start or @c size
2351 * would result in a mapping outside of the map.
2352 *
2353 * - KERN_NO_SPACE for various cases of unrecoverable failures.
2354 */
2355 static kern_return_t
2356 vm_map_locate_space_fixed(
2357 vm_map_t map,
2358 vm_map_offset_t start,
2359 vm_map_size_t size,
2360 vm_map_offset_t mask,
2361 vm_map_kernel_flags_t vmk_flags,
2362 vm_map_entry_t *entry_out,
2363 vm_map_zap_t zap_list)
2364 {
2365 vm_map_offset_t effective_min_offset, effective_max_offset;
2366 vm_map_entry_t entry;
2367 vm_map_offset_t end;
2368
2369 assert(vmk_flags.vmf_fixed);
2370
2371 effective_min_offset = map->min_offset;
2372 effective_max_offset = map->max_offset;
2373
2374 if (vmk_flags.vmkf_beyond_max) {
2375 /*
2376 * Allow an insertion beyond the map's max offset.
2377 */
2378 effective_max_offset = 0x00000000FFFFF000ULL;
2379 if (vm_map_is_64bit(map)) {
2380 effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2381 }
2382 #if XNU_TARGET_OS_OSX
2383 } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2384 effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2385 #endif /* XNU_TARGET_OS_OSX */
2386 }
2387
2388 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2389 !vmk_flags.vmf_overwrite &&
2390 map->pmap == kernel_pmap &&
2391 vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
2392 /*
2393 * Force realloc() to switch to a new allocation,
2394 * to prevent 4k-fragmented virtual ranges.
2395 */
2396 // DEBUG4K_ERROR("no realloc in place");
2397 return KERN_NO_SPACE;
2398 }
2399
2400 /*
2401 * Verify that:
2402 * the address doesn't itself violate
2403 * the mask requirement.
2404 */
2405
2406 if ((start & mask) != 0) {
2407 return KERN_NO_SPACE;
2408 }
2409
2410 #if CONFIG_MAP_RANGES
2411 if (map->uses_user_ranges) {
2412 struct mach_vm_range r;
2413
2414 vm_map_user_range_resolve(map, start, 1, &r);
2415 if (r.max_address == 0) {
2416 return KERN_INVALID_ADDRESS;
2417 }
2418 effective_min_offset = r.min_address;
2419 effective_max_offset = r.max_address;
2420 }
2421 #endif /* CONFIG_MAP_RANGES */
2422
2423 if ((startup_phase >= STARTUP_SUB_KMEM) && !vmk_flags.vmkf_submap &&
2424 (map == kernel_map)) {
2425 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2426 effective_min_offset = r->min_address;
2427 effective_max_offset = r->max_address;
2428 }
2429
2430 /*
2431 * ... the address is within bounds
2432 */
2433
2434 end = start + size;
2435
2436 if ((start < effective_min_offset) ||
2437 (end > effective_max_offset) ||
2438 (start >= end)) {
2439 return KERN_INVALID_ADDRESS;
2440 }
2441
2442 if (vmk_flags.vmf_overwrite) {
2443 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2444 kern_return_t remove_kr;
2445
2446 /*
2447 * Fixed mapping and "overwrite" flag: attempt to
2448 * remove all existing mappings in the specified
2449 * address range, saving them in our "zap_list".
2450 *
2451 * This avoids releasing the VM map lock in
2452 * vm_map_entry_delete() and allows atomicity
2453 * when we want to replace some mappings with a new one.
2454 * It also allows us to restore the old VM mappings if the
2455 * new mapping fails.
2456 */
2457 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2458
2459 if (vmk_flags.vmkf_overwrite_immutable) {
2460 /* we can overwrite immutable mappings */
2461 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2462 }
2463 if (vmk_flags.vmkf_remap_prot_copy) {
2464 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2465 }
2466 remove_kr = vm_map_delete(map, start, end, remove_flags,
2467 KMEM_GUARD_NONE, zap_list).kmr_return;
2468 if (remove_kr) {
2469 /* XXX FBDP restore zap_list? */
2470 return remove_kr;
2471 }
2472 }
2473
2474 /*
2475 * ... the starting address isn't allocated
2476 */
2477
2478 if (vm_map_lookup_entry(map, start, &entry)) {
2479 *entry_out = entry;
2480 return KERN_MEMORY_PRESENT;
2481 }
2482
2483 /*
2484 * ... the next region doesn't overlap the
2485 * end point.
2486 */
2487
2488 if ((entry->vme_next != vm_map_to_entry(map)) &&
2489 (entry->vme_next->vme_start < end)) {
2490 return KERN_NO_SPACE;
2491 }
2492
2493 *entry_out = entry;
2494 return KERN_SUCCESS;
2495 }
2496
2497 /*
2498 * Routine: vm_map_find_space
2499 * Purpose:
2500 * Allocate a range in the specified virtual address map,
2501 * returning the entry allocated for that range.
2502 * Used by kmem_alloc, etc.
2503 *
2504 * The map must NOT be locked. It will be returned locked
2505 * on KERN_SUCCESS, unlocked on failure.
2506 *
2507 * If an entry is allocated, the object/offset fields
2508 * are initialized to zero.
2509 */
2510 kern_return_t
2511 vm_map_find_space(
2512 vm_map_t map,
2513 vm_map_offset_t hint_address,
2514 vm_map_size_t size,
2515 vm_map_offset_t mask,
2516 vm_map_kernel_flags_t vmk_flags,
2517 vm_map_entry_t *o_entry) /* OUT */
2518 {
2519 vm_map_entry_t new_entry, entry;
2520 kern_return_t kr;
2521
2522 if (size == 0) {
2523 return KERN_INVALID_ARGUMENT;
2524 }
2525
2526 new_entry = vm_map_entry_create(map);
2527 new_entry->use_pmap = true;
2528 new_entry->protection = VM_PROT_DEFAULT;
2529 new_entry->max_protection = VM_PROT_ALL;
2530
2531 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2532 new_entry->map_aligned = true;
2533 }
2534 if (vmk_flags.vmf_permanent) {
2535 new_entry->vme_permanent = true;
2536 }
2537
2538 vm_map_lock(map);
2539
2540 kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
2541 &hint_address, &entry);
2542 if (kr != KERN_SUCCESS) {
2543 vm_map_unlock(map);
2544 vm_map_entry_dispose(new_entry);
2545 return kr;
2546 }
2547 new_entry->vme_start = hint_address;
2548 new_entry->vme_end = hint_address + size;
2549
2550 /*
2551 * At this point,
2552 *
2553 * - new_entry's "vme_start" and "vme_end" should define
2554 * the endpoints of the available new range,
2555 *
2556 * - and "entry" should refer to the region before
2557 * the new range,
2558 *
2559 * - and the map should still be locked.
2560 */
2561
2562 assert(page_aligned(new_entry->vme_start));
2563 assert(page_aligned(new_entry->vme_end));
2564 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2565 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2566
2567 /*
2568 * Insert the new entry into the list
2569 */
2570
2571 vm_map_store_entry_link(map, entry, new_entry,
2572 VM_MAP_KERNEL_FLAGS_NONE);
2573 map->size += size;
2574
2575 /*
2576 * Update the lookup hint
2577 */
2578 SAVE_HINT_MAP_WRITE(map, new_entry);
2579
2580 *o_entry = new_entry;
2581 return KERN_SUCCESS;
2582 }
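/*
 * Note for callers (kmem_alloc and friends): on KERN_SUCCESS the map is
 * left locked and "*o_entry" has a zeroed object/offset, so the typical
 * caller installs its own VM object in the entry and then unlocks the
 * map.
 */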
2583
2584 int vm_map_pmap_enter_print = FALSE;
2585 int vm_map_pmap_enter_enable = FALSE;
2586
2587 /*
2588 * Routine: vm_map_pmap_enter [internal only]
2589 *
2590 * Description:
2591 * Force pages from the specified object to be entered into
2592 * the pmap at the specified address if they are present.
2593 * As soon as a page is not found in the object, the scan ends.
2594 *
2595 * Returns:
2596 * Nothing.
2597 *
2598 * In/out conditions:
2599 * The source map should not be locked on entry.
2600 */
2601 __unused static void
2602 vm_map_pmap_enter(
2603 vm_map_t map,
2604 vm_map_offset_t addr,
2605 vm_map_offset_t end_addr,
2606 vm_object_t object,
2607 vm_object_offset_t offset,
2608 vm_prot_t protection)
2609 {
2610 int type_of_fault;
2611 kern_return_t kr;
2612 uint8_t object_lock_type = 0;
2613 struct vm_object_fault_info fault_info = {};
2614
2615 if (map->pmap == 0) {
2616 return;
2617 }
2618
2619 assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2620
2621 while (addr < end_addr) {
2622 vm_page_t m;
2623
2624
2625 /*
2626 * TODO:
2627 * From vm_map_enter(), we come into this function without the map
2628 * lock held or the object lock held.
2629 * We haven't taken a reference on the object either.
2630 * We should do a proper lookup on the map to make sure
2631 * that things are sane before we go locking objects that
2632 * could have been deallocated from under us.
2633 */
2634
2635 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2636 vm_object_lock(object);
2637
2638 m = vm_page_lookup(object, offset);
2639
2640 if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
2641 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2642 vm_object_unlock(object);
2643 return;
2644 }
2645
2646 if (vm_map_pmap_enter_print) {
2647 printf("vm_map_pmap_enter:");
2648 printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2649 map, (unsigned long long)addr, object, (unsigned long long)offset);
2650 }
2651 type_of_fault = DBG_CACHE_HIT_FAULT;
2652 kr = vm_fault_enter(m, map->pmap,
2653 addr,
2654 PAGE_SIZE, 0,
2655 protection, protection,
2656 VM_PAGE_WIRED(m),
2657 FALSE, /* change_wiring */
2658 VM_KERN_MEMORY_NONE, /* tag - not wiring */
2659 &fault_info,
2660 NULL, /* need_retry */
2661 &type_of_fault,
2662 &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2663
2664 vm_object_unlock(object);
2665
2666 offset += PAGE_SIZE_64;
2667 addr += PAGE_SIZE;
2668 }
2669 }
2670
2671 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
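/*
 * vm_map_random_address_for_size:
 *
 * Pick a randomized, map-page-aligned start address for a mapping of
 * "size" bytes.  Up to MAX_TRIES_TO_GET_RANDOM_ADDRESS candidates are
 * drawn from the map's effective range; the first candidate that is not
 * already mapped and whose trailing hole is large enough is returned.
 * Fails with KERN_NO_SPACE if no suitable candidate is found.
 */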
2672 static kern_return_t
2673 vm_map_random_address_for_size(
2674 vm_map_t map,
2675 vm_map_offset_t *address,
2676 vm_map_size_t size,
2677 vm_map_kernel_flags_t vmk_flags)
2678 {
2679 kern_return_t kr = KERN_SUCCESS;
2680 int tries = 0;
2681 vm_map_offset_t random_addr = 0;
2682 vm_map_offset_t hole_end;
2683
2684 vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL;
2685 vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL;
2686 vm_map_size_t vm_hole_size = 0;
2687 vm_map_size_t addr_space_size;
2688 bool is_kmem_ptr;
2689 struct mach_vm_range effective_range;
2690
2691 effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2692 &is_kmem_ptr);
2693
2694 addr_space_size = effective_range.max_address - effective_range.min_address;
2695 if (size >= addr_space_size) {
2696 return KERN_NO_SPACE;
2697 }
2698 addr_space_size -= size;
2699
2700 assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2701
2702 while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2703 if (startup_phase < STARTUP_SUB_ZALLOC) {
2704 random_addr = (vm_map_offset_t)early_random();
2705 } else {
2706 random_addr = (vm_map_offset_t)random();
2707 }
2708 random_addr <<= VM_MAP_PAGE_SHIFT(map);
2709 random_addr = vm_map_trunc_page(
2710 effective_range.min_address + (random_addr % addr_space_size),
2711 VM_MAP_PAGE_MASK(map));
2712
2713 #if CONFIG_PROB_GZALLOC
2714 if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2715 continue;
2716 }
2717 #endif /* CONFIG_PROB_GZALLOC */
2718
2719 if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2720 if (prev_entry == vm_map_to_entry(map)) {
2721 next_entry = vm_map_first_entry(map);
2722 } else {
2723 next_entry = prev_entry->vme_next;
2724 }
2725 if (next_entry == vm_map_to_entry(map)) {
2726 hole_end = vm_map_max(map);
2727 } else {
2728 hole_end = next_entry->vme_start;
2729 }
2730 vm_hole_size = hole_end - random_addr;
2731 if (vm_hole_size >= size) {
2732 *address = random_addr;
2733 break;
2734 }
2735 }
2736 tries++;
2737 }
2738
2739 if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2740 kr = KERN_NO_SPACE;
2741 }
2742 return kr;
2743 }
2744
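/*
 * vm_memory_malloc_no_cow:
 *
 * Returns TRUE when the given VM alias (tag) is flagged in
 * vm_memory_malloc_no_cow_mask, i.e. malloc-backed memory that should
 * not be made copy-on-write; only honored when the "malloc_no_cow"
 * policy is enabled.
 */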
2745 static boolean_t
2746 vm_memory_malloc_no_cow(
2747 int alias)
2748 {
2749 uint64_t alias_mask;
2750
2751 if (!malloc_no_cow) {
2752 return FALSE;
2753 }
2754 if (alias > 63) {
2755 return FALSE;
2756 }
2757 alias_mask = 1ULL << alias;
2758 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2759 return TRUE;
2760 }
2761 return FALSE;
2762 }
2763
2764 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2765 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2766 /*
2767 * Routine: vm_map_enter
2768 *
2769 * Description:
2770 * Allocate a range in the specified virtual address map.
2771 * The resulting range will refer to memory defined by
2772 * the given memory object and offset into that object.
2773 *
2774 * Arguments are as defined in the vm_map call.
2775 */
2776 static unsigned int vm_map_enter_restore_successes = 0;
2777 static unsigned int vm_map_enter_restore_failures = 0;
2778 kern_return_t
2779 vm_map_enter(
2780 vm_map_t map,
2781 vm_map_offset_t *address, /* IN/OUT */
2782 vm_map_size_t size,
2783 vm_map_offset_t mask,
2784 vm_map_kernel_flags_t vmk_flags,
2785 vm_object_t object,
2786 vm_object_offset_t offset,
2787 boolean_t needs_copy,
2788 vm_prot_t cur_protection,
2789 vm_prot_t max_protection,
2790 vm_inherit_t inheritance)
2791 {
2792 vm_map_entry_t entry, new_entry;
2793 vm_map_offset_t start, tmp_start, tmp_offset;
2794 vm_map_offset_t end, tmp_end;
2795 vm_map_offset_t tmp2_start, tmp2_end;
2796 vm_map_offset_t step;
2797 kern_return_t result = KERN_SUCCESS;
2798 bool map_locked = FALSE;
2799 bool pmap_empty = TRUE;
2800 bool new_mapping_established = FALSE;
2801 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2802 const bool anywhere = !vmk_flags.vmf_fixed;
2803 const bool purgable = vmk_flags.vmf_purgeable;
2804 const bool no_cache = vmk_flags.vmf_no_cache;
2805 const bool is_submap = vmk_flags.vmkf_submap;
2806 const bool permanent = vmk_flags.vmf_permanent;
2807 const bool no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2808 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
2809 const bool iokit_acct = vmk_flags.vmkf_iokit_acct;
2810 const bool resilient_codesign = vmk_flags.vmf_resilient_codesign;
2811 const bool resilient_media = vmk_flags.vmf_resilient_media;
2812 const bool entry_for_tpro = vmk_flags.vmf_tpro;
2813 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
2814 const vm_tag_t alias = vmk_flags.vm_tag;
2815 vm_tag_t user_alias;
2816 kern_return_t kr;
2817 bool clear_map_aligned = FALSE;
2818 vm_map_size_t chunk_size = 0;
2819 vm_object_t caller_object;
2820 VM_MAP_ZAP_DECLARE(zap_old_list);
2821 VM_MAP_ZAP_DECLARE(zap_new_list);
2822
2823 caller_object = object;
2824
2825 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2826
2827 if (vmk_flags.vmf_4gb_chunk) {
2828 #if defined(__LP64__)
2829 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2830 #else /* __LP64__ */
2831 chunk_size = ANON_CHUNK_SIZE;
2832 #endif /* __LP64__ */
2833 } else {
2834 chunk_size = ANON_CHUNK_SIZE;
2835 }
2836
2837
2838
2839 if (superpage_size) {
2840 if (object != VM_OBJECT_NULL) {
2841 /* caller can't provide their own VM object */
2842 return KERN_INVALID_ARGUMENT;
2843 }
2844 switch (superpage_size) {
2845 /*
2846 * Note that the current implementation only supports
2847 * a single size for superpages, SUPERPAGE_SIZE, per
2848 * architecture. As soon as more sizes are meant
2849 * to be supported, SUPERPAGE_SIZE has to be replaced
2850 * with a lookup of the size depending on superpage_size.
2851 */
2852 #ifdef __x86_64__
2853 case SUPERPAGE_SIZE_ANY:
2854 /* handle it like 2 MB and round up to page size */
2855 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2856 OS_FALLTHROUGH;
2857 case SUPERPAGE_SIZE_2MB:
2858 break;
2859 #endif
2860 default:
2861 return KERN_INVALID_ARGUMENT;
2862 }
2863 mask = SUPERPAGE_SIZE - 1;
2864 if (size & (SUPERPAGE_SIZE - 1)) {
2865 return KERN_INVALID_ARGUMENT;
2866 }
2867 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2868 }
2869
2870
2871 if ((cur_protection & VM_PROT_WRITE) &&
2872 (cur_protection & VM_PROT_EXECUTE) &&
2873 #if XNU_TARGET_OS_OSX
2874 map->pmap != kernel_pmap &&
2875 (cs_process_global_enforcement() ||
2876 (vmk_flags.vmkf_cs_enforcement_override
2877 ? vmk_flags.vmkf_cs_enforcement
2878 : (vm_map_cs_enforcement(map)
2879 #if __arm64__
2880 || !VM_MAP_IS_EXOTIC(map)
2881 #endif /* __arm64__ */
2882 ))) &&
2883 #endif /* XNU_TARGET_OS_OSX */
2884 #if CODE_SIGNING_MONITOR
2885 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2886 #endif
2887 (VM_MAP_POLICY_WX_FAIL(map) ||
2888 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2889 !entry_for_jit) {
2890 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2891
2892 DTRACE_VM3(cs_wx,
2893 uint64_t, 0,
2894 uint64_t, 0,
2895 vm_prot_t, cur_protection);
2896 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2897 proc_selfpid(),
2898 (get_bsdtask_info(current_task())
2899 ? proc_name_address(get_bsdtask_info(current_task()))
2900 : "?"),
2901 __FUNCTION__,
2902 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2903 cur_protection &= ~VM_PROT_EXECUTE;
2904 if (vm_protect_wx_fail) {
2905 return KERN_PROTECTION_FAILURE;
2906 }
2907 }
2908
2909 if (entry_for_jit
2910 && cur_protection != VM_PROT_ALL) {
2911 /*
2912 * Native macOS processes and all non-macOS processes are
2913 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2914 * the RWX requirement was not enforced, and thus, we must live
2915 * with our sins. We are now dealing with a JIT mapping without
2916 * RWX.
2917 *
2918 * We deal with these by letting the MAP_JIT stick in order
2919 * to avoid CS violations when these pages are mapped executable
2920 * down the line. In order to appease the page table monitor (you
2921 * know what I'm talking about), these pages will end up being
2922 * marked as XNU_USER_DEBUG, which will be allowed because we
2923 * don't enforce the code signing monitor on macOS systems. If
2924 * the user-space application ever changes permissions to RWX,
2925 * which they are allowed to since the mapping was originally
2926 * created with MAP_JIT, then they'll switch over to using the
2927 * XNU_USER_JIT type, and won't be allowed to downgrade any
2928 * more after that.
2929 *
2930 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
2931 * strictly disallowed.
2932 */
2933
2934 #if XNU_TARGET_OS_OSX
2935 /*
2936 * Continue to allow non-RWX JIT
2937 */
2938 #else
2939 /* non-macOS: reject JIT regions without RWX */
2940 DTRACE_VM3(cs_wx,
2941 uint64_t, 0,
2942 uint64_t, 0,
2943 vm_prot_t, cur_protection);
2944 printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
2945 proc_selfpid(),
2946 (get_bsdtask_info(current_task())
2947 ? proc_name_address(get_bsdtask_info(current_task()))
2948 : "?"),
2949 __FUNCTION__,
2950 cur_protection);
2951 return KERN_PROTECTION_FAILURE;
2952 #endif
2953 }
2954
2955 /*
2956 * If the task has requested executable lockdown,
2957 * deny any new executable mapping.
2958 */
2959 if (map->map_disallow_new_exec == TRUE) {
2960 if (cur_protection & VM_PROT_EXECUTE) {
2961 return KERN_PROTECTION_FAILURE;
2962 }
2963 }
2964
2965 if (resilient_codesign) {
2966 assert(!is_submap);
2967 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2968 if ((cur_protection | max_protection) & reject_prot) {
2969 return KERN_PROTECTION_FAILURE;
2970 }
2971 }
2972
2973 if (resilient_media) {
2974 assert(!is_submap);
2975 // assert(!needs_copy);
2976 if (object != VM_OBJECT_NULL &&
2977 !object->internal) {
2978 /*
2979 * This mapping is directly backed by an external
2980 * memory manager (e.g. a vnode pager for a file):
2981 * we would not have any safe place to inject
2982 * a zero-filled page if an actual page is not
2983 * available, without possibly impacting the actual
2984 * contents of the mapped object (e.g. the file),
2985 * so we can't provide any media resiliency here.
2986 */
2987 return KERN_INVALID_ARGUMENT;
2988 }
2989 }
2990
2991 if (entry_for_tpro) {
2992 /*
2993 * TPRO overrides the effective permissions of the region
2994 * and explicitly maps as RW. Ensure we have been passed
2995 * the expected permissions. We accept `cur_protections`
2996 * RO as that will be handled on fault.
2997 */
2998 if (!(max_protection & VM_PROT_READ) ||
2999 !(max_protection & VM_PROT_WRITE) ||
3000 !(cur_protection & VM_PROT_READ)) {
3001 return KERN_PROTECTION_FAILURE;
3002 }
3003
3004 /*
3005 * We can now downgrade the cur_protection to RO. This is a mild lie
3006 * to the VM layer. But TPRO will be responsible for toggling the
3007 * protections between RO/RW
3008 */
3009 cur_protection = VM_PROT_READ;
3010 }
3011
3012 if (is_submap) {
3013 vm_map_t submap;
3014 if (purgable) {
3015 /* submaps can not be purgeable */
3016 return KERN_INVALID_ARGUMENT;
3017 }
3018 if (object == VM_OBJECT_NULL) {
3019 /* submaps can not be created lazily */
3020 return KERN_INVALID_ARGUMENT;
3021 }
3022 submap = (vm_map_t) object;
3023 if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
3024 /* page size mismatch */
3025 return KERN_INVALID_ARGUMENT;
3026 }
3027 }
3028 if (vmk_flags.vmkf_already) {
3029 /*
3030 * VM_FLAGS_ALREADY says that it's OK if the same mapping
3031 * is already present. For it to be meaningful, the requested
3032 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
3033 * we shouldn't try to remove what was mapped there first
3034 * (!VM_FLAGS_OVERWRITE).
3035 */
3036 if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
3037 return KERN_INVALID_ARGUMENT;
3038 }
3039 }
3040
3041 if (size == 0 ||
3042 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
3043 *address = 0;
3044 return KERN_INVALID_ARGUMENT;
3045 }
3046
3047 if (map->pmap == kernel_pmap) {
3048 user_alias = VM_KERN_MEMORY_NONE;
3049 } else {
3050 user_alias = alias;
3051 }
3052
3053 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
3054 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
3055 }
3056
3057 #define RETURN(value) { result = value; goto BailOut; }
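/*
 * RETURN() records the result and jumps to the BailOut label below,
 * where the map is still locked and where resource-limit checks, pager
 * notification and failure handling take place.
 */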
3058
3059 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
3060 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
3061 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
3062 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
3063 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
3064 }
3065
3066 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3067 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
3068 /*
3069 * In most cases, the caller rounds the size up to the
3070 * map's page size.
3071 * If we get a size that is explicitly not map-aligned here,
3072 * we'll have to respect the caller's wish and mark the
3073 * mapping as "not map-aligned" to avoid tripping the
3074 * map alignment checks later.
3075 */
3076 clear_map_aligned = TRUE;
3077 }
3078 if (!anywhere &&
3079 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3080 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
3081 /*
3082 * We've been asked to map at a fixed address and that
3083 * address is not aligned to the map's specific alignment.
3084 * The caller should know what it's doing (i.e. most likely
3085 * mapping some fragmented copy map, transferring memory from
3086 * a VM map with a different alignment), so clear map_aligned
3087 * for this new VM map entry and proceed.
3088 */
3089 clear_map_aligned = TRUE;
3090 }
3091
3092 /*
3093 * Only zero-fill objects are allowed to be purgable.
3094 * LP64todo - limit purgable objects to 32-bits for now
3095 */
3096 if (purgable &&
3097 (offset != 0 ||
3098 (object != VM_OBJECT_NULL &&
3099 (object->vo_size != size ||
3100 object->purgable == VM_PURGABLE_DENY))
3101 #if __LP64__
3102 || size > ANON_MAX_SIZE
3103 #endif
3104 )) {
3105 return KERN_INVALID_ARGUMENT;
3106 }
3107
3108 vm_map_lock(map);
3109 map_locked = TRUE;
3110
3111 if (anywhere) {
3112 result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
3113 address, &entry);
3114 start = *address;
3115 } else {
3116 start = *address;
3117 result = vm_map_locate_space_fixed(map, start, size, mask,
3118 vmk_flags, &entry, &zap_old_list);
3119 }
3120
3121 end = start + size;
3122
3123 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
3124
3125 /*
3126 * Check if what's already there is what we want.
3127 */
3128 if (result == KERN_MEMORY_PRESENT) {
3129 assert(!anywhere);
3130 if (!(vmk_flags.vmkf_already)) {
3131 RETURN(KERN_NO_SPACE);
3132 }
3133 tmp_start = start;
3134 tmp_offset = offset;
3135 if (entry->vme_start < start) {
3136 tmp_start -= start - entry->vme_start;
3137 tmp_offset -= start - entry->vme_start;
3138 }
3139 for (; entry->vme_start < end;
3140 entry = entry->vme_next) {
3141 /*
3142 * Check if the mapping's attributes
3143 * match the existing map entry.
3144 */
3145 if (entry == vm_map_to_entry(map) ||
3146 entry->vme_start != tmp_start ||
3147 entry->is_sub_map != is_submap ||
3148 VME_OFFSET(entry) != tmp_offset ||
3149 entry->needs_copy != needs_copy ||
3150 entry->protection != cur_protection ||
3151 entry->max_protection != max_protection ||
3152 entry->inheritance != inheritance ||
3153 entry->iokit_acct != iokit_acct ||
3154 VME_ALIAS(entry) != alias) {
3155 /* not the same mapping ! */
3156 RETURN(KERN_NO_SPACE);
3157 }
3158 /*
3159 * Check if the same object is being mapped.
3160 */
3161 if (is_submap) {
3162 if (VME_SUBMAP(entry) !=
3163 (vm_map_t) object) {
3164 /* not the same submap */
3165 RETURN(KERN_NO_SPACE);
3166 }
3167 } else {
3168 if (VME_OBJECT(entry) != object) {
3169 /* not the same VM object... */
3170 vm_object_t obj2;
3171
3172 obj2 = VME_OBJECT(entry);
3173 if ((obj2 == VM_OBJECT_NULL || obj2->internal) &&
3174 (object == VM_OBJECT_NULL || object->internal)) {
3175 /*
3176 * ... but both are
3177 * anonymous memory,
3178 * so equivalent.
3179 */
3180 } else {
3181 RETURN(KERN_NO_SPACE);
3182 }
3183 }
3184 }
3185
3186 tmp_offset += entry->vme_end - entry->vme_start;
3187 tmp_start += entry->vme_end - entry->vme_start;
3188 if (entry->vme_end >= end) {
3189 /* reached the end of our mapping */
3190 break;
3191 }
3192 }
3193 /* it all matches: let's use what's already there ! */
3194 RETURN(KERN_MEMORY_PRESENT);
3195 }
3196
3197 if (result != KERN_SUCCESS) {
3198 goto BailOut;
3199 }
3200
3201
3202 /*
3203 * At this point,
3204 * "start" and "end" should define the endpoints of the
3205 * available new range, and
3206 * "entry" should refer to the region before the new
3207 * range, and
3208 *
3209 * the map should be locked.
3210 */
3211
3212 /*
3213 * See whether we can avoid creating a new entry (and object) by
3214 * extending one of our neighbors. [So far, we only attempt to
3215 * extend from below.] Note that we can never extend/join
3216 * purgable objects because they need to remain distinct
3217 * entities in order to implement their "volatile object"
3218 * semantics.
3219 */
3220
3221 if (purgable ||
3222 entry_for_jit ||
3223 entry_for_tpro ||
3224 vm_memory_malloc_no_cow(user_alias)) {
3225 if (superpage_size) {
3226 /*
3227 * For "super page" allocations, we will allocate
3228 * special physically-contiguous VM objects later on,
3229 * so we should not have flags instructing us to create
3230 * a differently special VM object here.
3231 */
3232 RETURN(KERN_INVALID_ARGUMENT);
3233 }
3234
3235 if (object == VM_OBJECT_NULL) {
3236 assert(!superpage_size);
3237 object = vm_object_allocate(size);
3238 vm_object_lock(object);
3239 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3240 VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3241 if (malloc_no_cow_except_fork &&
3242 !purgable &&
3243 !entry_for_jit &&
3244 !entry_for_tpro &&
3245 vm_memory_malloc_no_cow(user_alias)) {
3246 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3247 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3248 }
3249 if (entry_for_jit) {
3250 object->vo_inherit_copy_none = true;
3251 }
3252 if (purgable) {
3253 task_t owner;
3254 VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3255 if (map->pmap == kernel_pmap) {
3256 /*
3257 * Purgeable mappings made in a kernel
3258 * map are "owned" by the kernel itself
3259 * rather than the current user task
3260 * because they're likely to be used by
3261 * more than this user task (see
3262 * execargs_purgeable_allocate(), for
3263 * example).
3264 */
3265 owner = kernel_task;
3266 } else {
3267 owner = current_task();
3268 }
3269 assert(object->vo_owner == NULL);
3270 assert(object->resident_page_count == 0);
3271 assert(object->wired_page_count == 0);
3272 vm_purgeable_nonvolatile_enqueue(object, owner);
3273 }
3274 vm_object_unlock(object);
3275 offset = (vm_object_offset_t)0;
3276 }
3277 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3278 /* no coalescing if address space uses sub-pages */
3279 } else if ((is_submap == FALSE) &&
3280 (object == VM_OBJECT_NULL) &&
3281 (entry != vm_map_to_entry(map)) &&
3282 (entry->vme_end == start) &&
3283 (!entry->is_shared) &&
3284 (!entry->is_sub_map) &&
3285 (!entry->in_transition) &&
3286 (!entry->needs_wakeup) &&
3287 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3288 (entry->protection == cur_protection) &&
3289 (entry->max_protection == max_protection) &&
3290 (entry->inheritance == inheritance) &&
3291 ((user_alias == VM_MEMORY_REALLOC) ||
3292 (VME_ALIAS(entry) == alias)) &&
3293 (entry->no_cache == no_cache) &&
3294 (entry->vme_permanent == permanent) &&
3295 /* no coalescing for immutable executable mappings */
3296 !((entry->protection & VM_PROT_EXECUTE) &&
3297 entry->vme_permanent) &&
3298 (!entry->superpage_size && !superpage_size) &&
3299 /*
3300 * No coalescing if not map-aligned, to avoid propagating
3301 * that condition any further than needed:
3302 */
3303 (!entry->map_aligned || !clear_map_aligned) &&
3304 (!entry->zero_wired_pages) &&
3305 (!entry->used_for_jit && !entry_for_jit) &&
3306 #if __arm64e__
3307 (!entry->used_for_tpro && !entry_for_tpro) &&
3308 #endif
3309 (!entry->csm_associated) &&
3310 (entry->iokit_acct == iokit_acct) &&
3311 (!entry->vme_resilient_codesign) &&
3312 (!entry->vme_resilient_media) &&
3313 (!entry->vme_atomic) &&
3314 (entry->vme_no_copy_on_read == no_copy_on_read) &&
3315
3316 ((entry->vme_end - entry->vme_start) + size <=
3317 (user_alias == VM_MEMORY_REALLOC ?
3318 ANON_CHUNK_SIZE :
3319 NO_COALESCE_LIMIT)) &&
3320
3321 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
3322 if (vm_object_coalesce(VME_OBJECT(entry),
3323 VM_OBJECT_NULL,
3324 VME_OFFSET(entry),
3325 (vm_object_offset_t) 0,
3326 (vm_map_size_t)(entry->vme_end - entry->vme_start),
3327 (vm_map_size_t)(end - entry->vme_end))) {
3328 /*
3329 * Coalesced the two objects - can extend
3330 * the previous map entry to include the
3331 * new range.
3332 */
3333 map->size += (end - entry->vme_end);
3334 assert(entry->vme_start < end);
3335 assert(VM_MAP_PAGE_ALIGNED(end,
3336 VM_MAP_PAGE_MASK(map)));
3337 if (__improbable(vm_debug_events)) {
3338 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3339 }
3340 entry->vme_end = end;
3341 if (map->holelistenabled) {
3342 vm_map_store_update_first_free(map, entry, TRUE);
3343 } else {
3344 vm_map_store_update_first_free(map, map->first_free, TRUE);
3345 }
3346 new_mapping_established = TRUE;
3347 RETURN(KERN_SUCCESS);
3348 }
3349 }
3350
3351 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3352 new_entry = NULL;
3353
3354 if (vmk_flags.vmkf_submap_adjust) {
3355 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3356 offset = start;
3357 }
3358
3359 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3360 tmp2_end = tmp2_start + step;
3361 /*
3362 * Create a new entry
3363 *
3364 * XXX FBDP
3365 * The reserved "page zero" in each process's address space can
3366 * be arbitrarily large. Splitting it into separate objects and
3367 * therefore different VM map entries serves no purpose and just
3368 * slows down operations on the VM map, so let's not split the
3369 * allocation into chunks if the max protection is NONE. That
3370 * memory should never be accessible, so it will never get to the
3371 * default pager.
3372 */
3373 tmp_start = tmp2_start;
3374 if (!is_submap &&
3375 object == VM_OBJECT_NULL &&
3376 size > chunk_size &&
3377 max_protection != VM_PROT_NONE &&
3378 superpage_size == 0) {
3379 tmp_end = tmp_start + chunk_size;
3380 } else {
3381 tmp_end = tmp2_end;
3382 }
3383 do {
3384 if (!is_submap &&
3385 object != VM_OBJECT_NULL &&
3386 object->internal &&
3387 offset + (tmp_end - tmp_start) > object->vo_size) {
3388 // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3389 DTRACE_VM5(vm_map_enter_overmap,
3390 vm_map_t, map,
3391 vm_map_address_t, tmp_start,
3392 vm_map_address_t, tmp_end,
3393 vm_object_offset_t, offset,
3394 vm_object_size_t, object->vo_size);
3395 }
3396 new_entry = vm_map_entry_insert(map,
3397 entry, tmp_start, tmp_end,
3398 object, offset, vmk_flags,
3399 needs_copy,
3400 cur_protection, max_protection,
3401 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3402 VM_INHERIT_NONE : inheritance),
3403 clear_map_aligned);
3404
3405 assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3406
3407 if (resilient_codesign) {
3408 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3409 if (!((cur_protection | max_protection) & reject_prot)) {
3410 new_entry->vme_resilient_codesign = TRUE;
3411 }
3412 }
3413
3414 if (resilient_media &&
3415 (object == VM_OBJECT_NULL ||
3416 object->internal)) {
3417 new_entry->vme_resilient_media = TRUE;
3418 }
3419
3420 assert(!new_entry->iokit_acct);
3421 if (!is_submap &&
3422 object != VM_OBJECT_NULL &&
3423 object->internal &&
3424 (object->purgable != VM_PURGABLE_DENY ||
3425 object->vo_ledger_tag)) {
3426 assert(new_entry->use_pmap);
3427 assert(!new_entry->iokit_acct);
3428 /*
3429 * Turn off pmap accounting since
3430 * purgeable (or tagged) objects have their
3431 * own ledgers.
3432 */
3433 new_entry->use_pmap = FALSE;
3434 } else if (!is_submap &&
3435 iokit_acct &&
3436 object != VM_OBJECT_NULL &&
3437 object->internal) {
3438 /* alternate accounting */
3439 assert(!new_entry->iokit_acct);
3440 assert(new_entry->use_pmap);
3441 new_entry->iokit_acct = TRUE;
3442 new_entry->use_pmap = FALSE;
3443 DTRACE_VM4(
3444 vm_map_iokit_mapped_region,
3445 vm_map_t, map,
3446 vm_map_offset_t, new_entry->vme_start,
3447 vm_map_offset_t, new_entry->vme_end,
3448 int, VME_ALIAS(new_entry));
3449 vm_map_iokit_mapped_region(
3450 map,
3451 (new_entry->vme_end -
3452 new_entry->vme_start));
3453 } else if (!is_submap) {
3454 assert(!new_entry->iokit_acct);
3455 assert(new_entry->use_pmap);
3456 }
3457
3458 if (is_submap) {
3459 vm_map_t submap;
3460 boolean_t submap_is_64bit;
3461 boolean_t use_pmap;
3462
3463 assert(new_entry->is_sub_map);
3464 assert(!new_entry->use_pmap);
3465 assert(!new_entry->iokit_acct);
3466 submap = (vm_map_t) object;
3467 submap_is_64bit = vm_map_is_64bit(submap);
3468 use_pmap = vmk_flags.vmkf_nested_pmap;
3469 #ifndef NO_NESTED_PMAP
3470 if (use_pmap && submap->pmap == NULL) {
3471 ledger_t ledger = map->pmap->ledger;
3472 /* we need a sub pmap to nest... */
3473 submap->pmap = pmap_create_options(ledger, 0,
3474 submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3475 if (submap->pmap == NULL) {
3476 /* let's proceed without nesting... */
3477 }
3478 #if defined(__arm64__)
3479 else {
3480 pmap_set_nested(submap->pmap);
3481 }
3482 #endif
3483 }
3484 if (use_pmap && submap->pmap != NULL) {
3485 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3486 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3487 kr = KERN_FAILURE;
3488 } else {
3489 kr = pmap_nest(map->pmap,
3490 submap->pmap,
3491 tmp_start,
3492 tmp_end - tmp_start);
3493 }
3494 if (kr != KERN_SUCCESS) {
3495 printf("vm_map_enter: "
3496 "pmap_nest(0x%llx,0x%llx) "
3497 "error 0x%x\n",
3498 (long long)tmp_start,
3499 (long long)tmp_end,
3500 kr);
3501 } else {
3502 /* we're now nested ! */
3503 new_entry->use_pmap = TRUE;
3504 pmap_empty = FALSE;
3505 }
3506 }
3507 #endif /* NO_NESTED_PMAP */
3508 }
3509 entry = new_entry;
3510
3511 if (superpage_size) {
3512 vm_page_t pages, m;
3513 vm_object_t sp_object;
3514 vm_object_offset_t sp_offset;
3515
3516 assert(object == VM_OBJECT_NULL);
3517 VME_OFFSET_SET(entry, 0);
3518
3519 /* allocate one superpage */
3520 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3521 if (kr != KERN_SUCCESS) {
3522 /* deallocate whole range... */
3523 new_mapping_established = TRUE;
3524 /* ... but only up to "tmp_end" */
3525 size -= end - tmp_end;
3526 RETURN(kr);
3527 }
3528
3529 /* create one vm_object per superpage */
3530 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3531 vm_object_lock(sp_object);
3532 sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3533 VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3534 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3535 VME_OBJECT_SET(entry, sp_object, false, 0);
3536 assert(entry->use_pmap);
3537
3538 /* enter the base pages into the object */
3539 for (sp_offset = 0;
3540 sp_offset < SUPERPAGE_SIZE;
3541 sp_offset += PAGE_SIZE) {
3542 m = pages;
3543 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3544 pages = NEXT_PAGE(m);
3545 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3546 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3547 }
3548 vm_object_unlock(sp_object);
3549 }
3550 } while (tmp_end != tmp2_end &&
3551 (tmp_start = tmp_end) &&
3552 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3553 tmp_end + chunk_size : tmp2_end));
3554 }
3555
3556 new_mapping_established = TRUE;
3557
3558 BailOut:
3559 assert(map_locked == TRUE);
3560
3561 /*
3562 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3563 * If we have identified and possibly established the new mapping(s),
3564 * make sure we did not go beyond the address space limit.
3565 */
3566 if (result == KERN_SUCCESS) {
3567 if (map->size_limit != RLIM_INFINITY &&
3568 map->size > map->size_limit) {
3569 /*
3570 * Establishing the requested mappings would exceed
3571 * the process's RLIMIT_AS limit: fail with
3572 * KERN_NO_SPACE.
3573 */
3574 result = KERN_NO_SPACE;
3575 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3576 proc_selfpid(),
3577 (get_bsdtask_info(current_task())
3578 ? proc_name_address(get_bsdtask_info(current_task()))
3579 : "?"),
3580 __FUNCTION__,
3581 (uint64_t) map->size,
3582 (uint64_t) map->size_limit);
3583 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3584 vm_map_size_t, map->size,
3585 uint64_t, map->size_limit);
3586 vm_map_enter_RLIMIT_AS_count++;
3587 } else if (map->data_limit != RLIM_INFINITY &&
3588 map->size > map->data_limit) {
3589 /*
3590 * Establishing the requested mappings would exceed
3591 * the process's RLIMIT_DATA limit: fail with
3592 * KERN_NO_SPACE.
3593 */
3594 result = KERN_NO_SPACE;
3595 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3596 proc_selfpid(),
3597 (get_bsdtask_info(current_task())
3598 ? proc_name_address(get_bsdtask_info(current_task()))
3599 : "?"),
3600 __FUNCTION__,
3601 (uint64_t) map->size,
3602 (uint64_t) map->data_limit);
3603 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3604 vm_map_size_t, map->size,
3605 uint64_t, map->data_limit);
3606 vm_map_enter_RLIMIT_DATA_count++;
3607 }
3608 }
3609
3610 if (result == KERN_SUCCESS) {
3611 vm_prot_t pager_prot;
3612 memory_object_t pager;
3613
3614 #if DEBUG
3615 if (pmap_empty &&
3616 !(vmk_flags.vmkf_no_pmap_check)) {
3617 assert(pmap_is_empty(map->pmap,
3618 *address,
3619 *address + size));
3620 }
3621 #endif /* DEBUG */
3622
3623 /*
3624 * For "named" VM objects, let the pager know that the
3625 * memory object is being mapped. Some pagers need to keep
3626 * track of this, to know when they can reclaim the memory
3627 * object, for example.
3628 * VM calls memory_object_map() for each mapping (specifying
3629 * the protection of each mapping) and calls
3630 * memory_object_last_unmap() when all the mappings are gone.
3631 */
3632 pager_prot = max_protection;
3633 if (needs_copy) {
3634 /*
3635 * Copy-On-Write mapping: won't modify
3636 * the memory object.
3637 */
3638 pager_prot &= ~VM_PROT_WRITE;
3639 }
3640 if (!is_submap &&
3641 object != VM_OBJECT_NULL &&
3642 object->named &&
3643 object->pager != MEMORY_OBJECT_NULL) {
3644 vm_object_lock(object);
3645 pager = object->pager;
3646 if (object->named &&
3647 pager != MEMORY_OBJECT_NULL) {
3648 assert(object->pager_ready);
3649 vm_object_mapping_wait(object, THREAD_UNINT);
3650 /* object might have lost its pager while waiting */
3651 pager = object->pager;
3652 if (object->named && pager != MEMORY_OBJECT_NULL) {
3653 vm_object_mapping_begin(object);
3654 vm_object_unlock(object);
3655
3656 kr = memory_object_map(pager, pager_prot);
3657 assert(kr == KERN_SUCCESS);
3658
3659 vm_object_lock(object);
3660 vm_object_mapping_end(object);
3661 }
3662 }
3663 vm_object_unlock(object);
3664 }
3665 }
3666
3667 assert(map_locked == TRUE);
3668
3669 if (new_mapping_established) {
3670 /*
3671 * If we release the map lock for any reason below,
3672 * another thread could deallocate our new mapping,
3673 * releasing the caller's reference on "caller_object",
3674 * which was transferred to the mapping.
3675 * If this was the only reference, the object could be
3676 * destroyed.
3677 *
3678 * We need to take an extra reference on "caller_object"
3679 * to keep it alive if we need to return the caller's
3680 * reference to the caller in case of failure.
3681 */
3682 if (is_submap) {
3683 vm_map_reference((vm_map_t)caller_object);
3684 } else {
3685 vm_object_reference(caller_object);
3686 }
3687 }
3688
3689 if (!keep_map_locked) {
3690 vm_map_unlock(map);
3691 map_locked = FALSE;
3692 entry = VM_MAP_ENTRY_NULL;
3693 new_entry = VM_MAP_ENTRY_NULL;
3694 }
3695
3696 /*
3697 * We can't hold the map lock if we enter this block.
3698 */
3699
3700 if (result == KERN_SUCCESS) {
3701 /* Wire down the new entry if the user
3702 * requested all new map entries be wired.
3703 */
3704 if ((map->wiring_required) || (superpage_size)) {
3705 assert(!keep_map_locked);
3706 pmap_empty = FALSE; /* pmap won't be empty */
3707 kr = vm_map_wire_nested(map, start, end,
3708 cur_protection, VM_KERN_MEMORY_MLOCK,
3709 TRUE, PMAP_NULL, 0, NULL);
3710 result = kr;
3711 }
3712
3713 }
3714
3715 if (result != KERN_SUCCESS) {
3716 if (new_mapping_established) {
3717 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3718
3719 /*
3720 * We have to get rid of the new mappings since we
3721 * won't make them available to the user.
3722 * Try to do that atomically, to minimize the risk
3723 * that someone else creates new mappings in that range.
3724 */
3725 if (!map_locked) {
3726 vm_map_lock(map);
3727 map_locked = TRUE;
3728 }
3729 remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3730 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3731 if (permanent) {
3732 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3733 }
3734 (void) vm_map_delete(map,
3735 *address, *address + size,
3736 remove_flags,
3737 KMEM_GUARD_NONE, &zap_new_list);
3738 }
3739
3740 if (vm_map_zap_first_entry(&zap_old_list)) {
3741 vm_map_entry_t entry1, entry2;
3742
3743 /*
3744 * The new mapping failed. Attempt to restore
3745 * the old mappings, saved in the "zap_old_map".
3746 */
3747 if (!map_locked) {
3748 vm_map_lock(map);
3749 map_locked = TRUE;
3750 }
3751
3752 /* first check if the coast is still clear */
3753 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3754 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3755
3756 if (vm_map_lookup_entry(map, start, &entry1) ||
3757 vm_map_lookup_entry(map, end, &entry2) ||
3758 entry1 != entry2) {
3759 /*
3760 * Part of that range has already been
3761 * re-mapped: we can't restore the old
3762 * mappings...
3763 */
3764 vm_map_enter_restore_failures++;
3765 } else {
3766 /*
3767 * Transfer the saved map entries from
3768 * "zap_old_map" to the original "map",
3769 * inserting them all after "entry1".
3770 */
3771 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3772 vm_map_size_t entry_size;
3773
3774 entry_size = (entry2->vme_end -
3775 entry2->vme_start);
3776 vm_map_store_entry_link(map, entry1, entry2,
3777 VM_MAP_KERNEL_FLAGS_NONE);
3778 map->size += entry_size;
3779 entry1 = entry2;
3780 }
3781 if (map->wiring_required) {
3782 /*
3783 * XXX TODO: we should rewire the
3784 * old pages here...
3785 */
3786 }
3787 vm_map_enter_restore_successes++;
3788 }
3789 }
3790 }
3791
3792 /*
3793 * The caller is responsible for releasing the lock if it requested to
3794 * keep the map locked.
3795 */
3796 if (map_locked && !keep_map_locked) {
3797 vm_map_unlock(map);
3798 }
3799
3800 vm_map_zap_dispose(&zap_old_list);
3801 vm_map_zap_dispose(&zap_new_list);
3802
3803 if (new_mapping_established) {
3804 /*
3805 * The caller had a reference on "caller_object" and we
3806 * transferred that reference to the mapping.
3807 * We also took an extra reference on "caller_object" to keep
3808 * it alive while the map was unlocked.
3809 */
3810 if (result == KERN_SUCCESS) {
3811 /*
3812 * On success, the caller's reference on the object gets
3813 * transferred to the mapping.
3814 * Release our extra reference.
3815 */
3816 if (is_submap) {
3817 vm_map_deallocate((vm_map_t)caller_object);
3818 } else {
3819 vm_object_deallocate(caller_object);
3820 }
3821 } else {
3822 /*
3823 * On error, the caller expects to still have a
3824 * reference on the object it gave us.
3825 * Let's use our extra reference for that.
3826 */
3827 }
3828 }
3829
3830 return result;
3831
3832 #undef RETURN
3833 }
3834
3835 /*
3836 * Counters for the prefault optimization.
3837 */
3838 int64_t vm_prefault_nb_pages = 0;
3839 int64_t vm_prefault_nb_bailout = 0;
3840
3841 static kern_return_t
3842 vm_map_enter_adjust_offset(
3843 vm_object_offset_t *obj_offs,
3844 vm_object_offset_t *obj_end,
3845 vm_object_offset_t quantity)
3846 {
3847 if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
3848 os_add_overflow(*obj_end, quantity, obj_end) ||
3849 vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
3850 return KERN_INVALID_ARGUMENT;
3851 }
3852
3853 return KERN_SUCCESS;
3854 }
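
/*
 * Illustrative use of vm_map_enter_adjust_offset() (a sketch only, mirroring
 * the named-entry handling further below): both ends of the object range are
 * shifted by the same delta and the caller bails out on overflow:
 *
 *	kr = vm_map_enter_adjust_offset(&obj_offs, &obj_end,
 *	    named_entry->data_offset);
 *	if (kr != KERN_SUCCESS) {
 *		return kr;
 *	}
 */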
3855
3856 static __attribute__((always_inline, warn_unused_result))
3857 kern_return_t
3858 vm_map_enter_mem_object_sanitize(
3859 vm_map_t target_map,
3860 vm_map_offset_ut address_u,
3861 vm_map_size_ut initial_size_u,
3862 vm_map_offset_ut mask_u,
3863 vm_object_offset_ut offset_u,
3864 vm_prot_ut cur_protection_u,
3865 vm_prot_ut max_protection_u,
3866 vm_inherit_ut inheritance_u,
3867 vm_map_kernel_flags_t vmk_flags,
3868 ipc_port_t port,
3869 vm_map_address_t *map_addr,
3870 vm_map_size_t *map_size,
3871 vm_map_offset_t *mask,
3872 vm_object_offset_t *obj_offs,
3873 vm_object_offset_t *obj_end,
3874 vm_object_size_t *obj_size,
3875 vm_prot_t *cur_protection,
3876 vm_prot_t *max_protection,
3877 vm_inherit_t *inheritance)
3878 {
3879 kern_return_t result;
3880
3881 result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
3882 VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3883 VM_PROT_IS_MASK, cur_protection,
3884 max_protection);
3885 if (__improbable(result != KERN_SUCCESS)) {
3886 return result;
3887 }
3888
3889 result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3890 inheritance);
3891 if (__improbable(result != KERN_SUCCESS)) {
3892 return result;
3893 }
3894
3895 result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, mask);
3896 if (__improbable(result != KERN_SUCCESS)) {
3897 return result;
3898 }
3899
3900 if (vmk_flags.vmf_fixed) {
3901 vm_map_address_t map_end;
3902
3903 result = vm_sanitize_addr_size(address_u, initial_size_u,
3904 VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3905 target_map,
3906 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_REALIGN_START,
3907 map_addr, &map_end, map_size);
3908 if (__improbable(result != KERN_SUCCESS)) {
3909 return result;
3910 }
3911 } else {
3912 *map_addr = vm_sanitize_addr(target_map, address_u);
3913 result = vm_sanitize_size(0, initial_size_u,
3914 VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3915 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
3916 if (__improbable(result != KERN_SUCCESS)) {
3917 return result;
3918 }
3919 }
3920
3921 *obj_size = vm_object_round_page(*map_size);
3922 if (__improbable(*obj_size == 0)) {
3923 return KERN_INVALID_ARGUMENT;
3924 }
3925
3926 if (IP_VALID(port)) {
3927 result = vm_sanitize_addr_size(offset_u, *obj_size,
3928 VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3929 PAGE_MASK,
3930 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
3931 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
3932 obj_offs, obj_end, obj_size);
3933 if (__improbable(result != KERN_SUCCESS)) {
3934 return result;
3935 }
3936 } else {
3937 *obj_offs = 0;
3938 *obj_end = *obj_size;
3939 }
3940
3941 return KERN_SUCCESS;
3942 }
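
/*
 * Note (informal summary): "obj_offs"/"obj_end" deliberately keep the
 * caller's possibly unaligned values (VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES
 * above); any remaining alignment work is left to vm_map_enter_mem_object()
 * and vm_map_enter() below.
 */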
3943
3944 kern_return_t
3945 vm_map_enter_mem_object(
3946 vm_map_t target_map,
3947 vm_map_offset_ut *address_u,
3948 vm_map_size_ut initial_size_u,
3949 vm_map_offset_ut mask_u,
3950 vm_map_kernel_flags_t vmk_flags,
3951 ipc_port_t port,
3952 vm_object_offset_ut offset_u,
3953 boolean_t copy,
3954 vm_prot_ut cur_protection_u,
3955 vm_prot_ut max_protection_u,
3956 vm_inherit_ut inheritance_u,
3957 upl_page_list_ptr_t page_list,
3958 unsigned int page_list_count)
3959 {
3960 vm_map_offset_t mask;
3961 vm_prot_t cur_protection;
3962 vm_prot_t max_protection;
3963 vm_inherit_t inheritance;
3964 vm_map_address_t map_addr, map_mask;
3965 vm_map_size_t map_size;
3966 vm_object_t object = VM_OBJECT_NULL;
3967 vm_object_offset_t obj_offs, obj_end;
3968 vm_object_size_t obj_size;
3969 kern_return_t result;
3970 boolean_t mask_cur_protection, mask_max_protection;
3971 boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
3972 vm_map_offset_t offset_in_mapping = 0;
3973
3974 if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
3975 /* XXX TODO4K prefaulting depends on page size... */
3976 try_prefault = FALSE;
3977 }
3978
3979 /*
3980 * Check arguments for validity
3981 */
3982 if ((target_map == VM_MAP_NULL) ||
3983 (try_prefault && (copy || !page_list))) {
3984 return KERN_INVALID_ARGUMENT;
3985 }
3986
3987 map_mask = vm_map_page_mask(target_map);
3988
3989 /*
3990 * Sanitize any input parameters that are addr/size/prot/inherit
3991 */
3992 result = vm_map_enter_mem_object_sanitize(
3993 target_map,
3994 *address_u,
3995 initial_size_u,
3996 mask_u,
3997 offset_u,
3998 cur_protection_u,
3999 max_protection_u,
4000 inheritance_u,
4001 vmk_flags,
4002 port,
4003 &map_addr,
4004 &map_size,
4005 &mask,
4006 &obj_offs,
4007 &obj_end,
4008 &obj_size,
4009 &cur_protection,
4010 &max_protection,
4011 &inheritance);
4012 if (__improbable(result != KERN_SUCCESS)) {
4013 return vm_sanitize_get_kr(result);
4014 }
4015
4016 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4017 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size);
4018
4019 mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4020 mask_max_protection = max_protection & VM_PROT_IS_MASK;
4021 cur_protection &= ~VM_PROT_IS_MASK;
4022 max_protection &= ~VM_PROT_IS_MASK;
4023
4024 #if __arm64__
4025 if (cur_protection & VM_PROT_EXECUTE) {
4026 cur_protection |= VM_PROT_READ;
4027 }
4028 #endif /* __arm64__ */
4029
4030 /*
4031 * Find the vm object (if any) corresponding to this port.
4032 */
4033 if (!IP_VALID(port)) {
4034 object = VM_OBJECT_NULL;
4035 copy = FALSE;
4036 } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4037 vm_named_entry_t named_entry;
4038 vm_object_size_t initial_size;
4039
4040 named_entry = mach_memory_entry_from_port(port);
4041
4042 if (vmk_flags.vmf_return_data_addr ||
4043 vmk_flags.vmf_return_4k_data_addr) {
4044 result = vm_map_enter_adjust_offset(&obj_offs,
4045 &obj_end, named_entry->data_offset);
4046 if (__improbable(result)) {
4047 return result;
4048 }
4049 }
4050
4051 /* a few checks to make sure user is obeying rules */
4052 if (mask_max_protection) {
4053 max_protection &= named_entry->protection;
4054 }
4055 if (mask_cur_protection) {
4056 cur_protection &= named_entry->protection;
4057 }
4058 if ((named_entry->protection & max_protection) !=
4059 max_protection) {
4060 return KERN_INVALID_RIGHT;
4061 }
4062 if ((named_entry->protection & cur_protection) !=
4063 cur_protection) {
4064 return KERN_INVALID_RIGHT;
4065 }
4066
4067 /*
4068 * unwrap is safe because we know obj_size is larger and doesn't
4069 * overflow
4070 */
4071 initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u);
4072 if (named_entry->size < obj_offs + initial_size) {
4073 return KERN_INVALID_ARGUMENT;
4074 }
4075
4076 /* for a vm_map_copy, we can only map it whole */
4077 if (named_entry->is_copy &&
4078 (obj_size != named_entry->size) &&
4079 (vm_map_round_page(obj_size, map_mask) == named_entry->size)) {
4080 /* XXX FBDP use the rounded size... */
4081 obj_end += named_entry->size - obj_size;
4082 obj_size = named_entry->size;
4083 }
4084
4085 if (named_entry->offset) {
4086 /*
4087 * the caller's parameter "offset" is relative to the start of the
4088 * named entry, which itself begins at named_entry->offset in the object
4089 *
4090 * Because we checked above that
4091 * obj_offs + obj_size < named_entry_size
4092 * these overflow checks should be redundant...
4093 */
4094 result = vm_map_enter_adjust_offset(&obj_offs,
4095 &obj_end, named_entry->offset);
4096 if (__improbable(result)) {
4097 return result;
4098 }
4099 }
4100
4101 if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) {
4102 /*
4103 * Let's not map more than requested;
4104 * vm_map_enter() will handle this "not map-aligned"
4105 * case.
4106 */
4107 map_size = obj_size;
4108 }
4109
4110 named_entry_lock(named_entry);
4111
4112 // rdar://130307561 (Combine copy, object, and submap fields of vm_named_entry into an enum)
4113 assert(named_entry->is_copy || named_entry->is_object || named_entry->is_sub_map);
4114
4115 if (named_entry->is_sub_map) {
4116 vm_map_t submap;
4117
4118 assert(!named_entry->is_copy);
4119 assert(!named_entry->is_object);
4120
4121 if (vmk_flags.vmf_return_data_addr ||
4122 vmk_flags.vmf_return_4k_data_addr) {
4123 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4124 }
4125
4126 submap = named_entry->backing.map;
4127 vm_map_reference(submap);
4128 named_entry_unlock(named_entry);
4129
4130 vmk_flags.vmkf_submap = TRUE;
4131 result = vm_map_enter(target_map,
4132 &map_addr,
4133 map_size,
4134 mask,
4135 vmk_flags,
4136 (vm_object_t)(uintptr_t) submap,
4137 obj_offs,
4138 copy,
4139 cur_protection,
4140 max_protection,
4141 inheritance);
4142 if (result != KERN_SUCCESS) {
4143 vm_map_deallocate(submap);
4144 return result;
4145 }
4146 /*
4147 * No need to lock "submap" just to check its
4148 * "mapped" flag: that flag is never reset
4149 * once it's been set and if we race, we'll
4150 * just end up setting it twice, which is OK.
4151 */
4152 if (submap->mapped_in_other_pmaps == FALSE &&
4153 vm_map_pmap(submap) != PMAP_NULL &&
4154 vm_map_pmap(submap) !=
4155 vm_map_pmap(target_map)) {
4156 /*
4157 * This submap is being mapped in a map
4158 * that uses a different pmap.
4159 * Set its "mapped_in_other_pmaps" flag
4160 * to indicate that we now need to
4161 * remove mappings from all pmaps rather
4162 * than just the submap's pmap.
4163 */
4164 vm_map_lock(submap);
4165 submap->mapped_in_other_pmaps = TRUE;
4166 vm_map_unlock(submap);
4167 }
4168 goto out;
4169 }
4170
4171 if (named_entry->is_copy) {
4172 kern_return_t kr;
4173 vm_map_copy_t copy_map;
4174 vm_map_entry_t copy_entry;
4175 vm_map_offset_t copy_addr;
4176 vm_map_copy_t target_copy_map;
4177 vm_map_offset_t overmap_start, overmap_end;
4178 vm_map_offset_t trimmed_start;
4179 vm_map_size_t target_size;
4180
4181 assert(!named_entry->is_object);
4182 assert(!named_entry->is_sub_map);
4183
4184 if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4185 (VM_FLAGS_FIXED |
4186 VM_FLAGS_ANYWHERE |
4187 VM_FLAGS_OVERWRITE |
4188 VM_FLAGS_RETURN_4K_DATA_ADDR |
4189 VM_FLAGS_RETURN_DATA_ADDR))) {
4190 named_entry_unlock(named_entry);
4191 return KERN_INVALID_ARGUMENT;
4192 }
4193
4194 copy_map = named_entry->backing.copy;
4195 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4196 if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4197 /* unsupported type; should not happen */
4198 printf("vm_map_enter_mem_object: "
4199 "memory_entry->backing.copy "
4200 "unsupported type 0x%x\n",
4201 copy_map->type);
4202 named_entry_unlock(named_entry);
4203 return KERN_INVALID_ARGUMENT;
4204 }
4205
4206 if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4207 DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4208 }
4209
4210 if (vmk_flags.vmf_return_data_addr ||
4211 vmk_flags.vmf_return_4k_data_addr) {
4212 offset_in_mapping = obj_offs & map_mask;
4213 if (vmk_flags.vmf_return_4k_data_addr) {
4214 offset_in_mapping &= ~((signed)(0xFFF));
4215 }
4216 }
4217
4218 target_copy_map = VM_MAP_COPY_NULL;
4219 target_size = copy_map->size;
4220 overmap_start = 0;
4221 overmap_end = 0;
4222 trimmed_start = 0;
4223 if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4224 DEBUG4K_ADJUST("adjusting...\n");
4225 kr = vm_map_copy_adjust_to_target(
4226 copy_map,
4227 obj_offs,
4228 initial_size,
4229 target_map,
4230 copy,
4231 &target_copy_map,
4232 &overmap_start,
4233 &overmap_end,
4234 &trimmed_start);
4235 if (kr != KERN_SUCCESS) {
4236 named_entry_unlock(named_entry);
4237 return kr;
4238 }
4239 target_size = target_copy_map->size;
4240 } else {
4241 /*
4242 * Assert that the vm_map_copy is coming from the right
4243 * zone and hasn't been forged
4244 */
4245 vm_map_copy_require(copy_map);
4246 target_copy_map = copy_map;
4247 }
4248
4249 vm_map_kernel_flags_t rsv_flags = vmk_flags;
4250
4251 vm_map_kernel_flags_and_vmflags(&rsv_flags,
4252 (VM_FLAGS_FIXED |
4253 VM_FLAGS_ANYWHERE |
4254 VM_FLAGS_OVERWRITE |
4255 VM_FLAGS_RETURN_4K_DATA_ADDR |
4256 VM_FLAGS_RETURN_DATA_ADDR));
4257
4258 /* reserve a contiguous range */
4259 kr = vm_map_enter(target_map,
4260 &map_addr,
4261 vm_map_round_page(target_size, map_mask),
4262 mask,
4263 rsv_flags,
4264 VM_OBJECT_NULL,
4265 0,
4266 FALSE, /* copy */
4267 cur_protection,
4268 max_protection,
4269 inheritance);
4270 if (kr != KERN_SUCCESS) {
4271 DEBUG4K_ERROR("kr 0x%x\n", kr);
4272 if (target_copy_map != copy_map) {
4273 vm_map_copy_discard(target_copy_map);
4274 target_copy_map = VM_MAP_COPY_NULL;
4275 }
4276 named_entry_unlock(named_entry);
4277 return kr;
4278 }
4279
4280 copy_addr = map_addr;
4281
4282 for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4283 copy_entry != vm_map_copy_to_entry(target_copy_map);
4284 copy_entry = copy_entry->vme_next) {
4285 vm_map_t copy_submap = VM_MAP_NULL;
4286 vm_object_t copy_object = VM_OBJECT_NULL;
4287 vm_map_size_t copy_size;
4288 vm_object_offset_t copy_offset;
4289 boolean_t do_copy = false;
4290
4291 if (copy_entry->is_sub_map) {
4292 copy_submap = VME_SUBMAP(copy_entry);
4293 copy_object = (vm_object_t)copy_submap;
4294 } else {
4295 copy_object = VME_OBJECT(copy_entry);
4296 }
4297 copy_offset = VME_OFFSET(copy_entry);
4298 copy_size = (copy_entry->vme_end -
4299 copy_entry->vme_start);
4300
4301 /* sanity check */
4302 if ((copy_addr + copy_size) >
4303 (map_addr +
4304 overmap_start + overmap_end +
4305 named_entry->size /* XXX full size */)) {
4306 /* over-mapping too much !? */
4307 kr = KERN_INVALID_ARGUMENT;
4308 DEBUG4K_ERROR("kr 0x%x\n", kr);
4309 /* abort */
4310 break;
4311 }
4312
4313 /* take a reference on the object */
4314 if (copy_entry->is_sub_map) {
4315 vm_map_reference(copy_submap);
4316 } else {
4317 if (!copy &&
4318 copy_object != VM_OBJECT_NULL &&
4319 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4320 bool is_writable;
4321
4322 /*
4323 * We need to resolve our side of this
4324 * "symmetric" copy-on-write now; we
4325 * need a new object to map and share,
4326 * instead of the current one which
4327 * might still be shared with the
4328 * original mapping.
4329 *
4330 * Note: A "vm_map_copy_t" does not
4331 * have a lock but we're protected by
4332 * the named entry's lock here.
4333 */
4334 // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4335 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4336 assert(copy_object != VME_OBJECT(copy_entry));
4337 is_writable = false;
4338 if (copy_entry->protection & VM_PROT_WRITE) {
4339 is_writable = true;
4340 #if __arm64e__
4341 } else if (copy_entry->used_for_tpro) {
4342 is_writable = true;
4343 #endif /* __arm64e__ */
4344 }
4345 if (!copy_entry->needs_copy && is_writable) {
4346 vm_prot_t prot;
4347
4348 prot = copy_entry->protection & ~VM_PROT_WRITE;
4349 vm_object_pmap_protect(copy_object,
4350 copy_offset,
4351 copy_size,
4352 PMAP_NULL,
4353 PAGE_SIZE,
4354 0,
4355 prot);
4356 }
4357 copy_entry->needs_copy = FALSE;
4358 copy_entry->is_shared = TRUE;
4359 copy_object = VME_OBJECT(copy_entry);
4360 copy_offset = VME_OFFSET(copy_entry);
4361 vm_object_lock(copy_object);
4362 /* we're about to make a shared mapping of this object */
4363 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4364 VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4365 vm_object_unlock(copy_object);
4366 }
4367
4368 if (copy_object != VM_OBJECT_NULL &&
4369 copy_object->named &&
4370 copy_object->pager != MEMORY_OBJECT_NULL &&
4371 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4372 memory_object_t pager;
4373 vm_prot_t pager_prot;
4374
4375 /*
4376 * For "named" VM objects, let the pager know that the
4377 * memory object is being mapped. Some pagers need to keep
4378 * track of this, to know when they can reclaim the memory
4379 * object, for example.
4380 * VM calls memory_object_map() for each mapping (specifying
4381 * the protection of each mapping) and calls
4382 * memory_object_last_unmap() when all the mappings are gone.
4383 */
4384 pager_prot = max_protection;
4385 if (copy) {
4386 /*
4387 * Copy-On-Write mapping: won't modify the
4388 * memory object.
4389 */
4390 pager_prot &= ~VM_PROT_WRITE;
4391 }
4392 vm_object_lock(copy_object);
4393 pager = copy_object->pager;
4394 if (copy_object->named &&
4395 pager != MEMORY_OBJECT_NULL &&
4396 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4397 assert(copy_object->pager_ready);
4398 vm_object_mapping_wait(copy_object, THREAD_UNINT);
4399 /*
4400 * Object might have lost its pager
4401 * while waiting.
4402 */
4403 pager = copy_object->pager;
4404 if (copy_object->named &&
4405 pager != MEMORY_OBJECT_NULL) {
4406 vm_object_mapping_begin(copy_object);
4407 vm_object_unlock(copy_object);
4408
4409 kr = memory_object_map(pager, pager_prot);
4410 assert(kr == KERN_SUCCESS);
4411
4412 vm_object_lock(copy_object);
4413 vm_object_mapping_end(copy_object);
4414 }
4415 }
4416 vm_object_unlock(copy_object);
4417 }
4418
4419 /*
4420 * Perform the copy if requested
4421 */
4422
4423 if (copy && copy_object != VM_OBJECT_NULL) {
4424 vm_object_t new_object;
4425 vm_object_offset_t new_offset;
4426
4427 result = vm_object_copy_strategically(copy_object, copy_offset,
4428 copy_size,
4429 false, /* forking */
4430 &new_object, &new_offset,
4431 &do_copy);
4432
4433
4434 if (result == KERN_MEMORY_RESTART_COPY) {
4435 boolean_t success;
4436 boolean_t src_needs_copy;
4437
4438 /*
4439 * XXX
4440 * We currently ignore src_needs_copy.
4441 * This really is the issue of how to make
4442 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4443 * non-kernel users to use. Solution forthcoming.
4444 * In the meantime, since we don't allow non-kernel
4445 * memory managers to specify symmetric copy,
4446 * we won't run into problems here.
4447 */
4448 new_object = copy_object;
4449 new_offset = copy_offset;
4450 success = vm_object_copy_quickly(new_object,
4451 new_offset,
4452 copy_size,
4453 &src_needs_copy,
4454 &do_copy);
4455 assert(success);
4456 result = KERN_SUCCESS;
4457 }
4458 if (result != KERN_SUCCESS) {
4459 kr = result;
4460 break;
4461 }
4462
4463 copy_object = new_object;
4464 copy_offset = new_offset;
4465 /*
4466 * No extra object reference for the mapping:
4467 * the mapping should be the only thing keeping
4468 * this new object alive.
4469 */
4470 } else {
4471 /*
4472 * We already have the right object
4473 * to map.
4474 */
4475 copy_object = VME_OBJECT(copy_entry);
4476 /* take an extra ref for the mapping below */
4477 vm_object_reference(copy_object);
4478 }
4479 }
4480
4481 /*
4482 * If the caller does not want a specific
4483 * tag for this new mapping: use
4484 * the tag of the original mapping.
4485 */
4486 vm_map_kernel_flags_t vmk_remap_flags = {
4487 .vmkf_submap = copy_entry->is_sub_map,
4488 };
4489
4490 vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4491 vm_map_kernel_flags_vmflags(vmk_flags),
4492 vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4493
4494 /* over-map the object into destination */
4495 vmk_remap_flags.vmf_fixed = true;
4496 vmk_remap_flags.vmf_overwrite = true;
4497
4498 if (!copy && !copy_entry->is_sub_map) {
4499 /*
4500 * copy-on-write should have been
4501 * resolved at this point, or we would
4502 * end up sharing instead of copying.
4503 */
4504 assert(!copy_entry->needs_copy);
4505 }
4506 #if XNU_TARGET_OS_OSX
4507 if (copy_entry->used_for_jit) {
4508 vmk_remap_flags.vmkf_map_jit = TRUE;
4509 }
4510 #endif /* XNU_TARGET_OS_OSX */
4511
4512 kr = vm_map_enter(target_map,
4513 &copy_addr,
4514 copy_size,
4515 (vm_map_offset_t) 0,
4516 vmk_remap_flags,
4517 copy_object,
4518 copy_offset,
4519 ((copy_object == NULL)
4520 ? FALSE
4521 : (copy || copy_entry->needs_copy)),
4522 cur_protection,
4523 max_protection,
4524 inheritance);
4525 if (kr != KERN_SUCCESS) {
4526 DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4527 if (copy_entry->is_sub_map) {
4528 vm_map_deallocate(copy_submap);
4529 } else {
4530 vm_object_deallocate(copy_object);
4531 }
4532 /* abort */
4533 break;
4534 }
4535
4536 /* next mapping */
4537 copy_addr += copy_size;
4538 }
4539
4540 named_entry_unlock(named_entry);
4541 if (target_copy_map != copy_map) {
4542 vm_map_copy_discard(target_copy_map);
4543 target_copy_map = VM_MAP_COPY_NULL;
4544 }
4545
4546 if (kr == KERN_SUCCESS) {
4547 if (overmap_start) {
4548 DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t)offset_in_mapping, (uint64_t)overmap_start, (uint64_t)(map_addr + offset_in_mapping + overmap_start));
4549 }
4550 offset_in_mapping += overmap_start;
4551 } else if (!vmk_flags.vmf_overwrite) {
4552 /* deallocate the contiguous range */
4553 vm_map_remove(target_map, map_addr,
4554 map_addr + map_size);
4555 }
4556 result = kr;
4557 goto out;
4558 }
4559
4560 if (named_entry->is_object) {
4561 unsigned int access;
4562 unsigned int wimg_mode;
4563
4564 assert(!named_entry->is_copy);
4565 assert(!named_entry->is_sub_map);
4566
4567 /* we are mapping a VM object */
4568
4569 access = named_entry->access;
4570
4571 if (vmk_flags.vmf_return_data_addr ||
4572 vmk_flags.vmf_return_4k_data_addr) {
4573 offset_in_mapping = obj_offs & map_mask;
4574 if (vmk_flags.vmf_return_4k_data_addr) {
4575 offset_in_mapping &= ~((signed)(0xFFF));
4576 }
4577 obj_offs -= offset_in_mapping;
4578 map_size = vm_map_round_page(initial_size +
4579 offset_in_mapping, map_mask);
4580 }
4581
4582 object = vm_named_entry_to_vm_object(named_entry);
4583 assert(object != VM_OBJECT_NULL);
4584 vm_object_lock(object);
4585 named_entry_unlock(named_entry);
4586
4587 vm_object_reference_locked(object);
4588
4589 wimg_mode = object->wimg_bits;
4590 vm_prot_to_wimg(access, &wimg_mode);
4591 if (object->wimg_bits != wimg_mode) {
4592 vm_object_change_wimg_mode(object, wimg_mode);
4593 }
4594
4595 vm_object_unlock(object);
4596 } else {
4597 panic("invalid VM named entry %p", named_entry);
4598 }
4599 } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4600 /*
4601 * JMM - This is temporary until we unify named entries
4602 * and raw memory objects.
4603 *
4604 * Detected fake ip_kotype for a memory object. In
4605 * this case, the port isn't really a port at all, but
4606 * instead is just a raw memory object.
4607 */
4608 if (vmk_flags.vmf_return_data_addr ||
4609 vmk_flags.vmf_return_4k_data_addr) {
4610 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4611 }
4612
4613 object = memory_object_to_vm_object((memory_object_t)port);
4614 if (object == VM_OBJECT_NULL) {
4615 return KERN_INVALID_OBJECT;
4616 }
4617 vm_object_reference(object);
4618
4619 /* wait for object (if any) to be ready */
4620 if (object != VM_OBJECT_NULL) {
4621 if (is_kernel_object(object)) {
4622 printf("Warning: Attempt to map kernel object"
4623 " by a non-private kernel entity\n");
4624 return KERN_INVALID_OBJECT;
4625 }
4626 if (!object->pager_ready) {
4627 vm_object_lock(object);
4628
4629 while (!object->pager_ready) {
4630 vm_object_sleep(object,
4631 VM_OBJECT_EVENT_PAGER_READY,
4632 THREAD_UNINT,
4633 LCK_SLEEP_EXCLUSIVE);
4634 }
4635 vm_object_unlock(object);
4636 }
4637 }
4638 } else {
4639 return KERN_INVALID_OBJECT;
4640 }
4641
4642 if (object != VM_OBJECT_NULL &&
4643 object->named &&
4644 object->pager != MEMORY_OBJECT_NULL &&
4645 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4646 memory_object_t pager;
4647 vm_prot_t pager_prot;
4648 kern_return_t kr;
4649
4650 /*
4651 * For "named" VM objects, let the pager know that the
4652 * memory object is being mapped. Some pagers need to keep
4653 * track of this, to know when they can reclaim the memory
4654 * object, for example.
4655 * VM calls memory_object_map() for each mapping (specifying
4656 * the protection of each mapping) and calls
4657 * memory_object_last_unmap() when all the mappings are gone.
4658 */
4659 pager_prot = max_protection;
4660 if (copy) {
4661 /*
4662 * Copy-On-Write mapping: won't modify the
4663 * memory object.
4664 */
4665 pager_prot &= ~VM_PROT_WRITE;
4666 }
4667 vm_object_lock(object);
4668 pager = object->pager;
4669 if (object->named &&
4670 pager != MEMORY_OBJECT_NULL &&
4671 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4672 assert(object->pager_ready);
4673 vm_object_mapping_wait(object, THREAD_UNINT);
4674 /* object might have lost its pager while waiting */
4675 pager = object->pager;
4676 if (object->named && pager != MEMORY_OBJECT_NULL) {
4677 vm_object_mapping_begin(object);
4678 vm_object_unlock(object);
4679
4680 kr = memory_object_map(pager, pager_prot);
4681 assert(kr == KERN_SUCCESS);
4682
4683 vm_object_lock(object);
4684 vm_object_mapping_end(object);
4685 }
4686 }
4687 vm_object_unlock(object);
4688 }
4689
4690 /*
4691 * Perform the copy if requested
4692 */
4693
4694 if (copy) {
4695 vm_object_t new_object;
4696 vm_object_offset_t new_offset;
4697
4698 result = vm_object_copy_strategically(object,
4699 obj_offs,
4700 map_size,
4701 false, /* forking */
4702 &new_object, &new_offset,
4703 &copy);
4704
4705
4706 if (result == KERN_MEMORY_RESTART_COPY) {
4707 boolean_t success;
4708 boolean_t src_needs_copy;
4709
4710 /*
4711 * XXX
4712 * We currently ignore src_needs_copy.
4713 * This really is the issue of how to make
4714 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4715 * non-kernel users to use. Solution forthcoming.
4716 * In the meantime, since we don't allow non-kernel
4717 * memory managers to specify symmetric copy,
4718 * we won't run into problems here.
4719 */
4720 new_object = object;
4721 new_offset = obj_offs;
4722 success = vm_object_copy_quickly(new_object,
4723 new_offset,
4724 map_size,
4725 &src_needs_copy,
4726 &copy);
4727 assert(success);
4728 result = KERN_SUCCESS;
4729 }
4730 /*
4731 * Throw away the reference to the
4732 * original object, as it won't be mapped.
4733 */
4734
4735 vm_object_deallocate(object);
4736
4737 if (result != KERN_SUCCESS) {
4738 return result;
4739 }
4740
4741 object = new_object;
4742 obj_offs = new_offset;
4743 }
4744
4745 /*
4746 * If non-kernel users want to try to prefault pages, the mapping and prefault
4747 * needs to be atomic.
4748 */
4749 kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4750 vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4751
4752 result = vm_map_enter(target_map,
4753 &map_addr, map_size,
4754 (vm_map_offset_t)mask,
4755 vmk_flags,
4756 object, obj_offs,
4757 copy,
4758 cur_protection, max_protection,
4759 inheritance);
4760 if (result != KERN_SUCCESS) {
4761 vm_object_deallocate(object);
4762 }
4763
4764 /*
4765 * Try to prefault, and do not forget to release the vm map lock.
4766 */
4767 if (result == KERN_SUCCESS && try_prefault) {
4768 mach_vm_address_t va = map_addr;
4769 kern_return_t kr = KERN_SUCCESS;
4770 unsigned int i = 0;
4771 int pmap_options;
4772
4773 pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4774 if (object->internal) {
4775 pmap_options |= PMAP_OPTIONS_INTERNAL;
4776 }
4777
4778 for (i = 0; i < page_list_count; ++i) {
4779 if (!UPL_VALID_PAGE(page_list, i)) {
4780 if (kernel_prefault) {
4781 assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4782 result = KERN_MEMORY_ERROR;
4783 break;
4784 }
4785 } else {
4786 /*
4787 * If this function call fails, we should stop
4788 * trying to optimize: other calls are likely
4789 * going to fail too.
4790 *
4791 * We are not going to report an error for such a
4792 * failure, though. That's an optimization, not
4793 * something critical.
4794 */
4795 kr = pmap_enter_options(target_map->pmap,
4796 va, UPL_PHYS_PAGE(page_list, i),
4797 cur_protection, VM_PROT_NONE,
4798 0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER);
4799 if (kr != KERN_SUCCESS) {
4800 OSIncrementAtomic64(&vm_prefault_nb_bailout);
4801 if (kernel_prefault) {
4802 result = kr;
4803 }
4804 break;
4805 }
4806 OSIncrementAtomic64(&vm_prefault_nb_pages);
4807 }
4808
4809 /* Next virtual address */
4810 va += PAGE_SIZE;
4811 }
4812 if (vmk_flags.vmkf_keep_map_locked) {
4813 vm_map_unlock(target_map);
4814 }
4815 }
4816
4817 out:
4818 if (result == KERN_SUCCESS) {
4819 #if KASAN
4820 if (target_map->pmap == kernel_pmap) {
4821 kasan_notify_address(map_addr, map_size);
4822 }
4823 #endif
4824 *address_u = vm_sanitize_wrap_addr(map_addr + offset_in_mapping);
4825 }
4826 return result;
4827 }
4828
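/*
 * vm_map_enter_mem_object_prefault:
 *
 * Thin wrapper around vm_map_enter_mem_object() that never requests a copy
 * (copy == FALSE), uses VM_INHERIT_DEFAULT, and passes the caller's UPL page
 * list through so the new mapping can be prefaulted (a non-zero
 * "page_list_count" is what enables the prefault path above).
 */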
4829 kern_return_t
4830 vm_map_enter_mem_object_prefault(
4831 vm_map_t target_map,
4832 vm_map_offset_ut *address,
4833 vm_map_size_ut initial_size,
4834 vm_map_offset_ut mask,
4835 vm_map_kernel_flags_t vmk_flags,
4836 ipc_port_t port,
4837 vm_object_offset_ut offset,
4838 vm_prot_ut cur_protection,
4839 vm_prot_ut max_protection,
4840 upl_page_list_ptr_t page_list,
4841 unsigned int page_list_count)
4842 {
4843 /* range_id is set by vm_map_enter_mem_object */
4844 return vm_map_enter_mem_object(target_map,
4845 address,
4846 initial_size,
4847 mask,
4848 vmk_flags,
4849 port,
4850 offset,
4851 FALSE,
4852 cur_protection,
4853 max_protection,
4854 VM_INHERIT_DEFAULT,
4855 page_list,
4856 page_list_count);
4857 }
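
/*
 * Hypothetical caller sketch (illustrative only, not an actual call site);
 * all "_u" values and the UPL names are placeholders:
 *
 *	kr = vm_map_enter_mem_object_prefault(map, &addr_u, size_u,
 *	    mask_u, vmk_flags, port, offset_u,
 *	    cur_prot_u, max_prot_u, upl_pl, upl_pl_count);
 *
 * where "upl_pl"/"upl_pl_count" describe pages already resident in a UPL.
 */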
4858
4859 static __attribute__((always_inline, warn_unused_result))
4860 kern_return_t
4861 vm_map_enter_mem_object_control_sanitize(
4862 vm_map_t target_map,
4863 vm_map_offset_ut address_u,
4864 vm_map_size_ut initial_size_u,
4865 vm_map_offset_ut mask_u,
4866 vm_object_offset_ut offset_u,
4867 vm_prot_ut cur_protection_u,
4868 vm_prot_ut max_protection_u,
4869 vm_inherit_ut inheritance_u,
4870 vm_map_kernel_flags_t vmk_flags,
4871 vm_map_address_t *map_addr,
4872 vm_map_size_t *map_size,
4873 vm_map_offset_t *mask,
4874 vm_object_offset_t *obj_offs,
4875 vm_object_offset_t *obj_end,
4876 vm_object_size_t *obj_size,
4877 vm_prot_t *cur_protection,
4878 vm_prot_t *max_protection,
4879 vm_inherit_t *inheritance)
4880 {
4881 kern_return_t kr;
4882
4883 kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
4884 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4885 cur_protection, max_protection);
4886 if (__improbable(kr != KERN_SUCCESS)) {
4887 return kr;
4888 }
4889
4890 kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL,
4891 inheritance);
4892 if (__improbable(kr != KERN_SUCCESS)) {
4893 return kr;
4894 }
4895
4896 kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask);
4897 if (__improbable(kr != KERN_SUCCESS)) {
4898 return kr;
4899 }
4900 /*
4901 * Ensure arithmetic doesn't overflow in vm_object space (kernel
4902 * pages).
4903 * We keep unaligned values for now. The call we eventually make to
4904 * vm_map_enter does guarantee that offset_u is page aligned for EITHER
4905 * target_map pages or kernel pages. But this isn't enough to guarantee
4906 * kernel space alignment.
4907 */
4908 kr = vm_sanitize_addr_size(offset_u, initial_size_u,
4909 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK,
4910 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS |
4911 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4912 obj_offs, obj_end, obj_size);
4913 if (__improbable(kr != KERN_SUCCESS)) {
4914 return kr;
4915 }
4916
4917 /*
4918 * There is no vm_sanitize_addr_size variant that also adjusts for
4919 * a separate offset. Rather than create one for this one-off issue,
4920 * we sanitize map_addr and map_size individually, relying on
4921 * vm_sanitize_size to incorporate the offset. Then, we perform the
4922 * overflow check manually below.
4923 */
4924 *map_addr = vm_sanitize_addr(target_map, address_u);
4925 kr = vm_sanitize_size(offset_u, initial_size_u,
4926 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4927 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
4928 if (__improbable(kr != KERN_SUCCESS)) {
4929 return kr;
4930 }
4931
4932 /*
4933 * Ensure arithmetic doesn't overflow in target_map space.
4934 * The computation of map_size above accounts for the possibility that
4935 * offset_u might be unaligned in target_map space.
4936 */
4937 if (vmk_flags.vmf_fixed) {
4938 vm_map_address_t map_end;
4939
4940 if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) {
4941 return KERN_INVALID_ARGUMENT;
4942 }
4943 }
4944
4945 return KERN_SUCCESS;
4946 }
4947
4948 kern_return_t
4949 vm_map_enter_mem_object_control(
4950 vm_map_t target_map,
4951 vm_map_offset_ut *address_u,
4952 vm_map_size_ut initial_size_u,
4953 vm_map_offset_ut mask_u,
4954 vm_map_kernel_flags_t vmk_flags,
4955 memory_object_control_t control,
4956 vm_object_offset_ut offset_u,
4957 boolean_t needs_copy,
4958 vm_prot_ut cur_protection_u,
4959 vm_prot_ut max_protection_u,
4960 vm_inherit_ut inheritance_u)
4961 {
4962 vm_map_offset_t mask;
4963 vm_prot_t cur_protection;
4964 vm_prot_t max_protection;
4965 vm_inherit_t inheritance;
4966 vm_map_address_t map_addr;
4967 vm_map_size_t map_size;
4968 vm_object_t object;
4969 vm_object_offset_t obj_offs, obj_end;
4970 vm_object_size_t obj_size;
4971 kern_return_t result;
4972 memory_object_t pager;
4973 vm_prot_t pager_prot;
4974 kern_return_t kr;
4975
4976 /*
4977 * Check arguments for validity
4978 */
4979 if (target_map == VM_MAP_NULL) {
4980 return KERN_INVALID_ARGUMENT;
4981 }
4982
4983 /*
4984 * We only support vmf_return_data_addr-like behavior.
4985 */
4986 vmk_flags.vmf_return_data_addr = true;
4987
4988 /*
4989 * Sanitize any input parameters that are addr/size/prot/inherit
4990 */
4991 kr = vm_map_enter_mem_object_control_sanitize(target_map,
4992 *address_u,
4993 initial_size_u,
4994 mask_u,
4995 offset_u,
4996 cur_protection_u,
4997 max_protection_u,
4998 inheritance_u,
4999 vmk_flags,
5000 &map_addr,
5001 &map_size,
5002 &mask,
5003 &obj_offs,
5004 &obj_end,
5005 &obj_size,
5006 &cur_protection,
5007 &max_protection,
5008 &inheritance);
5009 if (__improbable(kr != KERN_SUCCESS)) {
5010 return vm_sanitize_get_kr(kr);
5011 }
5012
5013 object = memory_object_control_to_vm_object(control);
5014
5015 if (object == VM_OBJECT_NULL) {
5016 return KERN_INVALID_OBJECT;
5017 }
5018
5019 if (is_kernel_object(object)) {
5020 printf("Warning: Attempt to map kernel object"
5021 " by a non-private kernel entity\n");
5022 return KERN_INVALID_OBJECT;
5023 }
5024
5025 vm_object_lock(object);
5026 os_ref_retain_locked_raw(&object->ref_count, &vm_object_refgrp);
5027
5028
5029 /*
5030 * For "named" VM objects, let the pager know that the
5031 * memory object is being mapped. Some pagers need to keep
5032 * track of this, to know when they can reclaim the memory
5033 * object, for example.
5034 * VM calls memory_object_map() for each mapping (specifying
5035 * the protection of each mapping) and calls
5036 * memory_object_last_unmap() when all the mappings are gone.
5037 */
5038 pager_prot = max_protection;
5039 if (needs_copy) {
5040 pager_prot &= ~VM_PROT_WRITE;
5041 }
5042 pager = object->pager;
5043 if (object->named &&
5044 pager != MEMORY_OBJECT_NULL &&
5045 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5046 assert(object->pager_ready);
5047 vm_object_mapping_wait(object, THREAD_UNINT);
5048 /* object might have lost its pager while waiting */
5049 pager = object->pager;
5050 if (object->named && pager != MEMORY_OBJECT_NULL) {
5051 vm_object_mapping_begin(object);
5052 vm_object_unlock(object);
5053
5054 kr = memory_object_map(pager, pager_prot);
5055 assert(kr == KERN_SUCCESS);
5056
5057 vm_object_lock(object);
5058 vm_object_mapping_end(object);
5059 }
5060 }
5061 vm_object_unlock(object);
5062
5063 /*
5064 * Perform the copy if requested
5065 */
5066
5067 if (needs_copy) {
5068 vm_object_t new_object;
5069 vm_object_offset_t new_offset;
5070
5071 result = vm_object_copy_strategically(object, obj_offs, obj_size,
5072 false, /* forking */
5073 &new_object, &new_offset,
5074 &needs_copy);
5075
5076
5077 if (result == KERN_MEMORY_RESTART_COPY) {
5078 boolean_t success;
5079 boolean_t src_needs_copy;
5080
5081 /*
5082 * XXX
5083 * We currently ignore src_needs_copy.
5084 * This really is the issue of how to make
5085 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5086 * non-kernel users to use. Solution forthcoming.
5087 * In the meantime, since we don't allow non-kernel
5088 * memory managers to specify symmetric copy,
5089 * we won't run into problems here.
5090 */
5091 new_object = object;
5092 new_offset = obj_offs;
5093 success = vm_object_copy_quickly(new_object,
5094 new_offset, obj_size,
5095 &src_needs_copy,
5096 &needs_copy);
5097 assert(success);
5098 result = KERN_SUCCESS;
5099 }
5100 /*
5101 * Throw away the reference to the
5102 * original object, as it won't be mapped.
5103 */
5104
5105 vm_object_deallocate(object);
5106
5107 if (result != KERN_SUCCESS) {
5108 return result;
5109 }
5110
5111 object = new_object;
5112 obj_offs = new_offset;
5113 }
5114
5115 result = vm_map_enter(target_map,
5116 &map_addr, map_size,
5117 (vm_map_offset_t)mask,
5118 vmk_flags,
5119 object,
5120 obj_offs,
5121 needs_copy,
5122 cur_protection, max_protection,
5123 inheritance);
5124
5125 if (result == KERN_SUCCESS) {
5126 *address_u = vm_sanitize_wrap_addr(
5127 map_addr + (obj_offs & vm_map_page_mask(target_map)));
5128 } else {
5129 vm_object_deallocate(object);
5130 }
5131
5132 return result;
5133 }
5134
5135
5136 /* Not used without nested pmaps */
5137 #ifndef NO_NESTED_PMAP
5138 /*
5139 * Clip and unnest a portion of a nested submap mapping.
5140 */
5141
5142
5143 static void
5144 vm_map_clip_unnest(
5145 vm_map_t map,
5146 vm_map_entry_t entry,
5147 vm_map_offset_t start_unnest,
5148 vm_map_offset_t end_unnest)
5149 {
5150 vm_map_offset_t old_start_unnest = start_unnest;
5151 vm_map_offset_t old_end_unnest = end_unnest;
5152
5153 assert(entry->is_sub_map);
5154 assert(VME_SUBMAP(entry) != NULL);
5155 assert(entry->use_pmap);
5156
5157 /*
5158 * Query the platform for the optimal unnest range.
5159 * DRK: There's some duplication of effort here, since
5160 * callers may have adjusted the range to some extent. This
5161 * routine was introduced to support 1GiB subtree nesting
5162 * for x86 platforms, which can also nest on 2MiB boundaries
5163 * depending on size/alignment.
5164 */
5165 if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5166 assert(VME_SUBMAP(entry)->is_nested_map);
5167 assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5168 log_unnest_badness(map,
5169 old_start_unnest,
5170 old_end_unnest,
5171 VME_SUBMAP(entry)->is_nested_map,
5172 (entry->vme_start +
5173 VME_SUBMAP(entry)->lowest_unnestable_start -
5174 VME_OFFSET(entry)));
5175 }
5176
5177 if (entry->vme_start > start_unnest ||
5178 entry->vme_end < end_unnest) {
5179 panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5180 "bad nested entry: start=0x%llx end=0x%llx\n",
5181 (long long)start_unnest, (long long)end_unnest,
5182 (long long)entry->vme_start, (long long)entry->vme_end);
5183 }
5184
5185 if (start_unnest > entry->vme_start) {
5186 _vm_map_clip_start(&map->hdr,
5187 entry,
5188 start_unnest);
5189 if (map->holelistenabled) {
5190 vm_map_store_update_first_free(map, NULL, FALSE);
5191 } else {
5192 vm_map_store_update_first_free(map, map->first_free, FALSE);
5193 }
5194 }
5195 if (entry->vme_end > end_unnest) {
5196 _vm_map_clip_end(&map->hdr,
5197 entry,
5198 end_unnest);
5199 if (map->holelistenabled) {
5200 vm_map_store_update_first_free(map, NULL, FALSE);
5201 } else {
5202 vm_map_store_update_first_free(map, map->first_free, FALSE);
5203 }
5204 }
5205
5206 pmap_unnest(map->pmap,
5207 entry->vme_start,
5208 entry->vme_end - entry->vme_start);
5209 if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5210 /* clean up parent map/maps */
5211 vm_map_submap_pmap_clean(
5212 map, entry->vme_start,
5213 entry->vme_end,
5214 VME_SUBMAP(entry),
5215 VME_OFFSET(entry));
5216 }
5217 entry->use_pmap = FALSE;
5218 if ((map->pmap != kernel_pmap) &&
5219 (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5220 VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5221 }
5222 }
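
/*
 * Informal note (derived from the code above): after vm_map_clip_unnest(),
 * the clipped entry still maps the same submap range but no longer shares
 * the nested pmap: the nested translations are removed with pmap_unnest(),
 * parent pmaps are cleaned via vm_map_submap_pmap_clean() when the map is
 * mapped in other pmaps, and "use_pmap" is cleared on the entry.
 */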
5223 #endif /* NO_NESTED_PMAP */
5224
5225 __abortlike
5226 static void
5227 __vm_map_clip_atomic_entry_panic(
5228 vm_map_t map,
5229 vm_map_entry_t entry,
5230 vm_map_offset_t where)
5231 {
5232 panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5233 "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5234 (uint64_t)entry->vme_start,
5235 (uint64_t)entry->vme_end,
5236 (uint64_t)where);
5237 }
5238
5239 /*
5240 * vm_map_clip_start: [ internal use only ]
5241 *
5242 * Asserts that the given entry begins at or after
5243 * the specified address; if necessary,
5244 * it splits the entry into two.
5245 */
5246 void
5247 vm_map_clip_start(
5248 vm_map_t map,
5249 vm_map_entry_t entry,
5250 vm_map_offset_t startaddr)
5251 {
5252 #ifndef NO_NESTED_PMAP
5253 if (entry->is_sub_map &&
5254 entry->use_pmap &&
5255 startaddr >= entry->vme_start) {
5256 vm_map_offset_t start_unnest, end_unnest;
5257
5258 /*
5259 * Make sure "startaddr" is no longer in a nested range
5260 * before we clip. Unnest only the minimum range the platform
5261 * can handle.
5262 * vm_map_clip_unnest may perform additional adjustments to
5263 * the unnest range.
5264 */
5265 start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5266 end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5267 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5268 }
5269 #endif /* NO_NESTED_PMAP */
5270 if (startaddr > entry->vme_start) {
5271 if (!entry->is_sub_map &&
5272 VME_OBJECT(entry) &&
5273 VME_OBJECT(entry)->phys_contiguous) {
5274 pmap_remove(map->pmap,
5275 (addr64_t)(entry->vme_start),
5276 (addr64_t)(entry->vme_end));
5277 }
5278 if (entry->vme_atomic) {
5279 __vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5280 }
5281
5282 DTRACE_VM5(
5283 vm_map_clip_start,
5284 vm_map_t, map,
5285 vm_map_offset_t, entry->vme_start,
5286 vm_map_offset_t, entry->vme_end,
5287 vm_map_offset_t, startaddr,
5288 int, VME_ALIAS(entry));
5289
5290 _vm_map_clip_start(&map->hdr, entry, startaddr);
5291 if (map->holelistenabled) {
5292 vm_map_store_update_first_free(map, NULL, FALSE);
5293 } else {
5294 vm_map_store_update_first_free(map, map->first_free, FALSE);
5295 }
5296 }
5297 }
5298
5299
5300 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5301 MACRO_BEGIN \
5302 if ((startaddr) > (entry)->vme_start) \
5303 _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5304 MACRO_END
5305
5306 /*
5307 * This routine is called only when it is known that
5308 * the entry must be split.
5309 */
5310 static void
5311 _vm_map_clip_start(
5312 struct vm_map_header *map_header,
5313 vm_map_entry_t entry,
5314 vm_map_offset_t start)
5315 {
5316 vm_map_entry_t new_entry;
5317
5318 /*
5319 * Split off the front portion --
5320 * note that we must insert the new
5321 * entry BEFORE this one, so that
5322 * this entry has the specified starting
5323 * address.
5324 */
5325
5326 if (entry->map_aligned) {
5327 assert(VM_MAP_PAGE_ALIGNED(start,
5328 VM_MAP_HDR_PAGE_MASK(map_header)));
5329 }
5330
5331 new_entry = _vm_map_entry_create(map_header);
5332 vm_map_entry_copy_full(new_entry, entry);
5333
5334 new_entry->vme_end = start;
5335 assert(new_entry->vme_start < new_entry->vme_end);
5336 VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5337 if (__improbable(start >= entry->vme_end)) {
5338 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5339 }
5340 assert(start < entry->vme_end);
5341 entry->vme_start = start;
5342
5343 #if VM_BTLOG_TAGS
5344 if (new_entry->vme_kernel_object) {
5345 btref_retain(new_entry->vme_tag_btref);
5346 }
5347 #endif /* VM_BTLOG_TAGS */
5348
5349 _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5350
5351 if (entry->is_sub_map) {
5352 vm_map_reference(VME_SUBMAP(new_entry));
5353 } else {
5354 vm_object_reference(VME_OBJECT(new_entry));
5355 }
5356 }
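
/*
 * Informal sketch of what _vm_map_clip_start() does (derived from the code
 * above, for readability only):
 *
 *	before:  [ entry: old_start ......................... vme_end )
 *	after:   [ new_entry: old_start .. start )[ entry: start .. vme_end )
 *
 * The new front entry keeps the original object offset; the surviving
 * entry's offset is advanced by (start - old_start).
 */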
5357
5358
5359 /*
5360 * vm_map_clip_end: [ internal use only ]
5361 *
5362 * Asserts that the given entry ends at or before
5363 * the specified address; if necessary,
5364 * it splits the entry into two.
5365 */
5366 void
5367 vm_map_clip_end(
5368 vm_map_t map,
5369 vm_map_entry_t entry,
5370 vm_map_offset_t endaddr)
5371 {
5372 if (endaddr > entry->vme_end) {
5373 /*
5374 * Within the scope of this clipping, limit "endaddr" to
5375 * the end of this map entry...
5376 */
5377 endaddr = entry->vme_end;
5378 }
5379 #ifndef NO_NESTED_PMAP
5380 if (entry->is_sub_map && entry->use_pmap) {
5381 vm_map_offset_t start_unnest, end_unnest;
5382
5383 /*
5384 * Make sure the range between the start of this entry and
5385 * the new "endaddr" is no longer nested before we clip.
5386 * Unnest only the minimum range the platform can handle.
5387 * vm_map_clip_unnest may perform additional adjustments to
5388 * the unnest range.
5389 */
5390 start_unnest = entry->vme_start;
5391 end_unnest =
5392 (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5393 ~(pmap_shared_region_size_min(map->pmap) - 1);
5394 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5395 }
5396 #endif /* NO_NESTED_PMAP */
5397 if (endaddr < entry->vme_end) {
5398 if (!entry->is_sub_map &&
5399 VME_OBJECT(entry) &&
5400 VME_OBJECT(entry)->phys_contiguous) {
5401 pmap_remove(map->pmap,
5402 (addr64_t)(entry->vme_start),
5403 (addr64_t)(entry->vme_end));
5404 }
5405 if (entry->vme_atomic) {
5406 __vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5407 }
5408 DTRACE_VM5(
5409 vm_map_clip_end,
5410 vm_map_t, map,
5411 vm_map_offset_t, entry->vme_start,
5412 vm_map_offset_t, entry->vme_end,
5413 vm_map_offset_t, endaddr,
5414 int, VME_ALIAS(entry));
5415
5416 _vm_map_clip_end(&map->hdr, entry, endaddr);
5417 if (map->holelistenabled) {
5418 vm_map_store_update_first_free(map, NULL, FALSE);
5419 } else {
5420 vm_map_store_update_first_free(map, map->first_free, FALSE);
5421 }
5422 }
5423 }
5424
5425
5426 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5427 MACRO_BEGIN \
5428 if ((endaddr) < (entry)->vme_end) \
5429 _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5430 MACRO_END
5431
5432 /*
5433 * This routine is called only when it is known that
5434 * the entry must be split.
5435 */
5436 static void
5437 _vm_map_clip_end(
5438 struct vm_map_header *map_header,
5439 vm_map_entry_t entry,
5440 vm_map_offset_t end)
5441 {
5442 vm_map_entry_t new_entry;
5443
5444 /*
5445 * Create a new entry and insert it
5446 * AFTER the specified entry
5447 */
5448
5449 if (entry->map_aligned) {
5450 assert(VM_MAP_PAGE_ALIGNED(end,
5451 VM_MAP_HDR_PAGE_MASK(map_header)));
5452 }
5453
5454 new_entry = _vm_map_entry_create(map_header);
5455 vm_map_entry_copy_full(new_entry, entry);
5456
5457 if (__improbable(end <= entry->vme_start)) {
5458 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5459 }
5460 assert(entry->vme_start < end);
5461 new_entry->vme_start = entry->vme_end = end;
5462 VME_OFFSET_SET(new_entry,
5463 VME_OFFSET(new_entry) + (end - entry->vme_start));
5464 assert(new_entry->vme_start < new_entry->vme_end);
5465
5466 #if VM_BTLOG_TAGS
5467 if (new_entry->vme_kernel_object) {
5468 btref_retain(new_entry->vme_tag_btref);
5469 }
5470 #endif /* VM_BTLOG_TAGS */
5471
5472 _vm_map_store_entry_link(map_header, entry, new_entry);
5473
5474 if (entry->is_sub_map) {
5475 vm_map_reference(VME_SUBMAP(new_entry));
5476 } else {
5477 vm_object_reference(VME_OBJECT(new_entry));
5478 }
5479 }
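/*
 * Worked example (illustrative): clipping an entry spanning
 * [0x1000, 0x5000) at end == 0x3000 shrinks "entry" to
 * [0x1000, 0x3000) and links a new entry covering [0x3000, 0x5000),
 * with its object offset advanced by 0x2000, right after it.
 */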
5480
5481
5482 /*
5483 * VM_MAP_RANGE_CHECK: [ internal use only ]
5484 *
5485 * Asserts that the starting and ending region
5486 * addresses fall within the valid range of the map.
5487 */
5488 #define VM_MAP_RANGE_CHECK(map, start, end) \
5489 MACRO_BEGIN \
5490 if (start < vm_map_min(map)) \
5491 start = vm_map_min(map); \
5492 if (end > vm_map_max(map)) \
5493 end = vm_map_max(map); \
5494 if (start > end) \
5495 start = end; \
5496 MACRO_END
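/*
 * Example (illustrative numbers): for a map whose valid range is
 * [0x1000, 0x8000), VM_MAP_RANGE_CHECK(map, start, end) clamps
 * (start = 0x500, end = 0x9000) to (0x1000, 0x8000); a start that
 * clamps above the end degenerates to an empty range (start == end).
 */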
5497
5498 /*
5499 * vm_map_range_check: [ internal use only ]
5500 *
5501 * Check that the region defined by the specified start and
5502 * end addresses is wholly contained within a single map
5503 * entry or set of adjacent map entries of the specified map,
5504 * i.e. the specified region contains no unmapped space.
5505 * If any or all of the region is unmapped, FALSE is returned.
5506 * Otherwise, TRUE is returned and if the output argument 'entry'
5507 * is not NULL it points to the map entry containing the start
5508 * of the region.
5509 *
5510 * The map is locked for reading on entry and is left locked.
5511 */
5512 static boolean_t
5513 vm_map_range_check(
5514 vm_map_t map,
5515 vm_map_offset_t start,
5516 vm_map_offset_t end,
5517 vm_map_entry_t *entry)
5518 {
5519 vm_map_entry_t cur;
5520 vm_map_offset_t prev;
5521
5522 /*
5523 * Basic sanity checks first
5524 */
5525 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5526 return FALSE;
5527 }
5528
5529 /*
5530 * Check first if the region starts within a valid
5531 * mapping for the map.
5532 */
5533 if (!vm_map_lookup_entry(map, start, &cur)) {
5534 return FALSE;
5535 }
5536
5537 /*
5538 * Optimize for the case that the region is contained
5539 * in a single map entry.
5540 */
5541 if (entry != (vm_map_entry_t *) NULL) {
5542 *entry = cur;
5543 }
5544 if (end <= cur->vme_end) {
5545 return TRUE;
5546 }
5547
5548 /*
5549 * If the region is not wholly contained within a
5550 * single entry, walk the entries looking for holes.
5551 */
5552 prev = cur->vme_end;
5553 cur = cur->vme_next;
5554 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5555 if (end <= cur->vme_end) {
5556 return TRUE;
5557 }
5558 prev = cur->vme_end;
5559 cur = cur->vme_next;
5560 }
5561 return FALSE;
5562 }
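/*
 * Hypothetical usage sketch (caller names assumed): with the map
 * read-locked, a caller can verify that a range is fully backed
 * before acting on it:
 *
 *	vm_map_entry_t first;
 *
 *	if (!vm_map_range_check(map, start, end, &first)) {
 *		return KERN_INVALID_ADDRESS;
 *	}
 *
 * On success, "first" points at the entry containing "start".
 */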
5563
5564 static __attribute__((always_inline, warn_unused_result))
5565 kern_return_t
5566 vm_map_protect_sanitize(
5567 vm_map_t map,
5568 vm_map_offset_ut start_u,
5569 vm_map_offset_ut end_u,
5570 vm_prot_ut new_prot_u,
5571 vm_map_offset_t *start,
5572 vm_map_offset_t *end,
5573 vm_prot_t *new_prot)
5574 {
5575 kern_return_t kr;
5576 vm_map_size_t size;
5577
5578 kr = vm_sanitize_prot(new_prot_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5579 map, VM_PROT_COPY, new_prot);
5580 if (__improbable(kr != KERN_SUCCESS)) {
5581 return kr;
5582 }
5583
5584 kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_PROTECT,
5585 map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
5586 if (__improbable(kr != KERN_SUCCESS)) {
5587 return kr;
5588 }
5589
5590 return KERN_SUCCESS;
5591 }
5592
5593 /*
5594 * vm_map_protect:
5595 *
5596 * Sets the protection of the specified address
5597 * region in the target map. If "set_max" is
5598 * specified, the maximum protection is to be set;
5599 * otherwise, only the current protection is affected.
5600 */
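/*
 * Worked example (illustrative): for an entry whose protection is
 * VM_PROT_READ|VM_PROT_WRITE and whose max_protection is VM_PROT_ALL,
 *  - set_max == FALSE with VM_PROT_READ leaves max_protection alone
 *    and sets the current protection to VM_PROT_READ;
 *  - set_max == TRUE with VM_PROT_READ sets max_protection to
 *    VM_PROT_READ and masks the current protection down to
 *    VM_PROT_READ as well (new_prot & old_prot).
 */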
5601 kern_return_t
5602 vm_map_protect(
5603 vm_map_t map,
5604 vm_map_offset_ut start_u,
5605 vm_map_offset_ut end_u,
5606 boolean_t set_max,
5607 vm_prot_ut new_prot_u)
5608 {
5609 vm_map_entry_t current;
5610 vm_map_offset_t prev;
5611 vm_map_entry_t entry;
5612 vm_prot_t new_prot;
5613 vm_prot_t new_max;
5614 int pmap_options = 0;
5615 kern_return_t kr;
5616 vm_map_offset_t start, original_start;
5617 vm_map_offset_t end;
5618
5619 kr = vm_map_protect_sanitize(map,
5620 start_u,
5621 end_u,
5622 new_prot_u,
5623 &start,
5624 &end,
5625 &new_prot);
5626 if (__improbable(kr != KERN_SUCCESS)) {
5627 return vm_sanitize_get_kr(kr);
5628 }
5629 original_start = start;
5630
5631 if (new_prot & VM_PROT_COPY) {
5632 vm_map_offset_t new_start;
5633 vm_prot_t cur_prot, max_prot;
5634 vm_map_kernel_flags_t kflags;
5635
5636 /* LP64todo - see below */
5637 if (start >= map->max_offset) {
5638 return KERN_INVALID_ADDRESS;
5639 }
5640
5641 if ((new_prot & VM_PROT_ALLEXEC) &&
5642 map->pmap != kernel_pmap &&
5643 (vm_map_cs_enforcement(map)
5644 #if XNU_TARGET_OS_OSX && __arm64__
5645 || !VM_MAP_IS_EXOTIC(map)
5646 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5647 ) &&
5648 VM_MAP_POLICY_WX_FAIL(map)) {
5649 DTRACE_VM3(cs_wx,
5650 uint64_t, (uint64_t) start,
5651 uint64_t, (uint64_t) end,
5652 vm_prot_t, new_prot);
5653 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5654 proc_selfpid(),
5655 (get_bsdtask_info(current_task())
5656 ? proc_name_address(get_bsdtask_info(current_task()))
5657 : "?"),
5658 __FUNCTION__, __LINE__,
5659 #if DEVELOPMENT || DEBUG
5660 (uint64_t)start,
5661 (uint64_t)end,
5662 #else /* DEVELOPMENT || DEBUG */
5663 (uint64_t)0,
5664 (uint64_t)0,
5665 #endif /* DEVELOPMENT || DEBUG */
5666 new_prot);
5667 return KERN_PROTECTION_FAILURE;
5668 }
5669
5670 /*
5671 * Let vm_map_remap_extract() know that it will need to:
5672 * + make a copy of the mapping
5673 * + add VM_PROT_WRITE to the max protections
5674 * + remove any protections that are no longer allowed from the
5675 * max protections (to avoid any WRITE/EXECUTE conflict, for
5676 * example).
5677 * Note that "max_prot" is an IN/OUT parameter only for this
5678 * specific (VM_PROT_COPY) case. It's usually an OUT parameter
5679 * only.
5680 */
5681 max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5682 cur_prot = VM_PROT_NONE;
5683 kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
5684 kflags.vmkf_remap_prot_copy = true;
5685 kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
5686 new_start = start;
5687 kr = vm_map_remap(map,
5688 vm_sanitize_wrap_addr_ref(&new_start),
5689 end - start,
5690 0, /* mask */
5691 kflags,
5692 map,
5693 start,
5694 TRUE, /* copy-on-write remapping! */
5695 vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */
5696 vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */
5697 VM_INHERIT_DEFAULT);
5698 if (kr != KERN_SUCCESS) {
5699 return kr;
5700 }
5701 new_prot &= ~VM_PROT_COPY;
5702 }
5703
5704 vm_map_lock(map);
5705 restart_after_unlock:
5706
5707 /* LP64todo - remove this check when vm_map_commpage64()
5708 * no longer has to stuff in a map_entry for the commpage
5709 * above the map's max_offset.
5710 */
5711 if (start >= map->max_offset) {
5712 vm_map_unlock(map);
5713 return KERN_INVALID_ADDRESS;
5714 }
5715
5716 while (1) {
5717 /*
5718 * Lookup the entry. If it doesn't start in a valid
5719 * entry, return an error.
5720 */
5721 if (!vm_map_lookup_entry(map, start, &entry)) {
5722 vm_map_unlock(map);
5723 return KERN_INVALID_ADDRESS;
5724 }
5725
5726 if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5727 start = SUPERPAGE_ROUND_DOWN(start);
5728 continue;
5729 }
5730 break;
5731 }
5732 if (entry->superpage_size) {
5733 end = SUPERPAGE_ROUND_UP(end);
5734 }
5735
5736 /*
5737 * Make a first pass to check for protection and address
5738 * violations.
5739 */
5740
5741 current = entry;
5742 prev = current->vme_start;
5743 while ((current != vm_map_to_entry(map)) &&
5744 (current->vme_start < end)) {
5745 /*
5746 * If there is a hole, return an error.
5747 */
5748 if (current->vme_start != prev) {
5749 vm_map_unlock(map);
5750 return KERN_INVALID_ADDRESS;
5751 }
5752
5753 new_max = current->max_protection;
5754
5755 #if defined(__x86_64__)
5756 /* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5757 if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5758 new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5759 }
5760 #elif CODE_SIGNING_MONITOR
5761 if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
5762 new_max |= VM_PROT_EXECUTE;
5763 }
5764 #endif
5765 if ((new_prot & new_max) != new_prot) {
5766 vm_map_unlock(map);
5767 return KERN_PROTECTION_FAILURE;
5768 }
5769
5770 if (current->used_for_jit &&
5771 pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5772 vm_map_unlock(map);
5773 return KERN_PROTECTION_FAILURE;
5774 }
5775
5776 #if __arm64e__
5777 /* Disallow protecting hw assisted TPRO mappings */
5778 if (current->used_for_tpro) {
5779 vm_map_unlock(map);
5780 return KERN_PROTECTION_FAILURE;
5781 }
5782 #endif /* __arm64e__ */
5783
5784
5785 if ((new_prot & VM_PROT_WRITE) &&
5786 (new_prot & VM_PROT_ALLEXEC) &&
5787 #if XNU_TARGET_OS_OSX
5788 map->pmap != kernel_pmap &&
5789 (vm_map_cs_enforcement(map)
5790 #if __arm64__
5791 || !VM_MAP_IS_EXOTIC(map)
5792 #endif /* __arm64__ */
5793 ) &&
5794 #endif /* XNU_TARGET_OS_OSX */
5795 #if CODE_SIGNING_MONITOR
5796 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
5797 #endif
5798 !(current->used_for_jit)) {
5799 DTRACE_VM3(cs_wx,
5800 uint64_t, (uint64_t) current->vme_start,
5801 uint64_t, (uint64_t) current->vme_end,
5802 vm_prot_t, new_prot);
5803 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
5804 proc_selfpid(),
5805 (get_bsdtask_info(current_task())
5806 ? proc_name_address(get_bsdtask_info(current_task()))
5807 : "?"),
5808 __FUNCTION__, __LINE__,
5809 #if DEVELOPMENT || DEBUG
5810 (uint64_t)current->vme_start,
5811 (uint64_t)current->vme_end,
5812 #else /* DEVELOPMENT || DEBUG */
5813 (uint64_t)0,
5814 (uint64_t)0,
5815 #endif /* DEVELOPMENT || DEBUG */
5816 new_prot);
5817 new_prot &= ~VM_PROT_ALLEXEC;
5818 if (VM_MAP_POLICY_WX_FAIL(map)) {
5819 vm_map_unlock(map);
5820 return KERN_PROTECTION_FAILURE;
5821 }
5822 }
5823
5824 /*
5825 * If the task has requested executable lockdown,
5826 * deny both:
5827 * - adding executable protections OR
5828 * - adding write protections to an existing executable mapping.
5829 */
5830 if (map->map_disallow_new_exec == TRUE) {
5831 if ((new_prot & VM_PROT_ALLEXEC) ||
5832 ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
5833 vm_map_unlock(map);
5834 return KERN_PROTECTION_FAILURE;
5835 }
5836 }
5837
5838 prev = current->vme_end;
5839 current = current->vme_next;
5840 }
5841
5842 #if __arm64__
5843 if (end > prev &&
5844 end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
5845 vm_map_entry_t prev_entry;
5846
5847 prev_entry = current->vme_prev;
5848 if (prev_entry != vm_map_to_entry(map) &&
5849 !prev_entry->map_aligned &&
5850 (vm_map_round_page(prev_entry->vme_end,
5851 VM_MAP_PAGE_MASK(map))
5852 == end)) {
5853 /*
5854 * The last entry in our range is not "map-aligned"
5855 * but it would have reached all the way to "end"
5856 * if it had been map-aligned, so this is not really
5857 * a hole in the range and we can proceed.
5858 */
5859 prev = end;
5860 }
5861 }
5862 #endif /* __arm64__ */
5863
5864 if (end > prev) {
5865 vm_map_unlock(map);
5866 return KERN_INVALID_ADDRESS;
5867 }
5868
5869 /*
5870 * Go back and fix up protections.
5871 * Clip to start here if the range starts within
5872 * the entry.
5873 */
5874
5875 current = entry;
5876 if (current != vm_map_to_entry(map)) {
5877 /* clip and unnest if necessary */
5878 vm_map_clip_start(map, current, start);
5879 }
5880
5881 while ((current != vm_map_to_entry(map)) &&
5882 (current->vme_start < end)) {
5883 vm_prot_t old_prot;
5884
5885 if (current->in_transition) {
5886 wait_result_t wait_result;
5887 vm_map_offset_t current_start;
5888
5889 /*
5890 * Another thread is wiring/unwiring this entry.
5891 * Let the other thread know we are waiting.
5892 */
5893 current_start = current->vme_start;
5894 current->needs_wakeup = true;
5895 /* wait for the other thread to be done */
5896 wait_result = vm_map_entry_wait(map, TH_UNINT);
5897 /*
5898 * We unlocked the map, so anything could have changed in the
5899 * range and we need to re-check from "current_start" to "end".
5900 * Our entries might no longer be valid.
5901 */
5902 current = NULL;
5903 entry = NULL;
5904 /*
5905 * Re-lookup and re-clip "current_start".
5906 * If it's no longer mapped, the restart below will handle it.
5907 */
5908 vm_map_lookup_entry_or_next(map, current_start, &current);
5909 if (current != vm_map_to_entry(map)) {
5910 vm_map_clip_start(map, current, current_start);
5911 }
5912 /* restart from this point */
5913 start = current_start;
5914 goto restart_after_unlock;
5915 }
5916
5917 vm_map_clip_end(map, current, end);
5918
5919 #if DEVELOPMENT || DEBUG
5920 if (current->csm_associated && vm_log_xnu_user_debug) {
5921 printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
5922 proc_selfpid(),
5923 (get_bsdtask_info(current_task())
5924 ? proc_name_address(get_bsdtask_info(current_task()))
5925 : "?"),
5926 __FUNCTION__,
5927 (uint64_t)start,
5928 (uint64_t)end,
5929 new_prot,
5930 map, current,
5931 current->vme_start,
5932 current->vme_end,
5933 current->protection,
5934 current->max_protection);
5935 }
5936 #endif /* DEVELOPMENT || DEBUG */
5937
5938 if (current->is_sub_map) {
5939 /* clipping did unnest if needed */
5940 assert(!current->use_pmap);
5941 }
5942
5943 old_prot = current->protection;
5944
5945 if (set_max) {
5946 current->max_protection = new_prot;
5947 /* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
5948 current->protection = (new_prot & old_prot);
5949 } else {
5950 current->protection = new_prot;
5951 }
5952
5953 #if CODE_SIGNING_MONITOR
5954 if (!current->vme_xnu_user_debug &&
5955 /* a !csm_associated mapping becoming executable */
5956 ((!current->csm_associated &&
5957 !(old_prot & VM_PROT_EXECUTE) &&
5958 (current->protection & VM_PROT_EXECUTE))
5959 ||
5960 /* a csm_associated mapping becoming writable */
5961 (current->csm_associated &&
5962 !(old_prot & VM_PROT_WRITE) &&
5963 (current->protection & VM_PROT_WRITE)))) {
5964 /*
5965 * This mapping has not already been marked as
5966 * "user_debug" and it is either:
5967 * 1. not code-signing-monitored and becoming executable
5968 * 2. code-signing-monitored and becoming writable,
5969 * so inform the CodeSigningMonitor and mark the
5970 * mapping as "user_debug" if appropriate.
5971 */
5972 vm_map_kernel_flags_t vmk_flags;
5973 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
5974 /* pretend it's a vm_protect(VM_PROT_COPY)... */
5975 vmk_flags.vmkf_remap_prot_copy = true;
5976 kr = vm_map_entry_cs_associate(map, current, vmk_flags);
5977 #if DEVELOPMENT || DEBUG
5978 if (vm_log_xnu_user_debug) {
5979 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
5980 proc_selfpid(),
5981 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
5982 __FUNCTION__, __LINE__,
5983 map, current,
5984 current->vme_start, current->vme_end,
5985 old_prot, current->protection,
5986 kr, current->vme_xnu_user_debug);
5987 }
5988 #endif /* DEVELOPMENT || DEBUG */
5989 }
5990 #endif /* CODE_SIGNING_MONITOR */
5991
5992 /*
5993 * Update physical map if necessary.
5994 * If the request is to turn off write protection,
5995 * we won't do it for real (in pmap). This is because
5996 * it would cause copy-on-write to fail. We've already
5997 * set the new protection in the map, so if a
5998 * write-protect fault occurred, it will be fixed up
5999 * properly, COW or not.
6000 */
6001 if (current->protection != old_prot) {
6002 /* Look one level in, since we support nested pmaps */
6003 /* from mapped submaps which are direct entries */
6004 /* in our map */
6005
6006 vm_prot_t prot;
6007
6008 prot = current->protection;
6009 if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6010 prot &= ~VM_PROT_WRITE;
6011 } else {
6012 assert(!VME_OBJECT(current)->code_signed);
6013 assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6014 if (prot & VM_PROT_WRITE) {
6015 /*
6016 * For write requests on the
6017 * compressor, we will ask the
6018 * pmap layer to prevent us from
6019 * taking a write fault when we
6020 * attempt to access the mapping
6021 * next.
6022 */
6023 pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6024 }
6025 }
6026
6027 if (override_nx(map, VME_ALIAS(current)) && prot) {
6028 prot |= VM_PROT_EXECUTE;
6029 }
6030
6031 #if DEVELOPMENT || DEBUG
6032 if (!(old_prot & VM_PROT_EXECUTE) &&
6033 (prot & VM_PROT_EXECUTE) &&
6034 panic_on_unsigned_execute &&
6035 (proc_selfcsflags() & CS_KILL)) {
6036 panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6037 }
6038 #endif /* DEVELOPMENT || DEBUG */
6039
6040 if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6041 if (current->wired_count) {
6042 panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6043 map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6044 }
6045
6046 /* If the pmap layer cares about this
6047 * protection type, force a fault for
6048 * each page so that vm_fault will
6049 * repopulate the page with the full
6050 * set of protections.
6051 */
6052 /*
6053 * TODO: We don't seem to need this,
6054 * but this is due to an internal
6055 * implementation detail of
6056 * pmap_protect. Do we want to rely
6057 * on this?
6058 */
6059 prot = VM_PROT_NONE;
6060 }
6061
6062 if (current->is_sub_map && current->use_pmap) {
6063 pmap_protect(VME_SUBMAP(current)->pmap,
6064 current->vme_start,
6065 current->vme_end,
6066 prot);
6067 } else {
6068 pmap_protect_options(map->pmap,
6069 current->vme_start,
6070 current->vme_end,
6071 prot,
6072 pmap_options,
6073 NULL);
6074 }
6075 }
6076 current = current->vme_next;
6077 }
6078
6079 if (entry == VM_MAP_ENTRY_NULL) {
6080 /*
6081 * Re-lookup the original start of our range.
6082 * If it's no longer mapped, start with the next mapping.
6083 */
6084 vm_map_lookup_entry_or_next(map, original_start, &entry);
6085 }
6086 current = entry;
6087 while ((current != vm_map_to_entry(map)) &&
6088 (current->vme_start <= end)) {
6089 vm_map_simplify_entry(map, current);
6090 current = current->vme_next;
6091 }
6092
6093 vm_map_unlock(map);
6094 return KERN_SUCCESS;
6095 }
6096
6097 static __attribute__((always_inline, warn_unused_result))
6098 kern_return_t
6099 vm_map_inherit_sanitize(
6100 vm_map_t map,
6101 vm_map_offset_ut start_u,
6102 vm_map_offset_ut end_u,
6103 vm_inherit_ut new_inheritance_u,
6104 vm_map_offset_t *start,
6105 vm_map_offset_t *end,
6106 vm_inherit_t *new_inheritance)
6107 {
6108 kern_return_t kr;
6109 vm_map_size_t size;
6110
6111 kr = vm_sanitize_inherit(new_inheritance_u,
6112 VM_SANITIZE_CALLER_VM_MAP_INHERIT, new_inheritance);
6113 if (__improbable(kr != KERN_SUCCESS)) {
6114 return kr;
6115 }
6116
6117 kr = vm_sanitize_addr_end(start_u, end_u, VM_SANITIZE_CALLER_VM_MAP_INHERIT,
6118 map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end, &size);
6119 if (__improbable(kr != KERN_SUCCESS)) {
6120 return kr;
6121 }
6122
6123 return KERN_SUCCESS;
6124 }
6125
6126 /*
6127 * vm_map_inherit:
6128 *
6129 * Sets the inheritance of the specified address
6130 * range in the target map. Inheritance
6131 * affects how the map will be shared with
6132 * child maps at the time of vm_map_fork.
6133 */
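/*
 * Illustrative summary (standard Mach inheritance semantics, beyond
 * what the code below enforces this is an assumption, not a statement
 * about this implementation): VM_INHERIT_NONE leaves the range
 * unmapped in a child created by vm_map_fork(); VM_INHERIT_SHARE maps
 * the same object in the child so writes are visible to both;
 * VM_INHERIT_COPY gives the child a copy-on-write copy. The loop
 * below additionally rejects VM_INHERIT_COPY for submap entries.
 */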
6134 kern_return_t
6135 vm_map_inherit(
6136 vm_map_t map,
6137 vm_map_offset_ut start_u,
6138 vm_map_offset_ut end_u,
6139 vm_inherit_ut new_inheritance_u)
6140 {
6141 vm_map_entry_t entry;
6142 vm_map_entry_t temp_entry;
6143 kern_return_t kr;
6144 vm_map_offset_t start;
6145 vm_map_offset_t end;
6146 vm_inherit_t new_inheritance;
6147
6148 kr = vm_map_inherit_sanitize(map,
6149 start_u,
6150 end_u,
6151 new_inheritance_u,
6152 &start,
6153 &end,
6154 &new_inheritance);
6155 if (__improbable(kr != KERN_SUCCESS)) {
6156 return vm_sanitize_get_kr(kr);
6157 }
6158
6159 vm_map_lock(map);
6160
6161 VM_MAP_RANGE_CHECK(map, start, end);
6162
6163 if (vm_map_lookup_entry(map, start, &temp_entry)) {
6164 entry = temp_entry;
6165 } else {
6166 temp_entry = temp_entry->vme_next;
6167 entry = temp_entry;
6168 }
6169
6170 /* first check entire range for submaps which can't support the */
6171 /* given inheritance. */
6172 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6173 if (entry->is_sub_map) {
6174 if (new_inheritance == VM_INHERIT_COPY) {
6175 vm_map_unlock(map);
6176 return KERN_INVALID_ARGUMENT;
6177 }
6178 }
6179
6180 entry = entry->vme_next;
6181 }
6182
6183 entry = temp_entry;
6184 if (entry != vm_map_to_entry(map)) {
6185 /* clip and unnest if necessary */
6186 vm_map_clip_start(map, entry, start);
6187 }
6188
6189 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6190 vm_map_clip_end(map, entry, end);
6191 if (entry->is_sub_map) {
6192 /* clip did unnest if needed */
6193 assert(!entry->use_pmap);
6194 }
6195
6196 entry->inheritance = new_inheritance;
6197
6198 entry = entry->vme_next;
6199 }
6200
6201 vm_map_unlock(map);
6202 return KERN_SUCCESS;
6203 }
6204
6205 /*
6206 * Update the accounting for the amount of wired memory in this map. If the user has
6207 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6208 */
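/*
 * Illustrative numbers (hypothetical limits): with a per-map user
 * wire limit of 64 MB and 60 MB already accounted in
 * map->user_wire_size, a first-time user wire of an 8 MB entry is
 * refused with KERN_RESOURCE_SHORTAGE, even if the global limit
 * still has headroom.
 */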
6209
6210 static kern_return_t
6211 add_wire_counts(
6212 vm_map_t map,
6213 vm_map_entry_t entry,
6214 boolean_t user_wire)
6215 {
6216 vm_map_size_t size;
6217
6218 bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6219
6220 if (user_wire) {
6221 unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;
6222
6223 /*
6224 * We're wiring memory at the request of the user. Check if this is the first time the user is wiring
6225 * this map entry.
6226 */
6227
6228 if (entry->user_wired_count == 0) {
6229 size = entry->vme_end - entry->vme_start;
6230
6231 /*
6232 * Since this is the first time the user is wiring this map entry, check to see if we're
6233 * exceeding the user wire limits. There is a per map limit which is the smaller of either
6234 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also
6235 * a system-wide limit on the amount of memory all users can wire. If the user is over either
6236 * limit, then we fail.
6237 */
6238
6239 if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6240 size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6241 if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6242 #if DEVELOPMENT || DEBUG
6243 if (panic_on_mlock_failure) {
6244 panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6245 }
6246 #endif /* DEVELOPMENT || DEBUG */
6247 os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6248 } else {
6249 os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6250 #if DEVELOPMENT || DEBUG
6251 if (panic_on_mlock_failure) {
6252 panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6253 }
6254 #endif /* DEVELOPMENT || DEBUG */
6255 }
6256 return KERN_RESOURCE_SHORTAGE;
6257 }
6258
6259 /*
6260 * The first time the user wires an entry, we also increment the wired_count and add this to
6261 * the total that has been wired in the map.
6262 */
6263
6264 if (entry->wired_count >= MAX_WIRE_COUNT) {
6265 return KERN_FAILURE;
6266 }
6267
6268 entry->wired_count++;
6269 map->user_wire_size += size;
6270 }
6271
6272 if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6273 return KERN_FAILURE;
6274 }
6275
6276 entry->user_wired_count++;
6277 } else {
6278 /*
6279 * The kernel's wiring the memory. Just bump the count and continue.
6280 */
6281
6282 if (entry->wired_count >= MAX_WIRE_COUNT) {
6283 panic("vm_map_wire: too many wirings");
6284 }
6285
6286 entry->wired_count++;
6287 }
6288
6289 if (first_wire) {
6290 vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6291 }
6292
6293 return KERN_SUCCESS;
6294 }
6295
6296 /*
6297 * Update the memory wiring accounting now that the given map entry is being unwired.
6298 */
6299
6300 static void
6301 subtract_wire_counts(
6302 vm_map_t map,
6303 vm_map_entry_t entry,
6304 boolean_t user_wire)
6305 {
6306 if (user_wire) {
6307 /*
6308 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6309 */
6310
6311 if (entry->user_wired_count == 1) {
6312 /*
6313 * We're removing the last user wire reference. Decrement the wired_count and the total
6314 * user wired memory for this map.
6315 */
6316
6317 assert(entry->wired_count >= 1);
6318 entry->wired_count--;
6319 map->user_wire_size -= entry->vme_end - entry->vme_start;
6320 }
6321
6322 assert(entry->user_wired_count >= 1);
6323 entry->user_wired_count--;
6324 } else {
6325 /*
6326 * The kernel is unwiring the memory. Just update the count.
6327 */
6328
6329 assert(entry->wired_count >= 1);
6330 entry->wired_count--;
6331 }
6332
6333 vme_btref_consider_and_put(entry);
6334 }
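/*
 * Illustrative example: an entry with wired_count == 1 and
 * user_wired_count == 2 (user-wired twice) needs two user unwires;
 * the first only drops user_wired_count to 1, the second drops both
 * counts to 0, at which point the pages can actually be unwired.
 */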
6335
6336 int cs_executable_wire = 0;
6337
6338 static kern_return_t
6339 vm_map_wire_nested(
6340 vm_map_t map,
6341 vm_map_offset_t start,
6342 vm_map_offset_t end,
6343 vm_prot_t caller_prot,
6344 vm_tag_t tag,
6345 boolean_t user_wire,
6346 pmap_t map_pmap,
6347 vm_map_offset_t pmap_addr,
6348 ppnum_t *physpage_p)
6349 {
6350 vm_map_entry_t entry;
6351 vm_prot_t access_type;
6352 struct vm_map_entry *first_entry, tmp_entry;
6353 vm_map_t real_map;
6354 vm_map_offset_t s, e;
6355 kern_return_t rc;
6356 boolean_t need_wakeup;
6357 boolean_t main_map = FALSE;
6358 wait_interrupt_t interruptible_state;
6359 thread_t cur_thread;
6360 unsigned int last_timestamp;
6361 vm_map_size_t size;
6362 boolean_t wire_and_extract;
6363 vm_prot_t extra_prots;
6364
6365 extra_prots = VM_PROT_COPY;
6366 extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6367 #if XNU_TARGET_OS_OSX
6368 if (map->pmap == kernel_pmap ||
6369 !vm_map_cs_enforcement(map)) {
6370 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6371 }
6372 #endif /* XNU_TARGET_OS_OSX */
6373 #if CODE_SIGNING_MONITOR
6374 if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6375 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6376 }
6377 #endif /* CODE_SIGNING_MONITOR */
6378
6379 access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6380
6381 wire_and_extract = FALSE;
6382 if (physpage_p != NULL) {
6383 /*
6384 * The caller wants the physical page number of the
6385 * wired page. We return only one physical page number
6386 * so this works for only one page at a time.
6387 *
6388 * The only caller (vm_map_wire_and_extract)
6389 * guarantees it.
6390 */
6391 assert(end - start == VM_MAP_PAGE_SIZE(map));
6392 wire_and_extract = TRUE;
6393 *physpage_p = 0;
6394 }
6395
6396 VM_MAP_RANGE_CHECK(map, start, end);
6397 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6398 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6399 if (start == end) {
6400 /* We wired what the caller asked for, zero pages */
6401 return KERN_SUCCESS;
6402 }
6403
6404 vm_map_lock(map);
6405 if (map_pmap == NULL) {
6406 main_map = TRUE;
6407 }
6408 last_timestamp = map->timestamp;
6409
6410 need_wakeup = FALSE;
6411 cur_thread = current_thread();
6412
6413 s = start;
6414 rc = KERN_SUCCESS;
6415
6416 if (vm_map_lookup_entry(map, s, &first_entry)) {
6417 entry = first_entry;
6418 /*
6419 * vm_map_clip_start will be done later.
6420 * We don't want to unnest any nested submaps here !
6421 */
6422 } else {
6423 /* Start address is not in map */
6424 rc = KERN_INVALID_ADDRESS;
6425 goto done;
6426 }
6427
6428 while ((entry != vm_map_to_entry(map)) && (s < end)) {
6429 /*
6430 * At this point, we have wired from "start" to "s".
6431 * We still need to wire from "s" to "end".
6432 *
6433 * "entry" hasn't been clipped, so it could start before "s"
6434 * and/or end after "end".
6435 */
6436
6437 /* "e" is how far we want to wire in this entry */
6438 e = entry->vme_end;
6439 if (e > end) {
6440 e = end;
6441 }
6442
6443 /*
6444 * If another thread is wiring/unwiring this entry then
6445 * block after informing the other thread to wake us up.
6446 */
6447 if (entry->in_transition) {
6448 wait_result_t wait_result;
6449
6450 /*
6451 * We have not clipped the entry. Make sure that
6452 * the start address is in range so that the lookup
6453 * below will succeed.
6454 * "s" is the current starting point: we've already
6455 * wired from "start" to "s" and we still have
6456 * to wire from "s" to "end".
6457 */
6458
6459 entry->needs_wakeup = TRUE;
6460
6461 /*
6462 * wake up anybody waiting on entries that we have
6463 * already wired.
6464 */
6465 if (need_wakeup) {
6466 vm_map_entry_wakeup(map);
6467 need_wakeup = FALSE;
6468 }
6469 /*
6470 * User wiring is interruptible
6471 */
6472 wait_result = vm_map_entry_wait(map,
6473 (user_wire) ? THREAD_ABORTSAFE :
6474 THREAD_UNINT);
6475 if (user_wire && wait_result == THREAD_INTERRUPTED) {
6476 /*
6477 * undo the wirings we have done so far
6478 * We do not clear the needs_wakeup flag,
6479 * because we cannot tell if we were the
6480 * only one waiting.
6481 */
6482 rc = KERN_FAILURE;
6483 goto done;
6484 }
6485
6486 /*
6487 * Cannot avoid a lookup here. reset timestamp.
6488 */
6489 last_timestamp = map->timestamp;
6490
6491 /*
6492 * The entry could have been clipped, look it up again.
6493 * Worst that can happen is that it may not exist anymore.
6494 */
6495 if (!vm_map_lookup_entry(map, s, &first_entry)) {
6496 /*
6497 * User: undo everything up to the previous
6498 * entry. Let vm_map_unwire worry about
6499 * checking the validity of the range.
6500 */
6501 rc = KERN_FAILURE;
6502 goto done;
6503 }
6504 entry = first_entry;
6505 continue;
6506 }
6507
6508 if (entry->is_sub_map) {
6509 vm_map_offset_t sub_start;
6510 vm_map_offset_t sub_end;
6511 vm_map_offset_t local_start;
6512 vm_map_offset_t local_end;
6513 pmap_t pmap;
6514
6515 if (wire_and_extract) {
6516 /*
6517 * Wiring would result in copy-on-write
6518 * which would not be compatible with
6519 * the sharing we have with the original
6520 * provider of this memory.
6521 */
6522 rc = KERN_INVALID_ARGUMENT;
6523 goto done;
6524 }
6525
6526 vm_map_clip_start(map, entry, s);
6527 vm_map_clip_end(map, entry, end);
6528
6529 sub_start = VME_OFFSET(entry);
6530 sub_end = entry->vme_end;
6531 sub_end += VME_OFFSET(entry) - entry->vme_start;
6532
6533 local_end = entry->vme_end;
6534 if (map_pmap == NULL) {
6535 vm_object_t object;
6536 vm_object_offset_t offset;
6537 vm_prot_t prot;
6538 boolean_t wired;
6539 vm_map_entry_t local_entry;
6540 vm_map_version_t version;
6541 vm_map_t lookup_map;
6542
6543 if (entry->use_pmap) {
6544 pmap = VME_SUBMAP(entry)->pmap;
6545 /* ppc implementation requires that */
6546 /* a submap's pmap address ranges line */
6547 /* up with parent map */
6548 #ifdef notdef
6549 pmap_addr = sub_start;
6550 #endif
6551 pmap_addr = s;
6552 } else {
6553 pmap = map->pmap;
6554 pmap_addr = s;
6555 }
6556
6557 if (entry->wired_count) {
6558 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6559 goto done;
6560 }
6561
6562 /*
6563 * The map was not unlocked:
6564 * no need to goto re-lookup.
6565 * Just go directly to next entry.
6566 */
6567 entry = entry->vme_next;
6568 s = entry->vme_start;
6569 continue;
6570 }
6571
6572 /* call vm_map_lookup_and_lock_object to */
6573 /* cause any needs copy to be */
6574 /* evaluated */
6575 local_start = entry->vme_start;
6576 lookup_map = map;
6577 vm_map_lock_write_to_read(map);
6578 rc = vm_map_lookup_and_lock_object(
6579 &lookup_map, local_start,
6580 (access_type | extra_prots),
6581 OBJECT_LOCK_EXCLUSIVE,
6582 &version, &object,
6583 &offset, &prot, &wired,
6584 NULL,
6585 &real_map, NULL);
6586 if (rc != KERN_SUCCESS) {
6587 vm_map_unlock_read(lookup_map);
6588 assert(map_pmap == NULL);
6589 vm_map_unwire_nested(map, start,
6590 s, user_wire, PMAP_NULL, 0);
6591 return rc;
6592 }
6593 vm_object_unlock(object);
6594 if (real_map != lookup_map) {
6595 vm_map_unlock(real_map);
6596 }
6597 vm_map_unlock_read(lookup_map);
6598 vm_map_lock(map);
6599
6600 /* we unlocked, so must re-lookup */
6601 if (!vm_map_lookup_entry(map,
6602 local_start,
6603 &local_entry)) {
6604 rc = KERN_FAILURE;
6605 goto done;
6606 }
6607
6608 /*
6609 * entry could have been "simplified",
6610 * so re-clip
6611 */
6612 entry = local_entry;
6613 assert(s == local_start);
6614 vm_map_clip_start(map, entry, s);
6615 vm_map_clip_end(map, entry, end);
6616 /* re-compute "e" */
6617 e = entry->vme_end;
6618 if (e > end) {
6619 e = end;
6620 }
6621
6622 /* did we have a change of type? */
6623 if (!entry->is_sub_map) {
6624 last_timestamp = map->timestamp;
6625 continue;
6626 }
6627 } else {
6628 local_start = entry->vme_start;
6629 pmap = map_pmap;
6630 }
6631
6632 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6633 goto done;
6634 }
6635
6636 entry->in_transition = TRUE;
6637
6638 vm_map_unlock(map);
6639 rc = vm_map_wire_nested(VME_SUBMAP(entry),
6640 sub_start, sub_end,
6641 caller_prot, tag,
6642 user_wire, pmap, pmap_addr,
6643 NULL);
6644 vm_map_lock(map);
6645
6646 /*
6647 * Find the entry again. It could have been clipped
6648 * after we unlocked the map.
6649 */
6650 if (!vm_map_lookup_entry(map, local_start,
6651 &first_entry)) {
6652 panic("vm_map_wire: re-lookup failed");
6653 }
6654 entry = first_entry;
6655
6656 assert(local_start == s);
6657 /* re-compute "e" */
6658 e = entry->vme_end;
6659 if (e > end) {
6660 e = end;
6661 }
6662
6663 last_timestamp = map->timestamp;
6664 while ((entry != vm_map_to_entry(map)) &&
6665 (entry->vme_start < e)) {
6666 assert(entry->in_transition);
6667 entry->in_transition = FALSE;
6668 if (entry->needs_wakeup) {
6669 entry->needs_wakeup = FALSE;
6670 need_wakeup = TRUE;
6671 }
6672 if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6673 subtract_wire_counts(map, entry, user_wire);
6674 }
6675 entry = entry->vme_next;
6676 }
6677 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
6678 goto done;
6679 }
6680
6681 /* no need to relookup again */
6682 s = entry->vme_start;
6683 continue;
6684 }
6685
6686 /*
6687 * If this entry is already wired then increment
6688 * the appropriate wire reference count.
6689 */
6690 if (entry->wired_count) {
6691 if ((entry->protection & access_type) != access_type) {
6692 /* found a protection problem */
6693
6694 /*
6695 * XXX FBDP
6696 * We should always return an error
6697 * in this case but since we didn't
6698 * enforce it before, let's do
6699 * it only for the new "wire_and_extract"
6700 * code path for now...
6701 */
6702 if (wire_and_extract) {
6703 rc = KERN_PROTECTION_FAILURE;
6704 goto done;
6705 }
6706 }
6707
6708 /*
6709 * entry is already wired down, get our reference
6710 * after clipping to our range.
6711 */
6712 vm_map_clip_start(map, entry, s);
6713 vm_map_clip_end(map, entry, end);
6714
6715 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6716 goto done;
6717 }
6718
6719 if (wire_and_extract) {
6720 vm_object_t object;
6721 vm_object_offset_t offset;
6722 vm_page_t m;
6723
6724 /*
6725 * We don't have to "wire" the page again
6726 * but we still have to "extract" its
6727 * physical page number, after some sanity
6728 * checks.
6729 */
6730 assert((entry->vme_end - entry->vme_start)
6731 == PAGE_SIZE);
6732 assert(!entry->needs_copy);
6733 assert(!entry->is_sub_map);
6734 assert(VME_OBJECT(entry));
6735 if (((entry->vme_end - entry->vme_start)
6736 != PAGE_SIZE) ||
6737 entry->needs_copy ||
6738 entry->is_sub_map ||
6739 VME_OBJECT(entry) == VM_OBJECT_NULL) {
6740 rc = KERN_INVALID_ARGUMENT;
6741 goto done;
6742 }
6743
6744 object = VME_OBJECT(entry);
6745 offset = VME_OFFSET(entry);
6746 /* need exclusive lock to update m->dirty */
6747 if (entry->protection & VM_PROT_WRITE) {
6748 vm_object_lock(object);
6749 } else {
6750 vm_object_lock_shared(object);
6751 }
6752 m = vm_page_lookup(object, offset);
6753 assert(m != VM_PAGE_NULL);
6754 assert(VM_PAGE_WIRED(m));
6755 if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6756 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6757 if (entry->protection & VM_PROT_WRITE) {
6758 vm_object_lock_assert_exclusive(
6759 object);
6760 m->vmp_dirty = TRUE;
6761 }
6762 } else {
6763 /* not already wired !? */
6764 *physpage_p = 0;
6765 }
6766 vm_object_unlock(object);
6767 }
6768
6769 /* map was not unlocked: no need to relookup */
6770 entry = entry->vme_next;
6771 s = entry->vme_start;
6772 continue;
6773 }
6774
6775 /*
6776 * Unwired entry or wire request transmitted via submap
6777 */
6778
6779 /*
6780 * Wiring would copy the pages to the shadow object.
6781 * The shadow object would not be code-signed so
6782 * attempting to execute code from these copied pages
6783 * would trigger a code-signing violation.
6784 */
6785
6786 if ((entry->protection & VM_PROT_EXECUTE)
6787 #if XNU_TARGET_OS_OSX
6788 &&
6789 map->pmap != kernel_pmap &&
6790 (vm_map_cs_enforcement(map)
6791 #if __arm64__
6792 || !VM_MAP_IS_EXOTIC(map)
6793 #endif /* __arm64__ */
6794 )
6795 #endif /* XNU_TARGET_OS_OSX */
6796 #if CODE_SIGNING_MONITOR
6797 &&
6798 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
6799 #endif
6800 ) {
6801 #if MACH_ASSERT
6802 printf("pid %d[%s] wiring executable range from "
6803 "0x%llx to 0x%llx: rejected to preserve "
6804 "code-signing\n",
6805 proc_selfpid(),
6806 (get_bsdtask_info(current_task())
6807 ? proc_name_address(get_bsdtask_info(current_task()))
6808 : "?"),
6809 (uint64_t) entry->vme_start,
6810 (uint64_t) entry->vme_end);
6811 #endif /* MACH_ASSERT */
6812 DTRACE_VM2(cs_executable_wire,
6813 uint64_t, (uint64_t)entry->vme_start,
6814 uint64_t, (uint64_t)entry->vme_end);
6815 cs_executable_wire++;
6816 rc = KERN_PROTECTION_FAILURE;
6817 goto done;
6818 }
6819
6820 /*
6821 * Perform actions of vm_map_lookup that need the write
6822 * lock on the map: create a shadow object for a
6823 * copy-on-write region, or an object for a zero-fill
6824 * region.
6825 */
6826 size = entry->vme_end - entry->vme_start;
6827 /*
6828 * If wiring a copy-on-write page, we need to copy it now
6829 * even if we're only (currently) requesting read access.
6830 * This is aggressive, but once it's wired we can't move it.
6831 */
6832 if (entry->needs_copy) {
6833 if (wire_and_extract) {
6834 /*
6835 * We're supposed to share with the original
6836 * provider so should not be "needs_copy"
6837 */
6838 rc = KERN_INVALID_ARGUMENT;
6839 goto done;
6840 }
6841
6842 VME_OBJECT_SHADOW(entry, size,
6843 vm_map_always_shadow(map));
6844 entry->needs_copy = FALSE;
6845 } else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6846 if (wire_and_extract) {
6847 /*
6848 * We're supposed to share with the original
6849 * provider so should already have an object.
6850 */
6851 rc = KERN_INVALID_ARGUMENT;
6852 goto done;
6853 }
6854 VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6855 VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6856 assert(entry->use_pmap);
6857 } else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6858 if (wire_and_extract) {
6859 /*
6860 * We're supposed to share with the original
6861 * provider so should not be COPY_SYMMETRIC.
6862 */
6863 rc = KERN_INVALID_ARGUMENT;
6864 goto done;
6865 }
6866 /*
6867 * Force an unrequested "copy-on-write" but only for
6868 * the range we're wiring.
6869 */
6870 // printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
6871 vm_map_clip_start(map, entry, s);
6872 vm_map_clip_end(map, entry, end);
6873 /* recompute "size" */
6874 size = entry->vme_end - entry->vme_start;
6875 /* make a shadow object */
6876 vm_object_t orig_object;
6877 vm_object_offset_t orig_offset;
6878 orig_object = VME_OBJECT(entry);
6879 orig_offset = VME_OFFSET(entry);
6880 VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
6881 if (VME_OBJECT(entry) != orig_object) {
6882 /*
6883 * This mapping has not been shared (or it would be
6884 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
6885 * not been copied-on-write (or it would be marked
6886 * as "needs_copy" and would have been handled above
6887 * and also already write-protected).
6888 * We still need to write-protect here to prevent
6889 * other threads from modifying these pages while
6890 * we're in the process of copying and wiring
6891 * the copied pages.
6892 * Since the mapping is neither shared nor COWed,
6893 * we only need to write-protect the PTEs for this
6894 * mapping.
6895 */
6896 vm_object_pmap_protect(orig_object,
6897 orig_offset,
6898 size,
6899 map->pmap,
6900 VM_MAP_PAGE_SIZE(map),
6901 entry->vme_start,
6902 entry->protection & ~VM_PROT_WRITE);
6903 }
6904 }
6905 if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6906 /*
6907 * Make the object COPY_DELAY to get a stable object
6908 * to wire.
6909 * That should avoid creating long shadow chains while
6910 * wiring/unwiring the same range repeatedly.
6911 * That also prevents part of the object from being
6912 * wired while another part is "needs_copy", which
6913 * could result in conflicting rules wrt copy-on-write.
6914 */
6915 vm_object_t object;
6916
6917 object = VME_OBJECT(entry);
6918 vm_object_lock(object);
6919 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
6920 assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
6921 "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
6922 object, (uint64_t)object->vo_size,
6923 entry,
6924 (uint64_t)entry->vme_start,
6925 (uint64_t)entry->vme_end,
6926 (uint64_t)VME_OFFSET(entry),
6927 (uint64_t)size);
6928 assertf(os_ref_get_count_raw(&object->ref_count) == 1,
6929 "object %p ref_count %d\n",
6930 object, os_ref_get_count_raw(&object->ref_count));
6931 assertf(!entry->needs_copy,
6932 "entry %p\n", entry);
6933 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
6934 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
6935 }
6936 vm_object_unlock(object);
6937 }
6938
6939 vm_map_clip_start(map, entry, s);
6940 vm_map_clip_end(map, entry, end);
6941
6942 /* re-compute "e" */
6943 e = entry->vme_end;
6944 if (e > end) {
6945 e = end;
6946 }
6947
6948 /*
6949 * Check for holes and protection mismatch.
6950 * Holes: Next entry should be contiguous unless this
6951 * is the end of the region.
6952 * Protection: Access requested must be allowed, unless
6953 * wiring is by protection class
6954 */
6955 if ((entry->vme_end < end) &&
6956 ((entry->vme_next == vm_map_to_entry(map)) ||
6957 (entry->vme_next->vme_start > entry->vme_end))) {
6958 /* found a hole */
6959 rc = KERN_INVALID_ADDRESS;
6960 goto done;
6961 }
6962 if ((entry->protection & access_type) != access_type) {
6963 /* found a protection problem */
6964 rc = KERN_PROTECTION_FAILURE;
6965 goto done;
6966 }
6967
6968 assert(entry->wired_count == 0 && entry->user_wired_count == 0);
6969
6970 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6971 goto done;
6972 }
6973
6974 entry->in_transition = TRUE;
6975
6976 /*
6977 * This entry might get split once we unlock the map.
6978 * In vm_fault_wire(), we need the current range as
6979 * defined by this entry. In order for this to work
6980 * along with a simultaneous clip operation, we make a
6981 * temporary copy of this entry and use that for the
6982 * wiring. Note that the underlying objects do not
6983 * change during a clip.
6984 */
6985 tmp_entry = *entry;
6986
6987 /*
6988 * The in_transition state guarantees that the entry
6989 * (or entries for this range, if a split occurred) will be
6990 * there when the map lock is acquired for the second time.
6991 */
6992 vm_map_unlock(map);
6993
6994 if (!user_wire && cur_thread != THREAD_NULL) {
6995 interruptible_state = thread_interrupt_level(THREAD_UNINT);
6996 } else {
6997 interruptible_state = THREAD_UNINT;
6998 }
6999
7000 if (map_pmap) {
7001 rc = vm_fault_wire(map,
7002 &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7003 physpage_p);
7004 } else {
7005 rc = vm_fault_wire(map,
7006 &tmp_entry, caller_prot, tag, map->pmap,
7007 tmp_entry.vme_start,
7008 physpage_p);
7009 }
7010
7011 if (!user_wire && cur_thread != THREAD_NULL) {
7012 thread_interrupt_level(interruptible_state);
7013 }
7014
7015 vm_map_lock(map);
7016
7017 if (last_timestamp + 1 != map->timestamp) {
7018 /*
7019 * Find the entry again. It could have been clipped
7020 * after we unlocked the map.
7021 */
7022 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7023 &first_entry)) {
7024 panic("vm_map_wire: re-lookup failed");
7025 }
7026
7027 entry = first_entry;
7028 }
7029
7030 last_timestamp = map->timestamp;
7031
7032 while ((entry != vm_map_to_entry(map)) &&
7033 (entry->vme_start < tmp_entry.vme_end)) {
7034 assert(entry->in_transition);
7035 entry->in_transition = FALSE;
7036 if (entry->needs_wakeup) {
7037 entry->needs_wakeup = FALSE;
7038 need_wakeup = TRUE;
7039 }
7040 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7041 subtract_wire_counts(map, entry, user_wire);
7042 }
7043 entry = entry->vme_next;
7044 }
7045
7046 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7047 goto done;
7048 }
7049
7050 if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7051 (tmp_entry.vme_end != end) && /* AND, we are not at the end of the requested range */
7052 (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7053 /* found a "new" hole */
7054 s = tmp_entry.vme_end;
7055 rc = KERN_INVALID_ADDRESS;
7056 goto done;
7057 }
7058
7059 s = entry->vme_start;
7060 } /* end while loop through map entries */
7061
7062 done:
7063 if (rc == KERN_SUCCESS) {
7064 /* repair any damage we may have made to the VM map */
7065 vm_map_simplify_range(map, start, end);
7066 }
7067
7068 vm_map_unlock(map);
7069
7070 /*
7071 * wake up anybody waiting on entries we wired.
7072 */
7073 if (need_wakeup) {
7074 vm_map_entry_wakeup(map);
7075 }
7076
7077 if (rc != KERN_SUCCESS) {
7078 /* undo what has been wired so far */
7079 vm_map_unwire_nested(map, start, s, user_wire,
7080 map_pmap, pmap_addr);
7081 if (physpage_p) {
7082 *physpage_p = 0;
7083 }
7084 }
7085
7086 return rc;
7087 }
7088
7089 static __attribute__((always_inline, warn_unused_result))
7090 kern_return_t
7091 vm_map_wire_sanitize(
7092 vm_map_t map,
7093 vm_map_offset_ut start_u,
7094 vm_map_offset_ut end_u,
7095 vm_prot_ut prot_u,
7096 vm_sanitize_caller_t vm_sanitize_caller,
7097 vm_map_offset_t *start,
7098 vm_map_offset_t *end,
7099 vm_map_size_t *size,
7100 vm_prot_t *prot)
7101 {
7102 kern_return_t kr;
7103
7104 kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7105 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7106 size);
7107 if (__improbable(kr != KERN_SUCCESS)) {
7108 return kr;
7109 }
7110
7111 kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot);
7112 if (__improbable(kr != KERN_SUCCESS)) {
7113 return kr;
7114 }
7115
7116 return KERN_SUCCESS;
7117 }
7118
7119 /*
7120 * Validation function for vm_map_wire_nested().
7121 */
7122 kern_return_t
7123 vm_map_wire_impl(
7124 vm_map_t map,
7125 vm_map_offset_ut start_u,
7126 vm_map_offset_ut end_u,
7127 vm_prot_ut prot_u,
7128 vm_tag_t tag,
7129 boolean_t user_wire,
7130 ppnum_t *physpage_p,
7131 vm_sanitize_caller_t vm_sanitize_caller)
7132 {
7133 vm_map_offset_t start, end;
7134 vm_map_size_t size;
7135 vm_prot_t prot;
7136 kern_return_t kr;
7137
7138 /*
7139 * Sanitize any input parameters that are addr/size/prot/inherit
7140 */
7141 kr = vm_map_wire_sanitize(map,
7142 start_u,
7143 end_u,
7144 prot_u,
7145 vm_sanitize_caller,
7146 &start,
7147 &end,
7148 &size,
7149 &prot);
7150 if (__improbable(kr != KERN_SUCCESS)) {
7151 if (physpage_p) {
7152 *physpage_p = 0;
7153 }
7154 return vm_sanitize_get_kr(kr);
7155 }
7156
7157 return vm_map_wire_nested(map, start, end, prot, tag, user_wire,
7158 PMAP_NULL, 0, physpage_p);
7159 }
7160
7161 kern_return_t
7162 vm_map_wire_external(
7163 vm_map_t map,
7164 vm_map_offset_ut start_u,
7165 vm_map_offset_ut end_u,
7166 vm_prot_ut prot_u,
7167 boolean_t user_wire)
7168 {
7169 vm_tag_t tag = vm_tag_bt();
7170
7171 return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire);
7172 }
7173
7174 kern_return_t
7175 vm_map_wire_kernel(
7176 vm_map_t map,
7177 vm_map_offset_ut start_u,
7178 vm_map_offset_ut end_u,
7179 vm_prot_ut prot_u,
7180 vm_tag_t tag,
7181 boolean_t user_wire)
7182 {
7183 return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7184 user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7185 }
7186
7187 #if XNU_PLATFORM_MacOSX
7188
7189 kern_return_t
7190 vm_map_wire_and_extract(
7191 vm_map_t map,
7192 vm_map_offset_ut start_u,
7193 vm_prot_ut prot_u,
7194 boolean_t user_wire,
7195 ppnum_t *physpage_p)
7196 {
7197 vm_tag_t tag = vm_tag_bt();
7198 vm_map_size_ut size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map));
7199 vm_map_offset_ut end_u = vm_sanitize_compute_ut_end(start_u, size_u);
7200
7201 return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7202 user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7203 }
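/*
 * Hypothetical usage sketch; "user_addr" and the vm_sanitize_wrap_addr()
 * / vm_sanitize_wrap_prot() helpers for producing the unsafe (_ut)
 * parameter types are assumptions, not taken from this file:
 *
 *	ppnum_t pn = 0;
 *	kern_return_t kr;
 *
 *	kr = vm_map_wire_and_extract(map,
 *	    vm_sanitize_wrap_addr(user_addr),
 *	    vm_sanitize_wrap_prot(VM_PROT_READ | VM_PROT_WRITE),
 *	    TRUE,
 *	    &pn);
 *
 * On success, "pn" holds the physical page number of the single wired
 * page; the wired range is exactly one map page.
 */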
7204
7205 #endif /* XNU_PLATFORM_MacOSX */
7206
7207 static kern_return_t
7208 vm_map_unwire_nested(
7209 vm_map_t map,
7210 vm_map_offset_t start,
7211 vm_map_offset_t end,
7212 boolean_t user_wire,
7213 pmap_t map_pmap,
7214 vm_map_offset_t pmap_addr)
7215 {
7216 vm_map_entry_t entry;
7217 struct vm_map_entry *first_entry, tmp_entry;
7218 boolean_t need_wakeup;
7219 boolean_t main_map = FALSE;
7220 unsigned int last_timestamp;
7221
7222 VM_MAP_RANGE_CHECK(map, start, end);
7223 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7224 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7225
7226 if (start == end) {
7227 /* We unwired what the caller asked for: zero pages */
7228 return KERN_SUCCESS;
7229 }
7230
7231 vm_map_lock(map);
7232 if (map_pmap == NULL) {
7233 main_map = TRUE;
7234 }
7235 last_timestamp = map->timestamp;
7236
7237 if (vm_map_lookup_entry(map, start, &first_entry)) {
7238 entry = first_entry;
7239 /*
7240 * vm_map_clip_start will be done later.
7241 * We don't want to unnest any nested sub maps here !
7242 */
7243 } else {
7244 if (!user_wire) {
7245 panic("vm_map_unwire: start not found");
7246 }
7247 /* Start address is not in map. */
7248 vm_map_unlock(map);
7249 return KERN_INVALID_ADDRESS;
7250 }
7251
7252 if (entry->superpage_size) {
7253 /* superpages are always wired */
7254 vm_map_unlock(map);
7255 return KERN_INVALID_ADDRESS;
7256 }
7257
7258 need_wakeup = FALSE;
7259 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7260 if (entry->in_transition) {
7261 /*
7262 * 1)
7263 * Another thread is wiring down this entry. Note
7264 * that if it is not for the other thread we would
7265 * be unwiring an unwired entry. This is not
7266 * permitted. If we wait, we will be unwiring memory
7267 * we did not wire.
7268 *
7269 * 2)
7270 * Another thread is unwiring this entry. We did not
7271 * have a reference to it, because if we did, this
7272 * entry will not be getting unwired now.
7273 */
7274 if (!user_wire) {
7275 /*
7276 * XXX FBDP
7277 * This could happen: there could be some
7278 * overlapping vslock/vsunlock operations
7279 * going on.
7280 * We should probably just wait and retry,
7281 * but then we have to be careful that this
7282 * entry could get "simplified" after
7283 * "in_transition" gets unset and before
7284 * we re-lookup the entry, so we would
7285 * have to re-clip the entry to avoid
7286 * re-unwiring what we have already unwired...
7287 * See vm_map_wire_nested().
7288 *
7289 * Or we could just ignore "in_transition"
7290 * here and proceed to decrement the wired
7291 * count(s) on this entry. That should be fine
7292 * as long as "wired_count" doesn't drop all
7293 * the way to 0 (and we should panic if THAT
7294 * happens).
7295 */
7296 panic("vm_map_unwire: in_transition entry");
7297 }
7298
7299 entry = entry->vme_next;
7300 continue;
7301 }
7302
7303 if (entry->is_sub_map) {
7304 vm_map_offset_t sub_start;
7305 vm_map_offset_t sub_end;
7306 vm_map_offset_t local_end;
7307 pmap_t pmap;
7308
7309 vm_map_clip_start(map, entry, start);
7310 vm_map_clip_end(map, entry, end);
7311
7312 sub_start = VME_OFFSET(entry);
7313 sub_end = entry->vme_end - entry->vme_start;
7314 sub_end += VME_OFFSET(entry);
7315 local_end = entry->vme_end;
7316 if (map_pmap == NULL) {
7317 if (entry->use_pmap) {
7318 pmap = VME_SUBMAP(entry)->pmap;
7319 pmap_addr = sub_start;
7320 } else {
7321 pmap = map->pmap;
7322 pmap_addr = start;
7323 }
7324 if (entry->wired_count == 0 ||
7325 (user_wire && entry->user_wired_count == 0)) {
7326 if (!user_wire) {
7327 panic("vm_map_unwire: entry is unwired");
7328 }
7329 entry = entry->vme_next;
7330 continue;
7331 }
7332
7333 /*
7334 * Check for holes
7335 * Holes: Next entry should be contiguous unless
7336 * this is the end of the region.
7337 */
7338 if (((entry->vme_end < end) &&
7339 ((entry->vme_next == vm_map_to_entry(map)) ||
7340 (entry->vme_next->vme_start
7341 > entry->vme_end)))) {
7342 if (!user_wire) {
7343 panic("vm_map_unwire: non-contiguous region");
7344 }
7345 /*
7346 * entry = entry->vme_next;
7347 * continue;
7348 */
7349 }
7350
7351 subtract_wire_counts(map, entry, user_wire);
7352
7353 if (entry->wired_count != 0) {
7354 entry = entry->vme_next;
7355 continue;
7356 }
7357
7358 entry->in_transition = TRUE;
7359 tmp_entry = *entry;/* see comment in vm_map_wire() */
7360
7361 /*
7362 * We can unlock the map now. The in_transition state
7363 * guarantees existence of the entry.
7364 */
7365 vm_map_unlock(map);
7366 vm_map_unwire_nested(VME_SUBMAP(entry),
7367 sub_start, sub_end, user_wire, pmap, pmap_addr);
7368 vm_map_lock(map);
7369
7370 if (last_timestamp + 1 != map->timestamp) {
7371 /*
7372 * Find the entry again. It could have been
7373 * clipped or deleted after we unlocked the map.
7374 */
7375 if (!vm_map_lookup_entry(map,
7376 tmp_entry.vme_start,
7377 &first_entry)) {
7378 if (!user_wire) {
7379 panic("vm_map_unwire: re-lookup failed");
7380 }
7381 entry = first_entry->vme_next;
7382 } else {
7383 entry = first_entry;
7384 }
7385 }
7386 last_timestamp = map->timestamp;
7387
7388 /*
7389 * clear transition bit for all constituent entries
7390 * that were in the original entry (saved in
7391 * tmp_entry). Also check for waiters.
7392 */
7393 while ((entry != vm_map_to_entry(map)) &&
7394 (entry->vme_start < tmp_entry.vme_end)) {
7395 assert(entry->in_transition);
7396 entry->in_transition = FALSE;
7397 if (entry->needs_wakeup) {
7398 entry->needs_wakeup = FALSE;
7399 need_wakeup = TRUE;
7400 }
7401 entry = entry->vme_next;
7402 }
7403 continue;
7404 } else {
7405 tmp_entry = *entry;
7406 vm_map_unlock(map);
7407 vm_map_unwire_nested(VME_SUBMAP(entry),
7408 sub_start, sub_end, user_wire, map_pmap,
7409 pmap_addr);
7410 vm_map_lock(map);
7411
7412 if (last_timestamp + 1 != map->timestamp) {
7413 /*
7414 * Find the entry again. It could have been
7415 * clipped or deleted after we unlocked the map.
7416 */
7417 if (!vm_map_lookup_entry(map,
7418 tmp_entry.vme_start,
7419 &first_entry)) {
7420 if (!user_wire) {
7421 panic("vm_map_unwire: re-lookup failed");
7422 }
7423 entry = first_entry->vme_next;
7424 } else {
7425 entry = first_entry;
7426 }
7427 }
7428 last_timestamp = map->timestamp;
7429 }
7430 }
7431
7432
7433 if ((entry->wired_count == 0) ||
7434 (user_wire && entry->user_wired_count == 0)) {
7435 if (!user_wire) {
7436 panic("vm_map_unwire: entry is unwired");
7437 }
7438
7439 entry = entry->vme_next;
7440 continue;
7441 }
7442
7443 assert(entry->wired_count > 0 &&
7444 (!user_wire || entry->user_wired_count > 0));
7445
7446 vm_map_clip_start(map, entry, start);
7447 vm_map_clip_end(map, entry, end);
7448
7449 /*
7450 * Check for holes
7451 * Holes: Next entry should be contiguous unless
7452 * this is the end of the region.
7453 */
7454 if (((entry->vme_end < end) &&
7455 ((entry->vme_next == vm_map_to_entry(map)) ||
7456 (entry->vme_next->vme_start > entry->vme_end)))) {
7457 if (!user_wire) {
7458 panic("vm_map_unwire: non-contiguous region");
7459 }
7460 entry = entry->vme_next;
7461 continue;
7462 }
7463
7464 subtract_wire_counts(map, entry, user_wire);
7465
7466 if (entry->wired_count != 0) {
7467 entry = entry->vme_next;
7468 continue;
7469 }
7470
7471 if (entry->zero_wired_pages) {
7472 entry->zero_wired_pages = FALSE;
7473 }
7474
7475 entry->in_transition = TRUE;
7476 tmp_entry = *entry; /* see comment in vm_map_wire() */
7477
7478 /*
7479 * We can unlock the map now. The in_transition state
7480 * guarantees existence of the entry.
7481 */
7482 vm_map_unlock(map);
7483 if (map_pmap) {
7484 vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7485 pmap_addr, tmp_entry.vme_end);
7486 } else {
7487 vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7488 tmp_entry.vme_start, tmp_entry.vme_end);
7489 }
7490 vm_map_lock(map);
7491
7492 if (last_timestamp + 1 != map->timestamp) {
7493 /*
7494 * Find the entry again. It could have been clipped
7495 * or deleted after we unlocked the map.
7496 */
7497 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7498 &first_entry)) {
7499 if (!user_wire) {
7500 panic("vm_map_unwire: re-lookup failed");
7501 }
7502 entry = first_entry->vme_next;
7503 } else {
7504 entry = first_entry;
7505 }
7506 }
7507 last_timestamp = map->timestamp;
7508
7509 /*
7510 * clear transition bit for all constituent entries that
7511 * were in the original entry (saved in tmp_entry). Also
7512 * check for waiters.
7513 */
7514 while ((entry != vm_map_to_entry(map)) &&
7515 (entry->vme_start < tmp_entry.vme_end)) {
7516 assert(entry->in_transition);
7517 entry->in_transition = FALSE;
7518 if (entry->needs_wakeup) {
7519 entry->needs_wakeup = FALSE;
7520 need_wakeup = TRUE;
7521 }
7522 entry = entry->vme_next;
7523 }
7524 }
7525
7526 /*
7527 * We might have fragmented the address space when we wired this
7528 * range of addresses. Attempt to re-coalesce these VM map entries
7529 * with their neighbors now that they're no longer wired.
7530 * Under some circumstances, address space fragmentation can
7531 * prevent VM object shadow chain collapsing, which can cause
7532 * swap space leaks.
7533 */
7534 vm_map_simplify_range(map, start, end);
7535
7536 vm_map_unlock(map);
7537 /*
7538 * wake up anybody waiting on entries that we have unwired.
7539 */
7540 if (need_wakeup) {
7541 vm_map_entry_wakeup(map);
7542 }
7543 return KERN_SUCCESS;
7544 }
7545
7546 kern_return_t
7547 vm_map_unwire(
7548 vm_map_t map,
7549 vm_map_offset_ut start_u,
7550 vm_map_offset_ut end_u,
7551 boolean_t user_wire)
7552 {
7553 return vm_map_unwire_impl(map, start_u, end_u, user_wire,
7554 VM_SANITIZE_CALLER_VM_MAP_UNWIRE);
7555 }
7556
7557 static __attribute__((always_inline, warn_unused_result))
7558 kern_return_t
7559 vm_map_unwire_sanitize(
7560 vm_map_t map,
7561 vm_map_offset_ut start_u,
7562 vm_map_offset_ut end_u,
7563 vm_sanitize_caller_t vm_sanitize_caller,
7564 vm_map_offset_t *start,
7565 vm_map_offset_t *end,
7566 vm_map_size_t *size)
7567 {
7568 return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7569 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7570 size);
7571 }
7572
7573 kern_return_t
7574 vm_map_unwire_impl(
7575 vm_map_t map,
7576 vm_map_offset_ut start_u,
7577 vm_map_offset_ut end_u,
7578 boolean_t user_wire,
7579 vm_sanitize_caller_t vm_sanitize_caller)
7580 {
7581 vm_map_offset_t start, end;
7582 vm_map_size_t size;
7583 kern_return_t kr;
7584
7585 /*
7586 * Sanitize any input parameters that are addr/size/prot/inherit
7587 */
7588 kr = vm_map_unwire_sanitize(
7589 map,
7590 start_u,
7591 end_u,
7592 vm_sanitize_caller,
7593 &start,
7594 &end,
7595 &size);
7596 if (__improbable(kr != KERN_SUCCESS)) {
7597 return vm_sanitize_get_kr(kr);
7598 }
7599
7600 return vm_map_unwire_nested(map, start, end,
7601 user_wire, (pmap_t)NULL, 0);
7602 }
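
/*
 * Illustrative sketch (not part of the original source): how an
 * in-kernel caller might undo a user-visible wiring through the
 * exported vm_map_unwire() wrapper above. The wrapper passes the
 * still-unsanitized ("_ut") bounds to vm_map_unwire_impl(), which
 * validates and page-aligns them before calling
 * vm_map_unwire_nested(). The helper name and its arguments are
 * hypothetical.
 */
#if 0 /* example only, not compiled */
static kern_return_t
example_unwire_user_range(
	vm_map_t         map,
	vm_map_offset_ut start_u,
	vm_map_offset_ut end_u)
{
	/* TRUE: undo a user wiring (mlock-style), not a kernel wiring */
	kern_return_t kr = vm_map_unwire(map, start_u, end_u, TRUE);

	/* KERN_INVALID_ADDRESS if part of the range was never mapped */
	return kr;
}
#endif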
7603
7604
7605 /*
7606 * vm_map_entry_zap: [ internal use only ]
7607 *
7608 * Remove the entry from the target map
7609 * and put it on a zap list.
7610 */
7611 static void
7612 vm_map_entry_zap(
7613 vm_map_t map,
7614 vm_map_entry_t entry,
7615 vm_map_zap_t zap)
7616 {
7617 vm_map_offset_t s, e;
7618
7619 s = entry->vme_start;
7620 e = entry->vme_end;
7621 assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7622 assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7623 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7624 assert(page_aligned(s));
7625 assert(page_aligned(e));
7626 }
7627 if (entry->map_aligned == TRUE) {
7628 assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7629 assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7630 }
7631 assert(entry->wired_count == 0);
7632 assert(entry->user_wired_count == 0);
7633 assert(!entry->vme_permanent);
7634
7635 vm_map_store_entry_unlink(map, entry, false);
7636 map->size -= e - s;
7637
7638 vm_map_zap_append(zap, entry);
7639 }
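
/*
 * Illustrative sketch (not part of the original source): the zap-list
 * pattern used around vm_map_entry_zap(). Entries are unlinked while
 * the map lock is held and batched on a zap list; their object or
 * submap references are only dropped by vm_map_zap_dispose() once the
 * lock has been released, mirroring vm_map_remove_and_unlock() further
 * down in this file. The entry is assumed to be unwired and not
 * "permanent"; identifiers other than the vm_map_* helpers are
 * hypothetical.
 */
#if 0 /* example only, not compiled */
static void
example_zap_one_entry(vm_map_t map, vm_map_entry_t entry)
{
	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);
	vm_map_entry_zap(map, entry, &zap);     /* unlink under the map lock */
	vm_map_unlock(map);

	vm_map_zap_dispose(&zap);               /* free the entry outside the lock */
}
#endif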
7640
7641 static void
7642 vm_map_submap_pmap_clean(
7643 vm_map_t map,
7644 vm_map_offset_t start,
7645 vm_map_offset_t end,
7646 vm_map_t sub_map,
7647 vm_map_offset_t offset)
7648 {
7649 vm_map_offset_t submap_start;
7650 vm_map_offset_t submap_end;
7651 vm_map_size_t remove_size;
7652 vm_map_entry_t entry;
7653
7654 submap_end = offset + (end - start);
7655 submap_start = offset;
7656
7657 vm_map_lock_read(sub_map);
7658 if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7659 remove_size = (entry->vme_end - entry->vme_start);
7660 if (offset > entry->vme_start) {
7661 remove_size -= offset - entry->vme_start;
7662 }
7663
7664
7665 if (submap_end < entry->vme_end) {
7666 remove_size -=
7667 entry->vme_end - submap_end;
7668 }
7669 if (entry->is_sub_map) {
7670 vm_map_submap_pmap_clean(
7671 sub_map,
7672 start,
7673 start + remove_size,
7674 VME_SUBMAP(entry),
7675 VME_OFFSET(entry));
7676 } else {
7677 if (map->mapped_in_other_pmaps &&
7678 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7679 VME_OBJECT(entry) != NULL) {
7680 vm_object_pmap_protect_options(
7681 VME_OBJECT(entry),
7682 (VME_OFFSET(entry) +
7683 offset -
7684 entry->vme_start),
7685 remove_size,
7686 PMAP_NULL,
7687 PAGE_SIZE,
7688 entry->vme_start,
7689 VM_PROT_NONE,
7690 PMAP_OPTIONS_REMOVE);
7691 } else {
7692 pmap_remove(map->pmap,
7693 (addr64_t)start,
7694 (addr64_t)(start + remove_size));
7695 }
7696 }
7697 }
7698
7699 entry = entry->vme_next;
7700
7701 while ((entry != vm_map_to_entry(sub_map))
7702 && (entry->vme_start < submap_end)) {
7703 remove_size = (entry->vme_end - entry->vme_start);
7704 if (submap_end < entry->vme_end) {
7705 remove_size -= entry->vme_end - submap_end;
7706 }
7707 if (entry->is_sub_map) {
7708 vm_map_submap_pmap_clean(
7709 sub_map,
7710 (start + entry->vme_start) - offset,
7711 ((start + entry->vme_start) - offset) + remove_size,
7712 VME_SUBMAP(entry),
7713 VME_OFFSET(entry));
7714 } else {
7715 if (map->mapped_in_other_pmaps &&
7716 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7717 VME_OBJECT(entry) != NULL) {
7718 vm_object_pmap_protect_options(
7719 VME_OBJECT(entry),
7720 VME_OFFSET(entry),
7721 remove_size,
7722 PMAP_NULL,
7723 PAGE_SIZE,
7724 entry->vme_start,
7725 VM_PROT_NONE,
7726 PMAP_OPTIONS_REMOVE);
7727 } else {
7728 pmap_remove(map->pmap,
7729 (addr64_t)((start + entry->vme_start)
7730 - offset),
7731 (addr64_t)(((start + entry->vme_start)
7732 - offset) + remove_size));
7733 }
7734 }
7735 entry = entry->vme_next;
7736 }
7737 vm_map_unlock_read(sub_map);
7738 return;
7739 }
7740
7741 /*
7742 * virt_memory_guard_ast:
7743 *
7744 * Handle the AST callout for a virtual memory guard.
7745 * Raise an EXC_GUARD exception and terminate the task
7746 * if configured to do so.
7747 */
7748 void
7749 virt_memory_guard_ast(
7750 thread_t thread,
7751 mach_exception_data_type_t code,
7752 mach_exception_data_type_t subcode)
7753 {
7754 task_t task = get_threadtask(thread);
7755 assert(task != kernel_task);
7756 assert(task == current_task());
7757 kern_return_t sync_exception_result;
7758 uint32_t behavior;
7759
7760 behavior = task->task_exc_guard;
7761
7762 /* Is delivery enabled */
7763 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7764 return;
7765 }
7766
7767 /* If only once, make sure we're that once */
7768 while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7769 uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7770
7771 if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7772 break;
7773 }
7774 behavior = task->task_exc_guard;
7775 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7776 return;
7777 }
7778 }
7779
7780 const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7781 /* Raise exception synchronously and see if handler claimed it */
7782 sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7783
7784 if (fatal) {
7785 /*
7786 * If Synchronous EXC_GUARD delivery was successful then
7787 * kill the process and return, else kill the process
7788 * and deliver the exception via EXC_CORPSE_NOTIFY.
7789 */
7790
7791
7792 int flags = PX_DEBUG_NO_HONOR;
7793 exception_info_t info = {
7794 .os_reason = OS_REASON_GUARD,
7795 .exception_type = EXC_GUARD,
7796 .mx_code = code,
7797 .mx_subcode = subcode
7798 };
7799
7800 if (sync_exception_result == KERN_SUCCESS) {
7801 flags |= PX_PSIGNAL;
7802 }
7803 exit_with_mach_exception(current_proc(), info, flags);
7804 } else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7805 /*
7806 * If the synchronous EXC_GUARD delivery was not successful,
7807 * raise a simulated crash.
7808 */
7809 if (sync_exception_result != KERN_SUCCESS) {
7810 task_violated_guard(code, subcode, NULL, FALSE);
7811 }
7812 }
7813 }
7814
7815 /*
7816 * vm_map_guard_exception:
7817 *
7818 * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7819 *
7820 * Right now, we do this when we find nothing mapped, or a
7821 * gap in the mapping when a user address space deallocate
7822 * was requested. We report the address of the first gap found.
7823 */
7824 static void
7825 vm_map_guard_exception(
7826 vm_map_offset_t gap_start,
7827 unsigned reason)
7828 {
7829 mach_exception_code_t code = 0;
7830 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7831 unsigned int target = 0; /* should we pass in pid associated with map? */
7832 mach_exception_data_type_t subcode = (uint64_t)gap_start;
7833 boolean_t fatal = FALSE;
7834
7835 task_t task = current_task_early();
7836
7837 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7838 if (task == NULL || task == kernel_task) {
7839 return;
7840 }
7841
7842 EXC_GUARD_ENCODE_TYPE(code, guard_type);
7843 EXC_GUARD_ENCODE_FLAVOR(code, reason);
7844 EXC_GUARD_ENCODE_TARGET(code, target);
7845
7846 if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7847 fatal = TRUE;
7848 }
7849 thread_guard_violation(current_thread(), code, subcode, fatal);
7850 }
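
/*
 * Illustrative sketch (not part of the original source): the 64-bit
 * EXC_GUARD "code" built above packs the guard type, flavor and target,
 * while the "subcode" carries the address of the first gap. A consumer
 * could unpack it with the EXC_GUARD_DECODE_* counterparts of the
 * encode macros (assumed here to be provided by <kern/exc_guard.h>).
 */
#if 0 /* example only, not compiled */
static void
example_decode_vm_guard(
	mach_exception_code_t      code,
	mach_exception_data_type_t subcode)
{
	unsigned int type   = EXC_GUARD_DECODE_GUARD_TYPE(code);   /* GUARD_TYPE_VIRT_MEMORY */
	unsigned int flavor = EXC_GUARD_DECODE_GUARD_FLAVOR(code); /* e.g. kGUARD_EXC_DEALLOC_GAP */
	unsigned int target = EXC_GUARD_DECODE_GUARD_TARGET(code); /* currently always 0 */
	vm_map_offset_t gap = (vm_map_offset_t)subcode;            /* start of the gap */

	(void)type; (void)flavor; (void)target; (void)gap;
}
#endif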
7851
7852 static kern_return_t
7853 vm_map_delete_submap_recurse(
7854 vm_map_t submap,
7855 vm_map_offset_t submap_start,
7856 vm_map_offset_t submap_end)
7857 {
7858 vm_map_entry_t submap_entry;
7859
7860 /*
7861 * Verify that the submap does not contain any "permanent" entries
7862 * within the specified range. We permit TPRO ranges to be overwritten
7863 * as we only reach this path if TPRO const protection is disabled for a
7864 * given map.
7865 *
7866 * We do not care about gaps.
7867 */
7868
7869 vm_map_lock(submap);
7870
7871 if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7872 submap_entry = submap_entry->vme_next;
7873 }
7874
7875 for (;
7876 submap_entry != vm_map_to_entry(submap) &&
7877 submap_entry->vme_start < submap_end;
7878 submap_entry = submap_entry->vme_next) {
7879 if (submap_entry->vme_permanent
7880 #ifdef __arm64e__
7881 /* allow TPRO submap entries to be overwritten */
7882 && !submap_entry->used_for_tpro
7883 #endif
7884 ) {
7885 /* "permanent" entry -> fail */
7886 vm_map_unlock(submap);
7887 return KERN_PROTECTION_FAILURE;
7888 }
7889 }
7890 /* no "permanent" entries in the range -> success */
7891 vm_map_unlock(submap);
7892 return KERN_SUCCESS;
7893 }
7894
7895 __abortlike
7896 static void
7897 __vm_map_delete_misaligned_panic(
7898 vm_map_t map,
7899 vm_map_offset_t start,
7900 vm_map_offset_t end)
7901 {
7902 panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
7903 map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
7904 }
7905
7906 __abortlike
7907 static void
7908 __vm_map_delete_failed_panic(
7909 vm_map_t map,
7910 vm_map_offset_t start,
7911 vm_map_offset_t end,
7912 kern_return_t kr)
7913 {
7914 panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
7915 map, (uint64_t)start, (uint64_t)end, kr);
7916 }
7917
7918 __abortlike
7919 static void
7920 __vm_map_delete_gap_panic(
7921 vm_map_t map,
7922 vm_map_offset_t where,
7923 vm_map_offset_t start,
7924 vm_map_offset_t end)
7925 {
7926 panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
7927 map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
7928 }
7929
7930 __abortlike
7931 static void
7932 __vm_map_delete_permanent_panic(
7933 vm_map_t map,
7934 vm_map_offset_t start,
7935 vm_map_offset_t end,
7936 vm_map_entry_t entry)
7937 {
7938 panic("vm_map_delete(%p,0x%llx,0x%llx): "
7939 "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
7940 map, (uint64_t)start, (uint64_t)end, entry,
7941 (uint64_t)entry->vme_start,
7942 (uint64_t)entry->vme_end);
7943 }
7944
7945 __options_decl(vm_map_delete_state_t, uint32_t, {
7946 VMDS_NONE = 0x0000,
7947
7948 VMDS_FOUND_GAP = 0x0001,
7949 VMDS_GAPS_OK = 0x0002,
7950
7951 VMDS_KERNEL_PMAP = 0x0004,
7952 VMDS_NEEDS_LOOKUP = 0x0008,
7953 VMDS_NEEDS_WAKEUP = 0x0010,
7954 VMDS_KERNEL_KMEMPTR = 0x0020
7955 });
7956
7957 /*
7958 * vm_map_clamp_to_pmap(map, start, end)
7959 *
7960 * Modify *start and *end so they fall within the bounds of map->pmap.
7961 */
7962 #if MACH_ASSERT
7963 static void
7964 vm_map_clamp_to_pmap(vm_map_t map, vm_map_address_t *start, vm_map_address_t *end)
7965 {
7966 vm_map_address_t min;
7967 vm_map_address_t max;
7968
7969 #if __x86_64__
7970 /* x86_64 struct pmap does not have min and max fields */
7971 if (map->pmap == kernel_pmap) {
7972 min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
7973 max = VM_MAX_KERNEL_ADDRESS;
7974 } else {
7975 min = VM_MAP_MIN_ADDRESS;
7976 max = VM_MAP_MAX_ADDRESS;
7977 }
7978 #else
7979 min = map->pmap->min;
7980 max = map->pmap->max;
7981 #endif
7982
7983 if (*start < min) {
7984 *start = min;
7985 } else if (*start > max) {
7986 *start = max;
7987 }
7988 if (*end < min) {
7989 *end = min;
7990 } else if (*end > max) {
7991 *end = max;
7992 }
7993 }
7994 #endif
7995
7996 int vm_log_map_delete_permanent_prot_none = 0;
7997 /*
7998 * vm_map_delete: [ internal use only ]
7999 *
8000 * Deallocates the given address range from the target map.
8001 * Removes all user wirings. Unwires one kernel wiring if
8002 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
8003 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
8004 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8005 *
8006 *
8007 * When the map is a kernel map, then any error in removing mappings
8008 * will lead to a panic so that clients do not have to repeat the panic
8009 * code at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
8010 * is also passed, then KERN_ABORTED will not lead to a panic.
8011 *
8012 * This routine is called with map locked and leaves map locked.
8013 */
8014 static kmem_return_t
8015 vm_map_delete(
8016 vm_map_t map,
8017 vm_map_offset_t start,
8018 vm_map_offset_t end,
8019 vmr_flags_t flags,
8020 kmem_guard_t guard,
8021 vm_map_zap_t zap_list)
8022 {
8023 vm_map_entry_t entry, next;
8024 int interruptible;
8025 vm_map_offset_t gap_start = 0;
8026 vm_map_offset_t clear_in_transition_end = 0;
8027 __unused vm_map_offset_t save_start = start;
8028 __unused vm_map_offset_t save_end = end;
8029 vm_map_delete_state_t state = VMDS_NONE;
8030 kmem_return_t ret = { };
8031 vm_map_range_id_t range_id = 0;
8032 struct kmem_page_meta *meta = NULL;
8033 uint32_t size_idx, slot_idx;
8034 struct mach_vm_range slot;
8035
8036 if (vm_map_pmap(map) == kernel_pmap) {
8037 state |= VMDS_KERNEL_PMAP;
8038 range_id = kmem_addr_get_range(start, end - start);
8039 if (kmem_is_ptr_range(range_id)) {
8040 state |= VMDS_KERNEL_KMEMPTR;
8041 slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8042 &size_idx, &slot);
8043 }
8044 }
8045
8046 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8047 state |= VMDS_GAPS_OK;
8048 }
8049
8050 if (map->corpse_source &&
8051 !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8052 !map->terminated) {
8053 /*
8054 * The map is being used for corpses related diagnostics.
8055 * So skip any entry removal to avoid perturbing the map state.
8056 * The cleanup will happen in task_terminate_internal after the
8057 * call to task_port_no_senders.
8058 */
8059 goto out;
8060 }
8061
8062 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8063 THREAD_ABORTSAFE : THREAD_UNINT;
8064
8065 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8066 (start & VM_MAP_PAGE_MASK(map))) {
8067 __vm_map_delete_misaligned_panic(map, start, end);
8068 }
8069
8070 if ((state & VMDS_GAPS_OK) == 0) {
8071 /*
8072 * If the map isn't terminated then all deletions must have
8073 * no gaps, and be within the [min, max) of the map.
8074 *
8075 * We got here without VM_MAP_RANGE_CHECK() being called,
8076 * and hence must validate bounds manually.
8077 *
8078 * It is worth noting that because vm_deallocate() will
8079 * round_page() the deallocation size, it's possible for "end"
8080 * to be 0 here due to overflow. We hence must treat it as being
8081 * beyond vm_map_max(map).
8082 *
8083 * Similarly, end < start means some wrap-around happened,
8084 * which should cause an error or panic.
8085 */
8086 if (end == 0 || end > vm_map_max(map)) {
8087 state |= VMDS_FOUND_GAP;
8088 gap_start = vm_map_max(map);
8089 if (state & VMDS_KERNEL_PMAP) {
8090 __vm_map_delete_gap_panic(map,
8091 gap_start, start, end);
8092 }
8093 goto out;
8094 }
8095
8096 if (end < start) {
8097 if (state & VMDS_KERNEL_PMAP) {
8098 __vm_map_delete_gap_panic(map,
8099 vm_map_max(map), start, end);
8100 }
8101 ret.kmr_return = KERN_INVALID_ARGUMENT;
8102 goto out;
8103 }
8104
8105 if (start < vm_map_min(map)) {
8106 state |= VMDS_FOUND_GAP;
8107 gap_start = start;
8108 if (state & VMDS_KERNEL_PMAP) {
8109 __vm_map_delete_gap_panic(map,
8110 gap_start, start, end);
8111 }
8112 goto out;
8113 }
8114 } else {
8115 /*
8116 * If the map is terminated, we must accept start/end
8117 * being beyond the boundaries of the map as this is
8118 * how some of the mappings like commpage mappings
8119 * can be destroyed (they're outside of those bounds).
8120 *
8121 * end < start is still something we can't cope with,
8122 * so just bail.
8123 */
8124 if (end < start) {
8125 goto out;
8126 }
8127 }
8128
8129
8130 /*
8131 * Find the start of the region.
8132 *
8133 * If in a superpage, extend the range
8134 * to include the start of the mapping.
8135 */
8136 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8137 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8138 start = SUPERPAGE_ROUND_DOWN(start);
8139 } else {
8140 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8141 break;
8142 }
8143 }
8144
8145 if (entry->superpage_size) {
8146 end = SUPERPAGE_ROUND_UP(end);
8147 }
8148
8149 /*
8150 * Step through all entries in this region
8151 */
8152 for (vm_map_offset_t s = start; s < end;) {
8153 /*
8154 * At this point, we have deleted all the memory entries
8155 * in [start, s) and are proceeding with the [s, end) range.
8156 *
8157 * This loop might drop the map lock, and it is possible that
8158 * some memory was already reallocated within [start, s)
8159 * and we don't want to mess with those entries.
8160 *
8161 * Some of those entries could even have been re-assembled
8162 * with an entry after "s" (in vm_map_simplify_entry()), so
8163 * we may have to vm_map_clip_start() again.
8164 *
8165 * When clear_in_transition_end is set, it means we had marked
8166 * [start, clear_in_transition_end) as "in_transition"
8167 * during a previous iteration and we need to clear it.
8168 */
8169
8170 /*
8171 * Step 1: If needed (because we dropped locks),
8172 * lookup the entry again.
8173 *
8174 * If we're coming back from unwiring (Step 5),
8175 * we also need to mark the entries as no longer
8176 * in transition after that.
8177 */
8178
8179 if (state & VMDS_NEEDS_LOOKUP) {
8180 state &= ~VMDS_NEEDS_LOOKUP;
8181
8182 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8183 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8184 }
8185
8186 if (state & VMDS_KERNEL_KMEMPTR) {
8187 kmem_validate_slot(s, meta, size_idx, slot_idx);
8188 }
8189 }
8190
8191 if (clear_in_transition_end) {
8192 for (vm_map_entry_t it = entry;
8193 it != vm_map_to_entry(map) &&
8194 it->vme_start < clear_in_transition_end;
8195 it = it->vme_next) {
8196 assert(it->in_transition);
8197 it->in_transition = FALSE;
8198 if (it->needs_wakeup) {
8199 it->needs_wakeup = FALSE;
8200 state |= VMDS_NEEDS_WAKEUP;
8201 }
8202 }
8203
8204 clear_in_transition_end = 0;
8205 }
8206
8207
8208 /*
8209 * Step 2: Perform various policy checks
8210 * before we do _anything_ to this entry.
8211 */
8212
8213 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8214 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8215 /*
8216 * Either we found a gap already,
8217 * or we are tearing down a map,
8218 * keep going.
8219 */
8220 } else if (state & VMDS_KERNEL_PMAP) {
8221 __vm_map_delete_gap_panic(map, s, start, end);
8222 } else if (s < end) {
8223 state |= VMDS_FOUND_GAP;
8224 gap_start = s;
8225 }
8226
8227 if (entry == vm_map_to_entry(map) ||
8228 end <= entry->vme_start) {
8229 break;
8230 }
8231
8232 s = entry->vme_start;
8233 }
8234
8235 if (state & VMDS_KERNEL_PMAP) {
8236 /*
8237 * In the kernel map and its submaps,
8238 * permanent entries never die, even
8239 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8240 */
8241 if (entry->vme_permanent) {
8242 __vm_map_delete_permanent_panic(map, start, end, entry);
8243 }
8244
8245 if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8246 end = entry->vme_end;
8247 flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8248 }
8249
8250 /*
8251 * In the kernel map and its submaps,
8252 * the removal of an atomic/guarded entry is strict.
8253 *
8254 * An atomic entry is processed only if it was
8255 * specifically targeted.
8256 *
8257 * We might have deleted non-atomic entries before
8258 * we reach this point, however...
8259 */
8260 kmem_entry_validate_guard(map, entry,
8261 start, end - start, guard);
8262 }
8263
8264 /*
8265 * Step 2.1: handle "permanent" and "submap" entries
8266 * *before* clipping to avoid triggering some unnecessary
8267 * un-nesting of the shared region.
8268 */
8269 if (entry->vme_permanent && entry->is_sub_map) {
8270 // printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8271 /*
8272 * Un-mapping a "permanent" mapping of a user-space
8273 * submap is not allowed unless...
8274 */
8275 if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8276 /*
8277 * a. explicitly requested by the kernel caller.
8278 */
8279 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8280 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8281 developer_mode_state()) {
8282 /*
8283 * b. we're in "developer" mode (for
8284 * breakpoints, dtrace probes, ...).
8285 */
8286 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8287 } else if (map->terminated) {
8288 /*
8289 * c. this is the final address space cleanup.
8290 */
8291 // printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8292 } else {
8293 vm_map_offset_t submap_start, submap_end;
8294 kern_return_t submap_kr;
8295
8296 /*
8297 * Check if there are any "permanent" mappings
8298 * in this range in the submap.
8299 */
8300 if (entry->in_transition) {
8301 /* can that even happen ? */
8302 goto in_transition;
8303 }
8304 /* compute the clipped range in the submap */
8305 submap_start = s - entry->vme_start;
8306 submap_start += VME_OFFSET(entry);
8307 submap_end = end - entry->vme_start;
8308 submap_end += VME_OFFSET(entry);
8309 submap_kr = vm_map_delete_submap_recurse(
8310 VME_SUBMAP(entry),
8311 submap_start,
8312 submap_end);
8313 if (submap_kr != KERN_SUCCESS) {
8314 /*
8315 * There are some "permanent" mappings
8316 * in the submap: we are not allowed
8317 * to remove this range.
8318 */
8319 printf("%d[%s] removing permanent submap entry "
8320 "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8321 proc_selfpid(),
8322 (get_bsdtask_info(current_task())
8323 ? proc_name_address(get_bsdtask_info(current_task()))
8324 : "?"), entry,
8325 (uint64_t)entry->vme_start,
8326 (uint64_t)entry->vme_end,
8327 entry->protection,
8328 entry->max_protection);
8329 DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8330 vm_map_entry_t, entry,
8331 vm_map_offset_t, entry->vme_start,
8332 vm_map_offset_t, entry->vme_end,
8333 vm_prot_t, entry->protection,
8334 vm_prot_t, entry->max_protection,
8335 int, VME_ALIAS(entry));
8336 ret.kmr_return = KERN_PROTECTION_FAILURE;
8337 goto out;
8338 }
8339 /* no permanent mappings: proceed */
8340 }
8341 }
8342
8343 /*
8344 * Step 3: Perform any clipping needed.
8345 *
8346 * After this, "entry" starts at "s", ends before "end"
8347 */
8348
8349 if (entry->vme_start < s) {
8350 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8351 entry->map_aligned &&
8352 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8353 /*
8354 * The entry will no longer be map-aligned
8355 * after clipping and the caller said it's OK.
8356 */
8357 entry->map_aligned = FALSE;
8358 }
8359 vm_map_clip_start(map, entry, s);
8360 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8361 }
8362
8363 if (end < entry->vme_end) {
8364 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8365 entry->map_aligned &&
8366 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8367 /*
8368 * The entry will no longer be map-aligned
8369 * after clipping and the caller said it's OK.
8370 */
8371 entry->map_aligned = FALSE;
8372 }
8373 vm_map_clip_end(map, entry, end);
8374 }
8375
8376 if (entry->vme_permanent && entry->is_sub_map) {
8377 /*
8378 * We already went through step 2.1 which did not deny
8379 * the removal of this "permanent" and "is_sub_map"
8380 * entry.
8381 * Now that we've clipped what we actually want to
8382 * delete, undo the "permanent" part to allow the
8383 * removal to proceed.
8384 */
8385 DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8386 vm_map_entry_t, entry,
8387 vm_map_offset_t, entry->vme_start,
8388 vm_map_offset_t, entry->vme_end,
8389 vm_prot_t, entry->protection,
8390 vm_prot_t, entry->max_protection,
8391 int, VME_ALIAS(entry));
8392 entry->vme_permanent = false;
8393 }
8394
8395 assert(s == entry->vme_start);
8396 assert(entry->vme_end <= end);
8397
8398
8399 /*
8400 * Step 4: If the entry is in flux, wait for this to resolve.
8401 */
8402
8403 if (entry->in_transition) {
8404 wait_result_t wait_result;
8405
8406 in_transition:
8407 /*
8408 * Another thread is wiring/unwiring this entry.
8409 * Let the other thread know we are waiting.
8410 */
8411
8412 entry->needs_wakeup = TRUE;
8413
8414 /*
8415 * wake up anybody waiting on entries that we have
8416 * already unwired/deleted.
8417 */
8418 if (state & VMDS_NEEDS_WAKEUP) {
8419 vm_map_entry_wakeup(map);
8420 state &= ~VMDS_NEEDS_WAKEUP;
8421 }
8422
8423 wait_result = vm_map_entry_wait(map, interruptible);
8424
8425 if (interruptible &&
8426 wait_result == THREAD_INTERRUPTED) {
8427 /*
8428 * We do not clear the needs_wakeup flag,
8429 * since we cannot tell if we were the only one.
8430 */
8431 ret.kmr_return = KERN_ABORTED;
8432 return ret;
8433 }
8434
8435 /*
8436 * The entry could have been clipped or it
8437 * may not exist anymore. Look it up again.
8438 */
8439 state |= VMDS_NEEDS_LOOKUP;
8440 continue;
8441 }
8442
8443
8444 /*
8445 * Step 5: Handle wiring
8446 */
8447
8448 if (entry->wired_count) {
8449 struct vm_map_entry tmp_entry;
8450 boolean_t user_wire;
8451 unsigned int last_timestamp;
8452
8453 user_wire = entry->user_wired_count > 0;
8454
8455 /*
8456 * Remove a kernel wiring if requested
8457 */
8458 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8459 entry->wired_count--;
8460 vme_btref_consider_and_put(entry);
8461 }
8462
8463 /*
8464 * Remove all user wirings for proper accounting
8465 */
8466 while (entry->user_wired_count) {
8467 subtract_wire_counts(map, entry, user_wire);
8468 }
8469
8470 /*
8471 * All our DMA I/O operations in IOKit are currently
8472 * done by wiring through the map entries of the task
8473 * requesting the I/O.
8474 *
8475 * Because of this, we must always wait for kernel wirings
8476 * to go away on the entries before deleting them.
8477 *
8478 * Any caller who wants to actually remove a kernel wiring
8479 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8480 * properly remove one wiring instead of blasting through
8481 * them all.
8482 */
8483 if (entry->wired_count != 0) {
8484 assert(map != kernel_map);
8485 /*
8486 * Cannot continue. Typical case is when
8487 * a user thread has physical io pending on
8488 * this page. Either wait for the
8489 * kernel wiring to go away or return an
8490 * error.
8491 */
8492 wait_result_t wait_result;
8493
8494 entry->needs_wakeup = TRUE;
8495 wait_result = vm_map_entry_wait(map,
8496 interruptible);
8497
8498 if (interruptible &&
8499 wait_result == THREAD_INTERRUPTED) {
8500 /*
8501 * We do not clear the
8502 * needs_wakeup flag, since we
8503 * cannot tell if we were the
8504 * only one.
8505 */
8506 ret.kmr_return = KERN_ABORTED;
8507 return ret;
8508 }
8509
8510
8511 /*
8512 * The entry could have been clipped or
8513 * it may not exist anymore. Look it
8514 * up again.
8515 */
8516 state |= VMDS_NEEDS_LOOKUP;
8517 continue;
8518 }
8519
8520 /*
8521 * We can unlock the map now.
8522 *
8523 * The entry might be split once we unlock the map,
8524 * but we need the range as defined by this entry
8525 * to be stable. So we must make a local copy.
8526 *
8527 * The underlying objects do not change during clips,
8528 * and the in_transition state guarantees existence
8529 * of the entry.
8530 */
8531 last_timestamp = map->timestamp;
8532 entry->in_transition = TRUE;
8533 tmp_entry = *entry;
8534 vm_map_unlock(map);
8535
8536 if (tmp_entry.is_sub_map) {
8537 vm_map_t sub_map;
8538 vm_map_offset_t sub_start, sub_end;
8539 pmap_t pmap;
8540 vm_map_offset_t pmap_addr;
8541
8542
8543 sub_map = VME_SUBMAP(&tmp_entry);
8544 sub_start = VME_OFFSET(&tmp_entry);
8545 sub_end = sub_start + (tmp_entry.vme_end -
8546 tmp_entry.vme_start);
8547 if (tmp_entry.use_pmap) {
8548 pmap = sub_map->pmap;
8549 pmap_addr = tmp_entry.vme_start;
8550 } else {
8551 pmap = map->pmap;
8552 pmap_addr = tmp_entry.vme_start;
8553 }
8554 (void) vm_map_unwire_nested(sub_map,
8555 sub_start, sub_end,
8556 user_wire,
8557 pmap, pmap_addr);
8558 } else {
8559 vm_map_offset_t entry_end = tmp_entry.vme_end;
8560 vm_map_offset_t max_end;
8561
8562 if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8563 max_end = end - VM_MAP_PAGE_SIZE(map);
8564 if (entry_end > max_end) {
8565 entry_end = max_end;
8566 }
8567 }
8568
8569 if (tmp_entry.vme_kernel_object) {
8570 pmap_protect_options(
8571 map->pmap,
8572 tmp_entry.vme_start,
8573 entry_end,
8574 VM_PROT_NONE,
8575 PMAP_OPTIONS_REMOVE,
8576 NULL);
8577 }
8578 vm_fault_unwire(map, &tmp_entry,
8579 tmp_entry.vme_kernel_object, map->pmap,
8580 tmp_entry.vme_start, entry_end);
8581 }
8582
8583 vm_map_lock(map);
8584
8585 /*
8586 * Unwiring happened, we can now go back to deleting
8587 * them (after we clear the in_transition bit for the range).
8588 */
8589 if (last_timestamp + 1 != map->timestamp) {
8590 state |= VMDS_NEEDS_LOOKUP;
8591 }
8592 clear_in_transition_end = tmp_entry.vme_end;
8593 continue;
8594 }
8595
8596 assert(entry->wired_count == 0);
8597 assert(entry->user_wired_count == 0);
8598
8599
8600 /*
8601 * Step 6: Entry is unwired and ready for us to delete!
8602 */
8603
8604 if (!entry->vme_permanent) {
8605 /*
8606 * Typical case: the entry really shouldn't be permanent
8607 */
8608 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8609 (entry->protection & VM_PROT_EXECUTE) &&
8610 developer_mode_state()) {
8611 /*
8612 * Allow debuggers to undo executable mappings
8613 * when developer mode is on.
8614 */
8615 #if 0
8616 printf("FBDP %d[%s] removing permanent executable entry "
8617 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8618 proc_selfpid(),
8619 (current_task()->bsd_info
8620 ? proc_name_address(current_task()->bsd_info)
8621 : "?"), entry,
8622 (uint64_t)entry->vme_start,
8623 (uint64_t)entry->vme_end,
8624 entry->protection,
8625 entry->max_protection);
8626 #endif
8627 entry->vme_permanent = FALSE;
8628 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8629 #if 0
8630 printf("FBDP %d[%s] removing permanent entry "
8631 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8632 proc_selfpid(),
8633 (current_task()->bsd_info
8634 ? proc_name_address(current_task()->bsd_info)
8635 : "?"), entry,
8636 (uint64_t)entry->vme_start,
8637 (uint64_t)entry->vme_end,
8638 entry->protection,
8639 entry->max_protection);
8640 #endif
8641 entry->vme_permanent = FALSE;
8642 #if CODE_SIGNING_MONITOR
8643 } else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8644 entry->vme_permanent = FALSE;
8645
8646 printf("%d[%s] %s(0x%llx,0x%llx): "
8647 "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8648 "prot 0x%x/0x%x\n",
8649 proc_selfpid(),
8650 (get_bsdtask_info(current_task())
8651 ? proc_name_address(get_bsdtask_info(current_task()))
8652 : "?"),
8653 __FUNCTION__,
8654 (uint64_t)start,
8655 (uint64_t)end,
8656 (uint64_t)entry->vme_start,
8657 (uint64_t)entry->vme_end,
8658 entry->protection,
8659 entry->max_protection);
8660 #endif
8661 } else {
8662 DTRACE_VM6(vm_map_delete_permanent,
8663 vm_map_entry_t, entry,
8664 vm_map_offset_t, entry->vme_start,
8665 vm_map_offset_t, entry->vme_end,
8666 vm_prot_t, entry->protection,
8667 vm_prot_t, entry->max_protection,
8668 int, VME_ALIAS(entry));
8669 }
8670
8671 if (entry->is_sub_map) {
8672 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8673 "map %p (%d) entry %p submap %p (%d)\n",
8674 map, VM_MAP_PAGE_SHIFT(map), entry,
8675 VME_SUBMAP(entry),
8676 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8677 if (entry->use_pmap) {
8678 #ifndef NO_NESTED_PMAP
8679 int pmap_flags;
8680
8681 if (map->terminated) {
8682 /*
8683 * This is the final cleanup of the
8684 * address space being terminated.
8685 * No new mappings are expected and
8686 * we don't really need to unnest the
8687 * shared region (and lose the "global"
8688 * pmap mappings, if applicable).
8689 *
8690 * Tell the pmap layer that we're
8691 * "clean" wrt nesting.
8692 */
8693 pmap_flags = PMAP_UNNEST_CLEAN;
8694 } else {
8695 /*
8696 * We're unmapping part of the nested
8697 * shared region, so we can't keep the
8698 * nested pmap.
8699 */
8700 pmap_flags = 0;
8701 }
8702 pmap_unnest_options(
8703 map->pmap,
8704 (addr64_t)entry->vme_start,
8705 entry->vme_end - entry->vme_start,
8706 pmap_flags);
8707 #endif /* NO_NESTED_PMAP */
8708 if (map->mapped_in_other_pmaps &&
8709 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8710 /* clean up parent map/maps */
8711 vm_map_submap_pmap_clean(
8712 map, entry->vme_start,
8713 entry->vme_end,
8714 VME_SUBMAP(entry),
8715 VME_OFFSET(entry));
8716 }
8717 } else {
8718 vm_map_submap_pmap_clean(
8719 map, entry->vme_start, entry->vme_end,
8720 VME_SUBMAP(entry),
8721 VME_OFFSET(entry));
8722 }
8723 } else if (entry->vme_kernel_object ||
8724 VME_OBJECT(entry) == compressor_object) {
8725 /*
8726 * nothing to do
8727 */
8728 } else if (map->mapped_in_other_pmaps &&
8729 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8730 vm_object_pmap_protect_options(
8731 VME_OBJECT(entry), VME_OFFSET(entry),
8732 entry->vme_end - entry->vme_start,
8733 PMAP_NULL,
8734 PAGE_SIZE,
8735 entry->vme_start,
8736 VM_PROT_NONE,
8737 PMAP_OPTIONS_REMOVE);
8738 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8739 (state & VMDS_KERNEL_PMAP)) {
8740 /* Remove translations associated
8741 * with this range unless the entry
8742 * does not have an object, or
8743 * it's the kernel map or a descendant
8744 * since the platform could potentially
8745 * create "backdoor" mappings invisible
8746 * to the VM. It is expected that
8747 * objectless, non-kernel ranges
8748 * do not have such VM invisible
8749 * translations.
8750 */
8751 vm_map_address_t remove_start = entry->vme_start;
8752 vm_map_address_t remove_end = entry->vme_end;
8753 #if MACH_ASSERT
8754 /*
8755 * Prevent panics in pmap_remove() from some vm test code
8756 * which uses virtual address ranges that pmap disallows.
8757 */
8758 if (thread_get_test_option(test_option_vm_map_clamp_pmap_remove)) {
8759 vm_map_clamp_to_pmap(map, &remove_start, &remove_end);
8760 }
8761 #endif /* MACH_ASSERT */
8762 pmap_remove(map->pmap, remove_start, remove_end);
8763 }
8764
8765 #if DEBUG
8766 /*
8767 * All pmap mappings for this map entry must have been
8768 * cleared by now.
8769 */
8770 assert(pmap_is_empty(map->pmap,
8771 entry->vme_start,
8772 entry->vme_end));
8773 #endif /* DEBUG */
8774
8775 if (entry->iokit_acct) {
8776 /* alternate accounting */
8777 DTRACE_VM4(vm_map_iokit_unmapped_region,
8778 vm_map_t, map,
8779 vm_map_offset_t, entry->vme_start,
8780 vm_map_offset_t, entry->vme_end,
8781 int, VME_ALIAS(entry));
8782 vm_map_iokit_unmapped_region(map,
8783 (entry->vme_end -
8784 entry->vme_start));
8785 entry->iokit_acct = FALSE;
8786 entry->use_pmap = FALSE;
8787 }
8788
8789 /* move "s" forward */
8790 s = entry->vme_end;
8791 next = entry->vme_next;
8792 if (!entry->map_aligned) {
8793 vm_map_offset_t rounded_s;
8794
8795 /*
8796 * Skip artificial gap due to mis-aligned entry
8797 * on devices with a page size smaller than the
8798 * map's page size (i.e. 16k task on a 4k device).
8799 */
8800 rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8801 if (next == vm_map_to_entry(map)) {
8802 s = rounded_s;
8803 } else if (s < rounded_s) {
8804 s = MIN(rounded_s, next->vme_start);
8805 }
8806 }
8807 ret.kmr_size += s - entry->vme_start;
8808
8809 if (entry->vme_permanent) {
8810 /*
8811 * A permanent entry can not be removed, so leave it
8812 * in place but remove all access permissions.
8813 */
8814 if (__improbable(vm_log_map_delete_permanent_prot_none)) {
8815 printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8816 __FUNCTION__, __LINE__,
8817 proc_selfpid(),
8818 (get_bsdtask_info(current_task())
8819 ? proc_name_address(get_bsdtask_info(current_task()))
8820 : "?"),
8821 map,
8822 entry,
8823 (uint64_t)entry->vme_start,
8824 (uint64_t)entry->vme_end,
8825 entry->is_sub_map,
8826 entry->protection,
8827 entry->max_protection);
8828 }
8829 DTRACE_VM6(vm_map_delete_permanent_prot_none,
8830 vm_map_entry_t, entry,
8831 vm_map_offset_t, entry->vme_start,
8832 vm_map_offset_t, entry->vme_end,
8833 vm_prot_t, entry->protection,
8834 vm_prot_t, entry->max_protection,
8835 int, VME_ALIAS(entry));
8836 entry->protection = VM_PROT_NONE;
8837 entry->max_protection = VM_PROT_NONE;
8838 #ifdef __arm64e__
8839 entry->used_for_tpro = FALSE;
8840 #endif
8841 } else {
8842 vm_map_entry_zap(map, entry, zap_list);
8843 }
8844
8845 entry = next;
8846 next = VM_MAP_ENTRY_NULL;
8847
8848 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8849 unsigned int last_timestamp = map->timestamp++;
8850
8851 if (lck_rw_lock_yield_exclusive(&map->lock,
8852 LCK_RW_YIELD_ANY_WAITER)) {
8853 if (last_timestamp != map->timestamp + 1) {
8854 state |= VMDS_NEEDS_LOOKUP;
8855 }
8856 } else {
8857 /* we didn't yield, undo our change */
8858 map->timestamp--;
8859 }
8860 }
8861 }
8862
8863 if (map->wait_for_space) {
8864 thread_wakeup((event_t) map);
8865 }
8866
8867 if (state & VMDS_NEEDS_WAKEUP) {
8868 vm_map_entry_wakeup(map);
8869 }
8870
8871 out:
8872 if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8873 __vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8874 }
8875
8876 if (state & VMDS_KERNEL_KMEMPTR) {
8877 kmem_free_space(start, end, range_id, &slot);
8878 }
8879
8880 if (state & VMDS_FOUND_GAP) {
8881 DTRACE_VM3(kern_vm_deallocate_gap,
8882 vm_map_offset_t, gap_start,
8883 vm_map_offset_t, save_start,
8884 vm_map_offset_t, save_end);
8885 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8886 ret.kmr_return = KERN_INVALID_VALUE;
8887 } else {
8888 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8889 }
8890 }
8891
8892 return ret;
8893 }
8894
8895 kmem_return_t
8896 vm_map_remove_and_unlock(
8897 vm_map_t map,
8898 vm_map_offset_t start,
8899 vm_map_offset_t end,
8900 vmr_flags_t flags,
8901 kmem_guard_t guard)
8902 {
8903 kmem_return_t ret;
8904 VM_MAP_ZAP_DECLARE(zap);
8905
8906 ret = vm_map_delete(map, start, end, flags, guard, &zap);
8907 vm_map_unlock(map);
8908
8909 vm_map_zap_dispose(&zap);
8910
8911 return ret;
8912 }
8913
8914 /*
8915 * vm_map_remove_guard:
8916 *
8917 * Remove the given address range from the target map.
8918 * This is the exported form of vm_map_delete.
8919 */
8920 kmem_return_t
8921 vm_map_remove_guard(
8922 vm_map_t map,
8923 vm_map_offset_t start,
8924 vm_map_offset_t end,
8925 vmr_flags_t flags,
8926 kmem_guard_t guard)
8927 {
8928 vm_map_lock(map);
8929 return vm_map_remove_and_unlock(map, start, end, flags, guard);
8930 }
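
/*
 * Illustrative sketch (not part of the original source): a typical
 * in-kernel removal through vm_map_remove_guard(). The routine takes
 * the map lock itself, deletes the range into a zap list via
 * vm_map_delete(), and disposes of the zapped entries after dropping
 * the lock. VM_MAP_REMOVE_KUNWIRE drops the single kernel wiring that
 * a wired kernel allocation typically holds. The helper name, address
 * and size are hypothetical.
 */
#if 0 /* example only, not compiled */
static void
example_remove_wired_kernel_range(
	vm_map_t        map,
	vm_map_offset_t addr,
	vm_map_size_t   size)
{
	kmem_return_t kmr;

	kmr = vm_map_remove_guard(map, addr, addr + size,
	    VM_MAP_REMOVE_KUNWIRE, KMEM_GUARD_NONE);
	assert(kmr.kmr_return == KERN_SUCCESS);
}
#endif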
8931
8932 /*
8933 * vm_map_terminate:
8934 *
8935 * Clean out a task's map.
8936 */
8937 kern_return_t
8938 vm_map_terminate(
8939 vm_map_t map)
8940 {
8941 vm_map_lock(map);
8942 map->terminated = TRUE;
8943 vm_map_disable_hole_optimization(map);
8944 (void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8945 VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
8946 return KERN_SUCCESS;
8947 }
8948
8949 /*
8950 * Routine: vm_map_copy_allocate
8951 *
8952 * Description:
8953 * Allocates and initializes a map copy object.
8954 */
8955 static vm_map_copy_t
8956 vm_map_copy_allocate(uint16_t type)
8957 {
8958 vm_map_copy_t new_copy;
8959
8960 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
8961 new_copy->type = type;
8962 if (type == VM_MAP_COPY_ENTRY_LIST) {
8963 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8964 vm_map_store_init(&new_copy->cpy_hdr);
8965 }
8966 return new_copy;
8967 }
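
/*
 * Illustrative sketch (not part of the original source): the expected
 * lifetime of a copy object. Anything returned by vm_map_copy_allocate()
 * must eventually be consumed (e.g. by vm_map_copyout()) or torn down
 * with vm_map_copy_discard() below, which drops the object/submap
 * references held by an entry-list copy. The intermediate steps are
 * hypothetical.
 */
#if 0 /* example only, not compiled */
static void
example_copy_object_lifetime(void)
{
	vm_map_copy_t copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);

	/* ... link entries into copy->cpy_hdr, possibly failing part-way ... */

	vm_map_copy_discard(copy);      /* safe on a partially built copy */
}
#endif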
8968
8969 /*
8970 * Routine: vm_map_copy_discard
8971 *
8972 * Description:
8973 * Dispose of a map copy object (returned by
8974 * vm_map_copyin).
8975 */
8976 void
8977 vm_map_copy_discard(
8978 vm_map_copy_t copy)
8979 {
8980 if (copy == VM_MAP_COPY_NULL) {
8981 return;
8982 }
8983
8984 /*
8985 * Assert that the vm_map_copy is coming from the right
8986 * zone and hasn't been forged
8987 */
8988 vm_map_copy_require(copy);
8989
8990 switch (copy->type) {
8991 case VM_MAP_COPY_ENTRY_LIST:
8992 while (vm_map_copy_first_entry(copy) !=
8993 vm_map_copy_to_entry(copy)) {
8994 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
8995
8996 vm_map_copy_entry_unlink(copy, entry);
8997 if (entry->is_sub_map) {
8998 vm_map_deallocate(VME_SUBMAP(entry));
8999 } else {
9000 vm_object_deallocate(VME_OBJECT(entry));
9001 }
9002 vm_map_copy_entry_dispose(entry);
9003 }
9004 break;
9005 case VM_MAP_COPY_KERNEL_BUFFER:
9006
9007 /*
9008 * The vm_map_copy_t and possibly the data buffer were
9009 * allocated by a single call to kalloc_data(), i.e. the
9010 * vm_map_copy_t was not allocated out of the zone.
9011 */
9012 if (copy->size > msg_ool_size_small || copy->offset) {
9013 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9014 (long long)copy->size, (long long)copy->offset);
9015 }
9016 kfree_data(copy->cpy_kdata, copy->size);
9017 }
9018 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9019 }
9020
9021 #if XNU_PLATFORM_MacOSX
9022
9023 __exported
9024 extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy);
9025
9026 /*
9027 * Routine: vm_map_copy_copy
9028 *
9029 * Description:
9030 * Move the information in a map copy object to
9031 * a new map copy object, leaving the old one
9032 * empty.
9033 *
9034 * This is used by kernel routines that need
9035 * to look at out-of-line data (in copyin form)
9036 * before deciding whether to return SUCCESS.
9037 * If the routine returns FAILURE, the original
9038 * copy object will be deallocated; therefore,
9039 * these routines must make a copy of the copy
9040 * object and leave the original empty so that
9041 * deallocation will not fail.
9042 */
9043 vm_map_copy_t
9044 vm_map_copy_copy(
9045 vm_map_copy_t copy)
9046 {
9047 vm_map_copy_t new_copy;
9048
9049 if (copy == VM_MAP_COPY_NULL) {
9050 return VM_MAP_COPY_NULL;
9051 }
9052
9053 /*
9054 * Assert that the vm_map_copy is coming from the right
9055 * zone and hasn't been forged
9056 */
9057 vm_map_copy_require(copy);
9058
9059 /*
9060 * Allocate a new copy object, and copy the information
9061 * from the old one into it.
9062 */
9063
9064 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9065 memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9066 #if __has_feature(ptrauth_calls)
9067 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9068 new_copy->cpy_kdata = copy->cpy_kdata;
9069 }
9070 #endif
9071
9072 if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9073 /*
9074 * The links in the entry chain must be
9075 * changed to point to the new copy object.
9076 */
9077 vm_map_copy_first_entry(copy)->vme_prev
9078 = vm_map_copy_to_entry(new_copy);
9079 vm_map_copy_last_entry(copy)->vme_next
9080 = vm_map_copy_to_entry(new_copy);
9081 }
9082
9083 /*
9084 * Change the old copy object into one that contains
9085 * nothing to be deallocated.
9086 */
9087 bzero(copy, sizeof(struct vm_map_copy));
9088 copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9089
9090 /*
9091 * Return the new object.
9092 */
9093 return new_copy;
9094 }
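
/*
 * Illustrative sketch (not part of the original source): the usage
 * pattern described in the comment above vm_map_copy_copy(). A routine
 * that must inspect out-of-line data before committing clones the copy
 * object first; on failure the clone is discarded locally and the
 * caller can still discard the original (now empty) object safely.
 * example_validate() is a hypothetical validation step.
 */
#if 0 /* example only, not compiled */
static kern_return_t
example_inspect_then_consume(vm_map_copy_t copy)
{
	vm_map_copy_t clone = vm_map_copy_copy(copy);
	kern_return_t kr;

	kr = example_validate(clone);
	if (kr != KERN_SUCCESS) {
		vm_map_copy_discard(clone);
		return kr;      /* caller discards the emptied original */
	}
	/* success: hand "clone" to vm_map_copyout() or similar */
	return KERN_SUCCESS;
}
#endif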
9095
9096 #endif /* XNU_PLATFORM_MacOSX */
9097
9098 static boolean_t
9099 vm_map_entry_is_overwritable(
9100 vm_map_t dst_map __unused,
9101 vm_map_entry_t entry)
9102 {
9103 if (!(entry->protection & VM_PROT_WRITE)) {
9104 /* can't overwrite if not writable */
9105 return FALSE;
9106 }
9107 #if !__x86_64__
9108 if (entry->used_for_jit &&
9109 vm_map_cs_enforcement(dst_map) &&
9110 !dst_map->cs_debugged) {
9111 /*
9112 * Can't overwrite a JIT region while cs_enforced
9113 * and not cs_debugged.
9114 */
9115 return FALSE;
9116 }
9117
9118 #if __arm64e__
9119 /* Do not allow overwrite HW assisted TPRO entries */
9120 if (entry->used_for_tpro) {
9121 return FALSE;
9122 }
9123 #endif /* __arm64e__ */
9124
9125 if (entry->vme_permanent) {
9126 if (entry->is_sub_map) {
9127 /*
9128 * We can't tell if the submap contains "permanent"
9129 * entries within the range targeted by the caller.
9130 * The caller will have to check for that with
9131 * vm_map_overwrite_submap_recurse() for example.
9132 */
9133 } else {
9134 /*
9135 * Do not allow overwriting of a "permanent"
9136 * entry.
9137 */
9138 DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9139 vm_map_entry_t, entry,
9140 vm_map_offset_t, entry->vme_start,
9141 vm_map_offset_t, entry->vme_end,
9142 vm_prot_t, entry->protection,
9143 vm_prot_t, entry->max_protection,
9144 int, VME_ALIAS(entry));
9145 return FALSE;
9146 }
9147 }
9148 #endif /* !__x86_64__ */
9149
9150 if (entry->is_sub_map) {
9151 /* remember not to assume every entry has a VM object... */
9152 }
9153
9154 return TRUE;
9155 }
9156
9157 static kern_return_t
9158 vm_map_overwrite_submap_recurse(
9159 vm_map_t dst_map,
9160 vm_map_offset_t dst_addr,
9161 vm_map_size_t dst_size)
9162 {
9163 vm_map_offset_t dst_end;
9164 vm_map_entry_t tmp_entry;
9165 vm_map_entry_t entry;
9166 kern_return_t result;
9167 boolean_t encountered_sub_map = FALSE;
9168
9169
9170
9171 /*
9172 * Verify that the destination is all writeable
9173 * initially. We have to trunc the destination
9174 * address and round the copy size or we'll end up
9175 * splitting entries in strange ways.
9176 */
9177
9178 dst_end = vm_map_round_page(dst_addr + dst_size,
9179 VM_MAP_PAGE_MASK(dst_map));
9180 vm_map_lock(dst_map);
9181
9182 start_pass_1:
9183 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9184 vm_map_unlock(dst_map);
9185 return KERN_INVALID_ADDRESS;
9186 }
9187
9188 vm_map_clip_start(dst_map,
9189 tmp_entry,
9190 vm_map_trunc_page(dst_addr,
9191 VM_MAP_PAGE_MASK(dst_map)));
9192 if (tmp_entry->is_sub_map) {
9193 /* clipping did unnest if needed */
9194 assert(!tmp_entry->use_pmap);
9195 }
9196
9197 for (entry = tmp_entry;;) {
9198 vm_map_entry_t next;
9199
9200 next = entry->vme_next;
9201 while (entry->is_sub_map) {
9202 vm_map_offset_t sub_start;
9203 vm_map_offset_t sub_end;
9204 vm_map_offset_t local_end;
9205
9206 if (entry->in_transition) {
9207 /*
9208 * Say that we are waiting, and wait for entry.
9209 */
9210 entry->needs_wakeup = TRUE;
9211 vm_map_entry_wait(dst_map, THREAD_UNINT);
9212
9213 goto start_pass_1;
9214 }
9215
9216 encountered_sub_map = TRUE;
9217 sub_start = VME_OFFSET(entry);
9218
9219 if (entry->vme_end < dst_end) {
9220 sub_end = entry->vme_end;
9221 } else {
9222 sub_end = dst_end;
9223 }
9224 sub_end -= entry->vme_start;
9225 sub_end += VME_OFFSET(entry);
9226 local_end = entry->vme_end;
9227 vm_map_unlock(dst_map);
9228
9229 result = vm_map_overwrite_submap_recurse(
9230 VME_SUBMAP(entry),
9231 sub_start,
9232 sub_end - sub_start);
9233
9234 if (result != KERN_SUCCESS) {
9235 return result;
9236 }
9237 if (dst_end <= entry->vme_end) {
9238 return KERN_SUCCESS;
9239 }
9240 vm_map_lock(dst_map);
9241 if (!vm_map_lookup_entry(dst_map, local_end,
9242 &tmp_entry)) {
9243 vm_map_unlock(dst_map);
9244 return KERN_INVALID_ADDRESS;
9245 }
9246 entry = tmp_entry;
9247 next = entry->vme_next;
9248 }
9249 assert(!entry->is_sub_map);
9250
9251 if (!(entry->protection & VM_PROT_WRITE)) {
9252 vm_map_unlock(dst_map);
9253 return KERN_PROTECTION_FAILURE;
9254 }
9255
9256 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9257 vm_map_unlock(dst_map);
9258 return KERN_PROTECTION_FAILURE;
9259 }
9260
9261 /*
9262 * If the entry is in transition, we must wait
9263 * for it to exit that state. Anything could happen
9264 * when we unlock the map, so start over.
9265 */
9266 if (entry->in_transition) {
9267 /*
9268 * Say that we are waiting, and wait for entry.
9269 */
9270 entry->needs_wakeup = TRUE;
9271 vm_map_entry_wait(dst_map, THREAD_UNINT);
9272
9273 goto start_pass_1;
9274 }
9275
9276 /*
9277 * our range is contained completely within this map entry
9278 */
9279 if (dst_end <= entry->vme_end) {
9280 vm_map_unlock(dst_map);
9281 return KERN_SUCCESS;
9282 }
9283 /*
9284 * check that range specified is contiguous region
9285 */
9286 if ((next == vm_map_to_entry(dst_map)) ||
9287 (next->vme_start != entry->vme_end)) {
9288 vm_map_unlock(dst_map);
9289 return KERN_INVALID_ADDRESS;
9290 }
9291
9292 /*
9293 * Check for permanent objects in the destination.
9294 */
9295 assert(!entry->is_sub_map);
9296 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9297 ((!VME_OBJECT(entry)->internal) ||
9298 (VME_OBJECT(entry)->true_share))) {
9299 if (encountered_sub_map) {
9300 vm_map_unlock(dst_map);
9301 return KERN_FAILURE;
9302 }
9303 }
9304
9305
9306 entry = next;
9307 }/* for */
9308 vm_map_unlock(dst_map);
9309 return KERN_SUCCESS;
9310 }
9311
9312 /*
9313 * Routine: vm_map_copy_overwrite
9314 *
9315 * Description:
9316 * Copy the memory described by the map copy
9317 * object (copy; returned by vm_map_copyin) onto
9318 * the specified destination region (dst_map, dst_addr).
9319 * The destination must be writeable.
9320 *
9321 * Unlike vm_map_copyout, this routine actually
9322 * writes over previously-mapped memory. If the
9323 * previous mapping was to a permanent (user-supplied)
9324 * memory object, it is preserved.
9325 *
9326 * The attributes (protection and inheritance) of the
9327 * destination region are preserved.
9328 *
9329 * If successful, consumes the copy object.
9330 * Otherwise, the caller is responsible for it.
9331 *
9332 * Implementation notes:
9333 * To overwrite aligned temporary virtual memory, it is
9334 * sufficient to remove the previous mapping and insert
9335 * the new copy. This replacement is done either on
9336 * the whole region (if no permanent virtual memory
9337 * objects are embedded in the destination region) or
9338 * in individual map entries.
9339 *
9340 * To overwrite permanent virtual memory, it is necessary
9341 * to copy each page, as the external memory management
9342 * interface currently does not provide any optimizations.
9343 *
9344 * Unaligned memory also has to be copied. It is possible
9345 * to use 'vm_trickery' to copy the aligned data. This is
9346 * not done but not hard to implement.
9347 *
9348 * Once a page of permanent memory has been overwritten,
9349 * it is impossible to interrupt this function; otherwise,
9350 * the call would be neither atomic nor location-independent.
9351 * The kernel-state portion of a user thread must be
9352 * interruptible.
9353 *
9354 * It may be expensive to forward all requests that might
9355 * overwrite permanent memory (vm_write, vm_copy) to
9356 * uninterruptible kernel threads. This routine may be
9357 * called by interruptible threads; however, success is
9358 * not guaranteed -- if the request cannot be performed
9359 * atomically and interruptibly, an error indication is
9360 * returned.
9361 *
9362 * Callers of this function must call vm_map_copy_require on
9363 * previously created vm_map_copy_t or pass a newly created
9364 * one to ensure that it hasn't been forged.
9365 */
9366 static kern_return_t
9367 vm_map_copy_overwrite_nested(
9368 vm_map_t dst_map,
9369 vm_map_address_t dst_addr,
9370 vm_map_copy_t copy,
9371 boolean_t interruptible,
9372 pmap_t pmap,
9373 boolean_t discard_on_success)
9374 {
9375 vm_map_offset_t dst_end;
9376 vm_map_entry_t tmp_entry;
9377 vm_map_entry_t entry;
9378 kern_return_t kr;
9379 boolean_t aligned = TRUE;
9380 boolean_t contains_permanent_objects = FALSE;
9381 boolean_t encountered_sub_map = FALSE;
9382 vm_map_offset_t base_addr;
9383 vm_map_size_t copy_size;
9384 vm_map_size_t total_size;
9385 uint16_t copy_page_shift;
9386
9387 /*
9388 * Check for special kernel buffer allocated
9389 * by new_ipc_kmsg_copyin.
9390 */
9391
9392 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9393 kr = vm_map_copyout_kernel_buffer(
9394 dst_map, &dst_addr,
9395 copy, copy->size, TRUE, discard_on_success);
9396 return kr;
9397 }
9398
9399 /*
9400 * Only works for entry lists at the moment. Will
9401 * support page lists later.
9402 */
9403
9404 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9405
9406 if (copy->size == 0) {
9407 if (discard_on_success) {
9408 vm_map_copy_discard(copy);
9409 }
9410 return KERN_SUCCESS;
9411 }
9412
9413 copy_page_shift = copy->cpy_hdr.page_shift;
9414
9415 /*
9416 * Verify that the destination is all writeable
9417 * initially. We have to trunc the destination
9418 * address and round the copy size or we'll end up
9419 * splitting entries in strange ways.
9420 */
9421
9422 if (!VM_MAP_PAGE_ALIGNED(copy->size,
9423 VM_MAP_PAGE_MASK(dst_map)) ||
9424 !VM_MAP_PAGE_ALIGNED(copy->offset,
9425 VM_MAP_PAGE_MASK(dst_map)) ||
9426 !VM_MAP_PAGE_ALIGNED(dst_addr,
9427 VM_MAP_PAGE_MASK(dst_map)) ||
9428 copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9429 aligned = FALSE;
9430 dst_end = vm_map_round_page(dst_addr + copy->size,
9431 VM_MAP_PAGE_MASK(dst_map));
9432 } else {
9433 dst_end = dst_addr + copy->size;
9434 }
9435
9436 vm_map_lock(dst_map);
9437
9438 /* LP64todo - remove this check when vm_map_commpage64()
9439 * no longer has to stuff in a map_entry for the commpage
9440 * above the map's max_offset.
9441 */
9442 if (dst_addr >= dst_map->max_offset) {
9443 vm_map_unlock(dst_map);
9444 return KERN_INVALID_ADDRESS;
9445 }
9446
9447 start_pass_1:
9448 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9449 vm_map_unlock(dst_map);
9450 return KERN_INVALID_ADDRESS;
9451 }
9452 vm_map_clip_start(dst_map,
9453 tmp_entry,
9454 vm_map_trunc_page(dst_addr,
9455 VM_MAP_PAGE_MASK(dst_map)));
9456 for (entry = tmp_entry;;) {
9457 vm_map_entry_t next = entry->vme_next;
9458
9459 while (entry->is_sub_map) {
9460 vm_map_offset_t sub_start;
9461 vm_map_offset_t sub_end;
9462 vm_map_offset_t local_end;
9463
9464 if (entry->in_transition) {
9465 /*
9466 * Say that we are waiting, and wait for entry.
9467 */
9468 entry->needs_wakeup = TRUE;
9469 vm_map_entry_wait(dst_map, THREAD_UNINT);
9470
9471 goto start_pass_1;
9472 }
9473
9474 local_end = entry->vme_end;
9475 if (!(entry->needs_copy)) {
9476 /* if needs_copy is set, we are a COW submap; */
9477 /* in that case we just replace, so there */
9478 /* is no need for the following */
9479 /* check. */
9480 encountered_sub_map = TRUE;
9481 sub_start = VME_OFFSET(entry);
9482
9483 if (entry->vme_end < dst_end) {
9484 sub_end = entry->vme_end;
9485 } else {
9486 sub_end = dst_end;
9487 }
9488 sub_end -= entry->vme_start;
9489 sub_end += VME_OFFSET(entry);
9490 vm_map_unlock(dst_map);
9491
9492 kr = vm_map_overwrite_submap_recurse(
9493 VME_SUBMAP(entry),
9494 sub_start,
9495 sub_end - sub_start);
9496 if (kr != KERN_SUCCESS) {
9497 return kr;
9498 }
9499 vm_map_lock(dst_map);
9500 }
9501
9502 if (dst_end <= entry->vme_end) {
9503 goto start_overwrite;
9504 }
9505 if (!vm_map_lookup_entry(dst_map, local_end,
9506 &entry)) {
9507 vm_map_unlock(dst_map);
9508 return KERN_INVALID_ADDRESS;
9509 }
9510 next = entry->vme_next;
9511 }
9512 assert(!entry->is_sub_map);
9513
9514 if (!(entry->protection & VM_PROT_WRITE)) {
9515 vm_map_unlock(dst_map);
9516 return KERN_PROTECTION_FAILURE;
9517 }
9518
9519 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9520 vm_map_unlock(dst_map);
9521 return KERN_PROTECTION_FAILURE;
9522 }
9523
9524 /*
9525 * If the entry is in transition, we must wait
9526 * for it to exit that state. Anything could happen
9527 * when we unlock the map, so start over.
9528 */
9529 if (entry->in_transition) {
9530 /*
9531 * Say that we are waiting, and wait for entry.
9532 */
9533 entry->needs_wakeup = TRUE;
9534 vm_map_entry_wait(dst_map, THREAD_UNINT);
9535
9536 goto start_pass_1;
9537 }
9538
9539 /*
9540 * our range is contained completely within this map entry
9541 */
9542 if (dst_end <= entry->vme_end) {
9543 break;
9544 }
9545 /*
9546 * check that the specified range is a contiguous region
9547 */
9548 if ((next == vm_map_to_entry(dst_map)) ||
9549 (next->vme_start != entry->vme_end)) {
9550 vm_map_unlock(dst_map);
9551 return KERN_INVALID_ADDRESS;
9552 }
9553
9554
9555 /*
9556 * Check for permanent objects in the destination.
9557 */
9558 assert(!entry->is_sub_map);
9559 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9560 ((!VME_OBJECT(entry)->internal) ||
9561 (VME_OBJECT(entry)->true_share))) {
9562 contains_permanent_objects = TRUE;
9563 }
9564
9565 entry = next;
9566 }/* for */
9567
9568 start_overwrite:
9569 /*
9570 * If there are permanent objects in the destination, then
9571 * the copy cannot be interrupted.
9572 */
9573
9574 if (interruptible && contains_permanent_objects) {
9575 vm_map_unlock(dst_map);
9576 return KERN_FAILURE; /* XXX */
9577 }
9578
9579 /*
9580 *
9581 * Make a second pass, overwriting the data
9582 * At the beginning of each loop iteration,
9583 * the next entry to be overwritten is "tmp_entry"
9584 * (initially, the value returned from the lookup above),
9585 * and the starting address expected in that entry
9586 * is "base_addr".
9587 */
9588
9589 total_size = copy->size;
9590 if (encountered_sub_map) {
9591 copy_size = 0;
9592 /* re-calculate tmp_entry since we've had the map */
9593 /* unlocked */
9594 if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9595 vm_map_unlock(dst_map);
9596 return KERN_INVALID_ADDRESS;
9597 }
9598 } else {
9599 copy_size = copy->size;
9600 }
9601
9602 base_addr = dst_addr;
9603 while (TRUE) {
9604 /* deconstruct the copy object and do it in parts */
9605 /* only in the sub_map, interruptible case */
9606 vm_map_entry_t copy_entry;
9607 vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL;
9608 vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL;
9609 int nentries;
9610 int remaining_entries = 0;
9611 vm_map_offset_t new_offset = 0;
9612
9613 for (entry = tmp_entry; copy_size == 0;) {
9614 vm_map_entry_t next;
9615
9616 next = entry->vme_next;
9617
9618 /* tmp_entry and the base address are moved along */
9619 /* each time we encounter a sub-map. Otherwise, */
9620 /* entry can outpace tmp_entry, and copy_size */
9621 /* may reflect the distance between them. */
9622 /* If the current entry is found to be in transition, */
9623 /* we start over at the beginning or at the last */
9624 /* encounter of a submap, as dictated by base_addr, */
9625 /* and we zero copy_size accordingly. */
9626 if (entry->in_transition) {
9627 /*
9628 * Say that we are waiting, and wait for entry.
9629 */
9630 entry->needs_wakeup = TRUE;
9631 vm_map_entry_wait(dst_map, THREAD_UNINT);
9632
9633 if (!vm_map_lookup_entry(dst_map, base_addr,
9634 &tmp_entry)) {
9635 vm_map_unlock(dst_map);
9636 return KERN_INVALID_ADDRESS;
9637 }
9638 copy_size = 0;
9639 entry = tmp_entry;
9640 continue;
9641 }
9642 if (entry->is_sub_map) {
9643 vm_map_offset_t sub_start;
9644 vm_map_offset_t sub_end;
9645 vm_map_offset_t local_end;
9646
9647 if (entry->needs_copy) {
9648 /* if this is a COW submap */
9649 /* just back the range with an */
9650 /* anonymous entry */
9651 assert(!entry->vme_permanent);
9652 if (entry->vme_end < dst_end) {
9653 sub_end = entry->vme_end;
9654 } else {
9655 sub_end = dst_end;
9656 }
9657 if (entry->vme_start < base_addr) {
9658 sub_start = base_addr;
9659 } else {
9660 sub_start = entry->vme_start;
9661 }
9662 vm_map_clip_end(
9663 dst_map, entry, sub_end);
9664 vm_map_clip_start(
9665 dst_map, entry, sub_start);
9666 assert(!entry->use_pmap);
9667 assert(!entry->iokit_acct);
9668 entry->use_pmap = TRUE;
9669 vm_map_deallocate(VME_SUBMAP(entry));
9670 assert(!entry->vme_permanent);
9671 VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9672 VME_OFFSET_SET(entry, 0);
9673 entry->is_shared = FALSE;
9674 entry->needs_copy = FALSE;
9675 entry->protection = VM_PROT_DEFAULT;
9676 entry->max_protection = VM_PROT_ALL;
9677 entry->wired_count = 0;
9678 entry->user_wired_count = 0;
9679 if (entry->inheritance
9680 == VM_INHERIT_SHARE) {
9681 entry->inheritance = VM_INHERIT_COPY;
9682 }
9683 continue;
9684 }
9685 /* first take care of any non-sub_map */
9686 /* entries to send */
9687 if (base_addr < entry->vme_start) {
9688 /* stuff to send */
9689 copy_size =
9690 entry->vme_start - base_addr;
9691 break;
9692 }
9693 sub_start = VME_OFFSET(entry);
9694
9695 if (entry->vme_end < dst_end) {
9696 sub_end = entry->vme_end;
9697 } else {
9698 sub_end = dst_end;
9699 }
9700 sub_end -= entry->vme_start;
9701 sub_end += VME_OFFSET(entry);
9702 local_end = entry->vme_end;
9703 vm_map_unlock(dst_map);
9704 copy_size = sub_end - sub_start;
9705
9706 /* adjust the copy object */
9707 if (total_size > copy_size) {
9708 vm_map_size_t local_size = 0;
9709 vm_map_size_t entry_size;
9710
9711 nentries = 1;
9712 new_offset = copy->offset;
9713 copy_entry = vm_map_copy_first_entry(copy);
9714 while (copy_entry !=
9715 vm_map_copy_to_entry(copy)) {
9716 entry_size = copy_entry->vme_end -
9717 copy_entry->vme_start;
9718 if ((local_size < copy_size) &&
9719 ((local_size + entry_size)
9720 >= copy_size)) {
9721 vm_map_copy_clip_end(copy,
9722 copy_entry,
9723 copy_entry->vme_start +
9724 (copy_size - local_size));
9725 entry_size = copy_entry->vme_end -
9726 copy_entry->vme_start;
9727 local_size += entry_size;
9728 new_offset += entry_size;
9729 }
9730 if (local_size >= copy_size) {
9731 next_copy = copy_entry->vme_next;
9732 copy_entry->vme_next =
9733 vm_map_copy_to_entry(copy);
9734 previous_prev =
9735 copy->cpy_hdr.links.prev;
9736 copy->cpy_hdr.links.prev = copy_entry;
9737 copy->size = copy_size;
9738 remaining_entries =
9739 copy->cpy_hdr.nentries;
9740 remaining_entries -= nentries;
9741 copy->cpy_hdr.nentries = nentries;
9742 break;
9743 } else {
9744 local_size += entry_size;
9745 new_offset += entry_size;
9746 nentries++;
9747 }
9748 copy_entry = copy_entry->vme_next;
9749 }
9750 }
9751
9752 if ((entry->use_pmap) && (pmap == NULL)) {
9753 kr = vm_map_copy_overwrite_nested(
9754 VME_SUBMAP(entry),
9755 sub_start,
9756 copy,
9757 interruptible,
9758 VME_SUBMAP(entry)->pmap,
9759 TRUE);
9760 } else if (pmap != NULL) {
9761 kr = vm_map_copy_overwrite_nested(
9762 VME_SUBMAP(entry),
9763 sub_start,
9764 copy,
9765 interruptible, pmap,
9766 TRUE);
9767 } else {
9768 kr = vm_map_copy_overwrite_nested(
9769 VME_SUBMAP(entry),
9770 sub_start,
9771 copy,
9772 interruptible,
9773 dst_map->pmap,
9774 TRUE);
9775 }
9776 if (kr != KERN_SUCCESS) {
9777 if (next_copy != NULL) {
9778 copy->cpy_hdr.nentries +=
9779 remaining_entries;
9780 copy->cpy_hdr.links.prev->vme_next =
9781 next_copy;
9782 copy->cpy_hdr.links.prev
9783 = previous_prev;
9784 copy->size = total_size;
9785 }
9786 return kr;
9787 }
9788 if (dst_end <= local_end) {
9789 return KERN_SUCCESS;
9790 }
9791 /* otherwise copy no longer exists, it was */
9792 /* destroyed after successful copy_overwrite */
9793 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9794 copy->offset = new_offset;
9795 copy->cpy_hdr.page_shift = copy_page_shift;
9796
9797 total_size -= copy_size;
9798 copy_size = 0;
9799 /* put back remainder of copy in container */
9800 if (next_copy != NULL) {
9801 copy->cpy_hdr.nentries = remaining_entries;
9802 copy->cpy_hdr.links.next = next_copy;
9803 copy->cpy_hdr.links.prev = previous_prev;
9804 copy->size = total_size;
9805 next_copy->vme_prev =
9806 vm_map_copy_to_entry(copy);
9807 next_copy = NULL;
9808 }
9809 base_addr = local_end;
9810 vm_map_lock(dst_map);
9811 if (!vm_map_lookup_entry(dst_map,
9812 local_end, &tmp_entry)) {
9813 vm_map_unlock(dst_map);
9814 return KERN_INVALID_ADDRESS;
9815 }
9816 entry = tmp_entry;
9817 continue;
9818 }
9819 assert(!entry->is_sub_map);
9820
9821 if (dst_end <= entry->vme_end) {
9822 copy_size = dst_end - base_addr;
9823 break;
9824 }
9825
9826 if ((next == vm_map_to_entry(dst_map)) ||
9827 (next->vme_start != entry->vme_end)) {
9828 vm_map_unlock(dst_map);
9829 return KERN_INVALID_ADDRESS;
9830 }
9831
9832 entry = next;
9833 }/* for */
9834
9835 next_copy = NULL;
9836 nentries = 1;
9837
9838 /* adjust the copy object */
9839 if (total_size > copy_size) {
9840 vm_map_size_t local_size = 0;
9841 vm_map_size_t entry_size;
9842
9843 new_offset = copy->offset;
9844 copy_entry = vm_map_copy_first_entry(copy);
9845 while (copy_entry != vm_map_copy_to_entry(copy)) {
9846 entry_size = copy_entry->vme_end -
9847 copy_entry->vme_start;
9848 if ((local_size < copy_size) &&
9849 ((local_size + entry_size)
9850 >= copy_size)) {
9851 vm_map_copy_clip_end(copy, copy_entry,
9852 copy_entry->vme_start +
9853 (copy_size - local_size));
9854 entry_size = copy_entry->vme_end -
9855 copy_entry->vme_start;
9856 local_size += entry_size;
9857 new_offset += entry_size;
9858 }
9859 if (local_size >= copy_size) {
9860 next_copy = copy_entry->vme_next;
9861 copy_entry->vme_next =
9862 vm_map_copy_to_entry(copy);
9863 previous_prev =
9864 copy->cpy_hdr.links.prev;
9865 copy->cpy_hdr.links.prev = copy_entry;
9866 copy->size = copy_size;
9867 remaining_entries =
9868 copy->cpy_hdr.nentries;
9869 remaining_entries -= nentries;
9870 copy->cpy_hdr.nentries = nentries;
9871 break;
9872 } else {
9873 local_size += entry_size;
9874 new_offset += entry_size;
9875 nentries++;
9876 }
9877 copy_entry = copy_entry->vme_next;
9878 }
9879 }
9880
9881 if (aligned) {
9882 pmap_t local_pmap;
9883
9884 if (pmap) {
9885 local_pmap = pmap;
9886 } else {
9887 local_pmap = dst_map->pmap;
9888 }
9889
9890 if ((kr = vm_map_copy_overwrite_aligned(
9891 dst_map, tmp_entry, copy,
9892 base_addr, local_pmap)) != KERN_SUCCESS) {
9893 if (next_copy != NULL) {
9894 copy->cpy_hdr.nentries +=
9895 remaining_entries;
9896 copy->cpy_hdr.links.prev->vme_next =
9897 next_copy;
9898 copy->cpy_hdr.links.prev =
9899 previous_prev;
9900 copy->size += copy_size;
9901 }
9902 return kr;
9903 }
9904 vm_map_unlock(dst_map);
9905 } else {
9906 /*
9907 * Performance gain:
9908 *
9909 * if the copy and dst address are misaligned but have the same
9910 * offset within the page, we can copy_not_aligned the
9911 * misaligned parts and copy aligned the rest. If they are
9912 * aligned but len is unaligned, we simply need to copy
9913 * the end bit unaligned. We'll need to split the misaligned
9914 * bits of the region in this case!
9915 */
9916 /* ALWAYS UNLOCKS THE dst_map MAP */
9917 kr = vm_map_copy_overwrite_unaligned(
9918 dst_map,
9919 tmp_entry,
9920 copy,
9921 base_addr,
9922 discard_on_success);
9923 if (kr != KERN_SUCCESS) {
9924 if (next_copy != NULL) {
9925 copy->cpy_hdr.nentries +=
9926 remaining_entries;
9927 copy->cpy_hdr.links.prev->vme_next =
9928 next_copy;
9929 copy->cpy_hdr.links.prev =
9930 previous_prev;
9931 copy->size += copy_size;
9932 }
9933 return kr;
9934 }
9935 }
9936 total_size -= copy_size;
9937 if (total_size == 0) {
9938 break;
9939 }
9940 base_addr += copy_size;
9941 copy_size = 0;
9942 copy->offset = new_offset;
9943 if (next_copy != NULL) {
9944 copy->cpy_hdr.nentries = remaining_entries;
9945 copy->cpy_hdr.links.next = next_copy;
9946 copy->cpy_hdr.links.prev = previous_prev;
9947 next_copy->vme_prev = vm_map_copy_to_entry(copy);
9948 copy->size = total_size;
9949 }
9950 vm_map_lock(dst_map);
9951 while (TRUE) {
9952 if (!vm_map_lookup_entry(dst_map,
9953 base_addr, &tmp_entry)) {
9954 vm_map_unlock(dst_map);
9955 return KERN_INVALID_ADDRESS;
9956 }
9957 if (tmp_entry->in_transition) {
9958 entry->needs_wakeup = TRUE;
9959 vm_map_entry_wait(dst_map, THREAD_UNINT);
9960 } else {
9961 break;
9962 }
9963 }
9964 vm_map_clip_start(dst_map,
9965 tmp_entry,
9966 vm_map_trunc_page(base_addr,
9967 VM_MAP_PAGE_MASK(dst_map)));
9968
9969 entry = tmp_entry;
9970 } /* while */
9971
9972 /*
9973 * Throw away the vm_map_copy object
9974 */
9975 if (discard_on_success) {
9976 vm_map_copy_discard(copy);
9977 }
9978
9979 return KERN_SUCCESS;
9980 }/* vm_map_copy_overwrite_nested */
9981
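/*
 * Routine:	vm_map_copy_addr_size_sanitize [internal use only]
 *
 * Description:
 *	Sanitize a caller-provided destination address and size on behalf
 *	of vm_map_copy_overwrite(), returning the validated address, end
 *	and size.  The VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH and
 *	VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES flags are passed through
 *	to vm_sanitize_addr_size() below.
 */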
9982 static __attribute__((always_inline, warn_unused_result))
9983 kern_return_t
9984 vm_map_copy_addr_size_sanitize(
9985 vm_map_t map,
9986 vm_map_offset_ut addr_u,
9987 vm_map_size_ut size_u,
9988 vm_sanitize_caller_t vm_sanitize_caller,
9989 vm_map_offset_t *addr,
9990 vm_map_offset_t *end,
9991 vm_map_size_t *size)
9992 {
9993 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
9994 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES;
9995
9996
9997 return vm_sanitize_addr_size(addr_u, size_u,
9998 vm_sanitize_caller, map,
9999 flags,
10000 addr, end, size);
10001 }
10002
10003 kern_return_t
10004 vm_map_copy_overwrite(
10005 vm_map_t dst_map,
10006 vm_map_offset_ut dst_addr_u,
10007 vm_map_copy_t copy,
10008 vm_map_size_ut copy_size_u,
10009 boolean_t interruptible)
10010 {
10011 vm_map_offset_t dst_addr, dst_end;
10012 vm_map_size_t copy_size;
10013 vm_map_size_t head_size, tail_size;
10014 vm_map_copy_t head_copy, tail_copy;
10015 vm_map_offset_t head_addr, tail_addr;
10016 vm_map_entry_t entry;
10017 kern_return_t kr;
10018 vm_map_offset_t effective_page_mask, effective_page_size;
10019 uint16_t copy_page_shift;
10020
10021 head_size = 0;
10022 tail_size = 0;
10023 head_copy = NULL;
10024 tail_copy = NULL;
10025 head_addr = 0;
10026 tail_addr = 0;
10027
10028 /*
10029 * Check for null copy object.
10030 */
10031 if (copy == VM_MAP_COPY_NULL) {
10032 return KERN_SUCCESS;
10033 }
10034
10035 /*
10036 * Sanitize any input parameters that are addr/size/prot/inherit
10037 */
10038 kr = vm_map_copy_addr_size_sanitize(
10039 dst_map,
10040 dst_addr_u,
10041 copy_size_u,
10042 VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE,
10043 &dst_addr,
10044 &dst_end,
10045 &copy_size);
10046 if (__improbable(kr != KERN_SUCCESS)) {
10047 return vm_sanitize_get_kr(kr);
10048 }
10049
10050 /*
10051 * Assert that the vm_map_copy is coming from the right
10052 * zone and hasn't been forged
10053 */
10054 vm_map_copy_require(copy);
10055
10056 if (interruptible ||
10057 copy->type != VM_MAP_COPY_ENTRY_LIST) {
10058 /*
10059 * We can't split the "copy" map if we're interruptible
10060 * or if we don't have a "copy" map...
10061 */
10062 blunt_copy:
10063 kr = vm_map_copy_overwrite_nested(dst_map,
10064 dst_addr,
10065 copy,
10066 interruptible,
10067 (pmap_t) NULL,
10068 TRUE);
10069 if (kr) {
10070 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
10071 }
10072 return kr;
10073 }
10074
10075 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
10076 if (copy_page_shift < PAGE_SHIFT ||
10077 VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10078 goto blunt_copy;
10079 }
10080
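/*
 * In the typical case, pick an effective page mask at least as coarse
 * as both the destination map's and the copy's, so that the head/tail
 * alignment decisions below hold for both of them.
 */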
10081 if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
10082 effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
10083 } else {
10084 effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
10085 effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
10086 effective_page_mask);
10087 }
10088 effective_page_size = effective_page_mask + 1;
10089
10090 if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
10091 /*
10092 * Too small to bother with optimizing...
10093 */
10094 goto blunt_copy;
10095 }
10096
10097 if ((dst_addr & effective_page_mask) !=
10098 (copy->offset & effective_page_mask)) {
10099 /*
10100 * Incompatible mis-alignment of source and destination...
10101 */
10102 goto blunt_copy;
10103 }
10104
10105 /*
10106 * Proper alignment or identical mis-alignment at the beginning.
10107 * Let's try and do a small unaligned copy first (if needed)
10108 * and then an aligned copy for the rest.
10109 */
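/*
 * Illustrative example (numbers are made up), assuming a 16K effective
 * page size (mask 0x3fff): with copy->offset = 0x100, dst_addr = 0x4100
 * and copy_size = 0x20000, the head copy covers 0x3f00 bytes
 * ([0x4100, 0x8000)), the middle [0x8000, 0x24000) is copied aligned,
 * and the tail covers the final 0x100 bytes ending at 0x24100.
 */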
10110 if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
10111 head_addr = dst_addr;
10112 head_size = (effective_page_size -
10113 (copy->offset & effective_page_mask));
10114 head_size = MIN(head_size, copy_size);
10115 }
10116 if (!vm_map_page_aligned(copy->offset + copy_size,
10117 effective_page_mask)) {
10118 /*
10119 * Mis-alignment at the end.
10120 * Do an aligned copy up to the last page and
10121 * then an unaligned copy for the remaining bytes.
10122 */
10123 tail_size = ((copy->offset + copy_size) &
10124 effective_page_mask);
10125 tail_size = MIN(tail_size, copy_size);
10126 tail_addr = dst_addr + copy_size - tail_size;
10127 assert(tail_addr >= head_addr + head_size);
10128 }
10129 assert(head_size + tail_size <= copy_size);
10130
10131 if (head_size + tail_size == copy_size) {
10132 /*
10133 * It's all unaligned, no optimization possible...
10134 */
10135 goto blunt_copy;
10136 }
10137
10138 /*
10139 * Can't optimize if there are any submaps in the
10140 * destination due to the way we free the "copy" map
10141 * progressively in vm_map_copy_overwrite_nested()
10142 * in that case.
10143 */
10144 vm_map_lock_read(dst_map);
10145 if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
10146 vm_map_unlock_read(dst_map);
10147 goto blunt_copy;
10148 }
10149 for (;
10150 (entry != vm_map_to_entry(dst_map) &&
10151 entry->vme_start < dst_addr + copy_size);
10152 entry = entry->vme_next) {
10153 if (entry->is_sub_map) {
10154 vm_map_unlock_read(dst_map);
10155 goto blunt_copy;
10156 }
10157 }
10158 vm_map_unlock_read(dst_map);
10159
10160 if (head_size) {
10161 /*
10162 * Unaligned copy of the first "head_size" bytes, to reach
10163 * a page boundary.
10164 */
10165
10166 /*
10167 * Extract "head_copy" out of "copy".
10168 */
10169 head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10170 head_copy->cpy_hdr.entries_pageable =
10171 copy->cpy_hdr.entries_pageable;
10172 head_copy->cpy_hdr.page_shift = copy_page_shift;
10173
10174 entry = vm_map_copy_first_entry(copy);
10175 if (entry->vme_end < copy->offset + head_size) {
10176 head_size = entry->vme_end - copy->offset;
10177 }
10178
10179 head_copy->offset = copy->offset;
10180 head_copy->size = head_size;
10181 copy->offset += head_size;
10182 copy->size -= head_size;
10183 copy_size -= head_size;
10184 assert(copy_size > 0);
10185
10186 vm_map_copy_clip_end(copy, entry, copy->offset);
10187 vm_map_copy_entry_unlink(copy, entry);
10188 vm_map_copy_entry_link(head_copy,
10189 vm_map_copy_to_entry(head_copy),
10190 entry);
10191
10192 /*
10193 * Do the unaligned copy.
10194 */
10195 kr = vm_map_copy_overwrite_nested(dst_map,
10196 head_addr,
10197 head_copy,
10198 interruptible,
10199 (pmap_t) NULL,
10200 FALSE);
10201 if (kr != KERN_SUCCESS) {
10202 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
10203 goto done;
10204 }
10205 }
10206
10207 if (tail_size) {
10208 /*
10209 * Extract "tail_copy" out of "copy".
10210 */
10211 tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10212 tail_copy->cpy_hdr.entries_pageable =
10213 copy->cpy_hdr.entries_pageable;
10214 tail_copy->cpy_hdr.page_shift = copy_page_shift;
10215
10216 tail_copy->offset = copy->offset + copy_size - tail_size;
10217 tail_copy->size = tail_size;
10218
10219 copy->size -= tail_size;
10220 copy_size -= tail_size;
10221 assert(copy_size > 0);
10222
10223 entry = vm_map_copy_last_entry(copy);
10224 vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10225 entry = vm_map_copy_last_entry(copy);
10226 vm_map_copy_entry_unlink(copy, entry);
10227 vm_map_copy_entry_link(tail_copy,
10228 vm_map_copy_last_entry(tail_copy),
10229 entry);
10230 }
10231
10232 /*
10233 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10234 * we want to avoid TOCTOU issues w.r.t copy->size but
10235 * we don't need to change vm_map_copy_overwrite_nested()
10236 * and all other vm_map_copy_overwrite variants.
10237 *
10238 * So we assign the original copy_size that was passed into
10239 * this routine back to copy.
10240 *
10241 * This use of local 'copy_size' passed into this routine is
10242 * to try and protect against TOCTOU attacks where the kernel
10243 * has been exploited. We don't expect this to be an issue
10244 * during normal system operation.
10245 */
10246 assertf(copy->size == copy_size,
10247 "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10248 copy->size = copy_size;
10249
10250 /*
10251 * Copy most (or possibly all) of the data.
10252 */
10253 kr = vm_map_copy_overwrite_nested(dst_map,
10254 dst_addr + head_size,
10255 copy,
10256 interruptible,
10257 (pmap_t) NULL,
10258 FALSE);
10259 if (kr != KERN_SUCCESS) {
10260 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
10261 goto done;
10262 }
10263
10264 if (tail_size) {
10265 kr = vm_map_copy_overwrite_nested(dst_map,
10266 tail_addr,
10267 tail_copy,
10268 interruptible,
10269 (pmap_t) NULL,
10270 FALSE);
10271 if (kr) {
10272 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
10273 }
10274 }
10275
10276 done:
10277 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10278 if (kr == KERN_SUCCESS) {
10279 /*
10280 * Discard all the copy maps.
10281 */
10282 if (head_copy) {
10283 vm_map_copy_discard(head_copy);
10284 head_copy = NULL;
10285 }
10286 vm_map_copy_discard(copy);
10287 if (tail_copy) {
10288 vm_map_copy_discard(tail_copy);
10289 tail_copy = NULL;
10290 }
10291 } else {
10292 /*
10293 * Re-assemble the original copy map.
10294 */
10295 if (head_copy) {
10296 entry = vm_map_copy_first_entry(head_copy);
10297 vm_map_copy_entry_unlink(head_copy, entry);
10298 vm_map_copy_entry_link(copy,
10299 vm_map_copy_to_entry(copy),
10300 entry);
10301 copy->offset -= head_size;
10302 copy->size += head_size;
10303 vm_map_copy_discard(head_copy);
10304 head_copy = NULL;
10305 }
10306 if (tail_copy) {
10307 entry = vm_map_copy_last_entry(tail_copy);
10308 vm_map_copy_entry_unlink(tail_copy, entry);
10309 vm_map_copy_entry_link(copy,
10310 vm_map_copy_last_entry(copy),
10311 entry);
10312 copy->size += tail_size;
10313 vm_map_copy_discard(tail_copy);
10314 tail_copy = NULL;
10315 }
10316 }
10317 return kr;
10318 }
10319
10320
10321 /*
10322 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
10323 *
10324 * Description:
10325 * Physically copy unaligned data
10326 *
10327 * Implementation:
10328 * Unaligned parts of pages have to be physically copied. We use
10329 * a modified form of vm_fault_copy (which understands non-aligned
10330 * page offsets and sizes) to do the copy. We attempt to copy as
10331 * much memory in one go as possible; however, vm_fault_copy copies
10332 * within one memory object, so we have to find the smallest of "amount left",
10333 * "source object data size" and "target object data size". With
10334 * unaligned data we don't need to split regions, therefore the source
10335 * (copy) object should be one map entry; the target range may, however,
10336 * be split over multiple map entries. In any event we are pessimistic
10337 * about these assumptions.
10338 *
10339 * Callers of this function must call vm_map_copy_require on
10340 * previously created vm_map_copy_t or pass a newly created
10341 * one to ensure that it hasn't been forged.
10342 *
10343 * Assumptions:
10344 * dst_map is locked on entry and is returned locked on success,
10345 * unlocked on error.
10346 */
10347
10348 static kern_return_t
10349 vm_map_copy_overwrite_unaligned(
10350 vm_map_t dst_map,
10351 vm_map_entry_t entry,
10352 vm_map_copy_t copy,
10353 vm_map_offset_t start,
10354 boolean_t discard_on_success)
10355 {
10356 vm_map_entry_t copy_entry;
10357 vm_map_entry_t copy_entry_next;
10358 vm_map_version_t version;
10359 vm_object_t dst_object;
10360 vm_object_offset_t dst_offset;
10361 vm_object_offset_t src_offset;
10362 vm_object_offset_t entry_offset;
10363 vm_map_offset_t entry_end;
10364 vm_map_size_t src_size,
10365 dst_size,
10366 copy_size,
10367 amount_left;
10368 kern_return_t kr = KERN_SUCCESS;
10369
10370
10371 copy_entry = vm_map_copy_first_entry(copy);
10372
10373 vm_map_lock_write_to_read(dst_map);
10374
10375 src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10376 amount_left = copy->size;
10377 /*
10378 * unaligned so we never clipped this entry, we need the offset into
10379 * the vm_object not just the data.
10380 */
10381 while (amount_left > 0) {
10382 if (entry == vm_map_to_entry(dst_map)) {
10383 vm_map_unlock_read(dst_map);
10384 return KERN_INVALID_ADDRESS;
10385 }
10386
10387 /* "start" must be within the current map entry */
10388 assert((start >= entry->vme_start) && (start < entry->vme_end));
10389
10390 /*
10391 * Check protection again
10392 */
10393 if (!(entry->protection & VM_PROT_WRITE)) {
10394 vm_map_unlock_read(dst_map);
10395 return KERN_PROTECTION_FAILURE;
10396 }
10397 if (entry->is_sub_map) {
10398 /* not implemented... */
10399 vm_map_unlock_read(dst_map);
10400 return KERN_INVALID_ARGUMENT;
10401 }
10402 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10403 vm_map_unlock_read(dst_map);
10404 return KERN_PROTECTION_FAILURE;
10405 }
10406 /*
10407 * If the entry is in transition, we must wait
10408 * for it to exit that state. Anything could happen
10409 * when we unlock the map, so start over.
10410 */
10411 if (entry->in_transition) {
10412 /*
10413 * Say that we are waiting, and wait for entry.
10414 */
10415 entry->needs_wakeup = TRUE;
10416 vm_map_entry_wait(dst_map, THREAD_UNINT);
10417
10418 goto RetryLookup;
10419 }
10420
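/*
 * The amount we can copy in this pass is the smallest of: the room
 * left in the destination entry (dst_size), the data left in the
 * current source copy entry (src_size), and amount_left.
 */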
10421 dst_offset = start - entry->vme_start;
10422
10423 dst_size = entry->vme_end - start;
10424
10425 src_size = copy_entry->vme_end -
10426 (copy_entry->vme_start + src_offset);
10427
10428 if (dst_size < src_size) {
10429 /*
10430 * we can only copy dst_size bytes before
10431 * we have to get the next destination entry
10432 */
10433 copy_size = dst_size;
10434 } else {
10435 /*
10436 * we can only copy src_size bytes before
10437 * we have to get the next source copy entry
10438 */
10439 copy_size = src_size;
10440 }
10441
10442 if (copy_size > amount_left) {
10443 copy_size = amount_left;
10444 }
10445 /*
10446 * Entry needs copy: create a shadow object for the
10447 * copy-on-write region.
10448 */
10449 assert(!entry->is_sub_map);
10450 if (entry->needs_copy) {
10451 if (vm_map_lock_read_to_write(dst_map)) {
10452 vm_map_lock_read(dst_map);
10453 goto RetryLookup;
10454 }
10455 VME_OBJECT_SHADOW(entry,
10456 (vm_map_size_t)(entry->vme_end
10457 - entry->vme_start),
10458 vm_map_always_shadow(dst_map));
10459 entry->needs_copy = FALSE;
10460 vm_map_lock_write_to_read(dst_map);
10461 }
10462 dst_object = VME_OBJECT(entry);
10463 /*
10464 * unlike with the virtual (aligned) copy, we're going
10465 * to fault on it, therefore we need a target object.
10466 */
10467 if (dst_object == VM_OBJECT_NULL) {
10468 if (vm_map_lock_read_to_write(dst_map)) {
10469 vm_map_lock_read(dst_map);
10470 goto RetryLookup;
10471 }
10472 dst_object = vm_object_allocate((vm_map_size_t)
10473 entry->vme_end - entry->vme_start);
10474 VME_OBJECT_SET(entry, dst_object, false, 0);
10475 VME_OFFSET_SET(entry, 0);
10476 assert(entry->use_pmap);
10477 vm_map_lock_write_to_read(dst_map);
10478 }
10479 /*
10480 * Take an object reference and unlock map. The "entry" may
10481 * disappear or change when the map is unlocked.
10482 */
10483 vm_object_reference(dst_object);
10484 version.main_timestamp = dst_map->timestamp;
10485 entry_offset = VME_OFFSET(entry);
10486 entry_end = entry->vme_end;
10487 vm_map_unlock_read(dst_map);
10488 /*
10489 * Copy as much as possible in one pass
10490 */
10491 kr = vm_fault_copy(
10492 VME_OBJECT(copy_entry),
10493 VME_OFFSET(copy_entry) + src_offset,
10494 &copy_size,
10495 dst_object,
10496 entry_offset + dst_offset,
10497 dst_map,
10498 &version,
10499 THREAD_UNINT );
10500
10501 start += copy_size;
10502 src_offset += copy_size;
10503 amount_left -= copy_size;
10504 /*
10505 * Release the object reference
10506 */
10507 vm_object_deallocate(dst_object);
10508 /*
10509 * If a hard error occurred, return it now
10510 */
10511 if (kr != KERN_SUCCESS) {
10512 return kr;
10513 }
10514
10515 if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10516 || amount_left == 0) {
10517 /*
10518 * all done with this copy entry, dispose.
10519 */
10520 copy_entry_next = copy_entry->vme_next;
10521
10522 if (discard_on_success) {
10523 vm_map_copy_entry_unlink(copy, copy_entry);
10524 assert(!copy_entry->is_sub_map);
10525 vm_object_deallocate(VME_OBJECT(copy_entry));
10526 vm_map_copy_entry_dispose(copy_entry);
10527 }
10528
10529 if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10530 amount_left) {
10531 /*
10532 * not finished copying but ran out of source
10533 */
10534 return KERN_INVALID_ADDRESS;
10535 }
10536
10537 copy_entry = copy_entry_next;
10538
10539 src_offset = 0;
10540 }
10541
10542 if (amount_left == 0) {
10543 return KERN_SUCCESS;
10544 }
10545
10546 vm_map_lock_read(dst_map);
10547 if (version.main_timestamp == dst_map->timestamp) {
10548 if (start == entry_end) {
10549 /*
10550 * destination region is split. Use the version
10551 * information to avoid a lookup in the normal
10552 * case.
10553 */
10554 entry = entry->vme_next;
10555 /*
10556 * should be contiguous. Fail if we encounter
10557 * a hole in the destination.
10558 */
10559 if (start != entry->vme_start) {
10560 vm_map_unlock_read(dst_map);
10561 return KERN_INVALID_ADDRESS;
10562 }
10563 }
10564 } else {
10565 /*
10566 * Map version check failed.
10567 * we must look up the entry because somebody
10568 * might have changed the map behind our backs.
10569 */
10570 RetryLookup:
10571 if (!vm_map_lookup_entry(dst_map, start, &entry)) {
10572 vm_map_unlock_read(dst_map);
10573 return KERN_INVALID_ADDRESS;
10574 }
10575 }
10576 }/* while */
10577
10578 return KERN_SUCCESS;
10579 }/* vm_map_copy_overwrite_unaligned */
10580
10581 /*
10582 * Routine: vm_map_copy_overwrite_aligned [internal use only]
10583 *
10584 * Description:
10585 * Does all the vm_trickery possible for whole pages.
10586 *
10587 * Implementation:
10588 *
10589 * If there are no permanent objects in the destination,
10590 * and the source and destination map entry zones match,
10591 * and the destination map entry is not shared,
10592 * then the map entries can be deleted and replaced
10593 * with those from the copy. The following code is the
10594 * basic idea of what to do, but there are lots of annoying
10595 * little details about getting protection and inheritance
10596 * right. Should add protection, inheritance, and sharing checks
10597 * to the above pass and make sure that no wiring is involved.
10598 *
10599 * Callers of this function must call vm_map_copy_require on
10600 * previously created vm_map_copy_t or pass a newly created
10601 * one to ensure that it hasn't been forged.
10602 */
10603
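/*
 * Informational counters: how often vm_map_copy_overwrite_aligned()
 * falls back to the physical-copy ("slow_copy") path, and why.
 */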
10604 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10605 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10606 int vm_map_copy_overwrite_aligned_src_large = 0;
10607
10608 static kern_return_t
10609 vm_map_copy_overwrite_aligned(
10610 vm_map_t dst_map,
10611 vm_map_entry_t tmp_entry,
10612 vm_map_copy_t copy,
10613 vm_map_offset_t start,
10614 __unused pmap_t pmap)
10615 {
10616 vm_object_t object;
10617 vm_map_entry_t copy_entry;
10618 vm_map_size_t copy_size;
10619 vm_map_size_t size;
10620 vm_map_entry_t entry;
10621
10622 while ((copy_entry = vm_map_copy_first_entry(copy))
10623 != vm_map_copy_to_entry(copy)) {
10624 copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10625
10626 entry = tmp_entry;
10627
10628 if (entry->is_sub_map) {
10629 /* unnested when clipped earlier */
10630 assert(!entry->use_pmap);
10631 }
10632 if (entry == vm_map_to_entry(dst_map)) {
10633 vm_map_unlock(dst_map);
10634 return KERN_INVALID_ADDRESS;
10635 }
10636 size = (entry->vme_end - entry->vme_start);
10637 /*
10638 * Make sure that no holes popped up in the
10639 * address map, and that the protection is
10640 * still valid, in case the map was unlocked
10641 * earlier.
10642 */
10643
10644 if ((entry->vme_start != start) || ((entry->is_sub_map)
10645 && !entry->needs_copy)) {
10646 vm_map_unlock(dst_map);
10647 return KERN_INVALID_ADDRESS;
10648 }
10649 assert(entry != vm_map_to_entry(dst_map));
10650
10651 /*
10652 * Check protection again
10653 */
10654
10655 if (!(entry->protection & VM_PROT_WRITE)) {
10656 vm_map_unlock(dst_map);
10657 return KERN_PROTECTION_FAILURE;
10658 }
10659
10660 if (entry->is_sub_map) {
10661 /* not properly implemented */
10662 vm_map_unlock(dst_map);
10663 return KERN_PROTECTION_FAILURE;
10664 }
10665
10666 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10667 vm_map_unlock(dst_map);
10668 return KERN_PROTECTION_FAILURE;
10669 }
10670
10671 /*
10672 * If the entry is in transition, we must wait
10673 * for it to exit that state. Anything could happen
10674 * when we unlock the map, so start over.
10675 */
10676 if (entry->in_transition) {
10677 /*
10678 * Say that we are waiting, and wait for entry.
10679 */
10680 entry->needs_wakeup = TRUE;
10681 vm_map_entry_wait(dst_map, THREAD_UNINT);
10682
10683 goto RetryLookup;
10684 }
10685
10686 /*
10687 * Adjust to source size first
10688 */
10689
10690 if (copy_size < size) {
10691 if (entry->map_aligned &&
10692 !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10693 VM_MAP_PAGE_MASK(dst_map))) {
10694 /* no longer map-aligned */
10695 entry->map_aligned = FALSE;
10696 }
10697 vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10698 size = copy_size;
10699 }
10700
10701 /*
10702 * Adjust to destination size
10703 */
10704
10705 if (size < copy_size) {
10706 vm_map_copy_clip_end(copy, copy_entry,
10707 copy_entry->vme_start + size);
10708 copy_size = size;
10709 }
10710
10711 assert((entry->vme_end - entry->vme_start) == size);
10712 assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10713 assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10714
10715 /*
10716 * If the destination contains temporary unshared memory,
10717 * we can perform the copy by throwing it away and
10718 * installing the source data.
10719 *
10720 * Exceptions for mappings with special semantics:
10721 * + "permanent" entries,
10722 * + JIT regions,
10723 * + TPRO regions,
10724 * + pmap-specific protection policies,
10725 * + VM objects with COPY_NONE copy strategy.
10726 */
10727
10728 object = VME_OBJECT(entry);
10729 if ((!entry->is_shared &&
10730 !entry->vme_permanent &&
10731 !entry->used_for_jit &&
10732 #if __arm64e__
10733 !entry->used_for_tpro &&
10734 #endif /* __arm64e__ */
10735 !(entry->protection & VM_PROT_EXECUTE) &&
10736 !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10737 ((object == VM_OBJECT_NULL) ||
10738 (object->internal &&
10739 !object->true_share &&
10740 object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10741 entry->needs_copy) {
10742 vm_object_t old_object = VME_OBJECT(entry);
10743 vm_object_offset_t old_offset = VME_OFFSET(entry);
10744 vm_object_offset_t offset;
10745
10746 assert(!entry->is_sub_map);
10747 /*
10748 * Ensure that the source and destination aren't
10749 * identical
10750 */
10751 if (old_object == VME_OBJECT(copy_entry) &&
10752 old_offset == VME_OFFSET(copy_entry)) {
10753 vm_map_copy_entry_unlink(copy, copy_entry);
10754 vm_map_copy_entry_dispose(copy_entry);
10755
10756 if (old_object != VM_OBJECT_NULL) {
10757 vm_object_deallocate(old_object);
10758 }
10759
10760 start = tmp_entry->vme_end;
10761 tmp_entry = tmp_entry->vme_next;
10762 continue;
10763 }
10764
10765 #if XNU_TARGET_OS_OSX
10766 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10767 #define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */
10768 if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10769 VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10770 copy_size <= __TRADEOFF1_COPY_SIZE) {
10771 /*
10772 * Virtual vs. Physical copy tradeoff #1.
10773 *
10774 * Copying only a few pages out of a large
10775 * object: do a physical copy instead of
10776 * a virtual copy, to avoid possibly keeping
10777 * the entire large object alive because of
10778 * those few copy-on-write pages.
10779 */
10780 vm_map_copy_overwrite_aligned_src_large++;
10781 goto slow_copy;
10782 }
10783 #endif /* XNU_TARGET_OS_OSX */
10784
10785 if ((dst_map->pmap != kernel_pmap) &&
10786 (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10787 (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10788 vm_object_t new_object, new_shadow;
10789
10790 /*
10791 * We're about to map something over a mapping
10792 * established by malloc()...
10793 */
10794 new_object = VME_OBJECT(copy_entry);
10795 if (new_object != VM_OBJECT_NULL) {
10796 vm_object_lock_shared(new_object);
10797 }
10798 while (new_object != VM_OBJECT_NULL &&
10799 #if XNU_TARGET_OS_OSX
10800 !new_object->true_share &&
10801 new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10802 #endif /* XNU_TARGET_OS_OSX */
10803 new_object->internal) {
10804 new_shadow = new_object->shadow;
10805 if (new_shadow == VM_OBJECT_NULL) {
10806 break;
10807 }
10808 vm_object_lock_shared(new_shadow);
10809 vm_object_unlock(new_object);
10810 new_object = new_shadow;
10811 }
10812 if (new_object != VM_OBJECT_NULL) {
10813 if (!new_object->internal) {
10814 /*
10815 * The new mapping is backed
10816 * by an external object. We
10817 * don't want malloc'ed memory
10818 * to be replaced with such a
10819 * non-anonymous mapping, so
10820 * let's go off the optimized
10821 * path...
10822 */
10823 vm_map_copy_overwrite_aligned_src_not_internal++;
10824 vm_object_unlock(new_object);
10825 goto slow_copy;
10826 }
10827 #if XNU_TARGET_OS_OSX
10828 if (new_object->true_share ||
10829 new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10830 /*
10831 * Same if there's a "true_share"
10832 * object in the shadow chain, or
10833 * an object with a non-default
10834 * (SYMMETRIC) copy strategy.
10835 */
10836 vm_map_copy_overwrite_aligned_src_not_symmetric++;
10837 vm_object_unlock(new_object);
10838 goto slow_copy;
10839 }
10840 #endif /* XNU_TARGET_OS_OSX */
10841 vm_object_unlock(new_object);
10842 }
10843 /*
10844 * The new mapping is still backed by
10845 * anonymous (internal) memory, so it's
10846 * OK to substitute it for the original
10847 * malloc() mapping.
10848 */
10849 }
10850
10851 if (old_object != VM_OBJECT_NULL) {
10852 assert(!entry->vme_permanent);
10853 if (entry->is_sub_map) {
10854 if (entry->use_pmap) {
10855 #ifndef NO_NESTED_PMAP
10856 pmap_unnest(dst_map->pmap,
10857 (addr64_t)entry->vme_start,
10858 entry->vme_end - entry->vme_start);
10859 #endif /* NO_NESTED_PMAP */
10860 if (dst_map->mapped_in_other_pmaps) {
10861 /* clean up parent */
10862 /* map/maps */
10863 vm_map_submap_pmap_clean(
10864 dst_map, entry->vme_start,
10865 entry->vme_end,
10866 VME_SUBMAP(entry),
10867 VME_OFFSET(entry));
10868 }
10869 } else {
10870 vm_map_submap_pmap_clean(
10871 dst_map, entry->vme_start,
10872 entry->vme_end,
10873 VME_SUBMAP(entry),
10874 VME_OFFSET(entry));
10875 }
10876 vm_map_deallocate(VME_SUBMAP(entry));
10877 } else {
10878 if (dst_map->mapped_in_other_pmaps) {
10879 vm_object_pmap_protect_options(
10880 VME_OBJECT(entry),
10881 VME_OFFSET(entry),
10882 entry->vme_end
10883 - entry->vme_start,
10884 PMAP_NULL,
10885 PAGE_SIZE,
10886 entry->vme_start,
10887 VM_PROT_NONE,
10888 PMAP_OPTIONS_REMOVE);
10889 } else {
10890 pmap_remove_options(
10891 dst_map->pmap,
10892 (addr64_t)(entry->vme_start),
10893 (addr64_t)(entry->vme_end),
10894 PMAP_OPTIONS_REMOVE);
10895 }
10896 vm_object_deallocate(old_object);
10897 }
10898 }
10899
10900 if (entry->iokit_acct) {
10901 /* keep using iokit accounting */
10902 entry->use_pmap = FALSE;
10903 } else {
10904 /* use pmap accounting */
10905 entry->use_pmap = TRUE;
10906 }
10907 assert(!entry->vme_permanent);
10908 VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10909 object = VME_OBJECT(entry);
10910 entry->needs_copy = copy_entry->needs_copy;
10911 entry->wired_count = 0;
10912 entry->user_wired_count = 0;
10913 offset = VME_OFFSET(copy_entry);
10914 VME_OFFSET_SET(entry, offset);
10915
10916 vm_map_copy_entry_unlink(copy, copy_entry);
10917 vm_map_copy_entry_dispose(copy_entry);
10918
10919 /*
10920 * we could try to push pages into the pmap at this point, BUT
10921 * this optimization only saved on average 2 us per page if ALL
10922 * the pages in the source were currently mapped
10923 * and ALL the pages in the dest were touched; if fewer than
10924 * 2/3 of the pages were touched, this optimization actually cost more cycles.
10925 * It also puts a lot of pressure on the pmap layer w.r.t. mapping structures.
10926 */
10927
10928 /*
10929 * Set up for the next iteration. The map
10930 * has not been unlocked, so the next
10931 * address should be at the end of this
10932 * entry, and the next map entry should be
10933 * the one following it.
10934 */
10935
10936 start = tmp_entry->vme_end;
10937 tmp_entry = tmp_entry->vme_next;
10938 } else {
10939 vm_map_version_t version;
10940 vm_object_t dst_object;
10941 vm_object_offset_t dst_offset;
10942 kern_return_t r;
10943
10944 slow_copy:
10945 if (entry->needs_copy) {
10946 VME_OBJECT_SHADOW(entry,
10947 (entry->vme_end -
10948 entry->vme_start),
10949 vm_map_always_shadow(dst_map));
10950 entry->needs_copy = FALSE;
10951 }
10952
10953 dst_object = VME_OBJECT(entry);
10954 dst_offset = VME_OFFSET(entry);
10955
10956 /*
10957 * Take an object reference, and record
10958 * the map version information so that the
10959 * map can be safely unlocked.
10960 */
10961
10962 if (dst_object == VM_OBJECT_NULL) {
10963 /*
10964 * We would usually have just taken the
10965 * optimized path above if the destination
10966 * object has not been allocated yet. But we
10967 * now disable that optimization if the copy
10968 * entry's object is not backed by anonymous
10969 * memory to avoid replacing malloc'ed
10970 * (i.e. re-usable) anonymous memory with a
10971 * not-so-anonymous mapping.
10972 * So we have to handle this case here and
10973 * allocate a new VM object for this map entry.
10974 */
10975 dst_object = vm_object_allocate(
10976 entry->vme_end - entry->vme_start);
10977 dst_offset = 0;
10978 VME_OBJECT_SET(entry, dst_object, false, 0);
10979 VME_OFFSET_SET(entry, dst_offset);
10980 assert(entry->use_pmap);
10981 }
10982
10983 vm_object_reference(dst_object);
10984
10985 /* account for unlock bumping up timestamp */
10986 version.main_timestamp = dst_map->timestamp + 1;
10987
10988 vm_map_unlock(dst_map);
10989
10990 /*
10991 * Copy as much as possible in one pass
10992 */
10993
10994 copy_size = size;
10995 r = vm_fault_copy(
10996 VME_OBJECT(copy_entry),
10997 VME_OFFSET(copy_entry),
10998 &copy_size,
10999 dst_object,
11000 dst_offset,
11001 dst_map,
11002 &version,
11003 THREAD_UNINT );
11004
11005 /*
11006 * Release the object reference
11007 */
11008
11009 vm_object_deallocate(dst_object);
11010
11011 /*
11012 * If a hard error occurred, return it now
11013 */
11014
11015 if (r != KERN_SUCCESS) {
11016 return r;
11017 }
11018
11019 if (copy_size != 0) {
11020 /*
11021 * Dispose of the copied region
11022 */
11023
11024 vm_map_copy_clip_end(copy, copy_entry,
11025 copy_entry->vme_start + copy_size);
11026 vm_map_copy_entry_unlink(copy, copy_entry);
11027 vm_object_deallocate(VME_OBJECT(copy_entry));
11028 vm_map_copy_entry_dispose(copy_entry);
11029 }
11030
11031 /*
11032 * Pick up in the destination map where we left off.
11033 *
11034 * Use the version information to avoid a lookup
11035 * in the normal case.
11036 */
11037
11038 start += copy_size;
11039 vm_map_lock(dst_map);
11040 if (version.main_timestamp == dst_map->timestamp &&
11041 copy_size != 0) {
11042 /* We can safely use saved tmp_entry value */
11043
11044 if (tmp_entry->map_aligned &&
11045 !VM_MAP_PAGE_ALIGNED(
11046 start,
11047 VM_MAP_PAGE_MASK(dst_map))) {
11048 /* no longer map-aligned */
11049 tmp_entry->map_aligned = FALSE;
11050 }
11051 vm_map_clip_end(dst_map, tmp_entry, start);
11052 tmp_entry = tmp_entry->vme_next;
11053 } else {
11054 /* Must do lookup of tmp_entry */
11055
11056 RetryLookup:
11057 if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
11058 vm_map_unlock(dst_map);
11059 return KERN_INVALID_ADDRESS;
11060 }
11061 if (tmp_entry->map_aligned &&
11062 !VM_MAP_PAGE_ALIGNED(
11063 start,
11064 VM_MAP_PAGE_MASK(dst_map))) {
11065 /* no longer map-aligned */
11066 tmp_entry->map_aligned = FALSE;
11067 }
11068 vm_map_clip_start(dst_map, tmp_entry, start);
11069 }
11070 }
11071 }/* while */
11072
11073 return KERN_SUCCESS;
11074 }/* vm_map_copy_overwrite_aligned */
11075
11076 /*
11077 * Routine: vm_map_copyin_kernel_buffer [internal use only]
11078 *
11079 * Description:
11080 * Copy in data to a kernel buffer from space in the
11081 * source map. The original space may be optionally
11082 * deallocated.
11083 *
11084 * If successful, returns a new copy object.
11085 */
11086 static kern_return_t
11087 vm_map_copyin_kernel_buffer(
11088 vm_map_t src_map,
11089 vm_map_offset_t src_addr,
11090 vm_map_size_t len,
11091 boolean_t src_destroy,
11092 vm_map_copy_t *copy_result)
11093 {
11094 kern_return_t kr;
11095 vm_map_copy_t copy;
11096 void *kdata;
11097
11098 if (len > msg_ool_size_small) {
11099 return KERN_INVALID_ARGUMENT;
11100 }
11101
11102 kdata = kalloc_data(len, Z_WAITOK);
11103 if (kdata == NULL) {
11104 return KERN_RESOURCE_SHORTAGE;
11105 }
11106 kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
11107 if (kr != KERN_SUCCESS) {
11108 kfree_data(kdata, len);
11109 return kr;
11110 }
11111
11112 copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11113 copy->cpy_kdata = kdata;
11114 copy->size = len;
11115 copy->offset = 0;
11116
11117 if (src_destroy) {
11118 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11119
11120 if (src_map == kernel_map) {
11121 flags |= VM_MAP_REMOVE_KUNWIRE;
11122 }
11123
11124 (void)vm_map_remove_guard(src_map,
11125 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11126 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11127 flags, KMEM_GUARD_NONE);
11128 }
11129
11130 *copy_result = copy;
11131 return KERN_SUCCESS;
11132 }
11133
11134 /*
11135 * Routine: vm_map_copyout_kernel_buffer [internal use only]
11136 *
11137 * Description:
11138 * Copy out data from a kernel buffer into space in the
11139 * destination map. The space may be optionally dynamically
11140 * allocated.
11141 *
11142 * If successful, consumes the copy object.
11143 * Otherwise, the caller is responsible for it.
11144 *
11145 * Callers of this function must call vm_map_copy_require on
11146 * previously created vm_map_copy_t or pass a newly created
11147 * one to ensure that it hasn't been forged.
11148 */
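/*
 * Informational count of copyout() failures when copying out to a map
 * other than the current one (see below).
 */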
11149 static int vm_map_copyout_kernel_buffer_failures = 0;
11150 static kern_return_t
11151 vm_map_copyout_kernel_buffer(
11152 vm_map_t map,
11153 vm_map_address_t *addr, /* IN/OUT */
11154 vm_map_copy_t copy,
11155 vm_map_size_t copy_size,
11156 boolean_t overwrite,
11157 boolean_t consume_on_success)
11158 {
11159 kern_return_t kr = KERN_SUCCESS;
11160 thread_t thread = current_thread();
11161
11162 assert(copy->size == copy_size);
11163
11164 /*
11165 * check for corrupted vm_map_copy structure
11166 */
11167 if (copy_size > msg_ool_size_small || copy->offset) {
11168 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
11169 (long long)copy->size, (long long)copy->offset);
11170 }
11171
11172 if (!overwrite) {
11173 /*
11174 * Allocate space in the target map for the data
11175 */
11176 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11177
11178 if (map == kernel_map) {
11179 vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11180 }
11181
11182 *addr = 0;
11183 kr = vm_map_enter(map,
11184 addr,
11185 vm_map_round_page(copy_size,
11186 VM_MAP_PAGE_MASK(map)),
11187 (vm_map_offset_t) 0,
11188 vmk_flags,
11189 VM_OBJECT_NULL,
11190 (vm_object_offset_t) 0,
11191 FALSE,
11192 VM_PROT_DEFAULT,
11193 VM_PROT_ALL,
11194 VM_INHERIT_DEFAULT);
11195 if (kr != KERN_SUCCESS) {
11196 return kr;
11197 }
11198 #if KASAN
11199 if (map->pmap == kernel_pmap) {
11200 kasan_notify_address(*addr, copy->size);
11201 }
11202 #endif
11203 }
11204
11205 /*
11206 * Copyout the data from the kernel buffer to the target map.
11207 */
11208 if (thread->map == map) {
11209 /*
11210 * If the target map is the current map, just do
11211 * the copy.
11212 */
11213 assert((vm_size_t)copy_size == copy_size);
11214 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11215 kr = KERN_INVALID_ADDRESS;
11216 }
11217 } else {
11218 vm_map_t oldmap;
11219
11220 /*
11221 * If the target map is another map, assume the
11222 * target's address space identity for the duration
11223 * of the copy.
11224 */
11225 vm_map_reference(map);
11226 oldmap = vm_map_switch(map);
11227
11228 assert((vm_size_t)copy_size == copy_size);
11229 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11230 vm_map_copyout_kernel_buffer_failures++;
11231 kr = KERN_INVALID_ADDRESS;
11232 }
11233
11234 (void) vm_map_switch(oldmap);
11235 vm_map_deallocate(map);
11236 }
11237
11238 if (kr != KERN_SUCCESS) {
11239 /* the copy failed, clean up */
11240 if (!overwrite) {
11241 /*
11242 * Deallocate the space we allocated in the target map.
11243 */
11244 (void) vm_map_remove(map,
11245 vm_map_trunc_page(*addr,
11246 VM_MAP_PAGE_MASK(map)),
11247 vm_map_round_page((*addr +
11248 vm_map_round_page(copy_size,
11249 VM_MAP_PAGE_MASK(map))),
11250 VM_MAP_PAGE_MASK(map)));
11251 *addr = 0;
11252 }
11253 } else {
11254 /* copy was successful, discard the copy structure */
11255 if (consume_on_success) {
11256 kfree_data(copy->cpy_kdata, copy_size);
11257 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11258 }
11259 }
11260
11261 return kr;
11262 }
11263
11264 /*
11265 * Routine: vm_map_copy_insert [internal use only]
11266 *
11267 * Description:
11268 * Link a copy chain ("copy") into a map at the
11269 * specified location (after "where").
11270 *
11271 * Callers of this function must call vm_map_copy_require on
11272 * previously created vm_map_copy_t or pass a newly created
11273 * one to ensure that it hasn't been forged.
11274 * Side effects:
11275 * The copy chain is destroyed.
11276 */
11277 static void
11278 vm_map_copy_insert(
11279 vm_map_t map,
11280 vm_map_entry_t after_where,
11281 vm_map_copy_t copy)
11282 {
11283 vm_map_entry_t entry;
11284
11285 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11286 entry = vm_map_copy_first_entry(copy);
11287 vm_map_copy_entry_unlink(copy, entry);
11288 vm_map_store_entry_link(map, after_where, entry,
11289 VM_MAP_KERNEL_FLAGS_NONE);
11290 after_where = entry;
11291 }
11292 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11293 }
11294
11295 /*
11296 * Callers of this function must call vm_map_copy_require on
11297 * previously created vm_map_copy_t or pass a newly created
11298 * one to ensure that it hasn't been forged.
11299 */
11300 void
11301 vm_map_copy_remap(
11302 vm_map_t map,
11303 vm_map_entry_t where,
11304 vm_map_copy_t copy,
11305 vm_map_offset_t adjustment,
11306 vm_prot_t cur_prot,
11307 vm_prot_t max_prot,
11308 vm_inherit_t inheritance)
11309 {
11310 vm_map_entry_t copy_entry, new_entry;
11311
11312 for (copy_entry = vm_map_copy_first_entry(copy);
11313 copy_entry != vm_map_copy_to_entry(copy);
11314 copy_entry = copy_entry->vme_next) {
11315 /* get a new VM map entry for the map */
11316 new_entry = vm_map_entry_create(map);
11317 /* copy the "copy entry" to the new entry */
11318 vm_map_entry_copy(map, new_entry, copy_entry);
11319 /* adjust "start" and "end" */
11320 new_entry->vme_start += adjustment;
11321 new_entry->vme_end += adjustment;
11322 /* clear some attributes */
11323 new_entry->inheritance = inheritance;
11324 new_entry->protection = cur_prot;
11325 new_entry->max_protection = max_prot;
11326 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11327 /* take an extra reference on the entry's "object" */
11328 if (new_entry->is_sub_map) {
11329 assert(!new_entry->use_pmap); /* not nested */
11330 vm_map_reference(VME_SUBMAP(new_entry));
11331 } else {
11332 vm_object_reference(VME_OBJECT(new_entry));
11333 }
11334 /* insert the new entry in the map */
11335 vm_map_store_entry_link(map, where, new_entry,
11336 VM_MAP_KERNEL_FLAGS_NONE);
11337 /* continue inserting the "copy entries" after the new entry */
11338 where = new_entry;
11339 }
11340 }
11341
11342
11343 /*
11344 * Returns true if *size matches (or is in the range of) copy->size.
11345 * Upon returning true, the *size field is updated with the actual size of the
11346 * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types).
11347 */
11348 boolean_t
11349 vm_map_copy_validate_size(
11350 vm_map_t dst_map,
11351 vm_map_copy_t copy,
11352 vm_map_size_t *size)
11353 {
11354 if (copy == VM_MAP_COPY_NULL) {
11355 return FALSE;
11356 }
11357
11358 /*
11359 * Assert that the vm_map_copy is coming from the right
11360 * zone and hasn't been forged
11361 */
11362 vm_map_copy_require(copy);
11363
11364 vm_map_size_t copy_sz = copy->size;
11365 vm_map_size_t sz = *size;
11366 switch (copy->type) {
11367 case VM_MAP_COPY_KERNEL_BUFFER:
11368 if (sz == copy_sz) {
11369 return TRUE;
11370 }
11371 break;
11372 case VM_MAP_COPY_ENTRY_LIST:
11373 /*
11374 * potential page-size rounding prevents us from exactly
11375 * validating this flavor of vm_map_copy, but we can at least
11376 * assert that it's within a range.
11377 */
11378 if (copy_sz >= sz &&
11379 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11380 *size = copy_sz;
11381 return TRUE;
11382 }
11383 break;
11384 default:
11385 break;
11386 }
11387 return FALSE;
11388 }
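
/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * typical consumer validates a user-supplied size against the copy object
 * before mapping it, then uses the possibly-updated size.  The helper name
 * and its error handling are assumptions; passing the validated size
 * straight into vm_map_copyout_size() mirrors how plain sizes are handed
 * to the sanitized size parameter elsewhere in this file.
 */
#if 0 /* example only */
static kern_return_t
example_copyout_checked(
	vm_map_t          dst_map,
	vm_map_copy_t     copy,
	vm_map_size_t     expected_size,
	vm_map_address_t *out_addr)
{
	vm_map_size_t size = expected_size;

	if (!vm_map_copy_validate_size(dst_map, copy, &size)) {
		/* size mismatch: reject rather than mapping the wrong amount */
		return KERN_INVALID_ARGUMENT;
	}
	/* "size" now reflects the actual (possibly page-rounded) copy size */
	return vm_map_copyout_size(dst_map, out_addr, copy, size);
}
#endif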
11389
11390 static kern_return_t
11391 vm_map_copyout_internal(
11392 vm_map_t dst_map,
11393 vm_map_address_t *dst_addr, /* OUT */
11394 vm_map_copy_t copy,
11395 vm_map_size_ut copy_size_u,
11396 boolean_t consume_on_success,
11397 vm_prot_t cur_protection,
11398 vm_prot_t max_protection,
11399 vm_inherit_t inheritance)
11400 {
11401 vm_map_size_t size, copy_size;
11402 vm_map_size_t adjustment;
11403 vm_map_offset_t start;
11404 vm_object_offset_t vm_copy_start;
11405 vm_map_entry_t last;
11406 vm_map_entry_t entry;
11407 vm_map_copy_t original_copy;
11408 kern_return_t kr;
11409 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11410
11411 /*
11412 * Check for null copy object.
11413 */
11414
11415 if (copy == VM_MAP_COPY_NULL) {
11416 *dst_addr = 0;
11417 return KERN_SUCCESS;
11418 }
11419
11420 /*
11421 * Assert that the vm_map_copy is coming from the right
11422 * zone and hasn't been forged
11423 */
11424 vm_map_copy_require(copy);
11425
11426 if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) {
11427 *dst_addr = 0;
11428 ktriage_record(thread_tid(current_thread()),
11429 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11430 KDBG_TRIAGE_RESERVED,
11431 KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR),
11432 KERN_FAILURE /* arg */);
11433 return KERN_FAILURE;
11434 }
11435 copy_size = copy->size;
11436
11437 /*
11438 * Check for special kernel buffer allocated
11439 * by new_ipc_kmsg_copyin.
11440 */
11441
11442 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11443 kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11444 copy, copy_size, FALSE,
11445 consume_on_success);
11446 if (kr) {
11447 ktriage_record(thread_tid(current_thread()),
11448 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
11449 KDBG_TRIAGE_RESERVED,
11450 KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11451 }
11452 return kr;
11453 }
11454
11455 original_copy = copy;
11456 if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11457 vm_map_copy_t target_copy;
11458 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11459
11460 target_copy = VM_MAP_COPY_NULL;
11461 DEBUG4K_ADJUST("adjusting...\n");
11462 kr = vm_map_copy_adjust_to_target(
11463 copy,
11464 0, /* offset */
11465 copy->size, /* size */
11466 dst_map,
11467 TRUE, /* copy */
11468 &target_copy,
11469 &overmap_start,
11470 &overmap_end,
11471 &trimmed_start);
11472 if (kr != KERN_SUCCESS) {
11473 DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11474 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11475 return kr;
11476 }
11477 DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11478 if (target_copy != copy) {
11479 copy = target_copy;
11480 }
11481 copy_size = copy->size;
11482 }
11483
11484 /*
11485 * Find space for the data
11486 */
11487
11488 vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11489 VM_MAP_COPY_PAGE_MASK(copy));
11490 size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11491 VM_MAP_COPY_PAGE_MASK(copy))
11492 - vm_copy_start;
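
/*
 * Worked example (illustrative numbers): with a 4K copy page mask,
 * copy->offset == 0x1080 and copy_size == 0x2000 give
 * vm_copy_start = 0x1000 and size = round(0x3080) - 0x1000 = 0x3000,
 * i.e. the allocation covers whole pages even though the copied range
 * starts and ends mid-page.  The *dst_addr computed below is then
 * start + (0x1080 - 0x1000) = start + 0x80, restoring the original
 * sub-page offset within the newly allocated space.
 */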
11493
11494 vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size);
11495
11496 vm_map_lock(dst_map);
11497 kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags,
11498 &start, &last);
11499 if (kr != KERN_SUCCESS) {
11500 vm_map_unlock(dst_map);
11501 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11502 return kr;
11503 }
11504
11505 adjustment = start - vm_copy_start;
11506 if (!consume_on_success) {
11507 /*
11508 * We're not allowed to consume "copy", so we'll have to
11509 * copy its map entries into the destination map below.
11510 * No need to re-allocate map entries from the correct
11511 * (pageable or not) zone, since we'll get new map entries
11512 * during the transfer.
11513 * We'll also adjust the map entries' "start" and "end"
11514 * during the transfer, to keep "copy"'s entries consistent
11515 * with its "offset".
11516 */
11517 goto after_adjustments;
11518 }
11519
11520 /*
11521 * Since we're going to just drop the map
11522 * entries from the copy into the destination
11523 * map, they must come from the same pool.
11524 */
11525
11526 if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11527 /*
11528 * Mismatches occur when dealing with the default
11529 * pager.
11530 */
11531 vm_map_entry_t next, new;
11532
11533 /*
11534 * Find the zone that the copies were allocated from
11535 */
11536
11537 entry = vm_map_copy_first_entry(copy);
11538
11539 /*
11540 * Reinitialize the copy so that vm_map_copy_entry_link
11541 * will work.
11542 */
11543 vm_map_store_copy_reset(copy, entry);
11544 copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11545
11546 /*
11547 * Copy each entry.
11548 */
11549 while (entry != vm_map_copy_to_entry(copy)) {
11550 new = vm_map_copy_entry_create(copy);
11551 vm_map_entry_copy_full(new, entry);
11552 new->vme_no_copy_on_read = FALSE;
11553 assert(!new->iokit_acct);
11554 if (new->is_sub_map) {
11555 /* clr address space specifics */
11556 new->use_pmap = FALSE;
11557 }
11558 vm_map_copy_entry_link(copy,
11559 vm_map_copy_last_entry(copy),
11560 new);
11561 next = entry->vme_next;
11562 vm_map_entry_dispose(entry);
11563 entry = next;
11564 }
11565 }
11566
11567 /*
11568 * Adjust the addresses in the copy chain, and
11569 * reset the region attributes.
11570 */
11571
11572 for (entry = vm_map_copy_first_entry(copy);
11573 entry != vm_map_copy_to_entry(copy);
11574 entry = entry->vme_next) {
11575 if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11576 /*
11577 * We're injecting this copy entry into a map that
11578 * has the standard page alignment, so clear
11579 * "map_aligned" (which might have been inherited
11580 * from the original map entry).
11581 */
11582 entry->map_aligned = FALSE;
11583 }
11584
11585 entry->vme_start += adjustment;
11586 entry->vme_end += adjustment;
11587
11588 if (entry->map_aligned) {
11589 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11590 VM_MAP_PAGE_MASK(dst_map)));
11591 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11592 VM_MAP_PAGE_MASK(dst_map)));
11593 }
11594
11595 entry->inheritance = VM_INHERIT_DEFAULT;
11596 entry->protection = VM_PROT_DEFAULT;
11597 entry->max_protection = VM_PROT_ALL;
11598 entry->behavior = VM_BEHAVIOR_DEFAULT;
11599
11600 /*
11601 * If the entry is now wired,
11602 * map the pages into the destination map.
11603 */
11604 if (entry->wired_count != 0) {
11605 vm_map_offset_t va;
11606 vm_object_offset_t offset;
11607 vm_object_t object;
11608 vm_prot_t prot;
11609 int type_of_fault;
11610 uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11611
11612 /* TODO4K would need to use actual page size */
11613 assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11614
11615 object = VME_OBJECT(entry);
11616 offset = VME_OFFSET(entry);
11617 va = entry->vme_start;
11618
11619 pmap_pageable(dst_map->pmap,
11620 entry->vme_start,
11621 entry->vme_end,
11622 TRUE);
11623
11624 while (va < entry->vme_end) {
11625 vm_page_t m;
11626 struct vm_object_fault_info fault_info = {};
11627
11628 /*
11629 * Look up the page in the object.
11630 * Assert that the page will be found in the
11631 * top object:
11632 * either
11633 * the object was newly created by
11634 * vm_object_copy_slowly, and has
11635 * copies of all of the pages from
11636 * the source object
11637 * or
11638 * the object was moved from the old
11639 * map entry; because the old map
11640 * entry was wired, all of the pages
11641 * were in the top-level object.
11642 * (XXX not true if we wire pages for
11643 * reading)
11644 */
11645 vm_object_lock(object);
11646
11647 m = vm_page_lookup(object, offset);
11648 if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11649 m->vmp_absent) {
11650 panic("vm_map_copyout: wiring %p", m);
11651 }
11652
11653 prot = entry->protection;
11654
11655 if (override_nx(dst_map, VME_ALIAS(entry)) &&
11656 prot) {
11657 prot |= VM_PROT_EXECUTE;
11658 }
11659
11660 type_of_fault = DBG_CACHE_HIT_FAULT;
11661
11662 fault_info.user_tag = VME_ALIAS(entry);
11663 fault_info.pmap_options = 0;
11664 if (entry->iokit_acct ||
11665 (!entry->is_sub_map && !entry->use_pmap)) {
11666 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11667 }
11668 if (entry->vme_xnu_user_debug &&
11669 !VM_PAGE_OBJECT(m)->code_signed) {
11670 /*
11671 * Modified code-signed executable
11672 * region: this page does not belong
11673 * to a code-signed VM object, so it
11674 * must have been copied and should
11675 * therefore be typed XNU_USER_DEBUG
11676 * rather than XNU_USER_EXEC.
11677 */
11678 fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11679 }
11680
11681 vm_fault_enter(m,
11682 dst_map->pmap,
11683 va,
11684 PAGE_SIZE, 0,
11685 prot,
11686 prot,
11687 VM_PAGE_WIRED(m),
11688 FALSE, /* change_wiring */
11689 VM_KERN_MEMORY_NONE, /* tag - not wiring */
11690 &fault_info,
11691 NULL, /* need_retry */
11692 &type_of_fault,
11693 &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/
11694
11695 vm_object_unlock(object);
11696
11697 offset += PAGE_SIZE_64;
11698 va += PAGE_SIZE;
11699 }
11700 }
11701 }
11702
11703 after_adjustments:
11704
11705 /*
11706 * Correct the page alignment for the result
11707 */
11708
11709 *dst_addr = start + (copy->offset - vm_copy_start);
11710
11711 #if KASAN
11712 kasan_notify_address(*dst_addr, size);
11713 #endif
11714
11715 /*
11716 * Update the hints and the map size
11717 */
11718
11719 if (consume_on_success) {
11720 SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11721 } else {
11722 SAVE_HINT_MAP_WRITE(dst_map, last);
11723 }
11724
11725 dst_map->size += size;
11726
11727 /*
11728 * Link in the copy
11729 */
11730
11731 if (consume_on_success) {
11732 vm_map_copy_insert(dst_map, last, copy);
11733 if (copy != original_copy) {
11734 vm_map_copy_discard(original_copy);
11735 original_copy = VM_MAP_COPY_NULL;
11736 }
11737 } else {
11738 vm_map_copy_remap(dst_map, last, copy, adjustment,
11739 cur_protection, max_protection,
11740 inheritance);
11741 if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11742 vm_map_copy_discard(copy);
11743 copy = original_copy;
11744 }
11745 }
11746
11747
11748 vm_map_unlock(dst_map);
11749
11750 /*
11751 * XXX If wiring_required, call vm_map_pageable
11752 */
11753
11754 return KERN_SUCCESS;
11755 }
11756
11757 /*
11758 * Routine: vm_map_copyout_size
11759 *
11760 * Description:
11761 * Copy out a copy chain ("copy") into newly-allocated
11762 * space in the destination map. Uses a prevalidated
11763 * size for the copy object (vm_map_copy_validate_size).
11764 *
11765 * If successful, consumes the copy object.
11766 * Otherwise, the caller is responsible for it.
11767 */
11768 kern_return_t
11769 vm_map_copyout_size(
11770 vm_map_t dst_map,
11771 vm_map_address_t *dst_addr, /* OUT */
11772 vm_map_copy_t copy,
11773 vm_map_size_ut copy_size)
11774 {
11775 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11776 TRUE, /* consume_on_success */
11777 VM_PROT_DEFAULT,
11778 VM_PROT_ALL,
11779 VM_INHERIT_DEFAULT);
11780 }
11781
11782 /*
11783 * Routine: vm_map_copyout
11784 *
11785 * Description:
11786 * Copy out a copy chain ("copy") into newly-allocated
11787 * space in the destination map.
11788 *
11789 * If successful, consumes the copy object.
11790 * Otherwise, the caller is responsible for it.
11791 */
11792 kern_return_t
11793 vm_map_copyout(
11794 vm_map_t dst_map,
11795 vm_map_address_t *dst_addr, /* OUT */
11796 vm_map_copy_t copy)
11797 {
11798 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11799 TRUE, /* consume_on_success */
11800 VM_PROT_DEFAULT,
11801 VM_PROT_ALL,
11802 VM_INHERIT_DEFAULT);
11803 }
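
/*
 * Illustrative sketch (hypothetical caller, not part of this file): how
 * the consume-on-success contract is typically honored.  On success the
 * copy object now belongs to the destination map; on failure the caller
 * still owns it and must discard it explicitly.
 */
#if 0 /* example only */
static kern_return_t
example_map_copy_into_current_task(
	vm_map_copy_t      copy,
	mach_vm_address_t *out_addr)
{
	vm_map_address_t addr = 0;
	kern_return_t    kr;

	kr = vm_map_copyout(current_map(), &addr, copy);
	if (kr != KERN_SUCCESS) {
		/* the copy was NOT consumed: release it here */
		vm_map_copy_discard(copy);
		return kr;
	}
	*out_addr = (mach_vm_address_t)addr;
	return KERN_SUCCESS;
}
#endif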
11804
11805 /*
11806 * Routine: vm_map_copyin
11807 *
11808 * Description:
11809 * see vm_map_copyin_common. Exported via Unsupported.exports.
11810 *
11811 */
11812 kern_return_t
11813 vm_map_copyin(
11814 vm_map_t src_map,
11815 vm_map_address_ut src_addr,
11816 vm_map_size_ut len,
11817 boolean_t src_destroy,
11818 vm_map_copy_t *copy_result) /* OUT */
11819 {
11820 return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11821 FALSE, copy_result, FALSE);
11822 }
11823
11824 /*
11825 * Routine: vm_map_copyin_common
11826 *
11827 * Description:
11828 * Copy the specified region (src_addr, len) from the
11829 * source address space (src_map), possibly removing
11830 * the region from the source address space (src_destroy).
11831 *
11832 * Returns:
11833 * A vm_map_copy_t object (copy_result), suitable for
11834 * insertion into another address space (using vm_map_copyout),
11835 * copying over another address space region (using
11836 * vm_map_copy_overwrite). If the copy is unused, it
11837 * should be destroyed (using vm_map_copy_discard).
11838 *
11839 * In/out conditions:
11840 * The source map should not be locked on entry.
11841 */
11842
11843 typedef struct submap_map {
11844 vm_map_t parent_map;
11845 vm_map_offset_t base_start;
11846 vm_map_offset_t base_end;
11847 vm_map_size_t base_len;
11848 struct submap_map *next;
11849 } submap_map_t;
11850
11851 kern_return_t
11852 vm_map_copyin_common(
11853 vm_map_t src_map,
11854 vm_map_address_ut src_addr,
11855 vm_map_size_ut len,
11856 boolean_t src_destroy,
11857 __unused boolean_t src_volatile,
11858 vm_map_copy_t *copy_result, /* OUT */
11859 boolean_t use_maxprot)
11860 {
11861 int flags;
11862
11863 flags = 0;
11864 if (src_destroy) {
11865 flags |= VM_MAP_COPYIN_SRC_DESTROY;
11866 }
11867 if (use_maxprot) {
11868 flags |= VM_MAP_COPYIN_USE_MAXPROT;
11869 }
11870 return vm_map_copyin_internal(src_map,
11871 src_addr,
11872 len,
11873 flags,
11874 copy_result);
11875 }
11876
11877 static __attribute__((always_inline, warn_unused_result))
11878 kern_return_t
11879 vm_map_copyin_sanitize(
11880 vm_map_t src_map,
11881 vm_map_address_ut src_addr_u,
11882 vm_map_size_ut len_u,
11883 vm_map_offset_t *src_start,
11884 vm_map_offset_t *src_end,
11885 vm_map_size_t *len,
11886 vm_map_offset_t *src_addr_unaligned)
11887 {
11888 kern_return_t kr;
11889 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS |
11890 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES;
11891
11892 if (src_map->pmap == kernel_pmap) {
11893 flags |= VM_SANITIZE_FLAGS_CANONICALIZE;
11894 }
11895
11896
11897 kr = vm_sanitize_addr_size(src_addr_u, len_u,
11898 VM_SANITIZE_CALLER_VM_MAP_COPYIN,
11899 src_map,
11900 flags,
11901 src_start, src_end, len);
11902 if (__improbable(kr != KERN_SUCCESS)) {
11903 return kr;
11904 }
11905
11906 /*
11907 * Compute (page aligned) start and end of region
11908 */
11909 *src_addr_unaligned = *src_start; /* remember unaligned value */
11910 *src_start = vm_map_trunc_page(*src_addr_unaligned,
11911 VM_MAP_PAGE_MASK(src_map));
11912 *src_end = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map));
11913 return KERN_SUCCESS;
11914 }
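
/*
 * Worked example (illustrative numbers): for a 16K source map
 * (VM_MAP_PAGE_MASK == 0x3FFF), src_addr == 0x8100 and len == 0x100
 * yield src_addr_unaligned = 0x8100, src_start = 0x8000 and
 * src_end = round(0x8200) = 0xC000.  The copy machinery below trims the
 * first and last copy entries back toward the unaligned endpoints so the
 * result describes only the requested range.
 */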
11915
11916 kern_return_t
11917 vm_map_copyin_internal(
11918 vm_map_t src_map,
11919 vm_map_address_ut src_addr_u,
11920 vm_map_size_ut len_u,
11921 int flags,
11922 vm_map_copy_t *copy_result) /* OUT */
11923 {
11924 vm_map_entry_t tmp_entry; /* Result of last map lookup --
11925 * in multi-level lookup, this
11926 * entry contains the actual
11927 * vm_object/offset.
11928 */
11929 vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */
11930
11931 vm_map_offset_t src_start; /* Start of current entry --
11932 * where copy is taking place now
11933 */
11934 vm_map_offset_t src_end; /* End of entire region to be
11935 * copied */
11936 vm_map_offset_t src_addr_unaligned;
11937 vm_map_offset_t src_base;
11938 vm_map_size_t len;
11939 vm_map_t base_map = src_map;
11940 boolean_t map_share = FALSE;
11941 submap_map_t *parent_maps = NULL;
11942
11943 vm_map_copy_t copy; /* Resulting copy */
11944 vm_map_address_t copy_addr;
11945 vm_map_size_t copy_size;
11946 boolean_t src_destroy;
11947 boolean_t use_maxprot;
11948 boolean_t preserve_purgeable;
11949 boolean_t entry_was_shared;
11950 vm_map_entry_t saved_src_entry;
11951 kern_return_t kr;
11952
11953
11954 if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11955 return KERN_INVALID_ARGUMENT;
11956 }
11957
11958 /*
11959 * Check for copies of zero bytes.
11960 */
11961 if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) {
11962 *copy_result = VM_MAP_COPY_NULL;
11963 return KERN_SUCCESS;
11964 }
11965
11966 /*
11967 * Sanitize any input parameters that are addr/size/prot/inherit
11968 */
11969 kr = vm_map_copyin_sanitize(
11970 src_map,
11971 src_addr_u,
11972 len_u,
11973 &src_start,
11974 &src_end,
11975 &len,
11976 &src_addr_unaligned);
11977 if (__improbable(kr != KERN_SUCCESS)) {
11978 return vm_sanitize_get_kr(kr);
11979 }
11980
11981 src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11982 use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11983 preserve_purgeable =
11984 (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11985
11986 /*
11987 * If the copy is sufficiently small, use a kernel buffer instead
11988 * of making a virtual copy. The theory being that the cost of
11989 * setting up VM (and taking C-O-W faults) dominates the copy costs
11990 * for small regions.
11991 */
11992 if ((len <= msg_ool_size_small) &&
11993 !use_maxprot &&
11994 !preserve_purgeable &&
11995 !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11996 /*
11997 * Since the "msg_ool_size_small" threshold was increased and
11998 * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11999 * address space limits, we revert to doing a virtual copy if the
12000 * copied range goes beyond those limits. Otherwise, mach_vm_read()
12001 * of the commpage would now fail when it used to work.
12002 */
12003 (src_start >= vm_map_min(src_map) &&
12004 src_start < vm_map_max(src_map) &&
12005 src_end >= vm_map_min(src_map) &&
12006 src_end < vm_map_max(src_map))) {
12007 return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len,
12008 src_destroy, copy_result);
12009 }
12010
12011 /*
12012 * Allocate a header element for the list.
12013 *
12014 * Use the start and end in the header to
12015 * remember the endpoints prior to rounding.
12016 */
12017
12018 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12019 copy->cpy_hdr.entries_pageable = TRUE;
12020 copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12021 copy->offset = src_addr_unaligned;
12022 copy->size = len;
12023
12024 new_entry = vm_map_copy_entry_create(copy);
12025
12026 #define RETURN(x) \
12027 MACRO_BEGIN \
12028 vm_map_unlock(src_map); \
12029 if(src_map != base_map) \
12030 vm_map_deallocate(src_map); \
12031 if (new_entry != VM_MAP_ENTRY_NULL) \
12032 vm_map_copy_entry_dispose(new_entry); \
12033 vm_map_copy_discard(copy); \
12034 { \
12035 submap_map_t *_ptr; \
12036 \
12037 for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12038 parent_maps=parent_maps->next; \
12039 if (_ptr->parent_map != base_map) \
12040 vm_map_deallocate(_ptr->parent_map); \
12041 kfree_type(submap_map_t, _ptr); \
12042 } \
12043 } \
12044 MACRO_RETURN(x); \
12045 MACRO_END
12046
12047 /*
12048 * Find the beginning of the region.
12049 */
12050
12051 vm_map_lock(src_map);
12052
12053 /*
12054 * Lookup the original "src_addr_unaligned" rather than the truncated
12055 * "src_start", in case "src_start" falls in a non-map-aligned
12056 * map entry *before* the map entry that contains "src_addr_unaligned"...
12057 */
12058 if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) {
12059 RETURN(KERN_INVALID_ADDRESS);
12060 }
12061 if (!tmp_entry->is_sub_map) {
12062 /*
12063 * ... but clip to the map-rounded "src_start" rather than
12064 * "src_addr_unaligned" to preserve map-alignment. We'll adjust the
12065 * first copy entry at the end, if needed.
12066 */
12067 vm_map_clip_start(src_map, tmp_entry, src_start);
12068 }
12069 if (src_start < tmp_entry->vme_start) {
12070 /*
12071 * Move "src_start" up to the start of the
12072 * first map entry to copy.
12073 */
12074 src_start = tmp_entry->vme_start;
12075 }
12076 /* set for later submap fix-up */
12077 copy_addr = src_start;
12078
12079 /*
12080 * Go through entries until we get to the end.
12081 */
12082
12083 while (TRUE) {
12084 vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */
12085 vm_map_size_t src_size; /* Size of source
12086 * map entry (in both
12087 * maps)
12088 */
12089
12090 vm_object_t src_object; /* Object to copy */
12091 vm_object_offset_t src_offset;
12092
12093 vm_object_t new_copy_object;/* vm_object_copy_* result */
12094
12095 boolean_t src_needs_copy; /* Should source map
12096 * be made read-only
12097 * for copy-on-write?
12098 */
12099
12100 boolean_t new_entry_needs_copy; /* Will new entry be COW? */
12101
12102 boolean_t was_wired; /* Was source wired? */
12103 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
12104 vm_map_version_t version; /* Version before locks
12105 * dropped to make copy
12106 */
12107 kern_return_t result; /* Return value from
12108 * copy_strategically.
12109 */
12110 while (tmp_entry->is_sub_map) {
12111 vm_map_size_t submap_len;
12112 submap_map_t *ptr;
12113
12114 ptr = kalloc_type(submap_map_t, Z_WAITOK);
12115 ptr->next = parent_maps;
12116 parent_maps = ptr;
12117 ptr->parent_map = src_map;
12118 ptr->base_start = src_start;
12119 ptr->base_end = src_end;
12120 submap_len = tmp_entry->vme_end - src_start;
12121 if (submap_len > (src_end - src_start)) {
12122 submap_len = src_end - src_start;
12123 }
12124 ptr->base_len = submap_len;
12125
12126 src_start -= tmp_entry->vme_start;
12127 src_start += VME_OFFSET(tmp_entry);
12128 src_end = src_start + submap_len;
12129 src_map = VME_SUBMAP(tmp_entry);
12130 vm_map_lock(src_map);
12131 /* keep an outstanding reference for all maps in */
12132 /* the chain of parent maps, except the base map */
12133 vm_map_reference(src_map);
12134 vm_map_unlock(ptr->parent_map);
12135 if (!vm_map_lookup_entry(
12136 src_map, src_start, &tmp_entry)) {
12137 RETURN(KERN_INVALID_ADDRESS);
12138 }
12139 map_share = TRUE;
12140 if (!tmp_entry->is_sub_map) {
12141 vm_map_clip_start(src_map, tmp_entry, src_start);
12142 }
12143 src_entry = tmp_entry;
12144 }
12145 /* we are now in the lowest level submap... */
12146
12147 if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12148 (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12149 /* This is not supported for now. In future */
12150 /* we will need to detect the phys_contig */
12151 /* condition and then upgrade copy_slowly */
12152 /* to do physical copy from the device mem */
12153 /* based object. We can piggy-back off of */
12154 /* the was_wired boolean to set up the */
12155 /* proper handling */
12156 RETURN(KERN_PROTECTION_FAILURE);
12157 }
12158 /*
12159 * Create a new address map entry to hold the result.
12160 * Fill in the fields from the appropriate source entries.
12161 * We must unlock the source map to do this if we need
12162 * to allocate a map entry.
12163 */
12164 if (new_entry == VM_MAP_ENTRY_NULL) {
12165 version.main_timestamp = src_map->timestamp;
12166 vm_map_unlock(src_map);
12167
12168 new_entry = vm_map_copy_entry_create(copy);
12169
12170 vm_map_lock(src_map);
12171 if ((version.main_timestamp + 1) != src_map->timestamp) {
12172 if (!vm_map_lookup_entry(src_map, src_start,
12173 &tmp_entry)) {
12174 RETURN(KERN_INVALID_ADDRESS);
12175 }
12176 if (!tmp_entry->is_sub_map) {
12177 vm_map_clip_start(src_map, tmp_entry, src_start);
12178 }
12179 continue; /* restart w/ new tmp_entry */
12180 }
12181 }
12182
12183 /*
12184 * Verify that the region can be read.
12185 */
12186 if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12187 !use_maxprot) ||
12188 (src_entry->max_protection & VM_PROT_READ) == 0) {
12189 RETURN(KERN_PROTECTION_FAILURE);
12190 }
12191
12192 /*
12193 * Clip against the endpoints of the entire region.
12194 */
12195
12196 vm_map_clip_end(src_map, src_entry, src_end);
12197
12198 src_size = src_entry->vme_end - src_start;
12199 src_object = VME_OBJECT(src_entry);
12200 src_offset = VME_OFFSET(src_entry);
12201 was_wired = (src_entry->wired_count != 0);
12202
12203 vm_map_entry_copy(src_map, new_entry, src_entry);
12204 if (new_entry->is_sub_map) {
12205 /* clr address space specifics */
12206 new_entry->use_pmap = FALSE;
12207 } else {
12208 /*
12209 * We're dealing with a copy-on-write operation,
12210 * so the resulting mapping should not inherit the
12211 * original mapping's accounting settings.
12212 * "iokit_acct" should have been cleared in
12213 * vm_map_entry_copy().
12214 * "use_pmap" should be reset to its default (TRUE)
12215 * so that the new mapping gets accounted for in
12216 * the task's memory footprint.
12217 */
12218 assert(!new_entry->iokit_acct);
12219 new_entry->use_pmap = TRUE;
12220 }
12221
12222 /*
12223 * Attempt non-blocking copy-on-write optimizations.
12224 */
12225
12226 /*
12227 * If we are destroying the source, and the object
12228 * is internal, we could move the object reference
12229 * from the source to the copy. The copy is
12230 * copy-on-write only if the source is.
12231 * We make another reference to the object, because
12232 * destroying the source entry will deallocate it.
12233 *
12234 * This memory transfer has to be atomic, (to prevent
12235 * the VM object from being shared or copied while
12236 * it's being moved here), so we could only do this
12237 * if we won't have to unlock the VM map until the
12238 * original mapping has been fully removed.
12239 */
12240
12241 RestartCopy:
12242 if ((src_object == VM_OBJECT_NULL ||
12243 (!was_wired && !map_share && !tmp_entry->is_shared
12244 && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12245 vm_object_copy_quickly(
12246 VME_OBJECT(new_entry),
12247 src_offset,
12248 src_size,
12249 &src_needs_copy,
12250 &new_entry_needs_copy)) {
12251 new_entry->needs_copy = new_entry_needs_copy;
12252
12253 /*
12254 * Handle copy-on-write obligations
12255 */
12256
12257 if (src_needs_copy && !tmp_entry->needs_copy) {
12258 vm_prot_t prot;
12259
12260 prot = src_entry->protection & ~VM_PROT_WRITE;
12261
12262 if (override_nx(src_map, VME_ALIAS(src_entry))
12263 && prot) {
12264 prot |= VM_PROT_EXECUTE;
12265 }
12266
12267 vm_object_pmap_protect(
12268 src_object,
12269 src_offset,
12270 src_size,
12271 (src_entry->is_shared ?
12272 PMAP_NULL
12273 : src_map->pmap),
12274 VM_MAP_PAGE_SIZE(src_map),
12275 src_entry->vme_start,
12276 prot);
12277
12278 assert(tmp_entry->wired_count == 0);
12279 tmp_entry->needs_copy = TRUE;
12280 }
12281
12282 /*
12283 * The map has never been unlocked, so it's safe
12284 * to move to the next entry rather than doing
12285 * another lookup.
12286 */
12287
12288 goto CopySuccessful;
12289 }
12290
12291 entry_was_shared = tmp_entry->is_shared;
12292
12293 /*
12294 * Take an object reference, so that we may
12295 * release the map lock(s).
12296 */
12297
12298 assert(src_object != VM_OBJECT_NULL);
12299 vm_object_reference(src_object);
12300
12301 /*
12302 * Record the timestamp for later verification.
12303 * Unlock the map.
12304 */
12305
12306 version.main_timestamp = src_map->timestamp;
12307 vm_map_unlock(src_map); /* Increments timestamp once! */
12308 saved_src_entry = src_entry;
12309 tmp_entry = VM_MAP_ENTRY_NULL;
12310 src_entry = VM_MAP_ENTRY_NULL;
12311
12312 /*
12313 * Perform the copy
12314 */
12315
12316 if (was_wired ||
12317 (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12318 !(flags & VM_MAP_COPYIN_FORK)) ||
12319 (debug4k_no_cow_copyin &&
12320 VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12321 CopySlowly:
12322 vm_object_lock(src_object);
12323 result = vm_object_copy_slowly(
12324 src_object,
12325 src_offset,
12326 src_size,
12327 THREAD_UNINT,
12328 &new_copy_object);
12329 /* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12330 saved_used_for_jit = new_entry->used_for_jit;
12331 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12332 new_entry->used_for_jit = saved_used_for_jit;
12333 VME_OFFSET_SET(new_entry,
12334 src_offset - vm_object_trunc_page(src_offset));
12335 new_entry->needs_copy = FALSE;
12336 } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12337 (entry_was_shared || map_share)) {
12338 vm_object_t new_object;
12339
12340 vm_object_lock_shared(src_object);
12341 new_object = vm_object_copy_delayed(
12342 src_object,
12343 src_offset,
12344 src_size,
12345 TRUE);
12346 if (new_object == VM_OBJECT_NULL) {
12347 goto CopySlowly;
12348 }
12349
12350 VME_OBJECT_SET(new_entry, new_object, false, 0);
12351 assert(new_entry->wired_count == 0);
12352 new_entry->needs_copy = TRUE;
12353 assert(!new_entry->iokit_acct);
12354 assert(new_object->purgable == VM_PURGABLE_DENY);
12355 assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12356 result = KERN_SUCCESS;
12357 } else {
12358 vm_object_offset_t new_offset;
12359 new_offset = VME_OFFSET(new_entry);
12360 result = vm_object_copy_strategically(src_object,
12361 src_offset,
12362 src_size,
12363 (flags & VM_MAP_COPYIN_FORK),
12364 &new_copy_object,
12365 &new_offset,
12366 &new_entry_needs_copy);
12367 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12368 saved_used_for_jit = new_entry->used_for_jit;
12369 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12370 new_entry->used_for_jit = saved_used_for_jit;
12371 if (new_offset != VME_OFFSET(new_entry)) {
12372 VME_OFFSET_SET(new_entry, new_offset);
12373 }
12374
12375 new_entry->needs_copy = new_entry_needs_copy;
12376 }
12377
12378 if (result == KERN_SUCCESS &&
12379 ((preserve_purgeable &&
12380 src_object->purgable != VM_PURGABLE_DENY) ||
12381 new_entry->used_for_jit)) {
12382 /*
12383 * Purgeable objects should be COPY_NONE, true share;
12384 * this should be propagated to the copy.
12385 *
12386 * Also force mappings the pmap specially protects to
12387 * be COPY_NONE; trying to COW these mappings would
12388 * change the effective protections, which could have
12389 * side effects if the pmap layer relies on the
12390 * specified protections.
12391 */
12392
12393 vm_object_t new_object;
12394
12395 new_object = VME_OBJECT(new_entry);
12396 assert(new_object != src_object);
12397 vm_object_lock(new_object);
12398 assert(os_ref_get_count_raw(&new_object->ref_count) == 1);
12399 assert(new_object->shadow == VM_OBJECT_NULL);
12400 assert(new_object->vo_copy == VM_OBJECT_NULL);
12401 assert(new_object->vo_owner == NULL);
12402
12403 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12404
12405 if (preserve_purgeable &&
12406 src_object->purgable != VM_PURGABLE_DENY) {
12407 VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12408
12409 /* start as non-volatile with no owner... */
12410 VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12411 vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12412 /* ... and move to src_object's purgeable state */
12413 if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12414 int state;
12415 state = src_object->purgable;
12416 vm_object_purgable_control(
12417 new_object,
12418 VM_PURGABLE_SET_STATE_FROM_KERNEL,
12419 &state);
12420 }
12421 /* no pmap accounting for purgeable objects */
12422 new_entry->use_pmap = FALSE;
12423 }
12424
12425 vm_object_unlock(new_object);
12426 new_object = VM_OBJECT_NULL;
12427 }
12428
12429 /*
12430 * Throw away the extra reference
12431 */
12432
12433 vm_object_deallocate(src_object);
12434
12435 if (result != KERN_SUCCESS &&
12436 result != KERN_MEMORY_RESTART_COPY) {
12437 vm_map_lock(src_map);
12438 RETURN(result);
12439 }
12440
12441 /*
12442 * Verify that the map has not substantially
12443 * changed while the copy was being made.
12444 */
12445
12446 vm_map_lock(src_map);
12447
12448 if ((version.main_timestamp + 1) == src_map->timestamp) {
12449 /* src_map hasn't changed: src_entry is still valid */
12450 src_entry = saved_src_entry;
12451 goto VerificationSuccessful;
12452 }
12453
12454 /*
12455 * Simple version comparison failed.
12456 *
12457 * Retry the lookup and verify that the
12458 * same object/offset are still present.
12459 *
12460 * [Note: a memory manager that colludes with
12461 * the calling task can detect that we have
12462 * cheated. While the map was unlocked, the
12463 * mapping could have been changed and restored.]
12464 */
12465
12466 if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12467 if (result != KERN_MEMORY_RESTART_COPY) {
12468 vm_object_deallocate(VME_OBJECT(new_entry));
12469 VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12470 /* reset accounting state */
12471 new_entry->iokit_acct = FALSE;
12472 new_entry->use_pmap = TRUE;
12473 }
12474 RETURN(KERN_INVALID_ADDRESS);
12475 }
12476
12477 src_entry = tmp_entry;
12478 vm_map_clip_start(src_map, src_entry, src_start);
12479
12480 if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12481 !use_maxprot) ||
12482 ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12483 goto VerificationFailed;
12484 }
12485
12486 if (src_entry->vme_end < new_entry->vme_end) {
12487 /*
12488 * This entry might have been shortened
12489 * (vm_map_clip_end) or been replaced with
12490 * an entry that ends closer to "src_start"
12491 * than before.
12492 * Adjust "new_entry" accordingly; copying
12493 * less memory would be correct but we also
12494 * redo the copy (see below) if the new entry
12495 * no longer points at the same object/offset.
12496 */
12497 assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12498 VM_MAP_COPY_PAGE_MASK(copy)));
12499 new_entry->vme_end = src_entry->vme_end;
12500 src_size = new_entry->vme_end - src_start;
12501 } else if (src_entry->vme_end > new_entry->vme_end) {
12502 /*
12503 * This entry might have been extended
12504 * (vm_map_entry_simplify() or coalesce)
12505 * or been replaced with an entry that ends farther
12506 * from "src_start" than before.
12507 *
12508 * We've called vm_object_copy_*() only on
12509 * the previous <start:end> range, so we can't
12510 * just extend new_entry. We have to re-do
12511 * the copy based on the new entry as if it was
12512 * pointing at a different object/offset (see
12513 * "Verification failed" below).
12514 */
12515 }
12516
12517 if ((VME_OBJECT(src_entry) != src_object) ||
12518 (VME_OFFSET(src_entry) != src_offset) ||
12519 (src_entry->vme_end > new_entry->vme_end)) {
12520 /*
12521 * Verification failed.
12522 *
12523 * Start over with this top-level entry.
12524 */
12525
12526 VerificationFailed: ;
12527
12528 vm_object_deallocate(VME_OBJECT(new_entry));
12529 tmp_entry = src_entry;
12530 continue;
12531 }
12532
12533 /*
12534 * Verification succeeded.
12535 */
12536
12537 VerificationSuccessful:;
12538
12539 if (result == KERN_MEMORY_RESTART_COPY) {
12540 goto RestartCopy;
12541 }
12542
12543 /*
12544 * Copy succeeded.
12545 */
12546
12547 CopySuccessful: ;
12548
12549 /*
12550 * Link in the new copy entry.
12551 */
12552
12553 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12554 new_entry);
12555
12556 /*
12557 * Determine whether the entire region
12558 * has been copied.
12559 */
12560 src_base = src_start;
12561 src_start = new_entry->vme_end;
12562 new_entry = VM_MAP_ENTRY_NULL;
12563 while ((src_start >= src_end) && (src_end != 0)) {
12564 submap_map_t *ptr;
12565
12566 if (src_map == base_map) {
12567 /* back to the top */
12568 break;
12569 }
12570
12571 ptr = parent_maps;
12572 assert(ptr != NULL);
12573 parent_maps = parent_maps->next;
12574
12575 /* fix up the damage we did in that submap */
12576 vm_map_simplify_range(src_map,
12577 src_base,
12578 src_end);
12579
12580 vm_map_unlock(src_map);
12581 vm_map_deallocate(src_map);
12582 vm_map_lock(ptr->parent_map);
12583 src_map = ptr->parent_map;
12584 src_base = ptr->base_start;
12585 src_start = ptr->base_start + ptr->base_len;
12586 src_end = ptr->base_end;
12587 if (!vm_map_lookup_entry(src_map,
12588 src_start,
12589 &tmp_entry) &&
12590 (src_end > src_start)) {
12591 RETURN(KERN_INVALID_ADDRESS);
12592 }
12593 kfree_type(submap_map_t, ptr);
12594 if (parent_maps == NULL) {
12595 map_share = FALSE;
12596 }
12597 src_entry = tmp_entry->vme_prev;
12598 }
12599
12600 if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12601 (src_start >= src_addr_unaligned + len) &&
12602 (src_addr_unaligned + len != 0)) {
12603 /*
12604 * Stop copying now, even though we haven't reached
12605 * "src_end". We'll adjust the end of the last copy
12606 * entry at the end, if needed.
12607 *
12608 * If src_map's alignment is different from the
12609 * system's page-alignment, there could be
12610 * extra non-map-aligned map entries between
12611 * the original (non-rounded) "src_addr_unaligned + len"
12612 * and the rounded "src_end".
12613 * We do not want to copy those map entries since
12614 * they're not part of the copied range.
12615 */
12616 break;
12617 }
12618
12619 if ((src_start >= src_end) && (src_end != 0)) {
12620 break;
12621 }
12622
12623 /*
12624 * Verify that there are no gaps in the region
12625 */
12626
12627 tmp_entry = src_entry->vme_next;
12628 if ((tmp_entry->vme_start != src_start) ||
12629 (tmp_entry == vm_map_to_entry(src_map))) {
12630 RETURN(KERN_INVALID_ADDRESS);
12631 }
12632 }
12633
12634 /*
12635 * If the source should be destroyed, do it now, since the
12636 * copy was successful.
12637 */
12638 if (src_destroy) {
12639 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12640
12641 if (src_map == kernel_map) {
12642 remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12643 }
12644 (void)vm_map_remove_and_unlock(src_map,
12645 vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)),
12646 src_end,
12647 remove_flags,
12648 KMEM_GUARD_NONE);
12649 } else {
12650 /* fix up the damage we did in the base map */
12651 vm_map_simplify_range(
12652 src_map,
12653 vm_map_trunc_page(src_addr_unaligned,
12654 VM_MAP_PAGE_MASK(src_map)),
12655 vm_map_round_page(src_end,
12656 VM_MAP_PAGE_MASK(src_map)));
12657 vm_map_unlock(src_map);
12658 }
12659
12660 tmp_entry = VM_MAP_ENTRY_NULL;
12661
12662 if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12663 VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12664 vm_map_offset_t original_start, original_offset, original_end;
12665
12666 assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12667
12668 /* adjust alignment of first copy_entry's "vme_start" */
12669 tmp_entry = vm_map_copy_first_entry(copy);
12670 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12671 vm_map_offset_t adjustment;
12672
12673 original_start = tmp_entry->vme_start;
12674 original_offset = VME_OFFSET(tmp_entry);
12675
12676 /* map-align the start of the first copy entry... */
12677 adjustment = (tmp_entry->vme_start -
12678 vm_map_trunc_page(
12679 tmp_entry->vme_start,
12680 VM_MAP_PAGE_MASK(src_map)));
12681 tmp_entry->vme_start -= adjustment;
12682 VME_OFFSET_SET(tmp_entry,
12683 VME_OFFSET(tmp_entry) - adjustment);
12684 copy_addr -= adjustment;
12685 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12686 /* ... adjust for mis-aligned start of copy range */
12687 adjustment =
12688 (vm_map_trunc_page(copy->offset,
12689 PAGE_MASK) -
12690 vm_map_trunc_page(copy->offset,
12691 VM_MAP_PAGE_MASK(src_map)));
12692 if (adjustment) {
12693 assert(page_aligned(adjustment));
12694 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12695 tmp_entry->vme_start += adjustment;
12696 VME_OFFSET_SET(tmp_entry,
12697 (VME_OFFSET(tmp_entry) +
12698 adjustment));
12699 copy_addr += adjustment;
12700 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12701 }
12702
12703 /*
12704 * Assert that the adjustments haven't exposed
12705 * more than was originally copied...
12706 */
12707 assert(tmp_entry->vme_start >= original_start);
12708 assert(VME_OFFSET(tmp_entry) >= original_offset);
12709 /*
12710 * ... and that it did not adjust outside of
12711 * a single 16K page.
12712 */
12713 assert(vm_map_trunc_page(tmp_entry->vme_start,
12714 VM_MAP_PAGE_MASK(src_map)) ==
12715 vm_map_trunc_page(original_start,
12716 VM_MAP_PAGE_MASK(src_map)));
12717 }
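
/*
 * Worked example (illustrative numbers): with a 16K src_map and 4K copy
 * pages, if copy->offset == 0x5100 the first entry's start is first
 * pulled back to the 16K boundary 0x4000, then pushed forward by
 * trunc(0x5100, 4K) - trunc(0x5100, 16K) == 0x1000, ending up 4K-aligned
 * at 0x5000 without ever exposing memory below the 16K page that was
 * originally copied.
 */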
12718
12719 /* adjust alignment of last copy_entry's "vme_end" */
12720 tmp_entry = vm_map_copy_last_entry(copy);
12721 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12722 vm_map_offset_t adjustment;
12723
12724 original_end = tmp_entry->vme_end;
12725
12726 /* map-align the end of the last copy entry... */
12727 tmp_entry->vme_end =
12728 vm_map_round_page(tmp_entry->vme_end,
12729 VM_MAP_PAGE_MASK(src_map));
12730 /* ... adjust for mis-aligned end of copy range */
12731 adjustment =
12732 (vm_map_round_page((copy->offset +
12733 copy->size),
12734 VM_MAP_PAGE_MASK(src_map)) -
12735 vm_map_round_page((copy->offset +
12736 copy->size),
12737 PAGE_MASK));
12738 if (adjustment) {
12739 assert(page_aligned(adjustment));
12740 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12741 tmp_entry->vme_end -= adjustment;
12742 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12743 }
12744
12745 /*
12746 * Assert that the adjustments haven't exposed
12747 * more than was originally copied...
12748 */
12749 assert(tmp_entry->vme_end <= original_end);
12750 /*
12751 * ... and that it did not adjust outside of a
12752 * a single 16K page.
12753 */
12754 assert(vm_map_round_page(tmp_entry->vme_end,
12755 VM_MAP_PAGE_MASK(src_map)) ==
12756 vm_map_round_page(original_end,
12757 VM_MAP_PAGE_MASK(src_map)));
12758 }
12759 }
12760
12761 /* Fix-up start and end points in copy. This is necessary */
12762 /* when the various entries in the copy object were picked */
12763 /* up from different sub-maps */
12764
12765 tmp_entry = vm_map_copy_first_entry(copy);
12766 copy_size = 0; /* compute actual size */
12767 while (tmp_entry != vm_map_copy_to_entry(copy)) {
12768 assert(VM_MAP_PAGE_ALIGNED(
12769 copy_addr + (tmp_entry->vme_end -
12770 tmp_entry->vme_start),
12771 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12772 assert(VM_MAP_PAGE_ALIGNED(
12773 copy_addr,
12774 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12775
12776 /*
12777 * The copy_entries will be injected directly into the
12778 * destination map and might not be "map aligned" there...
12779 */
12780 tmp_entry->map_aligned = FALSE;
12781
12782 tmp_entry->vme_end = copy_addr +
12783 (tmp_entry->vme_end - tmp_entry->vme_start);
12784 tmp_entry->vme_start = copy_addr;
12785 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12786 copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12787 copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12788 tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12789 }
12790
12791 if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12792 copy_size < copy->size) {
12793 /*
12794 * The actual size of the VM map copy is smaller than what
12795 * was requested by the caller. This must be because some
12796 * PAGE_SIZE-sized pages are missing at the end of the last
12797 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12798 * The caller might not have been aware of those missing
12799 * pages and might not want to be aware of it, which is
12800 * fine as long as they don't try to access (and crash on)
12801 * those missing pages.
12802 * Let's adjust the size of the "copy", to avoid failing
12803 * in vm_map_copyout() or vm_map_copy_overwrite().
12804 */
12805 assert(vm_map_round_page(copy_size,
12806 VM_MAP_PAGE_MASK(src_map)) ==
12807 vm_map_round_page(copy->size,
12808 VM_MAP_PAGE_MASK(src_map)));
12809 copy->size = copy_size;
12810 }
12811
12812 *copy_result = copy;
12813 return KERN_SUCCESS;
12814
12815 #undef RETURN
12816 }
12817
12818 kern_return_t
12819 vm_map_copy_extract(
12820 vm_map_t src_map,
12821 vm_map_address_t src_addr,
12822 vm_map_size_t len,
12823 boolean_t do_copy,
12824 vm_map_copy_t *copy_result, /* OUT */
12825 vm_prot_t *cur_prot, /* IN/OUT */
12826 vm_prot_t *max_prot, /* IN/OUT */
12827 vm_inherit_t inheritance,
12828 vm_map_kernel_flags_t vmk_flags)
12829 {
12830 vm_map_copy_t copy;
12831 kern_return_t kr;
12832 vm_prot_t required_cur_prot, required_max_prot;
12833
12834 /*
12835 * Check for copies of zero bytes.
12836 */
12837
12838 if (len == 0) {
12839 *copy_result = VM_MAP_COPY_NULL;
12840 return KERN_SUCCESS;
12841 }
12842
12843 /*
12844 * Check that the end address doesn't overflow
12845 */
12846 if (src_addr + len < src_addr) {
12847 return KERN_INVALID_ADDRESS;
12848 }
12849 if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12850 return KERN_INVALID_ADDRESS;
12851 }
12852
12853 if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12854 DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12855 }
12856
12857 required_cur_prot = *cur_prot;
12858 required_max_prot = *max_prot;
12859
12860 /*
12861 * Allocate a header element for the list.
12862 *
12863 * Use the start and end in the header to
12864 * remember the endpoints prior to rounding.
12865 */
12866
12867 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12868 copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12869 copy->offset = 0;
12870 copy->size = len;
12871
12872 kr = vm_map_remap_extract(src_map,
12873 src_addr,
12874 len,
12875 do_copy, /* copy */
12876 copy,
12877 cur_prot, /* IN/OUT */
12878 max_prot, /* IN/OUT */
12879 inheritance,
12880 vmk_flags);
12881 if (kr != KERN_SUCCESS) {
12882 vm_map_copy_discard(copy);
12883 if ((kr == KERN_INVALID_ADDRESS ||
12884 kr == KERN_INVALID_ARGUMENT) &&
12885 src_map->terminated) {
12886 /* tell the caller that this address space is gone */
12887 kr = KERN_TERMINATED;
12888 }
12889 return kr;
12890 }
12891 if (required_cur_prot != VM_PROT_NONE) {
12892 assert((*cur_prot & required_cur_prot) == required_cur_prot);
12893 assert((*max_prot & required_max_prot) == required_max_prot);
12894 }
12895
12896 *copy_result = copy;
12897 return KERN_SUCCESS;
12898 }
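
/*
 * Illustrative sketch (hypothetical caller, not part of this file):
 * extracting a shared (do_copy == FALSE) view of a source range, asking
 * for at least read access.  On return the prot fields report what the
 * extracted entries actually allow.  The flag initialization is an
 * assumption; real callers build vmk_flags to match their pageability
 * needs.
 */
#if 0 /* example only */
static kern_return_t
example_extract_readable_range(
	vm_map_t          src_map,
	vm_map_address_t  src_addr,
	vm_map_size_t     len,
	vm_map_copy_t    *copy_result)
{
	vm_prot_t cur_prot = VM_PROT_READ;   /* required minimum */
	vm_prot_t max_prot = VM_PROT_READ;
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;

	return vm_map_copy_extract(src_map, src_addr, len,
	           FALSE,              /* do_copy: share, don't copy */
	           copy_result,
	           &cur_prot, &max_prot,
	           VM_INHERIT_DEFAULT,
	           vmk_flags);
}
#endif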
12899
12900 static void
12901 vm_map_fork_share(
12902 vm_map_t old_map,
12903 vm_map_entry_t old_entry,
12904 vm_map_t new_map)
12905 {
12906 vm_object_t object;
12907 vm_map_entry_t new_entry;
12908
12909 /*
12910 * New sharing code. New map entry
12911 * references original object. Internal
12912 * objects use asynchronous copy algorithm for
12913 * future copies. First make sure we have
12914 * the right object. If we need a shadow,
12915 * or someone else already has one, then
12916 * make a new shadow and share it.
12917 */
12918
12919 if (!old_entry->is_sub_map) {
12920 object = VME_OBJECT(old_entry);
12921 }
12922
12923 if (old_entry->is_sub_map) {
12924 assert(old_entry->wired_count == 0);
12925 #ifndef NO_NESTED_PMAP
12926 #if !PMAP_FORK_NEST
12927 if (old_entry->use_pmap) {
12928 kern_return_t result;
12929
12930 result = pmap_nest(new_map->pmap,
12931 (VME_SUBMAP(old_entry))->pmap,
12932 (addr64_t)old_entry->vme_start,
12933 (uint64_t)(old_entry->vme_end - old_entry->vme_start));
12934 if (result) {
12935 panic("vm_map_fork_share: pmap_nest failed!");
12936 }
12937 }
12938 #endif /* !PMAP_FORK_NEST */
12939 #endif /* NO_NESTED_PMAP */
12940 } else if (object == VM_OBJECT_NULL) {
12941 object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
12942 old_entry->vme_start));
12943 VME_OFFSET_SET(old_entry, 0);
12944 VME_OBJECT_SET(old_entry, object, false, 0);
12945 old_entry->use_pmap = TRUE;
12946 // assert(!old_entry->needs_copy);
12947 } else if (object->copy_strategy !=
12948 MEMORY_OBJECT_COPY_SYMMETRIC) {
12949 /*
12950 * We are already using an asymmetric
12951 * copy, and therefore we already have
12952 * the right object.
12953 */
12954
12955 assert(!old_entry->needs_copy);
12956 } else if (old_entry->needs_copy || /* case 1 */
12957 object->shadowed || /* case 2 */
12958 (!object->true_share && /* case 3 */
12959 !old_entry->is_shared &&
12960 (object->vo_size >
12961 (vm_map_size_t)(old_entry->vme_end -
12962 old_entry->vme_start)))) {
12963 bool is_writable;
12964
12965 /*
12966 * We need to create a shadow.
12967 * There are three cases here.
12968 * In the first case, we need to
12969 * complete a deferred symmetrical
12970 * copy that we participated in.
12971 * In the second and third cases,
12972 * we need to create the shadow so
12973 * that changes that we make to the
12974 * object do not interfere with
12975 * any symmetrical copies which
12976 * have occurred (case 2) or which
12977 * might occur (case 3).
12978 *
12979 * The first case is when we had
12980 * deferred shadow object creation
12981 * via the entry->needs_copy mechanism.
12982 * This mechanism only works when
12983 * only one entry points to the source
12984 * object, and we are about to create
12985 * a second entry pointing to the
12986 * same object. The problem is that
12987 * there is no way of mapping from
12988 * an object to the entries pointing
12989 * to it. (Deferred shadow creation
12990 * works with one entry because it occurs
12991 * at fault time, and we walk from the
12992 * entry to the object when handling
12993 * the fault.)
12994 *
12995 * The second case is when the object
12996 * to be shared has already been copied
12997 * with a symmetric copy, but we point
12998 * directly to the object without
12999 * needs_copy set in our entry. (This
13000 * can happen because different ranges
13001 * of an object can be pointed to by
13002 * different entries. In particular,
13003 * a single entry pointing to an object
13004 * can be split by a call to vm_inherit,
13005 * which, combined with task_create, can
13006 * result in the different entries
13007 * having different needs_copy values.)
13008 * The shadowed flag in the object allows
13009 * us to detect this case. The problem
13010 * with this case is that if this object
13011 * has or will have shadows, then we
13012 * must not perform an asymmetric copy
13013 * of this object, since such a copy
13014 * allows the object to be changed, which
13015 * will break the previous symmetrical
13016 * copies (which rely upon the object
13017 * not changing). In a sense, the shadowed
13018 * flag says "don't change this object".
13019 * We fix this by creating a shadow
13020 * object for this object, and sharing
13021 * that. This works because we are free
13022 * to change the shadow object (and thus
13023 * to use an asymmetric copy strategy);
13024 * this is also semantically correct,
13025 * since this object is temporary, and
13026 * therefore a copy of the object is
13027 * as good as the object itself. (This
13028 * is not true for permanent objects,
13029 * since the pager needs to see changes,
13030 * which won't happen if the changes
13031 * are made to a copy.)
13032 *
13033 * The third case is when the object
13034 * to be shared has parts sticking
13035 * outside of the entry we're working
13036 * with, and thus may in the future
13037 * be subject to a symmetrical copy.
13038 * (This is a preemptive version of
13039 * case 2.)
13040 */
13041 VME_OBJECT_SHADOW(old_entry,
13042 (vm_map_size_t) (old_entry->vme_end -
13043 old_entry->vme_start),
13044 vm_map_always_shadow(old_map));
13045
13046 /*
13047 * If we're making a shadow for other than
13048 * copy on write reasons, then we have
13049 * to remove write permission.
13050 */
13051
13052 is_writable = false;
13053 if (old_entry->protection & VM_PROT_WRITE) {
13054 is_writable = true;
13055 #if __arm64e__
13056 } else if (old_entry->used_for_tpro) {
13057 is_writable = true;
13058 #endif /* __arm64e__ */
13059 }
13060 if (!old_entry->needs_copy && is_writable) {
13061 vm_prot_t prot;
13062
13063 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13064 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13065 __FUNCTION__, old_map, old_map->pmap,
13066 old_entry,
13067 (uint64_t)old_entry->vme_start,
13068 (uint64_t)old_entry->vme_end,
13069 old_entry->protection);
13070 }
13071
13072 prot = old_entry->protection & ~VM_PROT_WRITE;
13073
13074 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13075 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13076 __FUNCTION__, old_map, old_map->pmap,
13077 old_entry,
13078 (uint64_t)old_entry->vme_start,
13079 (uint64_t)old_entry->vme_end,
13080 prot);
13081 }
13082
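/*
 * If the NX policy is overridden for this entry's alias (and some
 * access remains), re-grant execute permission on the downgraded
 * protections.
 */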
13083 if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
13084 prot |= VM_PROT_EXECUTE;
13085 }
13086
13087
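/*
 * Push the downgraded protections into the hardware mappings.
 * If this map's memory may also be mapped in other pmaps, go
 * through the object so every pmap is updated; otherwise a
 * direct pmap_protect() on this map's pmap is enough.
 */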
13088 if (old_map->mapped_in_other_pmaps) {
13089 vm_object_pmap_protect(
13090 VME_OBJECT(old_entry),
13091 VME_OFFSET(old_entry),
13092 (old_entry->vme_end -
13093 old_entry->vme_start),
13094 PMAP_NULL,
13095 PAGE_SIZE,
13096 old_entry->vme_start,
13097 prot);
13098 } else {
13099 pmap_protect(old_map->pmap,
13100 old_entry->vme_start,
13101 old_entry->vme_end,
13102 prot);
13103 }
13104 }
13105
13106 old_entry->needs_copy = FALSE;
13107 object = VME_OBJECT(old_entry);
13108 }
13109
13110
13111 /*
13112 * If the object was using a symmetric copy strategy,
13113 * change its copy strategy to the default
13114 * asymmetric copy strategy, which is copy_delay.
13115 * Bump the reference count for the
13116 * new entry.
13118 */
13119
13120 if (old_entry->is_sub_map) {
13121 vm_map_reference(VME_SUBMAP(old_entry));
13122 } else {
13123 vm_object_lock(object);
13124 vm_object_reference_locked(object);
13125 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
13126 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
13127 }
13128 vm_object_unlock(object);
13129 }
13130
13131 /*
13132 * Clone the entry, using object ref from above.
13133 * Mark both entries as shared.
13134 */
13135
13136 new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
13137 vm_map_entry_copy(old_map, new_entry, old_entry);
13138 old_entry->is_shared = TRUE;
13139 new_entry->is_shared = TRUE;
13140
13141 /*
13142 * We're dealing with a shared mapping, so the resulting mapping
13143 * should inherit some of the original mapping's accounting settings.
13144 * "iokit_acct" should have been cleared in vm_map_entry_copy().
13145 * "use_pmap" should stay the same as before (if it hasn't been reset
13146 * to TRUE when we cleared "iokit_acct").
13147 */
13148 assert(!new_entry->iokit_acct);
13149
13150 /*
13151 * If the old entry's inheritance is VM_INHERIT_NONE,
13152 * the new entry is for a corpse fork, so remove
13153 * write permission from the new entry.
13154 */
13155 if (old_entry->inheritance == VM_INHERIT_NONE) {
13156 new_entry->protection &= ~VM_PROT_WRITE;
13157 new_entry->max_protection &= ~VM_PROT_WRITE;
13158 }
13159
13160 /*
13161 * Insert the entry into the new map -- we
13162 * know we're inserting at the end of the new
13163 * map.
13164 */
13165
13166 vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
13167 VM_MAP_KERNEL_FLAGS_NONE);
13168
13169 /*
13170 * Update the physical map
13171 */
13172
13173 if (old_entry->is_sub_map) {
13174 /* Bill Angell pmap support goes here */
13175 } else {
13176 pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
13177 old_entry->vme_end - old_entry->vme_start,
13178 old_entry->vme_start);
13179 }
13180 }
13181
13182 static boolean_t
13183 vm_map_fork_copy(
13184 vm_map_t old_map,
13185 vm_map_entry_t *old_entry_p,
13186 vm_map_t new_map,
13187 int vm_map_copyin_flags)
13188 {
13189 vm_map_entry_t old_entry = *old_entry_p;
13190 vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
13191 vm_map_offset_t start = old_entry->vme_start;
13192 vm_map_copy_t copy;
13193 vm_map_entry_t last = vm_map_last_entry(new_map);
13194
13195 vm_map_unlock(old_map);
13196 /*
13197 * Use maxprot version of copyin because we
13198 * care about whether this memory can ever
13199 * be accessed, not just whether it's accessible
13200 * right now.
13201 */
13202 vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
13203 if (vm_map_copyin_internal(old_map, start, entry_size,
13204 vm_map_copyin_flags, &copy)
13205 != KERN_SUCCESS) {
13206 /*
13207 * The map might have changed while it
13208 * was unlocked, check it again. Skip
13209 * any blank space or permanently
13210 * unreadable region.
13211 */
13212 vm_map_lock(old_map);
13213 if (!vm_map_lookup_entry(old_map, start, &last) ||
13214 (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
13215 last = last->vme_next;
13216 }
13217 *old_entry_p = last;
13218
13219 /*
13220 * XXX For some error returns, want to
13221 * XXX skip to the next element. Note
13222 * that INVALID_ADDRESS and
13223 * PROTECTION_FAILURE are handled above.
13224 */
13225
13226 return FALSE;
13227 }
13228
13229 /*
13230 * Assert that the vm_map_copy is coming from the right
13231 * zone and hasn't been forged
13232 */
13233 vm_map_copy_require(copy);
13234
13235 /*
13236 * Insert the copy into the new map
13237 */
13238 vm_map_copy_insert(new_map, last, copy);
13239
13240 /*
13241 * Pick up the traversal at the end of
13242 * the copied region.
13243 */
13244
13245 vm_map_lock(old_map);
13246 start += entry_size;
13247 if (!vm_map_lookup_entry(old_map, start, &last)) {
13248 last = last->vme_next;
13249 } else {
13250 if (last->vme_start == start) {
13251 /*
13252 * No need to clip here and we don't
13253 * want to cause any unnecessary
13254 * unnesting...
13255 */
13256 } else {
13257 vm_map_clip_start(old_map, last, start);
13258 }
13259 }
13260 *old_entry_p = last;
13261
13262 return TRUE;
13263 }
13264
13265 #if PMAP_FORK_NEST
13266 #define PMAP_FORK_NEST_DEBUG 0
13267 static inline void
13268 vm_map_fork_unnest(
13269 pmap_t new_pmap,
13270 vm_map_offset_t pre_nested_start,
13271 vm_map_offset_t pre_nested_end,
13272 vm_map_offset_t start,
13273 vm_map_offset_t end)
13274 {
13275 kern_return_t kr;
13276 vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13277
13278 assertf(pre_nested_start <= pre_nested_end,
13279 "pre_nested start 0x%llx end 0x%llx",
13280 (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13281 assertf(start <= end,
13282 "start 0x%llx end 0x%llx",
13283 (uint64_t) start, (uint64_t)end);
13284
13285 if (pre_nested_start == pre_nested_end) {
13286 /* nothing was pre-nested: done */
13287 return;
13288 }
13289 if (end <= pre_nested_start) {
13290 /* fully before pre-nested range: done */
13291 return;
13292 }
13293 if (start >= pre_nested_end) {
13294 /* fully after pre-nested range: done */
13295 return;
13296 }
13297 /* ignore parts of range outside of pre_nested range */
13298 if (start < pre_nested_start) {
13299 start = pre_nested_start;
13300 }
13301 if (end > pre_nested_end) {
13302 end = pre_nested_end;
13303 }
13304 nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13305 start_unnest = start & ~nesting_mask;
13306 end_unnest = (end + nesting_mask) & ~nesting_mask;
13307 kr = pmap_unnest(new_pmap,
13308 (addr64_t)start_unnest,
13309 (uint64_t)(end_unnest - start_unnest));
13310 #if PMAP_FORK_NEST_DEBUG
13311 printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13312 #endif /* PMAP_FORK_NEST_DEBUG */
13313 assertf(kr == KERN_SUCCESS,
13314 "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13315 (uint64_t)start, (uint64_t)end, new_pmap,
13316 (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13317 kr);
13318 }
13319 #endif /* PMAP_FORK_NEST */
13320
13321 void
13322 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13323 {
13324 new_map->size_limit = old_map->size_limit;
13325 new_map->data_limit = old_map->data_limit;
13326 new_map->user_wire_limit = old_map->user_wire_limit;
13327 new_map->reserved_regions = old_map->reserved_regions;
13328 }
13329
13330 /*
13331 * vm_map_fork:
13332 *
13333 * Create and return a new map based on the old
13334 * map, according to the inheritance values on the
13335 * regions in that map and the options.
13336 *
13337 * The source map must not be locked.
13338 */
13339 vm_map_t
13340 vm_map_fork(
13341 ledger_t ledger,
13342 vm_map_t old_map,
13343 int options)
13344 {
13345 pmap_t new_pmap;
13346 vm_map_t new_map;
13347 vm_map_entry_t old_entry;
13348 vm_map_size_t new_size = 0, entry_size;
13349 vm_map_entry_t new_entry;
13350 boolean_t src_needs_copy;
13351 boolean_t new_entry_needs_copy;
13352 boolean_t pmap_is64bit;
13353 int vm_map_copyin_flags;
13354 vm_inherit_t old_entry_inheritance;
13355 int map_create_options;
13356 kern_return_t footprint_collect_kr;
13357
13358 if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13359 VM_MAP_FORK_PRESERVE_PURGEABLE |
13360 VM_MAP_FORK_CORPSE_FOOTPRINT |
13361 VM_MAP_FORK_SHARE_IF_OWNED)) {
13362 /* unsupported option */
13363 return VM_MAP_NULL;
13364 }
13365
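/*
 * The child's pmap must match the parent's address space width.
 */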
13366 pmap_is64bit =
13367 #if defined(__i386__) || defined(__x86_64__)
13368 old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13369 #elif defined(__arm64__)
13370 old_map->pmap->is_64bit;
13371 #else
13372 #error Unknown architecture.
13373 #endif
13374
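/*
 * Carry the parent pmap's creation attributes (64-bit, and, where
 * configured, JOP, Rosetta, forced 4K pages) over to the child pmap.
 */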
13375 unsigned int pmap_flags = 0;
13376 pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13377 #if defined(HAS_APPLE_PAC)
13378 pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13379 #endif
13380 #if CONFIG_ROSETTA
13381 pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13382 #endif
13383 #if PMAP_CREATE_FORCE_4K_PAGES
13384 if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13385 PAGE_SIZE != FOURK_PAGE_SIZE) {
13386 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13387 }
13388 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13389 new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13390 if (new_pmap == NULL) {
13391 return VM_MAP_NULL;
13392 }
13393
13394 vm_map_reference(old_map);
13395 vm_map_lock(old_map);
13396
13397 map_create_options = 0;
13398 if (old_map->hdr.entries_pageable) {
13399 map_create_options |= VM_MAP_CREATE_PAGEABLE;
13400 }
13401 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13402 map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13403 footprint_collect_kr = KERN_SUCCESS;
13404 }
13405 new_map = vm_map_create_options(new_pmap,
13406 old_map->min_offset,
13407 old_map->max_offset,
13408 map_create_options);
13409
13410 /* inherit cs_enforcement */
13411 vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13412
13413 vm_map_lock(new_map);
13414 vm_commit_pagezero_status(new_map);
13415 /* inherit the parent map's page size */
13416 vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13417
13418 /* inherit the parent rlimits */
13419 vm_map_inherit_limits(new_map, old_map);
13420
13421 #if CONFIG_MAP_RANGES
13422 /* inherit the parent map's VM ranges */
13423 vm_map_range_fork(new_map, old_map);
13424 #endif
13425
13426 #if CODE_SIGNING_MONITOR
13427 /* Prepare the monitor for the fork */
13428 csm_fork_prepare(old_map->pmap, new_pmap);
13429 #endif
13430
13431 #if PMAP_FORK_NEST
13432 /*
13433 * Pre-nest the shared region's pmap.
13434 */
13435 vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13436 pmap_fork_nest(old_map->pmap, new_pmap,
13437 &pre_nested_start, &pre_nested_end);
13438 #if PMAP_FORK_NEST_DEBUG
13439 printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13440 #endif /* PMAP_FORK_NEST_DEBUG */
13441 #endif /* PMAP_FORK_NEST */
13442
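/*
 * Walk every entry of the parent map and handle it according to
 * its (possibly overridden) inheritance: skip, share, or copy.
 */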
13443 for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13444 /*
13445 * Abort any corpse collection if the system is shutting down.
13446 */
13447 if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13448 get_system_inshutdown()) {
13449 #if PMAP_FORK_NEST
13450 new_entry = vm_map_last_entry(new_map);
13451 if (new_entry == vm_map_to_entry(new_map)) {
13452 /* unnest all that was pre-nested */
13453 vm_map_fork_unnest(new_pmap,
13454 pre_nested_start, pre_nested_end,
13455 vm_map_min(new_map), vm_map_max(new_map));
13456 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13457 /* unnest hole at the end, if pre-nested */
13458 vm_map_fork_unnest(new_pmap,
13459 pre_nested_start, pre_nested_end,
13460 new_entry->vme_end, vm_map_max(new_map));
13461 }
13462 #endif /* PMAP_FORK_NEST */
13463 vm_map_corpse_footprint_collect_done(new_map);
13464 vm_map_unlock(new_map);
13465 vm_map_unlock(old_map);
13466 vm_map_deallocate(new_map);
13467 vm_map_deallocate(old_map);
13468 printf("Aborting corpse map due to system shutdown\n");
13469 return VM_MAP_NULL;
13470 }
13471
13472 entry_size = old_entry->vme_end - old_entry->vme_start;
13473
13474 #if PMAP_FORK_NEST
13475 /*
13476 * Undo any unnecessary pre-nesting.
13477 */
13478 vm_map_offset_t prev_end;
13479 if (old_entry == vm_map_first_entry(old_map)) {
13480 prev_end = vm_map_min(old_map);
13481 } else {
13482 prev_end = old_entry->vme_prev->vme_end;
13483 }
13484 if (prev_end < old_entry->vme_start) {
13485 /* unnest hole before this entry, if pre-nested */
13486 vm_map_fork_unnest(new_pmap,
13487 pre_nested_start, pre_nested_end,
13488 prev_end, old_entry->vme_start);
13489 }
13490 if (old_entry->is_sub_map && old_entry->use_pmap) {
13491 /* keep this entry nested in the child */
13492 #if PMAP_FORK_NEST_DEBUG
13493 printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13494 #endif /* PMAP_FORK_NEST_DEBUG */
13495 } else {
13496 /* undo nesting for this entry, if pre-nested */
13497 vm_map_fork_unnest(new_pmap,
13498 pre_nested_start, pre_nested_end,
13499 old_entry->vme_start, old_entry->vme_end);
13500 }
13501 #endif /* PMAP_FORK_NEST */
13502
13503 old_entry_inheritance = old_entry->inheritance;
13504 /*
13505 * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
13506 * share readable VM_INHERIT_NONE entries that are not backed by a
13507 * device pager.
13508 */
13509 if (old_entry_inheritance == VM_INHERIT_NONE &&
13510 (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13511 (old_entry->protection & VM_PROT_READ) &&
13512 !(!old_entry->is_sub_map &&
13513 VME_OBJECT(old_entry) != NULL &&
13514 VME_OBJECT(old_entry)->pager != NULL &&
13515 is_device_pager_ops(
13516 VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13517 old_entry_inheritance = VM_INHERIT_SHARE;
13518 }
13519 if (old_entry_inheritance == VM_INHERIT_COPY &&
13520 (options & VM_MAP_FORK_SHARE_IF_OWNED) &&
13521 !old_entry->is_sub_map &&
13522 VME_OBJECT(old_entry) != VM_OBJECT_NULL) {
13523 vm_object_t object;
13524 task_t owner;
13525 object = VME_OBJECT(old_entry);
13526 owner = VM_OBJECT_OWNER(object);
13527 if (owner != TASK_NULL &&
13528 owner->map == old_map) {
13529 /*
13530 * This mapping points at a VM object owned
13531 * by the task being forked.
13532 * Some tools reporting memory accounting
13533 * info rely on the object ID, so share this
13534 * mapping instead of copying, to make the
13535 * corpse look exactly like the original
13536 * task in that respect.
13537 */
13538 assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
13539 old_entry_inheritance = VM_INHERIT_SHARE;
13540 }
13541 }
13542
13543 if (old_entry_inheritance != VM_INHERIT_NONE &&
13544 (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13545 footprint_collect_kr == KERN_SUCCESS) {
13546 /*
13547 * The corpse won't have old_map->pmap to query
13548 * footprint information, so collect that data now
13549 * and store it in new_map->vmmap_corpse_footprint
13550 * for later autopsy.
13551 */
13552 footprint_collect_kr =
13553 vm_map_corpse_footprint_collect(old_map,
13554 old_entry,
13555 new_map);
13556 }
13557
13558 switch (old_entry_inheritance) {
13559 case VM_INHERIT_NONE:
13560 break;
13561
13562 case VM_INHERIT_SHARE:
13563 vm_map_fork_share(old_map, old_entry, new_map);
13564 new_size += entry_size;
13565 break;
13566
13567 case VM_INHERIT_COPY:
13568
13569 /*
13570 * Inline the copy_quickly case;
13571 * upon failure, fall back on call
13572 * to vm_map_fork_copy.
13573 */
13574
13575 if (old_entry->is_sub_map) {
13576 break;
13577 }
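/*
 * Wired entries and truly-shared objects can't take the quick
 * symmetric-copy path below; use the slow copy path instead.
 */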
13578 if ((old_entry->wired_count != 0) ||
13579 ((VME_OBJECT(old_entry) != NULL) &&
13580 (VME_OBJECT(old_entry)->true_share))) {
13581 goto slow_vm_map_fork_copy;
13582 }
13583
13584 new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13585 vm_map_entry_copy(old_map, new_entry, old_entry);
13586 if (old_entry->vme_permanent) {
13587 /* inherit "permanent" on fork() */
13588 new_entry->vme_permanent = TRUE;
13589 }
13590
13591 if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13592 new_map->jit_entry_exists = TRUE;
13593 }
13594
13595 if (new_entry->is_sub_map) {
13596 /* clear address space specifics */
13597 new_entry->use_pmap = FALSE;
13598 } else {
13599 /*
13600 * We're dealing with a copy-on-write operation,
13601 * so the resulting mapping should not inherit
13602 * the original mapping's accounting settings.
13603 * "iokit_acct" should have been cleared in
13604 * vm_map_entry_copy().
13605 * "use_pmap" should be reset to its default
13606 * (TRUE) so that the new mapping gets
13607 * accounted for in the task's memory footprint.
13608 */
13609 assert(!new_entry->iokit_acct);
13610 new_entry->use_pmap = TRUE;
13611 }
13612
13613 if (!vm_object_copy_quickly(
13614 VME_OBJECT(new_entry),
13615 VME_OFFSET(old_entry),
13616 (old_entry->vme_end -
13617 old_entry->vme_start),
13618 &src_needs_copy,
13619 &new_entry_needs_copy)) {
13620 vm_map_entry_dispose(new_entry);
13621 goto slow_vm_map_fork_copy;
13622 }
13623
13624 /*
13625 * Handle copy-on-write obligations
13626 */
13627
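/*
 * The quick copy left the source object needing copy-on-write
 * protection: strip write access from the parent's existing
 * mappings so future writes fault and copy, then mark the
 * parent entry "needs_copy".
 */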
13628 if (src_needs_copy && !old_entry->needs_copy) {
13629 vm_prot_t prot;
13630
13631 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13632 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13633 __FUNCTION__,
13634 old_map, old_map->pmap, old_entry,
13635 (uint64_t)old_entry->vme_start,
13636 (uint64_t)old_entry->vme_end,
13637 old_entry->protection);
13638 }
13639
13640 prot = old_entry->protection & ~VM_PROT_WRITE;
13641
13642 if (override_nx(old_map, VME_ALIAS(old_entry))
13643 && prot) {
13644 prot |= VM_PROT_EXECUTE;
13645 }
13646
13647 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13648 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13649 __FUNCTION__,
13650 old_map, old_map->pmap, old_entry,
13651 (uint64_t)old_entry->vme_start,
13652 (uint64_t)old_entry->vme_end,
13653 prot);
13654 }
13655
13656 vm_object_pmap_protect(
13657 VME_OBJECT(old_entry),
13658 VME_OFFSET(old_entry),
13659 (old_entry->vme_end -
13660 old_entry->vme_start),
13661 ((old_entry->is_shared
13662 || old_map->mapped_in_other_pmaps)
13663 ? PMAP_NULL :
13664 old_map->pmap),
13665 VM_MAP_PAGE_SIZE(old_map),
13666 old_entry->vme_start,
13667 prot);
13668
13669 assert(old_entry->wired_count == 0);
13670 old_entry->needs_copy = TRUE;
13671 }
13672 new_entry->needs_copy = new_entry_needs_copy;
13673
13674 /*
13675 * Insert the entry at the end
13676 * of the map.
13677 */
13678
13679 vm_map_store_entry_link(new_map,
13680 vm_map_last_entry(new_map),
13681 new_entry,
13682 VM_MAP_KERNEL_FLAGS_NONE);
13683 new_size += entry_size;
13684 break;
13685
13686 slow_vm_map_fork_copy:
13687 vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13688 if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13689 vm_map_copyin_flags |=
13690 VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13691 }
13692 if (vm_map_fork_copy(old_map,
13693 &old_entry,
13694 new_map,
13695 vm_map_copyin_flags)) {
13696 new_size += entry_size;
13697 }
13698 continue;
13699 }
13700 old_entry = old_entry->vme_next;
13701 }
13702
13703 #if PMAP_FORK_NEST
13704 new_entry = vm_map_last_entry(new_map);
13705 if (new_entry == vm_map_to_entry(new_map)) {
13706 /* unnest all that was pre-nested */
13707 vm_map_fork_unnest(new_pmap,
13708 pre_nested_start, pre_nested_end,
13709 vm_map_min(new_map), vm_map_max(new_map));
13710 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13711 /* unnest hole at the end, if pre-nested */
13712 vm_map_fork_unnest(new_pmap,
13713 pre_nested_start, pre_nested_end,
13714 new_entry->vme_end, vm_map_max(new_map));
13715 }
13716 #endif /* PMAP_FORK_NEST */
13717
13718 #if defined(__arm64__)
13719 pmap_insert_commpage(new_map->pmap);
13720 #endif /* __arm64__ */
13721
13722 new_map->size = new_size;
13723
13724 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13725 vm_map_corpse_footprint_collect_done(new_map);
13726 }
13727
13728 /* Propagate JIT entitlement for the pmap layer. */
13729 if (pmap_get_jit_entitled(old_map->pmap)) {
13730 /* Tell the pmap that it supports JIT. */
13731 pmap_set_jit_entitled(new_map->pmap);
13732 }
13733
13734 /* Propagate TPRO settings for the pmap layer */
13735 if (pmap_get_tpro(old_map->pmap)) {
13736 /* Tell the pmap that it supports TPRO */
13737 pmap_set_tpro(new_map->pmap);
13738 }
13739
13740
13741 vm_map_unlock(new_map);
13742 vm_map_unlock(old_map);
13743 vm_map_deallocate(old_map);
13744
13745 return new_map;
13746 }
13747
13748 /*
13749 * vm_map_exec:
13750 *
13751 * Set up the "new_map" with the proper execution environment according
13752 * to the type of executable (platform, 64-bit, chroot environment).
13753 * Map the comm page and shared region, etc...
13754 */
13755 kern_return_t
13756 vm_map_exec(
13757 vm_map_t new_map,
13758 task_t task,
13759 boolean_t is64bit,
13760 void *fsroot,
13761 cpu_type_t cpu,
13762 cpu_subtype_t cpu_subtype,
13763 boolean_t reslide,
13764 boolean_t is_driverkit,
13765 uint32_t rsr_version)
13766 {
13767 SHARED_REGION_TRACE_DEBUG(
13768 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13769 (void *)VM_KERNEL_ADDRPERM(current_task()),
13770 (void *)VM_KERNEL_ADDRPERM(new_map),
13771 (void *)VM_KERNEL_ADDRPERM(task),
13772 (void *)VM_KERNEL_ADDRPERM(fsroot),
13773 cpu,
13774 cpu_subtype));
13775 (void) vm_commpage_enter(new_map, task, is64bit);
13776
13777 (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13778
13779 SHARED_REGION_TRACE_DEBUG(
13780 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13781 (void *)VM_KERNEL_ADDRPERM(current_task()),
13782 (void *)VM_KERNEL_ADDRPERM(new_map),
13783 (void *)VM_KERNEL_ADDRPERM(task),
13784 (void *)VM_KERNEL_ADDRPERM(fsroot),
13785 cpu,
13786 cpu_subtype));
13787
13788 /*
13789 * Some devices have region(s) of memory that shouldn't get allocated by
13790 * user processes. The following code creates dummy vm_map_entry_t's for each
13791 * of the regions that need to be reserved to prevent any allocations in
13792 * those regions.
13793 */
13794 kern_return_t kr = KERN_FAILURE;
13795 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13796 vmk_flags.vmkf_beyond_max = true;
13797
13798 const struct vm_reserved_region *regions = NULL;
13799 size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13800 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13801
13802 for (size_t i = 0; i < num_regions; ++i) {
13803 vm_map_offset_t address = regions[i].vmrr_addr;
13804
13805 kr = vm_map_enter(
13806 new_map,
13807 &address,
13808 regions[i].vmrr_size,
13809 (vm_map_offset_t)0,
13810 vmk_flags,
13811 VM_OBJECT_NULL,
13812 (vm_object_offset_t)0,
13813 FALSE,
13814 VM_PROT_NONE,
13815 VM_PROT_NONE,
13816 VM_INHERIT_COPY);
13817
13818 if (kr != KERN_SUCCESS) {
13819 panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13820 }
13821 }
13822
13823 new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13824
13825 return KERN_SUCCESS;
13826 }
13827
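/*
 * Counters for the various copy paths taken by
 * vm_map_lookup_and_lock_object() below.
 */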
13828 uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
13829 uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
13830 uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
13831 uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
13832 uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
13833 uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
13834 uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
13835 uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
13836 uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
13837 uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
13838 uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
13839 uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
13840 uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13841 /*
13842 * vm_map_lookup_and_lock_object:
13843 *
13844 * Finds the VM object, offset, and
13845 * protection for a given virtual address in the
13846 * specified map, assuming a page fault of the
13847 * type specified.
13848 *
13849 * Returns the (object, offset, protection) for
13850 * this address, whether it is wired down, and whether
13851 * this map has the only reference to the data in question.
13852 * In order to later verify this lookup, a "version"
13853 * is returned.
13854 * If contended != NULL, *contended will be set to
13855 * true iff the thread had to spin or block to acquire
13856 * an exclusive lock.
13857 *
13858 * The map MUST be locked by the caller and WILL be
13859 * locked on exit. In order to guarantee the
13860 * existence of the returned object, it is returned
13861 * locked.
13862 *
13863 * If a lookup is requested with "write protection"
13864 * specified, the map may be changed to perform virtual
13865 * copying operations, although the data referenced will
13866 * remain the same.
13867 */
13868 kern_return_t
13869 vm_map_lookup_and_lock_object(
13870 vm_map_t *var_map, /* IN/OUT */
13871 vm_map_offset_t vaddr,
13872 vm_prot_t fault_type,
13873 int object_lock_type,
13874 vm_map_version_t *out_version, /* OUT */
13875 vm_object_t *object, /* OUT */
13876 vm_object_offset_t *offset, /* OUT */
13877 vm_prot_t *out_prot, /* OUT */
13878 boolean_t *wired, /* OUT */
13879 vm_object_fault_info_t fault_info, /* OUT */
13880 vm_map_t *real_map, /* OUT */
13881 bool *contended) /* OUT */
13882 {
13883 vm_map_entry_t entry;
13884 vm_map_t map = *var_map;
13885 vm_map_t old_map = *var_map;
13886 vm_map_t cow_sub_map_parent = VM_MAP_NULL;
13887 vm_map_offset_t cow_parent_vaddr = 0;
13888 vm_map_offset_t old_start = 0;
13889 vm_map_offset_t old_end = 0;
13890 vm_prot_t prot;
13891 boolean_t mask_protections;
13892 boolean_t force_copy;
13893 boolean_t no_force_copy_if_executable;
13894 boolean_t submap_needed_copy;
13895 vm_prot_t original_fault_type;
13896 vm_map_size_t fault_page_mask;
13897
13898 /*
13899 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13900 * as a mask against the mapping's actual protections, not as an
13901 * absolute value.
13902 */
13903 mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13904 force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13905 no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13906 fault_type &= VM_PROT_ALL;
13907 original_fault_type = fault_type;
13908 if (contended) {
13909 *contended = false;
13910 }
13911
13912 *real_map = map;
13913
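/*
 * Fault lookups operate on the smaller of the map's page size and
 * the kernel page size; truncate the fault address accordingly.
 */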
13914 fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13915 vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13916
13917 RetryLookup:
13918 fault_type = original_fault_type;
13919
13920 /*
13921 * If the map has an interesting hint, try it before calling
13922 * full blown lookup routine.
13923 */
13924 entry = map->hint;
13925
13926 if ((entry == vm_map_to_entry(map)) ||
13927 (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13928 vm_map_entry_t tmp_entry;
13929
13930 /*
13931 * Entry was either not a valid hint, or the vaddr
13932 * was not contained in the entry, so do a full lookup.
13933 */
13934 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13935 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13936 vm_map_unlock(cow_sub_map_parent);
13937 }
13938 if ((*real_map != map)
13939 && (*real_map != cow_sub_map_parent)) {
13940 vm_map_unlock(*real_map);
13941 }
13942 return KERN_INVALID_ADDRESS;
13943 }
13944
13945 entry = tmp_entry;
13946 }
13947 if (map == old_map) {
13948 old_start = entry->vme_start;
13949 old_end = entry->vme_end;
13950 }
13951
13952 /*
13953 * Handle submaps. Drop lock on upper map, submap is
13954 * returned locked.
13955 */
13956
13957 submap_needed_copy = FALSE;
13958 submap_recurse:
13959 if (entry->is_sub_map) {
13960 vm_map_offset_t local_vaddr;
13961 vm_map_offset_t end_delta;
13962 vm_map_offset_t start_delta;
13963 vm_map_offset_t top_entry_saved_start;
13964 vm_object_offset_t top_entry_saved_offset;
13965 vm_map_entry_t submap_entry, saved_submap_entry;
13966 vm_object_offset_t submap_entry_offset;
13967 vm_object_size_t submap_entry_size;
13968 vm_prot_t subentry_protection;
13969 vm_prot_t subentry_max_protection;
13970 boolean_t subentry_no_copy_on_read;
13971 boolean_t subentry_permanent;
13972 boolean_t subentry_csm_associated;
13973 #if __arm64e__
13974 boolean_t subentry_used_for_tpro;
13975 #endif /* __arm64e__ */
13976 boolean_t mapped_needs_copy = FALSE;
13977 vm_map_version_t version;
13978
13979 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13980 "map %p (%d) entry %p submap %p (%d)\n",
13981 map, VM_MAP_PAGE_SHIFT(map), entry,
13982 VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13983
13984 local_vaddr = vaddr;
13985 top_entry_saved_start = entry->vme_start;
13986 top_entry_saved_offset = VME_OFFSET(entry);
13987
13988 if ((entry->use_pmap &&
13989 !((fault_type & VM_PROT_WRITE) ||
13990 force_copy))) {
13991 /* if real_map equals map we unlock below */
13992 if ((*real_map != map) &&
13993 (*real_map != cow_sub_map_parent)) {
13994 vm_map_unlock(*real_map);
13995 }
13996 *real_map = VME_SUBMAP(entry);
13997 }
13998
13999 if (entry->needs_copy &&
14000 ((fault_type & VM_PROT_WRITE) ||
14001 force_copy)) {
14002 if (!mapped_needs_copy) {
14003 if (vm_map_lock_read_to_write(map)) {
14004 vm_map_lock_read(map);
14005 *real_map = map;
14006 goto RetryLookup;
14007 }
14008 vm_map_lock_read(VME_SUBMAP(entry));
14009 *var_map = VME_SUBMAP(entry);
14010 cow_sub_map_parent = map;
14011 /* reset base to map before cow object */
14012 /* this is the map which will accept */
14013 /* the new cow object */
14014 old_start = entry->vme_start;
14015 old_end = entry->vme_end;
14016 cow_parent_vaddr = vaddr;
14017 mapped_needs_copy = TRUE;
14018 } else {
14019 vm_map_lock_read(VME_SUBMAP(entry));
14020 *var_map = VME_SUBMAP(entry);
14021 if ((cow_sub_map_parent != map) &&
14022 (*real_map != map)) {
14023 vm_map_unlock(map);
14024 }
14025 }
14026 } else {
14027 if (entry->needs_copy) {
14028 submap_needed_copy = TRUE;
14029 }
14030 vm_map_lock_read(VME_SUBMAP(entry));
14031 *var_map = VME_SUBMAP(entry);
14032 /* leave map locked if it is a target */
14033 /* cow sub_map above otherwise, just */
14034 /* follow the maps down to the object */
14035 /* here we unlock knowing we are not */
14036 /* revisiting the map. */
14037 if ((*real_map != map) && (map != cow_sub_map_parent)) {
14038 vm_map_unlock_read(map);
14039 }
14040 }
14041
14042 entry = NULL;
14043 map = *var_map;
14044
14045 /* calculate the offset in the submap for vaddr */
14046 local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14047 assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14048 "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14049 (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
14050
14051 RetrySubMap:
14052 if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14053 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14054 vm_map_unlock(cow_sub_map_parent);
14055 }
14056 if ((*real_map != map)
14057 && (*real_map != cow_sub_map_parent)) {
14058 vm_map_unlock(*real_map);
14059 }
14060 *real_map = map;
14061 return KERN_INVALID_ADDRESS;
14062 }
14063
14064 /* find the attenuated shadow of the underlying object */
14065 /* on our target map */
14066
14067 /* In plain English: the submap object may extend beyond the */
14068 /* region mapped by the entry, or may fill only a portion */
14069 /* of it. For our purposes, we only care if the object */
14070 /* doesn't fill the entry. In that case the area which will */
14071 /* ultimately be clipped in the top map only needs */
14072 /* to be as big as the portion of the underlying entry */
14073 /* which is mapped */
14074 start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14075 submap_entry->vme_start - top_entry_saved_offset : 0;
14076
14077 end_delta =
14078 (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14079 submap_entry->vme_end ?
14080 0 : (top_entry_saved_offset +
14081 (old_end - old_start))
14082 - submap_entry->vme_end;
14083
14084 old_start += start_delta;
14085 old_end -= end_delta;
14086
14087 if (submap_entry->is_sub_map) {
14088 entry = submap_entry;
14089 vaddr = local_vaddr;
14090 goto submap_recurse;
14091 }
14092
14093 if (((fault_type & VM_PROT_WRITE) ||
14094 force_copy)
14095 && cow_sub_map_parent) {
14096 vm_object_t sub_object, copy_object;
14097 vm_object_offset_t copy_offset;
14098 vm_map_offset_t local_start;
14099 vm_map_offset_t local_end;
14100 boolean_t object_copied = FALSE;
14101 vm_object_offset_t object_copied_offset = 0;
14102 boolean_t object_copied_needs_copy = FALSE;
14103 kern_return_t kr = KERN_SUCCESS;
14104
14105 if (vm_map_lock_read_to_write(map)) {
14106 vm_map_lock_read(map);
14107 old_start -= start_delta;
14108 old_end += end_delta;
14109 goto RetrySubMap;
14110 }
14111
14112
14113 sub_object = VME_OBJECT(submap_entry);
14114 if (sub_object == VM_OBJECT_NULL) {
14115 sub_object =
14116 vm_object_allocate(
14117 (vm_map_size_t)
14118 (submap_entry->vme_end -
14119 submap_entry->vme_start));
14120 VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14121 VME_OFFSET_SET(submap_entry, 0);
14122 assert(!submap_entry->is_sub_map);
14123 assert(submap_entry->use_pmap);
14124 }
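/*
 * Compute the portion of the submap entry that corresponds to the
 * faulting range as seen from the COW parent, and clip the submap
 * entry down to exactly that range.
 */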
14125 local_start = local_vaddr -
14126 (cow_parent_vaddr - old_start);
14127 local_end = local_vaddr +
14128 (old_end - cow_parent_vaddr);
14129 vm_map_clip_start(map, submap_entry, local_start);
14130 vm_map_clip_end(map, submap_entry, local_end);
14131 if (submap_entry->is_sub_map) {
14132 /* unnesting was done when clipping */
14133 assert(!submap_entry->use_pmap);
14134 }
14135
14136 /* This is the COW case, let's connect */
14137 /* an entry in our space to the underlying */
14138 /* object in the submap, bypassing the */
14139 /* submap. */
14140 submap_entry_offset = VME_OFFSET(submap_entry);
14141 submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14142
14143 if ((submap_entry->wired_count != 0 ||
14144 sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14145 (submap_entry->protection & VM_PROT_EXECUTE) &&
14146 no_force_copy_if_executable) {
14147 // printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14148 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14149 vm_map_unlock(cow_sub_map_parent);
14150 }
14151 if ((*real_map != map)
14152 && (*real_map != cow_sub_map_parent)) {
14153 vm_map_unlock(*real_map);
14154 }
14155 *real_map = map;
14156 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14157 vm_map_lock_write_to_read(map);
14158 kr = KERN_PROTECTION_FAILURE;
14159 DTRACE_VM4(submap_no_copy_executable,
14160 vm_map_t, map,
14161 vm_object_offset_t, submap_entry_offset,
14162 vm_object_size_t, submap_entry_size,
14163 int, kr);
14164 return kr;
14165 }
14166
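/*
 * A wired submap entry can't be left with copy-on-write
 * semantics, so copy its pages eagerly with vm_object_copy_slowly().
 */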
14167 if (submap_entry->wired_count != 0) {
14168 vm_object_reference(sub_object);
14169
14170 assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14171 "submap_entry %p offset 0x%llx\n",
14172 submap_entry, VME_OFFSET(submap_entry));
14173
14174 DTRACE_VM6(submap_copy_slowly,
14175 vm_map_t, cow_sub_map_parent,
14176 vm_map_offset_t, vaddr,
14177 vm_map_t, map,
14178 vm_object_size_t, submap_entry_size,
14179 int, submap_entry->wired_count,
14180 int, sub_object->copy_strategy);
14181
14182 saved_submap_entry = submap_entry;
14183 version.main_timestamp = map->timestamp;
14184 vm_map_unlock(map); /* Increments timestamp by 1 */
14185 submap_entry = VM_MAP_ENTRY_NULL;
14186
14187 vm_object_lock(sub_object);
14188 kr = vm_object_copy_slowly(sub_object,
14189 submap_entry_offset,
14190 submap_entry_size,
14191 FALSE,
14192 &copy_object);
14193 object_copied = TRUE;
14194 object_copied_offset = 0;
14195 /* 4k: account for extra offset in physical page */
14196 object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14197 object_copied_needs_copy = FALSE;
14198 vm_object_deallocate(sub_object);
14199
14200 vm_map_lock(map);
14201
14202 if (kr != KERN_SUCCESS &&
14203 kr != KERN_MEMORY_RESTART_COPY) {
14204 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14205 vm_map_unlock(cow_sub_map_parent);
14206 }
14207 if ((*real_map != map)
14208 && (*real_map != cow_sub_map_parent)) {
14209 vm_map_unlock(*real_map);
14210 }
14211 *real_map = map;
14212 vm_object_deallocate(copy_object);
14213 copy_object = VM_OBJECT_NULL;
14214 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14215 vm_map_lock_write_to_read(map);
14216 DTRACE_VM4(submap_copy_error_slowly,
14217 vm_object_t, sub_object,
14218 vm_object_offset_t, submap_entry_offset,
14219 vm_object_size_t, submap_entry_size,
14220 int, kr);
14221 vm_map_lookup_and_lock_object_copy_slowly_error++;
14222 return kr;
14223 }
14224
14225 if ((kr == KERN_SUCCESS) &&
14226 (version.main_timestamp + 1) == map->timestamp) {
14227 submap_entry = saved_submap_entry;
14228 } else {
14229 saved_submap_entry = NULL;
14230 old_start -= start_delta;
14231 old_end += end_delta;
14232 vm_object_deallocate(copy_object);
14233 copy_object = VM_OBJECT_NULL;
14234 vm_map_lock_write_to_read(map);
14235 vm_map_lookup_and_lock_object_copy_slowly_restart++;
14236 goto RetrySubMap;
14237 }
14238 vm_map_lookup_and_lock_object_copy_slowly_count++;
14239 vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14240 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14241 vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14242 }
14243 } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
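/*
 * The object has its own (asymmetric) copy strategy:
 * let vm_object_copy_strategically() perform the copy.
 */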
14244 submap_entry_offset = VME_OFFSET(submap_entry);
14245 copy_object = VM_OBJECT_NULL;
14246 object_copied_offset = submap_entry_offset;
14247 object_copied_needs_copy = FALSE;
14248 DTRACE_VM6(submap_copy_strategically,
14249 vm_map_t, cow_sub_map_parent,
14250 vm_map_offset_t, vaddr,
14251 vm_map_t, map,
14252 vm_object_size_t, submap_entry_size,
14253 int, submap_entry->wired_count,
14254 int, sub_object->copy_strategy);
14255 kr = vm_object_copy_strategically(
14256 sub_object,
14257 submap_entry_offset,
14258 submap_entry->vme_end - submap_entry->vme_start,
14259 false, /* forking */
14260 &copy_object,
14261 &object_copied_offset,
14262 &object_copied_needs_copy);
14263 if (kr == KERN_MEMORY_RESTART_COPY) {
14264 old_start -= start_delta;
14265 old_end += end_delta;
14266 vm_object_deallocate(copy_object);
14267 copy_object = VM_OBJECT_NULL;
14268 vm_map_lock_write_to_read(map);
14269 vm_map_lookup_and_lock_object_copy_strategically_restart++;
14270 goto RetrySubMap;
14271 }
14272 if (kr != KERN_SUCCESS) {
14273 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14274 vm_map_unlock(cow_sub_map_parent);
14275 }
14276 if ((*real_map != map)
14277 && (*real_map != cow_sub_map_parent)) {
14278 vm_map_unlock(*real_map);
14279 }
14280 *real_map = map;
14281 vm_object_deallocate(copy_object);
14282 copy_object = VM_OBJECT_NULL;
14283 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14284 vm_map_lock_write_to_read(map);
14285 DTRACE_VM4(submap_copy_error_strategically,
14286 vm_object_t, sub_object,
14287 vm_object_offset_t, submap_entry_offset,
14288 vm_object_size_t, submap_entry_size,
14289 int, kr);
14290 vm_map_lookup_and_lock_object_copy_strategically_error++;
14291 return kr;
14292 }
14293 assert(copy_object != VM_OBJECT_NULL);
14294 assert(copy_object != sub_object);
14295 object_copied = TRUE;
14296 vm_map_lookup_and_lock_object_copy_strategically_count++;
14297 vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14298 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14299 vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14300 }
14301 } else {
14302 /* set up shadow object */
14303 object_copied = FALSE;
14304 copy_object = sub_object;
14305 vm_object_lock(sub_object);
14306 vm_object_reference_locked(sub_object);
14307 VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14308 vm_object_unlock(sub_object);
14309
14310 assert(submap_entry->wired_count == 0);
14311 submap_entry->needs_copy = TRUE;
14312
14313 prot = submap_entry->protection;
14314 if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14315 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14316 __FUNCTION__,
14317 map, map->pmap, submap_entry,
14318 (uint64_t)submap_entry->vme_start,
14319 (uint64_t)submap_entry->vme_end,
14320 prot);
14321 }
14322 prot = prot & ~VM_PROT_WRITE;
14323 if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14324 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14325 __FUNCTION__,
14326 map, map->pmap, submap_entry,
14327 (uint64_t)submap_entry->vme_start,
14328 (uint64_t)submap_entry->vme_end,
14329 prot);
14330 }
14331
14332 if (override_nx(old_map,
14333 VME_ALIAS(submap_entry))
14334 && prot) {
14335 prot |= VM_PROT_EXECUTE;
14336 }
14337
14338 vm_object_pmap_protect(
14339 sub_object,
14340 VME_OFFSET(submap_entry),
14341 submap_entry->vme_end -
14342 submap_entry->vme_start,
14343 (submap_entry->is_shared
14344 || map->mapped_in_other_pmaps) ?
14345 PMAP_NULL : map->pmap,
14346 VM_MAP_PAGE_SIZE(map),
14347 submap_entry->vme_start,
14348 prot);
14349 vm_map_lookup_and_lock_object_copy_shadow_count++;
14350 vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14351 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14352 vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14353 }
14354 }
14355
14356 /*
14357 * Adjust the fault offset to the submap entry.
14358 */
14359 copy_offset = (local_vaddr -
14360 submap_entry->vme_start +
14361 VME_OFFSET(submap_entry));
14362
14363 /* This works differently from the */
14364 /* normal submap case. We go back */
14365 /* to the parent of the cow map and */
14366 /* clip out the target portion of */
14367 /* the sub_map, substituting the */
14368 /* new copy object. */
14369
14370 subentry_protection = submap_entry->protection;
14371 subentry_max_protection = submap_entry->max_protection;
14372 subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14373 subentry_permanent = submap_entry->vme_permanent;
14374 subentry_csm_associated = submap_entry->csm_associated;
14375 #if __arm64e__
14376 subentry_used_for_tpro = submap_entry->used_for_tpro;
14377 #endif // __arm64e__
14378 vm_map_unlock(map);
14379 submap_entry = NULL; /* not valid after map unlock */
14380
14381 local_start = old_start;
14382 local_end = old_end;
14383 map = cow_sub_map_parent;
14384 *var_map = cow_sub_map_parent;
14385 vaddr = cow_parent_vaddr;
14386 cow_sub_map_parent = NULL;
14387
14388 if (!vm_map_lookup_entry(map,
14389 vaddr, &entry)) {
14390 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14391 vm_map_unlock(cow_sub_map_parent);
14392 }
14393 if ((*real_map != map)
14394 && (*real_map != cow_sub_map_parent)) {
14395 vm_map_unlock(*real_map);
14396 }
14397 *real_map = map;
14398 vm_object_deallocate(
14399 copy_object);
14400 copy_object = VM_OBJECT_NULL;
14401 vm_map_lock_write_to_read(map);
14402 DTRACE_VM4(submap_lookup_post_unlock,
14403 uint64_t, (uint64_t)entry->vme_start,
14404 uint64_t, (uint64_t)entry->vme_end,
14405 vm_map_offset_t, vaddr,
14406 int, object_copied);
14407 return KERN_INVALID_ADDRESS;
14408 }
14409
14410 /* clip out the portion of space */
14411 /* mapped by the sub map which */
14412 /* corresponds to the underlying */
14413 /* object */
14414
14415 /*
14416 * Clip (and unnest) the smallest nested chunk
14417 * possible around the faulting address...
14418 */
14419 local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14420 local_end = local_start + pmap_shared_region_size_min(map->pmap);
14421 /*
14422 * ... but don't go beyond the "old_start" to "old_end"
14423 * range, to avoid spanning over another VM region
14424 * with a possibly different VM object and/or offset.
14425 */
14426 if (local_start < old_start) {
14427 local_start = old_start;
14428 }
14429 if (local_end > old_end) {
14430 local_end = old_end;
14431 }
14432 /*
14433 * Adjust copy_offset to the start of the range.
14434 */
14435 copy_offset -= (vaddr - local_start);
14436
14437 vm_map_clip_start(map, entry, local_start);
14438 vm_map_clip_end(map, entry, local_end);
14439 if (entry->is_sub_map) {
14440 /* unnesting was done when clipping */
14441 assert(!entry->use_pmap);
14442 }
14443
14444 /* substitute copy object for */
14445 /* shared map entry */
14446 vm_map_deallocate(VME_SUBMAP(entry));
14447 assert(!entry->iokit_acct);
14448 entry->use_pmap = TRUE;
14449 VME_OBJECT_SET(entry, copy_object, false, 0);
14450
14451 /* propagate the submap entry's protections */
14452 if (entry->protection != VM_PROT_READ) {
14453 /*
14454 * Someone has already altered the top entry's
14455 * protections via vm_protect(VM_PROT_COPY).
14456 * Respect these new values and ignore the
14457 * submap entry's protections.
14458 */
14459 } else {
14460 /*
14461 * Regular copy-on-write: propagate the submap
14462 * entry's protections to the top map entry.
14463 */
14464 entry->protection |= subentry_protection;
14465 }
14466 entry->max_protection |= subentry_max_protection;
14467 /* propagate some attributes from subentry */
14468 entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14469 entry->vme_permanent = subentry_permanent;
14470 entry->csm_associated = subentry_csm_associated;
14471 #if __arm64e__
14472 /* propagate TPRO iff the destination map has TPRO enabled */
14473 if (subentry_used_for_tpro) {
14474 if (vm_map_tpro(map)) {
14475 entry->used_for_tpro = subentry_used_for_tpro;
14476 } else {
14477 /* "permanent" came from being TPRO */
14478 entry->vme_permanent = FALSE;
14479 }
14480 }
14481 #endif /* __arm64e__ */
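/*
 * Enforce W^X: if the resulting entry would be both writable and
 * executable (and no exemption applies), strip execute permission.
 */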
14482 if ((entry->protection & VM_PROT_WRITE) &&
14483 (entry->protection & VM_PROT_EXECUTE) &&
14484 #if XNU_TARGET_OS_OSX
14485 map->pmap != kernel_pmap &&
14486 (vm_map_cs_enforcement(map)
14487 #if __arm64__
14488 || !VM_MAP_IS_EXOTIC(map)
14489 #endif /* __arm64__ */
14490 ) &&
14491 #endif /* XNU_TARGET_OS_OSX */
14492 #if CODE_SIGNING_MONITOR
14493 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14494 #endif
14495 !(entry->used_for_jit) &&
14496 VM_MAP_POLICY_WX_STRIP_X(map)) {
14497 DTRACE_VM3(cs_wx,
14498 uint64_t, (uint64_t)entry->vme_start,
14499 uint64_t, (uint64_t)entry->vme_end,
14500 vm_prot_t, entry->protection);
14501 printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14502 proc_selfpid(),
14503 (get_bsdtask_info(current_task())
14504 ? proc_name_address(get_bsdtask_info(current_task()))
14505 : "?"),
14506 __FUNCTION__, __LINE__,
14507 #if DEVELOPMENT || DEBUG
14508 (uint64_t)entry->vme_start,
14509 (uint64_t)entry->vme_end,
14510 #else /* DEVELOPMENT || DEBUG */
14511 (uint64_t)0,
14512 (uint64_t)0,
14513 #endif /* DEVELOPMENT || DEBUG */
14514 entry->protection);
14515 entry->protection &= ~VM_PROT_EXECUTE;
14516 }
14517
14518 if (object_copied) {
14519 VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14520 entry->needs_copy = object_copied_needs_copy;
14521 entry->is_shared = FALSE;
14522 } else {
14523 assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14524 assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14525 assert(entry->wired_count == 0);
14526 VME_OFFSET_SET(entry, copy_offset);
14527 entry->needs_copy = TRUE;
14528 if (map != old_map) {
14529 entry->is_shared = TRUE;
14530 }
14531 }
14532 if (entry->inheritance == VM_INHERIT_SHARE) {
14533 entry->inheritance = VM_INHERIT_COPY;
14534 }
14535
14536 vm_map_lock_write_to_read(map);
14537 } else {
14538 if ((cow_sub_map_parent)
14539 && (cow_sub_map_parent != *real_map)
14540 && (cow_sub_map_parent != map)) {
14541 vm_map_unlock(cow_sub_map_parent);
14542 }
14543 entry = submap_entry;
14544 vaddr = local_vaddr;
14545 }
14546 }
14547
14548 /*
14549 * Check whether this task is allowed to have
14550 * this page.
14551 */
14552
14553 prot = entry->protection;
14554
14555 if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14556 /*
14557 * HACK -- if not a stack, then allow execution
14558 */
14559 prot |= VM_PROT_EXECUTE;
14560 }
14561
14562 #if __arm64e__
14563 /*
14564 * If the entry we're dealing with is TPRO and we have a write
14565 * fault, inject VM_PROT_WRITE into protections. This allows us
14566 * to maintain RO permissions when not marked as TPRO.
14567 */
14568 if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14569 prot |= VM_PROT_WRITE;
14570 }
14571 #endif /* __arm64e__ */
14572 if (mask_protections) {
14573 fault_type &= prot;
14574 if (fault_type == VM_PROT_NONE) {
14575 goto protection_failure;
14576 }
14577 }
14578 if (((fault_type & prot) != fault_type)
14579 #if __arm64__
14580 /* prefetch abort in execute-only page */
14581 && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14582 #elif defined(__x86_64__)
14583 /* Consider the UEXEC bit when handling an EXECUTE fault */
14584 && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14585 #endif
14586 ) {
14587 protection_failure:
14588 if (*real_map != map) {
14589 vm_map_unlock(*real_map);
14590 }
14591 *real_map = map;
14592
14593 if ((fault_type & VM_PROT_EXECUTE) && prot) {
14594 log_stack_execution_failure((addr64_t)vaddr, prot);
14595 }
14596
14597 DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14598 DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14599 /*
14600 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14601 *
14602 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14603 */
14604 return KERN_PROTECTION_FAILURE;
14605 }
14606
14607 /*
14608 * If this page is not pageable, we have to get
14609 * it for all possible accesses.
14610 */
14611
14612 *wired = (entry->wired_count != 0);
14613 if (*wired) {
14614 fault_type = prot;
14615 }
14616
14617 /*
14618 * If the entry was copy-on-write, we either shadow it now (writes) or demote the returned protections (reads).
14619 */
14620
14621 if (entry->needs_copy) {
14622 /*
14623 * If we want to write the page, we may as well
14624 * handle that now since we've got the map locked.
14625 *
14626 * If we don't need to write the page, we just
14627 * demote the permissions allowed.
14628 */
14629
14630 if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14631 /*
14632 * Make a new object, and place it in the
14633 * object chain. Note that no new references
14634 * have appeared -- one just moved from the
14635 * map to the new object.
14636 */
14637
14638 if (vm_map_lock_read_to_write(map)) {
14639 vm_map_lock_read(map);
14640 goto RetryLookup;
14641 }
14642
14643 if (VME_OBJECT(entry)->shadowed == FALSE) {
14644 vm_object_lock(VME_OBJECT(entry));
14645 VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14646 vm_object_unlock(VME_OBJECT(entry));
14647 }
14648 VME_OBJECT_SHADOW(entry,
14649 (vm_map_size_t) (entry->vme_end -
14650 entry->vme_start),
14651 vm_map_always_shadow(map));
14652 entry->needs_copy = FALSE;
14653
14654 vm_map_lock_write_to_read(map);
14655 }
14656 if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14657 /*
14658 * We're attempting to read a copy-on-write
14659 * page -- don't allow writes.
14660 */
14661
14662 prot &= (~VM_PROT_WRITE);
14663 }
14664 }
14665
14666 if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14667 /*
14668 * We went through a "needs_copy" submap without triggering
14669 * a copy, so granting write access to the page would bypass
14670 * that submap's "needs_copy".
14671 */
14672 assert(!(fault_type & VM_PROT_WRITE));
14673 assert(!*wired);
14674 assert(!force_copy);
14675 // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14676 prot &= ~VM_PROT_WRITE;
14677 }
14678
14679 /*
14680 * Create an object if necessary.
14681 */
14682 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14683 if (vm_map_lock_read_to_write(map)) {
14684 vm_map_lock_read(map);
14685 goto RetryLookup;
14686 }
14687
14688 VME_OBJECT_SET(entry,
14689 vm_object_allocate(
14690 (vm_map_size_t)(entry->vme_end -
14691 entry->vme_start)), false, 0);
14692 VME_OFFSET_SET(entry, 0);
14693 assert(entry->use_pmap);
14694 vm_map_lock_write_to_read(map);
14695 }
14696
14697 /*
14698 * Return the object/offset from this entry. If the entry
14699 * was copy-on-write or empty, it has been fixed up. Also
14700 * return the protection.
14701 */
14702
14703 *offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14704 *object = VME_OBJECT(entry);
14705 *out_prot = prot;
14706 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14707
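/*
 * Fill in the fault information the caller will use to resolve
 * the fault.
 */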
14708 if (fault_info) {
14709 /* ... the caller will change "interruptible" if needed */
14710 fault_info->user_tag = VME_ALIAS(entry);
14711 fault_info->pmap_options = 0;
14712 if (entry->iokit_acct ||
14713 (!entry->is_sub_map && !entry->use_pmap)) {
14714 fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14715 }
14716 if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) {
14717 fault_info->behavior = entry->behavior;
14718 }
14719 fault_info->lo_offset = VME_OFFSET(entry);
14720 fault_info->hi_offset =
14721 (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14722 fault_info->no_cache = entry->no_cache;
14723 fault_info->stealth = FALSE;
14724 fault_info->io_sync = FALSE;
14725 if (entry->used_for_jit ||
14726 #if CODE_SIGNING_MONITOR
14727 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14728 #endif
14729 entry->vme_resilient_codesign) {
14730 fault_info->cs_bypass = TRUE;
14731 } else {
14732 fault_info->cs_bypass = FALSE;
14733 }
14734 fault_info->csm_associated = FALSE;
14735 #if CODE_SIGNING_MONITOR
14736 if (entry->csm_associated) {
14737 /*
14738 * The pmap layer will validate this page
14739 * before allowing it to be executed from.
14740 */
14741 fault_info->csm_associated = TRUE;
14742 }
14743 #endif
14744 fault_info->mark_zf_absent = FALSE;
14745 fault_info->batch_pmap_op = FALSE;
14746 fault_info->resilient_media = entry->vme_resilient_media;
14747 fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14748 fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14749 #if __arm64e__
14750 fault_info->fi_used_for_tpro = entry->used_for_tpro;
14751 #else /* __arm64e__ */
14752 fault_info->fi_used_for_tpro = FALSE;
14753 #endif
14754 if (entry->translated_allow_execute) {
14755 fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14756 }
14757 }
14758
14759 /*
14760 * Lock the object to prevent it from disappearing
14761 */
14762 if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14763 if (contended == NULL) {
14764 vm_object_lock(*object);
14765 } else {
14766 *contended = vm_object_lock_check_contended(*object);
14767 }
14768 } else {
14769 vm_object_lock_shared(*object);
14770 }
14771
14772 /*
14773 * Save the version number
14774 */
14775
14776 out_version->main_timestamp = map->timestamp;
14777
14778 return KERN_SUCCESS;
14779 }
14780
14781
14782 /*
14783 * vm_map_verify:
14784 *
14785 * Verifies that the map in question has not changed
14786 * since the given version. The map has to be locked
14787 * ("shared" mode is fine) before calling this function
14788 * and it will be returned locked too.
14789 */
14790 boolean_t
14791 vm_map_verify(
14792 vm_map_t map,
14793 vm_map_version_t *version) /* REF */
14794 {
14795 boolean_t result;
14796
14797 vm_map_lock_assert_held(map);
14798 result = (map->timestamp == version->main_timestamp);
14799
14800 return result;
14801 }
14802
14803 /*
14804 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14805 * Goes away after regular vm_region_recurse function migrates to
14806 * 64 bits
14807 * vm_region_recurse: A form of vm_region which follows the
14808 * submaps in a target map
14809 *
14810 */
14811
14812 kern_return_t
14813 vm_map_region_recurse_64(
14814 vm_map_t map,
14815 vm_map_offset_ut *address_u, /* IN/OUT */
14816 vm_map_size_ut *size_u, /* OUT */
14817 natural_t *nesting_depth, /* IN/OUT */
14818 vm_region_submap_info_64_t submap_info, /* IN/OUT */
14819 mach_msg_type_number_t *count) /* IN/OUT */
14820 {
14821 mach_msg_type_number_t original_count;
14822 vm_region_extended_info_data_t extended;
14823 vm_map_entry_t tmp_entry;
14824 vm_map_offset_t user_address;
14825 unsigned int user_max_depth;
14826
14827 /*
14828 * "curr_entry" is the VM map entry preceding or including the
14829 * address we're looking for.
14830 * "curr_map" is the map or sub-map containing "curr_entry".
14831 * "curr_address" is the equivalent of the top map's "user_address"
14832 * in the current map.
14833 * "curr_offset" is the cumulated offset of "curr_map" in the
14834 * target task's address space.
14835 * "curr_depth" is the depth of "curr_map" in the chain of
14836 * sub-maps.
14837 *
14838 * "curr_max_below" and "curr_max_above" limit the range (around
14839 * "curr_address") we should take into account in the current (sub)map.
14840 * They limit the range to what's visible through the map entries
14841 * we've traversed from the top map to the current map.
14842 *
14843 */
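	/*
	 * For illustration (hypothetical layout): looking up user_address
	 * 0x7000 when the top-level entry [0x6000, 0xa000) maps a submap,
	 * the loop below descends into that submap with curr_depth == 1,
	 * curr_map pointing at the submap, and curr_address adjusted by
	 * curr_offset so that it still names the same byte of the caller's
	 * address space.
	 */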
14844 vm_map_entry_t curr_entry;
14845 vm_map_address_t curr_address;
14846 vm_map_offset_t curr_offset;
14847 vm_map_t curr_map;
14848 unsigned int curr_depth;
14849 vm_map_offset_t curr_max_below, curr_max_above;
14850 vm_map_offset_t curr_skip;
14851
14852 /*
14853 * "next_" is the same as "curr_" but for the VM region immediately
14854 * after the address we're looking for. We need to keep track of this
14855 * too because we want to return info about that region if the
14856 * address we're looking for is not mapped.
14857 */
14858 vm_map_entry_t next_entry;
14859 vm_map_offset_t next_offset;
14860 vm_map_offset_t next_address;
14861 vm_map_t next_map;
14862 unsigned int next_depth;
14863 vm_map_offset_t next_max_below, next_max_above;
14864 vm_map_offset_t next_skip;
14865
14866 boolean_t look_for_pages;
14867 vm_region_submap_short_info_64_t short_info;
14868 boolean_t do_region_footprint;
14869 int effective_page_size, effective_page_shift;
14870 boolean_t submap_needed_copy;
14871
14872 if (map == VM_MAP_NULL) {
14873 /* no address space to work on */
14874 return KERN_INVALID_ARGUMENT;
14875 }
14876
14877 user_address = vm_sanitize_addr(map, *address_u);
14878
14879 effective_page_shift = vm_self_region_page_shift(map);
14880 effective_page_size = (1 << effective_page_shift);
14881
14882 if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
14883 /*
14884 * "info" structure is not big enough and
14885 * would overflow
14886 */
14887 return KERN_INVALID_ARGUMENT;
14888 }
14889
14890 do_region_footprint = task_self_region_footprint();
14891 original_count = *count;
14892
14893 if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
14894 *count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
14895 look_for_pages = FALSE;
14896 short_info = (vm_region_submap_short_info_64_t) submap_info;
14897 submap_info = NULL;
14898 } else {
14899 look_for_pages = TRUE;
14900 *count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
14901 short_info = NULL;
14902
14903 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14904 *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
14905 }
14906 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14907 *count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
14908 }
14909 }
14910
14911 user_max_depth = *nesting_depth;
14912 submap_needed_copy = FALSE;
14913
14914 if (not_in_kdp) {
14915 vm_map_lock_read(map);
14916 }
14917
14918 recurse_again:
14919 curr_entry = NULL;
14920 curr_map = map;
14921 curr_address = user_address;
14922 curr_offset = 0;
14923 curr_skip = 0;
14924 curr_depth = 0;
14925 curr_max_above = ((vm_map_offset_t) -1) - curr_address;
14926 curr_max_below = curr_address;
14927
14928 next_entry = NULL;
14929 next_map = NULL;
14930 next_address = 0;
14931 next_offset = 0;
14932 next_skip = 0;
14933 next_depth = 0;
14934 next_max_above = (vm_map_offset_t) -1;
14935 next_max_below = (vm_map_offset_t) -1;
14936
14937 for (;;) {
14938 if (vm_map_lookup_entry(curr_map,
14939 curr_address,
14940 &tmp_entry)) {
14941 /* tmp_entry contains the address we're looking for */
14942 curr_entry = tmp_entry;
14943 } else {
14944 vm_map_offset_t skip;
14945 /*
14946 * The address is not mapped. "tmp_entry" is the
14947 * map entry preceding the address. We want the next
14948 * one, if it exists.
14949 */
14950 curr_entry = tmp_entry->vme_next;
14951
14952 if (curr_entry == vm_map_to_entry(curr_map) ||
14953 (curr_entry->vme_start >=
14954 curr_address + curr_max_above)) {
14955 /* no next entry at this level: stop looking */
14956 if (not_in_kdp) {
14957 vm_map_unlock_read(curr_map);
14958 }
14959 curr_entry = NULL;
14960 curr_map = NULL;
14961 curr_skip = 0;
14962 curr_offset = 0;
14963 curr_depth = 0;
14964 curr_max_above = 0;
14965 curr_max_below = 0;
14966 break;
14967 }
14968
14969 /* adjust current address and offset */
14970 skip = curr_entry->vme_start - curr_address;
14971 curr_address = curr_entry->vme_start;
14972 curr_skip += skip;
14973 curr_offset += skip;
14974 curr_max_above -= skip;
14975 curr_max_below = 0;
14976 }
14977
14978 /*
14979 * Is the next entry at this level closer to the address (or
14980 * deeper in the submap chain) than the one we had
14981 * so far ?
14982 */
14983 tmp_entry = curr_entry->vme_next;
14984 if (tmp_entry == vm_map_to_entry(curr_map)) {
14985 /* no next entry at this level */
14986 } else if (tmp_entry->vme_start >=
14987 curr_address + curr_max_above) {
14988 /*
14989 * tmp_entry is beyond the scope of what we mapped of
14990 * this submap in the upper level: ignore it.
14991 */
14992 } else if ((next_entry == NULL) ||
14993 (tmp_entry->vme_start + curr_offset <=
14994 next_entry->vme_start + next_offset)) {
14995 /*
14996 * We didn't have a "next_entry" or this one is
14997 * closer to the address we're looking for:
14998 * use this "tmp_entry" as the new "next_entry".
14999 */
15000 if (next_entry != NULL) {
15001 /* unlock the last "next_map" */
15002 if (next_map != curr_map && not_in_kdp) {
15003 vm_map_unlock_read(next_map);
15004 }
15005 }
15006 next_entry = tmp_entry;
15007 next_map = curr_map;
15008 next_depth = curr_depth;
15009 next_address = next_entry->vme_start;
15010 next_skip = curr_skip;
15011 next_skip += (next_address - curr_address);
15012 next_offset = curr_offset;
15013 next_offset += (next_address - curr_address);
15014 next_max_above = MIN(next_max_above, curr_max_above);
15015 next_max_above = MIN(next_max_above,
15016 next_entry->vme_end - next_address);
15017 next_max_below = MIN(next_max_below, curr_max_below);
15018 next_max_below = MIN(next_max_below,
15019 next_address - next_entry->vme_start);
15020 }
15021
15022 /*
15023 * "curr_max_{above,below}" allow us to keep track of the
15024 * portion of the submap that is actually mapped at this level:
15025 * the rest of that submap is irrelevant to us, since it's not
15026 * mapped here.
15027 * The relevant portion of the map starts at
15028 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
15029 */
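		/*
		 * For illustration (hypothetical values): with "curr_entry"
		 * covering [0x4000, 0x8000) and curr_address == 0x5000,
		 * curr_max_above is clamped to at most 0x3000 and
		 * curr_max_below to at most 0x1000, so deeper lookups stay
		 * within what this entry actually maps.
		 */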
15030 curr_max_above = MIN(curr_max_above,
15031 curr_entry->vme_end - curr_address);
15032 curr_max_below = MIN(curr_max_below,
15033 curr_address - curr_entry->vme_start);
15034
15035 if (!curr_entry->is_sub_map ||
15036 curr_depth >= user_max_depth) {
15037 /*
15038 * We hit a leaf map or we reached the maximum depth
15039 * we could, so stop looking. Keep the current map
15040 * locked.
15041 */
15042 break;
15043 }
15044
15045 /*
15046 * Get down to the next submap level.
15047 */
15048
15049 if (curr_entry->needs_copy) {
15050 /* everything below this is effectively copy-on-write */
15051 submap_needed_copy = TRUE;
15052 }
15053
15054 /*
15055 * Lock the next level and unlock the current level,
15056 * unless we need to keep it locked to access the "next_entry"
15057 * later.
15058 */
15059 if (not_in_kdp) {
15060 vm_map_lock_read(VME_SUBMAP(curr_entry));
15061 }
15062 if (curr_map == next_map) {
15063 /* keep "next_map" locked in case we need it */
15064 } else {
15065 /* release this map */
15066 if (not_in_kdp) {
15067 vm_map_unlock_read(curr_map);
15068 }
15069 }
15070
15071 /*
15072 * Adjust the offset. "curr_entry" maps the submap
15073 * at relative address "curr_entry->vme_start" in the
15074 * curr_map but skips the first "VME_OFFSET(curr_entry)"
15075 * bytes of the submap.
15076 * "curr_offset" always represents the offset of a virtual
15077 * address in the curr_map relative to the absolute address
15078 * space (i.e. the top-level VM map).
15079 */
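		/*
		 * For illustration (hypothetical values): starting from
		 * curr_offset == 0 at the top level, if "curr_entry" maps a
		 * submap at [0x4000, 0x8000) with VME_OFFSET(curr_entry) ==
		 * 0x10000 and user_address == 0x5000, curr_offset grows by
		 * 0x10000 - 0x4000 == 0xc000 and the lookup continues at
		 * curr_address == 0x11000 in the submap, i.e. 0x1000 bytes
		 * past the start of the mapped portion of that submap.
		 */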
15080 curr_offset +=
15081 (VME_OFFSET(curr_entry) - curr_entry->vme_start);
15082 curr_address = user_address + curr_offset;
15083 /* switch to the submap */
15084 curr_map = VME_SUBMAP(curr_entry);
15085 curr_depth++;
15086 curr_entry = NULL;
15087 }
15088
15089 	// LP64todo: all the current tools are 32-bit, so this obviously never worked for 64-bit;
15090 	// it should probably be a real 32-bit ID rather than a pointer.
15091 	// Current users just check for equality.
15092
15093 if (curr_entry == NULL) {
15094 /* no VM region contains the address... */
15095
15096 if (do_region_footprint && /* we want footprint numbers */
15097 next_entry == NULL && /* & there are no more regions */
15098 /* & we haven't already provided our fake region: */
15099 user_address <= vm_map_last_entry(map)->vme_end) {
15100 ledger_amount_t ledger_resident, ledger_compressed;
15101
15102 /*
15103 * Add a fake memory region to account for
15104 * purgeable and/or ledger-tagged memory that
15105 * counts towards this task's memory footprint,
15106 * i.e. the resident/compressed pages of non-volatile
15107 * objects owned by that task.
15108 */
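			/*
			 * For illustration: the synthetic region reported below
			 * begins at the last real entry's vme_end and its size is
			 * the sum of the resident and compressed ledger bytes, so
			 * a caller walking the address space sees exactly one
			 * extra SM_PRIVATE (or SM_COW) region covering that
			 * accounting.
			 */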
15109 task_ledgers_footprint(map->pmap->ledger,
15110 &ledger_resident,
15111 &ledger_compressed);
15112 if (ledger_resident + ledger_compressed == 0) {
15113 /* no purgeable memory usage to report */
15114 return KERN_INVALID_ADDRESS;
15115 }
15116 /* fake region to show nonvolatile footprint */
15117 if (look_for_pages) {
15118 submap_info->protection = VM_PROT_DEFAULT;
15119 submap_info->max_protection = VM_PROT_DEFAULT;
15120 submap_info->inheritance = VM_INHERIT_DEFAULT;
15121 submap_info->offset = 0;
15122 submap_info->user_tag = -1;
15123 submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15124 submap_info->pages_shared_now_private = 0;
15125 submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15126 submap_info->pages_dirtied = submap_info->pages_resident;
15127 submap_info->ref_count = 1;
15128 submap_info->shadow_depth = 0;
15129 submap_info->external_pager = 0;
15130 submap_info->share_mode = SM_PRIVATE;
15131 if (submap_needed_copy) {
15132 submap_info->share_mode = SM_COW;
15133 }
15134 submap_info->is_submap = 0;
15135 submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15136 submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15137 submap_info->user_wired_count = 0;
15138 submap_info->pages_reusable = 0;
15139 } else {
15140 short_info->user_tag = -1;
15141 short_info->offset = 0;
15142 short_info->protection = VM_PROT_DEFAULT;
15143 short_info->inheritance = VM_INHERIT_DEFAULT;
15144 short_info->max_protection = VM_PROT_DEFAULT;
15145 short_info->behavior = VM_BEHAVIOR_DEFAULT;
15146 short_info->user_wired_count = 0;
15147 short_info->is_submap = 0;
15148 short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15149 short_info->external_pager = 0;
15150 short_info->shadow_depth = 0;
15151 short_info->share_mode = SM_PRIVATE;
15152 if (submap_needed_copy) {
15153 short_info->share_mode = SM_COW;
15154 }
15155 short_info->ref_count = 1;
15156 }
15157 *nesting_depth = 0;
15158 *address_u = vm_sanitize_wrap_addr(vm_map_last_entry(map)->vme_end);
15159 *size_u = vm_sanitize_wrap_size(ledger_resident + ledger_compressed);
15160 return KERN_SUCCESS;
15161 }
15162
15163 if (next_entry == NULL) {
15164 /* ... and no VM region follows it either */
15165 return KERN_INVALID_ADDRESS;
15166 }
15167 /* ... gather info about the next VM region */
15168 curr_entry = next_entry;
15169 curr_map = next_map; /* still locked ... */
15170 curr_address = next_address;
15171 curr_skip = next_skip;
15172 curr_offset = next_offset;
15173 curr_depth = next_depth;
15174 curr_max_above = next_max_above;
15175 curr_max_below = next_max_below;
15176 } else {
15177 /* we won't need "next_entry" after all */
15178 if (next_entry != NULL) {
15179 /* release "next_map" */
15180 if (next_map != curr_map && not_in_kdp) {
15181 vm_map_unlock_read(next_map);
15182 }
15183 }
15184 }
15185 next_entry = NULL;
15186 next_map = NULL;
15187 next_offset = 0;
15188 next_skip = 0;
15189 next_depth = 0;
15190 next_max_below = -1;
15191 next_max_above = -1;
15192
15193 if (curr_entry->is_sub_map &&
15194 curr_depth < user_max_depth) {
15195 /*
15196 * We're not as deep as we could be: we must have
15197 * gone back up after not finding anything mapped
15198 		 * below the original top-level map entry's range.
15199 * Let's move "curr_address" forward and recurse again.
15200 */
15201 user_address = curr_address;
15202 goto recurse_again;
15203 }
15204
15205 *nesting_depth = curr_depth;
15206 *address_u = vm_sanitize_wrap_addr(
15207 user_address + curr_skip - curr_max_below);
15208 *size_u = vm_sanitize_wrap_size(curr_max_above + curr_max_below);
15209
15210 if (look_for_pages) {
15211 submap_info->user_tag = VME_ALIAS(curr_entry);
15212 submap_info->offset = VME_OFFSET(curr_entry);
15213 submap_info->protection = curr_entry->protection;
15214 submap_info->inheritance = curr_entry->inheritance;
15215 submap_info->max_protection = curr_entry->max_protection;
15216 submap_info->behavior = curr_entry->behavior;
15217 submap_info->user_wired_count = curr_entry->user_wired_count;
15218 submap_info->is_submap = curr_entry->is_sub_map;
15219 if (curr_entry->is_sub_map) {
15220 submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15221 } else {
15222 submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15223 }
15224 } else {
15225 short_info->user_tag = VME_ALIAS(curr_entry);
15226 short_info->offset = VME_OFFSET(curr_entry);
15227 short_info->protection = curr_entry->protection;
15228 short_info->inheritance = curr_entry->inheritance;
15229 short_info->max_protection = curr_entry->max_protection;
15230 short_info->behavior = curr_entry->behavior;
15231 short_info->user_wired_count = curr_entry->user_wired_count;
15232 short_info->is_submap = curr_entry->is_sub_map;
15233 if (curr_entry->is_sub_map) {
15234 short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15235 } else {
15236 short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15237 }
15238 }
15239
15240 extended.pages_resident = 0;
15241 extended.pages_swapped_out = 0;
15242 extended.pages_shared_now_private = 0;
15243 extended.pages_dirtied = 0;
15244 extended.pages_reusable = 0;
15245 extended.external_pager = 0;
15246 extended.shadow_depth = 0;
15247 extended.share_mode = SM_EMPTY;
15248 extended.ref_count = 0;
15249
15250 if (not_in_kdp) {
15251 if (!curr_entry->is_sub_map) {
15252 vm_map_offset_t range_start, range_end;
15253 range_start = MAX((curr_address - curr_max_below),
15254 curr_entry->vme_start);
15255 range_end = MIN((curr_address + curr_max_above),
15256 curr_entry->vme_end);
15257 vm_map_region_walk(curr_map,
15258 range_start,
15259 curr_entry,
15260 (VME_OFFSET(curr_entry) +
15261 (range_start -
15262 curr_entry->vme_start)),
15263 range_end - range_start,
15264 &extended,
15265 look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15266 if (submap_needed_copy) {
15267 extended.share_mode = SM_COW;
15268 }
15269 } else {
15270 if (curr_entry->use_pmap) {
15271 extended.share_mode = SM_TRUESHARED;
15272 } else {
15273 extended.share_mode = SM_PRIVATE;
15274 }
15275 extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15276 }
15277 }
15278
15279 if (look_for_pages) {
15280 submap_info->pages_resident = extended.pages_resident;
15281 submap_info->pages_swapped_out = extended.pages_swapped_out;
15282 submap_info->pages_shared_now_private =
15283 extended.pages_shared_now_private;
15284 submap_info->pages_dirtied = extended.pages_dirtied;
15285 submap_info->external_pager = extended.external_pager;
15286 submap_info->shadow_depth = extended.shadow_depth;
15287 submap_info->share_mode = extended.share_mode;
15288 submap_info->ref_count = extended.ref_count;
15289
15290 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15291 submap_info->pages_reusable = extended.pages_reusable;
15292 }
15293 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15294 if (curr_entry->is_sub_map) {
15295 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15296 } else if (VME_OBJECT(curr_entry)) {
15297 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15298 } else {
15299 submap_info->object_id_full = 0ull;
15300 }
15301 }
15302 } else {
15303 short_info->external_pager = extended.external_pager;
15304 short_info->shadow_depth = extended.shadow_depth;
15305 short_info->share_mode = extended.share_mode;
15306 short_info->ref_count = extended.ref_count;
15307 }
15308
15309 if (not_in_kdp) {
15310 vm_map_unlock_read(curr_map);
15311 }
15312
15313 return KERN_SUCCESS;
15314 }
15315
15316 /*
15317 * vm_region:
15318 *
15319 * User call to obtain information about a region in
15320 * a task's address map. Currently, only one flavor is
15321 * supported.
15322 *
15323 * XXX The reserved and behavior fields cannot be filled
15324 * in until the vm merge from the IK is completed, and
15325 * vm_reserve is implemented.
15326 */
15327
15328 kern_return_t
15329 vm_map_region(
15330 vm_map_t map,
15331 vm_map_offset_ut *address_u, /* IN/OUT */
15332 vm_map_size_ut *size_u, /* OUT */
15333 vm_region_flavor_t flavor, /* IN */
15334 vm_region_info_t info, /* OUT */
15335 mach_msg_type_number_t *count, /* IN/OUT */
15336 mach_port_t *object_name) /* OUT */
15337 {
15338 vm_map_entry_t tmp_entry;
15339 vm_map_entry_t entry;
15340 vm_map_offset_t start;
15341
15342 if (map == VM_MAP_NULL) {
15343 return KERN_INVALID_ARGUMENT;
15344 }
15345
15346 start = vm_sanitize_addr(map, *address_u);
15347
15348 switch (flavor) {
15349 case VM_REGION_BASIC_INFO:
15350 /* legacy for old 32-bit objects info */
15351 {
15352 vm_region_basic_info_t basic;
15353
15354 if (*count < VM_REGION_BASIC_INFO_COUNT) {
15355 return KERN_INVALID_ARGUMENT;
15356 }
15357
15358 basic = (vm_region_basic_info_t) info;
15359 *count = VM_REGION_BASIC_INFO_COUNT;
15360
15361 vm_map_lock_read(map);
15362
15363 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15364 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15365 vm_map_unlock_read(map);
15366 return KERN_INVALID_ADDRESS;
15367 }
15368 } else {
15369 entry = tmp_entry;
15370 }
15371
15372 start = entry->vme_start;
15373
15374 basic->offset = (uint32_t)VME_OFFSET(entry);
15375 basic->protection = entry->protection;
15376 basic->inheritance = entry->inheritance;
15377 basic->max_protection = entry->max_protection;
15378 basic->behavior = entry->behavior;
15379 basic->user_wired_count = entry->user_wired_count;
15380 basic->reserved = entry->is_sub_map;
15381
15382 *address_u = vm_sanitize_wrap_addr(start);
15383 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15384
15385 if (object_name) {
15386 *object_name = IP_NULL;
15387 }
15388 if (entry->is_sub_map) {
15389 basic->shared = FALSE;
15390 } else {
15391 basic->shared = entry->is_shared;
15392 }
15393
15394 vm_map_unlock_read(map);
15395 return KERN_SUCCESS;
15396 }
15397
15398 case VM_REGION_BASIC_INFO_64:
15399 {
15400 vm_region_basic_info_64_t basic;
15401
15402 if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15403 return KERN_INVALID_ARGUMENT;
15404 }
15405
15406 basic = (vm_region_basic_info_64_t) info;
15407 *count = VM_REGION_BASIC_INFO_COUNT_64;
15408
15409 vm_map_lock_read(map);
15410
15411 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15412 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15413 vm_map_unlock_read(map);
15414 return KERN_INVALID_ADDRESS;
15415 }
15416 } else {
15417 entry = tmp_entry;
15418 }
15419
15420 start = entry->vme_start;
15421
15422 basic->offset = VME_OFFSET(entry);
15423 basic->protection = entry->protection;
15424 basic->inheritance = entry->inheritance;
15425 basic->max_protection = entry->max_protection;
15426 basic->behavior = entry->behavior;
15427 basic->user_wired_count = entry->user_wired_count;
15428 basic->reserved = entry->is_sub_map;
15429
15430 *address_u = vm_sanitize_wrap_addr(start);
15431 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15432
15433 if (object_name) {
15434 *object_name = IP_NULL;
15435 }
15436 if (entry->is_sub_map) {
15437 basic->shared = FALSE;
15438 } else {
15439 basic->shared = entry->is_shared;
15440 }
15441
15442 vm_map_unlock_read(map);
15443 return KERN_SUCCESS;
15444 }
15445 case VM_REGION_EXTENDED_INFO:
15446 if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15447 return KERN_INVALID_ARGUMENT;
15448 }
15449 OS_FALLTHROUGH;
15450 case VM_REGION_EXTENDED_INFO__legacy:
15451 {
15452 vm_region_extended_info_t extended;
15453 mach_msg_type_number_t original_count;
15454 int effective_page_size, effective_page_shift;
15455
15456 if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15457 return KERN_INVALID_ARGUMENT;
15458 }
15459
15460 extended = (vm_region_extended_info_t) info;
15461
15462 effective_page_shift = vm_self_region_page_shift(map);
15463 effective_page_size = (1 << effective_page_shift);
15464
15465 vm_map_lock_read(map);
15466
15467 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15468 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15469 vm_map_unlock_read(map);
15470 return KERN_INVALID_ADDRESS;
15471 }
15472 } else {
15473 entry = tmp_entry;
15474 }
15475 start = entry->vme_start;
15476
15477 extended->protection = entry->protection;
15478 extended->user_tag = VME_ALIAS(entry);
15479 extended->pages_resident = 0;
15480 extended->pages_swapped_out = 0;
15481 extended->pages_shared_now_private = 0;
15482 extended->pages_dirtied = 0;
15483 extended->external_pager = 0;
15484 extended->shadow_depth = 0;
15485
15486 original_count = *count;
15487 if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15488 *count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15489 } else {
15490 extended->pages_reusable = 0;
15491 *count = VM_REGION_EXTENDED_INFO_COUNT;
15492 }
15493
15494 vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15495
15496 if (object_name) {
15497 *object_name = IP_NULL;
15498 }
15499
15500 *address_u = vm_sanitize_wrap_addr(start);
15501 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15502
15503 vm_map_unlock_read(map);
15504 return KERN_SUCCESS;
15505 }
15506 case VM_REGION_TOP_INFO:
15507 {
15508 vm_region_top_info_t top;
15509
15510 if (*count < VM_REGION_TOP_INFO_COUNT) {
15511 return KERN_INVALID_ARGUMENT;
15512 }
15513
15514 top = (vm_region_top_info_t) info;
15515 *count = VM_REGION_TOP_INFO_COUNT;
15516
15517 vm_map_lock_read(map);
15518
15519 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15520 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15521 vm_map_unlock_read(map);
15522 return KERN_INVALID_ADDRESS;
15523 }
15524 } else {
15525 entry = tmp_entry;
15526 }
15527 start = entry->vme_start;
15528
15529 top->private_pages_resident = 0;
15530 top->shared_pages_resident = 0;
15531
15532 vm_map_region_top_walk(entry, top);
15533
15534 if (object_name) {
15535 *object_name = IP_NULL;
15536 }
15537
15538 *address_u = vm_sanitize_wrap_addr(start);
15539 *size_u = vm_sanitize_wrap_size(entry->vme_end - start);
15540
15541 vm_map_unlock_read(map);
15542 return KERN_SUCCESS;
15543 }
15544 default:
15545 return KERN_INVALID_ARGUMENT;
15546 }
15547 }
15548
15549 #define OBJ_RESIDENT_COUNT(obj, entry_size) \
15550 MIN((entry_size), \
15551 ((obj)->all_reusable ? \
15552 (obj)->wired_page_count : \
15553 (obj)->resident_page_count - (obj)->reusable_page_count))
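/*
 * For illustration (hypothetical values): with all_reusable == FALSE,
 * resident_page_count == 10 and reusable_page_count == 3, an entry
 * spanning 20 pages yields MIN(20, 10 - 3) == 7 resident pages, while
 * an entry spanning 5 pages yields 5. When all_reusable is set, only
 * the wired pages are counted.
 */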
15554
15555 void
15556 vm_map_region_top_walk(
15557 vm_map_entry_t entry,
15558 vm_region_top_info_t top)
15559 {
15560 if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15561 top->share_mode = SM_EMPTY;
15562 top->ref_count = 0;
15563 top->obj_id = 0;
15564 return;
15565 }
15566
15567 {
15568 struct vm_object *obj, *tmp_obj;
15569 int ref_count;
15570 uint32_t entry_size;
15571
15572 entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15573
15574 obj = VME_OBJECT(entry);
15575
15576 vm_object_lock(obj);
15577
15578 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15579 obj->paging_in_progress) {
15580 ref_count--;
15581 }
15582
15583 assert(obj->reusable_page_count <= obj->resident_page_count);
15584 if (obj->shadow) {
15585 if (ref_count == 1) {
15586 top->private_pages_resident =
15587 OBJ_RESIDENT_COUNT(obj, entry_size);
15588 } else {
15589 top->shared_pages_resident =
15590 OBJ_RESIDENT_COUNT(obj, entry_size);
15591 }
15592 top->ref_count = ref_count;
15593 top->share_mode = SM_COW;
15594
15595 while ((tmp_obj = obj->shadow)) {
15596 vm_object_lock(tmp_obj);
15597 vm_object_unlock(obj);
15598 obj = tmp_obj;
15599
15600 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15601 obj->paging_in_progress) {
15602 ref_count--;
15603 }
15604
15605 assert(obj->reusable_page_count <= obj->resident_page_count);
15606 top->shared_pages_resident +=
15607 OBJ_RESIDENT_COUNT(obj, entry_size);
15608 top->ref_count += ref_count - 1;
15609 }
15610 } else {
15611 if (entry->superpage_size) {
15612 top->share_mode = SM_LARGE_PAGE;
15613 top->shared_pages_resident = 0;
15614 top->private_pages_resident = entry_size;
15615 } else if (entry->needs_copy) {
15616 top->share_mode = SM_COW;
15617 top->shared_pages_resident =
15618 OBJ_RESIDENT_COUNT(obj, entry_size);
15619 } else {
15620 if (ref_count == 1 ||
15621 (ref_count == 2 && obj->named)) {
15622 top->share_mode = SM_PRIVATE;
15623 top->private_pages_resident =
15624 OBJ_RESIDENT_COUNT(obj,
15625 entry_size);
15626 } else {
15627 top->share_mode = SM_SHARED;
15628 top->shared_pages_resident =
15629 OBJ_RESIDENT_COUNT(obj,
15630 entry_size);
15631 }
15632 }
15633 top->ref_count = ref_count;
15634 }
15635
15636 vm_object_unlock(obj);
15637
15638 /* XXX K64: obj_id will be truncated */
15639 top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15640 }
15641 }
15642
15643 void
15644 vm_map_region_walk(
15645 vm_map_t map,
15646 vm_map_offset_t va,
15647 vm_map_entry_t entry,
15648 vm_object_offset_t offset,
15649 vm_object_size_t range,
15650 vm_region_extended_info_t extended,
15651 boolean_t look_for_pages,
15652 mach_msg_type_number_t count)
15653 {
15654 struct vm_object *obj, *tmp_obj;
15655 vm_map_offset_t last_offset;
15656 int i;
15657 int ref_count;
15658 struct vm_object *shadow_object;
15659 unsigned short shadow_depth;
15660 boolean_t do_region_footprint;
15661 int effective_page_size, effective_page_shift;
15662 vm_map_offset_t effective_page_mask;
15663
15664 do_region_footprint = task_self_region_footprint();
15665
15666 if ((entry->is_sub_map) ||
15667 (VME_OBJECT(entry) == 0) ||
15668 (VME_OBJECT(entry)->phys_contiguous &&
15669 !entry->superpage_size)) {
15670 extended->share_mode = SM_EMPTY;
15671 extended->ref_count = 0;
15672 return;
15673 }
15674
15675 if (entry->superpage_size) {
15676 extended->shadow_depth = 0;
15677 extended->share_mode = SM_LARGE_PAGE;
15678 extended->ref_count = 1;
15679 extended->external_pager = 0;
15680
15681 /* TODO4K: Superpage in 4k mode? */
15682 extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15683 extended->shadow_depth = 0;
15684 return;
15685 }
15686
15687 effective_page_shift = vm_self_region_page_shift(map);
15688 effective_page_size = (1 << effective_page_shift);
15689 effective_page_mask = effective_page_size - 1;
15690
15691 offset = vm_map_trunc_page(offset, effective_page_mask);
15692
15693 obj = VME_OBJECT(entry);
15694
15695 vm_object_lock(obj);
15696
15697 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15698 obj->paging_in_progress) {
15699 ref_count--;
15700 }
15701
15702 if (look_for_pages) {
15703 for (last_offset = offset + range;
15704 offset < last_offset;
15705 offset += effective_page_size, va += effective_page_size) {
15706 if (do_region_footprint) {
15707 int disp;
15708
15709 disp = 0;
15710 if (map->has_corpse_footprint) {
15711 /*
15712 * Query the page info data we saved
15713 * while forking the corpse.
15714 */
15715 vm_map_corpse_footprint_query_page_info(
15716 map,
15717 va,
15718 &disp);
15719 } else {
15720 /*
15721 * Query the pmap.
15722 */
15723 vm_map_footprint_query_page_info(
15724 map,
15725 entry,
15726 va,
15727 &disp);
15728 }
15729 if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15730 extended->pages_resident++;
15731 }
15732 if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15733 extended->pages_reusable++;
15734 }
15735 if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15736 extended->pages_dirtied++;
15737 }
15738 if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15739 extended->pages_swapped_out++;
15740 }
15741 continue;
15742 }
15743
15744 vm_map_region_look_for_page(map, va, obj,
15745 vm_object_trunc_page(offset), ref_count,
15746 0, extended, count);
15747 }
15748
15749 if (do_region_footprint) {
15750 goto collect_object_info;
15751 }
15752 } else {
15753 collect_object_info:
15754 shadow_object = obj->shadow;
15755 shadow_depth = 0;
15756
15757 if (!(obj->internal)) {
15758 extended->external_pager = 1;
15759 }
15760
15761 if (shadow_object != VM_OBJECT_NULL) {
15762 vm_object_lock(shadow_object);
15763 for (;
15764 shadow_object != VM_OBJECT_NULL;
15765 shadow_depth++) {
15766 vm_object_t next_shadow;
15767
15768 if (!(shadow_object->internal)) {
15769 extended->external_pager = 1;
15770 }
15771
15772 next_shadow = shadow_object->shadow;
15773 if (next_shadow) {
15774 vm_object_lock(next_shadow);
15775 }
15776 vm_object_unlock(shadow_object);
15777 shadow_object = next_shadow;
15778 }
15779 }
15780 extended->shadow_depth = shadow_depth;
15781 }
15782
15783 if (extended->shadow_depth || entry->needs_copy) {
15784 extended->share_mode = SM_COW;
15785 } else {
15786 if (ref_count == 1) {
15787 extended->share_mode = SM_PRIVATE;
15788 } else {
15789 if (obj->true_share) {
15790 extended->share_mode = SM_TRUESHARED;
15791 } else {
15792 extended->share_mode = SM_SHARED;
15793 }
15794 }
15795 }
15796 extended->ref_count = ref_count - extended->shadow_depth;
15797
15798 for (i = 0; i < extended->shadow_depth; i++) {
15799 if ((tmp_obj = obj->shadow) == 0) {
15800 break;
15801 }
15802 vm_object_lock(tmp_obj);
15803 vm_object_unlock(obj);
15804
15805 if ((ref_count = os_ref_get_count_raw(&tmp_obj->ref_count)) > 1 &&
15806 tmp_obj->paging_in_progress) {
15807 ref_count--;
15808 }
15809
15810 extended->ref_count += ref_count;
15811 obj = tmp_obj;
15812 }
15813 vm_object_unlock(obj);
15814
15815 if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15816 extended->share_mode = SM_PRIVATE;
15817 } else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) {
15818 vm_map_entry_t cur;
15819 vm_map_entry_t last;
15820 int my_refs;
15821
15822 obj = VME_OBJECT(entry);
15823 last = vm_map_to_entry(map);
15824 my_refs = 0;
15825
15826 if ((ref_count = os_ref_get_count_raw(&obj->ref_count)) > 1 &&
15827 obj->paging_in_progress) {
15828 ref_count--;
15829 }
15830 for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15831 if (vm_map_region_has_obj_ref(cur, obj)) {
15832 my_refs++;
15833 }
15834 }
15835
15836 if (my_refs == ref_count) {
15837 extended->share_mode = SM_PRIVATE_ALIASED;
15838 } else if (my_refs > 1) {
15839 extended->share_mode = SM_SHARED_ALIASED;
15840 }
15841 }
15842 }
15843
15844
15845 /* object is locked on entry and locked on return */
15846
15847
15848 static void
15849 vm_map_region_look_for_page(
15850 __unused vm_map_t map,
15851 __unused vm_map_offset_t va,
15852 vm_object_t object,
15853 vm_object_offset_t offset,
15854 int max_refcnt,
15855 unsigned short depth,
15856 vm_region_extended_info_t extended,
15857 mach_msg_type_number_t count)
15858 {
15859 vm_page_t p;
15860 vm_object_t shadow;
15861 int ref_count;
15862 vm_object_t caller_object;
15863
15864 shadow = object->shadow;
15865 caller_object = object;
15866
15867
15868 while (TRUE) {
15869 if (!(object->internal)) {
15870 extended->external_pager = 1;
15871 }
15872
15873 if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
15874 if (shadow && (max_refcnt == 1)) {
15875 extended->pages_shared_now_private++;
15876 }
15877
15878 if (!p->vmp_fictitious &&
15879 (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
15880 extended->pages_dirtied++;
15881 } else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
15882 if (p->vmp_reusable || object->all_reusable) {
15883 extended->pages_reusable++;
15884 }
15885 }
15886
15887 extended->pages_resident++;
15888
15889 if (object != caller_object) {
15890 vm_object_unlock(object);
15891 }
15892
15893 return;
15894 }
15895 if (object->internal &&
15896 object->alive &&
15897 !object->terminating &&
15898 object->pager_ready) {
15899 if (vm_object_compressor_pager_state_get(object, offset)
15900 == VM_EXTERNAL_STATE_EXISTS) {
15901 /* the pager has that page */
15902 extended->pages_swapped_out++;
15903 if (object != caller_object) {
15904 vm_object_unlock(object);
15905 }
15906 return;
15907 }
15908 }
15909
15910 if (shadow) {
15911 vm_object_lock(shadow);
15912 if ((ref_count = os_ref_get_count_raw(&shadow->ref_count)) > 1 &&
15913 shadow->paging_in_progress) {
15914 ref_count--;
15915 }
15916
15917 if (++depth > extended->shadow_depth) {
15918 extended->shadow_depth = depth;
15919 }
15920
15921 if (ref_count > max_refcnt) {
15922 max_refcnt = ref_count;
15923 }
15924
15925 if (object != caller_object) {
15926 vm_object_unlock(object);
15927 }
15928
15929 offset = offset + object->vo_shadow_offset;
15930 object = shadow;
15931 shadow = object->shadow;
15932 continue;
15933 }
15934 if (object != caller_object) {
15935 vm_object_unlock(object);
15936 }
15937 break;
15938 }
15939 }
15940
15941 static inline boolean_t
15942 vm_map_region_has_obj_ref(
15943 vm_map_entry_t entry,
15944 vm_object_t object)
15945 {
15946 vm_object_t cur_obj;
15947 vm_object_t shadow_obj;
15948
15949 if (entry->is_sub_map) {
15950 return FALSE;
15951 }
15952
15953 cur_obj = VME_OBJECT(entry);
15954 if (cur_obj == VM_OBJECT_NULL) {
15955 return FALSE;
15956 } else if (cur_obj == object) {
15957 return TRUE;
15958 }
15959
15960 /*
15961 * Avoid locks for first shadow check, otherwise diagnostic tools will
15962 * spend most of their time obtaining locks in this function when analyzing
15963 * processes with many VM entries which may commonly have no shadow chain.
15964 *
15965 * This is acceptable because:
15966 * - Shadow's fields are not accessed outside of its lock
15967 * - Objects are unlikely to be modified due to:
15968 * - Many diagnostic tools suspend the task
15969 * - VM map is locked
15970 * - The rare incorrect return from this function turns a guess into a
15971 * slightly worse guess
15972 * - Entire shadow chain is not locked as a whole, so can still change
15973 * while traversing, resulting in incorrect guess even with locking
15974 */
15975 shadow_obj = cur_obj->shadow;
15976 if (shadow_obj == VM_OBJECT_NULL) {
15977 return FALSE;
15978 } else if (shadow_obj == object) {
15979 return TRUE;
15980 }
15981
15982 vm_object_lock(cur_obj);
15983
15984 while ((shadow_obj = cur_obj->shadow)) {
15985 /* check if object was found before grabbing a lock */
15986 if (shadow_obj == object) {
15987 vm_object_unlock(cur_obj);
15988 return TRUE;
15989 }
15990
15991 vm_object_lock(shadow_obj);
15992 vm_object_unlock(cur_obj);
15993 cur_obj = shadow_obj;
15994 }
15995
15996 /* exhausted the shadow chain */
15997 vm_object_unlock(cur_obj);
15998 return FALSE;
15999 }
16000
16001
16002 /*
16003 * Routine: vm_map_simplify
16004 *
16005 * Description:
16006 * Attempt to simplify the map representation in
16007 * the vicinity of the given starting address.
16008 * Note:
16009 * This routine is intended primarily to keep the
16010 * kernel maps more compact -- they generally don't
16011 * benefit from the "expand a map entry" technology
16012 * at allocation time because the adjacent entry
16013 * is often wired down.
16014 */
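/*
 * For illustration (hypothetical layout): two adjacent entries
 * [0x1000, 0x2000) and [0x2000, 0x3000), backed by the same object at
 * offsets 0x0 and 0x1000 and with identical protections, inheritance,
 * wiring and flags, are coalesced by vm_map_simplify_entry() into a
 * single entry [0x1000, 0x3000) at offset 0x0.
 */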
16015 void
16016 vm_map_simplify_entry(
16017 vm_map_t map,
16018 vm_map_entry_t this_entry)
16019 {
16020 vm_map_entry_t prev_entry;
16021
16022 prev_entry = this_entry->vme_prev;
16023
16024 if ((this_entry != vm_map_to_entry(map)) &&
16025 (prev_entry != vm_map_to_entry(map)) &&
16026
16027 (prev_entry->vme_end == this_entry->vme_start) &&
16028
16029 (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16030 (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16031 (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16032 ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16033 prev_entry->vme_start))
16034 == VME_OFFSET(this_entry)) &&
16035
16036 (prev_entry->behavior == this_entry->behavior) &&
16037 (prev_entry->needs_copy == this_entry->needs_copy) &&
16038 (prev_entry->protection == this_entry->protection) &&
16039 (prev_entry->max_protection == this_entry->max_protection) &&
16040 (prev_entry->inheritance == this_entry->inheritance) &&
16041 (prev_entry->use_pmap == this_entry->use_pmap) &&
16042 (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16043 (prev_entry->no_cache == this_entry->no_cache) &&
16044 (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16045 (prev_entry->map_aligned == this_entry->map_aligned) &&
16046 (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16047 (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16048 #if __arm64e__
16049 (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16050 #endif
16051 (prev_entry->csm_associated == this_entry->csm_associated) &&
16052 (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16053 (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16054 (prev_entry->vme_resilient_codesign ==
16055 this_entry->vme_resilient_codesign) &&
16056 (prev_entry->vme_resilient_media ==
16057 this_entry->vme_resilient_media) &&
16058 (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16059 (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16060
16061 (prev_entry->wired_count == this_entry->wired_count) &&
16062 (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16063
16064 ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16065 (prev_entry->in_transition == FALSE) &&
16066 (this_entry->in_transition == FALSE) &&
16067 (prev_entry->needs_wakeup == FALSE) &&
16068 (this_entry->needs_wakeup == FALSE) &&
16069 (prev_entry->is_shared == this_entry->is_shared) &&
16070 (prev_entry->superpage_size == FALSE) &&
16071 (this_entry->superpage_size == FALSE)
16072 ) {
16073 if (prev_entry->vme_permanent) {
16074 assert(this_entry->vme_permanent);
16075 prev_entry->vme_permanent = false;
16076 }
16077 vm_map_store_entry_unlink(map, prev_entry, true);
16078 assert(prev_entry->vme_start < this_entry->vme_end);
16079 if (prev_entry->map_aligned) {
16080 assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16081 VM_MAP_PAGE_MASK(map)));
16082 }
16083 this_entry->vme_start = prev_entry->vme_start;
16084 VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16085
16086 if (map->holelistenabled) {
16087 vm_map_store_update_first_free(map, this_entry, TRUE);
16088 }
16089
16090 if (prev_entry->is_sub_map) {
16091 vm_map_deallocate(VME_SUBMAP(prev_entry));
16092 } else {
16093 vm_object_deallocate(VME_OBJECT(prev_entry));
16094 }
16095 vm_map_entry_dispose(prev_entry);
16096 SAVE_HINT_MAP_WRITE(map, this_entry);
16097 }
16098 }
16099
16100 void
16101 vm_map_simplify(
16102 vm_map_t map,
16103 vm_map_offset_t start)
16104 {
16105 vm_map_entry_t this_entry;
16106
16107 vm_map_lock(map);
16108 if (vm_map_lookup_entry(map, start, &this_entry)) {
16109 vm_map_simplify_entry(map, this_entry);
16110 vm_map_simplify_entry(map, this_entry->vme_next);
16111 }
16112 vm_map_unlock(map);
16113 }
16114
16115 static void
16116 vm_map_simplify_range(
16117 vm_map_t map,
16118 vm_map_offset_t start,
16119 vm_map_offset_t end)
16120 {
16121 vm_map_entry_t entry;
16122
16123 /*
16124 * The map should be locked (for "write") by the caller.
16125 */
16126
16127 if (start >= end) {
16128 /* invalid address range */
16129 return;
16130 }
16131
16132 start = vm_map_trunc_page(start,
16133 VM_MAP_PAGE_MASK(map));
16134 end = vm_map_round_page(end,
16135 VM_MAP_PAGE_MASK(map));
16136
16137 if (!vm_map_lookup_entry(map, start, &entry)) {
16138 /* "start" is not mapped and "entry" ends before "start" */
16139 if (entry == vm_map_to_entry(map)) {
16140 /* start with first entry in the map */
16141 entry = vm_map_first_entry(map);
16142 } else {
16143 /* start with next entry */
16144 entry = entry->vme_next;
16145 }
16146 }
16147
16148 while (entry != vm_map_to_entry(map) &&
16149 entry->vme_start <= end) {
16150 /* try and coalesce "entry" with its previous entry */
16151 vm_map_simplify_entry(map, entry);
16152 entry = entry->vme_next;
16153 }
16154 }
16155
16156 static __attribute__((always_inline, warn_unused_result))
16157 kern_return_t
16158 vm_map_machine_attribute_sanitize(
16159 vm_map_t map,
16160 vm_map_offset_ut start_u,
16161 vm_map_offset_ut end_u,
16162 mach_vm_offset_t *start,
16163 mach_vm_offset_t *end,
16164 vm_map_size_t *size)
16165 {
16166 return vm_sanitize_addr_end(start_u, end_u,
16167 VM_SANITIZE_CALLER_VM_MAP_MACHINE_ATTRIBUTE, map,
16168 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
16169 size);
16170 }
16171
16172
16173 /*
16174 * Routine: vm_map_machine_attribute
16175 * Purpose:
16176 * Provide machine-specific attributes to mappings,
16177 * such as cachability etc. for machines that provide
16178 * them. NUMA architectures and machines with big/strange
16179 * caches will use this.
16180 * Note:
16181 * Responsibilities for locking and checking are handled here,
16182 * everything else in the pmap module. If any non-volatile
16183 * information must be kept, the pmap module should handle
16184 * it itself. [This assumes that attributes do not
16185 * need to be inherited, which seems ok to me]
16186 */
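/*
 * Illustrative (hypothetical) sketch, e.g. to flush the data cache for
 * a range after generating code into it:
 *
 *	vm_machine_attribute_val_t value = MATTR_VAL_CACHE_FLUSH;
 *	kr = vm_map_machine_attribute(map, start, end, MATTR_CACHE, &value);
 *
 * (argument sanitization/wrapping of the offsets elided)
 */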
16187 kern_return_t
16188 vm_map_machine_attribute(
16189 vm_map_t map,
16190 vm_map_offset_ut start_u,
16191 vm_map_offset_ut end_u,
16192 vm_machine_attribute_t attribute,
16193 vm_machine_attribute_val_t *value) /* IN/OUT */
16194 {
16195 mach_vm_offset_t start, end;
16196 vm_map_size_t sync_size;
16197 kern_return_t ret;
16198 vm_map_entry_t entry;
16199
16200 ret = vm_map_machine_attribute_sanitize(map,
16201 start_u,
16202 end_u,
16203 &start,
16204 &end,
16205 &sync_size);
16206 if (__improbable(ret != KERN_SUCCESS)) {
16207 return vm_sanitize_get_kr(ret);
16208 }
16209
16210 if (start < vm_map_min(map) || end > vm_map_max(map)) {
16211 return KERN_INVALID_ADDRESS;
16212 }
16213
16214 vm_map_lock(map);
16215
16216 if (attribute != MATTR_CACHE) {
16217 /* If we don't have to find physical addresses, we */
16218 /* don't have to do an explicit traversal here. */
16219 ret = pmap_attribute(map->pmap, start, end - start,
16220 attribute, value);
16221 vm_map_unlock(map);
16222 return ret;
16223 }
16224
16225 ret = KERN_SUCCESS; /* Assume it all worked */
16226
16227 while (sync_size) {
16228 if (vm_map_lookup_entry(map, start, &entry)) {
16229 vm_map_size_t sub_size;
16230 if ((entry->vme_end - start) > sync_size) {
16231 sub_size = sync_size;
16232 sync_size = 0;
16233 } else {
16234 sub_size = entry->vme_end - start;
16235 sync_size -= sub_size;
16236 }
16237 if (entry->is_sub_map) {
16238 vm_map_offset_t sub_start;
16239 vm_map_offset_t sub_end;
16240
16241 sub_start = (start - entry->vme_start)
16242 + VME_OFFSET(entry);
16243 sub_end = sub_start + sub_size;
16244 vm_map_machine_attribute(
16245 VME_SUBMAP(entry),
16246 sub_start,
16247 sub_end,
16248 attribute, value);
16249 } else if (VME_OBJECT(entry)) {
16250 vm_page_t m;
16251 vm_object_t object;
16252 vm_object_t base_object;
16253 vm_object_t last_object;
16254 vm_object_offset_t offset;
16255 vm_object_offset_t base_offset;
16256 vm_map_size_t range;
16257 range = sub_size;
16258 offset = (start - entry->vme_start)
16259 + VME_OFFSET(entry);
16260 offset = vm_object_trunc_page(offset);
16261 base_offset = offset;
16262 object = VME_OBJECT(entry);
16263 base_object = object;
16264 last_object = NULL;
16265
16266 vm_object_lock(object);
16267
16268 while (range) {
16269 m = vm_page_lookup(
16270 object, offset);
16271
16272 if (m && !m->vmp_fictitious) {
16273 ret =
16274 pmap_attribute_cache_sync(
16275 VM_PAGE_GET_PHYS_PAGE(m),
16276 PAGE_SIZE,
16277 attribute, value);
16278 } else if (object->shadow) {
16279 offset = offset + object->vo_shadow_offset;
16280 last_object = object;
16281 object = object->shadow;
16282 vm_object_lock(last_object->shadow);
16283 vm_object_unlock(last_object);
16284 continue;
16285 }
16286 if (range < PAGE_SIZE) {
16287 range = 0;
16288 } else {
16289 range -= PAGE_SIZE;
16290 }
16291
16292 if (base_object != object) {
16293 vm_object_unlock(object);
16294 vm_object_lock(base_object);
16295 object = base_object;
16296 }
16297 /* Bump to the next page */
16298 base_offset += PAGE_SIZE;
16299 offset = base_offset;
16300 }
16301 vm_object_unlock(object);
16302 }
16303 start += sub_size;
16304 } else {
16305 vm_map_unlock(map);
16306 return KERN_FAILURE;
16307 }
16308 }
16309
16310 vm_map_unlock(map);
16311
16312 return ret;
16313 }
16314
16315 /*
16316 * vm_map_behavior_set:
16317 *
16318 * Sets the paging reference behavior of the specified address
16319 * range in the target map. Paging reference behavior affects
16320 * how pagein operations resulting from faults on the map will be
16321 * clustered.
16322 */
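/*
 * For reference: the user-visible entry point is madvise(2); e.g.
 * MADV_SEQUENTIAL and MADV_RANDOM are expected to arrive here as
 * VM_BEHAVIOR_SEQUENTIAL and VM_BEHAVIOR_RANDOM (the translation is
 * done in the BSD layer, not in this file).
 */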
16323 kern_return_t
16324 vm_map_behavior_set(
16325 vm_map_t map,
16326 vm_map_offset_t start,
16327 vm_map_offset_t end,
16328 vm_behavior_t new_behavior)
16329 {
16330 vm_map_entry_t entry;
16331 vm_map_entry_t temp_entry;
16332
16333 if (start > end ||
16334 start < vm_map_min(map) ||
16335 end > vm_map_max(map)) {
16336 return KERN_NO_SPACE;
16337 }
16338 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16339 return KERN_INVALID_ADDRESS;
16340 }
16341
16342 switch (new_behavior) {
16343 /*
16344 * This first block of behaviors all set a persistent state on the specified
16345 * memory range. All we have to do here is to record the desired behavior
16346 * in the vm_map_entry_t's.
16347 */
16348
16349 case VM_BEHAVIOR_DEFAULT:
16350 case VM_BEHAVIOR_RANDOM:
16351 case VM_BEHAVIOR_SEQUENTIAL:
16352 case VM_BEHAVIOR_RSEQNTL:
16353 case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16354 vm_map_lock(map);
16355
16356 /*
16357 * The entire address range must be valid for the map.
16358 * Note that vm_map_range_check() does a
16359 * vm_map_lookup_entry() internally and returns the
16360 * entry containing the start of the address range if
16361 * the entire range is valid.
16362 */
16363 if (vm_map_range_check(map, start, end, &temp_entry)) {
16364 entry = temp_entry;
16365 vm_map_clip_start(map, entry, start);
16366 } else {
16367 vm_map_unlock(map);
16368 return KERN_INVALID_ADDRESS;
16369 }
16370
16371 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16372 vm_map_clip_end(map, entry, end);
16373 if (entry->is_sub_map) {
16374 assert(!entry->use_pmap);
16375 }
16376
16377 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16378 entry->zero_wired_pages = TRUE;
16379 } else {
16380 entry->behavior = new_behavior;
16381 }
16382 entry = entry->vme_next;
16383 }
16384
16385 vm_map_unlock(map);
16386 break;
16387
16388 /*
16389 * The rest of these are different from the above in that they cause
16390 * an immediate action to take place as opposed to setting a behavior that
16391 * affects future actions.
16392 */
16393
16394 case VM_BEHAVIOR_WILLNEED:
16395 return vm_map_willneed(map, start, end);
16396
16397 case VM_BEHAVIOR_DONTNEED:
16398 return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16399
16400 case VM_BEHAVIOR_FREE:
16401 return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16402
16403 case VM_BEHAVIOR_REUSABLE:
16404 return vm_map_reusable_pages(map, start, end);
16405
16406 case VM_BEHAVIOR_REUSE:
16407 return vm_map_reuse_pages(map, start, end);
16408
16409 case VM_BEHAVIOR_CAN_REUSE:
16410 return vm_map_can_reuse(map, start, end);
16411
16412 #if MACH_ASSERT
16413 case VM_BEHAVIOR_PAGEOUT:
16414 return vm_map_pageout(map, start, end);
16415 #endif /* MACH_ASSERT */
16416
16417 case VM_BEHAVIOR_ZERO:
16418 return vm_map_zero(map, start, end);
16419
16420 default:
16421 return KERN_INVALID_ARGUMENT;
16422 }
16423
16424 return KERN_SUCCESS;
16425 }
16426
16427
16428 /*
16429 * Internals for madvise(MADV_WILLNEED) system call.
16430 *
16431  * The implementation is to do the following:
16432 * a) read-ahead if the mapping corresponds to a mapped regular file
16433 * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
16434 */
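/*
 * Illustrative user-level trigger (hypothetical file and length):
 *
 *	void *p = mmap(NULL, len, PROT_READ, MAP_FILE | MAP_PRIVATE, fd, 0);
 *	madvise(p, len, MADV_WILLNEED);	// read-ahead ends up in vm_map_willneed()
 */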
16435
16436
16437 static kern_return_t
16438 vm_map_willneed(
16439 vm_map_t map,
16440 vm_map_offset_t start,
16441 vm_map_offset_t end
16442 )
16443 {
16444 vm_map_entry_t entry;
16445 vm_object_t object;
16446 memory_object_t pager;
16447 struct vm_object_fault_info fault_info = {};
16448 kern_return_t kr;
16449 vm_object_size_t len;
16450 vm_object_offset_t offset;
16451
16452 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START,
16453 task_pid(current_task()), start, end);
16454 fault_info.interruptible = THREAD_UNINT; /* ignored value */
16455 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
16456 fault_info.stealth = TRUE;
16457
16458 /*
16459 * The MADV_WILLNEED operation doesn't require any changes to the
16460 * vm_map_entry_t's, so the read lock is sufficient.
16461 */
16462
16463 vm_map_lock_read(map);
16464
16465 /*
16466 * The madvise semantics require that the address range be fully
16467 * allocated with no holes. Otherwise, we're required to return
16468 * an error.
16469 */
16470
16471 if (!vm_map_range_check(map, start, end, &entry)) {
16472 vm_map_unlock_read(map);
16473 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16474 task_pid(current_task()), start, KERN_INVALID_ADDRESS);
16475 return KERN_INVALID_ADDRESS;
16476 }
16477
16478 /*
16479 * Examine each vm_map_entry_t in the range.
16480 */
16481 for (; entry != vm_map_to_entry(map) && start < end;) {
16482 /*
16483 * The first time through, the start address could be anywhere
16484 * within the vm_map_entry we found. So adjust the offset to
16485 * correspond. After that, the offset will always be zero to
16486 * correspond to the beginning of the current vm_map_entry.
16487 */
16488 offset = (start - entry->vme_start) + VME_OFFSET(entry);
16489
16490 /*
16491 * Set the length so we don't go beyond the end of the
16492 * map_entry or beyond the end of the range we were given.
16493 * This range could span also multiple map entries all of which
16494 * map different files, so make sure we only do the right amount
16495 * of I/O for each object. Note that it's possible for there
16496 * to be multiple map entries all referring to the same object
16497 * but with different page permissions, but it's not worth
16498 * trying to optimize that case.
16499 */
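		/*
		 * For illustration (hypothetical values): with an entry
		 * covering [0x10000, 0x20000), start == 0x18000 and
		 * end == 0x30000, len is MIN(0x8000, 0x18000) == 0x8000, so
		 * I/O is issued only up to the end of this entry and the
		 * loop moves on to the next entry for the remainder.
		 */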
16500 len = MIN(entry->vme_end - start, end - start);
16501
16502 if ((vm_size_t) len != len) {
16503 /* 32-bit overflow */
16504 len = (vm_size_t) (0 - PAGE_SIZE);
16505 }
16506 fault_info.cluster_size = (vm_size_t) len;
16507 fault_info.lo_offset = offset;
16508 fault_info.hi_offset = offset + len;
16509 fault_info.user_tag = VME_ALIAS(entry);
16510 fault_info.pmap_options = 0;
16511 if (entry->iokit_acct ||
16512 (!entry->is_sub_map && !entry->use_pmap)) {
16513 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
16514 }
16515 fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;
16516
16517 /*
16518 * If the entry is a submap OR there's no read permission
16519 * to this mapping, then just skip it.
16520 */
16521 if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
16522 entry = entry->vme_next;
16523 start = entry->vme_start;
16524 continue;
16525 }
16526
16527 object = VME_OBJECT(entry);
16528
16529 if (object == NULL ||
16530 (object && object->internal)) {
16531 /*
16532 * Memory range backed by anonymous memory.
16533 */
16534 vm_size_t region_size = 0, effective_page_size = 0;
16535 vm_map_offset_t addr = 0, effective_page_mask = 0;
16536
16537 region_size = len;
16538 addr = start;
16539
16540 effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
16541 effective_page_size = effective_page_mask + 1;
16542
16543 vm_map_unlock_read(map);
16544
16545 while (region_size) {
16546 vm_pre_fault(
16547 vm_map_trunc_page(addr, effective_page_mask),
16548 VM_PROT_READ | VM_PROT_WRITE);
16549
16550 region_size -= effective_page_size;
16551 addr += effective_page_size;
16552 }
16553 } else {
16554 /*
16555 * Find the file object backing this map entry. If there is
16556 * none, then we simply ignore the "will need" advice for this
16557 * entry and go on to the next one.
16558 */
16559 if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
16560 entry = entry->vme_next;
16561 start = entry->vme_start;
16562 continue;
16563 }
16564
16565 vm_object_paging_begin(object);
16566 pager = object->pager;
16567 vm_object_unlock(object);
16568
16569 /*
16570 * The data_request() could take a long time, so let's
16571 * release the map lock to avoid blocking other threads.
16572 */
16573 vm_map_unlock_read(map);
16574
16575 /*
16576 * Get the data from the object asynchronously.
16577 *
16578 * Note that memory_object_data_request() places limits on the
16579 * amount of I/O it will do. Regardless of the len we
16580 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
16581 * silently truncates the len to that size. This isn't
16582 * necessarily bad since madvise shouldn't really be used to
16583 * page in unlimited amounts of data. Other Unix variants
16584 * limit the willneed case as well. If this turns out to be an
16585 * issue for developers, then we can always adjust the policy
16586 * here and still be backwards compatible since this is all
16587 * just "advice".
16588 */
16589 kr = memory_object_data_request(
16590 pager,
16591 vm_object_trunc_page(offset) + object->paging_offset,
16592 0, /* ignored */
16593 VM_PROT_READ,
16594 (memory_object_fault_info_t)&fault_info);
16595
16596 vm_object_lock(object);
16597 vm_object_paging_end(object);
16598 vm_object_unlock(object);
16599
16600 /*
16601 * If we couldn't do the I/O for some reason, just give up on
16602 * the madvise. We still return success to the user since
16603 * madvise isn't supposed to fail when the advice can't be
16604 * taken.
16605 */
16606
16607 if (kr != KERN_SUCCESS) {
16608 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16609 task_pid(current_task()), start, kr);
16610 return KERN_SUCCESS;
16611 }
16612 }
16613
16614 start += len;
16615 if (start >= end) {
16616 /* done */
16617 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16618 task_pid(current_task()), start, KERN_SUCCESS);
16619 return KERN_SUCCESS;
16620 }
16621
16622 /* look up next entry */
16623 vm_map_lock_read(map);
16624 if (!vm_map_lookup_entry(map, start, &entry)) {
16625 /*
16626 * There's a new hole in the address range.
16627 */
16628 vm_map_unlock_read(map);
16629 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16630 task_pid(current_task()), start, KERN_INVALID_ADDRESS);
16631 return KERN_INVALID_ADDRESS;
16632 }
16633 }
16634
16635 vm_map_unlock_read(map);
16636 KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
16637 task_pid(current_task()), start, KERN_SUCCESS);
16638 return KERN_SUCCESS;
16639 }
16640
16641 static boolean_t
16642 vm_map_entry_is_reusable(
16643 vm_map_entry_t entry)
16644 {
16645 /* Only user map entries */
16646
16647 vm_object_t object;
16648
16649 if (entry->is_sub_map) {
16650 return FALSE;
16651 }
16652
16653 switch (VME_ALIAS(entry)) {
16654 case VM_MEMORY_MALLOC:
16655 case VM_MEMORY_MALLOC_SMALL:
16656 case VM_MEMORY_MALLOC_LARGE:
16657 case VM_MEMORY_REALLOC:
16658 case VM_MEMORY_MALLOC_TINY:
16659 case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16660 case VM_MEMORY_MALLOC_LARGE_REUSED:
16661 /*
16662 * This is a malloc() memory region: check if it's still
16663 * in its original state and can be re-used for more
16664 * malloc() allocations.
16665 */
16666 break;
16667 default:
16668 /*
16669 * Not a malloc() memory region: let the caller decide if
16670 * it's re-usable.
16671 */
16672 return TRUE;
16673 }
16674
16675 if (/*entry->is_shared ||*/
16676 entry->is_sub_map ||
16677 entry->in_transition ||
16678 entry->protection != VM_PROT_DEFAULT ||
16679 entry->max_protection != VM_PROT_ALL ||
16680 entry->inheritance != VM_INHERIT_DEFAULT ||
16681 entry->no_cache ||
16682 entry->vme_permanent ||
16683 entry->superpage_size != FALSE ||
16684 entry->zero_wired_pages ||
16685 entry->wired_count != 0 ||
16686 entry->user_wired_count != 0) {
16687 return FALSE;
16688 }
16689
16690 object = VME_OBJECT(entry);
16691 if (object == VM_OBJECT_NULL) {
16692 return TRUE;
16693 }
16694 if (
16695 #if 0
16696 /*
16697 * Let's proceed even if the VM object is potentially
16698 * shared.
16699 * We check for this later when processing the actual
16700 * VM pages, so the contents will be safe if shared.
16701 *
16702 * But we can still mark this memory region as "reusable" to
16703 * acknowledge that the caller did let us know that the memory
16704 * could be re-used and should not be penalized for holding
16705 * on to it. This allows its "resident size" to not include
16706 * the reusable range.
16707 */
16708 object->ref_count == 1 &&
16709 #endif
16710 object->vo_copy == VM_OBJECT_NULL &&
16711 object->shadow == VM_OBJECT_NULL &&
16712 object->internal &&
16713 object->purgable == VM_PURGABLE_DENY &&
16714 object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16715 !object->code_signed) {
16716 return TRUE;
16717 }
16718 return FALSE;
16719 }
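/*
 * Hedged example (not from the original source): the VM_MEMORY_MALLOC*
 * aliases tested above are normally applied by a user-space allocator
 * when it creates the region, by encoding a tag in the "fd" argument of
 * an anonymous mmap(2):
 *
 *    #include <sys/mman.h>
 *    #include <mach/vm_statistics.h>
 *
 *    void *region = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *        MAP_ANON | MAP_PRIVATE,
 *        VM_MAKE_TAG(VM_MEMORY_MALLOC_LARGE), 0);
 *
 * Regions carrying other tags fall into the "default" case above and are
 * left for the caller to judge.
 */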
16720
16721 static kern_return_t
16722 vm_map_reuse_pages(
16723 vm_map_t map,
16724 vm_map_offset_t start,
16725 vm_map_offset_t end)
16726 {
16727 vm_map_entry_t entry;
16728 vm_object_t object;
16729 vm_object_offset_t start_offset, end_offset;
16730
16731 /*
16732 * The MADV_REUSE operation doesn't require any changes to the
16733 * vm_map_entry_t's, so the read lock is sufficient.
16734 */
16735
16736 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16737 /*
16738 * XXX TODO4K
16739 * need to figure out what reusable means for a
16740 * portion of a native page.
16741 */
16742 return KERN_SUCCESS;
16743 }
16744
16745 vm_map_lock_read(map);
16746 assert(map->pmap != kernel_pmap); /* protect alias access */
16747
16748 /*
16749 * The madvise semantics require that the address range be fully
16750 * allocated with no holes. Otherwise, we're required to return
16751 * an error.
16752 */
16753
16754 if (!vm_map_range_check(map, start, end, &entry)) {
16755 vm_map_unlock_read(map);
16756 vm_page_stats_reusable.reuse_pages_failure++;
16757 return KERN_INVALID_ADDRESS;
16758 }
16759
16760 /*
16761 * Examine each vm_map_entry_t in the range.
16762 */
16763 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16764 entry = entry->vme_next) {
16765 /*
16766 * Sanity check on the VM map entry.
16767 */
16768 if (!vm_map_entry_is_reusable(entry)) {
16769 vm_map_unlock_read(map);
16770 vm_page_stats_reusable.reuse_pages_failure++;
16771 return KERN_INVALID_ADDRESS;
16772 }
16773
16774 /*
16775 * The first time through, the start address could be anywhere
16776 * within the vm_map_entry we found. So adjust the offset to
16777 * correspond.
16778 */
16779 if (entry->vme_start < start) {
16780 start_offset = start - entry->vme_start;
16781 } else {
16782 start_offset = 0;
16783 }
16784 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16785 start_offset += VME_OFFSET(entry);
16786 end_offset += VME_OFFSET(entry);
16787
16788 object = VME_OBJECT(entry);
16789 if (object != VM_OBJECT_NULL) {
16790 vm_object_lock(object);
16791 vm_object_reuse_pages(object, start_offset, end_offset,
16792 TRUE);
16793 vm_object_unlock(object);
16794 }
16795
16796 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16797 /*
16798 * XXX
16799 * We do not hold the VM map exclusively here.
16800 * The "alias" field is not that critical, so it's
16801 * safe to update it here, as long as it is the only
16802 * one that can be modified while holding the VM map
16803 * "shared".
16804 */
16805 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16806 }
16807 }
16808
16809 vm_map_unlock_read(map);
16810 vm_page_stats_reusable.reuse_pages_success++;
16811 return KERN_SUCCESS;
16812 }
16813
16814
16815 static kern_return_t
16816 vm_map_reusable_pages(
16817 vm_map_t map,
16818 vm_map_offset_t start,
16819 vm_map_offset_t end)
16820 {
16821 vm_map_entry_t entry;
16822 vm_object_t object;
16823 vm_object_offset_t start_offset, end_offset;
16824 vm_map_offset_t pmap_offset;
16825
16826 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16827 /*
16828 * XXX TODO4K
16829 * need to figure out what reusable means for a portion
16830 * of a native page.
16831 */
16832 return KERN_SUCCESS;
16833 }
16834
16835 /*
16836 * The MADV_REUSABLE operation doesn't require any changes to the
16837 * vm_map_entry_t's, so the read lock is sufficient.
16838 */
16839
16840 vm_map_lock_read(map);
16841 assert(map->pmap != kernel_pmap); /* protect alias access */
16842
16843 /*
16844 * The madvise semantics require that the address range be fully
16845 * allocated with no holes. Otherwise, we're required to return
16846 * an error.
16847 */
16848
16849 if (!vm_map_range_check(map, start, end, &entry)) {
16850 vm_map_unlock_read(map);
16851 vm_page_stats_reusable.reusable_pages_failure++;
16852 return KERN_INVALID_ADDRESS;
16853 }
16854
16855 /*
16856 * Examine each vm_map_entry_t in the range.
16857 */
16858 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16859 entry = entry->vme_next) {
16860 int kill_pages = 0;
16861 boolean_t reusable_no_write = FALSE;
16862
16863 /*
16864 * Sanity check on the VM map entry.
16865 */
16866 if (!vm_map_entry_is_reusable(entry)) {
16867 vm_map_unlock_read(map);
16868 vm_page_stats_reusable.reusable_pages_failure++;
16869 return KERN_INVALID_ADDRESS;
16870 }
16871
16872 if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
16873 #if __arm64e__
16874 && !entry->used_for_tpro
16875 #endif
16876 ) {
16877 /* not writable: can't discard contents */
16878 vm_map_unlock_read(map);
16879 vm_page_stats_reusable.reusable_nonwritable++;
16880 vm_page_stats_reusable.reusable_pages_failure++;
16881 return KERN_PROTECTION_FAILURE;
16882 }
16883
16884 /*
16885 * The first time through, the start address could be anywhere
16886 * within the vm_map_entry we found. So adjust the offset to
16887 * correspond.
16888 */
16889 if (entry->vme_start < start) {
16890 start_offset = start - entry->vme_start;
16891 pmap_offset = start;
16892 } else {
16893 start_offset = 0;
16894 pmap_offset = entry->vme_start;
16895 }
16896 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16897 start_offset += VME_OFFSET(entry);
16898 end_offset += VME_OFFSET(entry);
16899
16900 object = VME_OBJECT(entry);
16901 if (object == VM_OBJECT_NULL) {
16902 continue;
16903 }
16904
16905 if (entry->protection & VM_PROT_EXECUTE) {
16906 /*
16907 * Executable mappings might be write-protected by
16908 * hardware, so do not attempt to write to these pages.
16909 */
16910 reusable_no_write = TRUE;
16911 }
16912
16913 if (entry->vme_xnu_user_debug) {
16914 /*
16915 * User debug pages might be write-protected by hardware,
16916 * so do not attempt to write to these pages.
16917 */
16918 reusable_no_write = TRUE;
16919 }
16920
16921 vm_object_lock(object);
16922 if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
16923 (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
16924 object->vo_copy == VM_OBJECT_NULL)) &&
16925 object->shadow == VM_OBJECT_NULL &&
16926 /*
16927 * "iokit_acct" entries are billed for their virtual size
16928 * (rather than for their resident pages only), so they
16929 * wouldn't benefit from making pages reusable, and it
16930 * would be hard to keep track of pages that are both
16931 * "iokit_acct" and "reusable" in the pmap stats and
16932 * ledgers.
16933 */
16934 !(entry->iokit_acct ||
16935 (!entry->is_sub_map && !entry->use_pmap))) {
16936 if (os_ref_get_count_raw(&object->ref_count) != 1) {
16937 vm_page_stats_reusable.reusable_shared++;
16938 }
16939 kill_pages = 1;
16940 } else {
16941 kill_pages = -1;
16942 }
16943 if (kill_pages != -1) {
16944 vm_object_deactivate_pages(object,
16945 start_offset,
16946 end_offset - start_offset,
16947 kill_pages,
16948 TRUE /*reusable_pages*/,
16949 reusable_no_write,
16950 map->pmap,
16951 pmap_offset);
16952 } else {
16953 vm_page_stats_reusable.reusable_pages_shared++;
16954 DTRACE_VM4(vm_map_reusable_pages_shared,
16955 unsigned int, VME_ALIAS(entry),
16956 vm_map_t, map,
16957 vm_map_entry_t, entry,
16958 vm_object_t, object);
16959 }
16960 vm_object_unlock(object);
16961
16962 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
16963 VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
16964 /*
16965 * XXX
16966 * We do not hold the VM map exclusively here.
16967 * The "alias" field is not that critical, so it's
16968 * safe to update it here, as long as it is the only
16969 * one that can be modified while holding the VM map
16970 * "shared".
16971 */
16972 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
16973 }
16974 }
16975
16976 vm_map_unlock_read(map);
16977 vm_page_stats_reusable.reusable_pages_success++;
16978 return KERN_SUCCESS;
16979 }
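/*
 * Hedged usage sketch (not part of the original source): together,
 * vm_map_reusable_pages() and vm_map_reuse_pages() back the Darwin
 * madvise(2) deferred-free protocol that allocators use, typically via
 * MADV_FREE_REUSABLE / MADV_FREE_REUSE:
 *
 *    #include <sys/mman.h>
 *
 *    // Block is idle: its pages may be discarded and no longer count
 *    // toward the task's resident footprint.
 *    madvise(block, block_size, MADV_FREE_REUSABLE);
 *    ...
 *    // Before touching the block again, take it back; any pages that
 *    // were discarded come back zero-filled.
 *    madvise(block, block_size, MADV_FREE_REUSE);
 */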
16980
16981
16982 static kern_return_t
16983 vm_map_can_reuse(
16984 vm_map_t map,
16985 vm_map_offset_t start,
16986 vm_map_offset_t end)
16987 {
16988 vm_map_entry_t entry;
16989
16990 /*
16991 * The MADV_REUSABLE operation doesn't require any changes to the
16992 * vm_map_entry_t's, so the read lock is sufficient.
16993 */
16994
16995 vm_map_lock_read(map);
16996 assert(map->pmap != kernel_pmap); /* protect alias access */
16997
16998 /*
16999 * The madvise semantics require that the address range be fully
17000 * allocated with no holes. Otherwise, we're required to return
17001 * an error.
17002 */
17003
17004 if (!vm_map_range_check(map, start, end, &entry)) {
17005 vm_map_unlock_read(map);
17006 vm_page_stats_reusable.can_reuse_failure++;
17007 return KERN_INVALID_ADDRESS;
17008 }
17009
17010 /*
17011 * Examine each vm_map_entry_t in the range.
17012 */
17013 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17014 entry = entry->vme_next) {
17015 /*
17016 * Sanity check on the VM map entry.
17017 */
17018 if (!vm_map_entry_is_reusable(entry)) {
17019 vm_map_unlock_read(map);
17020 vm_page_stats_reusable.can_reuse_failure++;
17021 return KERN_INVALID_ADDRESS;
17022 }
17023 }
17024
17025 vm_map_unlock_read(map);
17026 vm_page_stats_reusable.can_reuse_success++;
17027 return KERN_SUCCESS;
17028 }
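/*
 * Hedged example (not from the original source): vm_map_can_reuse() is a
 * pure preflight, typically reached via madvise(MADV_CAN_REUSE). It
 * succeeds only if every entry in the range passes
 * vm_map_entry_is_reusable(), and it changes nothing:
 *
 *    if (madvise(block, block_size, MADV_CAN_REUSE) == 0) {
 *        // eligible: mark it MADV_FREE_REUSABLE later when it goes idle
 *    }
 */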
17029
17030
17031 #if MACH_ASSERT
17032 static kern_return_t
17033 vm_map_pageout(
17034 vm_map_t map,
17035 vm_map_offset_t start,
17036 vm_map_offset_t end)
17037 {
17038 vm_map_entry_t entry;
17039
17040 /*
17041 * The MADV_PAGEOUT operation doesn't require any changes to the
17042 * vm_map_entry_t's, so the read lock is sufficient.
17043 */
17044
17045 vm_map_lock_read(map);
17046
17047 /*
17048 * The madvise semantics require that the address range be fully
17049 * allocated with no holes. Otherwise, we're required to return
17050 * an error.
17051 */
17052
17053 if (!vm_map_range_check(map, start, end, &entry)) {
17054 vm_map_unlock_read(map);
17055 return KERN_INVALID_ADDRESS;
17056 }
17057
17058 /*
17059 * Examine each vm_map_entry_t in the range.
17060 */
17061 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17062 entry = entry->vme_next) {
17063 vm_object_t object;
17064
17065 /*
17066 * Sanity check on the VM map entry.
17067 */
17068 if (entry->is_sub_map) {
17069 vm_map_t submap;
17070 vm_map_offset_t submap_start;
17071 vm_map_offset_t submap_end;
17072 vm_map_entry_t submap_entry;
17073
17074 submap = VME_SUBMAP(entry);
17075 submap_start = VME_OFFSET(entry);
17076 submap_end = submap_start + (entry->vme_end -
17077 entry->vme_start);
17078
17079 vm_map_lock_read(submap);
17080
17081 if (!vm_map_range_check(submap,
17082 submap_start,
17083 submap_end,
17084 &submap_entry)) {
17085 vm_map_unlock_read(submap);
17086 vm_map_unlock_read(map);
17087 return KERN_INVALID_ADDRESS;
17088 }
17089
17090 if (submap_entry->is_sub_map) {
17091 vm_map_unlock_read(submap);
17092 continue;
17093 }
17094
17095 object = VME_OBJECT(submap_entry);
17096 if (object == VM_OBJECT_NULL || !object->internal) {
17097 vm_map_unlock_read(submap);
17098 continue;
17099 }
17100
17101 vm_object_pageout(object);
17102
17103 vm_map_unlock_read(submap);
17104 submap = VM_MAP_NULL;
17105 submap_entry = VM_MAP_ENTRY_NULL;
17106 continue;
17107 }
17108
17109 object = VME_OBJECT(entry);
17110 if (object == VM_OBJECT_NULL || !object->internal) {
17111 continue;
17112 }
17113
17114 vm_object_pageout(object);
17115 }
17116
17117 vm_map_unlock_read(map);
17118 return KERN_SUCCESS;
17119 }
17120 #endif /* MACH_ASSERT */
17121
17122 /*
17123 * This function determines if the zero operation can be run on the
17124 * respective entry. Additional checks on the object are in
17125 * vm_object_zero_preflight.
17126 */
17127 static kern_return_t
17128 vm_map_zero_entry_preflight(vm_map_entry_t entry)
17129 {
17130 /*
17131 * Zeroing is restricted to writable non-executable entries and non-JIT
17132 * regions.
17133 */
17134 if (!(entry->protection & VM_PROT_WRITE) ||
17135 (entry->protection & VM_PROT_EXECUTE) ||
17136 entry->used_for_jit ||
17137 entry->vme_xnu_user_debug) {
17138 return KERN_PROTECTION_FAILURE;
17139 }
17140
17141 /*
17142 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17143 * allowed for submaps.
17144 */
17145 if (entry->needs_copy || entry->is_sub_map) {
17146 return KERN_NO_ACCESS;
17147 }
17148
17149 return KERN_SUCCESS;
17150 }
17151
17152 /*
17153 * This function translates the entry's start and end addresses to offsets within the backing object
17154 */
17155 static void
17156 vm_map_get_bounds_in_object(
17157 vm_map_entry_t entry,
17158 vm_map_offset_t start,
17159 vm_map_offset_t end,
17160 vm_map_offset_t *start_offset,
17161 vm_map_offset_t *end_offset)
17162 {
17163 if (entry->vme_start < start) {
17164 *start_offset = start - entry->vme_start;
17165 } else {
17166 *start_offset = 0;
17167 }
17168 *end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17169 *start_offset += VME_OFFSET(entry);
17170 *end_offset += VME_OFFSET(entry);
17171 }
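/*
 * Worked example (illustrative only): for an entry with
 * vme_start = 0x10000, vme_end = 0x20000 and VME_OFFSET(entry) = 0x5000,
 * a request for [0x12000, 0x30000) yields
 *    *start_offset = (0x12000 - 0x10000) + 0x5000 = 0x7000
 *    *end_offset   = (0x20000 - 0x10000) + 0x5000 = 0x15000
 * i.e. the part of the request that falls within this entry, expressed
 * as offsets into the backing VM object.
 */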
17172
17173 /*
17174 * This function iterates through the entries in the requested range
17175 * and zeroes any resident pages in the corresponding objects. Compressed
17176 * pages are dropped instead of being faulted in and zeroed.
17177 */
17178 static kern_return_t
17179 vm_map_zero(
17180 vm_map_t map,
17181 vm_map_offset_t start,
17182 vm_map_offset_t end)
17183 {
17184 vm_map_entry_t entry;
17185 vm_map_offset_t cur = start;
17186 kern_return_t ret;
17187
17188 /*
17189 * This operation isn't supported where the map page size is less than
17190 * the hardware page size. The caller will need to handle the error and
17191 * explicitly zero the memory if needed.
17192 */
17193 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17194 return KERN_NO_ACCESS;
17195 }
17196
17197 /*
17198 * The MADV_ZERO operation doesn't require any changes to the
17199 * vm_map_entry_t's, so the read lock is sufficient.
17200 */
17201 vm_map_lock_read(map);
17202 assert(map->pmap != kernel_pmap); /* protect alias access */
17203
17204 /*
17205 * The madvise semantics require that the address range be fully
17206 * allocated with no holes. Otherwise, we're required to return
17207 * an error. This check needs to be redone if the map has changed.
17208 */
17209 if (!vm_map_range_check(map, cur, end, &entry)) {
17210 vm_map_unlock_read(map);
17211 return KERN_INVALID_ADDRESS;
17212 }
17213
17214 /*
17215 * Examine each vm_map_entry_t in the range.
17216 */
17217 while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17218 vm_map_offset_t cur_offset;
17219 vm_map_offset_t end_offset;
17220 unsigned int last_timestamp = map->timestamp;
17221 vm_object_t object = VME_OBJECT(entry);
17222
17223 ret = vm_map_zero_entry_preflight(entry);
17224 if (ret != KERN_SUCCESS) {
17225 vm_map_unlock_read(map);
17226 return ret;
17227 }
17228
17229 if (object == VM_OBJECT_NULL) {
17230 entry = entry->vme_next;
17231 continue;
17232 }
17233
17234 vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17235 vm_object_lock(object);
17236 /*
17237 * Take a reference on the object as vm_object_zero will drop the object
17238 * lock when it encounters a busy page.
17239 */
17240 vm_object_reference_locked(object);
17241 vm_map_unlock_read(map);
17242
17243 ret = vm_object_zero(object, cur_offset, end_offset);
17244 vm_object_unlock(object);
17245 vm_object_deallocate(object);
17246 if (ret != KERN_SUCCESS) {
17247 return ret;
17248 }
17249 /*
17250 * Update cur as vm_object_zero has succeeded.
17251 */
17252 cur += (end_offset - cur_offset);
17253 if (cur == end) {
17254 return KERN_SUCCESS;
17255 }
17256
17257 /*
17258 * If the map timestamp has changed, restart by re-looking up cur
17259 * in the map.
17260 */
17261 vm_map_lock_read(map);
17262 if (last_timestamp != map->timestamp) {
17263 /*
17264 * Relookup cur in the map
17265 */
17266 if (!vm_map_range_check(map, cur, end, &entry)) {
17267 vm_map_unlock_read(map);
17268 return KERN_INVALID_ADDRESS;
17269 }
17270 continue;
17271 }
17272 /*
17273 * If the map hasn't changed, proceed with the next entry
17274 */
17275 entry = entry->vme_next;
17276 }
17277
17278 vm_map_unlock_read(map);
17279 return KERN_SUCCESS;
17280 }
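/*
 * Hedged usage sketch (not part of the original source): VM_BEHAVIOR_ZERO
 * is dispatched through the behavior-set path above, so a caller could
 * request it through the Mach interface (the exact user-facing entry
 * point may differ or be restricted):
 *
 *    #include <mach/mach.h>
 *    #include <mach/mach_vm.h>
 *
 *    kern_return_t kr = mach_vm_behavior_set(mach_task_self(),
 *        (mach_vm_address_t)addr, size, VM_BEHAVIOR_ZERO);
 *    // KERN_PROTECTION_FAILURE / KERN_NO_ACCESS mean the range failed
 *    // vm_map_zero_entry_preflight(); the caller must zero explicitly.
 */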
17281
17282
17283 /*
17284 * Routine: vm_map_entry_insert
17285 *
17286 * Description: This routine inserts a new vm_entry in a locked map.
17287 */
17288 static vm_map_entry_t
17289 vm_map_entry_insert(
17290 vm_map_t map,
17291 vm_map_entry_t insp_entry,
17292 vm_map_offset_t start,
17293 vm_map_offset_t end,
17294 vm_object_t object,
17295 vm_object_offset_t offset,
17296 vm_map_kernel_flags_t vmk_flags,
17297 boolean_t needs_copy,
17298 vm_prot_t cur_protection,
17299 vm_prot_t max_protection,
17300 vm_inherit_t inheritance,
17301 boolean_t clear_map_aligned)
17302 {
17303 vm_map_entry_t new_entry;
17304 boolean_t map_aligned = FALSE;
17305
17306 assert(insp_entry != (vm_map_entry_t)0);
17307 vm_map_lock_assert_exclusive(map);
17308
17309 __assert_only vm_object_offset_t end_offset = 0;
17310 assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17311
17312 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17313 map_aligned = TRUE;
17314 }
17315 if (clear_map_aligned &&
17316 (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17317 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17318 map_aligned = FALSE;
17319 }
17320 if (map_aligned) {
17321 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17322 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17323 } else {
17324 assert(page_aligned(start));
17325 assert(page_aligned(end));
17326 }
17327 assert(start < end);
17328
17329 new_entry = vm_map_entry_create(map);
17330
17331 new_entry->vme_start = start;
17332 new_entry->vme_end = end;
17333
17334 if (vmk_flags.vmkf_submap) {
17335 new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17336 VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17337 } else {
17338 VME_OBJECT_SET(new_entry, object, false, 0);
17339 }
17340 VME_OFFSET_SET(new_entry, offset);
17341 VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17342
17343 new_entry->map_aligned = map_aligned;
17344 new_entry->needs_copy = needs_copy;
17345 new_entry->inheritance = inheritance;
17346 new_entry->protection = cur_protection;
17347 new_entry->max_protection = max_protection;
17348 /*
17349 * submap: "use_pmap" means "nested".
17350 * default: false.
17351 *
17352 * object: "use_pmap" means "use pmap accounting" for footprint.
17353 * default: true.
17354 */
17355 new_entry->use_pmap = !vmk_flags.vmkf_submap;
17356 new_entry->no_cache = vmk_flags.vmf_no_cache;
17357 new_entry->vme_permanent = vmk_flags.vmf_permanent;
17358 new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17359 new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17360 new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17361
17362 if (vmk_flags.vmkf_map_jit) {
17363 if (!(map->jit_entry_exists) ||
17364 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17365 new_entry->used_for_jit = TRUE;
17366 map->jit_entry_exists = TRUE;
17367 }
17368 }
17369
17370 /*
17371 * Insert the new entry into the list.
17372 */
17373
17374 vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17375 map->size += end - start;
17376
17377 /*
17378 * Update the free space hint and the lookup hint.
17379 */
17380
17381 SAVE_HINT_MAP_WRITE(map, new_entry);
17382 return new_entry;
17383 }
17384
17385 /*
17386 * Routine: vm_map_remap_extract
17387 *
17388 * Description: This routine returns a vm_entry list from a map.
17389 */
17390 static kern_return_t
17391 vm_map_remap_extract(
17392 vm_map_t map,
17393 vm_map_offset_t addr,
17394 vm_map_size_t size,
17395 boolean_t copy,
17396 vm_map_copy_t map_copy,
17397 vm_prot_t *cur_protection, /* IN/OUT */
17398 vm_prot_t *max_protection, /* IN/OUT */
17399 /* What, no behavior? */
17400 vm_inherit_t inheritance,
17401 vm_map_kernel_flags_t vmk_flags)
17402 {
17403 struct vm_map_header *map_header = &map_copy->cpy_hdr;
17404 kern_return_t result;
17405 vm_map_size_t mapped_size;
17406 vm_map_size_t tmp_size;
17407 vm_map_entry_t src_entry; /* result of last map lookup */
17408 vm_map_entry_t new_entry;
17409 vm_object_offset_t offset;
17410 vm_map_offset_t map_address;
17411 vm_map_offset_t src_start; /* start of entry to map */
17412 vm_map_offset_t src_end; /* end of region to be mapped */
17413 vm_object_t object;
17414 vm_map_version_t version;
17415 boolean_t src_needs_copy;
17416 boolean_t new_entry_needs_copy;
17417 vm_map_entry_t saved_src_entry;
17418 boolean_t src_entry_was_wired;
17419 vm_prot_t max_prot_for_prot_copy;
17420 vm_map_offset_t effective_page_mask;
17421 bool pageable, same_map;
17422 boolean_t vm_remap_legacy;
17423 vm_prot_t required_cur_prot, required_max_prot;
17424 vm_object_t new_copy_object; /* vm_object_copy_* result */
17425 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
17426
17427 pageable = vmk_flags.vmkf_copy_pageable;
17428 same_map = vmk_flags.vmkf_copy_same_map;
17429
17430 effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17431
17432 assert(map != VM_MAP_NULL);
17433 assert(size != 0);
17434 assert(size == vm_map_round_page(size, effective_page_mask));
17435 assert(inheritance == VM_INHERIT_NONE ||
17436 inheritance == VM_INHERIT_COPY ||
17437 inheritance == VM_INHERIT_SHARE);
17438 assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17439 assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17440 assert((*cur_protection & *max_protection) == *cur_protection);
17441
17442 /*
17443 * Compute start and end of region.
17444 */
17445 src_start = vm_map_trunc_page(addr, effective_page_mask);
17446 src_end = vm_map_round_page(src_start + size, effective_page_mask);
17447
17448 /*
17449 * Initialize map_header.
17450 */
17451 map_header->nentries = 0;
17452 map_header->entries_pageable = pageable;
17453 // map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17454 map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17455 map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17456 vm_map_store_init(map_header);
17457
17458 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17459 /*
17460 * Special case for vm_map_protect(VM_PROT_COPY):
17461 * we want to set the new mappings' max protection to the
17462 * specified *max_protection...
17463 */
17464 max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17465 /* ... but we want to use the vm_remap() legacy mode */
17466 vmk_flags.vmkf_remap_legacy_mode = true;
17467 *max_protection = VM_PROT_NONE;
17468 *cur_protection = VM_PROT_NONE;
17469 } else {
17470 max_prot_for_prot_copy = VM_PROT_NONE;
17471 }
17472
17473 if (vmk_flags.vmkf_remap_legacy_mode) {
17474 /*
17475 * vm_remap() legacy mode:
17476 * Extract all memory regions in the specified range and
17477 * collect the strictest set of protections allowed on the
17478 * entire range, so the caller knows what they can do with
17479 * the remapped range.
17480 * We start with VM_PROT_ALL and we'll remove the protections
17481 * missing from each memory region.
17482 */
17483 vm_remap_legacy = TRUE;
17484 *cur_protection = VM_PROT_ALL;
17485 *max_protection = VM_PROT_ALL;
17486 required_cur_prot = VM_PROT_NONE;
17487 required_max_prot = VM_PROT_NONE;
17488 } else {
17489 /*
17490 * vm_remap_new() mode:
17491 * Extract all memory regions in the specified range and
17492 * ensure that they have at least the protections specified
17493 * by the caller via *cur_protection and *max_protection.
17494 * The resulting mapping should have these protections.
17495 */
17496 vm_remap_legacy = FALSE;
17497 if (copy) {
17498 required_cur_prot = VM_PROT_NONE;
17499 required_max_prot = VM_PROT_READ;
17500 } else {
17501 required_cur_prot = *cur_protection;
17502 required_max_prot = *max_protection;
17503 }
17504 }
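/*
 * Worked example (illustrative only): in legacy mode, the protections
 * reported back are the intersection over the whole range. If the range
 * covers one entry with cur/max = RW-/RWX and another with
 * cur/max = R--/R-X, the caller gets back
 *    *cur_protection = VM_PROT_READ
 *    *max_protection = VM_PROT_READ | VM_PROT_EXECUTE
 * In vm_remap_new() mode the caller states its required protections up
 * front and the extraction fails with KERN_PROTECTION_FAILURE if any
 * entry in the range falls short.
 */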
17505
17506 map_address = 0;
17507 mapped_size = 0;
17508 result = KERN_SUCCESS;
17509
17510 /*
17511 * The specified source virtual space might correspond to
17512 * multiple map entries, need to loop on them.
17513 */
17514 vm_map_lock(map);
17515
17516 if (map->pmap == kernel_pmap) {
17517 map_copy->is_kernel_range = true;
17518 map_copy->orig_range = kmem_addr_get_range(addr, size);
17519 #if CONFIG_MAP_RANGES
17520 } else if (map->uses_user_ranges) {
17521 map_copy->is_user_range = true;
17522 map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17523 #endif /* CONFIG_MAP_RANGES */
17524 }
17525
17526 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17527 /*
17528 * This address space uses sub-pages so the range might
17529 * not be re-mappable in an address space with larger
17530 * pages. Re-assemble any broken-up VM map entries to
17531 * improve our chances of making it work.
17532 */
17533 vm_map_simplify_range(map, src_start, src_end);
17534 }
17535 while (mapped_size != size) {
17536 vm_map_size_t entry_size;
17537
17538 /*
17539 * Find the beginning of the region.
17540 */
17541 if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17542 result = KERN_INVALID_ADDRESS;
17543 break;
17544 }
17545
17546 if (src_start < src_entry->vme_start ||
17547 (mapped_size && src_start != src_entry->vme_start)) {
17548 result = KERN_INVALID_ADDRESS;
17549 break;
17550 }
17551
17552 tmp_size = size - mapped_size;
17553 if (src_end > src_entry->vme_end) {
17554 tmp_size -= (src_end - src_entry->vme_end);
17555 }
17556
17557 entry_size = (vm_map_size_t)(src_entry->vme_end -
17558 src_entry->vme_start);
17559
17560 if (src_entry->is_sub_map &&
17561 vmk_flags.vmkf_copy_single_object) {
17562 vm_map_t submap;
17563 vm_map_offset_t submap_start;
17564 vm_map_size_t submap_size;
17565 boolean_t submap_needs_copy;
17566
17567 /*
17568 * No check for "required protection" on "src_entry"
17569 * because the protections that matter are the ones
17570 * on the submap's VM map entry, which will be checked
17571 * during the call to vm_map_remap_extract() below.
17572 */
17573 object = VM_OBJECT_NULL;
17574
17575 submap_size = src_entry->vme_end - src_start;
17576 if (submap_size > size) {
17577 submap_size = size;
17578 }
17579 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17580 submap = VME_SUBMAP(src_entry);
17581 if (copy) {
17582 /*
17583 * The caller wants a copy-on-write re-mapping,
17584 * so let's extract from the submap accordingly.
17585 */
17586 submap_needs_copy = TRUE;
17587 } else if (src_entry->needs_copy) {
17588 /*
17589 * The caller wants a shared re-mapping but the
17590 * submap is mapped with "needs_copy", so its
17591 * contents can't be shared as is. Extract the
17592 * contents of the submap as "copy-on-write".
17593 * The re-mapping won't be shared with the
17594 * original mapping but this is equivalent to
17595 * what happened with the original "remap from
17596 * submap" code.
17597 * The shared region is mapped "needs_copy", for
17598 * example.
17599 */
17600 submap_needs_copy = TRUE;
17601 } else {
17602 /*
17603 * The caller wants a shared re-mapping and
17604 * this mapping can be shared (no "needs_copy"),
17605 * so let's extract from the submap accordingly.
17606 * Kernel submaps are mapped without
17607 * "needs_copy", for example.
17608 */
17609 submap_needs_copy = FALSE;
17610 }
17611 vm_map_reference(submap);
17612 vm_map_unlock(map);
17613 src_entry = NULL;
17614 if (vm_remap_legacy) {
17615 *cur_protection = VM_PROT_NONE;
17616 *max_protection = VM_PROT_NONE;
17617 }
17618
17619 DTRACE_VM7(remap_submap_recurse,
17620 vm_map_t, map,
17621 vm_map_offset_t, addr,
17622 vm_map_size_t, size,
17623 boolean_t, copy,
17624 vm_map_offset_t, submap_start,
17625 vm_map_size_t, submap_size,
17626 boolean_t, submap_needs_copy);
17627
17628 result = vm_map_remap_extract(submap,
17629 submap_start,
17630 submap_size,
17631 submap_needs_copy,
17632 map_copy,
17633 cur_protection,
17634 max_protection,
17635 inheritance,
17636 vmk_flags);
17637 vm_map_deallocate(submap);
17638
17639 if (result == KERN_SUCCESS &&
17640 submap_needs_copy &&
17641 !copy) {
17642 /*
17643 * We were asked for a "shared"
17644 * re-mapping but had to ask for a
17645 * "copy-on-write" remapping of the
17646 * submap's mapping to honor the
17647 * submap's "needs_copy".
17648 * We now need to resolve that
17649 * pending "copy-on-write" to
17650 * get something we can share.
17651 */
17652 vm_map_entry_t copy_entry;
17653 vm_object_offset_t copy_offset;
17654 vm_map_size_t copy_size;
17655 vm_object_t copy_object;
17656 copy_entry = vm_map_copy_first_entry(map_copy);
17657 copy_size = copy_entry->vme_end - copy_entry->vme_start;
17658 copy_object = VME_OBJECT(copy_entry);
17659 copy_offset = VME_OFFSET(copy_entry);
17660 if (copy_object == VM_OBJECT_NULL) {
17661 assert(copy_offset == 0);
17662 assert(!copy_entry->needs_copy);
17663 if (copy_entry->max_protection == VM_PROT_NONE) {
17664 assert(copy_entry->protection == VM_PROT_NONE);
17665 /* nothing to share */
17666 } else {
17667 assert(copy_offset == 0);
17668 copy_object = vm_object_allocate(copy_size);
17669 VME_OFFSET_SET(copy_entry, 0);
17670 VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17671 assert(copy_entry->use_pmap);
17672 }
17673 } else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17674 /* already shareable */
17675 assert(!copy_entry->needs_copy);
17676 } else if (copy_entry->needs_copy ||
17677 copy_object->shadowed ||
17678 (copy_object->internal &&
17679 !copy_object->true_share &&
17680 !copy_entry->is_shared &&
17681 copy_object->vo_size > copy_size)) {
17682 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17683 assert(copy_entry->use_pmap);
17684 if (copy_entry->needs_copy) {
17685 /* already write-protected */
17686 } else {
17687 vm_prot_t prot;
17688 prot = copy_entry->protection & ~VM_PROT_WRITE;
17689 vm_object_pmap_protect(copy_object,
17690 copy_offset,
17691 copy_size,
17692 PMAP_NULL,
17693 PAGE_SIZE,
17694 0,
17695 prot);
17696 }
17697 copy_entry->needs_copy = FALSE;
17698 }
17699 copy_object = VME_OBJECT(copy_entry);
17700 copy_offset = VME_OFFSET(copy_entry);
17701 if (copy_object &&
17702 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17703 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17704 copy_object->true_share = TRUE;
17705 }
17706 }
17707
17708 return result;
17709 }
17710
17711 if (src_entry->is_sub_map) {
17712 /* protections for submap mapping are irrelevant here */
17713 } else if (((src_entry->protection & required_cur_prot) !=
17714 required_cur_prot) ||
17715 ((src_entry->max_protection & required_max_prot) !=
17716 required_max_prot)) {
17717 if (vmk_flags.vmkf_copy_single_object &&
17718 mapped_size != 0) {
17719 /*
17720 * Single object extraction.
17721 * We can't extract more with the required
17722 * protection but we've extracted some, so
17723 * stop there and declare success.
17724 * The caller should check the size of
17725 * the copy entry we've extracted.
17726 */
17727 result = KERN_SUCCESS;
17728 } else {
17729 /*
17730 * VM range extraction.
17731 * Required protection is not available
17732 * for this part of the range: fail.
17733 */
17734 result = KERN_PROTECTION_FAILURE;
17735 }
17736 break;
17737 }
17738
17739 if (src_entry->is_sub_map) {
17740 vm_map_t submap;
17741 vm_map_offset_t submap_start;
17742 vm_map_size_t submap_size;
17743 vm_map_copy_t submap_copy;
17744 vm_prot_t submap_curprot, submap_maxprot;
17745 boolean_t submap_needs_copy;
17746
17747 /*
17748 * No check for "required protection" on "src_entry"
17749 * because the protections that matter are the ones
17750 * on the submap's VM map entry, which will be checked
17751 * during the call to vm_map_copy_extract() below.
17752 */
17753 object = VM_OBJECT_NULL;
17754 submap_copy = VM_MAP_COPY_NULL;
17755
17756 /* find equivalent range in the submap */
17757 submap = VME_SUBMAP(src_entry);
17758 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17759 submap_size = tmp_size;
17760 if (copy) {
17761 /*
17762 * The caller wants a copy-on-write re-mapping,
17763 * so let's extract from the submap accordingly.
17764 */
17765 submap_needs_copy = TRUE;
17766 } else if (src_entry->needs_copy) {
17767 /*
17768 * The caller wants a shared re-mapping but the
17769 * submap is mapped with "needs_copy", so its
17770 * contents can't be shared as is. Extract the
17771 * contents of the submap as "copy-on-write".
17772 * The re-mapping won't be shared with the
17773 * original mapping but this is equivalent to
17774 * what happened with the original "remap from
17775 * submap" code.
17776 * The shared region is mapped "needs_copy", for
17777 * example.
17778 */
17779 submap_needs_copy = TRUE;
17780 } else {
17781 /*
17782 * The caller wants a shared re-mapping and
17783 * this mapping can be shared (no "needs_copy"),
17784 * so let's extract from the submap accordingly.
17785 * Kernel submaps are mapped without
17786 * "needs_copy", for example.
17787 */
17788 submap_needs_copy = FALSE;
17789 }
17790 /* extra ref to keep submap alive */
17791 vm_map_reference(submap);
17792
17793 DTRACE_VM7(remap_submap_recurse,
17794 vm_map_t, map,
17795 vm_map_offset_t, addr,
17796 vm_map_size_t, size,
17797 boolean_t, copy,
17798 vm_map_offset_t, submap_start,
17799 vm_map_size_t, submap_size,
17800 boolean_t, submap_needs_copy);
17801
17802 /*
17803 * The map can be safely unlocked since we
17804 * already hold a reference on the submap.
17805 *
17806 * No timestamp since we don't care if the map
17807 * gets modified while we're down in the submap.
17808 * We'll resume the extraction at src_start + tmp_size
17809 * anyway.
17810 */
17811 vm_map_unlock(map);
17812 src_entry = NULL; /* not valid once map is unlocked */
17813
17814 if (vm_remap_legacy) {
17815 submap_curprot = VM_PROT_NONE;
17816 submap_maxprot = VM_PROT_NONE;
17817 if (max_prot_for_prot_copy) {
17818 submap_maxprot = max_prot_for_prot_copy;
17819 }
17820 } else {
17821 assert(!max_prot_for_prot_copy);
17822 submap_curprot = *cur_protection;
17823 submap_maxprot = *max_protection;
17824 }
17825 result = vm_map_copy_extract(submap,
17826 submap_start,
17827 submap_size,
17828 submap_needs_copy,
17829 &submap_copy,
17830 &submap_curprot,
17831 &submap_maxprot,
17832 inheritance,
17833 vmk_flags);
17834
17835 /* release extra ref on submap */
17836 vm_map_deallocate(submap);
17837 submap = VM_MAP_NULL;
17838
17839 if (result != KERN_SUCCESS) {
17840 vm_map_lock(map);
17841 break;
17842 }
17843
17844 /* transfer submap_copy entries to map_header */
17845 while (vm_map_copy_first_entry(submap_copy) !=
17846 vm_map_copy_to_entry(submap_copy)) {
17847 vm_map_entry_t copy_entry;
17848 vm_map_size_t copy_entry_size;
17849
17850 copy_entry = vm_map_copy_first_entry(submap_copy);
17851
17852 /*
17853 * Prevent kernel_object from being exposed to
17854 * user space.
17855 */
17856 if (__improbable(copy_entry->vme_kernel_object)) {
17857 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17858 proc_selfpid(),
17859 (get_bsdtask_info(current_task())
17860 ? proc_name_address(get_bsdtask_info(current_task()))
17861 : "?"));
17862 DTRACE_VM(extract_kernel_only);
17863 result = KERN_INVALID_RIGHT;
17864 vm_map_copy_discard(submap_copy);
17865 submap_copy = VM_MAP_COPY_NULL;
17866 vm_map_lock(map);
17867 break;
17868 }
17869
17870 vm_map_copy_entry_unlink(submap_copy, copy_entry);
17871 copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17872 copy_entry->vme_start = map_address;
17873 copy_entry->vme_end = map_address + copy_entry_size;
17874 map_address += copy_entry_size;
17875 mapped_size += copy_entry_size;
17876 src_start += copy_entry_size;
17877 assert(src_start <= src_end);
17878 _vm_map_store_entry_link(map_header,
17879 map_header->links.prev,
17880 copy_entry);
17881 }
17882 /* done with submap_copy */
17883 vm_map_copy_discard(submap_copy);
17884
17885 if (vm_remap_legacy) {
17886 *cur_protection &= submap_curprot;
17887 *max_protection &= submap_maxprot;
17888 }
17889
17890 /* re-acquire the map lock and continue to next entry */
17891 vm_map_lock(map);
17892 continue;
17893 } else {
17894 object = VME_OBJECT(src_entry);
17895
17896 /*
17897 * Prevent kernel_object from being exposed to
17898 * user space.
17899 */
17900 if (__improbable(is_kernel_object(object))) {
17901 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17902 proc_selfpid(),
17903 (get_bsdtask_info(current_task())
17904 ? proc_name_address(get_bsdtask_info(current_task()))
17905 : "?"));
17906 DTRACE_VM(extract_kernel_only);
17907 result = KERN_INVALID_RIGHT;
17908 break;
17909 }
17910
17911 if (src_entry->iokit_acct) {
17912 /*
17913 * This entry uses "IOKit accounting".
17914 */
17915 } else if (object != VM_OBJECT_NULL &&
17916 object->internal &&
17917 (object->purgable != VM_PURGABLE_DENY ||
17918 object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17919 /*
17920 * Purgeable objects have their own accounting:
17921 * no pmap accounting for them.
17922 */
17923 assertf(!src_entry->use_pmap,
17924 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17925 map,
17926 src_entry,
17927 (uint64_t)src_entry->vme_start,
17928 (uint64_t)src_entry->vme_end,
17929 src_entry->protection,
17930 src_entry->max_protection,
17931 VME_ALIAS(src_entry));
17932 } else {
17933 /*
17934 * Not IOKit or purgeable:
17935 * must be accounted by pmap stats.
17936 */
17937 assertf(src_entry->use_pmap,
17938 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17939 map,
17940 src_entry,
17941 (uint64_t)src_entry->vme_start,
17942 (uint64_t)src_entry->vme_end,
17943 src_entry->protection,
17944 src_entry->max_protection,
17945 VME_ALIAS(src_entry));
17946 }
17947
17948 if (object == VM_OBJECT_NULL) {
17949 assert(!src_entry->needs_copy);
17950 if (src_entry->max_protection == VM_PROT_NONE) {
17951 assert(src_entry->protection == VM_PROT_NONE);
17952 /*
17953 * No VM object and no permissions:
17954 * this must be a reserved range with
17955 * nothing to share or copy.
17956 * There could also be all sorts of
17957 * pmap shenanigans within that reserved
17958 * range, so let's just copy the map
17959 * entry as is to remap a similar
17960 * reserved range.
17961 */
17962 offset = 0; /* no object => no offset */
17963 goto copy_src_entry;
17964 }
17965 object = vm_object_allocate(entry_size);
17966 VME_OFFSET_SET(src_entry, 0);
17967 VME_OBJECT_SET(src_entry, object, false, 0);
17968 assert(src_entry->use_pmap);
17969 assert(!map->mapped_in_other_pmaps);
17970 } else if (src_entry->wired_count ||
17971 object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17972 /*
17973 * A wired memory region should not have
17974 * any pending copy-on-write and needs to
17975 * keep pointing at the VM object that
17976 * contains the wired pages.
17977 * If we're sharing this memory (copy=false),
17978 * we'll share this VM object.
17979 * If we're copying this memory (copy=true),
17980 * we'll call vm_object_copy_slowly() below
17981 * and use the new VM object for the remapping.
17982 *
17983 * Or, we are already using an asymmetric
17984 * copy, and therefore we already have
17985 * the right object.
17986 */
17987 assert(!src_entry->needs_copy);
17988 } else if (src_entry->needs_copy || object->shadowed ||
17989 (object->internal && !object->true_share &&
17990 !src_entry->is_shared &&
17991 object->vo_size > entry_size)) {
17992 bool is_writable;
17993
17994 VME_OBJECT_SHADOW(src_entry, entry_size,
17995 vm_map_always_shadow(map));
17996 assert(src_entry->use_pmap);
17997
17998 is_writable = false;
17999 if (src_entry->protection & VM_PROT_WRITE) {
18000 is_writable = true;
18001 #if __arm64e__
18002 } else if (src_entry->used_for_tpro) {
18003 is_writable = true;
18004 #endif /* __arm64e__ */
18005 }
18006 if (!src_entry->needs_copy && is_writable) {
18007 vm_prot_t prot;
18008
18009 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18010 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18011 __FUNCTION__,
18012 map, map->pmap,
18013 src_entry,
18014 (uint64_t)src_entry->vme_start,
18015 (uint64_t)src_entry->vme_end,
18016 src_entry->protection);
18017 }
18018
18019 prot = src_entry->protection & ~VM_PROT_WRITE;
18020
18021 if (override_nx(map,
18022 VME_ALIAS(src_entry))
18023 && prot) {
18024 prot |= VM_PROT_EXECUTE;
18025 }
18026
18027 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18028 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18029 __FUNCTION__,
18030 map, map->pmap,
18031 src_entry,
18032 (uint64_t)src_entry->vme_start,
18033 (uint64_t)src_entry->vme_end,
18034 prot);
18035 }
18036
18037 if (map->mapped_in_other_pmaps) {
18038 vm_object_pmap_protect(
18039 VME_OBJECT(src_entry),
18040 VME_OFFSET(src_entry),
18041 entry_size,
18042 PMAP_NULL,
18043 PAGE_SIZE,
18044 src_entry->vme_start,
18045 prot);
18046 #if MACH_ASSERT
18047 } else if (__improbable(map->pmap == PMAP_NULL)) {
18048 /*
18049 * Some VM tests (in vm_tests.c)
18050 * sometimes want to use a VM
18051 * map without a pmap.
18052 * Otherwise, this should never
18053 * happen.
18054 */
18055 if (!thread_get_test_option(test_option_vm_map_allow_null_pmap)) {
18056 panic("null pmap");
18057 }
18058 #endif /* MACH_ASSERT */
18059 } else {
18060 pmap_protect(vm_map_pmap(map),
18061 src_entry->vme_start,
18062 src_entry->vme_end,
18063 prot);
18064 }
18065 }
18066
18067 object = VME_OBJECT(src_entry);
18068 src_entry->needs_copy = FALSE;
18069 }
18070
18071
18072 vm_object_lock(object);
18073 vm_object_reference_locked(object); /* object ref. for new entry */
18074 assert(!src_entry->needs_copy);
18075 if (object->copy_strategy ==
18076 MEMORY_OBJECT_COPY_SYMMETRIC) {
18077 /*
18078 * If we want to share this object (copy==0),
18079 * it needs to be COPY_DELAY.
18080 * If we want to copy this object (copy==1),
18081 * we can't just set "needs_copy" on our side
18082 * and expect the other side to do the same
18083 * (symmetrically), so we can't let the object
18084 * stay COPY_SYMMETRIC.
18085 * So we always switch from COPY_SYMMETRIC to
18086 * COPY_DELAY.
18087 */
18088 object->copy_strategy =
18089 MEMORY_OBJECT_COPY_DELAY;
18090 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18091 }
18092 vm_object_unlock(object);
18093 }
18094
18095 offset = (VME_OFFSET(src_entry) +
18096 (src_start - src_entry->vme_start));
18097
18098 copy_src_entry:
18099 new_entry = _vm_map_entry_create(map_header);
18100 vm_map_entry_copy(map, new_entry, src_entry);
18101 if (new_entry->is_sub_map) {
18102 /* clr address space specifics */
18103 new_entry->use_pmap = FALSE;
18104 } else if (copy) {
18105 /*
18106 * We're dealing with a copy-on-write operation,
18107 * so the resulting mapping should not inherit the
18108 * original mapping's accounting settings.
18109 * "use_pmap" should be reset to its default (TRUE)
18110 * so that the new mapping gets accounted for in
18111 * the task's memory footprint.
18112 */
18113 new_entry->use_pmap = TRUE;
18114 }
18115 /* "iokit_acct" was cleared in vm_map_entry_copy() */
18116 assert(!new_entry->iokit_acct);
18117
18118 new_entry->map_aligned = FALSE;
18119
18120 new_entry->vme_start = map_address;
18121 new_entry->vme_end = map_address + tmp_size;
18122 assert(new_entry->vme_start < new_entry->vme_end);
18123 if (copy && vmk_flags.vmkf_remap_prot_copy) {
18124 /* security: keep "permanent" and "csm_associated" */
18125 new_entry->vme_permanent = src_entry->vme_permanent;
18126 new_entry->csm_associated = src_entry->csm_associated;
18127 /*
18128 * Remapping for vm_map_protect(VM_PROT_COPY)
18129 * to convert a read-only mapping into a
18130 * copy-on-write version of itself but
18131 * with write access:
18132 * keep the original inheritance but let's not
18133 * add VM_PROT_WRITE to the max protection yet
18134 * since we want to do more security checks against
18135 * the target map.
18136 */
18137 new_entry->inheritance = src_entry->inheritance;
18138 new_entry->protection &= max_prot_for_prot_copy;
18139
18140 #ifdef __arm64e__
18141 /*
18142 * Remapping for vm_map_protect(VM_PROT_COPY) to remap a TPRO
18143 * region to be explicitly writable without TPRO is only permitted
18144 * if TPRO enforcement has been overridden.
18145 *
18146 * In this case we ensure any entries reset the TPRO state
18147 * and we permit the region to be downgraded from permanent.
18148 */
18149 if (new_entry->used_for_tpro) {
18150 if (vmk_flags.vmkf_tpro_enforcement_override) {
18151 new_entry->used_for_tpro = FALSE;
18152 new_entry->vme_permanent = FALSE;
18153 } else {
18154 result = KERN_PROTECTION_FAILURE;
18155 vm_object_deallocate(object);
18156 vm_map_entry_dispose(new_entry);
18157 new_entry = VM_MAP_ENTRY_NULL;
18158 break;
18159 }
18160 }
18161 #endif
18162 } else {
18163 new_entry->inheritance = inheritance;
18164 if (!vm_remap_legacy) {
18165 new_entry->protection = *cur_protection;
18166 new_entry->max_protection = *max_protection;
18167 }
18168 }
18169
18170 VME_OFFSET_SET(new_entry, offset);
18171
18172 /*
18173 * The new region has to be copied now if required.
18174 */
18175 RestartCopy:
18176 if (!copy) {
18177 if (src_entry->used_for_jit == TRUE) {
18178 if (same_map) {
18179 } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18180 /*
18181 * Cannot allow an entry describing a JIT
18182 * region to be shared across address spaces.
18183 */
18184 result = KERN_INVALID_ARGUMENT;
18185 vm_object_deallocate(object);
18186 vm_map_entry_dispose(new_entry);
18187 new_entry = VM_MAP_ENTRY_NULL;
18188 break;
18189 }
18190 }
18191
18192 if (!src_entry->is_sub_map &&
18193 VME_OBJECT(src_entry) == VM_OBJECT_NULL) {
18194 /* no accessible memory; nothing to share */
18195 assert(src_entry->protection == VM_PROT_NONE);
18196 assert(src_entry->max_protection == VM_PROT_NONE);
18197 src_entry->is_shared = FALSE;
18198 } else {
18199 src_entry->is_shared = TRUE;
18200 }
18201 if (!new_entry->is_sub_map &&
18202 VME_OBJECT(new_entry) == VM_OBJECT_NULL) {
18203 /* no accessible memory; nothing to share */
18204 assert(new_entry->protection == VM_PROT_NONE);
18205 assert(new_entry->max_protection == VM_PROT_NONE);
18206 new_entry->is_shared = FALSE;
18207 } else {
18208 new_entry->is_shared = TRUE;
18209 }
18210 if (!(new_entry->is_sub_map)) {
18211 new_entry->needs_copy = FALSE;
18212 }
18213 } else if (src_entry->is_sub_map) {
18214 /* make this a COW sub_map if not already */
18215 assert(new_entry->wired_count == 0);
18216 new_entry->needs_copy = TRUE;
18217 object = VM_OBJECT_NULL;
18218 } else if (src_entry->wired_count == 0 &&
18219 !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18220 vm_object_copy_quickly(VME_OBJECT(new_entry),
18221 VME_OFFSET(new_entry),
18222 (new_entry->vme_end -
18223 new_entry->vme_start),
18224 &src_needs_copy,
18225 &new_entry_needs_copy)) {
18226 new_entry->needs_copy = new_entry_needs_copy;
18227 new_entry->is_shared = FALSE;
18228 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18229
18230 /*
18231 * Handle copy_on_write semantics.
18232 */
18233 if (src_needs_copy && !src_entry->needs_copy) {
18234 vm_prot_t prot;
18235
18236 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18237 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18238 __FUNCTION__,
18239 map, map->pmap, src_entry,
18240 (uint64_t)src_entry->vme_start,
18241 (uint64_t)src_entry->vme_end,
18242 src_entry->protection);
18243 }
18244
18245 prot = src_entry->protection & ~VM_PROT_WRITE;
18246
18247 if (override_nx(map,
18248 VME_ALIAS(src_entry))
18249 && prot) {
18250 prot |= VM_PROT_EXECUTE;
18251 }
18252
18253 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18254 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18255 __FUNCTION__,
18256 map, map->pmap, src_entry,
18257 (uint64_t)src_entry->vme_start,
18258 (uint64_t)src_entry->vme_end,
18259 prot);
18260 }
18261
18262 vm_object_pmap_protect(object,
18263 offset,
18264 entry_size,
18265 ((src_entry->is_shared
18266 || map->mapped_in_other_pmaps) ?
18267 PMAP_NULL : map->pmap),
18268 VM_MAP_PAGE_SIZE(map),
18269 src_entry->vme_start,
18270 prot);
18271
18272 assert(src_entry->wired_count == 0);
18273 src_entry->needs_copy = TRUE;
18274 }
18275 /*
18276 * Throw away the old object reference of the new entry.
18277 */
18278 vm_object_deallocate(object);
18279 } else {
18280 new_entry->is_shared = FALSE;
18281 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18282
18283 src_entry_was_wired = (src_entry->wired_count > 0);
18284 saved_src_entry = src_entry;
18285 src_entry = VM_MAP_ENTRY_NULL;
18286
18287 /*
18288 * The map can be safely unlocked since we
18289 * already hold a reference on the object.
18290 *
18291 * Record the timestamp of the map for later
18292 * verification, and unlock the map.
18293 */
18294 version.main_timestamp = map->timestamp;
18295 vm_map_unlock(map); /* Increments timestamp once! */
18296
18297 /*
18298 * Perform the copy.
18299 */
18300 if (src_entry_was_wired > 0 ||
18301 (debug4k_no_cow_copyin &&
18302 VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18303 vm_object_lock(object);
18304 result = vm_object_copy_slowly(
18305 object,
18306 offset,
18307 (new_entry->vme_end -
18308 new_entry->vme_start),
18309 THREAD_UNINT,
18310 &new_copy_object);
18311 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18312 saved_used_for_jit = new_entry->used_for_jit;
18313 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18314 new_entry->used_for_jit = saved_used_for_jit;
18315 VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18316 new_entry->needs_copy = FALSE;
18317 } else {
18318 vm_object_offset_t new_offset;
18319
18320 new_offset = VME_OFFSET(new_entry);
18321 result = vm_object_copy_strategically(
18322 object,
18323 offset,
18324 (new_entry->vme_end -
18325 new_entry->vme_start),
18326 false, /* forking */
18327 &new_copy_object,
18328 &new_offset,
18329 &new_entry_needs_copy);
18330 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18331 saved_used_for_jit = new_entry->used_for_jit;
18332 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18333 new_entry->used_for_jit = saved_used_for_jit;
18334 if (new_offset != VME_OFFSET(new_entry)) {
18335 VME_OFFSET_SET(new_entry, new_offset);
18336 }
18337
18338 new_entry->needs_copy = new_entry_needs_copy;
18339 }
18340
18341 /*
18342 * Throw away the old object reference of the new entry.
18343 */
18344 vm_object_deallocate(object);
18345
18346 if (result != KERN_SUCCESS &&
18347 result != KERN_MEMORY_RESTART_COPY) {
18348 vm_map_entry_dispose(new_entry);
18349 vm_map_lock(map);
18350 break;
18351 }
18352
18353 /*
18354 * Verify that the map has not substantially
18355 * changed while the copy was being made.
18356 */
18357
18358 vm_map_lock(map);
18359 if (version.main_timestamp + 1 != map->timestamp) {
18360 /*
18361 * Simple version comparison failed.
18362 *
18363 * Retry the lookup and verify that the
18364 * same object/offset are still present.
18365 */
18366 saved_src_entry = VM_MAP_ENTRY_NULL;
18367 vm_object_deallocate(VME_OBJECT(new_entry));
18368 vm_map_entry_dispose(new_entry);
18369 if (result == KERN_MEMORY_RESTART_COPY) {
18370 result = KERN_SUCCESS;
18371 }
18372 continue;
18373 }
18374 /* map hasn't changed: src_entry is still valid */
18375 src_entry = saved_src_entry;
18376 saved_src_entry = VM_MAP_ENTRY_NULL;
18377
18378 if (result == KERN_MEMORY_RESTART_COPY) {
18379 vm_object_reference(object);
18380 goto RestartCopy;
18381 }
18382 }
18383
18384 _vm_map_store_entry_link(map_header,
18385 map_header->links.prev, new_entry);
18386
18387 /* protections for submap mapping are irrelevant here */
18388 if (vm_remap_legacy && !src_entry->is_sub_map) {
18389 *cur_protection &= src_entry->protection;
18390 *max_protection &= src_entry->max_protection;
18391 }
18392
18393 map_address += tmp_size;
18394 mapped_size += tmp_size;
18395 src_start += tmp_size;
18396
18397 if (vmk_flags.vmkf_copy_single_object) {
18398 if (mapped_size != size) {
18399 DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18400 map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18401 if (src_entry->vme_next != vm_map_to_entry(map) &&
18402 src_entry->vme_next->vme_object_value ==
18403 src_entry->vme_object_value) {
18404 /* XXX TODO4K */
18405 DEBUG4K_ERROR("could have extended copy to next entry...\n");
18406 }
18407 }
18408 break;
18409 }
18410 } /* end while */
18411
18412 vm_map_unlock(map);
18413 if (result != KERN_SUCCESS) {
18414 /*
18415 * Free all allocated elements.
18416 */
18417 for (src_entry = map_header->links.next;
18418 src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18419 src_entry = new_entry) {
18420 new_entry = src_entry->vme_next;
18421 _vm_map_store_entry_unlink(map_header, src_entry, false);
18422 if (src_entry->is_sub_map) {
18423 vm_map_deallocate(VME_SUBMAP(src_entry));
18424 } else {
18425 vm_object_deallocate(VME_OBJECT(src_entry));
18426 }
18427 vm_map_entry_dispose(src_entry);
18428 }
18429 }
18430 return result;
18431 }
18432
18433 bool
18434 vm_map_is_exotic(
18435 vm_map_t map)
18436 {
18437 return VM_MAP_IS_EXOTIC(map);
18438 }
18439
18440 bool
18441 vm_map_is_alien(
18442 vm_map_t map)
18443 {
18444 return VM_MAP_IS_ALIEN(map);
18445 }
18446
18447 #if XNU_TARGET_OS_OSX
18448 void
18449 vm_map_mark_alien(
18450 vm_map_t map)
18451 {
18452 vm_map_lock(map);
18453 map->is_alien = true;
18454 vm_map_unlock(map);
18455 }
18456
18457 void
18458 vm_map_single_jit(
18459 vm_map_t map)
18460 {
18461 vm_map_lock(map);
18462 map->single_jit = true;
18463 vm_map_unlock(map);
18464 }
18465 #endif /* XNU_TARGET_OS_OSX */
18466
18467
18468
18469 /*
18470 * Callers of this function must call vm_map_copy_require on
18471 * previously created vm_map_copy_t or pass a newly created
18472 * one to ensure that it hasn't been forged.
18473 */
18474 static kern_return_t
18475 vm_map_copy_to_physcopy(
18476 vm_map_copy_t copy_map,
18477 vm_map_t target_map)
18478 {
18479 vm_map_size_t size;
18480 vm_map_entry_t entry;
18481 vm_map_entry_t new_entry;
18482 vm_object_t new_object;
18483 unsigned int pmap_flags;
18484 pmap_t new_pmap;
18485 vm_map_t new_map;
18486 vm_map_address_t src_start, src_end, src_cur;
18487 vm_map_address_t dst_start, dst_end, dst_cur;
18488 kern_return_t kr;
18489 void *kbuf;
18490
18491 /*
18492 * Perform the equivalent of vm_allocate() and memcpy().
18493 * Replace the mappings in "copy_map" with the newly allocated mapping.
18494 */
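/*
 * Outline of the steps below: create a 4K pmap and a temporary
 * pageable map, map both the original "copy_map" and a freshly
 * allocated VM object into that map, copy the data page by page
 * through a kernel buffer, then replace "copy_map"'s entries with a
 * single entry covering the new object at the target page size.
 */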
18495 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18496
18497 assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18498
18499 /* create a new pmap to map "copy_map" */
18500 pmap_flags = 0;
18501 assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18502 #if PMAP_CREATE_FORCE_4K_PAGES
18503 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18504 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18505 pmap_flags |= PMAP_CREATE_64BIT;
18506 new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18507 if (new_pmap == NULL) {
18508 return KERN_RESOURCE_SHORTAGE;
18509 }
18510
18511 /* allocate new VM object */
18512 size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18513 new_object = vm_object_allocate(size);
18514 assert(new_object);
18515
18516 /* allocate new VM map entry */
18517 new_entry = vm_map_copy_entry_create(copy_map);
18518 assert(new_entry);
18519
18520 /* finish initializing new VM map entry */
18521 new_entry->protection = VM_PROT_DEFAULT;
18522 new_entry->max_protection = VM_PROT_DEFAULT;
18523 new_entry->use_pmap = TRUE;
18524
18525 /* make new VM map entry point to new VM object */
18526 new_entry->vme_start = 0;
18527 new_entry->vme_end = size;
18528 VME_OBJECT_SET(new_entry, new_object, false, 0);
18529 VME_OFFSET_SET(new_entry, 0);
18530
18531 /* create a new pageable VM map to map "copy_map" */
18532 new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18533 VM_MAP_CREATE_PAGEABLE);
18534 assert(new_map);
18535 vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18536
18537 /* map "copy_map" in the new VM map */
18538 src_start = 0;
18539 kr = vm_map_copyout_internal(
18540 new_map,
18541 &src_start,
18542 copy_map,
18543 copy_map->size,
18544 FALSE, /* consume_on_success */
18545 VM_PROT_DEFAULT,
18546 VM_PROT_DEFAULT,
18547 VM_INHERIT_DEFAULT);
18548 assert(kr == KERN_SUCCESS);
18549 src_end = src_start + copy_map->size;
18550
18551 /* map "new_object" in the new VM map */
18552 vm_object_reference(new_object);
18553 dst_start = 0;
18554 kr = vm_map_enter(new_map,
18555 &dst_start,
18556 size,
18557 0, /* mask */
18558 VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18559 new_object,
18560 0, /* offset */
18561 FALSE, /* needs copy */
18562 VM_PROT_DEFAULT,
18563 VM_PROT_DEFAULT,
18564 VM_INHERIT_DEFAULT);
18565 assert(kr == KERN_SUCCESS);
18566 dst_end = dst_start + size;
18567
18568 /* get a kernel buffer */
18569 kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18570
18571 /* physically copy "copy_map" mappings to new VM object */
18572 for (src_cur = src_start, dst_cur = dst_start;
18573 src_cur < src_end;
18574 src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18575 vm_size_t bytes;
18576
18577 bytes = PAGE_SIZE;
18578 if (src_cur + PAGE_SIZE > src_end) {
18579 /* partial copy for last page */
18580 bytes = src_end - src_cur;
18581 assert(bytes > 0 && bytes < PAGE_SIZE);
18582 /* rest of dst page should be zero-filled */
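/*
 * (the destination object was freshly allocated above and is
 * anonymous memory, whose pages are zero-filled on demand, so the
 * tail of the page needs no explicit clearing)
 */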
18583 }
18584 /* get bytes from src mapping */
18585 kr = copyinmap(new_map, src_cur, kbuf, bytes);
18586 if (kr != KERN_SUCCESS) {
18587 DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18588 }
18589 /* put bytes in dst mapping */
18590 assert(dst_cur < dst_end);
18591 assert(dst_cur + bytes <= dst_end);
18592 kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18593 if (kr != KERN_SUCCESS) {
18594 DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18595 }
18596 }
18597
18598 /* free kernel buffer */
18599 kfree_data(kbuf, PAGE_SIZE);
18600
18601 /* destroy new map */
18602 vm_map_destroy(new_map);
18603 new_map = VM_MAP_NULL;
18604
18605 /* dispose of the old map entries in "copy_map" */
18606 while (vm_map_copy_first_entry(copy_map) !=
18607 vm_map_copy_to_entry(copy_map)) {
18608 entry = vm_map_copy_first_entry(copy_map);
18609 vm_map_copy_entry_unlink(copy_map, entry);
18610 if (entry->is_sub_map) {
18611 vm_map_deallocate(VME_SUBMAP(entry));
18612 } else {
18613 vm_object_deallocate(VME_OBJECT(entry));
18614 }
18615 vm_map_copy_entry_dispose(entry);
18616 }
18617
18618 /* change "copy_map"'s page_size to match "target_map" */
18619 copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18620 copy_map->offset = 0;
18621 copy_map->size = size;
18622
18623 /* insert new map entry in "copy_map" */
18624 assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18625 vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18626
18627 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18628 return KERN_SUCCESS;
18629 }
18630
18631 void
18632 vm_map_copy_adjust_get_target_copy_map(
18633 vm_map_copy_t copy_map,
18634 vm_map_copy_t *target_copy_map_p);
18635 void
18636 vm_map_copy_adjust_get_target_copy_map(
18637 vm_map_copy_t copy_map,
18638 vm_map_copy_t *target_copy_map_p)
18639 {
18640 vm_map_copy_t target_copy_map;
18641 vm_map_entry_t entry, target_entry;
18642
18643 if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18644 /* the caller already has a "target_copy_map": use it */
18645 return;
18646 }
18647
18648 /* the caller wants us to create a new copy of "copy_map" */
18649 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18650 target_copy_map = vm_map_copy_allocate(copy_map->type);
18651 target_copy_map->offset = copy_map->offset;
18652 target_copy_map->size = copy_map->size;
18653 target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18654 for (entry = vm_map_copy_first_entry(copy_map);
18655 entry != vm_map_copy_to_entry(copy_map);
18656 entry = entry->vme_next) {
18657 target_entry = vm_map_copy_entry_create(target_copy_map);
18658 vm_map_entry_copy_full(target_entry, entry);
18659 if (target_entry->is_sub_map) {
18660 vm_map_reference(VME_SUBMAP(target_entry));
18661 } else {
18662 vm_object_reference(VME_OBJECT(target_entry));
18663 }
18664 vm_map_copy_entry_link(
18665 target_copy_map,
18666 vm_map_copy_last_entry(target_copy_map),
18667 target_entry);
18668 }
18669 entry = VM_MAP_ENTRY_NULL;
18670 *target_copy_map_p = target_copy_map;
18671 }
18672
18673 /*
18674 * Callers of this function must call vm_map_copy_require on
18675 * previously created vm_map_copy_t or pass a newly created
18676 * one to ensure that it hasn't been forged.
18677 */
18678 static void
18679 vm_map_copy_trim(
18680 vm_map_copy_t copy_map,
18681 uint16_t new_page_shift,
18682 vm_map_offset_t trim_start,
18683 vm_map_offset_t trim_end)
18684 {
18685 uint16_t copy_page_shift;
18686 vm_map_entry_t entry, next_entry;
18687
18688 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18689 assert(copy_map->cpy_hdr.nentries > 0);
18690
18691 trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18692 trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18693
18694 /* use the new page_shift to do the clipping */
18695 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18696 copy_map->cpy_hdr.page_shift = new_page_shift;
18697
18698 for (entry = vm_map_copy_first_entry(copy_map);
18699 entry != vm_map_copy_to_entry(copy_map);
18700 entry = next_entry) {
18701 next_entry = entry->vme_next;
18702 if (entry->vme_end <= trim_start) {
18703 /* entry fully before trim range: skip */
18704 continue;
18705 }
18706 if (entry->vme_start >= trim_end) {
18707 /* entry fully after trim range: done */
18708 break;
18709 }
18710 /* clip entry if needed */
18711 vm_map_copy_clip_start(copy_map, entry, trim_start);
18712 vm_map_copy_clip_end(copy_map, entry, trim_end);
18713 /* dispose of entry */
18714 copy_map->size -= entry->vme_end - entry->vme_start;
18715 vm_map_copy_entry_unlink(copy_map, entry);
18716 if (entry->is_sub_map) {
18717 vm_map_deallocate(VME_SUBMAP(entry));
18718 } else {
18719 vm_object_deallocate(VME_OBJECT(entry));
18720 }
18721 vm_map_copy_entry_dispose(entry);
18722 entry = VM_MAP_ENTRY_NULL;
18723 }
18724
18725 /* restore copy_map's original page_shift */
18726 copy_map->cpy_hdr.page_shift = copy_page_shift;
18727 }
18728
18729 /*
18730 * Make any necessary adjustments to "copy_map" to allow it to be
18731 * mapped into "target_map".
18732 * If no changes were necessary, "target_copy_map" points to the
18733 * untouched "copy_map".
18734 * If changes are necessary, changes will be made to "target_copy_map".
18735 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18736 * copy the original "copy_map" to it before applying the changes.
18737 * The caller should discard "target_copy_map" if it's not the same as
18738 * the original "copy_map".
18739 */
18740 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
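/*
 * A minimal usage sketch of the contract above (variable names are
 * illustrative, not taken from a specific caller):
 *
 *	vm_map_copy_t target_copy_map = VM_MAP_COPY_NULL;
 *	kr = vm_map_copy_adjust_to_target(copy_map, offset, size,
 *	    target_map, copy, &target_copy_map,
 *	    &overmap_start, &overmap_end, &trimmed_start);
 *	if (kr == KERN_SUCCESS && target_copy_map != copy_map) {
 *		... use the adjusted "target_copy_map" ...
 *		vm_map_copy_discard(target_copy_map);
 *	}
 */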
18741 kern_return_t
18742 vm_map_copy_adjust_to_target(
18743 vm_map_copy_t src_copy_map,
18744 vm_map_offset_ut offset_u,
18745 vm_map_size_ut size_u,
18746 vm_map_t target_map,
18747 boolean_t copy,
18748 vm_map_copy_t *target_copy_map_p,
18749 vm_map_offset_t *overmap_start_p,
18750 vm_map_offset_t *overmap_end_p,
18751 vm_map_offset_t *trimmed_start_p)
18752 {
18753 vm_map_copy_t copy_map, target_copy_map;
18754 vm_map_size_t target_size;
18755 vm_map_size_t src_copy_map_size;
18756 vm_map_size_t overmap_start, overmap_end;
18757 int misalignments;
18758 vm_map_entry_t entry, target_entry;
18759 vm_map_offset_t addr_adjustment;
18760 vm_map_offset_t new_start, new_end;
18761 int copy_page_mask, target_page_mask;
18762 uint16_t copy_page_shift, target_page_shift;
18763 vm_map_offset_t trimmed_end;
18764 vm_map_size_t map_size;
18765 kern_return_t kr;
18766
18767 /*
18768 * Sanitize any input parameters that are addr/size/prot/inherit
18769 */
18770 kr = vm_map_copy_addr_size_sanitize(
18771 target_map,
18772 offset_u,
18773 size_u,
18774 VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE,
18775 &new_start,
18776 &new_end,
18777 &map_size);
18778 if (__improbable(kr != KERN_SUCCESS)) {
18779 return vm_sanitize_get_kr(kr);
18780 }
18781
18782 /*
18783 * Assert that the vm_map_copy is coming from the right
18784 * zone and hasn't been forged
18785 */
18786 vm_map_copy_require(src_copy_map);
18787 assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18788
18789 /*
18790 * Start working with "src_copy_map" but we'll switch
18791 * to "target_copy_map" as soon as we start making adjustments.
18792 */
18793 copy_map = src_copy_map;
18794 src_copy_map_size = src_copy_map->size;
18795
18796 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18797 copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
18798 target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18799 target_page_mask = VM_MAP_PAGE_MASK(target_map);
18800
18801 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p);
18802
18803 target_copy_map = *target_copy_map_p;
18804 if (target_copy_map != VM_MAP_COPY_NULL) {
18805 vm_map_copy_require(target_copy_map);
18806 }
18807
18808 if (new_end > copy_map->size) {
18809 DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u));
18810 return KERN_INVALID_ARGUMENT;
18811 }
18812
18813 /* trim the end */
18814 trimmed_end = 0;
18815 new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask);
18816 if (new_end < copy_map->size) {
18817 trimmed_end = src_copy_map_size - new_end;
18818 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18819 /* get "target_copy_map" if needed and adjust it */
18820 vm_map_copy_adjust_get_target_copy_map(copy_map,
18821 &target_copy_map);
18822 copy_map = target_copy_map;
18823 vm_map_copy_trim(target_copy_map, target_page_shift,
18824 new_end, copy_map->size);
18825 }
18826
18827 /* trim the start */
18828 new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask);
18829 if (new_start != 0) {
18830 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start);
18831 /* get "target_copy_map" if needed and adjust it */
18832 vm_map_copy_adjust_get_target_copy_map(copy_map,
18833 &target_copy_map);
18834 copy_map = target_copy_map;
18835 vm_map_copy_trim(target_copy_map, target_page_shift,
18836 0, new_start);
18837 }
18838 *trimmed_start_p = new_start;
18839
18840 /* target_size starts with what's left after trimming */
18841 target_size = copy_map->size;
18842 assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18843 "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18844 (uint64_t)target_size, (uint64_t)src_copy_map_size,
18845 (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18846
18847 /* check for misalignments but don't adjust yet */
18848 misalignments = 0;
18849 overmap_start = 0;
18850 overmap_end = 0;
18851 if (copy_page_shift < target_page_shift) {
18852 /*
18853 * Remapping from 4K to 16K: check the VM object alignments
18854 * throughout the range.
18855 * If the start and end of the range are mis-aligned, we can
18856 * over-map to re-align, and adjust the "overmap" start/end
18857 * and "target_size" of the range accordingly.
18858 * If there is any mis-alignment within the range:
18859 * if "copy":
18860 * we can do immediate-copy instead of copy-on-write,
18861 * else:
18862 * no way to remap and share; fail.
18863 */
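/*
 * Illustrative example: with 4K source pages and a 16K target
 * (target_page_mask == 0x3fff), an object offset of 0x3000 at the
 * start of the first entry is not 16K-aligned.  When not copying,
 * this is handled by over-mapping: overmap_start later becomes
 * 0x3000 - trunc_page_mask_64(0x3000, 0x3fff) = 0x3000, and the
 * mapping is extended back to the 16K boundary.  The same
 * misalignment in the middle of the range cannot be fixed by
 * over-mapping and is counted as a "misalignment" instead.
 */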
18864 for (entry = vm_map_copy_first_entry(copy_map);
18865 entry != vm_map_copy_to_entry(copy_map);
18866 entry = entry->vme_next) {
18867 vm_object_offset_t object_offset_start, object_offset_end;
18868
18869 object_offset_start = VME_OFFSET(entry);
18870 object_offset_end = object_offset_start;
18871 object_offset_end += entry->vme_end - entry->vme_start;
18872 if (object_offset_start & target_page_mask) {
18873 if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
18874 overmap_start++;
18875 } else {
18876 misalignments++;
18877 }
18878 }
18879 if (object_offset_end & target_page_mask) {
18880 if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
18881 overmap_end++;
18882 } else {
18883 misalignments++;
18884 }
18885 }
18886 }
18887 }
18888 entry = VM_MAP_ENTRY_NULL;
18889
18890 /* decide how to deal with misalignments */
18891 assert(overmap_start <= 1);
18892 assert(overmap_end <= 1);
18893 if (!overmap_start && !overmap_end && !misalignments) {
18894 /* copy_map is properly aligned for target_map ... */
18895 if (*trimmed_start_p) {
18896 /* ... but we trimmed it, so still need to adjust */
18897 } else {
18898 /* ... and we didn't trim anything: we're done */
18899 if (target_copy_map == VM_MAP_COPY_NULL) {
18900 target_copy_map = copy_map;
18901 }
18902 *target_copy_map_p = target_copy_map;
18903 *overmap_start_p = 0;
18904 *overmap_end_p = 0;
18905 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18906 return KERN_SUCCESS;
18907 }
18908 } else if (misalignments && !copy) {
18909 /* can't "share" if misaligned */
18910 DEBUG4K_ADJUST("unsupported sharing\n");
18911 #if MACH_ASSERT
18912 if (debug4k_panic_on_misaligned_sharing) {
18913 panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
18914 }
18915 #endif /* MACH_ASSERT */
18916 DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
18917 return KERN_NOT_SUPPORTED;
18918 } else {
18919 /* can't virtual-copy if misaligned (but can physical-copy) */
18920 DEBUG4K_ADJUST("mis-aligned copying\n");
18921 }
18922
18923 /* get a "target_copy_map" if needed and switch to it */
18924 vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
18925 copy_map = target_copy_map;
18926
18927 if (misalignments && copy) {
18928 vm_map_size_t target_copy_map_size;
18929
18930 /*
18931 * Can't do copy-on-write with misaligned mappings.
18932 * Replace the mappings with a physical copy of the original
18933 * mappings' contents.
18934 */
18935 target_copy_map_size = target_copy_map->size;
18936 kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
18937 if (kr != KERN_SUCCESS) {
18938 return kr;
18939 }
18940 *target_copy_map_p = target_copy_map;
18941 *overmap_start_p = 0;
18942 *overmap_end_p = target_copy_map->size - target_copy_map_size;
18943 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18944 return KERN_SUCCESS;
18945 }
18946
18947 /* apply the adjustments */
18948 misalignments = 0;
18949 overmap_start = 0;
18950 overmap_end = 0;
18951 /* remove copy_map->offset, so that everything starts at offset 0 */
18952 addr_adjustment = copy_map->offset;
18953 /* also remove whatever we trimmed from the start */
18954 addr_adjustment += *trimmed_start_p;
18955 for (target_entry = vm_map_copy_first_entry(target_copy_map);
18956 target_entry != vm_map_copy_to_entry(target_copy_map);
18957 target_entry = target_entry->vme_next) {
18958 vm_object_offset_t object_offset_start, object_offset_end;
18959
18960 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18961 object_offset_start = VME_OFFSET(target_entry);
18962 if (object_offset_start & target_page_mask) {
18963 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18964 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18965 /*
18966 * start of 1st entry is mis-aligned:
18967 * re-adjust by over-mapping.
18968 */
18969 overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
18970 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
18971 VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
18972 } else {
18973 misalignments++;
18974 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18975 assert(copy);
18976 }
18977 }
18978
18979 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18980 target_size += overmap_start;
18981 } else {
18982 target_entry->vme_start += overmap_start;
18983 }
18984 target_entry->vme_end += overmap_start;
18985
18986 object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
18987 if (object_offset_end & target_page_mask) {
18988 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18989 if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
18990 /*
18991 * end of last entry is mis-aligned: re-adjust by over-mapping.
18992 */
18993 overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
18994 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
18995 target_entry->vme_end += overmap_end;
18996 target_size += overmap_end;
18997 } else {
18998 misalignments++;
18999 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
19000 assert(copy);
19001 }
19002 }
19003 target_entry->vme_start -= addr_adjustment;
19004 target_entry->vme_end -= addr_adjustment;
19005 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
19006 }
19007
19008 target_copy_map->size = target_size;
19009 target_copy_map->offset += overmap_start;
19010 target_copy_map->offset -= addr_adjustment;
19011 target_copy_map->cpy_hdr.page_shift = target_page_shift;
19012
19013 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
19014 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
19015 assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
19016 assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
19017
19018 *target_copy_map_p = target_copy_map;
19019 *overmap_start_p = overmap_start;
19020 *overmap_end_p = overmap_end;
19021
19022 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
19023 return KERN_SUCCESS;
19024 }
19025
19026 kern_return_t
19027 vm_map_range_physical_size(
19028 vm_map_t map,
19029 vm_map_address_t start,
19030 mach_vm_size_t size,
19031 mach_vm_size_t * phys_size)
19032 {
19033 kern_return_t kr;
19034 vm_map_copy_t copy_map, target_copy_map;
19035 vm_map_offset_t adjusted_start, adjusted_end;
19036 vm_map_size_t adjusted_size;
19037 vm_prot_t cur_prot, max_prot;
19038 vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
19039 vm_map_kernel_flags_t vmk_flags;
19040
19041 if (size == 0) {
19042 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
19043 *phys_size = 0;
19044 return KERN_SUCCESS;
19045 }
19046
19047 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
19048 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
19049 if (__improbable(os_add_overflow(start, size, &end) ||
19050 adjusted_end <= adjusted_start)) {
19051 /* wraparound */
19052 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
19053 *phys_size = 0;
19054 return KERN_INVALID_ARGUMENT;
19055 }
19056 if (__improbable(vm_map_range_overflows(map, start, size))) {
19057 *phys_size = 0;
19058 return KERN_INVALID_ADDRESS;
19059 }
19060 assert(adjusted_end > adjusted_start);
19061 adjusted_size = adjusted_end - adjusted_start;
19062 *phys_size = adjusted_size;
19063 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
19064 return KERN_SUCCESS;
19065 }
19066 if (start == 0) {
19067 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
19068 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
19069 if (__improbable(adjusted_end <= adjusted_start)) {
19070 /* wraparound */
19071 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
19072 *phys_size = 0;
19073 return KERN_INVALID_ARGUMENT;
19074 }
19075 assert(adjusted_end > adjusted_start);
19076 adjusted_size = adjusted_end - adjusted_start;
19077 *phys_size = adjusted_size;
19078 return KERN_SUCCESS;
19079 }
19080
19081 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
19082 vmk_flags.vmkf_copy_pageable = TRUE;
19083 vmk_flags.vmkf_copy_same_map = TRUE;
19084 assert(adjusted_size != 0);
19085 cur_prot = VM_PROT_NONE; /* legacy mode */
19086 max_prot = VM_PROT_NONE; /* legacy mode */
19087 vmk_flags.vmkf_remap_legacy_mode = true;
19088 kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
19089 FALSE /* copy */,
19090 &copy_map,
19091 &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
19092 vmk_flags);
19093 if (kr != KERN_SUCCESS) {
19094 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19095 //assert(0);
19096 *phys_size = 0;
19097 return kr;
19098 }
19099 assert(copy_map != VM_MAP_COPY_NULL);
19100 target_copy_map = copy_map;
19101 DEBUG4K_ADJUST("adjusting...\n");
19102 kr = vm_map_copy_adjust_to_target(
19103 copy_map,
19104 start - adjusted_start, /* offset */
19105 size, /* size */
19106 kernel_map,
19107 FALSE, /* copy */
19108 &target_copy_map,
19109 &overmap_start,
19110 &overmap_end,
19111 &trimmed_start);
19112 if (kr == KERN_SUCCESS) {
19113 if (target_copy_map->size != *phys_size) {
19114 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19115 }
19116 *phys_size = target_copy_map->size;
19117 } else {
19118 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19119 //assert(0);
19120 *phys_size = 0;
19121 }
19122 vm_map_copy_discard(copy_map);
19123 copy_map = VM_MAP_COPY_NULL;
19124
19125 return kr;
19126 }
19127
19128 static __attribute__((always_inline, warn_unused_result))
19129 kern_return_t
19130 vm_map_remap_sanitize(
19131 vm_map_t src_map,
19132 vm_map_t target_map,
19133 vm_map_address_ut address_u,
19134 vm_map_size_ut size_u,
19135 vm_map_offset_ut mask_u,
19136 vm_map_offset_ut memory_address_u,
19137 vm_prot_ut cur_protection_u,
19138 vm_prot_ut max_protection_u,
19139 vm_inherit_ut inheritance_u,
19140 vm_map_kernel_flags_t vmk_flags,
19141 vm_map_address_t *target_addr,
19142 vm_map_address_t *mask,
19143 vm_map_offset_t *memory_address,
19144 vm_map_offset_t *memory_end,
19145 vm_map_size_t *memory_size,
19146 vm_prot_t *cur_protection,
19147 vm_prot_t *max_protection,
19148 vm_inherit_t *inheritance)
19149 {
19150 kern_return_t result;
19151 vm_sanitize_flags_t vm_sanitize_flags;
19152
19153 result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP,
19154 inheritance);
19155 if (__improbable(result != KERN_SUCCESS)) {
19156 return result;
19157 }
19158
19159 result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
19160 VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map,
19161 cur_protection, max_protection);
19162 if (__improbable(result != KERN_SUCCESS)) {
19163 return result;
19164 }
19165
19166 result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask);
19167 if (__improbable(result != KERN_SUCCESS)) {
19168 return result;
19169 }
19170
19171 /*
19172 * If the user is requesting that we return the address of the
19173 * first byte of the data (rather than the base of the page),
19174 * then we use different rounding semantics: specifically,
19175 * we assume that (memory_address, size) describes a region
19176 * all of whose pages we must cover, rather than a base to be truncated
19177 * down and a size to be added to that base. So we figure out
19178 * the highest page that the requested region includes and make
19179 * sure that the size will cover it.
19180 *
19181 * The key example we're worried about is of the form:
19182 *
19183 * memory_address = 0x1ff0, size = 0x20
19184 *
19185 * With the old semantics, we round down the memory_address to 0x1000
19186 * and round up the size to 0x1000, resulting in our covering *only*
19187 * page 0x1000. With the new semantics, we'd realize that the region covers
19188 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
19189 * 0x1000 and page 0x2000 in the region we remap.
19190 *
19191 * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics.
19192 */
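/*
 * Spelled out for the example above with 4K pages:
 * old semantics: trunc(0x1ff0) = 0x1000 and round(0x20) = 0x1000,
 * covering only [0x1000, 0x2000);
 * new semantics: the region is [0x1ff0, 0x2010), so
 * trunc(0x1ff0) = 0x1000 and round(0x2010) = 0x3000, giving a size
 * of 0x2000 and covering both pages 0x1000 and 0x2000.
 */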
19193 vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS;
19194 if (!vmk_flags.vmf_return_data_addr) {
19195 vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START;
19196 }
19197
19198 result = vm_sanitize_addr_size(memory_address_u, size_u,
19199 VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map,
19200 vm_sanitize_flags, memory_address, memory_end,
19201 memory_size);
19202 if (__improbable(result != KERN_SUCCESS)) {
19203 return result;
19204 }
19205
19206 *target_addr = vm_sanitize_addr(target_map, address_u);
19207 return KERN_SUCCESS;
19208 }
19209
19210 /*
19211 * Routine: vm_remap
19212 *
19213 * Map a portion of a task's address space.
19214 * The mapped region must not overlap more than
19215 * one VM memory object. Protections and
19216 * inheritance attributes remain the same
19217 * as in the original task and are returned as out parameters.
19218 * Source and target tasks can be identical.
19219 * Other attributes are identical to those for vm_map().
19220 */
19221 kern_return_t
19222 vm_map_remap(
19223 vm_map_t target_map,
19224 vm_map_address_ut *address_u,
19225 vm_map_size_ut size_u,
19226 vm_map_offset_ut mask_u,
19227 vm_map_kernel_flags_t vmk_flags,
19228 vm_map_t src_map,
19229 vm_map_offset_ut memory_address_u,
19230 boolean_t copy,
19231 vm_prot_ut *cur_protection_u, /* IN/OUT */
19232 vm_prot_ut *max_protection_u, /* IN/OUT */
19233 vm_inherit_ut inheritance_u)
19234 {
19235 vm_map_address_t target_addr, mask;
19236 vm_map_size_t target_size;
19237 vm_map_offset_t memory_address, memory_end;
19238 vm_map_size_t memory_size;
19239 vm_prot_t cur_protection, max_protection;
19240 vm_inherit_t inheritance;
19241 kern_return_t result;
19242 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
19243 vm_map_copy_t copy_map;
19244 vm_map_offset_t offset_in_mapping;
19245 vm_map_size_t src_page_mask, target_page_mask;
19246 vm_map_size_t initial_size;
19247 VM_MAP_ZAP_DECLARE(zap_list);
19248
19249 if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) {
19250 return KERN_INVALID_ARGUMENT;
19251 }
19252 src_page_mask = VM_MAP_PAGE_MASK(src_map);
19253 target_page_mask = VM_MAP_PAGE_MASK(target_map);
19254
19255 if (src_page_mask != target_page_mask) {
19256 if (copy) {
19257 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19258 } else {
19259 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19260 }
19261 }
19262
19263 /*
19264 * Sanitize any input parameters that are addr/size/prot/inherit
19265 */
19266 result = vm_map_remap_sanitize(src_map,
19267 target_map,
19268 *address_u,
19269 size_u,
19270 mask_u,
19271 memory_address_u,
19272 *cur_protection_u,
19273 *max_protection_u,
19274 inheritance_u,
19275 vmk_flags,
19276 &target_addr,
19277 &mask,
19278 &memory_address,
19279 &memory_end,
19280 &memory_size,
19281 &cur_protection,
19282 &max_protection,
19283 &inheritance);
19284 if (__improbable(result != KERN_SUCCESS)) {
19285 return vm_sanitize_get_kr(result);
19286 }
19287
19288 if (vmk_flags.vmf_return_data_addr) {
19289 /*
19290 * This is safe to unwrap now that the quantities
19291 * have been validated and rounded up normally.
19292 */
19293 offset_in_mapping = vm_sanitize_offset_in_page(src_map,
19294 memory_address_u);
19295 initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u);
19296 } else {
19297 /*
19298 * IMPORTANT:
19299 * This legacy code path is broken: for the range mentioned
19300 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
19301 * two 4k pages, it yields [ memory_address = 0x1000,
19302 * size = 0x1000 ], which covers only the first 4k page.
19303 * BUT some code unfortunately depends on this bug, so we
19304 * can't fix it without breaking something.
19305 * New code is automatically opted into the new
19306 * behavior by using the VM_FLAGS_RETURN_DATA_ADDR flag.
19307 */
19308 offset_in_mapping = 0;
19309 initial_size = memory_size;
19310 }
19311
19312 if (vmk_flags.vmf_resilient_media) {
19313 /* must be copy-on-write to be "media resilient" */
19314 if (!copy) {
19315 return KERN_INVALID_ARGUMENT;
19316 }
19317 }
19318
19319 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19320 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19321
19322 assert(memory_size != 0);
19323 result = vm_map_copy_extract(src_map,
19324 memory_address,
19325 memory_size,
19326 copy, &copy_map,
19327 &cur_protection, /* IN/OUT */
19328 &max_protection, /* IN/OUT */
19329 inheritance,
19330 vmk_flags);
19331 if (result != KERN_SUCCESS) {
19332 return result;
19333 }
19334 assert(copy_map != VM_MAP_COPY_NULL);
19335
19336 /*
19337 * Handle the policy for vm map ranges
19338 *
19339 * If the maps differ, the target_map policy applies, as for vm_map().
19340 * For remaps within the same map, we preserve the original range.
19341 */
19342 if (vmk_flags.vmkf_copy_same_map) {
19343 vmk_flags.vmkf_range_id = copy_map->orig_range;
19344 } else {
19345 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size);
19346 }
19347
19348 target_size = memory_size;
19349 if (src_page_mask != target_page_mask) {
19350 vm_map_copy_t target_copy_map;
19351 vm_map_offset_t overmap_start = 0;
19352 vm_map_offset_t overmap_end = 0;
19353 vm_map_offset_t trimmed_start = 0;
19354
19355 target_copy_map = copy_map; /* can modify "copy_map" itself */
19356 DEBUG4K_ADJUST("adjusting...\n");
19357 result = vm_map_copy_adjust_to_target(
19358 copy_map,
19359 offset_in_mapping, /* offset */
19360 initial_size,
19361 target_map,
19362 copy,
19363 &target_copy_map,
19364 &overmap_start,
19365 &overmap_end,
19366 &trimmed_start);
19367 if (result != KERN_SUCCESS) {
19368 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19369 vm_map_copy_discard(copy_map);
19370 return result;
19371 }
19372 if (trimmed_start == 0) {
19373 /* nothing trimmed: no adjustment needed */
19374 } else if (trimmed_start >= offset_in_mapping) {
19375 /* trimmed more than offset_in_mapping: nothing left */
19376 assert(overmap_start == 0);
19377 assert(overmap_end == 0);
19378 offset_in_mapping = 0;
19379 } else {
19380 /* trimmed some of offset_in_mapping: adjust */
19381 assert(overmap_start == 0);
19382 assert(overmap_end == 0);
19383 offset_in_mapping -= trimmed_start;
19384 }
19385 offset_in_mapping += overmap_start;
19386 target_size = target_copy_map->size;
19387 }
19388
19389 /*
19390 * Allocate/check a range of free virtual address
19391 * space for the target
19392 */
19393 target_size = vm_map_round_page(target_size, target_page_mask);
19394
19395 if (target_size == 0) {
19396 vm_map_copy_discard(copy_map);
19397 return KERN_INVALID_ARGUMENT;
19398 }
19399
19400 vm_map_lock(target_map);
19401
19402 if (!vmk_flags.vmf_fixed) {
19403 result = vm_map_locate_space_anywhere(target_map, target_size,
19404 mask, vmk_flags, &target_addr, &insp_entry);
19405 } else {
19406 /*
19407 * vm_map_locate_space_fixed will reject overflowing
19408 * target_addr + target_size values
19409 */
19410 result = vm_map_locate_space_fixed(target_map, target_addr,
19411 target_size, mask, vmk_flags, &insp_entry, &zap_list);
19412
19413 if (result == KERN_MEMORY_PRESENT) {
19414 assert(!vmk_flags.vmkf_already);
19415 insp_entry = VM_MAP_ENTRY_NULL;
19416 result = KERN_NO_SPACE;
19417 }
19418 }
19419
19420 if (result == KERN_SUCCESS) {
19421 while (vm_map_copy_first_entry(copy_map) !=
19422 vm_map_copy_to_entry(copy_map)) {
19423 vm_map_entry_t entry = vm_map_copy_first_entry(copy_map);
19424
19425 vm_map_copy_entry_unlink(copy_map, entry);
19426
19427 if (vmk_flags.vmkf_remap_prot_copy) {
19428 /*
19429 * This vm_map_remap() is for a
19430 * vm_protect(VM_PROT_COPY), so the caller
19431 * expects to be allowed to add write access
19432 * to this new mapping. This is done by
19433 * adding VM_PROT_WRITE to each entry's
19434 * max_protection... unless some security
19435 * settings disallow it.
19436 */
19437 bool allow_write = false;
19438 if (entry->vme_permanent) {
19439 /* immutable mapping... */
19440 if ((entry->max_protection & VM_PROT_EXECUTE) &&
19441 developer_mode_state()) {
19442 /*
19443 * ... but executable and
19444 * possibly being debugged,
19445 * so let's allow it to become
19446 * writable, for breakpoints
19447 * and dtrace probes, for
19448 * example.
19449 */
19450 allow_write = true;
19451 } else {
19452 printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19453 proc_selfpid(),
19454 (get_bsdtask_info(current_task())
19455 ? proc_name_address(get_bsdtask_info(current_task()))
19456 : "?"),
19457 (uint64_t)memory_address,
19458 (uint64_t)memory_size,
19459 entry->protection,
19460 entry->max_protection,
19461 developer_mode_state());
19462 DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19463 vm_map_entry_t, entry,
19464 vm_map_offset_t, entry->vme_start,
19465 vm_map_offset_t, entry->vme_end,
19466 vm_prot_t, entry->protection,
19467 vm_prot_t, entry->max_protection,
19468 int, VME_ALIAS(entry));
19469 }
19470 } else {
19471 allow_write = true;
19472 }
19473
19474 /*
19475 * VM_PROT_COPY: allow this mapping to become
19476 * writable, unless it was "permanent".
19477 */
19478 if (allow_write) {
19479 entry->max_protection |= VM_PROT_WRITE;
19480 }
19481 }
19482 if (vmk_flags.vmf_resilient_codesign) {
19483 /* no codesigning -> read-only access */
19484 entry->max_protection = VM_PROT_READ;
19485 entry->protection = VM_PROT_READ;
19486 entry->vme_resilient_codesign = TRUE;
19487 }
19488 entry->vme_start += target_addr;
19489 entry->vme_end += target_addr;
19490 assert(!entry->map_aligned);
19491 if (vmk_flags.vmf_resilient_media &&
19492 !entry->is_sub_map &&
19493 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19494 VME_OBJECT(entry)->internal)) {
19495 entry->vme_resilient_media = TRUE;
19496 }
19497 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19498 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19499 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19500 vm_map_store_entry_link(target_map, insp_entry, entry,
19501 vmk_flags);
19502 insp_entry = entry;
19503 }
19504 }
19505
19506 if (vmk_flags.vmf_resilient_codesign) {
19507 cur_protection = VM_PROT_READ;
19508 max_protection = VM_PROT_READ;
19509 }
19510
19511 if (result == KERN_SUCCESS) {
19512 target_map->size += target_size;
19513 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19514 }
19515 vm_map_unlock(target_map);
19516
19517 vm_map_zap_dispose(&zap_list);
19518
19519 if (result == KERN_SUCCESS && target_map->wiring_required) {
19520 result = vm_map_wire_nested(target_map, target_addr,
19521 target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK,
19522 TRUE, PMAP_NULL, 0, NULL);
19523 }
19524
19525 if (result == KERN_SUCCESS) {
19526 #if KASAN
19527 if (target_map->pmap == kernel_pmap) {
19528 kasan_notify_address(target_addr, target_size);
19529 }
19530 #endif
19531 /*
19532 * If requested, return the address of the data pointed to by the
19533 * request, rather than the base of the resulting page.
19534 */
19535 if (vmk_flags.vmf_return_data_addr) {
19536 target_addr += offset_in_mapping;
19537 }
19538
19539 /*
19540 * Update OUT parameters.
19541 */
19542 *address_u = vm_sanitize_wrap_addr(target_addr);
19543
19544 *cur_protection_u = vm_sanitize_wrap_prot(cur_protection);
19545 *max_protection_u = vm_sanitize_wrap_prot(max_protection);
19546 }
19547
19548 if (src_page_mask != target_page_mask) {
19549 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result);
19550 }
19551 vm_map_copy_discard(copy_map);
19552 copy_map = VM_MAP_COPY_NULL;
19553
19554 return result;
19555 }
19556
19557 /*
19558 * vm_map_switch:
19559 *
19560 * Set the address map for the current thread to the specified map
19561 */
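/*
 * Typical pattern for callers that need to copy data in or out of
 * another task's map (as in vm_map_write_user() and
 * vm_map_read_user() below): temporarily adopt that map around the
 * copy, then restore the original:
 *
 *	vm_map_reference(map);
 *	oldmap = vm_map_switch(map);
 *	... copyin()/copyout() against "map" ...
 *	vm_map_switch(oldmap);
 *	vm_map_deallocate(map);
 */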
19562
19563 vm_map_t
19564 vm_map_switch(
19565 vm_map_t map)
19566 {
19567 thread_t thread = current_thread();
19568 vm_map_t oldmap = thread->map;
19569
19570
19571 /*
19572 * Deactivate the current map and activate the requested map
19573 */
19574 mp_disable_preemption();
19575 PMAP_SWITCH_USER(thread, map, cpu_number());
19576 mp_enable_preemption();
19577 return oldmap;
19578 }
19579
19580 static __attribute__((always_inline, warn_unused_result))
19581 kern_return_t
19582 vm_map_rw_user_sanitize(
19583 vm_map_t map,
19584 vm_map_address_ut addr_u,
19585 vm_size_ut size_u,
19586 vm_sanitize_caller_t vm_sanitize_caller,
19587 vm_map_address_t *addr,
19588 vm_map_address_t *end,
19589 vm_map_size_t *size)
19590 {
19591 vm_sanitize_flags_t flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
19592 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES;
19593
19594
19595 return vm_sanitize_addr_size(addr_u, size_u,
19596 vm_sanitize_caller, map,
19597 flags,
19598 addr, end, size);
19599 }
19600
19601 /*
19602 * Routine: vm_map_write_user
19603 *
19604 * Description:
19605 * Copy out data from a kernel space into space in the
19606 * destination map. The space must already exist in the
19607 * destination map.
19608 * NOTE: This routine should only be called by threads
19609 * which can block on a page fault. i.e. kernel mode user
19610 * threads.
19611 *
19612 */
19613 kern_return_t
19614 vm_map_write_user(
19615 vm_map_t map,
19616 void *src_p,
19617 vm_map_address_ut dst_addr_u,
19618 vm_size_ut size_u)
19619 {
19620 kern_return_t kr;
19621 vm_map_address_t dst_addr, dst_end;
19622 vm_map_size_t size;
19623
19624 /*
19625 * src_p isn't validated: [src_p, src_p + size_u)
19626 * is trusted kernel input.
19627 *
19628 * dst_addr_u and size_u are untrusted and need to be sanitized.
19629 */
19630 kr = vm_map_rw_user_sanitize(map,
19631 dst_addr_u,
19632 size_u,
19633 VM_SANITIZE_CALLER_VM_MAP_WRITE_USER,
19634 &dst_addr,
19635 &dst_end,
19636 &size);
19637 if (__improbable(kr != KERN_SUCCESS)) {
19638 return vm_sanitize_get_kr(kr);
19639 }
19640
19641 if (current_map() == map) {
19642 if (copyout(src_p, dst_addr, size)) {
19643 kr = KERN_INVALID_ADDRESS;
19644 }
19645 } else {
19646 vm_map_t oldmap;
19647
19648 /* take on the identity of the target map while doing the transfer */
19650
19651 vm_map_reference(map);
19652 oldmap = vm_map_switch(map);
19653 if (copyout(src_p, dst_addr, size)) {
19654 kr = KERN_INVALID_ADDRESS;
19655 }
19656 vm_map_switch(oldmap);
19657 vm_map_deallocate(map);
19658 }
19659 return kr;
19660 }
19661
19662 /*
19663 * Routine: vm_map_read_user
19664 *
19665 * Description:
19666 * Copy in data from a user space source map into the
19667 * kernel map. The space must already exist in the
19668 * kernel map.
19669 * NOTE: This routine should only be called by threads
19670 * which can block on a page fault. i.e. kernel mode user
19671 * threads.
19672 *
19673 */
19674 kern_return_t
19675 vm_map_read_user(
19676 vm_map_t map,
19677 vm_map_address_ut src_addr_u,
19678 void *dst_p,
19679 vm_size_ut size_u)
19680 {
19681 kern_return_t kr;
19682 vm_map_address_t src_addr, src_end;
19683 vm_map_size_t size;
19684
19685 /*
19686 * dst_p isn't validated: [dst_p, dst_p + size_u)
19687 * is trusted kernel input.
19688 *
19689 * src_addr_u and size_u are untrusted and need to be sanitized.
19690 */
19691 kr = vm_map_rw_user_sanitize(map,
19692 src_addr_u,
19693 size_u,
19694 VM_SANITIZE_CALLER_VM_MAP_READ_USER,
19695 &src_addr,
19696 &src_end,
19697 &size);
19698 if (__improbable(kr != KERN_SUCCESS)) {
19699 return vm_sanitize_get_kr(kr);
19700 }
19701
19702 if (current_map() == map) {
19703 if (copyin(src_addr, dst_p, size)) {
19704 kr = KERN_INVALID_ADDRESS;
19705 }
19706 } else {
19707 vm_map_t oldmap;
19708
19709 /* take on the identity of the target map while doing the transfer */
19711
19712 vm_map_reference(map);
19713 oldmap = vm_map_switch(map);
19714 if (copyin(src_addr, dst_p, size)) {
19715 kr = KERN_INVALID_ADDRESS;
19716 }
19717 vm_map_switch(oldmap);
19718 vm_map_deallocate(map);
19719 }
19720 return kr;
19721 }
19722
19723
19724 static __attribute__((always_inline, warn_unused_result))
19725 kern_return_t
19726 vm_map_check_protection_sanitize(
19727 vm_map_t map,
19728 vm_map_offset_ut start_u,
19729 vm_map_offset_ut end_u,
19730 vm_prot_ut protection_u,
19731 vm_sanitize_caller_t vm_sanitize_caller,
19732 vm_map_offset_t *start,
19733 vm_map_offset_t *end,
19734 vm_prot_t *protection)
19735 {
19736 kern_return_t kr;
19737 vm_map_size_t size;
19738
19739 kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
19740 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start, end,
19741 &size);
19742 if (__improbable(kr != KERN_SUCCESS)) {
19743 return kr;
19744 }
19745
19746 /*
19747 * Since the protection is used only for comparisons below,
19748 * no sanitization is applied to it.
19749 */
19750 *protection = VM_SANITIZE_UNSAFE_UNWRAP(protection_u);
19751
19752 return KERN_SUCCESS;
19753 }
19754
19755 /*
19756 * vm_map_check_protection:
19757 *
19758 * Assert that the target map allows the specified
19759 * privilege on the entire address region given.
19760 * The entire region must be allocated.
19761 */
19762 boolean_t
19763 vm_map_check_protection(
19764 vm_map_t map,
19765 vm_map_offset_ut start_u,
19766 vm_map_offset_ut end_u,
19767 vm_prot_ut protection_u,
19768 vm_sanitize_caller_t vm_sanitize_caller)
19769 {
19770 vm_map_entry_t entry;
19771 vm_map_entry_t tmp_entry;
19772 vm_map_offset_t start;
19773 vm_map_offset_t end;
19774 vm_prot_t protection;
19775 kern_return_t kr;
19776
19777 kr = vm_map_check_protection_sanitize(map,
19778 start_u,
19779 end_u,
19780 protection_u,
19781 vm_sanitize_caller,
19782 &start,
19783 &end,
19784 &protection);
19785 if (__improbable(kr != KERN_SUCCESS)) {
19786 kr = vm_sanitize_get_kr(kr);
19787 if (kr == KERN_SUCCESS) {
19788 return true;
19789 }
19790 return false;
19791 }
19792
19793 vm_map_lock(map);
19794
19795 if (start < vm_map_min(map) || end > vm_map_max(map)) {
19796 vm_map_unlock(map);
19797 return false;
19798 }
19799
19800 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19801 vm_map_unlock(map);
19802 return false;
19803 }
19804
19805 entry = tmp_entry;
19806
19807 while (start < end) {
19808 if (entry == vm_map_to_entry(map)) {
19809 vm_map_unlock(map);
19810 return false;
19811 }
19812
19813 /*
19814 * No holes allowed!
19815 */
19816
19817 if (start < entry->vme_start) {
19818 vm_map_unlock(map);
19819 return false;
19820 }
19821
19822 /*
19823 * Check protection associated with entry.
19824 */
19825
19826 if ((entry->protection & protection) != protection) {
19827 vm_map_unlock(map);
19828 return false;
19829 }
19830
19831 /* go to next entry */
19832
19833 start = entry->vme_end;
19834 entry = entry->vme_next;
19835 }
19836 vm_map_unlock(map);
19837 return true;
19838 }
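
/*
 * Hedged usage sketch (illustrative only, not compiled here): a caller
 * that wants to confirm a user range is readable before operating on it
 * might use vm_map_check_protection() roughly as below. The "start_u",
 * "end_u" and "prot_u" unsafe-wrapped values are assumed to arrive from
 * the usual vm_sanitize entry points, and the caller constant is only an
 * example.
 *
 *	if (!vm_map_check_protection(map, start_u, end_u, prot_u,
 *	        VM_SANITIZE_CALLER_VM_MAP_READ_USER)) {
 *	        return KERN_PROTECTION_FAILURE;
 *	}
 */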
19839
19840 kern_return_t
19841 vm_map_purgable_control(
19842 vm_map_t map,
19843 vm_map_offset_ut address_u,
19844 vm_purgable_t control,
19845 int *state)
19846 {
19847 vm_map_offset_t address;
19848 vm_map_entry_t entry;
19849 vm_object_t object;
19850 kern_return_t kr;
19851 boolean_t was_nonvolatile;
19852
19853 /*
19854 * Vet all the input parameters and current type and state of the
19855 * underlying object. Return with an error if anything is amiss.
19856 */
19857 if (map == VM_MAP_NULL) {
19858 return KERN_INVALID_ARGUMENT;
19859 }
19860
19861 if (control != VM_PURGABLE_SET_STATE &&
19862 control != VM_PURGABLE_GET_STATE &&
19863 control != VM_PURGABLE_PURGE_ALL &&
19864 control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
19865 return KERN_INVALID_ARGUMENT;
19866 }
19867
19868 if (control == VM_PURGABLE_PURGE_ALL) {
19869 vm_purgeable_object_purge_all();
19870 return KERN_SUCCESS;
19871 }
19872
19873 if ((control == VM_PURGABLE_SET_STATE ||
19874 control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
19875 (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
19876 ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
19877 return KERN_INVALID_ARGUMENT;
19878 }
19879
19880 address = vm_sanitize_addr(map, address_u);
19881
19882 vm_map_lock_read(map);
19883
19884 if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
19885 /*
19886 * Must pass a valid non-submap address.
19887 */
19888 vm_map_unlock_read(map);
19889 return KERN_INVALID_ADDRESS;
19890 }
19891
19892 if ((entry->protection & VM_PROT_WRITE) == 0 &&
19893 control != VM_PURGABLE_GET_STATE) {
19894 /*
19895 * Can't apply purgable controls to something you can't write.
19896 */
19897 vm_map_unlock_read(map);
19898 return KERN_PROTECTION_FAILURE;
19899 }
19900
19901 object = VME_OBJECT(entry);
19902 if (object == VM_OBJECT_NULL ||
19903 object->purgable == VM_PURGABLE_DENY) {
19904 /*
19905 * Object must already be present and be purgeable.
19906 */
19907 vm_map_unlock_read(map);
19908 return KERN_INVALID_ARGUMENT;
19909 }
19910
19911 vm_object_lock(object);
19912
19913 #if 00
19914 if (VME_OFFSET(entry) != 0 ||
19915 entry->vme_end - entry->vme_start != object->vo_size) {
19916 /*
19917 * Can only apply purgable controls to the whole (existing)
19918 * object at once.
19919 */
19920 vm_map_unlock_read(map);
19921 vm_object_unlock(object);
19922 return KERN_INVALID_ARGUMENT;
19923 }
19924 #endif
19925
19926 assert(!entry->is_sub_map);
19927 assert(!entry->use_pmap); /* purgeable has its own accounting */
19928
19929 vm_map_unlock_read(map);
19930
19931 was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
19932
19933 kr = vm_object_purgable_control(object, control, state);
19934
19935 if (was_nonvolatile &&
19936 object->purgable != VM_PURGABLE_NONVOLATILE &&
19937 map->pmap == kernel_pmap) {
19938 #if DEBUG
19939 object->vo_purgeable_volatilizer = kernel_task;
19940 #endif /* DEBUG */
19941 }
19942
19943 vm_object_unlock(object);
19944
19945 return kr;
19946 }
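
/*
 * Hedged usage sketch (illustrative only): marking a purgeable region
 * volatile and reading back its previous state. "addr_u" is assumed to
 * be an unsafe-wrapped address produced by the caller's MIG/sanitize
 * path and to fall within a region backed by a purgeable object.
 *
 *	int state = VM_PURGABLE_VOLATILE;
 *	kern_return_t kr;
 *
 *	kr = vm_map_purgable_control(map, addr_u,
 *	    VM_PURGABLE_SET_STATE, &state);
 *	if (kr == KERN_SUCCESS) {
 *	        // "state" now holds the object's previous purgeable state
 *	}
 */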
19947
19948 void
19949 vm_map_footprint_query_page_info(
19950 vm_map_t map,
19951 vm_map_entry_t map_entry,
19952 vm_map_offset_t curr_s_offset,
19953 int *disposition_p)
19954 {
19955 int pmap_disp;
19956 vm_object_t object = VM_OBJECT_NULL;
19957 int disposition;
19958 int effective_page_size;
19959
19960 vm_map_lock_assert_held(map);
19961 assert(!map->has_corpse_footprint);
19962 assert(curr_s_offset >= map_entry->vme_start);
19963 assert(curr_s_offset < map_entry->vme_end);
19964
19965 if (map_entry->is_sub_map) {
19966 if (!map_entry->use_pmap) {
19967 /* nested pmap: no footprint */
19968 *disposition_p = 0;
19969 return;
19970 }
19971 } else {
19972 object = VME_OBJECT(map_entry);
19973 if (object == VM_OBJECT_NULL) {
19974 /* nothing mapped here: no need to ask */
19975 *disposition_p = 0;
19976 return;
19977 }
19978 }
19979
19980 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
19981
19982 pmap_disp = 0;
19983
19984 /*
19985 * Query the pmap.
19986 */
19987 pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
19988
19989 /*
19990 * Compute this page's disposition.
19991 */
19992 disposition = 0;
19993
19994 /* deal with "alternate accounting" first */
19995 if (!map_entry->is_sub_map &&
19996 object->vo_no_footprint) {
19997 /* does not count in footprint */
19998 // assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19999 } else if (!map_entry->is_sub_map &&
20000 !object->internal &&
20001 object->vo_ledger_tag &&
20002 VM_OBJECT_OWNER(object) != NULL &&
20003 VM_OBJECT_OWNER(object)->map == map) {
20004 /* owned external object: wired pages count in footprint */
20005 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20006 if ((((curr_s_offset
20007 - map_entry->vme_start
20008 + VME_OFFSET(map_entry))
20009 / effective_page_size) <
20010 object->wired_page_count)) {
20011 /*
20012 * External object owned by this task: report the first
20013 * "#wired" pages as "resident" (to show that they
20014 * contribute to the footprint) but not "dirty"
20015 * (to avoid double-counting with the fake "owned"
20016 * region we'll report at the end of the address space
20017 * to account for all (mapped or not) memory
20018 * owned by this task).
20019 */
20020 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20021 }
20022 } else if (!map_entry->is_sub_map &&
20023 object->internal &&
20024 (object->purgable == VM_PURGABLE_NONVOLATILE ||
20025 (object->purgable == VM_PURGABLE_DENY &&
20026 object->vo_ledger_tag)) &&
20027 VM_OBJECT_OWNER(object) != NULL &&
20028 VM_OBJECT_OWNER(object)->map == map) {
20029 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20030 if ((((curr_s_offset
20031 - map_entry->vme_start
20032 + VME_OFFSET(map_entry))
20033 / effective_page_size) <
20034 (object->resident_page_count +
20035 vm_compressor_pager_get_count(object->pager)))) {
20036 /*
20037 * Non-volatile purgeable object owned
20038 * by this task: report the first
20039 * "#resident + #compressed" pages as
20040 * "resident" (to show that they
20041 * contribute to the footprint) but not
20042 * "dirty" (to avoid double-counting
20043 * with the fake "non-volatile" region
20044 * we'll report at the end of the
20045 * address space to account for all
20046 * (mapped or not) non-volatile memory
20047 * owned by this task).
20048 */
20049 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20050 }
20051 } else if (!map_entry->is_sub_map &&
20052 object->internal &&
20053 (object->purgable == VM_PURGABLE_VOLATILE ||
20054 object->purgable == VM_PURGABLE_EMPTY) &&
20055 VM_OBJECT_OWNER(object) != NULL &&
20056 VM_OBJECT_OWNER(object)->map == map) {
20057 if (object->internal) {
20058 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20059 }
20060 if ((((curr_s_offset
20061 - map_entry->vme_start
20062 + VME_OFFSET(map_entry))
20063 / effective_page_size) <
20064 object->wired_page_count)) {
20065 /*
20066 * Volatile|empty purgeable object owned
20067 * by this task: report the first
20068 * "#wired" pages as "resident" (to
20069 * show that they contribute to the
20070 * footprint) but not "dirty" (to avoid
20071 * double-counting with the fake
20072 * "non-volatile" region we'll report
20073 * at the end of the address space to
20074 * account for all (mapped or not)
20075 * non-volatile memory owned by this
20076 * task).
20077 */
20078 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20079 }
20080 } else if (!map_entry->is_sub_map &&
20081 map_entry->iokit_acct &&
20082 object->internal &&
20083 object->purgable == VM_PURGABLE_DENY) {
20084 /*
20085 * Non-purgeable IOKit memory: phys_footprint
20086 * includes the entire virtual mapping.
20087 */
20088 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20089 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20090 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20091 } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
20092 PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
20093 /* alternate accounting */
20094 #if __arm64__ && (DEVELOPMENT || DEBUG)
20095 if (map->pmap->footprint_was_suspended) {
20096 /*
20097 * The assertion below can fail if dyld
20098 * suspended footprint accounting
20099 * while doing some adjustments to
20100 * this page; the mapping would say
20101 * "use pmap accounting" but the page
20102 * would be marked "alternate
20103 * accounting".
20104 */
20105 } else
20106 #endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
20107 {
20108 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20109 }
20110 disposition = 0;
20111 } else {
20112 if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
20113 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20114 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20115 disposition |= VM_PAGE_QUERY_PAGE_REF;
20116 if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
20117 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20118 } else {
20119 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20120 }
20121 if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
20122 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20123 }
20124 } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
20125 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
20126 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20127 }
20128 }
20129
20130 *disposition_p = disposition;
20131 }
20132
20133 kern_return_t
20134 vm_map_page_info(
20135 vm_map_t map,
20136 vm_map_offset_ut offset_u,
20137 vm_page_info_flavor_t flavor,
20138 vm_page_info_t info,
20139 mach_msg_type_number_t *count)
20140 {
20141 return vm_map_page_range_info_internal(map,
20142 offset_u, /* start of range */
20143 vm_sanitize_compute_ut_end(offset_u, 1), /* this will get rounded to the page boundary in the call */
20144 (int)-1, /* effective_page_shift: unspecified */
20145 flavor,
20146 info,
20147 count);
20148 }
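
/*
 * Hedged usage sketch (illustrative only): querying the basic
 * disposition of a single page. "offset_u" is assumed to be an
 * unsafe-wrapped address supplied by the caller's sanitize path.
 *
 *	struct vm_page_info_basic basic;
 *	mach_msg_type_number_t count = VM_PAGE_INFO_BASIC_COUNT;
 *	kern_return_t kr;
 *
 *	kr = vm_map_page_info(map, offset_u, VM_PAGE_INFO_BASIC,
 *	    (vm_page_info_t)&basic, &count);
 *	if (kr == KERN_SUCCESS &&
 *	    (basic.disposition & VM_PAGE_QUERY_PAGE_PRESENT)) {
 *	        // the page is resident
 *	}
 */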
20149
20150 static __attribute__((always_inline, warn_unused_result))
20151 kern_return_t
20152 vm_map_page_range_info_sanitize(
20153 vm_map_t map,
20154 vm_map_offset_ut start_offset_u,
20155 vm_map_offset_ut end_offset_u,
20156 vm_map_offset_t effective_page_mask,
20157 vm_map_offset_t *start,
20158 vm_map_offset_t *end,
20159 vm_map_offset_t *offset_in_page)
20160 {
20161 kern_return_t retval;
20162 vm_map_size_t size;
20163
20164 /*
20165 * Perform validation against map's mask but don't align start/end,
20166 * as we need those to be aligned with respect to effective_page_mask
20167 */
20168 retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20169 VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, map,
20170 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH |
20171 VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES, start,
20172 end, &size);
20173 if (retval != KERN_SUCCESS) {
20174 return retval;
20175 }
20176
20177 retval = vm_sanitize_addr_end(start_offset_u, end_offset_u,
20178 VM_SANITIZE_CALLER_VM_MAP_PAGE_RANGE_INFO, effective_page_mask,
20179 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH, start,
20180 end, &size);
20181 if (retval != KERN_SUCCESS) {
20182 return retval;
20183 }
20184
20185 *offset_in_page = vm_sanitize_offset_in_page(effective_page_mask,
20186 start_offset_u);
20187
20188 return KERN_SUCCESS;
20189 }
20190
20191 kern_return_t
20192 vm_map_page_range_info_internal(
20193 vm_map_t map,
20194 vm_map_offset_ut start_offset_u,
20195 vm_map_offset_ut end_offset_u,
20196 int effective_page_shift,
20197 vm_page_info_flavor_t flavor,
20198 vm_page_info_t info,
20199 mach_msg_type_number_t *count)
20200 {
20201 vm_map_entry_t map_entry = VM_MAP_ENTRY_NULL;
20202 vm_object_t object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20203 vm_page_t m = VM_PAGE_NULL;
20204 kern_return_t retval = KERN_SUCCESS;
20205 int disposition = 0;
20206 int ref_count = 0;
20207 int depth = 0, info_idx = 0;
20208 vm_page_info_basic_t basic_info = 0;
20209 vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20210 vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20211 boolean_t do_region_footprint;
20212 ledger_amount_t ledger_resident, ledger_compressed;
20213 int effective_page_size;
20214 vm_map_offset_t effective_page_mask;
20215
20216 switch (flavor) {
20217 case VM_PAGE_INFO_BASIC:
20218 if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20219 /*
20220 * The "vm_page_info_basic_data" structure was not
20221 * properly padded, so allow the size to be off by
20222 * one to maintain backwards binary compatibility...
20223 */
20224 if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20225 return KERN_INVALID_ARGUMENT;
20226 }
20227 }
20228 break;
20229 default:
20230 return KERN_INVALID_ARGUMENT;
20231 }
20232
20233 if (effective_page_shift == -1) {
20234 effective_page_shift = vm_self_region_page_shift_safely(map);
20235 if (effective_page_shift == -1) {
20236 return KERN_INVALID_ARGUMENT;
20237 }
20238 }
20239 effective_page_size = (1 << effective_page_shift);
20240 effective_page_mask = effective_page_size - 1;
20241
20242
20243 retval = vm_map_page_range_info_sanitize(map,
20244 start_offset_u,
20245 end_offset_u,
20246 effective_page_mask,
20247 &start,
20248 &end,
20249 &offset_in_page);
20250 if (retval != KERN_SUCCESS) {
20251 return vm_sanitize_get_kr(retval);
20252 }
20253
20254 assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20255
20256 do_region_footprint = task_self_region_footprint();
20257 disposition = 0;
20258 ref_count = 0;
20259 depth = 0;
20260 info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20261
20262 vm_map_lock_read(map);
20263
20264 task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20265
20266 for (curr_s_offset = start; curr_s_offset < end;) {
20267 /*
20268 * New lookup needs reset of these variables.
20269 */
20270 curr_object = object = VM_OBJECT_NULL;
20271 offset_in_object = 0;
20272 ref_count = 0;
20273 depth = 0;
20274
20275 if (do_region_footprint &&
20276 curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20277 /*
20278 * Request for "footprint" info about a page beyond
20279 * the end of address space: this must be for
20280 * the fake region vm_map_region_recurse_64()
20281 * reported to account for non-volatile purgeable
20282 * memory owned by this task.
20283 */
20284 disposition = 0;
20285
20286 if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20287 (unsigned) ledger_compressed) {
20288 /*
20289 * We haven't reported all the "non-volatile
20290 * compressed" pages yet, so report this fake
20291 * page as "compressed".
20292 */
20293 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20294 } else {
20295 /*
20296 * We've reported all the non-volatile
20297 * compressed pages but not all the non-volatile
20298 * pages, so report this fake page as
20299 * "resident dirty".
20300 */
20301 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20302 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20303 disposition |= VM_PAGE_QUERY_PAGE_REF;
20304 }
20305 switch (flavor) {
20306 case VM_PAGE_INFO_BASIC:
20307 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20308 basic_info->disposition = disposition;
20309 basic_info->ref_count = 1;
20310 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20311 basic_info->offset = 0;
20312 basic_info->depth = 0;
20313
20314 info_idx++;
20315 break;
20316 }
20317 curr_s_offset += effective_page_size;
20318 continue;
20319 }
20320
20321 /*
20322 * First, find the map entry covering "curr_s_offset", going down
20323 * submaps if necessary.
20324 */
20325 if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20326 /* no entry -> no object -> no page */
20327
20328 if (curr_s_offset < vm_map_min(map)) {
20329 /*
20330 * Illegal address that falls below map min.
20331 */
20332 curr_e_offset = MIN(end, vm_map_min(map));
20333 } else if (curr_s_offset >= vm_map_max(map)) {
20334 /*
20335 * Illegal address that falls on/after map max.
20336 */
20337 curr_e_offset = end;
20338 } else if (map_entry == vm_map_to_entry(map)) {
20339 /*
20340 * Hit a hole.
20341 */
20342 if (map_entry->vme_next == vm_map_to_entry(map)) {
20343 /*
20344 * Empty map.
20345 */
20346 curr_e_offset = MIN(map->max_offset, end);
20347 } else {
20348 /*
20349 * Hole at start of the map.
20350 */
20351 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20352 }
20353 } else {
20354 if (map_entry->vme_next == vm_map_to_entry(map)) {
20355 /*
20356 * Hole at the end of the map.
20357 */
20358 curr_e_offset = MIN(map->max_offset, end);
20359 } else {
20360 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20361 }
20362 }
20363
20364 assert(curr_e_offset >= curr_s_offset);
20365
20366 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20367
20368 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20369
20370 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20371
20372 curr_s_offset = curr_e_offset;
20373
20374 info_idx += num_pages;
20375
20376 continue;
20377 }
20378
20379 /* compute offset from this map entry's start */
20380 offset_in_object = curr_s_offset - map_entry->vme_start;
20381
20382 /* compute offset into this map entry's object (or submap) */
20383 offset_in_object += VME_OFFSET(map_entry);
20384
20385 if (map_entry->is_sub_map) {
20386 vm_map_t sub_map = VM_MAP_NULL;
20387 vm_page_info_t submap_info = 0;
20388 vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20389
20390 range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20391
20392 submap_s_offset = offset_in_object;
20393 submap_e_offset = submap_s_offset + range_len;
20394
20395 sub_map = VME_SUBMAP(map_entry);
20396
20397 vm_map_reference(sub_map);
20398 vm_map_unlock_read(map);
20399
20400 submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20401
20402 assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20403 "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20404
20405 retval = vm_map_page_range_info_internal(sub_map,
20406 submap_s_offset,
20407 submap_e_offset,
20408 effective_page_shift,
20409 VM_PAGE_INFO_BASIC,
20410 (vm_page_info_t) submap_info,
20411 count);
20412
20413 assert(retval == KERN_SUCCESS);
20414
20415 vm_map_lock_read(map);
20416 vm_map_deallocate(sub_map);
20417
20418 /* Move the "info" index by the number of pages we inspected.*/
20419 info_idx += range_len >> effective_page_shift;
20420
20421 /* Move our current offset by the size of the range we inspected.*/
20422 curr_s_offset += range_len;
20423
20424 continue;
20425 }
20426
20427 object = VME_OBJECT(map_entry);
20428
20429 if (object == VM_OBJECT_NULL) {
20430 /*
20431 * We don't have an object here and, hence,
20432 * no pages to inspect. We'll fill up the
20433 * info structure appropriately.
20434 */
20435
20436 curr_e_offset = MIN(map_entry->vme_end, end);
20437
20438 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20439
20440 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20441
20442 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20443
20444 curr_s_offset = curr_e_offset;
20445
20446 info_idx += num_pages;
20447
20448 continue;
20449 }
20450
20451 if (do_region_footprint) {
20452 disposition = 0;
20453 if (map->has_corpse_footprint) {
20454 /*
20455 * Query the page info data we saved
20456 * while forking the corpse.
20457 */
20458 vm_map_corpse_footprint_query_page_info(
20459 map,
20460 curr_s_offset,
20461 &disposition);
20462 } else {
20463 /*
20464 * Query the live pmap for footprint info
20465 * about this page.
20466 */
20467 vm_map_footprint_query_page_info(
20468 map,
20469 map_entry,
20470 curr_s_offset,
20471 &disposition);
20472 }
20473 switch (flavor) {
20474 case VM_PAGE_INFO_BASIC:
20475 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20476 basic_info->disposition = disposition;
20477 basic_info->ref_count = 1;
20478 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20479 basic_info->offset = 0;
20480 basic_info->depth = 0;
20481
20482 info_idx++;
20483 break;
20484 }
20485 curr_s_offset += effective_page_size;
20486 continue;
20487 }
20488
20489 vm_object_reference(object);
20490 /*
20491 * Shared mode -- so we can allow other readers
20492 * to grab the lock too.
20493 */
20494 vm_object_lock_shared(object);
20495
20496 curr_e_offset = MIN(map_entry->vme_end, end);
20497
20498 vm_map_unlock_read(map);
20499
20500 map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20501
20502 curr_object = object;
20503
20504 for (; curr_s_offset < curr_e_offset;) {
20505 if (object == curr_object) {
20506 /* account for our object reference above. */
20507 ref_count = os_ref_get_count_raw(&curr_object->ref_count) - 1;
20508 } else {
20509 ref_count = os_ref_get_count_raw(&curr_object->ref_count);
20510 }
20511
20512 curr_offset_in_object = offset_in_object;
20513
20514 for (;;) {
20515 m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20516
20517 if (m != VM_PAGE_NULL) {
20518 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20519 break;
20520 } else {
20521 if (curr_object->internal &&
20522 curr_object->alive &&
20523 !curr_object->terminating &&
20524 curr_object->pager_ready) {
20525 if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object))
20526 == VM_EXTERNAL_STATE_EXISTS) {
20527 /* the pager has that page */
20528 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20529 break;
20530 }
20531 }
20532
20533 /*
20534 * Go down the VM object shadow chain until we find the page
20535 * we're looking for.
20536 */
20537
20538 if (curr_object->shadow != VM_OBJECT_NULL) {
20539 vm_object_t shadow = VM_OBJECT_NULL;
20540
20541 curr_offset_in_object += curr_object->vo_shadow_offset;
20542 shadow = curr_object->shadow;
20543
20544 vm_object_lock_shared(shadow);
20545 vm_object_unlock(curr_object);
20546
20547 curr_object = shadow;
20548 depth++;
20549 continue;
20550 } else {
20551 break;
20552 }
20553 }
20554 }
20555
20556 /* The ref_count is not strictly accurate: it measures the number */
20557 /* of entities holding a ref on the object; they may not be mapping */
20558 /* the object, or may not be mapping the section holding the */
20559 /* target page, but it's still a ballpark number and, though an */
20560 /* over-count, it picks up the copy-on-write cases. */
20561
20562 /* We could also get a picture of page sharing from pmap_attributes, */
20563 /* but this would undercount, as only faulted-in mappings would */
20564 /* show up. */
20565
20566 if ((curr_object == object) && curr_object->shadow) {
20567 disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20568 }
20569
20570 if (!curr_object->internal) {
20571 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20572 }
20573
20574 if (m != VM_PAGE_NULL) {
20575 if (m->vmp_fictitious) {
20576 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20577 } else {
20578 if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20579 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20580 }
20581
20582 if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20583 disposition |= VM_PAGE_QUERY_PAGE_REF;
20584 }
20585
20586 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20587 disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20588 }
20589
20590 /*
20591 * XXX TODO4K:
20592 * when this routine deals with 4k
20593 * pages, check the appropriate CS bit
20594 * here.
20595 */
20596 if (m->vmp_cs_validated) {
20597 disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20598 }
20599 if (m->vmp_cs_tainted) {
20600 disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20601 }
20602 if (m->vmp_cs_nx) {
20603 disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20604 }
20605 if (m->vmp_reusable || curr_object->all_reusable) {
20606 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20607 }
20608 }
20609 }
20610
20611 switch (flavor) {
20612 case VM_PAGE_INFO_BASIC:
20613 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20614 basic_info->disposition = disposition;
20615 basic_info->ref_count = ref_count;
20616 basic_info->object_id = (vm_object_id_t) (uintptr_t)
20617 VM_KERNEL_ADDRHASH(curr_object);
20618 basic_info->offset =
20619 (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20620 basic_info->depth = depth;
20621
20622 info_idx++;
20623 break;
20624 }
20625
20626 disposition = 0;
20627 offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20628
20629 /*
20630 * Move to next offset in the range and in our object.
20631 */
20632 curr_s_offset += effective_page_size;
20633 offset_in_object += effective_page_size;
20634 curr_offset_in_object = offset_in_object;
20635
20636 if (curr_object != object) {
20637 vm_object_unlock(curr_object);
20638
20639 curr_object = object;
20640
20641 vm_object_lock_shared(curr_object);
20642 } else {
20643 vm_object_lock_yield_shared(curr_object);
20644 }
20645 }
20646
20647 vm_object_unlock(curr_object);
20648 vm_object_deallocate(curr_object);
20649
20650 vm_map_lock_read(map);
20651 }
20652
20653 vm_map_unlock_read(map);
20654 return retval;
20655 }
20656
20657 static __attribute__((always_inline, warn_unused_result))
20658 kern_return_t
20659 vm_map_msync_sanitize(
20660 vm_map_t map,
20661 vm_map_address_ut address_u,
20662 vm_map_size_ut size_u,
20663 vm_object_offset_t *address,
20664 vm_map_size_t *size)
20665 {
20666 vm_object_offset_t end;
20667
20668 return vm_sanitize_addr_size(address_u, size_u,
20669 VM_SANITIZE_CALLER_VM_MAP_MSYNC,
20670 map, VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS,
20671 address, &end, size);
20672 }
20673
20674 /*
20675 * vm_map_msync
20676 *
20677 * Synchronises the memory range specified with its backing store
20678 * image by either flushing or cleaning the contents to the appropriate
20679 * memory manager engaging in a memory object synchronize dialog with
20680 * the manager. The client doesn't return until the manager issues
20681 * m_o_s_completed message. MIG Magically converts user task parameter
20682 * to the task's address map.
20683 *
20684 * interpretation of sync_flags
20685 * VM_SYNC_INVALIDATE - discard pages, only return precious
20686 * pages to manager.
20687 *
20688 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20689 * - discard pages, write dirty or precious
20690 * pages back to memory manager.
20691 *
20692 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20693 * - write dirty or precious pages back to
20694 * the memory manager.
20695 *
20696 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
20697 * is a hole in the region, and we would
20698 * have returned KERN_SUCCESS, return
20699 * KERN_INVALID_ADDRESS instead.
20700 *
20701 * NOTE
20702 * The memory object attributes have not yet been implemented, this
20703 * function will have to deal with the invalidate attribute
20704 *
20705 * RETURNS
20706 * KERN_INVALID_TASK Bad task parameter
20707 * KERN_INVALID_ARGUMENT both sync and async were specified.
20708 * KERN_SUCCESS The usual.
20709 * KERN_INVALID_ADDRESS There was a hole in the region.
20710 */
20711
20712 kern_return_t
20713 vm_map_msync(
20714 vm_map_t map,
20715 vm_map_address_ut address_u,
20716 vm_map_size_ut size_u,
20717 vm_sync_t sync_flags)
20718 {
20719 vm_map_entry_t entry;
20720 vm_map_size_t size, amount_left;
20721 vm_object_offset_t address, offset;
20722 vm_object_offset_t start_offset, end_offset;
20723 boolean_t do_sync_req;
20724 boolean_t had_hole = FALSE;
20725 vm_map_offset_t pmap_offset;
20726 kern_return_t kr;
20727
20728 if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20729 (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20730 return KERN_INVALID_ARGUMENT;
20731 }
20732
20733 if (map == VM_MAP_NULL) {
20734 return KERN_INVALID_TASK;
20735 }
20736
20737 kr = vm_map_msync_sanitize(map,
20738 address_u,
20739 size_u,
20740 &address,
20741 &size);
20742 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20743 DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20744 }
20745 if (__improbable(kr != KERN_SUCCESS)) {
20746 return vm_sanitize_get_kr(kr);
20747 }
20748
20749 amount_left = size;
20750
20751 while (amount_left > 0) {
20752 vm_object_size_t flush_size;
20753 vm_object_t object;
20754
20755 vm_map_lock(map);
20756 if (!vm_map_lookup_entry(map,
20757 address,
20758 &entry)) {
20759 vm_map_size_t skip;
20760
20761 /*
20762 * hole in the address map.
20763 */
20764 had_hole = TRUE;
20765
20766 if (sync_flags & VM_SYNC_KILLPAGES) {
20767 /*
20768 * For VM_SYNC_KILLPAGES, there should be
20769 * no holes in the range, since we couldn't
20770 * prevent someone else from allocating in
20771 * that hole and we wouldn't want to "kill"
20772 * their pages.
20773 */
20774 vm_map_unlock(map);
20775 break;
20776 }
20777
20778 /*
20779 * Check for empty map.
20780 */
20781 if (entry == vm_map_to_entry(map) &&
20782 entry->vme_next == entry) {
20783 vm_map_unlock(map);
20784 break;
20785 }
20786 /*
20787 * Check that we don't wrap and that
20788 * we have at least one real map entry.
20789 */
20790 if ((map->hdr.nentries == 0) ||
20791 (entry->vme_next->vme_start < address)) {
20792 vm_map_unlock(map);
20793 break;
20794 }
20795 /*
20796 * Move up to the next entry if needed
20797 */
20798 skip = (entry->vme_next->vme_start - address);
20799 if (skip >= amount_left) {
20800 amount_left = 0;
20801 } else {
20802 amount_left -= skip;
20803 }
20804 address = entry->vme_next->vme_start;
20805 vm_map_unlock(map);
20806 continue;
20807 }
20808
20809 offset = address - entry->vme_start;
20810 pmap_offset = address;
20811
20812 /*
20813 * do we have more to flush than is contained in this
20814 * entry ?
20815 */
20816 if (amount_left + entry->vme_start + offset > entry->vme_end) {
20817 flush_size = entry->vme_end -
20818 (entry->vme_start + offset);
20819 } else {
20820 flush_size = amount_left;
20821 }
20822 amount_left -= flush_size;
20823 address += flush_size;
20824
20825 if (entry->is_sub_map == TRUE) {
20826 vm_map_t local_map;
20827 vm_map_offset_t local_offset;
20828
20829 local_map = VME_SUBMAP(entry);
20830 local_offset = VME_OFFSET(entry);
20831 vm_map_reference(local_map);
20832 vm_map_unlock(map);
20833 if (vm_map_msync(
20834 local_map,
20835 local_offset,
20836 flush_size,
20837 sync_flags) == KERN_INVALID_ADDRESS) {
20838 had_hole = TRUE;
20839 }
20840 vm_map_deallocate(local_map);
20841 continue;
20842 }
20843 object = VME_OBJECT(entry);
20844
20845 /*
20846 * We can't sync this object if the object has not been
20847 * created yet
20848 */
20849 if (object == VM_OBJECT_NULL) {
20850 vm_map_unlock(map);
20851 continue;
20852 }
20853 offset += VME_OFFSET(entry);
20854
20855 vm_object_lock(object);
20856
20857 if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20858 int kill_pages = 0;
20859
20860 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20861 /*
20862 * This is a destructive operation and so we
20863 * err on the side of limiting the range of
20864 * the operation.
20865 */
20866 start_offset = vm_object_round_page(offset);
20867 end_offset = vm_object_trunc_page(offset + flush_size);
20868
20869 if (end_offset <= start_offset) {
20870 vm_object_unlock(object);
20871 vm_map_unlock(map);
20872 continue;
20873 }
20874
20875 pmap_offset += start_offset - offset;
20876 } else {
20877 start_offset = offset;
20878 end_offset = offset + flush_size;
20879 }
20880
20881 if (sync_flags & VM_SYNC_KILLPAGES) {
20882 if (((os_ref_get_count_raw(&object->ref_count) == 1) ||
20883 ((object->copy_strategy !=
20884 MEMORY_OBJECT_COPY_SYMMETRIC) &&
20885 (object->vo_copy == VM_OBJECT_NULL))) &&
20886 (object->shadow == VM_OBJECT_NULL)) {
20887 if (os_ref_get_count_raw(&object->ref_count) != 1) {
20888 vm_page_stats_reusable.free_shared++;
20889 }
20890 kill_pages = 1;
20891 } else {
20892 kill_pages = -1;
20893 }
20894 }
20895 if (kill_pages != -1) {
20896 vm_object_deactivate_pages(
20897 object,
20898 start_offset,
20899 (vm_object_size_t) (end_offset - start_offset),
20900 kill_pages,
20901 FALSE, /* reusable_pages */
20902 FALSE, /* reusable_no_write */
20903 map->pmap,
20904 pmap_offset);
20905 }
20906 vm_object_unlock(object);
20907 vm_map_unlock(map);
20908 continue;
20909 }
20910 /*
20911 * We can't sync this object if there isn't a pager.
20912 * Don't bother to sync internal objects, since there can't
20913 * be any "permanent" storage for these objects anyway.
20914 */
20915 if ((object->pager == MEMORY_OBJECT_NULL) ||
20916 (object->internal) || (object->private)) {
20917 vm_object_unlock(object);
20918 vm_map_unlock(map);
20919 continue;
20920 }
20921 /*
20922 * keep reference on the object until syncing is done
20923 */
20924 vm_object_reference_locked(object);
20925 vm_object_unlock(object);
20926
20927 vm_map_unlock(map);
20928
20929 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20930 start_offset = vm_object_trunc_page(offset);
20931 end_offset = vm_object_round_page(offset + flush_size);
20932 } else {
20933 start_offset = offset;
20934 end_offset = offset + flush_size;
20935 }
20936
20937 do_sync_req = vm_object_sync(object,
20938 start_offset,
20939 (end_offset - start_offset),
20940 sync_flags & VM_SYNC_INVALIDATE,
20941 ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20942 (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20943 sync_flags & VM_SYNC_SYNCHRONOUS);
20944
20945 if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20946 /*
20947 * clear out the clustering and read-ahead hints
20948 */
20949 vm_object_lock(object);
20950
20951 object->pages_created = 0;
20952 object->pages_used = 0;
20953 object->sequential = 0;
20954 object->last_alloc = 0;
20955
20956 vm_object_unlock(object);
20957 }
20958 vm_object_deallocate(object);
20959 } /* while */
20960
20961 /* for proper msync() behaviour */
20962 if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20963 return KERN_INVALID_ADDRESS;
20964 }
20965
20966 return KERN_SUCCESS;
20967 }/* vm_msync */
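
/*
 * Hedged usage sketch (illustrative only): an msync(MS_SYNC)-style
 * flush of a user range, failing if the range contains a hole.
 * "addr_u" and "size_u" are assumed to be unsafe-wrapped values from
 * the caller's sanitize path.
 *
 *	kern_return_t kr;
 *
 *	kr = vm_map_msync(current_map(), addr_u, size_u,
 *	    VM_SYNC_SYNCHRONOUS | VM_SYNC_CONTIGUOUS);
 *	if (kr == KERN_INVALID_ADDRESS) {
 *	        // there was a hole in the requested region
 *	}
 */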
20968
20969 void
20970 vm_named_entry_associate_vm_object(
20971 vm_named_entry_t named_entry,
20972 vm_object_t object,
20973 vm_object_offset_t offset,
20974 vm_object_size_t size,
20975 vm_prot_t prot)
20976 {
20977 vm_map_copy_t copy;
20978 vm_map_entry_t copy_entry;
20979
20980 assert(!named_entry->is_sub_map);
20981 assert(!named_entry->is_copy);
20982 assert(!named_entry->is_object);
20983 assert(!named_entry->internal);
20984 assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
20985
20986 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
20987 copy->offset = offset;
20988 copy->size = size;
20989 copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
20990
20991 copy_entry = vm_map_copy_entry_create(copy);
20992 copy_entry->protection = prot;
20993 copy_entry->max_protection = prot;
20994 copy_entry->use_pmap = TRUE;
20995 copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
20996 copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
20997 VME_OBJECT_SET(copy_entry, object, false, 0);
20998 VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
20999 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
21000
21001 named_entry->backing.copy = copy;
21002 named_entry->is_object = TRUE;
21003 if (object->internal) {
21004 named_entry->internal = TRUE;
21005 }
21006
21007 DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
21008 named_entry, copy, object, offset, size, prot);
21009 }
21010
21011 vm_object_t
21012 vm_named_entry_to_vm_object(
21013 vm_named_entry_t named_entry)
21014 {
21015 vm_map_copy_t copy;
21016 vm_map_entry_t copy_entry;
21017 vm_object_t object;
21018
21019 assert(!named_entry->is_sub_map);
21020 assert(!named_entry->is_copy);
21021 assert(named_entry->is_object);
21022 copy = named_entry->backing.copy;
21023 assert(copy != VM_MAP_COPY_NULL);
21024 /*
21025 * Assert that the vm_map_copy is coming from the right
21026 * zone and hasn't been forged
21027 */
21028 vm_map_copy_require(copy);
21029 assert(copy->cpy_hdr.nentries == 1);
21030 copy_entry = vm_map_copy_first_entry(copy);
21031 object = VME_OBJECT(copy_entry);
21032
21033 DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
21034
21035 return object;
21036 }
21037
21038 /*
21039 * Routine: convert_port_entry_to_map
21040 * Purpose:
21041 * Convert from a port specifying an entry or a task
21042 * to a map. Doesn't consume the port ref; produces a map ref,
21043 * which may be null. Unlike convert_port_to_map, the
21044 * port may be a task port or a named-entry port.
21045 * Conditions:
21046 * Nothing locked.
21047 */
21048
21049 vm_map_t
21050 convert_port_entry_to_map(
21051 ipc_port_t port)
21052 {
21053 vm_map_t map = VM_MAP_NULL;
21054 vm_named_entry_t named_entry;
21055
21056 if (!IP_VALID(port)) {
21057 return VM_MAP_NULL;
21058 }
21059
21060 if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
21061 return convert_port_to_map(port);
21062 }
21063
21064 named_entry = mach_memory_entry_from_port(port);
21065
21066 if ((named_entry->is_sub_map) &&
21067 (named_entry->protection & VM_PROT_WRITE)) {
21068 map = named_entry->backing.map;
21069 if (map->pmap != PMAP_NULL) {
21070 if (map->pmap == kernel_pmap) {
21071 panic("userspace has access "
21072 "to a kernel map %p", map);
21073 }
21074 pmap_require(map->pmap);
21075 }
21076 vm_map_reference(map);
21077 }
21078
21079 return map;
21080 }
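
/*
 * Illustrative pattern (assumptions: "port" is a send right the caller
 * owns; the port ref is not consumed, but a map ref is produced and
 * must eventually be dropped):
 *
 *	vm_map_t target = convert_port_entry_to_map(port);
 *	if (target == VM_MAP_NULL) {
 *	        return KERN_INVALID_ARGUMENT;
 *	}
 *	... operate on "target" ...
 *	vm_map_deallocate(target);
 */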
21081
21082 /*
21083 * Export routines to other components for the things we access locally through
21084 * macros.
21085 */
21086 #undef current_map
21087 vm_map_t
21088 current_map(void)
21089 {
21090 return current_map_fast();
21091 }
21092
21093 /*
21094 * vm_map_reference:
21095 *
21096 * Takes a reference on the specified map.
21097 */
21098 void
21099 vm_map_reference(
21100 vm_map_t map)
21101 {
21102 if (__probable(map != VM_MAP_NULL)) {
21103 vm_map_require(map);
21104 os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
21105 }
21106 }
21107
21108 /*
21109 * vm_map_deallocate:
21110 *
21111 * Removes a reference from the specified map,
21112 * destroying it if no references remain.
21113 * The map should not be locked.
21114 */
21115 void
21116 vm_map_deallocate(
21117 vm_map_t map)
21118 {
21119 if (__probable(map != VM_MAP_NULL)) {
21120 vm_map_require(map);
21121 if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
21122 vm_map_destroy(map);
21123 }
21124 }
21125 }
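
/*
 * Illustrative pairing (assumption: the caller holds a pointer to a
 * valid map but no reference of its own):
 *
 *	vm_map_reference(map);
 *	... use the map without it being destroyed underneath us ...
 *	vm_map_deallocate(map);
 */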
21126
21127 void
21128 vm_map_inspect_deallocate(
21129 vm_map_inspect_t map)
21130 {
21131 vm_map_deallocate((vm_map_t)map);
21132 }
21133
21134 void
21135 vm_map_read_deallocate(
21136 vm_map_read_t map)
21137 {
21138 vm_map_deallocate((vm_map_t)map);
21139 }
21140
21141
21142 void
21143 vm_map_disable_NX(vm_map_t map)
21144 {
21145 if (map == NULL) {
21146 return;
21147 }
21148 if (map->pmap == NULL) {
21149 return;
21150 }
21151
21152 pmap_disable_NX(map->pmap);
21153 }
21154
21155 void
21156 vm_map_disallow_data_exec(vm_map_t map)
21157 {
21158 if (map == NULL) {
21159 return;
21160 }
21161
21162 map->map_disallow_data_exec = TRUE;
21163 }
21164
21165 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21166 * more descriptive.
21167 */
21168 void
21169 vm_map_set_32bit(vm_map_t map)
21170 {
21171 #if defined(__arm64__)
21172 map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21173 #else
21174 map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21175 #endif
21176 }
21177
21178
21179 void
21180 vm_map_set_64bit(vm_map_t map)
21181 {
21182 #if defined(__arm64__)
21183 map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21184 #else
21185 map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21186 #endif
21187 }
21188
21189 /*
21190 * Expand the maximum size of an existing map to 64GB.
21191 */
21192 void
21193 vm_map_set_jumbo(vm_map_t map)
21194 {
21195 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21196 vm_map_set_max_addr(map, ~0, false);
21197 #else /* arm64 */
21198 (void) map;
21199 #endif
21200 }
21201
21202 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21203 /*
21204 * Expand the maximum size of an existing map to the maximum supported.
21205 */
21206 void
21207 vm_map_set_extra_jumbo(vm_map_t map)
21208 {
21209 #if defined (__arm64__) && !XNU_TARGET_OS_OSX
21210 vm_map_set_max_addr(map, ~0, true);
21211 #else /* arm64 */
21212 (void) map;
21213 #endif
21214 }
21215 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21216
21217 /*
21218 * This map has a JIT entitlement
21219 */
21220 void
21221 vm_map_set_jit_entitled(vm_map_t map)
21222 {
21223 #if defined (__arm64__)
21224 pmap_set_jit_entitled(map->pmap);
21225 #else /* arm64 */
21226 (void) map;
21227 #endif
21228 }
21229
21230 /*
21231 * Get status of this maps TPRO flag
21232 */
21233 boolean_t
21234 vm_map_tpro(vm_map_t map)
21235 {
21236 #if defined (__arm64e__)
21237 return pmap_get_tpro(map->pmap);
21238 #else /* arm64e */
21239 (void) map;
21240 return FALSE;
21241 #endif
21242 }
21243
21244 /*
21245 * This map has TPRO enabled
21246 */
21247 void
21248 vm_map_set_tpro(vm_map_t map)
21249 {
21250 #if defined (__arm64e__)
21251 pmap_set_tpro(map->pmap);
21252 #else /* arm64e */
21253 (void) map;
21254 #endif
21255 }
21256
21257 /*
21258 * Does this map have TPRO enforcement enabled
21259 */
21260 boolean_t
21261 vm_map_tpro_enforcement(vm_map_t map)
21262 {
21263 return map->tpro_enforcement;
21264 }
21265
21266 /*
21267 * Set TPRO enforcement for this map
21268 */
21269 void
21270 vm_map_set_tpro_enforcement(vm_map_t map)
21271 {
21272 if (vm_map_tpro(map)) {
21273 vm_map_lock(map);
21274 map->tpro_enforcement = TRUE;
21275 vm_map_unlock(map);
21276 }
21277 }
21278
21279 /*
21280 * Enable TPRO on the requested region
21281 *
21282 * Note:
21283 * This routine is primarily intended to be called during/soon after map
21284 * creation before the associated task has been released to run. It is only
21285 * currently safe when we have no resident pages.
21286 */
21287 boolean_t
21288 vm_map_set_tpro_range(
21289 __unused vm_map_t map,
21290 __unused vm_map_address_t start,
21291 __unused vm_map_address_t end)
21292 {
21293 return TRUE;
21294 }
21295
21296 /*
21297 * Expand the maximum size of an existing map.
21298 */
21299 void
21300 vm_map_set_max_addr(
21301 vm_map_t map,
21302 vm_map_offset_t new_max_offset,
21303 __unused bool extra_jumbo)
21304 {
21305 #if defined(__arm64__)
21306 vm_map_offset_t max_supported_offset;
21307 vm_map_offset_t old_max_offset;
21308 unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO;
21309
21310 vm_map_lock(map);
21311
21312 old_max_offset = map->max_offset;
21313 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
21314 if (extra_jumbo) {
21315 option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO;
21316 }
21317 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
21318 max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option);
21319
21320 new_max_offset = trunc_page(new_max_offset);
21321
21322 /* The address space cannot be shrunk using this routine. */
21323 if (old_max_offset >= new_max_offset) {
21324 vm_map_unlock(map);
21325 return;
21326 }
21327
21328 if (max_supported_offset < new_max_offset) {
21329 new_max_offset = max_supported_offset;
21330 }
21331
21332 map->max_offset = new_max_offset;
21333
21334 /*
21335 * Disable the following chunk of code that extends the "holes" list
21336 * to accommodate a larger VM map.
21337 * In `vm_map_create_options()`, we now set the end of the "holes" list to
21338 * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms.
21339 * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process
21340 * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS.
21341 * The "holes" list does not need to be adjusted.
21342 */
21343 #if 0
21344 if (map->holelistenabled) {
21345 if (map->holes_list->prev->vme_end == old_max_offset) {
21346 /*
21347 * There is already a hole at the end of the map; simply make it bigger.
21348 */
21349 map->holes_list->prev->vme_end = map->max_offset;
21350 } else {
21351 /*
21352 * There is no hole at the end, so we need to create a new hole
21353 * for the new empty space we're creating.
21354 */
21355 struct vm_map_links *new_hole;
21356
21357 new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
21358 new_hole->start = old_max_offset;
21359 new_hole->end = map->max_offset;
21360 new_hole->prev = map->holes_list->prev;
21361 new_hole->next = (struct vm_map_entry *)map->holes_list;
21362 map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21363 map->holes_list->prev = (struct vm_map_entry *)new_hole;
21364 }
21365 }
21366 #endif
21367
21368 vm_map_unlock(map);
21369 #else
21370 (void)map;
21371 (void)new_max_offset;
21372 #endif
21373 }
21374
21375 vm_map_offset_t
21376 vm_compute_max_offset(boolean_t is64)
21377 {
21378 #if defined(__arm64__)
21379 return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21380 #else
21381 return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21382 #endif
21383 }
21384
21385 void
21386 vm_map_get_max_aslr_slide_section(
21387 vm_map_t map __unused,
21388 int64_t *max_sections,
21389 int64_t *section_size)
21390 {
21391 #if defined(__arm64__)
21392 *max_sections = 3;
21393 *section_size = ARM_TT_TWIG_SIZE;
21394 #else
21395 *max_sections = 1;
21396 *section_size = 0;
21397 #endif
21398 }
21399
21400 uint64_t
21401 vm_map_get_max_aslr_slide_pages(vm_map_t map)
21402 {
21403 #if defined(__arm64__)
21404 /* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21405 * limited embedded address space; this is also meant to minimize pmap
21406 * memory usage on 16KB page systems.
21407 */
21408 return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21409 #else
21410 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21411 #endif
21412 }
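
/*
 * Worked example (arm64 with 16KB pages, so VM_MAP_PAGE_SHIFT(map) == 14):
 * 1 << (24 - 14) == 1024 slide slots, each one 16KB page, i.e. a maximum
 * slide of 16MB, matching the comment above.
 */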
21413
21414 uint64_t
21415 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21416 {
21417 #if defined(__arm64__)
21418 /* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21419 * of independent entropy on 16KB page systems.
21420 */
21421 return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21422 #else
21423 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21424 #endif
21425 }
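
/*
 * Worked example (arm64 with 16KB pages): 1 << (22 - 14) == 256 slide
 * slots of one 16KB page each, i.e. a maximum loader slide of 4MB,
 * which preserves the 8 bits of independent entropy mentioned above.
 */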
21426
21427 boolean_t
21428 vm_map_is_64bit(
21429 vm_map_t map)
21430 {
21431 return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21432 }
21433
21434 boolean_t
21435 vm_map_has_hard_pagezero(
21436 vm_map_t map,
21437 vm_map_offset_t pagezero_size)
21438 {
21439 /*
21440 * XXX FBDP
21441 * We should lock the VM map (for read) here but we can get away
21442 * with it for now because there can't really be any race condition:
21443 * the VM map's min_offset is changed only when the VM map is created
21444 * and when the zero page is established (when the binary gets loaded),
21445 * and this routine gets called only when the task terminates and the
21446 * VM map is being torn down, and when a new map is created via
21447 * load_machfile()/execve().
21448 */
21449 return map->min_offset >= pagezero_size;
21450 }
21451
21452 /*
21453 * Raise a VM map's maximum offset.
21454 */
21455 kern_return_t
21456 vm_map_raise_max_offset(
21457 vm_map_t map,
21458 vm_map_offset_t new_max_offset)
21459 {
21460 kern_return_t ret;
21461
21462 vm_map_lock(map);
21463 ret = KERN_INVALID_ADDRESS;
21464
21465 if (new_max_offset >= map->max_offset) {
21466 if (!vm_map_is_64bit(map)) {
21467 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21468 map->max_offset = new_max_offset;
21469 ret = KERN_SUCCESS;
21470 }
21471 } else {
21472 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21473 map->max_offset = new_max_offset;
21474 ret = KERN_SUCCESS;
21475 }
21476 }
21477 }
21478
21479 vm_map_unlock(map);
21480 return ret;
21481 }
21482
21483
21484 /*
21485 * Raise a VM map's minimum offset.
21486 * To strictly enforce "page zero" reservation.
21487 */
21488 kern_return_t
21489 vm_map_raise_min_offset(
21490 vm_map_t map,
21491 vm_map_offset_t new_min_offset)
21492 {
21493 vm_map_entry_t first_entry;
21494
21495 new_min_offset = vm_map_round_page(new_min_offset,
21496 VM_MAP_PAGE_MASK(map));
21497
21498 vm_map_lock(map);
21499
21500 if (new_min_offset < map->min_offset) {
21501 /*
21502 * Can't move min_offset backwards, as that would expose
21503 * a part of the address space that was previously, and for
21504 * possibly good reasons, inaccessible.
21505 */
21506 vm_map_unlock(map);
21507 return KERN_INVALID_ADDRESS;
21508 }
21509 if (new_min_offset >= map->max_offset) {
21510 /* can't go beyond the end of the address space */
21511 vm_map_unlock(map);
21512 return KERN_INVALID_ADDRESS;
21513 }
21514
21515 first_entry = vm_map_first_entry(map);
21516 if (first_entry != vm_map_to_entry(map) &&
21517 first_entry->vme_start < new_min_offset) {
21518 /*
21519 * Some memory was already allocated below the new
21520 * minimum offset. It's too late to change it now...
21521 */
21522 vm_map_unlock(map);
21523 return KERN_NO_SPACE;
21524 }
21525
21526 map->min_offset = new_min_offset;
21527
21528 if (map->holelistenabled) {
21529 assert(map->holes_list);
21530 map->holes_list->start = new_min_offset;
21531 assert(new_min_offset < map->holes_list->end);
21532 }
21533
21534 vm_map_unlock(map);
21535
21536 return KERN_SUCCESS;
21537 }
21538
21539 /*
21540 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21541 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit values maintained by the BSD
21542 * side of the kernel. The limits are checked on the Mach VM side, so we keep a copy so we don't
21543 * have to reach over to the BSD data structures.
21544 */
21545
21546 uint64_t vm_map_set_size_limit_count = 0;
21547 kern_return_t
21548 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21549 {
21550 kern_return_t kr;
21551
21552 vm_map_lock(map);
21553 if (new_size_limit < map->size) {
21554 /* new limit should not be lower than its current size */
21555 DTRACE_VM2(vm_map_set_size_limit_fail,
21556 vm_map_size_t, map->size,
21557 uint64_t, new_size_limit);
21558 kr = KERN_FAILURE;
21559 } else if (new_size_limit == map->size_limit) {
21560 /* no change */
21561 kr = KERN_SUCCESS;
21562 } else {
21563 /* set new limit */
21564 DTRACE_VM2(vm_map_set_size_limit,
21565 vm_map_size_t, map->size,
21566 uint64_t, new_size_limit);
21567 if (new_size_limit != RLIM_INFINITY) {
21568 vm_map_set_size_limit_count++;
21569 }
21570 map->size_limit = new_size_limit;
21571 kr = KERN_SUCCESS;
21572 }
21573 vm_map_unlock(map);
21574 return kr;
21575 }
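
/*
 * Hedged sketch (illustrative only) of how the BSD side might propagate
 * a new RLIMIT_AS value; the surrounding proc/limits locking and the
 * "limp" rlimit structure are assumptions here, not the actual call site:
 *
 *	if (vm_map_set_size_limit(current_map(),
 *	    limp->rlim_cur) != KERN_SUCCESS) {
 *	        return EINVAL;
 *	}
 */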
21576
21577 uint64_t vm_map_set_data_limit_count = 0;
21578 kern_return_t
21579 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21580 {
21581 kern_return_t kr;
21582
21583 vm_map_lock(map);
21584 if (new_data_limit < map->size) {
21585 /* new limit should not be lower than its current size */
21586 DTRACE_VM2(vm_map_set_data_limit_fail,
21587 vm_map_size_t, map->size,
21588 uint64_t, new_data_limit);
21589 kr = KERN_FAILURE;
21590 } else if (new_data_limit == map->data_limit) {
21591 /* no change */
21592 kr = KERN_SUCCESS;
21593 } else {
21594 /* set new limit */
21595 DTRACE_VM2(vm_map_set_data_limit,
21596 vm_map_size_t, map->size,
21597 uint64_t, new_data_limit);
21598 if (new_data_limit != RLIM_INFINITY) {
21599 vm_map_set_data_limit_count++;
21600 }
21601 map->data_limit = new_data_limit;
21602 kr = KERN_SUCCESS;
21603 }
21604 vm_map_unlock(map);
21605 return kr;
21606 }
21607
21608 void
21609 vm_map_set_user_wire_limit(vm_map_t map,
21610 vm_size_t limit)
21611 {
21612 vm_map_lock(map);
21613 map->user_wire_limit = limit;
21614 vm_map_unlock(map);
21615 }
21616
21617
21618 void
21619 vm_map_switch_protect(vm_map_t map,
21620 boolean_t val)
21621 {
21622 vm_map_lock(map);
21623 map->switch_protect = val;
21624 vm_map_unlock(map);
21625 }
21626
21627 extern int cs_process_enforcement_enable;
21628 boolean_t
21629 vm_map_cs_enforcement(
21630 vm_map_t map)
21631 {
21632 if (cs_process_enforcement_enable) {
21633 return TRUE;
21634 }
21635 return map->cs_enforcement;
21636 }
21637
21638 kern_return_t
21639 vm_map_cs_wx_enable(
21640 __unused vm_map_t map)
21641 {
21642 #if CODE_SIGNING_MONITOR
21643 kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21644 if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21645 return KERN_SUCCESS;
21646 }
21647 return ret;
21648 #else
21649 /* The VM manages WX memory entirely on its own */
21650 return KERN_SUCCESS;
21651 #endif
21652 }
21653
21654 kern_return_t
21655 vm_map_csm_allow_jit(
21656 __unused vm_map_t map)
21657 {
21658 #if CODE_SIGNING_MONITOR
21659 return csm_allow_jit_region(vm_map_pmap(map));
21660 #else
21661 /* No code signing monitor to enforce JIT policy */
21662 return KERN_SUCCESS;
21663 #endif
21664 }
21665
21666 void
21667 vm_map_cs_debugged_set(
21668 vm_map_t map,
21669 boolean_t val)
21670 {
21671 vm_map_lock(map);
21672 map->cs_debugged = val;
21673 vm_map_unlock(map);
21674 }
21675
21676 void
21677 vm_map_cs_enforcement_set(
21678 vm_map_t map,
21679 boolean_t val)
21680 {
21681 vm_map_lock(map);
21682 map->cs_enforcement = val;
21683 pmap_set_vm_map_cs_enforced(map->pmap, val);
21684 vm_map_unlock(map);
21685 }
21686
21687 /*
21688 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21689 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21690 * bump both counters.
21691 */
21692 void
21693 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21694 {
21695 pmap_t pmap = vm_map_pmap(map);
21696
21697 ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21698 ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21699 }
21700
21701 void
21702 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21703 {
21704 pmap_t pmap = vm_map_pmap(map);
21705
21706 ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21707 ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21708 }
21709
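/*
 * Illustrative pairing only (hypothetical IOKit-side flow, not part of this
 * file's build): every credit must eventually be balanced by a debit of the
 * same byte count, otherwise the iokit_mapped and phys_footprint ledgers drift.
 */
#if 0
	vm_map_iokit_mapped_region(map, bytes);     /* credits iokit_mapped + phys_footprint */
	/* ... the IOKit mapping is live ... */
	vm_map_iokit_unmapped_region(map, bytes);   /* debits the same two ledgers */
#endif
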
21710 /* Add (generate) code signature for memory range */
21711 #if CONFIG_DYNAMIC_CODE_SIGNING
21712 kern_return_t
21713 vm_map_sign(vm_map_t map,
21714 vm_map_offset_t start,
21715 vm_map_offset_t end)
21716 {
21717 vm_map_entry_t entry;
21718 vm_page_t m;
21719 vm_object_t object;
21720
21721 /*
21722 * Vet all the input parameters and current type and state of the
21723 * underlying object. Return with an error if anything is amiss.
21724 */
21725 if (map == VM_MAP_NULL) {
21726 return KERN_INVALID_ARGUMENT;
21727 }
21728
21729 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
21730 return KERN_INVALID_ADDRESS;
21731 }
21732
21733 vm_map_lock_read(map);
21734
21735 if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
21736 /*
21737 * Must pass a valid non-submap address.
21738 */
21739 vm_map_unlock_read(map);
21740 return KERN_INVALID_ADDRESS;
21741 }
21742
21743 if ((entry->vme_start > start) || (entry->vme_end < end)) {
21744 /*
21745 * Map entry doesn't cover the requested range. Not handling
21746 * this situation currently.
21747 */
21748 vm_map_unlock_read(map);
21749 return KERN_INVALID_ARGUMENT;
21750 }
21751
21752 object = VME_OBJECT(entry);
21753 if (object == VM_OBJECT_NULL) {
21754 /*
21755 * Object must already be present or we can't sign.
21756 */
21757 vm_map_unlock_read(map);
21758 return KERN_INVALID_ARGUMENT;
21759 }
21760
21761 vm_object_lock(object);
21762 vm_map_unlock_read(map);
21763
21764 while (start < end) {
21765 uint32_t refmod;
21766
21767 m = vm_page_lookup(object,
21768 start - entry->vme_start + VME_OFFSET(entry));
21769 if (m == VM_PAGE_NULL) {
21770 /* should we try to fault a page here? we can probably
21771 * demand it exists and is locked for this request */
21772 vm_object_unlock(object);
21773 return KERN_FAILURE;
21774 }
21775 /* deal with special page status */
21776 if (m->vmp_busy ||
21777 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
21778 vm_object_unlock(object);
21779 return KERN_FAILURE;
21780 }
21781
21782 /* Page is OK... now "validate" it */
21783 /* This is the place where we'll call out to create a code
21784 * directory, later */
21785 /* XXX TODO4K: deal with 4k subpages individually? */
21786 m->vmp_cs_validated = VMP_CS_ALL_TRUE;
21787
21788 /* The page is now "clean" for codesigning purposes. That means
21789 * we don't consider it as modified (wpmapped) anymore. But
21790 * we'll disconnect the page so we note any future modification
21791 * attempts. */
21792 m->vmp_wpmapped = FALSE;
21793 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
21794
21795 /* Pull the dirty status from the pmap, since we cleared the
21796 * wpmapped bit */
21797 if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
21798 SET_PAGE_DIRTY(m, FALSE);
21799 }
21800
21801 /* On to the next page */
21802 start += PAGE_SIZE;
21803 }
21804 vm_object_unlock(object);
21805
21806 return KERN_SUCCESS;
21807 }
21808 #endif
21809
21810 kern_return_t
21811 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
21812 {
21813 vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
21814 vm_map_entry_t next_entry;
21815 kern_return_t kr = KERN_SUCCESS;
21816 VM_MAP_ZAP_DECLARE(zap_list);
21817
21818 vm_map_lock(map);
21819
21820 for (entry = vm_map_first_entry(map);
21821 entry != vm_map_to_entry(map);
21822 entry = next_entry) {
21823 next_entry = entry->vme_next;
21824
21825 if (!entry->is_sub_map &&
21826 VME_OBJECT(entry) &&
21827 (VME_OBJECT(entry)->internal == TRUE) &&
21828 (os_ref_get_count_raw(&VME_OBJECT(entry)->ref_count) == 1)) {
21829 *reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
21830 *reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
21831
21832 (void)vm_map_delete(map, entry->vme_start,
21833 entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
21834 KMEM_GUARD_NONE, &zap_list);
21835 }
21836 }
21837
21838 vm_map_unlock(map);
21839
21840 vm_map_zap_dispose(&zap_list);
21841
21842 return kr;
21843 }
21844
21845
21846 #if DEVELOPMENT || DEBUG
21847
21848 int
21849 vm_map_disconnect_page_mappings(
21850 vm_map_t map,
21851 boolean_t do_unnest)
21852 {
21853 vm_map_entry_t entry;
21854 ledger_amount_t byte_count = 0;
21855
21856 if (do_unnest == TRUE) {
21857 #ifndef NO_NESTED_PMAP
21858 vm_map_lock(map);
21859
21860 for (entry = vm_map_first_entry(map);
21861 entry != vm_map_to_entry(map);
21862 entry = entry->vme_next) {
21863 if (entry->is_sub_map && entry->use_pmap) {
21864 /*
21865 * Make sure the range between the start of this entry and
21866 * the end of this entry is no longer nested, so that
21867 * we will only remove mappings from the pmap in use by this
21868 * task
21869 */
21870 vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
21871 }
21872 }
21873 vm_map_unlock(map);
21874 #endif
21875 }
21876 vm_map_lock_read(map);
21877
21878 ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
21879
21880 for (entry = vm_map_first_entry(map);
21881 entry != vm_map_to_entry(map);
21882 entry = entry->vme_next) {
21883 if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
21884 (VME_OBJECT(entry)->phys_contiguous))) {
21885 continue;
21886 }
21887 if (entry->is_sub_map) {
21888 assert(!entry->use_pmap);
21889 }
21890
21891 pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
21892 }
21893 vm_map_unlock_read(map);
21894
21895 return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
21896 }
21897
21898 kern_return_t
21899 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
21900 {
21901 vm_object_t object = NULL;
21902 vm_object_offset_t offset;
21903 vm_prot_t prot;
21904 boolean_t wired;
21905 vm_map_version_t version;
21906 vm_map_t real_map;
21907 int result = KERN_FAILURE;
21908
21909 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
21910 vm_map_lock(map);
21911
21912 result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
21913 OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
21914 NULL, &real_map, NULL);
21915 if (object == NULL) {
21916 result = KERN_MEMORY_ERROR;
21917 } else if (object->pager) {
21918 result = vm_compressor_pager_inject_error(object->pager,
21919 offset);
21920 } else {
21921 result = KERN_MEMORY_PRESENT;
21922 }
21923
21924 if (object != NULL) {
21925 vm_object_unlock(object);
21926 }
21927
21928 if (real_map != map) {
21929 vm_map_unlock(real_map);
21930 }
21931 vm_map_unlock(map);
21932
21933 return result;
21934 }
21935
21936 /* Iterate over map entries. Call the first block argument with the number of entries, then the second block for every entry.
21937 * returns: KERN_SUCCESS if the iteration completed OK,
21938 * an error code if a callback returned an error,
21939 * KERN_FAILURE if entries were added/removed during the iteration, so the number of entries
21940 * iterated differs from the number passed to the first callback
21941 */
21942 static kern_return_t
21943 vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries),
21944 kern_return_t (^entry_handler)(void* entry))
21945 {
21946 vm_map_lock_assert_held(map);
21947 int nentries = map->hdr.nentries;
21948 kern_return_t error = count_handler(nentries);
21949 if (error) {
21950 return error;
21951 }
21952
21953 /* iterate until we loop back to the map, see get_vmmap_entries() */
21954 vm_map_entry_t entry = vm_map_first_entry(map);
21955 int count = 0;
21956 while (entry != vm_map_to_entry(map)) {
21957 error = entry_handler(entry);
21958 if (error != KERN_SUCCESS) {
21959 return error;
21960 }
21961 entry = entry->vme_next;
21962 ++count;
21963 if (count > nentries) {
21964 /* nentries and entries iteration don't agree on how many entries there are, shouldn't really happen */
21965 return KERN_FAILURE;
21966 }
21967 }
21968 if (count < nentries) {
21969 return KERN_FAILURE;
21970 }
21971 return KERN_SUCCESS;
21972 }
21973
21974 kern_return_t
21975 vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries),
21976 kern_return_t (^entry_handler)(void* entry))
21977 {
21978 vm_map_lock_read(map);
21979 kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler);
21980 vm_map_unlock_read(map);
21981 return error;
21982 }
21983
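/*
 * Illustrative usage sketch (not compiled): the helper name, the wired-entry
 * counting and the 10000-entry cap below are hypothetical, but the block
 * signatures match vm_map_entries_foreach() above, which holds the map lock
 * for reading around both callbacks.
 */
#if 0
static kern_return_t
count_wired_entries_example(vm_map_t map, int *out_wired)
{
	__block int wired = 0;
	kern_return_t kr;

	kr = vm_map_entries_foreach(map,
	    ^(int nentries) {
		/* first callback: pre-flight check on the advertised entry count */
		return (nentries <= 10000) ? KERN_SUCCESS : KERN_FAILURE;
	},
	    ^(void *ventry) {
		/* second callback: runs once per entry */
		vm_map_entry_t entry = (vm_map_entry_t)ventry;
		if (entry->wired_count != 0) {
			wired++;
		}
		return KERN_SUCCESS;
	});
	if (kr == KERN_SUCCESS) {
		*out_wired = wired;
	}
	return kr;
}
#endif
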
21984 /*
21985 * Dump info about the entry into the given buffer.
21986 * Returns KERN_SUCCESS on success, or an error (e.g. KERN_NO_SPACE) if there was not enough space in the given buffer.
21987 * Argument "size": on input, bytes free in the given buffer; on output, bytes written.
21988 */
21989 kern_return_t
21990 vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size)
21991 {
21992 size_t insize = *size;
21993 kern_return_t kr;
21994 size_t offset = 0;
21995
21996 *size = 0;
21997 if (sizeof(struct vm_map_entry_info) > insize) {
21998 return KERN_NO_SPACE;
21999 }
22000
22001 vm_map_entry_t entry = (vm_map_entry_t)pentry;
22002 struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf;
22003 out_entry->vmei_start = entry->vme_start;
22004 out_entry->vmei_end = entry->vme_end;
22005 out_entry->vmei_alias = VME_ALIAS(entry);
22006 out_entry->vmei_offset = VME_OFFSET(entry);
22007 out_entry->vmei_is_sub_map = entry->is_sub_map;
22008 out_entry->vmei_protection = entry->protection;
22009 offset += sizeof(struct vm_map_entry_info);
22010
22011 out_entry->vmei_slot_mapping_count = 0;
22012 out_entry->vmei_is_compressor_pager = false;
22013 *size = offset;
22014 if (out_entry->vmei_is_sub_map) {
22015 return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet
22016 }
22017 /* have a vm_object? */
22018 vm_object_t object = VME_OBJECT(entry);
22019 if (object == VM_OBJECT_NULL || !object->internal) {
22020 return KERN_SUCCESS;
22021 }
22022 /* does the object have a pager? (nothing more to dump without one) */
22023 memory_object_t pager = object->pager;
22024 if (pager == MEMORY_OBJECT_NULL) {
22025 return KERN_SUCCESS;
22026 }
22027 bool is_compressor = false;
22028 unsigned int slot_mapping_count = 0;
22029 size_t pager_info_size = insize - offset;
22030 kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count);
22031 if (kr != KERN_SUCCESS) {
22032 /* didn't have enough space for everything we want to write, caller needs to retry */
22033 return kr;
22034 }
22035 offset += pager_info_size;
22036 /* if we got here, is_compressor should be true due to the object->internal check above, so this assignment
22037 * is just for sanity's sake */
22038 out_entry->vmei_is_compressor_pager = is_compressor;
22039 out_entry->vmei_slot_mapping_count = slot_mapping_count;
22040 *size = offset;
22041 return KERN_SUCCESS;
22042 }
22043
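/*
 * Illustrative caller sketch (not compiled): the helper name and the
 * buffer/cursor management below are hypothetical. "size" is in/out: free
 * bytes on entry, bytes written on return, so a caller advances its cursor
 * by the returned size and retries with a larger buffer on KERN_NO_SPACE.
 */
#if 0
static kern_return_t
dump_one_entry_example(void *entry, char *buf, size_t buf_capacity, size_t *cursor)
{
	size_t size = buf_capacity - *cursor;   /* bytes still free in "buf" */
	kern_return_t kr;

	kr = vm_map_dump_entry_and_compressor_pager(entry, buf + *cursor, &size);
	if (kr == KERN_SUCCESS) {
		*cursor += size;                /* "size" now holds bytes written */
	}
	/* on KERN_NO_SPACE, the caller is expected to grow "buf" and retry */
	return kr;
}
#endif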
22044
22045 #endif
22046
22047
22048 #if CONFIG_FREEZE
22049
22050
22051 extern struct freezer_context freezer_context_global;
22052 AbsoluteTime c_freezer_last_yield_ts = 0;
22053
22054 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
22055 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
22056
22057 kern_return_t
22058 vm_map_freeze(
22059 task_t task,
22060 unsigned int *purgeable_count,
22061 unsigned int *wired_count,
22062 unsigned int *clean_count,
22063 unsigned int *dirty_count,
22064 unsigned int dirty_budget,
22065 unsigned int *shared_count,
22066 int *freezer_error_code,
22067 boolean_t eval_only)
22068 {
22069 vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
22070 kern_return_t kr = KERN_SUCCESS;
22071 boolean_t evaluation_phase = TRUE;
22072 vm_object_t cur_shared_object = NULL;
22073 int cur_shared_obj_ref_cnt = 0;
22074 unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
22075
22076 *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
22077
22078 /*
22079 * We need the exclusive lock here so that we can
22080 * block any page faults or lookups while we are
22081 * in the middle of freezing this vm map.
22082 */
22083 vm_map_t map = task->map;
22084
22085 vm_map_lock(map);
22086
22087 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
22088
22089 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22090 if (vm_compressor_low_on_space()) {
22091 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22092 }
22093
22094 if (vm_swap_low_on_space()) {
22095 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22096 }
22097
22098 kr = KERN_NO_SPACE;
22099 goto done;
22100 }
22101
22102 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
22103 /*
22104 * In-memory compressor backing the freezer. No disk.
22105 * So no need to do the evaluation phase.
22106 */
22107 evaluation_phase = FALSE;
22108
22109 if (eval_only == TRUE) {
22110 /*
22111 * We don't support 'eval_only' mode
22112 * in this non-swap config.
22113 */
22114 *freezer_error_code = FREEZER_ERROR_GENERIC;
22115 kr = KERN_INVALID_ARGUMENT;
22116 goto done;
22117 }
22118
22119 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22120 clock_get_uptime(&c_freezer_last_yield_ts);
22121 }
22122 again:
22123
22124 for (entry2 = vm_map_first_entry(map);
22125 entry2 != vm_map_to_entry(map);
22126 entry2 = entry2->vme_next) {
22127 vm_object_t src_object;
22128
22129 if (entry2->is_sub_map) {
22130 continue;
22131 }
22132
22133 src_object = VME_OBJECT(entry2);
22134 if (!src_object ||
22135 src_object->phys_contiguous ||
22136 !src_object->internal) {
22137 continue;
22138 }
22139
22140 /* If eligible, scan the entry, moving eligible pages over to our parent object */
22141
22142 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
22143 /*
22144 * We skip purgeable objects during evaluation phase only.
22145 * If we decide to freeze this process, we'll explicitly
22146 * purge these objects before we go around again with
22147 * 'evaluation_phase' set to FALSE.
22148 */
22149
22150 if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
22151 /*
22152 * We want to purge objects that may not belong to this task but are mapped
22153 * in this task alone. Since we already purged this task's purgeable memory
22154 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
22155 * on this task's purgeable objects. Hence the check for only volatile objects.
22156 */
22157 if (evaluation_phase ||
22158 src_object->purgable != VM_PURGABLE_VOLATILE ||
22159 os_ref_get_count_raw(&src_object->ref_count) != 1) {
22160 continue;
22161 }
22162 vm_object_lock(src_object);
22163 if (src_object->purgable == VM_PURGABLE_VOLATILE &&
22164 os_ref_get_count_raw(&src_object->ref_count) == 1) {
22165 purgeable_q_t old_queue;
22166
22167 /* object should be on a purgeable queue */
22168 assert(src_object->objq.next != NULL &&
22169 src_object->objq.prev != NULL);
22170 /* move object from its volatile queue to the nonvolatile queue */
22171 old_queue = vm_purgeable_object_remove(src_object);
22172 assert(old_queue);
22173 if (src_object->purgeable_when_ripe) {
22174 /* remove a token from that volatile queue */
22175 vm_page_lock_queues();
22176 vm_purgeable_token_delete_first(old_queue);
22177 vm_page_unlock_queues();
22178 }
22179 /* purge the object */
22180 vm_object_purge(src_object, 0);
22181 }
22182 vm_object_unlock(src_object);
22183 continue;
22184 }
22185
22186 /*
22187 * Pages belonging to this object could be swapped to disk.
22188 * Make sure it's not a shared object because we could end
22189 * up just bringing it back in again.
22190 *
22191 * We try to optimize somewhat by checking for objects that are mapped
22192 * more than once within our own map. But we don't do full searches,
22193 * we just look at the entries following our current entry.
22194 */
22195
22196 if (os_ref_get_count_raw(&src_object->ref_count) > 1) {
22197 if (src_object != cur_shared_object) {
22198 obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22199 dirty_shared_count += obj_pages_snapshot;
22200
22201 cur_shared_object = src_object;
22202 cur_shared_obj_ref_cnt = 1;
22203 continue;
22204 } else {
22205 cur_shared_obj_ref_cnt++;
22206 if (os_ref_get_count_raw(&src_object->ref_count) == cur_shared_obj_ref_cnt) {
22207 /*
22208 * Fall through to below and treat this object as private.
22209 * So deduct its pages from our shared total and add it to the
22210 * private total.
22211 */
22212
22213 dirty_shared_count -= obj_pages_snapshot;
22214 dirty_private_count += obj_pages_snapshot;
22215 } else {
22216 continue;
22217 }
22218 }
22219 }
22220
22221
22222 if (os_ref_get_count_raw(&src_object->ref_count) == 1) {
22223 dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
22224 }
22225
22226 if (evaluation_phase == TRUE) {
22227 continue;
22228 }
22229 }
22230
22231 uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
22232 *wired_count += src_object->wired_page_count;
22233
22234 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
22235 if (vm_compressor_low_on_space()) {
22236 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
22237 }
22238
22239 if (vm_swap_low_on_space()) {
22240 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
22241 }
22242
22243 kr = KERN_NO_SPACE;
22244 break;
22245 }
22246 if (paged_out_count >= dirty_budget) {
22247 break;
22248 }
22249 dirty_budget -= paged_out_count;
22250 }
22251
22252 *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
22253 if (evaluation_phase) {
22254 unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
22255
22256 if (dirty_shared_count > shared_pages_threshold) {
22257 *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
22258 kr = KERN_FAILURE;
22259 goto done;
22260 }
22261
22262 if (dirty_shared_count &&
22263 ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
22264 *freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
22265 kr = KERN_FAILURE;
22266 goto done;
22267 }
22268
22269 evaluation_phase = FALSE;
22270 dirty_shared_count = dirty_private_count = 0;
22271
22272 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
22273 clock_get_uptime(&c_freezer_last_yield_ts);
22274
22275 if (eval_only) {
22276 kr = KERN_SUCCESS;
22277 goto done;
22278 }
22279
22280 vm_purgeable_purge_task_owned(task);
22281
22282 goto again;
22283 } else {
22284 kr = KERN_SUCCESS;
22285 }
22286
22287 done:
22288 vm_map_unlock(map);
22289
22290 if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
22291 vm_object_compressed_freezer_done();
22292 }
22293 return kr;
22294 }
22295
22296 #endif
22297
22298 /*
22299 * vm_map_entry_should_cow_for_true_share:
22300 *
22301 * Determines if the map entry should be clipped and setup for copy-on-write
22302 * to avoid applying "true_share" to a large VM object when only a subset is
22303 * targeted.
22304 *
22305 * For now, we target only the map entries created for the Objective C
22306 * Garbage Collector, which initially have the following properties:
22307 * - alias == VM_MEMORY_MALLOC
22308 * - wired_count == 0
22309 * - !needs_copy
22310 * and a VM object with:
22311 * - internal
22312 * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22313 * - !true_share
22314 * - vo_size == ANON_CHUNK_SIZE
22315 *
22316 * Only non-kernel map entries.
22317 */
22318 boolean_t
22319 vm_map_entry_should_cow_for_true_share(
22320 vm_map_entry_t entry)
22321 {
22322 vm_object_t object;
22323
22324 if (entry->is_sub_map) {
22325 /* entry does not point at a VM object */
22326 return FALSE;
22327 }
22328
22329 if (entry->needs_copy) {
22330 /* already set for copy_on_write: done! */
22331 return FALSE;
22332 }
22333
22334 if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22335 VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22336 /* not a malloc heap or Obj-C Garbage Collector heap */
22337 return FALSE;
22338 }
22339
22340 if (entry->wired_count) {
22341 /* wired: can't change the map entry... */
22342 vm_counters.should_cow_but_wired++;
22343 return FALSE;
22344 }
22345
22346 object = VME_OBJECT(entry);
22347
22348 if (object == VM_OBJECT_NULL) {
22349 /* no object yet... */
22350 return FALSE;
22351 }
22352
22353 if (!object->internal) {
22354 /* not an internal object */
22355 return FALSE;
22356 }
22357
22358 if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22359 /* not the default copy strategy */
22360 return FALSE;
22361 }
22362
22363 if (object->true_share) {
22364 /* already true_share: too late to avoid it */
22365 return FALSE;
22366 }
22367
22368 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22369 object->vo_size != ANON_CHUNK_SIZE) {
22370 /* ... not an object created for the ObjC Garbage Collector */
22371 return FALSE;
22372 }
22373
22374 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22375 object->vo_size != 2048 * 4096) {
22376 /* ... not a "MALLOC_SMALL" heap */
22377 return FALSE;
22378 }
22379
22380 /*
22381 * All the criteria match: we have a large object being targeted for "true_share".
22382 * To limit the adverse side-effects linked with "true_share", tell the caller to
22383 * try and avoid setting up the entire object for "true_share" by clipping the
22384 * targeted range and setting it up for copy-on-write.
22385 */
22386 return TRUE;
22387 }
22388
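/*
 * Rough caller sketch (not compiled; "target_start", "target_end" and
 * "target_size" are hypothetical bounds): when the function above returns
 * TRUE, the caller is expected to clip the entry to the targeted range and
 * mark it copy-on-write instead of setting "true_share" on the whole object.
 */
#if 0
	if (vm_map_entry_should_cow_for_true_share(entry) &&
	    VME_OBJECT(entry)->vo_size > target_size) {
		vm_map_clip_start(map, entry,
		    vm_map_trunc_page(target_start, VM_MAP_PAGE_MASK(map)));
		vm_map_clip_end(map, entry,
		    vm_map_round_page(target_end, VM_MAP_PAGE_MASK(map)));
		entry->needs_copy = TRUE;   /* copy-on-write instead of true_share */
	}
#endif
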
22389 uint64_t vm_map_range_overflows_count = 0;
22390 TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
22391 bool
22392 vm_map_range_overflows(
22393 vm_map_t map,
22394 vm_map_offset_t addr,
22395 vm_map_size_t size)
22396 {
22397 vm_map_offset_t start, end, sum;
22398 vm_map_offset_t pgmask;
22399
22400 if (size == 0) {
22401 /* empty range -> no overflow */
22402 return false;
22403 }
22404 pgmask = vm_map_page_mask(map);
22405 start = vm_map_trunc_page_mask(addr, pgmask);
22406 end = vm_map_round_page_mask(addr + size, pgmask);
22407 if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22408 vm_map_range_overflows_count++;
22409 if (vm_map_range_overflows_log) {
22410 printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22411 proc_selfpid(),
22412 proc_best_name(current_proc()),
22413 (uint64_t)addr,
22414 (uint64_t)size,
22415 (uint64_t)pgmask);
22416 }
22417 DTRACE_VM4(vm_map_range_overflows,
22418 vm_map_t, map,
22419 uint32_t, pgmask,
22420 uint64_t, (uint64_t)addr,
22421 uint64_t, (uint64_t)size);
22422 return true;
22423 }
22424 return false;
22425 }
22426
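/*
 * Worked examples (hypothetical values, assuming a 4K page mask):
 *   addr = 0xFFFFFFFFFFFFF000, size = 0x2000 -> addr + size wraps around,
 *     so os_add_overflow() fires and the range "overflows".
 *   addr = 0xFFFFFFFFFFFFE000, size = 0x1800 -> the sum itself fits, but
 *     rounding the end up to a page boundary wraps to 0, so end <= start
 *     and the range is also rejected.
 *   addr = anything, size = 0 -> empty range, never reported as an overflow.
 */
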
22427 vm_map_offset_t
22428 vm_map_round_page_mask(
22429 vm_map_offset_t offset,
22430 vm_map_offset_t mask)
22431 {
22432 return VM_MAP_ROUND_PAGE(offset, mask);
22433 }
22434
22435 vm_map_offset_t
22436 vm_map_trunc_page_mask(
22437 vm_map_offset_t offset,
22438 vm_map_offset_t mask)
22439 {
22440 return VM_MAP_TRUNC_PAGE(offset, mask);
22441 }
22442
22443 boolean_t
22444 vm_map_page_aligned(
22445 vm_map_offset_t offset,
22446 vm_map_offset_t mask)
22447 {
22448 return ((offset) & mask) == 0;
22449 }
22450
22451 int
22452 vm_map_page_shift(
22453 vm_map_t map)
22454 {
22455 return VM_MAP_PAGE_SHIFT(map);
22456 }
22457
22458 int
22459 vm_map_page_size(
22460 vm_map_t map)
22461 {
22462 return VM_MAP_PAGE_SIZE(map);
22463 }
22464
22465 vm_map_offset_t
22466 vm_map_page_mask(
22467 vm_map_t map)
22468 {
22469 return VM_MAP_PAGE_MASK(map);
22470 }
22471
22472 kern_return_t
22473 vm_map_set_page_shift(
22474 vm_map_t map,
22475 int pageshift)
22476 {
22477 if (map->hdr.nentries != 0) {
22478 /* too late to change page size */
22479 return KERN_FAILURE;
22480 }
22481
22482 map->hdr.page_shift = (uint16_t)pageshift;
22483
22484 return KERN_SUCCESS;
22485 }
22486
22487 kern_return_t
22488 vm_map_query_volatile(
22489 vm_map_t map,
22490 mach_vm_size_t *volatile_virtual_size_p,
22491 mach_vm_size_t *volatile_resident_size_p,
22492 mach_vm_size_t *volatile_compressed_size_p,
22493 mach_vm_size_t *volatile_pmap_size_p,
22494 mach_vm_size_t *volatile_compressed_pmap_size_p)
22495 {
22496 mach_vm_size_t volatile_virtual_size;
22497 mach_vm_size_t volatile_resident_count;
22498 mach_vm_size_t volatile_compressed_count;
22499 mach_vm_size_t volatile_pmap_count;
22500 mach_vm_size_t volatile_compressed_pmap_count;
22501 mach_vm_size_t resident_count;
22502 vm_map_entry_t entry;
22503 vm_object_t object;
22504
22505 /* map should be locked by caller */
22506
22507 volatile_virtual_size = 0;
22508 volatile_resident_count = 0;
22509 volatile_compressed_count = 0;
22510 volatile_pmap_count = 0;
22511 volatile_compressed_pmap_count = 0;
22512
22513 for (entry = vm_map_first_entry(map);
22514 entry != vm_map_to_entry(map);
22515 entry = entry->vme_next) {
22516 mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes;
22517
22518 if (entry->is_sub_map) {
22519 continue;
22520 }
22521 if (!(entry->protection & VM_PROT_WRITE)) {
22522 continue;
22523 }
22524 object = VME_OBJECT(entry);
22525 if (object == VM_OBJECT_NULL) {
22526 continue;
22527 }
22528 if (object->purgable != VM_PURGABLE_VOLATILE &&
22529 object->purgable != VM_PURGABLE_EMPTY) {
22530 continue;
22531 }
22532 if (VME_OFFSET(entry)) {
22533 /*
22534 * If the map entry has been split and the object now
22535 * appears several times in the VM map, we don't want
22536 * to count the object's resident_page_count more than
22537 * once. We count it only for the first one, starting
22538 * at offset 0 and ignore the other VM map entries.
22539 */
22540 continue;
22541 }
22542 resident_count = object->resident_page_count;
22543 if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
22544 resident_count = 0;
22545 } else {
22546 resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
22547 }
22548
22549 volatile_virtual_size += entry->vme_end - entry->vme_start;
22550 volatile_resident_count += resident_count;
22551 if (object->pager) {
22552 volatile_compressed_count +=
22553 vm_compressor_pager_get_count(object->pager);
22554 }
22555 pmap_compressed_bytes = 0;
22556 pmap_resident_bytes =
22557 pmap_query_resident(map->pmap,
22558 entry->vme_start,
22559 entry->vme_end,
22560 &pmap_compressed_bytes);
22561 volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
22562 volatile_compressed_pmap_count += (pmap_compressed_bytes
22563 / PAGE_SIZE);
22564 }
22565
22566 /* map is still locked on return */
22567
22568 *volatile_virtual_size_p = volatile_virtual_size;
22569 *volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
22570 *volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
22571 *volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
22572 *volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
22573
22574 return KERN_SUCCESS;
22575 }
22576
22577 void
22578 vm_map_sizes(vm_map_t map,
22579 vm_map_size_t * psize,
22580 vm_map_size_t * pfree,
22581 vm_map_size_t * plargest_free)
22582 {
22583 vm_map_entry_t entry;
22584 vm_map_offset_t prev;
22585 vm_map_size_t free, total_free, largest_free;
22586 boolean_t end;
22587
22588 if (!map) {
22589 *psize = *pfree = *plargest_free = 0;
22590 return;
22591 }
22592 total_free = largest_free = 0;
22593
22594 vm_map_lock_read(map);
22595 if (psize) {
22596 *psize = map->max_offset - map->min_offset;
22597 }
22598
22599 prev = map->min_offset;
22600 for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22601 end = (entry == vm_map_to_entry(map));
22602
22603 if (end) {
22604 free = entry->vme_end - prev;
22605 } else {
22606 free = entry->vme_start - prev;
22607 }
22608
22609 total_free += free;
22610 if (free > largest_free) {
22611 largest_free = free;
22612 }
22613
22614 if (end) {
22615 break;
22616 }
22617 prev = entry->vme_end;
22618 }
22619 vm_map_unlock_read(map);
22620 if (pfree) {
22621 *pfree = total_free;
22622 }
22623 if (plargest_free) {
22624 *plargest_free = largest_free;
22625 }
22626 }
22627
22628 #if VM_SCAN_FOR_SHADOW_CHAIN
22629 int
22630 vm_map_shadow_max(
22631 vm_map_t map)
22632 {
22633 int shadows, shadows_max;
22634 vm_map_entry_t entry;
22635 vm_object_t object, next_object;
22636
22637 if (map == NULL) {
22638 return 0;
22639 }
22640
22641 shadows_max = 0;
22642
22643 vm_map_lock_read(map);
22644
22645 for (entry = vm_map_first_entry(map);
22646 entry != vm_map_to_entry(map);
22647 entry = entry->vme_next) {
22648 if (entry->is_sub_map) {
22649 continue;
22650 }
22651 object = VME_OBJECT(entry);
22652 if (object == NULL) {
22653 continue;
22654 }
22655 vm_object_lock_shared(object);
22656 for (shadows = 0;
22657 object->shadow != NULL;
22658 shadows++, object = next_object) {
22659 next_object = object->shadow;
22660 vm_object_lock_shared(next_object);
22661 vm_object_unlock(object);
22662 }
22663 vm_object_unlock(object);
22664 if (shadows > shadows_max) {
22665 shadows_max = shadows;
22666 }
22667 }
22668
22669 vm_map_unlock_read(map);
22670
22671 return shadows_max;
22672 }
22673 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22674
22675 void
22676 vm_commit_pagezero_status(vm_map_t lmap)
22677 {
22678 pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
22679 }
22680
22681 #if __x86_64__
22682 void
22683 vm_map_set_high_start(
22684 vm_map_t map,
22685 vm_map_offset_t high_start)
22686 {
22687 map->vmmap_high_start = high_start;
22688 }
22689 #endif /* __x86_64__ */
22690
22691 #if CODE_SIGNING_MONITOR
22692
22693 kern_return_t
22694 vm_map_entry_cs_associate(
22695 vm_map_t map,
22696 vm_map_entry_t entry,
22697 vm_map_kernel_flags_t vmk_flags)
22698 {
22699 vm_object_t cs_object, cs_shadow, backing_object;
22700 vm_object_offset_t cs_offset, backing_offset;
22701 void *cs_blobs;
22702 struct vnode *cs_vnode;
22703 kern_return_t cs_ret;
22704
22705 if (map->pmap == NULL ||
22706 entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
22707 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
22708 VME_OBJECT(entry) == VM_OBJECT_NULL) {
22709 return KERN_SUCCESS;
22710 }
22711
22712 if (!(entry->protection & VM_PROT_EXECUTE)) {
22713 /*
22714 * This memory region is not executable, so the code-signing
22715 * monitor would usually not care about it...
22716 */
22717 if (vmk_flags.vmkf_remap_prot_copy &&
22718 (entry->max_protection & VM_PROT_EXECUTE)) {
22719 /*
22720 * ... except if the memory region is being remapped
22721 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
22722 * which is what a debugger or dtrace would be doing
22723 * to prepare to modify an executable page to insert
22724 * a breakpoint or activate a probe.
22725 * In that case, fall through so that we can mark
22726 * this region as being "debugged" and no longer
22727 * strictly code-signed.
22728 */
22729 } else {
22730 /*
22731 * Really not executable, so no need to tell the
22732 * code-signing monitor.
22733 */
22734 return KERN_SUCCESS;
22735 }
22736 }
22737
22738 vm_map_lock_assert_exclusive(map);
22739
22740 /*
22741 * Check for a debug association mapping before we check for used_for_jit. This
22742 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
22743 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
22744 * since they are mapped with RW or RX permissions, which the page table monitor
22745 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
22746 * they will be mapped as USER_EXEC, and that will cause another page table monitor
22747 * violation when those USER_EXEC pages are mapped as RW.
22748 *
22749 * Since these pages switch between RW and RX through mprotect, they mimic what
22750 * we expect a debugger to do. As the code signing monitor does not enforce mappings
22751 * on macOS systems, this works in our favor here and allows us to continue to
22752 * support these legacy-programmed applications without sacrificing security on
22753 * the page table or the code signing monitor. We don't need to explicitly check
22754 * for entry_for_jit here and the mapping permissions. If the initial mapping is
22755 * created with RX, then the application must map it as RW in order to first write
22756 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
22757 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
22758 * Similarly, if the mapping was created as RW, and then switched to RX,
22759 * vm_map_protect will again mark the entry as a copy, and both these cases
22760 * lead to this if-statement being entered.
22761 *
22762 * For more information: rdar://115313336.
22763 */
22764 if (vmk_flags.vmkf_remap_prot_copy) {
22765 cs_ret = csm_associate_debug_region(
22766 map->pmap,
22767 entry->vme_start,
22768 entry->vme_end - entry->vme_start);
22769
22770 /*
22771 * csm_associate_debug_region returns not supported when the code signing
22772 * monitor is disabled. This is intentional, since cs_ret is checked towards
22773 * the end of the function, and if it is not supported, then we still want the
22774 * VM to perform code-signing enforcement on this entry. That said, if we don't
22775 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
22776 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
22777 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
22778 * cases, which will cause a violation when attempted to be mapped as writable).
22779 */
22780 if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
22781 entry->vme_xnu_user_debug = TRUE;
22782 }
22783 #if DEVELOPMENT || DEBUG
22784 if (vm_log_xnu_user_debug) {
22785 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug=%d cs_ret %d\n",
22786 proc_selfpid(),
22787 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
22788 __FUNCTION__, __LINE__,
22789 map, entry,
22790 (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
22791 entry->vme_xnu_user_debug,
22792 cs_ret);
22793 }
22794 #endif /* DEVELOPMENT || DEBUG */
22795 goto done;
22796 }
22797
22798 if (entry->used_for_jit) {
22799 cs_ret = csm_associate_jit_region(
22800 map->pmap,
22801 entry->vme_start,
22802 entry->vme_end - entry->vme_start);
22803 goto done;
22804 }
22805
22806 cs_object = VME_OBJECT(entry);
22807 vm_object_lock_shared(cs_object);
22808 cs_offset = VME_OFFSET(entry);
22809
22810 /* find the VM object backed by the code-signed vnode */
22811 for (;;) {
22812 /* go to the bottom of cs_object's shadow chain */
22813 for (;
22814 cs_object->shadow != VM_OBJECT_NULL;
22815 cs_object = cs_shadow) {
22816 cs_shadow = cs_object->shadow;
22817 cs_offset += cs_object->vo_shadow_offset;
22818 vm_object_lock_shared(cs_shadow);
22819 vm_object_unlock(cs_object);
22820 }
22821 if (cs_object->internal ||
22822 cs_object->pager == MEMORY_OBJECT_NULL) {
22823 vm_object_unlock(cs_object);
22824 return KERN_SUCCESS;
22825 }
22826
22827 cs_offset += cs_object->paging_offset;
22828
22829 /*
22830 * cs_object could be backed by a:
22831 * vnode_pager
22832 * apple_protect_pager
22833 * shared_region_pager
22834 * fourk_pager (multiple backing objects -> fail?)
22835 * ask the pager if it has a backing VM object
22836 */
22837 if (!memory_object_backing_object(cs_object->pager,
22838 cs_offset,
22839 &backing_object,
22840 &backing_offset)) {
22841 /* no backing object: cs_object is it */
22842 break;
22843 }
22844
22845 /* look down the backing object's shadow chain */
22846 vm_object_lock_shared(backing_object);
22847 vm_object_unlock(cs_object);
22848 cs_object = backing_object;
22849 cs_offset = backing_offset;
22850 }
22851
22852 cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
22853 if (cs_vnode == NULL) {
22854 /* no vnode, no code signatures to associate */
22855 cs_ret = KERN_SUCCESS;
22856 } else {
22857 cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
22858 &cs_blobs);
22859 assert(cs_ret == KERN_SUCCESS);
22860 cs_ret = cs_associate_blob_with_mapping(map->pmap,
22861 entry->vme_start,
22862 (entry->vme_end - entry->vme_start),
22863 cs_offset,
22864 cs_blobs);
22865 }
22866 vm_object_unlock(cs_object);
22867 cs_object = VM_OBJECT_NULL;
22868
22869 done:
22870 if (cs_ret == KERN_SUCCESS) {
22871 DTRACE_VM2(vm_map_entry_cs_associate_success,
22872 vm_map_offset_t, entry->vme_start,
22873 vm_map_offset_t, entry->vme_end);
22874 if (vm_map_executable_immutable) {
22875 /*
22876 * Prevent this executable
22877 * mapping from being unmapped
22878 * or modified.
22879 */
22880 entry->vme_permanent = TRUE;
22881 }
22882 /*
22883 * pmap says it will validate the
22884 * code-signing validity of pages
22885 * faulted in via this mapping, so
22886 * this map entry should be marked so
22887 * that vm_fault() bypasses code-signing
22888 * validation for faults coming through
22889 * this mapping.
22890 */
22891 entry->csm_associated = TRUE;
22892 } else if (cs_ret == KERN_NOT_SUPPORTED) {
22893 /*
22894 * pmap won't check the code-signing
22895 * validity of pages faulted in via
22896 * this mapping, so VM should keep
22897 * doing it.
22898 */
22899 DTRACE_VM3(vm_map_entry_cs_associate_off,
22900 vm_map_offset_t, entry->vme_start,
22901 vm_map_offset_t, entry->vme_end,
22902 int, cs_ret);
22903 } else {
22904 /*
22905 * A real error: do not allow
22906 * execution in this mapping.
22907 */
22908 DTRACE_VM3(vm_map_entry_cs_associate_failure,
22909 vm_map_offset_t, entry->vme_start,
22910 vm_map_offset_t, entry->vme_end,
22911 int, cs_ret);
22912 if (vmk_flags.vmkf_overwrite_immutable) {
22913 /*
22914 * We can get here when we remap an apple_protect pager
22915 * on top of an already cs_associated executable mapping
22916 * with the same code signatures, so we don't want to
22917 * lose VM_PROT_EXECUTE in that case...
22918 */
22919 } else {
22920 entry->protection &= ~VM_PROT_ALLEXEC;
22921 entry->max_protection &= ~VM_PROT_ALLEXEC;
22922 }
22923 }
22924
22925 return cs_ret;
22926 }
22927
22928 #endif /* CODE_SIGNING_MONITOR */
22929
22930 inline bool
22931 vm_map_is_corpse_source(vm_map_t map)
22932 {
22933 bool status = false;
22934 if (map) {
22935 vm_map_lock_read(map);
22936 status = map->corpse_source;
22937 vm_map_unlock_read(map);
22938 }
22939 return status;
22940 }
22941
22942 inline void
22943 vm_map_set_corpse_source(vm_map_t map)
22944 {
22945 if (map) {
22946 vm_map_lock(map);
22947 map->corpse_source = true;
22948 vm_map_unlock(map);
22949 }
22950 }
22951
22952 inline void
22953 vm_map_unset_corpse_source(vm_map_t map)
22954 {
22955 if (map) {
22956 vm_map_lock(map);
22957 map->corpse_source = false;
22958 vm_map_unlock(map);
22959 }
22960 }
22961 /*
22962 * FORKED CORPSE FOOTPRINT
22963 *
22964 * A forked corpse gets a copy of the original VM map but its pmap is mostly
22965 * empty since it never ran and never got to fault in any pages.
22966 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
22967 * a forked corpse would therefore return very little information.
22968 *
22969 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
22970 * to vm_map_fork() to collect footprint information from the original VM map
22971 * and its pmap, and store it in the forked corpse's VM map. That information
22972 * is stored in place of the VM map's "hole list" since we'll never need to
22973 * lookup for holes in the corpse's map.
22974 *
22975 * The corpse's footprint info looks like this:
22976 *
22977 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
22978 * as follows:
22979 * +---------------------------------------+
22980 * header-> | cf_size |
22981 * +-------------------+-------------------+
22982 * | cf_last_region | cf_last_zeroes |
22983 * +-------------------+-------------------+
22984 * region1-> | cfr_vaddr |
22985 * +-------------------+-------------------+
22986 * | cfr_num_pages | d0 | d1 | d2 | d3 |
22987 * +---------------------------------------+
22988 * | d4 | d5 | ... |
22989 * +---------------------------------------+
22990 * | ... |
22991 * +-------------------+-------------------+
22992 * | dy | dz | na | na | cfr_vaddr... | <-region2
22993 * +-------------------+-------------------+
22994 * | cfr_vaddr (ctd) | cfr_num_pages |
22995 * +---------------------------------------+
22996 * | d0 | d1 ... |
22997 * +---------------------------------------+
22998 * ...
22999 * +---------------------------------------+
23000 * last region-> | cfr_vaddr |
23001 * +---------------------------------------+
23002 * + cfr_num_pages | d0 | d1 | d2 | d3 |
23003 * +---------------------------------------+
23004 * ...
23005 * +---------------------------------------+
23006 * | dx | dy | dz | na | na | na | na | na |
23007 * +---------------------------------------+
23008 *
23009 * where:
23010 * cf_size: total size of the buffer (rounded to page size)
23011 * cf_last_region: offset in the buffer of the last "region" sub-header
23012 * cf_last_zeroes: number of trailing "zero" dispositions at the end
23013 * of last region
23014 * cfr_vaddr: virtual address of the start of the covered "region"
23015 * cfr_num_pages: number of pages in the covered "region"
23016 * d*: disposition of the page at that virtual address
23017 * Regions in the buffer are word-aligned.
23018 *
23019 * We estimate the size of the buffer based on the number of memory regions
23020 * and the virtual size of the address space. While copying each memory region
23021 * during vm_map_fork(), we also collect the footprint info for that region
23022 * and store it in the buffer, packing it as much as possible (coalescing
23023 * contiguous memory regions to avoid having too many region headers and
23024 * avoiding long streaks of "zero" page dispositions by splitting footprint
23025 * "regions", so the number of regions in the footprint buffer might not match
23026 * the number of memory regions in the address space.
23027 *
23028 * We also have to copy the original task's "nonvolatile" ledgers since that's
23029 * part of the footprint and will need to be reported to any tool asking for
23030 * the footprint information of the forked corpse.
23031 */
23032
23033 uint64_t vm_map_corpse_footprint_count = 0;
23034 uint64_t vm_map_corpse_footprint_size_avg = 0;
23035 uint64_t vm_map_corpse_footprint_size_max = 0;
23036 uint64_t vm_map_corpse_footprint_full = 0;
23037 uint64_t vm_map_corpse_footprint_no_buf = 0;
23038
23039 struct vm_map_corpse_footprint_header {
23040 vm_size_t cf_size; /* allocated buffer size */
23041 uint32_t cf_last_region; /* offset of last region in buffer */
23042 union {
23043 uint32_t cfu_last_zeroes; /* during creation:
23044 * number of "zero" dispositions at
23045 * end of last region */
23046 uint32_t cfu_hint_region; /* during lookup:
23047 * offset of last looked up region */
23048 #define cf_last_zeroes cfu.cfu_last_zeroes
23049 #define cf_hint_region cfu.cfu_hint_region
23050 } cfu;
23051 };
23052 typedef uint8_t cf_disp_t;
23053 struct vm_map_corpse_footprint_region {
23054 vm_map_offset_t cfr_vaddr; /* region start virtual address */
23055 uint32_t cfr_num_pages; /* number of pages in this "region" */
23056 cf_disp_t cfr_disposition[0]; /* disposition of each page */
23057 } __attribute__((packed));
23058
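/*
 * Illustrative sketch (not compiled; the helper name is local to this
 * example) of the buffer-size estimate implied by the layout described
 * above. The same computation is done inline when
 * vm_map_corpse_footprint_collect() lazily allocates the buffer.
 */
#if 0
static vm_size_t
corpse_footprint_buf_size_estimate(vm_map_t old_map, int effective_page_size)
{
	vm_size_t buf_size;

	buf_size = sizeof(struct vm_map_corpse_footprint_header);
	/* worst case: one region header (plus word alignment) per map entry */
	buf_size += old_map->hdr.nentries *
	    (sizeof(struct vm_map_corpse_footprint_region) + 3);
	/* one disposition byte per page of the map's virtual size */
	buf_size += (old_map->size / effective_page_size) * sizeof(cf_disp_t);
	return round_page(buf_size);
}
#endif
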
23059 static cf_disp_t
23060 vm_page_disposition_to_cf_disp(
23061 int disposition)
23062 {
23063 assert(sizeof(cf_disp_t) == 1);
23064 /* relocate bits that don't fit in a "uint8_t" */
23065 if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
23066 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
23067 }
23068 /* cast gets rid of extra bits */
23069 return (cf_disp_t) disposition;
23070 }
23071
23072 static int
23073 vm_page_cf_disp_to_disposition(
23074 cf_disp_t cf_disp)
23075 {
23076 int disposition;
23077
23078 assert(sizeof(cf_disp_t) == 1);
23079 disposition = (int) cf_disp;
23080 /* move relocated bits back in place */
23081 if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
23082 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
23083 disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
23084 }
23085 return disposition;
23086 }
23087
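/*
 * Example round trip (sketch): a disposition with VM_PAGE_QUERY_PAGE_REUSABLE
 * set is stored in the 8-bit cf_disp_t with that bit relocated onto
 * VM_PAGE_QUERY_PAGE_FICTITIOUS, and vm_page_cf_disp_to_disposition() moves
 * it back and clears the FICTITIOUS bit. This relies on the (assumed) fact
 * that footprint queries never report a genuinely fictitious page.
 */
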
23088 /*
23089 * vm_map_corpse_footprint_new_region:
23090 * closes the current footprint "region" and creates a new one
23091 *
23092 * Returns NULL if there's not enough space in the buffer for a new region.
23093 */
23094 static struct vm_map_corpse_footprint_region *
23095 vm_map_corpse_footprint_new_region(
23096 struct vm_map_corpse_footprint_header *footprint_header)
23097 {
23098 uintptr_t footprint_edge;
23099 uint32_t new_region_offset;
23100 struct vm_map_corpse_footprint_region *footprint_region;
23101 struct vm_map_corpse_footprint_region *new_footprint_region;
23102
23103 footprint_edge = ((uintptr_t)footprint_header +
23104 footprint_header->cf_size);
23105 footprint_region = ((struct vm_map_corpse_footprint_region *)
23106 ((char *)footprint_header +
23107 footprint_header->cf_last_region));
23108 assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
23109 footprint_edge);
23110
23111 /* get rid of trailing zeroes in the last region */
23112 assert(footprint_region->cfr_num_pages >=
23113 footprint_header->cf_last_zeroes);
23114 footprint_region->cfr_num_pages -=
23115 footprint_header->cf_last_zeroes;
23116 footprint_header->cf_last_zeroes = 0;
23117
23118 /* reuse this region if it's now empty */
23119 if (footprint_region->cfr_num_pages == 0) {
23120 return footprint_region;
23121 }
23122
23123 /* compute offset of new region */
23124 new_region_offset = footprint_header->cf_last_region;
23125 new_region_offset += sizeof(*footprint_region);
23126 new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23127 new_region_offset = roundup(new_region_offset, sizeof(int));
23128
23129 /* check if we're going over the edge */
23130 if (((uintptr_t)footprint_header +
23131 new_region_offset +
23132 sizeof(*footprint_region)) >=
23133 footprint_edge) {
23134 /* over the edge: no new region */
23135 return NULL;
23136 }
23137
23138 /* adjust offset of last region in header */
23139 footprint_header->cf_last_region = new_region_offset;
23140
23141 new_footprint_region = (struct vm_map_corpse_footprint_region *)
23142 ((char *)footprint_header +
23143 footprint_header->cf_last_region);
23144 new_footprint_region->cfr_vaddr = 0;
23145 new_footprint_region->cfr_num_pages = 0;
23146 /* caller needs to initialize new region */
23147
23148 return new_footprint_region;
23149 }
23150
23151 /*
23152 * vm_map_corpse_footprint_collect:
23153 * collect footprint information for "old_entry" in "old_map" and
23154 * stores it in "new_map"'s vmmap_footprint_info.
23155 */
23156 kern_return_t
23157 vm_map_corpse_footprint_collect(
23158 vm_map_t old_map,
23159 vm_map_entry_t old_entry,
23160 vm_map_t new_map)
23161 {
23162 vm_map_offset_t va;
23163 kern_return_t kr;
23164 struct vm_map_corpse_footprint_header *footprint_header;
23165 struct vm_map_corpse_footprint_region *footprint_region;
23166 struct vm_map_corpse_footprint_region *new_footprint_region;
23167 cf_disp_t *next_disp_p;
23168 uintptr_t footprint_edge;
23169 uint32_t num_pages_tmp;
23170 int effective_page_size;
23171
23172 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
23173
23174 va = old_entry->vme_start;
23175
23176 vm_map_lock_assert_exclusive(old_map);
23177 vm_map_lock_assert_exclusive(new_map);
23178
23179 assert(new_map->has_corpse_footprint);
23180 assert(!old_map->has_corpse_footprint);
23181 if (!new_map->has_corpse_footprint ||
23182 old_map->has_corpse_footprint) {
23183 /*
23184 * This can only transfer footprint info from a
23185 * map with a live pmap to a map with a corpse footprint.
23186 */
23187 return KERN_NOT_SUPPORTED;
23188 }
23189
23190 if (new_map->vmmap_corpse_footprint == NULL) {
23191 vm_offset_t buf;
23192 vm_size_t buf_size;
23193
23194 buf = 0;
23195 buf_size = (sizeof(*footprint_header) +
23196 (old_map->hdr.nentries
23197 *
23198 (sizeof(*footprint_region) +
23199 +3)) /* potential alignment for each region */
23200 +
23201 ((old_map->size / effective_page_size)
23202 *
23203 sizeof(cf_disp_t))); /* disposition for each page */
23204 // printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
23205 buf_size = round_page(buf_size);
23206
23207 /* limit buffer to 1 page to validate overflow detection */
23208 // buf_size = PAGE_SIZE;
23209
23210 /* limit size to a somewhat sane amount */
23211 #if XNU_TARGET_OS_OSX
23212 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */
23213 #else /* XNU_TARGET_OS_OSX */
23214 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */
23215 #endif /* XNU_TARGET_OS_OSX */
23216 if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
23217 buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
23218 }
23219
23220 /*
23221 * Allocate the pageable buffer (with a trailing guard page).
23222 * It will be zero-filled on demand.
23223 */
23224 kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
23225 KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
23226 VM_KERN_MEMORY_DIAG);
23227 if (kr != KERN_SUCCESS) {
23228 vm_map_corpse_footprint_no_buf++;
23229 return kr;
23230 }
23231
23232 /* initialize header and 1st region */
23233 footprint_header = (struct vm_map_corpse_footprint_header *)buf;
23234 new_map->vmmap_corpse_footprint = footprint_header;
23235
23236 footprint_header->cf_size = buf_size;
23237 footprint_header->cf_last_region =
23238 sizeof(*footprint_header);
23239 footprint_header->cf_last_zeroes = 0;
23240
23241 footprint_region = (struct vm_map_corpse_footprint_region *)
23242 ((char *)footprint_header +
23243 footprint_header->cf_last_region);
23244 footprint_region->cfr_vaddr = 0;
23245 footprint_region->cfr_num_pages = 0;
23246 } else {
23247 /* retrieve header and last region */
23248 footprint_header = (struct vm_map_corpse_footprint_header *)
23249 new_map->vmmap_corpse_footprint;
23250 footprint_region = (struct vm_map_corpse_footprint_region *)
23251 ((char *)footprint_header +
23252 footprint_header->cf_last_region);
23253 }
23254 footprint_edge = ((uintptr_t)footprint_header +
23255 footprint_header->cf_size);
23256
23257 if ((footprint_region->cfr_vaddr +
23258 (((vm_map_offset_t)footprint_region->cfr_num_pages) *
23259 effective_page_size))
23260 != old_entry->vme_start) {
23261 uint64_t num_pages_delta, num_pages_delta_size;
23262 uint32_t region_offset_delta_size;
23263
23264 /*
23265 * Not the next contiguous virtual address:
23266 * start a new region or store "zero" dispositions for
23267 * the missing pages?
23268 */
23269 /* size of gap in actual page dispositions */
23270 num_pages_delta = ((old_entry->vme_start -
23271 footprint_region->cfr_vaddr) / effective_page_size)
23272 - footprint_region->cfr_num_pages;
23273 num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
23274 /* size of gap as a new footprint region header */
23275 region_offset_delta_size =
23276 (sizeof(*footprint_region) +
23277 roundup(((footprint_region->cfr_num_pages -
23278 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
23279 sizeof(int)) -
23280 ((footprint_region->cfr_num_pages -
23281 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
23282 // printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
23283 if (region_offset_delta_size < num_pages_delta_size ||
23284 os_add3_overflow(footprint_region->cfr_num_pages,
23285 (uint32_t) num_pages_delta,
23286 1,
23287 &num_pages_tmp)) {
23288 /*
23289 * Storing data for this gap would take more space
23290 * than inserting a new footprint region header:
23291 * let's start a new region and save space. If it's a
23292 * tie, let's avoid using a new region, since that
23293 * would require more region hops to find the right
23294 * range during lookups.
23295 *
23296 * If the current region's cfr_num_pages would overflow
23297 * if we added "zero" page dispositions for the gap,
23298 * no choice but to start a new region.
23299 */
23300 // printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
23301 new_footprint_region =
23302 vm_map_corpse_footprint_new_region(footprint_header);
23303 /* check that we're not going over the edge */
23304 if (new_footprint_region == NULL) {
23305 goto over_the_edge;
23306 }
23307 footprint_region = new_footprint_region;
23308 /* initialize new region as empty */
23309 footprint_region->cfr_vaddr = old_entry->vme_start;
23310 footprint_region->cfr_num_pages = 0;
23311 } else {
23312 /*
23313 * Store "zero" page dispositions for the missing
23314 * pages.
23315 */
23316 // printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
23317 for (; num_pages_delta > 0; num_pages_delta--) {
23318 next_disp_p = (cf_disp_t *)
23319 ((uintptr_t) footprint_region +
23320 sizeof(*footprint_region));
23321 next_disp_p += footprint_region->cfr_num_pages;
23322 /* check that we're not going over the edge */
23323 if ((uintptr_t)next_disp_p >= footprint_edge) {
23324 goto over_the_edge;
23325 }
23326 /* store "zero" disposition for this gap page */
23327 footprint_region->cfr_num_pages++;
23328 *next_disp_p = (cf_disp_t) 0;
23329 footprint_header->cf_last_zeroes++;
23330 }
23331 }
23332 }
23333
23334 for (va = old_entry->vme_start;
23335 va < old_entry->vme_end;
23336 va += effective_page_size) {
23337 int disposition;
23338 cf_disp_t cf_disp;
23339
23340 vm_map_footprint_query_page_info(old_map,
23341 old_entry,
23342 va,
23343 &disposition);
23344 cf_disp = vm_page_disposition_to_cf_disp(disposition);
23345
23346 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
23347
23348 if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
23349 /*
23350 * Ignore "zero" dispositions at start of
23351 * region: just move start of region.
23352 */
23353 footprint_region->cfr_vaddr += effective_page_size;
23354 continue;
23355 }
23356
23357 /* would region's cfr_num_pages overflow? */
23358 if (os_add_overflow(footprint_region->cfr_num_pages, 1,
23359 &num_pages_tmp)) {
23360 /* overflow: create a new region */
23361 new_footprint_region =
23362 vm_map_corpse_footprint_new_region(
23363 footprint_header);
23364 if (new_footprint_region == NULL) {
23365 goto over_the_edge;
23366 }
23367 footprint_region = new_footprint_region;
23368 footprint_region->cfr_vaddr = va;
23369 footprint_region->cfr_num_pages = 0;
23370 }
23371
23372 next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
23373 sizeof(*footprint_region));
23374 next_disp_p += footprint_region->cfr_num_pages;
23375 /* check that we're not going over the edge */
23376 if ((uintptr_t)next_disp_p >= footprint_edge) {
23377 goto over_the_edge;
23378 }
23379 /* store this disposition */
23380 *next_disp_p = cf_disp;
23381 footprint_region->cfr_num_pages++;
23382
23383 if (cf_disp != 0) {
23384 /* non-zero disp: break the current zero streak */
23385 footprint_header->cf_last_zeroes = 0;
23386 /* done */
23387 continue;
23388 }
23389
23390 /* zero disp: add to the current streak of zeroes */
23391 footprint_header->cf_last_zeroes++;
23392 if ((footprint_header->cf_last_zeroes +
23393 roundup(((footprint_region->cfr_num_pages -
23394 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23395 (sizeof(int) - 1),
23396 sizeof(int))) <
23397 (sizeof(*footprint_header))) {
23398 /*
23399 * There are not enough trailing "zero" dispositions
23400 * (+ the extra padding we would need for the previous
23401 * region); creating a new region would not save space
23402 * at this point, so let's keep this "zero" disposition
23403 * in this region and reconsider later.
23404 */
23405 continue;
23406 }
23407 /*
23408 * Create a new region to avoid having too many consecutive
23409 * "zero" dispositions.
23410 */
23411 new_footprint_region =
23412 vm_map_corpse_footprint_new_region(footprint_header);
23413 if (new_footprint_region == NULL) {
23414 goto over_the_edge;
23415 }
23416 footprint_region = new_footprint_region;
23417 /* initialize the new region as empty ... */
23418 footprint_region->cfr_num_pages = 0;
23419 /* ... and skip this "zero" disp */
23420 footprint_region->cfr_vaddr = va + effective_page_size;
23421 }
23422
23423 return KERN_SUCCESS;
23424
23425 over_the_edge:
23426 // printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23427 vm_map_corpse_footprint_full++;
23428 return KERN_RESOURCE_SHORTAGE;
23429 }
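
/*
 * Illustrative sketch (user-space model, not part of the kernel build):
 * the gap handling above picks the cheaper of two encodings when a VM
 * map entry does not start at the next contiguous virtual address:
 * either store explicit "zero" dispositions for the missing pages, or
 * start a new footprint region.  The struct and sizes below are
 * simplified stand-ins for the real footprint structures.
 */
#if 0 /* illustration only */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint8_t cf_disp_t;        /* stand-in for the real disposition type */

struct region_hdr {               /* stand-in for vm_map_corpse_footprint_region */
	uint64_t vaddr;
	uint32_t num_pages;
};

int
main(void)
{
	uint64_t gap_pages   = 3;  /* pages missing before the next entry */
	uint32_t num_pages   = 10; /* pages already stored in the region */
	uint32_t last_zeroes = 0;  /* current streak of trailing zeroes */

	/* cost of encoding the gap as explicit "zero" dispositions */
	uint64_t zero_cost = gap_pages * sizeof(cf_disp_t);

	/*
	 * cost of starting a new region: its header, minus the padding the
	 * previous region would need anyway to realign to an "int" boundary
	 */
	size_t used   = (num_pages - last_zeroes) * sizeof(cf_disp_t);
	size_t padded = (used + sizeof(int) - 1) & ~(sizeof(int) - 1);
	size_t region_cost = sizeof(struct region_hdr) + (padded - used);

	printf("zero gap: %llu bytes, new region: %zu bytes -> %s\n",
	    (unsigned long long)zero_cost, region_cost,
	    region_cost < zero_cost ? "new region" : "zero gap");
	return 0;
}
#endif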
23430
23431 /*
23432 * vm_map_corpse_footprint_collect_done:
23433 * completes the footprint collection by getting rid of any remaining
23434 * trailing "zero" dispositions and trimming the unused part of the
23435 * kernel buffer
23436 */
23437 void
23438 vm_map_corpse_footprint_collect_done(
23439 vm_map_t new_map)
23440 {
23441 struct vm_map_corpse_footprint_header *footprint_header;
23442 struct vm_map_corpse_footprint_region *footprint_region;
23443 vm_size_t buf_size, actual_size;
23444 kern_return_t kr;
23445
23446 assert(new_map->has_corpse_footprint);
23447 if (!new_map->has_corpse_footprint ||
23448 new_map->vmmap_corpse_footprint == NULL) {
23449 return;
23450 }
23451
23452 footprint_header = (struct vm_map_corpse_footprint_header *)
23453 new_map->vmmap_corpse_footprint;
23454 buf_size = footprint_header->cf_size;
23455
23456 footprint_region = (struct vm_map_corpse_footprint_region *)
23457 ((char *)footprint_header +
23458 footprint_header->cf_last_region);
23459
23460 /* get rid of trailing zeroes in last region */
23461 assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
23462 footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
23463 footprint_header->cf_last_zeroes = 0;
23464
23465 actual_size = (vm_size_t)(footprint_header->cf_last_region +
23466 sizeof(*footprint_region) +
23467 (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
23468
23469 // printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
23470 vm_map_corpse_footprint_size_avg =
23471 (((vm_map_corpse_footprint_size_avg *
23472 vm_map_corpse_footprint_count) +
23473 actual_size) /
23474 (vm_map_corpse_footprint_count + 1));
23475 vm_map_corpse_footprint_count++;
23476 if (actual_size > vm_map_corpse_footprint_size_max) {
23477 vm_map_corpse_footprint_size_max = actual_size;
23478 }
23479
23480 actual_size = round_page(actual_size);
23481 if (buf_size > actual_size) {
23482 kr = vm_deallocate(kernel_map,
23483 vm_sanitize_wrap_addr((vm_address_t)footprint_header +
23484 actual_size + PAGE_SIZE), /* trailing guard page */
23485 vm_sanitize_wrap_size(buf_size - actual_size));
23486 assertf(kr == KERN_SUCCESS,
23487 "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23488 footprint_header,
23489 (uint64_t) buf_size,
23490 (uint64_t) actual_size,
23491 kr);
23492 kr = vm_protect(kernel_map,
23493 (vm_address_t)footprint_header + actual_size,
23494 PAGE_SIZE,
23495 FALSE, /* set_maximum */
23496 vm_sanitize_wrap_prot(VM_PROT_NONE));
23497 assertf(kr == KERN_SUCCESS,
23498 "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23499 footprint_header,
23500 (uint64_t) buf_size,
23501 (uint64_t) actual_size,
23502 kr);
23503 }
23504
23505 footprint_header->cf_size = actual_size;
23506 }
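
/*
 * Worked example (hypothetical numbers): suppose the collection buffer
 * was allocated as 8 pages of data plus a trailing guard page, and the
 * packed regions end up using just under 3 pages.  After rounding,
 * actual_size is 3 pages, so the code above keeps pages [0, 3), turns
 * page 3 into the new trailing guard page (VM_PROT_NONE), and
 * deallocates pages [4, 9), i.e. "buf_size - actual_size" bytes
 * starting at "footprint_header + actual_size + PAGE_SIZE".
 */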
23507
23508 /*
23509 * vm_map_corpse_footprint_query_page_info:
23510 * retrieves the disposition of the page at virtual address "vaddr"
23511 * in the forked corpse's VM map
23512 *
23513 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23514 */
23515 kern_return_t
23516 vm_map_corpse_footprint_query_page_info(
23517 vm_map_t map,
23518 vm_map_offset_t va,
23519 int *disposition_p)
23520 {
23521 struct vm_map_corpse_footprint_header *footprint_header;
23522 struct vm_map_corpse_footprint_region *footprint_region;
23523 uint32_t footprint_region_offset;
23524 vm_map_offset_t region_start, region_end;
23525 int disp_idx;
23526 kern_return_t kr;
23527 int effective_page_size;
23528 cf_disp_t cf_disp;
23529
23530 if (!map->has_corpse_footprint) {
23531 *disposition_p = 0;
23532 kr = KERN_INVALID_ARGUMENT;
23533 goto done;
23534 }
23535
23536 footprint_header = map->vmmap_corpse_footprint;
23537 if (footprint_header == NULL) {
23538 *disposition_p = 0;
23539 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23540 kr = KERN_INVALID_ARGUMENT;
23541 goto done;
23542 }
23543
23544 /* start looking at the hint ("cf_hint_region") */
23545 footprint_region_offset = footprint_header->cf_hint_region;
23546
23547 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
23548
23549 lookup_again:
23550 if (footprint_region_offset < sizeof(*footprint_header)) {
23551 /* hint too low: start from 1st region */
23552 footprint_region_offset = sizeof(*footprint_header);
23553 }
23554 if (footprint_region_offset > footprint_header->cf_last_region) {
23555 /* hint too high: re-start from 1st region */
23556 footprint_region_offset = sizeof(*footprint_header);
23557 }
23558 footprint_region = (struct vm_map_corpse_footprint_region *)
23559 ((char *)footprint_header + footprint_region_offset);
23560 region_start = footprint_region->cfr_vaddr;
23561 region_end = (region_start +
23562 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23563 effective_page_size));
23564 if (va < region_start &&
23565 footprint_region_offset != sizeof(*footprint_header)) {
23566 /* our range starts before the hint region */
23567
23568 /* reset the hint (in a racy way...) */
23569 footprint_header->cf_hint_region = sizeof(*footprint_header);
23570 /* lookup "va" again from 1st region */
23571 footprint_region_offset = sizeof(*footprint_header);
23572 goto lookup_again;
23573 }
23574
23575 while (va >= region_end) {
23576 if (footprint_region_offset >= footprint_header->cf_last_region) {
23577 break;
23578 }
23579 /* skip the region's header */
23580 footprint_region_offset += sizeof(*footprint_region);
23581 /* skip the region's page dispositions */
23582 footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23583 /* align to next word boundary */
23584 footprint_region_offset =
23585 roundup(footprint_region_offset,
23586 sizeof(int));
23587 footprint_region = (struct vm_map_corpse_footprint_region *)
23588 ((char *)footprint_header + footprint_region_offset);
23589 region_start = footprint_region->cfr_vaddr;
23590 region_end = (region_start +
23591 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23592 effective_page_size));
23593 }
23594 if (va < region_start || va >= region_end) {
23595 /* page not found */
23596 *disposition_p = 0;
23597 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23598 kr = KERN_SUCCESS;
23599 goto done;
23600 }
23601
23602 /* "va" found: set the lookup hint for next lookup (in a racy way...) */
23603 footprint_header->cf_hint_region = footprint_region_offset;
23604
23605 /* get page disposition for "va" in this region */
23606 disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
23607 cf_disp = footprint_region->cfr_disposition[disp_idx];
23608 *disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
23609 kr = KERN_SUCCESS;
23610 done:
23611 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23612 /* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
23613 DTRACE_VM4(footprint_query_page_info,
23614 vm_map_t, map,
23615 vm_map_offset_t, va,
23616 int, *disposition_p,
23617 kern_return_t, kr);
23618
23619 return kr;
23620 }
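
/*
 * Illustrative sketch (user-space model, not part of the kernel build):
 * the lookup above walks the packed footprint buffer region by region.
 * Each region is a small header followed by one cf_disp_t per page,
 * padded up to an "int" boundary so that the next header stays aligned.
 * A simplified walk over such a layout, ignoring the "cf_hint_region"
 * optimization, might look like this; the types are stand-ins for the
 * real footprint structures.
 */
#if 0 /* illustration only */
#include <stddef.h>
#include <stdint.h>

typedef uint8_t cf_disp_t;        /* stand-in for the real disposition type */

struct region {                   /* stand-in for vm_map_corpse_footprint_region */
	uint64_t vaddr;           /* first page covered by this region */
	uint32_t num_pages;       /* number of dispositions that follow */
	/* cf_disp_t dispositions[num_pages], padded to an int boundary */
};

/* returns the disposition stored for "va", or 0 if no region covers it */
static cf_disp_t
lookup(const char *buf, uint32_t first_off, uint32_t last_off,
    uint64_t va, uint64_t page_size)
{
	uint32_t off = first_off;

	for (;;) {
		const struct region *r = (const struct region *)(buf + off);
		uint64_t start = r->vaddr;
		uint64_t end = start + (uint64_t)r->num_pages * page_size;

		if (va >= start && va < end) {
			const cf_disp_t *disp = (const cf_disp_t *)(r + 1);
			return disp[(va - start) / page_size];
		}
		if (off >= last_off) {
			return 0;  /* past the last region: page unknown */
		}
		/* skip header, dispositions and padding to the next region */
		off += (uint32_t)(sizeof(*r) + r->num_pages * sizeof(cf_disp_t));
		off = (off + (uint32_t)sizeof(int) - 1) & ~((uint32_t)sizeof(int) - 1);
	}
}
#endif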
23621
23622 void
23623 vm_map_corpse_footprint_destroy(
23624 vm_map_t map)
23625 {
23626 if (map->has_corpse_footprint &&
23627 map->vmmap_corpse_footprint != 0) {
23628 struct vm_map_corpse_footprint_header *footprint_header;
23629 vm_size_t buf_size;
23630 kern_return_t kr;
23631
23632 footprint_header = map->vmmap_corpse_footprint;
23633 buf_size = footprint_header->cf_size;
23634 kr = vm_deallocate(kernel_map,
23635 vm_sanitize_wrap_addr((vm_offset_t) map->vmmap_corpse_footprint),
23636 vm_sanitize_wrap_size(buf_size + PAGE_SIZE)); /* trailing guard page */
23637 assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
23638 map->vmmap_corpse_footprint = 0;
23639 map->has_corpse_footprint = FALSE;
23640 }
23641 }
23642
23643 /*
23644 * vm_map_copy_footprint_ledgers:
23645 * copies any ledger that's relevant to the memory footprint of "old_task"
23646 * into the forked corpse's task ("new_task")
23647 */
23648 void
23649 vm_map_copy_footprint_ledgers(
23650 task_t old_task,
23651 task_t new_task)
23652 {
23653 vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23654 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23655 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23656 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23657 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23658 vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23659 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23660 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23661 vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23662 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23663 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23664 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23665 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23666 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23667 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23668 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23669 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23670 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23671 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23672 vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23673 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total);
23674 }
23675
23676 /*
23677 * vm_map_copy_ledger:
23678 * copy a single ledger from "old_task" to "new_task"
23679 */
23680 void
23681 vm_map_copy_ledger(
23682 task_t old_task,
23683 task_t new_task,
23684 int ledger_entry)
23685 {
23686 ledger_amount_t old_balance, new_balance, delta;
23687
23688 assert(new_task->map->has_corpse_footprint);
23689 if (!new_task->map->has_corpse_footprint) {
23690 return;
23691 }
23692
23693 /* turn off sanity checks for the ledger we're about to mess with */
23694 ledger_disable_panic_on_negative(new_task->ledger,
23695 ledger_entry);
23696
23697 /* adjust "new_task" to match "old_task" */
23698 ledger_get_balance(old_task->ledger,
23699 ledger_entry,
23700 &old_balance);
23701 ledger_get_balance(new_task->ledger,
23702 ledger_entry,
23703 &new_balance);
23704 if (new_balance == old_balance) {
23705 /* new == old: done */
23706 } else if (new_balance > old_balance) {
23707 /* new > old ==> new -= new - old */
23708 delta = new_balance - old_balance;
23709 ledger_debit(new_task->ledger,
23710 ledger_entry,
23711 delta);
23712 } else {
23713 /* new < old ==> new += old - new */
23714 delta = old_balance - new_balance;
23715 ledger_credit(new_task->ledger,
23716 ledger_entry,
23717 delta);
23718 }
23719 }
23720
23721 /*
23722 * vm_map_get_pmap:
23723 * returns the pmap associated with the vm_map
23724 */
23725 pmap_t
23726 vm_map_get_pmap(vm_map_t map)
23727 {
23728 return vm_map_pmap(map);
23729 }
23730
23731 ppnum_t
23732 vm_map_get_phys_page(
23733 vm_map_t map,
23734 vm_offset_t addr)
23735 {
23736 vm_object_offset_t offset;
23737 vm_object_t object;
23738 vm_map_offset_t map_offset;
23739 vm_map_entry_t entry;
23740 ppnum_t phys_page = 0;
23741
23742 map_offset = vm_map_trunc_page(addr, PAGE_MASK);
23743
23744 vm_map_lock(map);
23745 while (vm_map_lookup_entry(map, map_offset, &entry)) {
23746 if (entry->is_sub_map) {
23747 vm_map_t old_map;
23748 vm_map_lock(VME_SUBMAP(entry));
23749 old_map = map;
23750 map = VME_SUBMAP(entry);
23751 map_offset = (VME_OFFSET(entry) +
23752 (map_offset - entry->vme_start));
23753 vm_map_unlock(old_map);
23754 continue;
23755 }
23756 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
23757 vm_map_unlock(map);
23758 return (ppnum_t) 0;
23759 }
23760 if (VME_OBJECT(entry)->phys_contiguous) {
23761 /* These are not standard pageable memory mappings */
23762 /* If they are not present in the object they will */
23763 /* have to be picked up from the pager through the */
23764 /* fault mechanism. */
23765 if (VME_OBJECT(entry)->vo_shadow_offset == 0) {
23766 /* need to call vm_fault */
23767 vm_map_unlock(map);
23768 vm_fault(map, map_offset, VM_PROT_NONE,
23769 FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
23770 THREAD_UNINT, NULL, 0);
23771 vm_map_lock(map);
23772 continue;
23773 }
23774 offset = (VME_OFFSET(entry) +
23775 (map_offset - entry->vme_start));
23776 phys_page = (ppnum_t)
23777 ((VME_OBJECT(entry)->vo_shadow_offset
23778 + offset) >> PAGE_SHIFT);
23779 break;
23780 }
23781 offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start));
23782 object = VME_OBJECT(entry);
23783 vm_object_lock(object);
23784 while (TRUE) {
23785 vm_page_t dst_page = vm_page_lookup(object, offset);
23786 if (dst_page == VM_PAGE_NULL) {
23787 if (object->shadow) {
23788 vm_object_t old_object;
23789 vm_object_lock(object->shadow);
23790 old_object = object;
23791 offset = offset + object->vo_shadow_offset;
23792 object = object->shadow;
23793 vm_object_unlock(old_object);
23794 } else {
23795 vm_object_unlock(object);
23796 break;
23797 }
23798 } else {
23799 phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page));
23800 vm_object_unlock(object);
23801 break;
23802 }
23803 }
23804 break;
23805 }
23806
23807 vm_map_unlock(map);
23808 return phys_page;
23809 }
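
/*
 * Illustrative sketch (user-space model, not part of the kernel build):
 * the loop above resolves a page by walking the object's shadow chain,
 * adding each object's vo_shadow_offset to the lookup offset until a
 * resident page is found or the chain ends.  Stripped of locking and of
 * the real VM types, the walk reduces to the following; "object_t" and
 * "page_lookup" are simplified stand-ins.
 */
#if 0 /* illustration only */
#include <stddef.h>
#include <stdint.h>

typedef struct object {
	struct object *shadow;         /* backing object, or NULL */
	uint64_t      shadow_offset;   /* offset of this object within its shadow */
} object_t;

/* stand-in: would return the page at "off" in "o", or NULL if not resident */
extern void *page_lookup(object_t *o, uint64_t off);

static void *
resolve_page(object_t *o, uint64_t off)
{
	while (o != NULL) {
		void *page = page_lookup(o, off);
		if (page != NULL) {
			return page;             /* found a resident page */
		}
		off += o->shadow_offset;         /* translate into the backing object */
		o = o->shadow;
	}
	return NULL;                             /* not resident anywhere in the chain */
}
#endif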
23810
23811 #if CONFIG_MAP_RANGES
23812 static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
23813 static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)];
23814
23815 static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23816 static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23817
23818 /*
23819 * vm_map_range_map_init:
23820 * initializes the VM range ID map to enable index lookup
23821 * of user VM ranges based on VM tag from userspace.
23822 */
23823 static void
23824 vm_map_range_map_init(void)
23825 {
23826 /*
23827 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
23828 * - the former is malloc metadata which should be kept separate
23829 * - the latter has its own ranges
23830 */
23831 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
23832 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
23833 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
23834 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
23835 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
23836 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
23837 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
23838 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
23839 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
23840 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
23841 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
23842 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
23843 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
23844 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
23845 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
23846 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
23847 bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR);
23848 bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE);
23849 }
23850
23851 static struct mach_vm_range
23852 vm_map_range_random_uniform(
23853 vm_map_size_t req_size,
23854 vm_map_offset_t min_addr,
23855 vm_map_offset_t max_addr,
23856 vm_map_offset_t offmask)
23857 {
23858 vm_map_offset_t random_addr;
23859 struct mach_vm_range alloc;
23860
23861 req_size = (req_size + offmask) & ~offmask;
23862 min_addr = (min_addr + offmask) & ~offmask;
23863 max_addr = max_addr & ~offmask;
23864
23865 read_random(&random_addr, sizeof(random_addr));
23866 random_addr %= (max_addr - req_size - min_addr);
23867 random_addr &= ~offmask;
23868
23869 alloc.min_address = min_addr + random_addr;
23870 alloc.max_address = min_addr + random_addr + req_size;
23871 return alloc;
23872 }
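
/*
 * Worked example (hypothetical numbers): with a 32M - 1 offmask (16K-page
 * embedded devices), a request for a 10G range inside [min_addr, max_addr)
 * proceeds as:
 *
 *   req_size = roundup(10G, 32M)                      (already a multiple)
 *   min_addr = roundup(min_addr, 32M)
 *   max_addr = truncate(max_addr, 32M)
 *   offset   = random % (max_addr - req_size - min_addr), truncated to 32M
 *   range    = [min_addr + offset, min_addr + offset + req_size)
 *
 * so the returned range is 32M-aligned and always fits below max_addr.
 */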
23873
23874 static vm_map_offset_t
23875 vm_map_range_offmask(void)
23876 {
23877 uint32_t pte_depth;
23878
23879 /*
23880 * PTE optimizations
23881 *
23882 *
23883 * 16k pages systems
23884 * ~~~~~~~~~~~~~~~~~
23885 *
23886 * A single L1 (sub-)page covers the address space.
23887 * - L2 pages cover 64G,
23888 * - L3 pages cover 32M.
23889 *
23890 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
23891 * As a result, we really only need to align the ranges to 32M to avoid
23892 * partial L3 pages.
23893 *
23894 * On macOS, the usage of L2 pages will increase, so we will
23895 * want to align ranges to 64G in order to utilize them fully.
23896 *
23897 *
23898 * 4k pages systems
23899 * ~~~~~~~~~~~~~~~~
23900 *
23901 * A single L0 (sub-)page covers the address space.
23902 * - L1 pages cover 512G,
23903 * - L2 pages cover 1G,
23904 * - L3 pages cover 2M.
23905 *
23906 * The long tail of processes on a system will tend to have a VA usage
23907 * (ignoring the shared regions) in the 100s of MB order of magnitude.
23908 * This is achievable with a single L1 and a few L2s without
23909 * randomization.
23910 *
23911 * However once randomization is introduced, the system will immediately
23912 * need several L1s and many more L2s. As a result:
23913 *
23914 * - on embedded devices, the cost of these extra pages isn't
23915 * sustainable, and we just disable the feature entirely,
23916 *
23917 * - on macOS we align ranges to a 512G boundary so that the extra L1
23918 * pages can be used to their full potential.
23919 */
23920
23921 /*
23922 * Note: this function assumes _non-exotic mappings_,
23923 * which is why it uses the native kernel's PAGE_SHIFT.
23924 */
23925 #if XNU_PLATFORM_MacOSX
23926 pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
23927 #else /* !XNU_PLATFORM_MacOSX */
23928 pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
23929 #endif /* !XNU_PLATFORM_MacOSX */
23930
23931 if (pte_depth == 0) {
23932 return 0;
23933 }
23934
23935 return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
23936 }
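
/*
 * Worked example: with (PAGE_SHIFT - 3) index bits per translation table
 * level (8-byte PTEs), the formula above produces the boundaries the
 * comment describes:
 *
 *   16K pages (PAGE_SHIFT == 14, 11 bits per level):
 *     pte_depth == 1:  (1 << (11*1 + 14)) - 1  =  32M - 1   (embedded)
 *     pte_depth == 2:  (1 << (11*2 + 14)) - 1  =  64G - 1   (macOS)
 *
 *   4K pages (PAGE_SHIFT == 12, 9 bits per level):
 *     pte_depth == 3:  (1 << (9*3 + 12)) - 1   = 512G - 1   (macOS)
 *     pte_depth == 0:  ranges disabled                      (embedded)
 */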
23937
23938 /*
23939 * vm_map_range_configure:
23940 * configures the user vm_map ranges by increasing the maximum VA range of
23941 * the map and carving out a range at the end of VA space (searching backwards
23942 * in the newly expanded map).
23943 */
23944 kern_return_t
23945 vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va)
23946 {
23947 const vm_map_offset_t offmask = vm_map_range_offmask();
23948 struct mach_vm_range data_range;
23949 vm_map_offset_t default_end;
23950 kern_return_t kr;
23951
23952 if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
23953 /*
23954 * No point doing VM ranges in a 32-bit or exotic address space (or when offmask == 0).
23955 */
23956 return KERN_NOT_SUPPORTED;
23957 }
23958
23959 /* Should not be applying ranges to kernel map or kernel map submaps */
23960 assert(vm_map_pmap(map) != kernel_pmap);
23961
23962 #if XNU_PLATFORM_MacOSX
23963
23964 /*
23965 * on macOS, the address space is a massive 47 bits (128T),
23966 * with several carve outs that processes can't use:
23967 * - the shared region
23968 * - the commpage region
23969 * - the GPU carve out (if applicable)
23970 *
23971 * and when nano-malloc is in use it desires memory at the 96T mark.
23972 *
23973 * However, their location is architecture dependent:
23974 * - On intel, the shared region and commpage are
23975 * at the very end of the usable address space (above +127T),
23976 * and there is no GPU carve out, and pthread wants to place
23977 * threads at the 112T mark (0x70T).
23978 *
23979 * - On arm64, these are in the same spot as on embedded devices:
23980 * o shared region: [ 6G, 10G) [ will likely grow over time ]
23981 * o commpage region: [63G, 64G)
23982 * o GPU carve out: [64G, 448G)
23983 *
23984 * This is convenient because the mappings at the end of the address
23985 * space (when they exist) are made by the kernel.
23986 *
23987 * The policy is to allocate a random 1T range for the data heap
23988 * at the end of the address space, in the:
23989 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
23990 * - [0x61, 0x7f) range on Apple Silicon (to leave space for Nano malloc).
23991 */
23992
23993 /* see NANOZONE_SIGNATURE in libmalloc */
23994 #if __x86_64__
23995 default_end = 0x71ull << 40;
23996 #else
23997 default_end = 0x61ull << 40;
23998 #endif
23999 data_range = vm_map_range_random_uniform(1ull << 40,
24000 default_end, 0x7full << 40, offmask);
24001
24002 #else /* !XNU_PLATFORM_MacOSX */
24003
24004 /*
24005 * Embedded devices:
24006 *
24007 * The default VA Size scales with the device physical memory.
24008 *
24009 * Out of that:
24010 * - the "zero" page typically uses 4G + some slide
24011 * - the shared region uses SHARED_REGION_SIZE bytes (4G)
24012 *
24013 * Without the use of jumbo or any adjustment to the address space,
24014 * a default VM map typically looks like this:
24015 *
24016 * 0G -->╒════════════╕
24017 * │ pagezero │
24018 * │ + slide │
24019 * ~4G -->╞════════════╡<-- vm_map_min(map)
24020 * │ │
24021 * 6G -->├────────────┤
24022 * │ shared │
24023 * │ region │
24024 * 10G -->├────────────┤
24025 * │ │
24026 * max_va -->├────────────┤<-- vm_map_max(map)
24027 * │ │
24028 * ╎ jumbo ╎
24029 * ╎ ╎
24030 * │ │
24031 * 63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
24032 * │ commpage │
24033 * 64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
24034 * │ │
24035 * ╎ GPU ╎
24036 * ╎ carveout ╎
24037 * │ │
24038 * 448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
24039 * │ │
24040 * ╎ ╎
24041 * ╎ ╎
24042 * │ │
24043 * 512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
24044 *
24045 * When this drawing was made, "max_va" was smaller than
24046 * ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
24047 * 12G of address space for the zero-page, slide, files,
24048 * binaries, heap ...
24049 *
24050 * We will want to make a "heap/data" carve-out inside
24051 * the jumbo range of half of that usable space, assuming
24052 * that this is less than a fourth of the jumbo range.
24053 *
24054 * The assert below intends to catch when max_va grows
24055 * too large for this heuristic.
24056 */
24057
24058 vm_map_lock_read(map);
24059 default_end = vm_map_max(map);
24060 vm_map_unlock_read(map);
24061
24062 /*
24063 * Check that we're not already jumbo'd,
24064 * or our address space was somehow modified.
24065 *
24066 * If so we cannot guarantee that we can set up the ranges
24067 * safely without interfering with the existing map.
24068 */
24069 if (default_end > vm_compute_max_offset(true)) {
24070 return KERN_NO_SPACE;
24071 }
24072
24073 if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
24074 /*
24075 * an override boot-arg was set: disable user ranges
24076 *
24077 * XXX: this is problematic because it means these boot-args
24078 * no longer exercise the behavior that changing the value
24079 * of ARM64_MAX_OFFSET_DEVICE_* would have.
24080 */
24081 return KERN_NOT_SUPPORTED;
24082 }
24083
24084 /* expand the default VM space to 64GB */
24085 vm_map_set_jumbo(map);
24086
24087 assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
24088 data_range = vm_map_range_random_uniform(GiB(10),
24089 default_end + PAGE_SIZE, vm_map_max(map), offmask);
24090
24091 #endif /* !XNU_PLATFORM_MacOSX */
24092
24093 /*
24094 * Poke holes so that ASAN or people listing regions
24095 * do not think this space is free.
24096 */
24097
24098 if (default_end != data_range.min_address) {
24099 kr = vm_map_enter(map, &default_end,
24100 data_range.min_address - default_end,
24101 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24102 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24103 assert(kr == KERN_SUCCESS);
24104 }
24105
24106 if (data_range.max_address != vm_map_max(map)) {
24107 vm_map_entry_t entry;
24108 vm_size_t size;
24109
24110 /*
24111 * Extend the end of the hole to the next VM entry or the end of the map,
24112 * whichever comes first.
24113 */
24114 vm_map_lock_read(map);
24115 vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
24116 if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) {
24117 size = vm_map_max(map) - data_range.max_address;
24118 } else {
24119 size = entry->vme_start - data_range.max_address;
24120 }
24121 vm_map_unlock_read(map);
24122
24123 kr = vm_map_enter(map, &data_range.max_address, size,
24124 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
24125 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
24126 assert(kr == KERN_SUCCESS);
24127 }
24128
24129 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24130 if (needs_extra_jumbo_va) {
24131 /* This will grow the address space to MACH_VM_MAX_ADDRESS */
24132 vm_map_set_extra_jumbo(map);
24133 }
24134 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24135
24136 vm_map_lock(map);
24137 map->default_range.min_address = vm_map_min(map);
24138 map->default_range.max_address = default_end;
24139 map->data_range = data_range;
24140 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
24141 /* If process has "extra jumbo" entitlement, enable large file range */
24142 if (needs_extra_jumbo_va) {
24143 map->large_file_range = vm_map_range_random_uniform(TiB(1),
24144 MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask);
24145 }
24146 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
24147 map->uses_user_ranges = true;
24148 vm_map_unlock(map);
24149
24150 return KERN_SUCCESS;
24151 }
24152
24153 /*
24154 * vm_map_range_fork:
24155 * clones the array of ranges from old_map to new_map in support
24156 * of a VM map fork.
24157 */
24158 void
24159 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
24160 {
24161 if (!old_map->uses_user_ranges) {
24162 /* nothing to do */
24163 return;
24164 }
24165
24166 new_map->default_range = old_map->default_range;
24167 new_map->data_range = old_map->data_range;
24168
24169 if (old_map->extra_ranges_count) {
24170 vm_map_user_range_t otable, ntable;
24171 uint16_t count;
24172
24173 otable = old_map->extra_ranges;
24174 count = old_map->extra_ranges_count;
24175 ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
24176 Z_WAITOK | Z_ZERO | Z_NOFAIL);
24177 memcpy(ntable, otable,
24178 count * sizeof(struct vm_map_user_range));
24179
24180 new_map->extra_ranges_count = count;
24181 new_map->extra_ranges = ntable;
24182 }
24183
24184 new_map->uses_user_ranges = true;
24185 }
24186
24187 /*
24188 * vm_map_get_user_range:
24189 * copy the VM user range for the given VM map and range ID.
24190 */
24191 kern_return_t
24192 vm_map_get_user_range(
24193 vm_map_t map,
24194 vm_map_range_id_t range_id,
24195 mach_vm_range_t range)
24196 {
24197 if (map == NULL || !map->uses_user_ranges || range == NULL) {
24198 return KERN_INVALID_ARGUMENT;
24199 }
24200
24201 switch (range_id) {
24202 case UMEM_RANGE_ID_DEFAULT:
24203 *range = map->default_range;
24204 return KERN_SUCCESS;
24205
24206 case UMEM_RANGE_ID_HEAP:
24207 *range = map->data_range;
24208 return KERN_SUCCESS;
24209
24210 case UMEM_RANGE_ID_LARGE_FILE:
24211 /*
24212 * Because this function tells a user-space process about the user
24213 * ranges in its VM map, this case communicates whether the large file
24214 * range is in use. Note that this is different from how the large file
24215 * range ID is handled in `vm_map_get_range()`: there, we "resolve" the
24216 * VA policy and return either the large file range or data range,
24217 * depending on whether the large file range is enabled.
24218 */
24219 if (map->large_file_range.min_address != map->large_file_range.max_address) {
24220 /* large file range is configured and should be used */
24221 *range = map->large_file_range;
24222 } else {
24223 return KERN_INVALID_ARGUMENT;
24224 }
24225 return KERN_SUCCESS;
24226
24227 default:
24228 return KERN_INVALID_ARGUMENT;
24229 }
24230 }
24231
24232 static vm_map_range_id_t
24233 vm_map_user_range_resolve(
24234 vm_map_t map,
24235 mach_vm_address_t addr,
24236 mach_vm_size_t size,
24237 mach_vm_range_t range)
24238 {
24239 struct mach_vm_range tmp;
24240
24241 vm_map_lock_assert_held(map);
24242
24243 static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
24244 static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
24245
24246 if (mach_vm_range_contains(&map->default_range, addr, size)) {
24247 if (range) {
24248 *range = map->default_range;
24249 }
24250 return UMEM_RANGE_ID_DEFAULT;
24251 }
24252
24253 if (mach_vm_range_contains(&map->data_range, addr, size)) {
24254 if (range) {
24255 *range = map->data_range;
24256 }
24257 return UMEM_RANGE_ID_HEAP;
24258 }
24259
24260 if (mach_vm_range_contains(&map->large_file_range, addr, size)) {
24261 if (range) {
24262 *range = map->large_file_range;
24263 }
24264 return UMEM_RANGE_ID_LARGE_FILE;
24265 }
24266
24267 for (size_t i = 0; i < map->extra_ranges_count; i++) {
24268 vm_map_user_range_t r = &map->extra_ranges[i];
24269
24270 tmp.min_address = r->vmur_min_address;
24271 tmp.max_address = r->vmur_max_address;
24272
24273 if (mach_vm_range_contains(&tmp, addr, size)) {
24274 if (range) {
24275 *range = tmp;
24276 }
24277 return r->vmur_range_id;
24278 }
24279 }
24280
24281 if (range) {
24282 range->min_address = range->max_address = 0;
24283 }
24284 return UMEM_RANGE_ID_DEFAULT;
24285 }
24286 #endif /* CONFIG_MAP_RANGES */
24287
24288 void
24289 vm_map_kernel_flags_update_range_id(
24290 vm_map_kernel_flags_t *vmkf,
24291 vm_map_t map,
24292 __unused vm_map_size_t size)
24293 {
24294 if (map == kernel_map) {
24295 if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
24296 vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
24297 }
24298 #if CONFIG_MAP_RANGES
24299 } else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
24300 vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) {
24301 if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag)
24302 || size >= VM_LARGE_FILE_THRESHOLD) {
24303 /*
24304 * if the map doesn't have the large file range configured,
24305 * the range will get resolved to the heap range in `vm_map_get_range`
24306 */
24307 vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE;
24308 } else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
24309 vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
24310 }
24311 #endif /* CONFIG_MAP_RANGES */
24312 }
24313 }
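
/*
 * Example of the resulting policy (derived from the bitmaps set up in
 * vm_map_range_map_init() and the checks above): the large-file test
 * runs first, so a VM_MEMORY_IOSURFACE or VM_MEMORY_IOACCELERATOR tag,
 * or any allocation of at least VM_LARGE_FILE_THRESHOLD bytes, is
 * steered to UMEM_RANGE_ID_LARGE_FILE (falling back to the heap range
 * if that range is not configured); the remaining malloc-style tags,
 * e.g. VM_MEMORY_MALLOC_LARGE or VM_MEMORY_TCMALLOC, are steered to
 * UMEM_RANGE_ID_HEAP; everything else stays in UMEM_RANGE_ID_DEFAULT.
 */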
24314
24315 /*
24316 * vm_map_entry_has_device_pager:
24317 * Check if the vm map entry specified by the virtual address has a device pager.
24318 * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24319 */
24320 boolean_t
24321 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24322 {
24323 vm_map_entry_t entry;
24324 vm_object_t object;
24325 boolean_t result;
24326
24327 if (map == NULL) {
24328 return FALSE;
24329 }
24330
24331 vm_map_lock(map);
24332 while (TRUE) {
24333 if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24334 result = FALSE;
24335 break;
24336 }
24337 if (entry->is_sub_map) {
24338 // Check the submap
24339 vm_map_t submap = VME_SUBMAP(entry);
24340 assert(submap != NULL);
24341 vm_map_lock(submap);
24342 vm_map_unlock(map);
24343 map = submap;
24344 continue;
24345 }
24346 object = VME_OBJECT(entry);
24347 if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24348 result = TRUE;
24349 break;
24350 }
24351 result = FALSE;
24352 break;
24353 }
24354
24355 vm_map_unlock(map);
24356 return result;
24357 }
24358
24359
24360 #if MACH_ASSERT
24361
24362 extern int pmap_ledgers_panic;
24363 extern int pmap_ledgers_panic_leeway;
24364
24365 #define LEDGER_DRIFT(__LEDGER) \
24366 int __LEDGER##_over; \
24367 ledger_amount_t __LEDGER##_over_total; \
24368 ledger_amount_t __LEDGER##_over_max; \
24369 int __LEDGER##_under; \
24370 ledger_amount_t __LEDGER##_under_total; \
24371 ledger_amount_t __LEDGER##_under_max
24372
24373 struct {
24374 uint64_t num_pmaps_checked;
24375
24376 LEDGER_DRIFT(phys_footprint);
24377 LEDGER_DRIFT(internal);
24378 LEDGER_DRIFT(internal_compressed);
24379 LEDGER_DRIFT(external);
24380 LEDGER_DRIFT(reusable);
24381 LEDGER_DRIFT(iokit_mapped);
24382 LEDGER_DRIFT(alternate_accounting);
24383 LEDGER_DRIFT(alternate_accounting_compressed);
24384 LEDGER_DRIFT(page_table);
24385 LEDGER_DRIFT(purgeable_volatile);
24386 LEDGER_DRIFT(purgeable_nonvolatile);
24387 LEDGER_DRIFT(purgeable_volatile_compressed);
24388 LEDGER_DRIFT(purgeable_nonvolatile_compressed);
24389 LEDGER_DRIFT(tagged_nofootprint);
24390 LEDGER_DRIFT(tagged_footprint);
24391 LEDGER_DRIFT(tagged_nofootprint_compressed);
24392 LEDGER_DRIFT(tagged_footprint_compressed);
24393 LEDGER_DRIFT(network_volatile);
24394 LEDGER_DRIFT(network_nonvolatile);
24395 LEDGER_DRIFT(network_volatile_compressed);
24396 LEDGER_DRIFT(network_nonvolatile_compressed);
24397 LEDGER_DRIFT(media_nofootprint);
24398 LEDGER_DRIFT(media_footprint);
24399 LEDGER_DRIFT(media_nofootprint_compressed);
24400 LEDGER_DRIFT(media_footprint_compressed);
24401 LEDGER_DRIFT(graphics_nofootprint);
24402 LEDGER_DRIFT(graphics_footprint);
24403 LEDGER_DRIFT(graphics_nofootprint_compressed);
24404 LEDGER_DRIFT(graphics_footprint_compressed);
24405 LEDGER_DRIFT(neural_nofootprint);
24406 LEDGER_DRIFT(neural_footprint);
24407 LEDGER_DRIFT(neural_nofootprint_compressed);
24408 LEDGER_DRIFT(neural_footprint_compressed);
24409 LEDGER_DRIFT(neural_nofootprint_total);
24410 } pmap_ledgers_drift;
24411
24412 void
24413 vm_map_pmap_check_ledgers(
24414 pmap_t pmap,
24415 ledger_t ledger,
24416 int pid,
24417 char *procname)
24418 {
24419 ledger_amount_t bal;
24420 boolean_t do_panic;
24421
24422 do_panic = FALSE;
24423
24424 pmap_ledgers_drift.num_pmaps_checked++;
24425
24426 #define LEDGER_CHECK_BALANCE(__LEDGER) \
24427 MACRO_BEGIN \
24428 int panic_on_negative = TRUE; \
24429 ledger_get_balance(ledger, \
24430 task_ledgers.__LEDGER, \
24431 &bal); \
24432 ledger_get_panic_on_negative(ledger, \
24433 task_ledgers.__LEDGER, \
24434 &panic_on_negative); \
24435 if (bal != 0) { \
24436 if (panic_on_negative || \
24437 (pmap_ledgers_panic && \
24438 pmap_ledgers_panic_leeway > 0 && \
24439 (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \
24440 bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
24441 do_panic = TRUE; \
24442 } \
24443 printf("LEDGER BALANCE proc %d (%s) " \
24444 "\"%s\" = %lld\n", \
24445 pid, procname, #__LEDGER, bal); \
24446 if (bal > 0) { \
24447 pmap_ledgers_drift.__LEDGER##_over++; \
24448 pmap_ledgers_drift.__LEDGER##_over_total += bal; \
24449 if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
24450 pmap_ledgers_drift.__LEDGER##_over_max = bal; \
24451 } \
24452 } else if (bal < 0) { \
24453 pmap_ledgers_drift.__LEDGER##_under++; \
24454 pmap_ledgers_drift.__LEDGER##_under_total += bal; \
24455 if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
24456 pmap_ledgers_drift.__LEDGER##_under_max = bal; \
24457 } \
24458 } \
24459 } \
24460 MACRO_END
24461
24462 LEDGER_CHECK_BALANCE(phys_footprint);
24463 LEDGER_CHECK_BALANCE(internal);
24464 LEDGER_CHECK_BALANCE(internal_compressed);
24465 LEDGER_CHECK_BALANCE(external);
24466 LEDGER_CHECK_BALANCE(reusable);
24467 LEDGER_CHECK_BALANCE(iokit_mapped);
24468 LEDGER_CHECK_BALANCE(alternate_accounting);
24469 LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
24470 LEDGER_CHECK_BALANCE(page_table);
24471 LEDGER_CHECK_BALANCE(purgeable_volatile);
24472 LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
24473 LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
24474 LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
24475 LEDGER_CHECK_BALANCE(tagged_nofootprint);
24476 LEDGER_CHECK_BALANCE(tagged_footprint);
24477 LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
24478 LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
24479 LEDGER_CHECK_BALANCE(network_volatile);
24480 LEDGER_CHECK_BALANCE(network_nonvolatile);
24481 LEDGER_CHECK_BALANCE(network_volatile_compressed);
24482 LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
24483 LEDGER_CHECK_BALANCE(media_nofootprint);
24484 LEDGER_CHECK_BALANCE(media_footprint);
24485 LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
24486 LEDGER_CHECK_BALANCE(media_footprint_compressed);
24487 LEDGER_CHECK_BALANCE(graphics_nofootprint);
24488 LEDGER_CHECK_BALANCE(graphics_footprint);
24489 LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
24490 LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
24491 LEDGER_CHECK_BALANCE(neural_nofootprint);
24492 LEDGER_CHECK_BALANCE(neural_footprint);
24493 LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
24494 LEDGER_CHECK_BALANCE(neural_footprint_compressed);
24495 LEDGER_CHECK_BALANCE(neural_nofootprint_total);
24496
24497 if (do_panic) {
24498 if (pmap_ledgers_panic) {
24499 panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
24500 pmap, pid, procname);
24501 } else {
24502 printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
24503 pmap, pid, procname);
24504 }
24505 }
24506 }
24507
24508 void
24509 vm_map_pmap_set_process(
24510 vm_map_t map,
24511 int pid,
24512 char *procname)
24513 {
24514 pmap_set_process(vm_map_pmap(map), pid, procname);
24515 }
24516
24517 #endif /* MACH_ASSERT */
24518