1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_map.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Virtual memory mapping module.
64 */
65
66 #include <mach/vm_types.h>
67 #include <mach_assert.h>
68
69 #include <vm/vm_options.h>
70
71 #include <libkern/OSAtomic.h>
72
73 #include <mach/kern_return.h>
74 #include <mach/port.h>
75 #include <mach/vm_attributes.h>
76 #include <mach/vm_param.h>
77 #include <mach/vm_behavior.h>
78 #include <mach/vm_statistics.h>
79 #include <mach/memory_object.h>
80 #include <mach/mach_vm_server.h>
81 #include <machine/cpu_capabilities.h>
82 #include <mach/sdt.h>
83
84 #include <kern/assert.h>
85 #include <kern/backtrace.h>
86 #include <kern/counter.h>
87 #include <kern/exc_guard.h>
88 #include <kern/kalloc.h>
89 #include <kern/zalloc_internal.h>
90
91 #include <vm/cpm_internal.h>
92 #include <vm/memory_types.h>
93 #include <vm/vm_compressor_xnu.h>
94 #include <vm/vm_compressor_pager_internal.h>
95 #include <vm/vm_init_xnu.h>
96 #include <vm/vm_fault_internal.h>
97 #include <vm/vm_map_internal.h>
98 #include <vm/vm_object_internal.h>
99 #include <vm/vm_page_internal.h>
100 #include <vm/vm_pageout.h>
101 #include <vm/pmap.h>
102 #include <vm/vm_kern_internal.h>
103 #include <ipc/ipc_port.h>
104 #include <kern/sched_prim.h>
105 #include <kern/misc_protos.h>
106
107 #include <mach/vm_map_server.h>
108 #include <mach/mach_host_server.h>
109 #include <vm/vm_memtag.h>
110 #include <vm/vm_protos_internal.h>
111 #include <vm/vm_purgeable_internal.h>
112
113 #include <vm/vm_iokit.h>
114 #include <vm/vm_shared_region_internal.h>
115 #include <vm/vm_map_store_internal.h>
116 #include <vm/vm_memory_entry_xnu.h>
117 #include <vm/memory_object_internal.h>
118 #include <vm/vm_memory_entry.h>
119 #include <vm/vm_sanitize_internal.h>
120 #if DEVELOPMENT || DEBUG
121 #include <vm/vm_compressor_info.h>
122 #endif /* DEVELOPMENT || DEBUG */
123 #include <san/kasan.h>
124
125 #include <sys/resource.h>
126 #include <sys/random.h>
127 #include <sys/codesign.h>
128 #include <sys/code_signing.h>
129 #include <sys/mman.h>
130 #include <sys/reboot.h>
131 #include <sys/kdebug_triage.h>
132 #include <sys/reason.h>
133
134 #include <libkern/section_keywords.h>
135
#if DEVELOPMENT || DEBUG
extern int proc_selfcsflags(void);
int vm_log_xnu_user_debug = 0;     /* when set, log resets of vme_xnu_user_debug (see vm_map_entry_copy_csm_assoc) */
int panic_on_unsigned_execute = 0; /* debug tunable, disabled by default */
int panic_on_mlock_failure = 0;    /* debug tunable, disabled by default */
#endif /* DEVELOPMENT || DEBUG */
142
#if DEVELOPMENT || DEBUG
/*
 * debug4k: tunables controlling 4K-page ("fourk") debug tracing.
 * NOTE(review): exact filter semantics live in the DEBUG4K consumers,
 * not visible in this file — confirm against vm_options.h users.
 */
int debug4k_filter = 0;
char debug4k_proc_name[1024] = "";
int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT); /* all categories except fault */
int debug4k_panic_on_misaligned_sharing = 0;
/* printable names indexed by the __DEBUG4K_* category values */
const char *debug4k_category_name[] = {
	"error",        /* 0 */
	"life",         /* 1 */
	"load",         /* 2 */
	"fault",        /* 3 */
	"copy",         /* 4 */
	"share",        /* 5 */
	"adjust",       /* 6 */
	"pmap",         /* 7 */
	"mementry",     /* 8 */
	"iokit",        /* 9 */
	"upl",          /* 10 */
	"exc",          /* 11 */
	"vfs"           /* 12 */
};
#endif /* DEVELOPMENT || DEBUG */
int debug4k_no_cow_copyin = 0;
165
166
167 #if __arm64__
168 extern const int fourk_binary_compatibility_unsafe;
169 #endif /* __arm64__ */
170 extern int proc_selfpid(void);
171 extern char *proc_name_address(void *p);
172 extern const char *proc_best_name(struct proc *p);
173
174 #if VM_MAP_DEBUG_APPLE_PROTECT
175 int vm_map_debug_apple_protect = 0;
176 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
177 #if VM_MAP_DEBUG_FOURK
178 int vm_map_debug_fourk = 0;
179 #endif /* VM_MAP_DEBUG_FOURK */
180
181 #if DEBUG || DEVELOPMENT
182 static TUNABLE(bool, vm_map_executable_immutable,
183 "vm_map_executable_immutable", true);
184 #else
185 #define vm_map_executable_immutable true
186 #endif
187
188 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
189
190 extern u_int32_t random(void); /* from <libkern/libkern.h> */
191 /* Internal prototypes
192 */
193
/*
 * A "zap list": singly-linked list of map entries collected while a
 * map is being modified (see vm_map_delete()).  vmz_tail points at
 * the link slot of the last element so appends stay O(1).
 * NOTE(review): entries gathered here are presumably reclaimed after
 * the map lock is dropped — confirm against the zap-list consumers.
 */
typedef struct vm_map_zap {
	vm_map_entry_t  vmz_head;
	vm_map_entry_t *vmz_tail;
} *vm_map_zap_t;

/* Declare and initialize an empty zap list (tail points at the head slot). */
#define VM_MAP_ZAP_DECLARE(zap) \
	struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
201
202 extern kern_return_t vm_map_wire_external(
203 vm_map_t map,
204 vm_map_offset_ut start_u,
205 vm_map_offset_ut end_u,
206 vm_prot_ut prot_u,
207 boolean_t user_wire) __exported;
208
209 #if XNU_PLATFORM_MacOSX
210 extern /* exported via Private.<arch>.MacOSX.exports on macOS */
211 #else
212 static
213 #endif
214 kern_return_t vm_map_copyin_common(
215 vm_map_t src_map,
216 vm_map_address_ut src_addr,
217 vm_map_size_ut len,
218 boolean_t src_destroy,
219 boolean_t src_volatile,
220 vm_map_copy_t *copy_result, /* OUT */
221 boolean_t use_maxprot);
222
223 static vm_map_entry_t vm_map_entry_insert(
224 vm_map_t map,
225 vm_map_entry_t insp_entry,
226 vm_map_offset_t start,
227 vm_map_offset_t end,
228 vm_object_t object,
229 vm_object_offset_t offset,
230 vm_map_kernel_flags_t vmk_flags,
231 boolean_t needs_copy,
232 vm_prot_t cur_protection,
233 vm_prot_t max_protection,
234 vm_inherit_t inheritance,
235 boolean_t clear_map_aligned);
236
237 static void vm_map_simplify_range(
238 vm_map_t map,
239 vm_map_offset_t start,
240 vm_map_offset_t end); /* forward */
241
242 static boolean_t vm_map_range_check(
243 vm_map_t map,
244 vm_map_offset_t start,
245 vm_map_offset_t end,
246 vm_map_entry_t *entry);
247
248 static void vm_map_submap_pmap_clean(
249 vm_map_t map,
250 vm_map_offset_t start,
251 vm_map_offset_t end,
252 vm_map_t sub_map,
253 vm_map_offset_t offset);
254
255 static void vm_map_pmap_enter(
256 vm_map_t map,
257 vm_map_offset_t addr,
258 vm_map_offset_t end_addr,
259 vm_object_t object,
260 vm_object_offset_t offset,
261 vm_prot_t protection);
262
263 static void _vm_map_clip_end(
264 struct vm_map_header *map_header,
265 vm_map_entry_t entry,
266 vm_map_offset_t end);
267
268 static void _vm_map_clip_start(
269 struct vm_map_header *map_header,
270 vm_map_entry_t entry,
271 vm_map_offset_t start);
272
273 static kmem_return_t vm_map_delete(
274 vm_map_t map,
275 vm_map_offset_t start,
276 vm_map_offset_t end,
277 vmr_flags_t flags,
278 kmem_guard_t guard,
279 vm_map_zap_t zap);
280
281 static void vm_map_copy_insert(
282 vm_map_t map,
283 vm_map_entry_t after_where,
284 vm_map_copy_t copy);
285
286 static kern_return_t vm_map_copy_overwrite_unaligned(
287 vm_map_t dst_map,
288 vm_map_entry_t entry,
289 vm_map_copy_t copy,
290 vm_map_address_t start,
291 boolean_t discard_on_success);
292
293 static kern_return_t vm_map_copy_overwrite_aligned(
294 vm_map_t dst_map,
295 vm_map_entry_t tmp_entry,
296 vm_map_copy_t copy,
297 vm_map_offset_t start,
298 pmap_t pmap);
299
300 static kern_return_t vm_map_copyin_kernel_buffer(
301 vm_map_t src_map,
302 vm_map_address_t src_addr,
303 vm_map_size_t len,
304 boolean_t src_destroy,
305 vm_map_copy_t *copy_result); /* OUT */
306
307 static kern_return_t vm_map_copyout_kernel_buffer(
308 vm_map_t map,
309 vm_map_address_t *addr, /* IN/OUT */
310 vm_map_copy_t copy,
311 vm_map_size_t copy_size,
312 boolean_t overwrite,
313 boolean_t consume_on_success);
314
315 static void vm_map_fork_share(
316 vm_map_t old_map,
317 vm_map_entry_t old_entry,
318 vm_map_t new_map);
319
320 static boolean_t vm_map_fork_copy(
321 vm_map_t old_map,
322 vm_map_entry_t *old_entry_p,
323 vm_map_t new_map,
324 int vm_map_copyin_flags);
325
326 static kern_return_t vm_map_wire_nested(
327 vm_map_t map,
328 vm_map_offset_t start,
329 vm_map_offset_t end,
330 vm_prot_t caller_prot,
331 vm_tag_t tag,
332 boolean_t user_wire,
333 pmap_t map_pmap,
334 vm_map_offset_t pmap_addr,
335 ppnum_t *physpage_p);
336
337 static kern_return_t vm_map_unwire_nested(
338 vm_map_t map,
339 vm_map_offset_t start,
340 vm_map_offset_t end,
341 boolean_t user_wire,
342 pmap_t map_pmap,
343 vm_map_offset_t pmap_addr);
344
345 static kern_return_t vm_map_overwrite_submap_recurse(
346 vm_map_t dst_map,
347 vm_map_offset_t dst_addr,
348 vm_map_size_t dst_size);
349
350 static kern_return_t vm_map_copy_overwrite_nested(
351 vm_map_t dst_map,
352 vm_map_offset_t dst_addr,
353 vm_map_copy_t copy,
354 boolean_t interruptible,
355 pmap_t pmap,
356 boolean_t discard_on_success);
357
358 static kern_return_t vm_map_remap_extract(
359 vm_map_t map,
360 vm_map_offset_t addr,
361 vm_map_size_t size,
362 boolean_t copy,
363 vm_map_copy_t map_copy,
364 vm_prot_t *cur_protection,
365 vm_prot_t *max_protection,
366 vm_inherit_t inheritance,
367 vm_map_kernel_flags_t vmk_flags);
368
369 static void vm_map_region_look_for_page(
370 vm_map_t map,
371 vm_map_offset_t va,
372 vm_object_t object,
373 vm_object_offset_t offset,
374 int max_refcnt,
375 unsigned short depth,
376 vm_region_extended_info_t extended,
377 mach_msg_type_number_t count);
378
379 static boolean_t vm_map_region_has_obj_ref(
380 vm_map_entry_t entry,
381 vm_object_t object);
382
383
384 static kern_return_t vm_map_willneed(
385 vm_map_t map,
386 vm_map_offset_t start,
387 vm_map_offset_t end);
388
389 static kern_return_t vm_map_reuse_pages(
390 vm_map_t map,
391 vm_map_offset_t start,
392 vm_map_offset_t end);
393
394 static kern_return_t vm_map_reusable_pages(
395 vm_map_t map,
396 vm_map_offset_t start,
397 vm_map_offset_t end);
398
399 static kern_return_t vm_map_can_reuse(
400 vm_map_t map,
401 vm_map_offset_t start,
402 vm_map_offset_t end);
403
404 static kern_return_t vm_map_zero(
405 vm_map_t map,
406 vm_map_offset_t start,
407 vm_map_offset_t end);
408
409 static kern_return_t vm_map_random_address_for_size(
410 vm_map_t map,
411 vm_map_offset_t *address,
412 vm_map_size_t size,
413 vm_map_kernel_flags_t vmk_flags);
414
415
416 #if CONFIG_MAP_RANGES
417
418 static vm_map_range_id_t vm_map_user_range_resolve(
419 vm_map_t map,
420 mach_vm_address_t addr,
421 mach_vm_address_t size,
422 mach_vm_range_t range);
423
424 #endif /* CONFIG_MAP_RANGES */
425 #if MACH_ASSERT
426 static kern_return_t vm_map_pageout(
427 vm_map_t map,
428 vm_map_offset_t start,
429 vm_map_offset_t end);
430 #endif /* MACH_ASSERT */
431
432 kern_return_t vm_map_corpse_footprint_collect(
433 vm_map_t old_map,
434 vm_map_entry_t old_entry,
435 vm_map_t new_map);
436 void vm_map_corpse_footprint_collect_done(
437 vm_map_t new_map);
438 void vm_map_corpse_footprint_destroy(
439 vm_map_t map);
440 kern_return_t vm_map_corpse_footprint_query_page_info(
441 vm_map_t map,
442 vm_map_offset_t va,
443 int *disposition_p);
444 void vm_map_footprint_query_page_info(
445 vm_map_t map,
446 vm_map_entry_t map_entry,
447 vm_map_offset_t curr_s_offset,
448 int *disposition_p);
449
450 #if CONFIG_MAP_RANGES
451 static void vm_map_range_map_init(void);
452 #endif /* CONFIG_MAP_RANGES */
453
454 pid_t find_largest_process_vm_map_entries(void);
455
456 __attribute__((always_inline))
457 int
vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)458 vm_map_kernel_flags_vmflags(vm_map_kernel_flags_t vmk_flags)
459 {
460 int flags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;
461
462 /* in vmk flags the meaning of fixed/anywhere is inverted */
463 return flags ^ (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
464 }
465
466 __attribute__((always_inline, overloadable))
467 void
vm_map_kernel_flags_set_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags,vm_tag_t vm_tag)468 vm_map_kernel_flags_set_vmflags(
469 vm_map_kernel_flags_t *vmk_flags,
470 int vm_flags,
471 vm_tag_t vm_tag)
472 {
473 vm_flags ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
474 vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
475 vmk_flags->__vm_flags |= (vm_flags & VM_FLAGS_ANY_MASK);
476 vmk_flags->vm_tag = vm_tag;
477 }
478
479 __attribute__((always_inline, overloadable))
480 void
vm_map_kernel_flags_set_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags_and_tag)481 vm_map_kernel_flags_set_vmflags(
482 vm_map_kernel_flags_t *vmk_flags,
483 int vm_flags_and_tag)
484 {
485 vm_flags_and_tag ^= (VM_FLAGS_FIXED | VM_FLAGS_ANYWHERE);
486 vmk_flags->__vm_flags &= ~VM_FLAGS_ANY_MASK;
487 vmk_flags->__vm_flags |= (vm_flags_and_tag & VM_FLAGS_ANY_MASK);
488 VM_GET_FLAGS_ALIAS(vm_flags_and_tag, vmk_flags->vm_tag);
489 }
490
491 __attribute__((always_inline))
492 void
vm_map_kernel_flags_and_vmflags(vm_map_kernel_flags_t * vmk_flags,int vm_flags_mask)493 vm_map_kernel_flags_and_vmflags(
494 vm_map_kernel_flags_t *vmk_flags,
495 int vm_flags_mask)
496 {
497 /* this function doesn't handle the inverted FIXED/ANYWHERE */
498 assert(vm_flags_mask & VM_FLAGS_ANYWHERE);
499 vmk_flags->__vm_flags &= vm_flags_mask;
500 }
501
502 __attribute__((always_inline))
503 bool
vm_map_kernel_flags_check_vm_and_kflags(vm_map_kernel_flags_t vmk_flags,int vm_flags_mask)504 vm_map_kernel_flags_check_vm_and_kflags(
505 vm_map_kernel_flags_t vmk_flags,
506 int vm_flags_mask)
507 {
508 return (vmk_flags.__vm_flags & ~vm_flags_mask) == 0;
509 }
510
/*
 * vm_map_kernel_flags_check_vmflags:
 *	Returns true when the user-visible VM_FLAGS_* bits stored in
 *	vmk_flags are a subset of vm_flags_mask.
 *	On DEBUG/DEVELOPMENT builds this also verifies (at no runtime
 *	cost when correct) that each vmf_* bitfield lines up with its
 *	corresponding VM_FLAGS_* bit and that tag values fit.
 */
bool
vm_map_kernel_flags_check_vmflags(
	vm_map_kernel_flags_t vmk_flags,
	int vm_flags_mask)
{
	int vmflags = vmk_flags.__vm_flags & VM_FLAGS_ANY_MASK;

	/* Note: up to 16 still has good calling conventions */
	static_assert(sizeof(vm_map_kernel_flags_t) == 8);

#if DEBUG || DEVELOPMENT
	/*
	 * All of this compiles to nothing if all checks pass.
	 */
	/* set only `value`, clear the named field, and assert nothing is left */
#define check(field, value) ({ \
	vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE; \
	fl.__vm_flags = (value); \
	fl.field = 0; \
	assert(fl.__vm_flags == 0); \
})

	/* bits 0-7 */
	check(vmf_fixed, VM_FLAGS_ANYWHERE); // kind of a lie this is inverted
	check(vmf_purgeable, VM_FLAGS_PURGABLE);
	check(vmf_4gb_chunk, VM_FLAGS_4GB_CHUNK);
	check(vmf_random_addr, VM_FLAGS_RANDOM_ADDR);
	check(vmf_no_cache, VM_FLAGS_NO_CACHE);
	check(vmf_resilient_codesign, VM_FLAGS_RESILIENT_CODESIGN);
	check(vmf_resilient_media, VM_FLAGS_RESILIENT_MEDIA);
	check(vmf_permanent, VM_FLAGS_PERMANENT);

	/* bits 8-15 */
	check(vmf_tpro, VM_FLAGS_TPRO);
	check(vmf_overwrite, VM_FLAGS_OVERWRITE);

	/* bits 16-23 */
	check(vmf_superpage_size, VM_FLAGS_SUPERPAGE_MASK);
	check(vmf_return_data_addr, VM_FLAGS_RETURN_DATA_ADDR);
	check(vmf_return_4k_data_addr, VM_FLAGS_RETURN_4K_DATA_ADDR);

	{
		vm_map_kernel_flags_t fl = VM_MAP_KERNEL_FLAGS_NONE;

		/* check user tags will never clip */
		fl.vm_tag = VM_MEMORY_COUNT - 1;
		assert(fl.vm_tag == VM_MEMORY_COUNT - 1);

		/* check kernel tags will never clip */
		fl.vm_tag = VM_MAX_TAG_VALUE - 1;
		assert(fl.vm_tag == VM_MAX_TAG_VALUE - 1);
	}


#undef check
#endif /* DEBUG || DEVELOPMENT */

	return (vmflags & ~vm_flags_mask) == 0;
}
569
570 /*
571 * Macros to copy a vm_map_entry. We must be careful to correctly
572 * manage the wired page count. vm_map_entry_copy() creates a new
573 * map entry to the same memory - the wired count in the new entry
574 * must be set to zero. vm_map_entry_copy_full() creates a new
575 * entry that is identical to the old entry. This preserves the
576 * wire count; it's used for map splitting and zone changing in
577 * vm_map_copyout.
578 */
579
/*
 * vm_map_entry_copy_csm_assoc:
 *	Reset the code-signing-monitor association on a copied entry.
 *	With CODE_SIGNING_MONITOR the new entry must not inherit the
 *	association; without it, the bit should never be set at all.
 *	Also clears vme_xnu_user_debug on the copy.
 */
static inline void
vm_map_entry_copy_csm_assoc(
	vm_map_t map __unused,
	vm_map_entry_t new __unused,
	vm_map_entry_t old __unused)
{
#if CODE_SIGNING_MONITOR
	/* when code signing monitor is enabled, we want to reset on copy */
	new->csm_associated = FALSE;
#else
	/* when code signing monitor is not enabled, assert as a sanity check */
	assert(new->csm_associated == FALSE);
#endif
#if DEVELOPMENT || DEBUG
	/* optional tracing (vm_log_xnu_user_debug) of the reset below */
	if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
		printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] resetting vme_xnu_user_debug\n",
		    proc_selfpid(),
		    (get_bsdtask_info(current_task())
		    ? proc_name_address(get_bsdtask_info(current_task()))
		    : "?"),
		    __FUNCTION__, __LINE__,
		    map, new, new->vme_start, new->vme_end);
	}
#endif /* DEVELOPMENT || DEBUG */
	new->vme_xnu_user_debug = FALSE;
}
606
607 /*
608 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
609 * But for security reasons on some platforms, we don't want the
610 * new mapping to be "used for jit", so we reset the flag here.
611 */
612 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)613 vm_map_entry_copy_code_signing(
614 vm_map_t map,
615 vm_map_entry_t new,
616 vm_map_entry_t old __unused)
617 {
618 if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
619 assert(new->used_for_jit == old->used_for_jit);
620 } else {
621 if (old->used_for_jit) {
622 DTRACE_VM3(cs_wx,
623 uint64_t, new->vme_start,
624 uint64_t, new->vme_end,
625 vm_prot_t, new->protection);
626 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
627 proc_selfpid(),
628 (get_bsdtask_info(current_task())
629 ? proc_name_address(get_bsdtask_info(current_task()))
630 : "?"),
631 __FUNCTION__,
632 "removing execute access");
633 new->protection &= ~VM_PROT_EXECUTE;
634 new->max_protection &= ~VM_PROT_EXECUTE;
635 }
636 new->used_for_jit = FALSE;
637 }
638 }
639
/*
 * vm_map_entry_copy_full:
 *	Make NEW an exact copy of OLD (wired counts included) via
 *	structure assignment, first balancing the backtrace refs that
 *	the raw assignment would otherwise duplicate or leak.
 */
static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
#if MAP_ENTRY_CREATION_DEBUG
	/* drop new's ref; *new = *old below copies old's (now retained) ref */
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
#if VM_BTLOG_TAGS
	/* Discard the btref that might be in the new entry */
	if (new->vme_kernel_object) {
		btref_put(new->vme_tag_btref);
	}
	/* Retain the btref in the old entry to account for its copy */
	if (old->vme_kernel_object) {
		btref_retain(old->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
	*new = *old;
}
665
/*
 * vm_map_entry_copy:
 *	Copy OLD into NEW for a fresh mapping of the same memory:
 *	start from a full copy, then clear the per-mapping state
 *	(wiring counts, transition/wakeup flags, permanence,
 *	resilience, code-signing state) that must not be inherited.
 */
static inline void
vm_map_entry_copy(
	vm_map_t map,
	vm_map_entry_t new,
	vm_map_entry_t old)
{
	vm_map_entry_copy_full(new, old);

	new->is_shared = FALSE;
	new->needs_wakeup = FALSE;
	new->in_transition = FALSE;
	new->wired_count = 0;           /* new mapping starts unwired */
	new->user_wired_count = 0;
	new->vme_permanent = FALSE;
	vm_map_entry_copy_code_signing(map, new, old);
	vm_map_entry_copy_csm_assoc(map, new, old);
	if (new->iokit_acct) {
		/* IOKit accounting does not carry over; revert to pmap accounting */
		assertf(!new->use_pmap, "old %p new %p\n", old, new);
		new->iokit_acct = FALSE;
		new->use_pmap = TRUE;
	}
	new->vme_resilient_codesign = FALSE;
	new->vme_resilient_media = FALSE;
	new->vme_atomic = FALSE;
	new->vme_no_copy_on_read = FALSE;
}
692
693 /*
694 * Normal lock_read_to_write() returns FALSE/0 on failure.
695 * These functions evaluate to zero on success and non-zero value on failure.
696 */
697 __attribute__((always_inline))
698 int
vm_map_lock_read_to_write(vm_map_t map)699 vm_map_lock_read_to_write(vm_map_t map)
700 {
701 if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
702 DTRACE_VM(vm_map_lock_upgrade);
703 return 0;
704 }
705 return 1;
706 }
707
708 __attribute__((always_inline))
709 boolean_t
vm_map_try_lock(vm_map_t map)710 vm_map_try_lock(vm_map_t map)
711 {
712 if (lck_rw_try_lock_exclusive(&(map)->lock)) {
713 DTRACE_VM(vm_map_lock_w);
714 return TRUE;
715 }
716 return FALSE;
717 }
718
719 __attribute__((always_inline))
720 boolean_t
vm_map_try_lock_read(vm_map_t map)721 vm_map_try_lock_read(vm_map_t map)
722 {
723 if (lck_rw_try_lock_shared(&(map)->lock)) {
724 DTRACE_VM(vm_map_lock_r);
725 return TRUE;
726 }
727 return FALSE;
728 }
729
730 /*!
731 * @function kdp_vm_map_is_acquired_exclusive
732 *
733 * @abstract
734 * Checks if vm map is acquired exclusive.
735 *
736 * @discussion
737 * NOT SAFE: To be used only by kernel debugger.
738 *
739 * @param map map to check
740 *
741 * @returns TRUE if the map is acquired exclusively.
742 */
boolean_t
kdp_vm_map_is_acquired_exclusive(vm_map_t map)
{
	/* debugger-only peek at the rw-lock state (see discussion above) */
	return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
}
748
749 /*
750 * Routines to get the page size the caller should
751 * use while inspecting the target address space.
752 * Use the "_safely" variant if the caller is dealing with a user-provided
753 * array whose size depends on the page size, to avoid any overflow or
754 * underflow of a user-allocated buffer.
755 */
756 int
vm_self_region_page_shift_safely(vm_map_t target_map)757 vm_self_region_page_shift_safely(
758 vm_map_t target_map)
759 {
760 int effective_page_shift = 0;
761
762 if (PAGE_SIZE == (4096)) {
763 /* x86_64 and 4k watches: always use 4k */
764 return PAGE_SHIFT;
765 }
766 /* did caller provide an explicit page size for this thread to use? */
767 effective_page_shift = thread_self_region_page_shift();
768 if (effective_page_shift) {
769 /* use the explicitly-provided page size */
770 return effective_page_shift;
771 }
772 /* no explicit page size: use the caller's page size... */
773 effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
774 if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
775 /* page size match: safe to use */
776 return effective_page_shift;
777 }
778 /* page size mismatch */
779 return -1;
780 }
781 int
vm_self_region_page_shift(vm_map_t target_map)782 vm_self_region_page_shift(
783 vm_map_t target_map)
784 {
785 int effective_page_shift;
786
787 effective_page_shift = vm_self_region_page_shift_safely(target_map);
788 if (effective_page_shift == -1) {
789 /* no safe value but OK to guess for caller */
790 effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
791 VM_MAP_PAGE_SHIFT(target_map));
792 }
793 return effective_page_shift;
794 }
795
796
797 /*
798 * Decide if we want to allow processes to execute from their data or stack areas.
799 * override_nx() returns true if we do. Data/stack execution can be enabled independently
800 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
801 * or allow_stack_exec to enable data execution for that type of data area for that particular
802 * ABI (or both by or'ing the flags together). These are initialized in the architecture
803 * specific pmap files since the default behavior varies according to architecture. The
804 * main reason it varies is because of the need to provide binary compatibility with old
805 * applications that were written before these restrictions came into being. In the old
806 * days, an app could execute anything it could read, but this has slowly been tightened
807 * up over time. The default behavior is:
808 *
809 * 32-bit PPC apps may execute from both stack and data areas
 * 32-bit Intel apps may execute from data areas but not stack
811 * 64-bit PPC/Intel apps may not execute from either data or stack
812 *
813 * An application on any architecture may override these defaults by explicitly
814 * adding PROT_EXEC permission to the page in question with the mprotect(2)
815 * system call. This code here just determines what happens when an app tries to
816 * execute from a page that lacks execute permission.
817 *
818 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
819 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
820 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
821 * execution from data areas for a particular binary even if the arch normally permits it. As
822 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
823 * to support some complicated use cases, notably browsers with out-of-process plugins that
824 * are not all NX-safe.
825 */
826
827 extern int allow_data_exec, allow_stack_exec;
828
829 int
override_nx(vm_map_t map,uint32_t user_tag)830 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
831 {
832 int current_abi;
833
834 if (map->pmap == kernel_pmap) {
835 return FALSE;
836 }
837
838 /*
839 * Determine if the app is running in 32 or 64 bit mode.
840 */
841
842 if (vm_map_is_64bit(map)) {
843 current_abi = VM_ABI_64;
844 } else {
845 current_abi = VM_ABI_32;
846 }
847
848 /*
849 * Determine if we should allow the execution based on whether it's a
850 * stack or data area and the current architecture.
851 */
852
853 if (user_tag == VM_MEMORY_STACK) {
854 return allow_stack_exec & current_abi;
855 }
856
857 return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
858 }
859
860
861 /*
862 * Virtual memory maps provide for the mapping, protection,
863 * and sharing of virtual memory objects. In addition,
864 * this module provides for an efficient virtual copy of
865 * memory from one map to another.
866 *
867 * Synchronization is required prior to most operations.
868 *
869 * Maps consist of an ordered doubly-linked list of simple
870 * entries; a single hint is used to speed up lookups.
871 *
872 * Sharing maps have been deleted from this version of Mach.
873 * All shared objects are now mapped directly into the respective
874 * maps. This requires a change in the copy on write strategy;
875 * the asymmetric (delayed) strategy is used for shared temporary
876 * objects instead of the symmetric (shadow) strategy. All maps
877 * are now "top level" maps (either task map, kernel map or submap
878 * of the kernel map).
879 *
 * Since portions of maps are specified by start/end addresses,
881 * which may not align with existing map entries, all
882 * routines merely "clip" entries to these start/end values.
883 * [That is, an entry is split into two, bordering at a
884 * start or end value.] Note that these clippings may not
885 * always be necessary (as the two resulting entries are then
886 * not changed); however, the clipping is done for convenience.
887 * No attempt is currently made to "glue back together" two
888 * abutting entries.
889 *
890 * The symmetric (shadow) copy strategy implements virtual copy
891 * by copying VM object references from one map to
892 * another, and then marking both regions as copy-on-write.
893 * It is important to note that only one writeable reference
894 * to a VM object region exists in any map when this strategy
895 * is used -- this means that shadow object creation can be
 * delayed until a write operation occurs. The asymmetric (delayed)
897 * strategy allows multiple maps to have writeable references to
898 * the same region of a vm object, and hence cannot delay creating
899 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
900 * Copying of permanent objects is completely different; see
901 * vm_object_copy_strategically() in vm_object.c.
902 */
903
904 ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
905
906 #define VM_MAP_ZONE_NAME "maps"
907 #define VM_MAP_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
908
909 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
910 #define VM_MAP_ENTRY_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
911
912 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
913 #define VM_MAP_HOLES_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
914
915 /*
916 * Asserts that a vm_map_copy object is coming from the
917 * vm_map_copy_zone to ensure that it isn't a fake constructed
918 * anywhere else.
919 */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	/* rejects pointers not allocated from the vm_map_copy zone */
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}
925
926 /*
927 * vm_map_require:
928 *
929 * Ensures that the argument is memory allocated from the genuine
930 * vm map zone. (See zone_id_require_allow_foreign).
931 */
932 void
vm_map_require(vm_map_t map)933 vm_map_require(vm_map_t map)
934 {
935 zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
936 }
937
938 #define VM_MAP_EARLY_COUNT_MAX 16
939 static __startup_data vm_offset_t map_data;
940 static __startup_data vm_size_t map_data_size;
941 static __startup_data vm_offset_t kentry_data;
942 static __startup_data vm_size_t kentry_data_size;
943 static __startup_data vm_offset_t map_holes_data;
944 static __startup_data vm_size_t map_holes_data_size;
945 static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
946 static __startup_data uint32_t early_map_count;
947
948 #if XNU_TARGET_OS_OSX
949 #define NO_COALESCE_LIMIT ((1024 * 128) - 1)
950 #else /* XNU_TARGET_OS_OSX */
951 #define NO_COALESCE_LIMIT 0
952 #endif /* XNU_TARGET_OS_OSX */
953
954 /* Skip acquiring locks if we're in the midst of a kernel core dump */
955 unsigned int not_in_kdp = 1;
956
957 unsigned int vm_map_set_cache_attr_count = 0;
958
/*
 * vm_map_set_cache_attr:
 *	Flag the VM object backing the mapping at "va" with
 *	set_cache_attr (the flag's consumers live elsewhere).
 *	Returns KERN_INVALID_ARGUMENT if "va" is unmapped, maps a
 *	submap, or has no backing object; KERN_SUCCESS otherwise.
 *	Takes the map read lock for the duration of the lookup.
 */
kern_return_t
vm_map_set_cache_attr(
	vm_map_t        map,
	vm_map_offset_t va)
{
	vm_map_entry_t map_entry;
	vm_object_t object;
	kern_return_t kr = KERN_SUCCESS;

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, va, &map_entry) ||
	    map_entry->is_sub_map) {
		/*
		 * that memory is not properly mapped
		 */
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}
	object = VME_OBJECT(map_entry);

	if (object == VM_OBJECT_NULL) {
		/*
		 * there should be a VM object here at this point
		 */
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}
	/* mark the backing object under its own lock */
	vm_object_lock(object);
	object->set_cache_attr = TRUE;
	vm_object_unlock(object);

	/* statistics counter; updated under the read lock only */
	vm_map_set_cache_attr_count++;
done:
	vm_map_unlock_read(map);

	return kr;
}
997
998
#if CONFIG_CODE_DECRYPTION
/*
 * vm_map_apple_protected:
 * This remaps the requested part of the object with an object backed by
 * the decrypting pager.
 * crypt_info contains entry points and session data for the crypt module.
 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
 *
 * Returns KERN_SUCCESS when every entry in [start, end) was remapped,
 * KERN_INVALID_ADDRESS / KERN_INVALID_ARGUMENT / KERN_FAILURE otherwise.
 */
kern_return_t
vm_map_apple_protected(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_object_offset_t      crypto_backing_offset,
	struct pager_crypt_info *crypt_info,
	uint32_t                cryptid)
{
	boolean_t       map_locked;
	kern_return_t   kr;
	vm_map_entry_t  map_entry;
	struct vm_map_entry tmp_entry;
	memory_object_t unprotected_mem_obj;
	vm_object_t     protected_object;
	vm_map_offset_t map_addr;
	vm_map_offset_t start_aligned, end_aligned;
	vm_object_offset_t      crypto_start, crypto_end;
	boolean_t       cache_pager;

	map_locked = FALSE;
	unprotected_mem_obj = MEMORY_OBJECT_NULL;

	/* reject ranges that would wrap around the address space */
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}
	/* widen to 4K boundaries first, then to the map's own page size */
	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));

#if __arm64__
	/*
	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
	 * so we might have to loop and establish up to 3 mappings:
	 *
	 * + the first 16K-page, which might overlap with the previous
	 *   4K-aligned mapping,
	 * + the center,
	 * + the last 16K-page, which might overlap with the next
	 *   4K-aligned mapping.
	 * Each of these mapping might be backed by a vnode pager (if
	 * properly page-aligned) or a "fourk_pager", itself backed by a
	 * vnode pager (if 4K-aligned but not page-aligned).
	 */
#endif /* __arm64__ */

	/* walk the range one map entry at a time */
	map_addr = start_aligned;
	for (map_addr = start_aligned;
	    map_addr < end;
	    map_addr = tmp_entry.vme_end) {
		vm_map_lock(map);
		map_locked = TRUE;

		/* lookup the protected VM object */
		if (!vm_map_lookup_entry(map,
		    map_addr,
		    &map_entry) ||
		    map_entry->is_sub_map ||
		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
			/* that memory is not properly mapped */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* ensure mapped memory is mapped as executable, except
		 * for the model decryption flow */
		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
		    !(map_entry->protection & VM_PROT_EXECUTE)) {
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* get the protected object to be decrypted */
		protected_object = VME_OBJECT(map_entry);
		if (protected_object == VM_OBJECT_NULL) {
			/* there should be a VM object here at this point */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}
		/* ensure protected object stays alive while map is unlocked */
		vm_object_reference(protected_object);

		/* limit the map entry to the area we want to cover */
		vm_map_clip_start(map, map_entry, start_aligned);
		vm_map_clip_end(map, map_entry, end_aligned);

		tmp_entry = *map_entry;
		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
		vm_map_unlock(map);
		map_locked = FALSE;

		/*
		 * This map entry might be only partially encrypted
		 * (if not fully "page-aligned").
		 */
		crypto_start = 0;
		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
		if (tmp_entry.vme_start < start) {
			if (tmp_entry.vme_start != start_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_start += (start - tmp_entry.vme_start);
		}
		if (tmp_entry.vme_end > end) {
			if (tmp_entry.vme_end != end_aligned) {
				kr = KERN_INVALID_ADDRESS;
				vm_object_deallocate(protected_object);
				goto done;
			}
			crypto_end -= (tmp_entry.vme_end - end);
		}

		/*
		 * This "extra backing offset" is needed to get the decryption
		 * routine to use the right key.  It adjusts for the possibly
		 * relative offset of an interposed "4K" pager...
		 */
		if (crypto_backing_offset == (vm_object_offset_t) -1) {
			crypto_backing_offset = VME_OFFSET(&tmp_entry);
		}

		cache_pager = TRUE;
#if XNU_TARGET_OS_OSX
		if (vm_map_is_alien(map)) {
			cache_pager = FALSE;
		}
#endif /* XNU_TARGET_OS_OSX */

		/*
		 * Lookup (and create if necessary) the protected memory object
		 * matching that VM object.
		 * If successful, this also grabs a reference on the memory object,
		 * to guarantee that it doesn't go away before we get a chance to map
		 * it.
		 */
		unprotected_mem_obj = apple_protect_pager_setup(
			protected_object,
			VME_OFFSET(&tmp_entry),
			crypto_backing_offset,
			crypt_info,
			crypto_start,
			crypto_end,
			cache_pager);

		/* release extra ref on protected object */
		vm_object_deallocate(protected_object);

		if (unprotected_mem_obj == NULL) {
			kr = KERN_FAILURE;
			goto done;
		}

		/* can overwrite an immutable mapping */
		vm_map_kernel_flags_t vmk_flags = {
			.vmf_fixed = true,
			.vmf_overwrite = true,
			.vmkf_overwrite_immutable = true,
		};
		/* make the new mapping as "permanent" as the one it replaces */
		vmk_flags.vmf_permanent = tmp_entry.vme_permanent;

		/* map this memory object in place of the current one */
		map_addr = tmp_entry.vme_start;
		kr = mach_vm_map_kernel(map,
		    vm_sanitize_wrap_addr_ref(&map_addr),
		    (tmp_entry.vme_end -
		    tmp_entry.vme_start),
		    (mach_vm_offset_t) 0,
		    vmk_flags,
		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
		    0,
		    TRUE,
		    tmp_entry.protection,
		    tmp_entry.max_protection,
		    tmp_entry.inheritance);
		assertf(kr == KERN_SUCCESS,
		    "kr = 0x%x\n", kr);
		assertf(map_addr == tmp_entry.vme_start,
		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
		    (uint64_t)map_addr,
		    (uint64_t) tmp_entry.vme_start,
		    &tmp_entry);

#if VM_MAP_DEBUG_APPLE_PROTECT
		if (vm_map_debug_apple_protect) {
			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
			    " backing:[object:%p,offset:0x%llx,"
			    "crypto_backing_offset:0x%llx,"
			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
			    map,
			    (uint64_t) map_addr,
			    (uint64_t) (map_addr + (tmp_entry.vme_end -
			    tmp_entry.vme_start)),
			    unprotected_mem_obj,
			    protected_object,
			    VME_OFFSET(&tmp_entry),
			    crypto_backing_offset,
			    crypto_start,
			    crypto_end);
		}
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

		/*
		 * Release the reference obtained by
		 * apple_protect_pager_setup().
		 * The mapping (if it succeeded) is now holding a reference on
		 * the memory object.
		 */
		memory_object_deallocate(unprotected_mem_obj);
		unprotected_mem_obj = MEMORY_OBJECT_NULL;

		/* continue with next map entry */
		crypto_backing_offset += (tmp_entry.vme_end -
		    tmp_entry.vme_start);
		crypto_backing_offset -= crypto_start;
	}
	kr = KERN_SUCCESS;

done:
	if (map_locked) {
		vm_map_unlock(map);
	}
	return kr;
}
#endif /* CONFIG_CODE_DECRYPTION */
1236
1237
/* Lock group and attributes shared by all VM map locks. */
LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

#if XNU_TARGET_OS_OSX
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
#else /* XNU_TARGET_OS_OSX */
#define MALLOC_NO_COW_DEFAULT 1
#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
#endif /* XNU_TARGET_OS_OSX */
/* Boot-args controlling whether malloc-tagged memory avoids copy-on-write. */
TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
/* Bitmask of VM_MEMORY_MALLOC* tags that get the no-COW treatment. */
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
int vm_check_map_sanity = 0;
#endif
1255
1256 /*
1257 * vm_map_init:
1258 *
1259 * Initialize the vm_map module. Must be called before
1260 * any other vm_map routines.
1261 *
1262 * Map and entry structures are allocated from zones -- we must
1263 * initialize those zones.
1264 *
1265 * There are three zones of interest:
1266 *
1267 * vm_map_zone: used to allocate maps.
1268 * vm_map_entry_zone: used to allocate map entries.
1269 *
1270 * LP32:
1271 * vm_map_entry_reserved_zone: fallback zone for kernel map entries
1272 *
1273 * The kernel allocates map entries from a special zone that is initially
1274 * "crammed" with memory. It would be difficult (perhaps impossible) for
1275 * the kernel to allocate more memory to a entry zone when it became
1276 * empty since the very act of allocating memory implies the creation
1277 * of a new entry.
1278 */
1279 __startup_func
1280 void
vm_map_init(void)1281 vm_map_init(void)
1282 {
1283
1284 #if MACH_ASSERT
1285 PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1286 sizeof(debug4k_filter));
1287 #endif /* MACH_ASSERT */
1288
1289 zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1290 VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1291
1292 /*
1293 * Don't quarantine because we always need elements available
1294 * Disallow GC on this zone... to aid the GC.
1295 */
1296 zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1297 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1298 ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1299 z->z_elems_rsv = (uint16_t)(32 *
1300 (ml_early_cpu_max_number() + 1));
1301 });
1302
1303 zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1304 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1305 ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1306 z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
1307 });
1308
1309 zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1310 ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1311
1312 /*
1313 * Add the stolen memory to zones, adjust zone size and stolen counts.
1314 */
1315 zone_cram_early(vm_map_zone, map_data, map_data_size);
1316 zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1317 zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1318 printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1319 zone_count_free(vm_map_zone),
1320 zone_count_free(vm_map_entry_zone),
1321 zone_count_free(vm_map_holes_zone));
1322
1323 /*
1324 * Since these are covered by zones, remove them from stolen page accounting.
1325 */
1326 VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1327
1328 #if VM_MAP_DEBUG_APPLE_PROTECT
1329 PE_parse_boot_argn("vm_map_debug_apple_protect",
1330 &vm_map_debug_apple_protect,
1331 sizeof(vm_map_debug_apple_protect));
1332 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1333 #if VM_MAP_DEBUG_APPLE_FOURK
1334 PE_parse_boot_argn("vm_map_debug_fourk",
1335 &vm_map_debug_fourk,
1336 sizeof(vm_map_debug_fourk));
1337 #endif /* VM_MAP_DEBUG_FOURK */
1338
1339 if (malloc_no_cow) {
1340 vm_memory_malloc_no_cow_mask = 0ULL;
1341 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1342 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1343 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1344 #if XNU_TARGET_OS_OSX
1345 /*
1346 * On macOS, keep copy-on-write for MALLOC_LARGE because
1347 * realloc() may use vm_copy() to transfer the old contents
1348 * to the new location.
1349 */
1350 #else /* XNU_TARGET_OS_OSX */
1351 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1352 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1353 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1354 #endif /* XNU_TARGET_OS_OSX */
1355 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1356 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1357 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1358 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1359 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1360 PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1361 &vm_memory_malloc_no_cow_mask,
1362 sizeof(vm_memory_malloc_no_cow_mask));
1363 }
1364
1365 #if CONFIG_MAP_RANGES
1366 vm_map_range_map_init();
1367 #endif /* CONFIG_MAP_RANGES */
1368
1369 #if DEBUG
1370 PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1371 if (vm_check_map_sanity) {
1372 kprintf("VM sanity checking enabled\n");
1373 } else {
1374 kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1375 }
1376 #endif /* DEBUG */
1377
1378 #if DEVELOPMENT || DEBUG
1379 PE_parse_boot_argn("panic_on_unsigned_execute",
1380 &panic_on_unsigned_execute,
1381 sizeof(panic_on_unsigned_execute));
1382 PE_parse_boot_argn("panic_on_mlock_failure",
1383 &panic_on_mlock_failure,
1384 sizeof(panic_on_mlock_failure));
1385 #endif /* DEVELOPMENT || DEBUG */
1386 }
1387
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	/* the three regions are carved consecutively out of that range */
	kentry_data = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1437
1438 __startup_func
1439 static void
vm_kernel_boostraped(void)1440 vm_kernel_boostraped(void)
1441 {
1442 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
1443 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
1444 zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);
1445
1446 printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1447 zone_count_free(vm_map_zone),
1448 zone_count_free(vm_map_entry_zone),
1449 zone_count_free(vm_map_holes_zone));
1450 }
1451 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1452
1453 void
vm_map_disable_hole_optimization(vm_map_t map)1454 vm_map_disable_hole_optimization(vm_map_t map)
1455 {
1456 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1457
1458 if (map->holelistenabled) {
1459 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1460
1461 while (hole_entry != NULL) {
1462 next_hole_entry = hole_entry->vme_next;
1463
1464 hole_entry->vme_next = NULL;
1465 hole_entry->vme_prev = NULL;
1466 zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1467
1468 if (next_hole_entry == head_entry) {
1469 hole_entry = NULL;
1470 } else {
1471 hole_entry = next_hole_entry;
1472 }
1473 }
1474
1475 map->holes_list = NULL;
1476 map->holelistenabled = FALSE;
1477
1478 map->first_free = vm_map_first_entry(map);
1479 SAVE_HINT_HOLE_WRITE(map, NULL);
1480 }
1481 }
1482
1483 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1484 vm_kernel_map_is_kernel(vm_map_t map)
1485 {
1486 return map->pmap == kernel_pmap;
1487 }
1488
1489 /*
1490 * vm_map_create:
1491 *
1492 * Creates and returns a new empty VM map with
1493 * the given physical map structure, and having
1494 * the given lower and upper address bounds.
1495 */
1496
1497 extern vm_map_t vm_map_create_external(
1498 pmap_t pmap,
1499 vm_map_offset_t min_off,
1500 vm_map_offset_t max_off,
1501 boolean_t pageable);
1502
1503 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1504 vm_map_create_external(
1505 pmap_t pmap,
1506 vm_map_offset_t min,
1507 vm_map_offset_t max,
1508 boolean_t pageable)
1509 {
1510 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1511
1512 if (pageable) {
1513 options |= VM_MAP_CREATE_PAGEABLE;
1514 }
1515 return vm_map_create_options(pmap, min, max, options);
1516 }
1517
1518 __startup_func
1519 void
vm_map_will_allocate_early_map(vm_map_t * owner)1520 vm_map_will_allocate_early_map(vm_map_t *owner)
1521 {
1522 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1523 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1524 }
1525
1526 early_map_owners[early_map_count++] = owner;
1527 }
1528
1529 __startup_func
1530 void
vm_map_relocate_early_maps(vm_offset_t delta)1531 vm_map_relocate_early_maps(vm_offset_t delta)
1532 {
1533 for (uint32_t i = 0; i < early_map_count; i++) {
1534 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1535
1536 *early_map_owners[i] = (vm_map_t)(addr + delta);
1537 }
1538
1539 early_map_count = ~0u;
1540 }
1541
/*
 * Routine:	vm_map_relocate_early_elem
 *
 * Purpose:
 *	Early zone elements are allocated in a temporary part
 *	of the address space.
 *
 *	Once the zones live in their final place, the early
 *	VM maps, map entries and map holes need to be relocated.
 *
 *	It involves rewriting any vm_map_t, vm_map_entry_t or
 *	pointers to vm_map_links. Other pointers to other types
 *	are fine.
 *
 *	Fortunately, pointers to those types are self-contained
 *	in those zones, _except_ for pointers to VM maps,
 *	which are tracked during early boot and fixed with
 *	vm_map_relocate_early_maps().
 */
__startup_func
void
vm_map_relocate_early_elem(
	uint32_t        zone_id,
	vm_offset_t     new_addr,
	vm_offset_t     delta)
{
	/* Slide the pointer stored in "field" of the element at new_addr
	 * by "delta"; NULL pointers are left untouched. */
#define relocate(type_t, field)  ({ \
	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
	if (*__field) { \
	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
	} \
})

	switch (zone_id) {
	case ZONE_ID_VM_MAP:
	case ZONE_ID_VM_MAP_ENTRY:
	case ZONE_ID_VM_MAP_HOLES:
		break;

	default:
		panic("Unexpected zone ID %d", zone_id);
	}

	if (zone_id == ZONE_ID_VM_MAP) {
		relocate(vm_map_t, hdr.links.prev);
		relocate(vm_map_t, hdr.links.next);
		/* only the kernel pmap exists this early */
		((vm_map_t)new_addr)->pmap = kernel_pmap;
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
#endif /* VM_MAP_STORE_USE_RB */
		relocate(vm_map_t, hint);
		relocate(vm_map_t, hole_hint);
		relocate(vm_map_t, first_free);
		return;
	}

	/* entries and holes both start with a vm_map_links */
	relocate(struct vm_map_links *, prev);
	relocate(struct vm_map_links *, next);

	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_entry_t, store.entry.rbe_left);
		relocate(vm_map_entry_t, store.entry.rbe_right);
		relocate(vm_map_entry_t, store.entry.rbe_parent);
#endif /* VM_MAP_STORE_USE_RB */
		if (((vm_map_entry_t)new_addr)->is_sub_map) {
			/* no object to relocate because we haven't made any */
			((vm_map_entry_t)new_addr)->vme_submap +=
			    delta >> VME_SUBMAP_SHIFT;
		}
#if MAP_ENTRY_CREATION_DEBUG
		relocate(vm_map_entry_t, vme_creation_maphdr);
#endif /* MAP_ENTRY_CREATION_DEBUG */
	}

#undef relocate
}
1619
/*
 * vm_map_create_options:
 *
 * Allocate and initialize a new VM map over [min, max) for "pmap",
 * honoring the VM_MAP_CREATE_* options.  Returns the map with one
 * reference held.  Cannot fail (Z_NOFAIL allocation).
 */
vm_map_t
vm_map_create_options(
	pmap_t          pmap,
	vm_map_offset_t min,
	vm_map_offset_t max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	/* sanity-check that early (pre-zalloc) maps were all registered */
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit = RLIM_INFINITY;             /* default unlimited */
	result->data_limit = RLIM_INFINITY;             /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	/* empty map: free space starts at the map header itself */
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		struct vm_map_links *hole_entry;

		/* single hole covering the whole map, circularly linked */
		hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
		hole_entry->start = min;
		/*
		 * Holes can be used to track ranges all the way up to
		 * MACH_VM_MAX_ADDRESS or more (e.g. kernel map).
		 */
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
		result->holes_list = result->hole_hint = hole_entry;
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1686
1687 /*
1688 * Adjusts a submap that was made by kmem_suballoc()
1689 * before it knew where it would be mapped,
1690 * so that it has the right min/max offsets.
1691 *
1692 * We do not need to hold any locks:
1693 * only the caller knows about this map,
1694 * and it is not published on any entry yet.
1695 */
1696 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1697 vm_map_adjust_offsets(
1698 vm_map_t map,
1699 vm_map_offset_t min_off,
1700 vm_map_offset_t max_off)
1701 {
1702 assert(map->min_offset == 0);
1703 assert(map->max_offset == max_off - min_off);
1704 assert(map->hdr.nentries == 0);
1705 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1706
1707 map->min_offset = min_off;
1708 map->max_offset = max_off;
1709
1710 if (map->holelistenabled) {
1711 struct vm_map_links *hole = map->holes_list;
1712
1713 hole->start = min_off;
1714 #if defined(__arm64__)
1715 hole->end = max_off;
1716 #else
1717 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1718 #endif
1719 }
1720 }
1721
1722
1723 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1724 vm_map_adjusted_size(vm_map_t map)
1725 {
1726 const struct vm_reserved_region *regions = NULL;
1727 size_t num_regions = 0;
1728 mach_vm_size_t reserved_size = 0, map_size = 0;
1729
1730 if (map == NULL || (map->size == 0)) {
1731 return 0;
1732 }
1733
1734 map_size = map->size;
1735
1736 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1737 /*
1738 * No special reserved regions or not an exotic map or the task
1739 * is terminating and these special regions might have already
1740 * been deallocated.
1741 */
1742 return map_size;
1743 }
1744
1745 num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), ®ions);
1746 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1747
1748 while (num_regions) {
1749 reserved_size += regions[--num_regions].vmrr_size;
1750 }
1751
1752 /*
1753 * There are a few places where the map is being switched out due to
1754 * 'termination' without that bit being set (e.g. exec and corpse purging).
1755 * In those cases, we could have the map's regions being deallocated on
1756 * a core while some accounting process is trying to get the map's size.
1757 * So this assert can't be enabled till all those places are uniform in
1758 * their use of the 'map->terminated' bit.
1759 *
1760 * assert(map_size >= reserved_size);
1761 */
1762
1763 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1764 }
1765
/*
 * vm_map_entry_create:	[ internal use only ]
 *
 * Allocates a VM map entry for insertion in the
 * given map (or map copy).  No fields are filled.
 *
 * The VM entry will be zero initialized, except for:
 * - behavior set to VM_BEHAVIOR_DEFAULT
 * - inheritance set to VM_INHERIT_DEFAULT
 */
#define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)

#define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)

static vm_map_entry_t
_vm_map_entry_create(
	struct vm_map_header *map_header __unused)
{
	vm_map_entry_t entry = NULL;

	entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);

	/*
	 * Help the compiler with what we know to be true,
	 * so that the further bitfields inits have good codegen.
	 *
	 * See rdar://87041299
	 */
	__builtin_assume(entry->vme_object_value == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);

	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
	    "VME_ALIAS_MASK covers tags");

	static_assert(VM_BEHAVIOR_DEFAULT == 0,
	    "can skip zeroing of the behavior field");
	entry->inheritance = VM_INHERIT_DEFAULT;

#if MAP_ENTRY_CREATION_DEBUG
	/* remember which map header and call stack created this entry */
	entry->vme_creation_maphdr = map_header;
	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
	    BTREF_GET_NOWAIT);
#endif
	return entry;
}
1812
/*
 * vm_map_entry_dispose:	[ internal use only ]
 *
 * Inverse of vm_map_entry_create.
 *
 * write map lock held so no need to
 * do anything special to insure correctness
 * of the stores
 */
static void
vm_map_entry_dispose(
	vm_map_entry_t  entry)
{
#if VM_BTLOG_TAGS
	/* release the tag backtrace ref only for kernel-object entries */
	if (entry->vme_kernel_object) {
		btref_put(entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(entry->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(entry->vme_insertion_bt);
#endif
	zfree(vm_map_entry_zone, entry);
}

#define vm_map_copy_entry_dispose(copy_entry) \
	vm_map_entry_dispose(copy_entry)
1842
1843 static vm_map_entry_t
vm_map_zap_first_entry(vm_map_zap_t list)1844 vm_map_zap_first_entry(
1845 vm_map_zap_t list)
1846 {
1847 return list->vmz_head;
1848 }
1849
1850 static vm_map_entry_t
vm_map_zap_last_entry(vm_map_zap_t list)1851 vm_map_zap_last_entry(
1852 vm_map_zap_t list)
1853 {
1854 assert(vm_map_zap_first_entry(list));
1855 return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1856 }
1857
1858 static void
vm_map_zap_append(vm_map_zap_t list,vm_map_entry_t entry)1859 vm_map_zap_append(
1860 vm_map_zap_t list,
1861 vm_map_entry_t entry)
1862 {
1863 entry->vme_next = VM_MAP_ENTRY_NULL;
1864 *list->vmz_tail = entry;
1865 list->vmz_tail = &entry->vme_next;
1866 }
1867
1868 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1869 vm_map_zap_pop(
1870 vm_map_zap_t list)
1871 {
1872 vm_map_entry_t head = list->vmz_head;
1873
1874 if (head != VM_MAP_ENTRY_NULL &&
1875 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1876 list->vmz_tail = &list->vmz_head;
1877 }
1878
1879 return head;
1880 }
1881
1882 static void
vm_map_zap_dispose(vm_map_zap_t list)1883 vm_map_zap_dispose(
1884 vm_map_zap_t list)
1885 {
1886 vm_map_entry_t entry;
1887
1888 while ((entry = vm_map_zap_pop(list))) {
1889 if (entry->is_sub_map) {
1890 vm_map_deallocate(VME_SUBMAP(entry));
1891 } else {
1892 vm_object_deallocate(VME_OBJECT(entry));
1893 }
1894
1895 vm_map_entry_dispose(entry);
1896 }
1897 }
1898
#if MACH_ASSERT
/* Gate for the (expensive) first_free validation below; off by default. */
static boolean_t first_free_check = FALSE;
boolean_t
first_free_is_valid(
	vm_map_t        map)
{
	/* treat the hint as valid unless checking was explicitly enabled */
	return first_free_check ? first_free_is_valid_store(map) : TRUE;
}
#endif /* MACH_ASSERT */
1912
1913
/* Link/unlink an entry into a vm_map_copy's header (same store layer as maps). */
#define vm_map_copy_entry_link(copy, after_where, entry)                \
	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))

#define vm_map_copy_entry_unlink(copy, entry)                           \
	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1919
/*
 * vm_map_destroy:
 *
 * Actually destroy a map: delete all entries, tear down the hole
 * list and corpse footprint, destroy the pmap and locks, and free
 * the map structure.  This is not allowed to fail.
 */
void
vm_map_destroy(
	vm_map_t        map)
{
	/* final cleanup: this is not allowed to fail */
	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;

	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);

	map->terminated = true;
	/* clean up regular map entries */
	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
	    KMEM_GUARD_NONE, &zap);
	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
	    KMEM_GUARD_NONE, &zap);

	vm_map_disable_hole_optimization(map);
	vm_map_corpse_footprint_destroy(map);

	vm_map_unlock(map);

	/* deallocate the zapped entries outside the map lock */
	vm_map_zap_dispose(&zap);

	assert(map->hdr.nentries == 0);

	if (map->pmap) {
		pmap_destroy(map->pmap);
	}

	lck_rw_destroy(&map->lock, &vm_map_lck_grp);

#if CONFIG_MAP_RANGES
	kfree_data(map->extra_ranges,
	    map->extra_ranges_count * sizeof(struct vm_map_user_range));
#endif

	zfree_id(ZONE_ID_VM_MAP, map);
}
1966
1967 /*
1968 * Returns pid of the task with the largest number of VM map entries.
1969 * Used in the zone-map-exhaustion jetsam path.
1970 */
1971 pid_t
find_largest_process_vm_map_entries(void)1972 find_largest_process_vm_map_entries(void)
1973 {
1974 pid_t victim_pid = -1;
1975 int max_vm_map_entries = 0;
1976 task_t task = TASK_NULL;
1977 queue_head_t *task_list = &tasks;
1978
1979 lck_mtx_lock(&tasks_threads_lock);
1980 queue_iterate(task_list, task, task_t, tasks) {
1981 if (task == kernel_task || !task->active) {
1982 continue;
1983 }
1984
1985 vm_map_t task_map = task->map;
1986 if (task_map != VM_MAP_NULL) {
1987 int task_vm_map_entries = task_map->hdr.nentries;
1988 if (task_vm_map_entries > max_vm_map_entries) {
1989 max_vm_map_entries = task_vm_map_entries;
1990 victim_pid = pid_from_task(task);
1991 }
1992 }
1993 }
1994 lck_mtx_unlock(&tasks_threads_lock);
1995
1996 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1997 return victim_pid;
1998 }
1999
2000
2001 /*
2002 * vm_map_lookup_entry: [ internal use only ]
2003 *
2004 * Calls into the vm map store layer to find the map
2005 * entry containing (or immediately preceding) the
2006 * specified address in the given map; the entry is returned
2007 * in the "entry" parameter. The boolean
2008 * result indicates whether the address is
2009 * actually contained in the map.
2010 */
2011 boolean_t
vm_map_lookup_entry(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2012 vm_map_lookup_entry(
2013 vm_map_t map,
2014 vm_map_offset_t address,
2015 vm_map_entry_t *entry) /* OUT */
2016 {
2017 bool result = false;
2018 if (VM_KERNEL_ADDRESS(address)) {
2019 address = VM_KERNEL_STRIP_UPTR(address);
2020 }
2021
2022 #if CONFIG_PROB_GZALLOC
2023 if (map->pmap == kernel_pmap) {
2024 assertf(!pgz_owned(address),
2025 "it is the responsibility of callers to unguard PGZ addresses");
2026 }
2027 #endif /* CONFIG_PROB_GZALLOC */
2028 result = vm_map_store_lookup_entry( map, address, entry );
2029
2030 return result;
2031 }
2032
2033 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)2034 vm_map_lookup_entry_or_next(
2035 vm_map_t map,
2036 vm_map_offset_t address,
2037 vm_map_entry_t *entry) /* OUT */
2038 {
2039 if (vm_map_lookup_entry(map, address, entry)) {
2040 return true;
2041 }
2042
2043 *entry = (*entry)->vme_next;
2044 return false;
2045 }
2046
#if CONFIG_PROB_GZALLOC
/*
 * Variant of vm_map_lookup_entry() that does not assert against
 * PGZ-guarded kernel addresses; for callers that may legitimately
 * pass such addresses.
 */
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t        map,
	vm_map_offset_t address,
	vm_map_entry_t  *entry)         /* OUT */
{
	/* strip kernel pointer tagging before consulting the store */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
	return vm_map_store_lookup_entry( map, address, entry );
}
#endif /* CONFIG_PROB_GZALLOC */
2060
/*
 *	Routine:	vm_map_range_invalid_panic
 *	Purpose:
 *		Panic on detection of an invalid range id.
 *		Never returns (__abortlike).
 */
__abortlike
static void
vm_map_range_invalid_panic(
	vm_map_t           map,
	vm_map_range_id_t  range_id)
{
	panic("invalid range ID (%u) for map %p", range_id, map);
}
2074
/*
 *	Routine:	vm_map_get_range
 *	Purpose:
 *		Adjust bounds based on security policy.
 *
 *		Returns the VA range an "anywhere" allocation of "size" bytes
 *		may use in "map", selected by vmk_flags->vmkf_range_id.
 *		For kernel_map past early boot this also zeroes *address and
 *		may set *is_ptr, telling the caller to allocate through
 *		kmem_locate_space() instead.
 */
static struct mach_vm_range
vm_map_get_range(
	vm_map_t                map,
	vm_map_address_t       *address,
	vm_map_kernel_flags_t  *vmk_flags,
	vm_map_size_t           size,
	bool                   *is_ptr)
{
	struct mach_vm_range effective_range = {};
	vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;

	if (map == kernel_map) {
		effective_range = kmem_ranges[range_id];

		if (startup_phase >= STARTUP_SUB_KMEM) {
			/*
			 * Hint provided by caller is zeroed as the range is restricted to a
			 * subset of the entire kernel_map VA, which could put the hint outside
			 * the range, causing vm_map_store_find_space to fail.
			 */
			*address = 0ull;
			/*
			 * Ensure that range_id passed in by the caller is within meaningful
			 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
			 * to fail as the corresponding range is invalid. Range id larger than
			 * KMEM_RANGE_ID_MAX will lead to an OOB access.
			 */
			if ((range_id == KMEM_RANGE_ID_NONE) ||
			    (range_id > KMEM_RANGE_ID_MAX)) {
				vm_map_range_invalid_panic(map, range_id);
			}

			/*
			 * Pointer ranges use kmem_locate_space to do allocations.
			 *
			 * Non pointer fronts look like [ Small | Large | Permanent ]
			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
			 * use the entire range.
			 */
			if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
				*is_ptr = true;
			} else if (size >= KMEM_SMALLMAP_THRESHOLD) {
				effective_range = kmem_large_ranges[range_id];
			}
		}
#if CONFIG_MAP_RANGES
	} else if (map->uses_user_ranges) {
		/* user map with hardened VA ranges: pick by range id */
		switch (range_id) {
		case UMEM_RANGE_ID_DEFAULT:
			effective_range = map->default_range;
			break;
		case UMEM_RANGE_ID_HEAP:
			effective_range = map->data_range;
			break;
		case UMEM_RANGE_ID_LARGE_FILE:
			if (map->large_file_range.min_address != map->large_file_range.max_address) {
				/* large file range is configured and should be used */
				effective_range = map->large_file_range;
			} else {
				/*
				 * the user asking for this user range might not have the
				 * permissions to use the large file range (i.e., it doesn't
				 * hold the correct entitlement), so we give it the data range
				 * instead
				 */
				effective_range = map->data_range;
			}
			break;
		case UMEM_RANGE_ID_FIXED:
			/*
			 * anywhere allocations with an address in "FIXED"
			 * makes no sense, leave the range empty
			 */
			break;

		default:
			vm_map_range_invalid_panic(map, range_id);
		}
#endif /* CONFIG_MAP_RANGES */
	} else {
		/*
		 * If minimum is 0, bump it up by PAGE_SIZE.  We want to limit
		 * allocations of PAGEZERO to explicit requests since its
		 * normal use is to catch dereferences of NULL and many
		 * applications also treat pointers with a value of 0 as
		 * special and suddenly having address 0 contain useable
		 * memory would tend to confuse those applications.
		 */
		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
		effective_range.max_address = map->max_offset;
	}

	return effective_range;
}
2175
/*
 *	Routine:	vm_map_locate_space_anywhere
 *	Purpose:
 *		Locate a hole of at least "size" bytes (plus an optional
 *		leading guard page) for a non-fixed allocation in "map",
 *		honoring the alignment "mask" and policy in "vmk_flags".
 *
 *		On KERN_SUCCESS, *start_inout holds the chosen address and
 *		*entry_out (if non-NULL) the entry preceding the hole.
 *		Called with the map locked; the lock is dropped and retaken
 *		only for "wait_for_space" maps while blocking for space.
 */
kern_return_t
vm_map_locate_space_anywhere(
	vm_map_t                map,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_offset_t        *start_inout,
	vm_map_entry_t         *entry_out)
{
	struct mach_vm_range effective_range = {};
	vm_map_size_t guard_offset;
	vm_map_offset_t hint, limit;
	vm_map_entry_t entry;
	bool is_kmem_ptr_range = false;

	/*
	 * Only supported by vm_map_enter() with a fixed address.
	 */
	assert(!vmk_flags.vmf_fixed);
	assert(!vmk_flags.vmkf_beyond_max);

	if (__improbable(map->wait_for_space)) {
		/*
		 * support for "wait_for_space" is minimal,
		 * its only consumer is the ipc_kernel_copy_map.
		 */
		assert(!map->holelistenabled &&
		    !vmk_flags.vmkf_last_free &&
		    !vmk_flags.vmkf_keep_map_locked &&
		    !vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr &&
		    *start_inout <= map->min_offset);
	} else if (vmk_flags.vmkf_last_free) {
		assert(!vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmf_random_addr);
	}

	if (vmk_flags.vmkf_guard_before) {
		/* account for the leading guard page separately from "size" */
		guard_offset = VM_MAP_PAGE_SIZE(map);
		assert(size > guard_offset);
		size -= guard_offset;
	} else {
		assert(size != 0);
		guard_offset = 0;
	}

	/*
	 * Validate range_id from flags and get associated range
	 */
	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
	    &is_kmem_ptr_range);

	if (is_kmem_ptr_range) {
		/* kernel pointer ranges are allocated by the kmem subsystem */
		return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
		           vmk_flags.vmkf_last_free, start_inout, entry_out);
	}

#if XNU_TARGET_OS_OSX
	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		assert(map != kernel_map);
		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
	}
#endif /* XNU_TARGET_OS_OSX */

again:
	if (vmk_flags.vmkf_last_free) {
		/* top-down search: start at hint (or range top), stop at range bottom */
		hint = *start_inout;

		if (hint == 0 || hint > effective_range.max_address) {
			hint = effective_range.max_address;
		}
		if (hint <= effective_range.min_address) {
			return KERN_NO_SPACE;
		}
		limit = effective_range.min_address;
	} else {
		hint = *start_inout;

		if (vmk_flags.vmkf_map_jit) {
			if (map->jit_entry_exists &&
			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
				return KERN_INVALID_ARGUMENT;
			}
			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
				vmk_flags.vmf_random_addr = true;
			}
		}

		if (vmk_flags.vmf_random_addr) {
			kern_return_t kr;

			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
			if (kr != KERN_SUCCESS) {
				return kr;
			}
		}
#if __x86_64__
		else if ((hint == 0 || hint == vm_map_min(map)) &&
		    !map->disable_vmentry_reuse &&
		    map->vmmap_high_start != 0) {
			hint = map->vmmap_high_start;
		}
#endif /* __x86_64__ */

		/* clamp the hint into the effective range */
		if (hint < effective_range.min_address) {
			hint = effective_range.min_address;
		}
		if (effective_range.max_address <= hint) {
			return KERN_NO_SPACE;
		}

		limit = effective_range.max_address;
	}
	entry = vm_map_store_find_space(map,
	    hint, limit, vmk_flags.vmkf_last_free,
	    guard_offset, size, mask,
	    start_inout);

	if (__improbable(entry == NULL)) {
		if (map->wait_for_space &&
		    guard_offset + size <=
		    effective_range.max_address - effective_range.min_address) {
			/*
			 * Request could fit eventually: block until space is
			 * freed up in this map, then retry the search.
			 */
			assert_wait((event_t)map, THREAD_ABORTSAFE);
			vm_map_unlock(map);
			thread_block(THREAD_CONTINUE_NULL);
			vm_map_lock(map);
			goto again;
		}
		return KERN_NO_SPACE;
	}

	if (entry_out) {
		*entry_out = entry;
	}
	return KERN_SUCCESS;
}
2312
/*!
 * @function vm_map_locate_space_fixed()
 *
 * @brief
 * Locate (no reservation) a range in the specified VM map at a fixed address.
 *
 * @param map           the map to scan for memory, must be locked.
 * @param start         the fixed address trying to be reserved
 * @param size          the size of the allocation to make.
 * @param mask          an alignment mask the allocation must respect,
 * @param vmk_flags     the vm map kernel flags to influence this call.
 *                      vmk_flags.vmf_anywhere must not be set.
 * @param entry_out     the entry right before the hole.
 * @param zap_list      a zap list of entries to clean up after the call.
 *
 * @returns
 *     - KERN_SUCCESS in case of success and no conflicting entry is found,
 *       in which case entry_out is set to the entry before the hole.
 *
 *     - KERN_MEMORY_PRESENT if a conflicting entry is found,
 *       in which case entry_out is set the conflicting entry,
 *       the callers MUST handle this error explicitly.
 *
 *     - KERN_INVALID_ADDRESS if the specified @c start or @c size
 *       would result in a mapping outside of the map.
 *
 *     - KERN_NO_SPACE for various cases of unrecoverable failures.
 */
static kern_return_t
vm_map_locate_space_fixed(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t         *entry_out,
	vm_map_zap_t            zap_list)
{
	vm_map_offset_t effective_min_offset, effective_max_offset;
	vm_map_entry_t entry;
	vm_map_offset_t end;

	assert(vmk_flags.vmf_fixed);

	effective_min_offset = map->min_offset;
	effective_max_offset = map->max_offset;

	if (vmk_flags.vmkf_beyond_max) {
		/*
		 * Allow an insertion beyond the map's max offset.
		 */
		effective_max_offset = 0x00000000FFFFF000ULL;
		if (vm_map_is_64bit(map)) {
			effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
		}
#if XNU_TARGET_OS_OSX
	} else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
#endif /* XNU_TARGET_OS_OSX */
	}

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
	    !vmk_flags.vmf_overwrite &&
	    map->pmap == kernel_pmap &&
	    vmk_flags.vm_tag == VM_MEMORY_REALLOC) {
		/*
		 * Force realloc() to switch to a new allocation,
		 * to prevent 4k-fragmented virtual ranges.
		 */
		// DEBUG4K_ERROR("no realloc in place");
		return KERN_NO_SPACE;
	}

	/*
	 *	Verify that:
	 *		the address doesn't itself violate
	 *		the mask requirement.
	 */

	if ((start & mask) != 0) {
		return KERN_NO_SPACE;
	}

#if CONFIG_MAP_RANGES
	if (map->uses_user_ranges) {
		/* restrict the bounds to the user range containing "start" */
		struct mach_vm_range r;

		vm_map_user_range_resolve(map, start, 1, &r);
		if (r.max_address == 0) {
			return KERN_INVALID_ADDRESS;
		}
		effective_min_offset = r.min_address;
		effective_max_offset = r.max_address;
	}
#endif /* CONFIG_MAP_RANGES */

	if ((startup_phase >= STARTUP_SUB_KMEM) && !vmk_flags.vmkf_submap &&
	    (map == kernel_map)) {
		/* restrict the bounds to the kmem range containing [start, start+size) */
		mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
		effective_min_offset = r->min_address;
		effective_max_offset = r->max_address;
	}

	/*
	 *	...	the address is within bounds
	 */

	end = start + size;

	if ((start < effective_min_offset) ||
	    (end > effective_max_offset) ||
	    (start >= end)) {
		return KERN_INVALID_ADDRESS;
	}

	if (vmk_flags.vmf_overwrite) {
		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
		kern_return_t remove_kr;

		/*
		 * Fixed mapping and "overwrite" flag: attempt to
		 * remove all existing mappings in the specified
		 * address range, saving them in our "zap_list".
		 *
		 * This avoids releasing the VM map lock in
		 * vm_map_entry_delete() and allows atomicity
		 * when we want to replace some mappings with a new one.
		 * It also allows us to restore the old VM mappings if the
		 * new mapping fails.
		 */
		remove_flags |= VM_MAP_REMOVE_NO_YIELD;

		if (vmk_flags.vmkf_overwrite_immutable) {
			/* we can overwrite immutable mappings */
			remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
		}
		if (vmk_flags.vmkf_remap_prot_copy) {
			remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
		}
		remove_kr = vm_map_delete(map, start, end, remove_flags,
		    KMEM_GUARD_NONE, zap_list).kmr_return;
		if (remove_kr) {
			/* XXX FBDP restore zap_list? */
			return remove_kr;
		}
	}

	/*
	 *	...	the starting address isn't allocated
	 */

	if (vm_map_lookup_entry(map, start, &entry)) {
		/* conflicting entry: report it, caller must handle */
		*entry_out = entry;
		return KERN_MEMORY_PRESENT;
	}

	/*
	 *	...	the next region doesn't overlap the
	 *		end point.
	 */

	if ((entry->vme_next != vm_map_to_entry(map)) &&
	    (entry->vme_next->vme_start < end)) {
		return KERN_NO_SPACE;
	}

	*entry_out = entry;
	return KERN_SUCCESS;
}
2482
/*
 *	Routine:	vm_map_find_space
 *	Purpose:
 *		Allocate a range in the specified virtual address map,
 *		returning the entry allocated for that range.
 *		Used by kmem_alloc, etc.
 *
 *		The map must be NOT be locked. It will be returned locked
 *		on KERN_SUCCESS, unlocked on failure.
 *
 *		If an entry is allocated, the object/offset fields
 *		are initialized to zero.
 */
kern_return_t
vm_map_find_space(
	vm_map_t                map,
	vm_map_offset_t         hint_address,
	vm_map_size_t           size,
	vm_map_offset_t         mask,
	vm_map_kernel_flags_t   vmk_flags,
	vm_map_entry_t         *o_entry)        /* OUT */
{
	vm_map_entry_t new_entry, entry;
	kern_return_t kr;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/* create and pre-initialize the entry before taking the map lock */
	new_entry = vm_map_entry_create(map);
	new_entry->use_pmap = true;
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_ALL;

	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		new_entry->map_aligned = true;
	}
	if (vmk_flags.vmf_permanent) {
		new_entry->vme_permanent = true;
	}

	vm_map_lock(map);

	kr = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
	    &hint_address, &entry);
	if (kr != KERN_SUCCESS) {
		/* failure: unlock and throw the unused entry away */
		vm_map_unlock(map);
		vm_map_entry_dispose(new_entry);
		return kr;
	}
	/* hint_address now holds the start of the located hole */
	new_entry->vme_start = hint_address;
	new_entry->vme_end = hint_address + size;

	/*
	 *	At this point,
	 *
	 *	- new_entry's "vme_start" and "vme_end" should define
	 *	  the endpoints of the available new range,
	 *
	 *	- and "entry" should refer to the region before
	 *	  the new range,
	 *
	 *	- and the map should still be locked.
	 */

	assert(page_aligned(new_entry->vme_start));
	assert(page_aligned(new_entry->vme_end));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));

	/*
	 *	Insert the new entry into the list
	 */

	vm_map_store_entry_link(map, entry, new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);
	map->size += size;

	/*
	 *	Update the lookup hint
	 */
	SAVE_HINT_MAP_WRITE(map, new_entry);

	/* returns with the map still locked (see purpose above) */
	*o_entry = new_entry;
	return KERN_SUCCESS;
}
2569
/* Debug flag: log each page entered by vm_map_pmap_enter(). */
int vm_map_pmap_enter_print = FALSE;
/* Debug flag: consulted by callers of vm_map_pmap_enter() elsewhere. */
int vm_map_pmap_enter_enable = FALSE;
2572
/*
 *	Routine:	vm_map_pmap_enter [internal only]
 *
 *	Description:
 *		Force pages from the specified object to be entered into
 *		the pmap at the specified address if they are present.
 *		As soon as a page not found in the object the scan ends.
 *
 *	Returns:
 *		Nothing.
 *
 *	In/out conditions:
 *		The source map should not be locked on entry.
 */
__unused static void
vm_map_pmap_enter(
	vm_map_t                map,
	vm_map_offset_t         addr,
	vm_map_offset_t         end_addr,
	vm_object_t             object,
	vm_object_offset_t      offset,
	vm_prot_t               protection)
{
	int                     type_of_fault;
	kern_return_t           kr;
	uint8_t                 object_lock_type = 0;
	struct vm_object_fault_info fault_info = {};

	if (map->pmap == 0) {
		return;
	}

	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);

	while (addr < end_addr) {
		vm_page_t m;


		/*
		 * TODO:
		 * From vm_map_enter(), we come into this function without the map
		 * lock held or the object lock held.
		 * We haven't taken a reference on the object either.
		 * We should do a proper lookup on the map to make sure
		 * that things are sane before we go locking objects that
		 * could have been deallocated from under us.
		 */

		object_lock_type = OBJECT_LOCK_EXCLUSIVE;
		vm_object_lock(object);

		m = vm_page_lookup(object, offset);

		/* stop at the first page that is absent, busy, or unusable */
		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
			vm_object_unlock(object);
			return;
		}

		if (vm_map_pmap_enter_print) {
			printf("vm_map_pmap_enter:");
			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
			    map, (unsigned long long)addr, object, (unsigned long long)offset);
		}
		type_of_fault = DBG_CACHE_HIT_FAULT;
		kr = vm_fault_enter(m, map->pmap,
		    addr,
		    PAGE_SIZE, 0,
		    protection, protection,
		    VM_PAGE_WIRED(m),
		    FALSE,                       /* change_wiring */
		    VM_KERN_MEMORY_NONE,         /* tag - not wiring */
		    &fault_info,
		    NULL,                        /* need_retry */
		    &type_of_fault,
		    &object_lock_type);          /* Exclusive lock mode. Will remain unchanged.*/

		vm_object_unlock(object);

		/* advance to the next page in both the object and the VA range */
		offset += PAGE_SIZE_64;
		addr += PAGE_SIZE;
	}
}
2656
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 *	Routine:	vm_map_random_address_for_size
 *	Purpose:
 *		Try up to MAX_TRIES_TO_GET_RANDOM_ADDRESS times to pick a
 *		random, map-page-aligned address within the map's effective
 *		range (per vm_map_get_range()) whose following hole can hold
 *		"size" bytes.  On success the address is returned through
 *		*address; otherwise returns KERN_NO_SPACE.
 */
static kern_return_t
vm_map_random_address_for_size(
	vm_map_t                map,
	vm_map_offset_t        *address,
	vm_map_size_t           size,
	vm_map_kernel_flags_t   vmk_flags)
{
	kern_return_t   kr = KERN_SUCCESS;
	int             tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t  next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t  prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t   vm_hole_size = 0;
	vm_map_size_t   addr_space_size;
	bool            is_kmem_ptr;
	struct mach_vm_range effective_range;

	effective_range = vm_map_get_range(map, address, &vmk_flags, size,
	    &is_kmem_ptr);

	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	/* only consider start addresses that leave room for "size" bytes */
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		/* early_random() is the entropy source before zalloc is up */
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			/*
			 * Don't land on a PGZ-guarded allocation.
			 * NOTE(review): this "continue" skips the tries++
			 * below, so PGZ retries are not counted as attempts.
			 */
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			/* random_addr fell in a hole: measure the hole */
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			if (next_entry == vm_map_to_entry(map)) {
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				/* hole is big enough: done */
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2730
2731 static boolean_t
vm_memory_malloc_no_cow(int alias)2732 vm_memory_malloc_no_cow(
2733 int alias)
2734 {
2735 uint64_t alias_mask;
2736
2737 if (!malloc_no_cow) {
2738 return FALSE;
2739 }
2740 if (alias > 63) {
2741 return FALSE;
2742 }
2743 alias_mask = 1ULL << alias;
2744 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2745 return TRUE;
2746 }
2747 return FALSE;
2748 }
2749
/* Statistics counters for vm_map_enter() resource-limit events
 * (presumably bumped when RLIMIT_AS / RLIMIT_DATA is hit — see users). */
uint64_t vm_map_enter_RLIMIT_AS_count = 0;
uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
/*
 *	Routine:	vm_map_enter
 *
 *	Description:
 *		Allocate a range in the specified virtual address map.
 *		The resulting range will refer to memory defined by
 *		the given memory object and offset into that object.
 *
 *		Arguments are as defined in the vm_map call.
 */
/* Statistics: outcomes of restoring saved mappings when a vm_map_enter()
 * with overwrite fails — see uses in vm_map_enter(). */
static unsigned int vm_map_enter_restore_successes = 0;
static unsigned int vm_map_enter_restore_failures = 0;
2764 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,vm_map_kernel_flags_t vmk_flags,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2765 vm_map_enter(
2766 vm_map_t map,
2767 vm_map_offset_t *address, /* IN/OUT */
2768 vm_map_size_t size,
2769 vm_map_offset_t mask,
2770 vm_map_kernel_flags_t vmk_flags,
2771 vm_object_t object,
2772 vm_object_offset_t offset,
2773 boolean_t needs_copy,
2774 vm_prot_t cur_protection,
2775 vm_prot_t max_protection,
2776 vm_inherit_t inheritance)
2777 {
2778 vm_map_entry_t entry, new_entry;
2779 vm_map_offset_t start, tmp_start, tmp_offset;
2780 vm_map_offset_t end, tmp_end;
2781 vm_map_offset_t tmp2_start, tmp2_end;
2782 vm_map_offset_t step;
2783 kern_return_t result = KERN_SUCCESS;
2784 bool map_locked = FALSE;
2785 bool pmap_empty = TRUE;
2786 bool new_mapping_established = FALSE;
2787 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2788 const bool anywhere = !vmk_flags.vmf_fixed;
2789 const bool purgable = vmk_flags.vmf_purgeable;
2790 const bool no_cache = vmk_flags.vmf_no_cache;
2791 const bool is_submap = vmk_flags.vmkf_submap;
2792 const bool permanent = vmk_flags.vmf_permanent;
2793 const bool no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2794 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
2795 const bool iokit_acct = vmk_flags.vmkf_iokit_acct;
2796 const bool resilient_codesign = vmk_flags.vmf_resilient_codesign;
2797 const bool resilient_media = vmk_flags.vmf_resilient_media;
2798 const bool entry_for_tpro = vmk_flags.vmf_tpro;
2799 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
2800 const vm_tag_t alias = vmk_flags.vm_tag;
2801 vm_tag_t user_alias;
2802 kern_return_t kr;
2803 bool clear_map_aligned = FALSE;
2804 vm_map_size_t chunk_size = 0;
2805 vm_object_t caller_object;
2806 VM_MAP_ZAP_DECLARE(zap_old_list);
2807 VM_MAP_ZAP_DECLARE(zap_new_list);
2808
2809 caller_object = object;
2810
2811 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2812
2813 if (vmk_flags.vmf_4gb_chunk) {
2814 #if defined(__LP64__)
2815 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2816 #else /* __LP64__ */
2817 chunk_size = ANON_CHUNK_SIZE;
2818 #endif /* __LP64__ */
2819 } else {
2820 chunk_size = ANON_CHUNK_SIZE;
2821 }
2822
2823
2824
2825 if (superpage_size) {
2826 if (object != VM_OBJECT_NULL) {
2827 /* caller can't provide their own VM object */
2828 return KERN_INVALID_ARGUMENT;
2829 }
2830 switch (superpage_size) {
2831 /*
2832 * Note that the current implementation only supports
2833 * a single size for superpages, SUPERPAGE_SIZE, per
2834 * architecture. As soon as more sizes are supposed
2835 * to be supported, SUPERPAGE_SIZE has to be replaced
2836 * with a lookup of the size depending on superpage_size.
2837 */
2838 #ifdef __x86_64__
2839 case SUPERPAGE_SIZE_ANY:
2840 /* handle it like 2 MB and round up to page size */
2841 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2842 OS_FALLTHROUGH;
2843 case SUPERPAGE_SIZE_2MB:
2844 break;
2845 #endif
2846 default:
2847 return KERN_INVALID_ARGUMENT;
2848 }
2849 mask = SUPERPAGE_SIZE - 1;
2850 if (size & (SUPERPAGE_SIZE - 1)) {
2851 return KERN_INVALID_ARGUMENT;
2852 }
2853 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2854 }
2855
2856
2857 if ((cur_protection & VM_PROT_WRITE) &&
2858 (cur_protection & VM_PROT_EXECUTE) &&
2859 #if XNU_TARGET_OS_OSX
2860 map->pmap != kernel_pmap &&
2861 (cs_process_global_enforcement() ||
2862 (vmk_flags.vmkf_cs_enforcement_override
2863 ? vmk_flags.vmkf_cs_enforcement
2864 : (vm_map_cs_enforcement(map)
2865 #if __arm64__
2866 || !VM_MAP_IS_EXOTIC(map)
2867 #endif /* __arm64__ */
2868 ))) &&
2869 #endif /* XNU_TARGET_OS_OSX */
2870 #if CODE_SIGNING_MONITOR
2871 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2872 #endif
2873 (VM_MAP_POLICY_WX_FAIL(map) ||
2874 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2875 !entry_for_jit) {
2876 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2877
2878 DTRACE_VM3(cs_wx,
2879 uint64_t, 0,
2880 uint64_t, 0,
2881 vm_prot_t, cur_protection);
2882 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2883 proc_selfpid(),
2884 (get_bsdtask_info(current_task())
2885 ? proc_name_address(get_bsdtask_info(current_task()))
2886 : "?"),
2887 __FUNCTION__,
2888 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2889 cur_protection &= ~VM_PROT_EXECUTE;
2890 if (vm_protect_wx_fail) {
2891 return KERN_PROTECTION_FAILURE;
2892 }
2893 }
2894
2895 if (entry_for_jit
2896 && cur_protection != VM_PROT_ALL) {
2897 /*
2898 * Native macOS processes and all non-macOS processes are
2899 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2900 * the RWX requirement was not enforced, and thus, we must live
2901 * with our sins. We are now dealing with a JIT mapping without
2902 * RWX.
2903 *
2904 * We deal with these by letting the MAP_JIT stick in order
2905 * to avoid CS violations when these pages are mapped executable
2906 * down the line. In order to appease the page table monitor (you
2907 * know what I'm talking about), these pages will end up being
2908 * marked as XNU_USER_DEBUG, which will be allowed because we
2909 * don't enforce the code signing monitor on macOS systems. If
2910 * the user-space application ever changes permissions to RWX,
2911 * which they are allowed to since the mapping was originally
2912 * created with MAP_JIT, then they'll switch over to using the
2913 * XNU_USER_JIT type, and won't be allowed to downgrade any
2914 * more after that.
2915 *
2916 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
2917 * strictly disallowed.
2918 */
2919
2920 #if XNU_TARGET_OS_OSX
2921 /*
2922 * Continue to allow non-RWX JIT
2923 */
2924 #else
2925 /* non-macOS: reject JIT regions without RWX */
2926 DTRACE_VM3(cs_wx,
2927 uint64_t, 0,
2928 uint64_t, 0,
2929 vm_prot_t, cur_protection);
2930 printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
2931 proc_selfpid(),
2932 (get_bsdtask_info(current_task())
2933 ? proc_name_address(get_bsdtask_info(current_task()))
2934 : "?"),
2935 __FUNCTION__,
2936 cur_protection);
2937 return KERN_PROTECTION_FAILURE;
2938 #endif
2939 }
2940
2941 /*
2942 * If the task has requested executable lockdown,
2943 * deny any new executable mapping.
2944 */
2945 if (map->map_disallow_new_exec == TRUE) {
2946 if (cur_protection & VM_PROT_EXECUTE) {
2947 return KERN_PROTECTION_FAILURE;
2948 }
2949 }
2950
2951 if (resilient_codesign) {
2952 assert(!is_submap);
2953 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2954 if ((cur_protection | max_protection) & reject_prot) {
2955 return KERN_PROTECTION_FAILURE;
2956 }
2957 }
2958
2959 if (resilient_media) {
2960 assert(!is_submap);
2961 // assert(!needs_copy);
2962 if (object != VM_OBJECT_NULL &&
2963 !object->internal) {
2964 /*
2965 * This mapping is directly backed by an external
2966 * memory manager (e.g. a vnode pager for a file):
2967 * we would not have any safe place to inject
2968 * a zero-filled page if an actual page is not
2969 * available, without possibly impacting the actual
2970 * contents of the mapped object (e.g. the file),
2971 * so we can't provide any media resiliency here.
2972 */
2973 return KERN_INVALID_ARGUMENT;
2974 }
2975 }
2976
2977 if (entry_for_tpro) {
2978 /*
2979 * TPRO overrides the effective permissions of the region
2980 * and explicitly maps as RW. Ensure we have been passed
2981 * the expected permissions. We accept `cur_protections`
2982 * RO as that will be handled on fault.
2983 */
2984 if (!(max_protection & VM_PROT_READ) ||
2985 !(max_protection & VM_PROT_WRITE) ||
2986 !(cur_protection & VM_PROT_READ)) {
2987 return KERN_PROTECTION_FAILURE;
2988 }
2989
2990 /*
2991 * We can now downgrade the cur_protection to RO. This is a mild lie
2992 * to the VM layer. But TPRO will be responsible for toggling the
2993 * protections between RO/RW
2994 */
2995 cur_protection = VM_PROT_READ;
2996 }
2997
2998 if (is_submap) {
2999 vm_map_t submap;
3000 if (purgable) {
3001 /* submaps can not be purgeable */
3002 return KERN_INVALID_ARGUMENT;
3003 }
3004 if (object == VM_OBJECT_NULL) {
3005 /* submaps can not be created lazily */
3006 return KERN_INVALID_ARGUMENT;
3007 }
3008 submap = (vm_map_t) object;
3009 if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
3010 /* page size mismatch */
3011 return KERN_INVALID_ARGUMENT;
3012 }
3013 }
3014 if (vmk_flags.vmkf_already) {
3015 /*
3016 * VM_FLAGS_ALREADY says that it's OK if the same mapping
3017 * is already present. For it to be meaningul, the requested
3018 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
3019 * we shouldn't try and remove what was mapped there first
3020 * (!VM_FLAGS_OVERWRITE).
3021 */
3022 if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
3023 return KERN_INVALID_ARGUMENT;
3024 }
3025 }
3026
3027 if (size == 0 ||
3028 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
3029 *address = 0;
3030 return KERN_INVALID_ARGUMENT;
3031 }
3032
3033 if (map->pmap == kernel_pmap) {
3034 user_alias = VM_KERN_MEMORY_NONE;
3035 } else {
3036 user_alias = alias;
3037 }
3038
3039 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
3040 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
3041 }
3042
3043 #define RETURN(value) { result = value; goto BailOut; }
3044
3045 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
3046 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
3047 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
3048 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
3049 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
3050 }
3051
3052 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3053 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
3054 /*
3055 * In most cases, the caller rounds the size up to the
3056 * map's page size.
3057 * If we get a size that is explicitly not map-aligned here,
3058 * we'll have to respect the caller's wish and mark the
3059 * mapping as "not map-aligned" to avoid tripping the
3060 * map alignment checks later.
3061 */
3062 clear_map_aligned = TRUE;
3063 }
3064 if (!anywhere &&
3065 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
3066 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
3067 /*
3068 * We've been asked to map at a fixed address and that
3069 * address is not aligned to the map's specific alignment.
3070 * The caller should know what it's doing (i.e. most likely
3071 * mapping some fragmented copy map, transferring memory from
3072 * a VM map with a different alignment), so clear map_aligned
3073 * for this new VM map entry and proceed.
3074 */
3075 clear_map_aligned = TRUE;
3076 }
3077
3078 /*
3079 * Only zero-fill objects are allowed to be purgable.
3080 * LP64todo - limit purgable objects to 32-bits for now
3081 */
3082 if (purgable &&
3083 (offset != 0 ||
3084 (object != VM_OBJECT_NULL &&
3085 (object->vo_size != size ||
3086 object->purgable == VM_PURGABLE_DENY))
3087 #if __LP64__
3088 || size > ANON_MAX_SIZE
3089 #endif
3090 )) {
3091 return KERN_INVALID_ARGUMENT;
3092 }
3093
3094 vm_map_lock(map);
3095 map_locked = TRUE;
3096
3097 if (anywhere) {
3098 result = vm_map_locate_space_anywhere(map, size, mask, vmk_flags,
3099 address, &entry);
3100 start = *address;
3101 } else {
3102 start = *address;
3103 result = vm_map_locate_space_fixed(map, start, size, mask,
3104 vmk_flags, &entry, &zap_old_list);
3105 }
3106
3107 end = start + size;
3108
3109 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
3110
3111 /*
3112 * Check if what's already there is what we want.
3113 */
3114 if (result == KERN_MEMORY_PRESENT) {
3115 assert(!anywhere);
3116 if (!(vmk_flags.vmkf_already)) {
3117 RETURN(KERN_NO_SPACE);
3118 }
3119 tmp_start = start;
3120 tmp_offset = offset;
3121 if (entry->vme_start < start) {
3122 tmp_start -= start - entry->vme_start;
3123 tmp_offset -= start - entry->vme_start;
3124 }
3125 for (; entry->vme_start < end;
3126 entry = entry->vme_next) {
3127 /*
3128 * Check if the mapping's attributes
3129 * match the existing map entry.
3130 */
3131 if (entry == vm_map_to_entry(map) ||
3132 entry->vme_start != tmp_start ||
3133 entry->is_sub_map != is_submap ||
3134 VME_OFFSET(entry) != tmp_offset ||
3135 entry->needs_copy != needs_copy ||
3136 entry->protection != cur_protection ||
3137 entry->max_protection != max_protection ||
3138 entry->inheritance != inheritance ||
3139 entry->iokit_acct != iokit_acct ||
3140 VME_ALIAS(entry) != alias) {
3141 /* not the same mapping ! */
3142 RETURN(KERN_NO_SPACE);
3143 }
3144 /*
3145 * Check if the same object is being mapped.
3146 */
3147 if (is_submap) {
3148 if (VME_SUBMAP(entry) !=
3149 (vm_map_t) object) {
3150 /* not the same submap */
3151 RETURN(KERN_NO_SPACE);
3152 }
3153 } else {
3154 if (VME_OBJECT(entry) != object) {
3155 /* not the same VM object... */
3156 vm_object_t obj2;
3157
3158 obj2 = VME_OBJECT(entry);
3159 if ((obj2 == VM_OBJECT_NULL || obj2->internal) &&
3160 (object == VM_OBJECT_NULL || object->internal)) {
3161 /*
3162 * ... but both are
3163 * anonymous memory,
3164 * so equivalent.
3165 */
3166 } else {
3167 RETURN(KERN_NO_SPACE);
3168 }
3169 }
3170 }
3171
3172 tmp_offset += entry->vme_end - entry->vme_start;
3173 tmp_start += entry->vme_end - entry->vme_start;
3174 if (entry->vme_end >= end) {
3175 /* reached the end of our mapping */
3176 break;
3177 }
3178 }
3179 /* it all matches: let's use what's already there ! */
3180 RETURN(KERN_MEMORY_PRESENT);
3181 }
3182
3183 if (result != KERN_SUCCESS) {
3184 goto BailOut;
3185 }
3186
3187
3188 /*
3189 * At this point,
3190 * "start" and "end" should define the endpoints of the
3191 * available new range, and
3192 * "entry" should refer to the region before the new
3193 * range, and
3194 *
3195 * the map should be locked.
3196 */
3197
3198 /*
3199 * See whether we can avoid creating a new entry (and object) by
3200 * extending one of our neighbors. [So far, we only attempt to
3201 * extend from below.] Note that we can never extend/join
3202 * purgable objects because they need to remain distinct
3203 * entities in order to implement their "volatile object"
3204 * semantics.
3205 */
3206
3207 if (purgable ||
3208 entry_for_jit ||
3209 entry_for_tpro ||
3210 vm_memory_malloc_no_cow(user_alias)) {
3211 if (superpage_size) {
3212 /*
3213 * For "super page" allocations, we will allocate
3214 * special physically-contiguous VM objects later on,
3215 * so we should not have flags instructing us to create
3216 * a differently special VM object here.
3217 */
3218 RETURN(KERN_INVALID_ARGUMENT);
3219 }
3220
3221 if (object == VM_OBJECT_NULL) {
3222 assert(!superpage_size);
3223 object = vm_object_allocate(size);
3224 vm_object_lock(object);
3225 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3226 VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3227 if (malloc_no_cow_except_fork &&
3228 !purgable &&
3229 !entry_for_jit &&
3230 !entry_for_tpro &&
3231 vm_memory_malloc_no_cow(user_alias)) {
3232 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3233 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3234 }
3235 if (entry_for_jit) {
3236 object->vo_inherit_copy_none = true;
3237 }
3238 if (purgable) {
3239 task_t owner;
3240 VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3241 if (map->pmap == kernel_pmap) {
3242 /*
3243 * Purgeable mappings made in a kernel
3244 * map are "owned" by the kernel itself
3245 * rather than the current user task
3246 * because they're likely to be used by
3247 * more than this user task (see
3248 * execargs_purgeable_allocate(), for
3249 * example).
3250 */
3251 owner = kernel_task;
3252 } else {
3253 owner = current_task();
3254 }
3255 assert(object->vo_owner == NULL);
3256 assert(object->resident_page_count == 0);
3257 assert(object->wired_page_count == 0);
3258 vm_purgeable_nonvolatile_enqueue(object, owner);
3259 }
3260 vm_object_unlock(object);
3261 offset = (vm_object_offset_t)0;
3262 }
3263 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3264 /* no coalescing if address space uses sub-pages */
3265 } else if ((is_submap == FALSE) &&
3266 (object == VM_OBJECT_NULL) &&
3267 (entry != vm_map_to_entry(map)) &&
3268 (entry->vme_end == start) &&
3269 (!entry->is_shared) &&
3270 (!entry->is_sub_map) &&
3271 (!entry->in_transition) &&
3272 (!entry->needs_wakeup) &&
3273 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3274 (entry->protection == cur_protection) &&
3275 (entry->max_protection == max_protection) &&
3276 (entry->inheritance == inheritance) &&
3277 ((user_alias == VM_MEMORY_REALLOC) ||
3278 (VME_ALIAS(entry) == alias)) &&
3279 (entry->no_cache == no_cache) &&
3280 (entry->vme_permanent == permanent) &&
3281 /* no coalescing for immutable executable mappings */
3282 !((entry->protection & VM_PROT_EXECUTE) &&
3283 entry->vme_permanent) &&
3284 (!entry->superpage_size && !superpage_size) &&
3285 /*
3286 * No coalescing if not map-aligned, to avoid propagating
3287 * that condition any further than needed:
3288 */
3289 (!entry->map_aligned || !clear_map_aligned) &&
3290 (!entry->zero_wired_pages) &&
3291 (!entry->used_for_jit && !entry_for_jit) &&
3292 #if __arm64e__
3293 (!entry->used_for_tpro && !entry_for_tpro) &&
3294 #endif
3295 (!entry->csm_associated) &&
3296 (entry->iokit_acct == iokit_acct) &&
3297 (!entry->vme_resilient_codesign) &&
3298 (!entry->vme_resilient_media) &&
3299 (!entry->vme_atomic) &&
3300 (entry->vme_no_copy_on_read == no_copy_on_read) &&
3301
3302 ((entry->vme_end - entry->vme_start) + size <=
3303 (user_alias == VM_MEMORY_REALLOC ?
3304 ANON_CHUNK_SIZE :
3305 NO_COALESCE_LIMIT)) &&
3306
3307 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
3308 if (vm_object_coalesce(VME_OBJECT(entry),
3309 VM_OBJECT_NULL,
3310 VME_OFFSET(entry),
3311 (vm_object_offset_t) 0,
3312 (vm_map_size_t)(entry->vme_end - entry->vme_start),
3313 (vm_map_size_t)(end - entry->vme_end))) {
3314 /*
3315 * Coalesced the two objects - can extend
3316 * the previous map entry to include the
3317 * new range.
3318 */
3319 map->size += (end - entry->vme_end);
3320 assert(entry->vme_start < end);
3321 assert(VM_MAP_PAGE_ALIGNED(end,
3322 VM_MAP_PAGE_MASK(map)));
3323 if (__improbable(vm_debug_events)) {
3324 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3325 }
3326 entry->vme_end = end;
3327 if (map->holelistenabled) {
3328 vm_map_store_update_first_free(map, entry, TRUE);
3329 } else {
3330 vm_map_store_update_first_free(map, map->first_free, TRUE);
3331 }
3332 new_mapping_established = TRUE;
3333 RETURN(KERN_SUCCESS);
3334 }
3335 }
3336
3337 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3338 new_entry = NULL;
3339
3340 if (vmk_flags.vmkf_submap_adjust) {
3341 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3342 offset = start;
3343 }
3344
3345 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3346 tmp2_end = tmp2_start + step;
3347 /*
3348 * Create a new entry
3349 *
3350 * XXX FBDP
3351 * The reserved "page zero" in each process's address space can
3352 * be arbitrarily large. Splitting it into separate objects and
3353 * therefore different VM map entries serves no purpose and just
3354 * slows down operations on the VM map, so let's not split the
3355 * allocation into chunks if the max protection is NONE. That
3356 * memory should never be accessible, so it will never get to the
3357 * default pager.
3358 */
3359 tmp_start = tmp2_start;
3360 if (!is_submap &&
3361 object == VM_OBJECT_NULL &&
3362 size > chunk_size &&
3363 max_protection != VM_PROT_NONE &&
3364 superpage_size == 0) {
3365 tmp_end = tmp_start + chunk_size;
3366 } else {
3367 tmp_end = tmp2_end;
3368 }
3369 do {
3370 if (!is_submap &&
3371 object != VM_OBJECT_NULL &&
3372 object->internal &&
3373 offset + (tmp_end - tmp_start) > object->vo_size) {
3374 // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3375 DTRACE_VM5(vm_map_enter_overmap,
3376 vm_map_t, map,
3377 vm_map_address_t, tmp_start,
3378 vm_map_address_t, tmp_end,
3379 vm_object_offset_t, offset,
3380 vm_object_size_t, object->vo_size);
3381 }
3382 new_entry = vm_map_entry_insert(map,
3383 entry, tmp_start, tmp_end,
3384 object, offset, vmk_flags,
3385 needs_copy,
3386 cur_protection, max_protection,
3387 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3388 VM_INHERIT_NONE : inheritance),
3389 clear_map_aligned);
3390
3391 assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3392
3393 if (resilient_codesign) {
3394 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3395 if (!((cur_protection | max_protection) & reject_prot)) {
3396 new_entry->vme_resilient_codesign = TRUE;
3397 }
3398 }
3399
3400 if (resilient_media &&
3401 (object == VM_OBJECT_NULL ||
3402 object->internal)) {
3403 new_entry->vme_resilient_media = TRUE;
3404 }
3405
3406 assert(!new_entry->iokit_acct);
3407 if (!is_submap &&
3408 object != VM_OBJECT_NULL &&
3409 object->internal &&
3410 (object->purgable != VM_PURGABLE_DENY ||
3411 object->vo_ledger_tag)) {
3412 assert(new_entry->use_pmap);
3413 assert(!new_entry->iokit_acct);
3414 /*
3415 * Turn off pmap accounting since
3416 * purgeable (or tagged) objects have their
3417 * own ledgers.
3418 */
3419 new_entry->use_pmap = FALSE;
3420 } else if (!is_submap &&
3421 iokit_acct &&
3422 object != VM_OBJECT_NULL &&
3423 object->internal) {
3424 /* alternate accounting */
3425 assert(!new_entry->iokit_acct);
3426 assert(new_entry->use_pmap);
3427 new_entry->iokit_acct = TRUE;
3428 new_entry->use_pmap = FALSE;
3429 DTRACE_VM4(
3430 vm_map_iokit_mapped_region,
3431 vm_map_t, map,
3432 vm_map_offset_t, new_entry->vme_start,
3433 vm_map_offset_t, new_entry->vme_end,
3434 int, VME_ALIAS(new_entry));
3435 vm_map_iokit_mapped_region(
3436 map,
3437 (new_entry->vme_end -
3438 new_entry->vme_start));
3439 } else if (!is_submap) {
3440 assert(!new_entry->iokit_acct);
3441 assert(new_entry->use_pmap);
3442 }
3443
3444 if (is_submap) {
3445 vm_map_t submap;
3446 boolean_t submap_is_64bit;
3447 boolean_t use_pmap;
3448
3449 assert(new_entry->is_sub_map);
3450 assert(!new_entry->use_pmap);
3451 assert(!new_entry->iokit_acct);
3452 submap = (vm_map_t) object;
3453 submap_is_64bit = vm_map_is_64bit(submap);
3454 use_pmap = vmk_flags.vmkf_nested_pmap;
3455 #ifndef NO_NESTED_PMAP
3456 if (use_pmap && submap->pmap == NULL) {
3457 ledger_t ledger = map->pmap->ledger;
3458 /* we need a sub pmap to nest... */
3459 submap->pmap = pmap_create_options(ledger, 0,
3460 submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3461 if (submap->pmap == NULL) {
3462 /* let's proceed without nesting... */
3463 }
3464 #if defined(__arm64__)
3465 else {
3466 pmap_set_nested(submap->pmap);
3467 }
3468 #endif
3469 }
3470 if (use_pmap && submap->pmap != NULL) {
3471 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3472 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3473 kr = KERN_FAILURE;
3474 } else {
3475 kr = pmap_nest(map->pmap,
3476 submap->pmap,
3477 tmp_start,
3478 tmp_end - tmp_start);
3479 }
3480 if (kr != KERN_SUCCESS) {
3481 printf("vm_map_enter: "
3482 "pmap_nest(0x%llx,0x%llx) "
3483 "error 0x%x\n",
3484 (long long)tmp_start,
3485 (long long)tmp_end,
3486 kr);
3487 } else {
3488 /* we're now nested ! */
3489 new_entry->use_pmap = TRUE;
3490 pmap_empty = FALSE;
3491 }
3492 }
3493 #endif /* NO_NESTED_PMAP */
3494 }
3495 entry = new_entry;
3496
3497 if (superpage_size) {
3498 vm_page_t pages, m;
3499 vm_object_t sp_object;
3500 vm_object_offset_t sp_offset;
3501
3502 assert(object == VM_OBJECT_NULL);
3503 VME_OFFSET_SET(entry, 0);
3504
3505 /* allocate one superpage */
3506 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3507 if (kr != KERN_SUCCESS) {
3508 /* deallocate whole range... */
3509 new_mapping_established = TRUE;
3510 /* ... but only up to "tmp_end" */
3511 size -= end - tmp_end;
3512 RETURN(kr);
3513 }
3514
3515 /* create one vm_object per superpage */
3516 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3517 vm_object_lock(sp_object);
3518 sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3519 VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3520 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3521 VME_OBJECT_SET(entry, sp_object, false, 0);
3522 assert(entry->use_pmap);
3523
3524 /* enter the base pages into the object */
3525 for (sp_offset = 0;
3526 sp_offset < SUPERPAGE_SIZE;
3527 sp_offset += PAGE_SIZE) {
3528 m = pages;
3529 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3530 pages = NEXT_PAGE(m);
3531 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3532 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3533 }
3534 vm_object_unlock(sp_object);
3535 }
3536 } while (tmp_end != tmp2_end &&
3537 (tmp_start = tmp_end) &&
3538 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3539 tmp_end + chunk_size : tmp2_end));
3540 }
3541
3542 new_mapping_established = TRUE;
3543
3544 BailOut:
3545 assert(map_locked == TRUE);
3546
3547 /*
3548 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3549 * If we have identified and possibly established the new mapping(s),
3550 * make sure we did not go beyond the address space limit.
3551 */
3552 if (result == KERN_SUCCESS) {
3553 if (map->size_limit != RLIM_INFINITY &&
3554 map->size > map->size_limit) {
3555 /*
3556 * Establishing the requested mappings would exceed
3557 * the process's RLIMIT_AS limit: fail with
3558 * KERN_NO_SPACE.
3559 */
3560 result = KERN_NO_SPACE;
3561 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3562 proc_selfpid(),
3563 (get_bsdtask_info(current_task())
3564 ? proc_name_address(get_bsdtask_info(current_task()))
3565 : "?"),
3566 __FUNCTION__,
3567 (uint64_t) map->size,
3568 (uint64_t) map->size_limit);
3569 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3570 vm_map_size_t, map->size,
3571 uint64_t, map->size_limit);
3572 vm_map_enter_RLIMIT_AS_count++;
3573 } else if (map->data_limit != RLIM_INFINITY &&
3574 map->size > map->data_limit) {
3575 /*
3576 * Establishing the requested mappings would exceed
3577 * the process's RLIMIT_DATA limit: fail with
3578 * KERN_NO_SPACE.
3579 */
3580 result = KERN_NO_SPACE;
3581 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3582 proc_selfpid(),
3583 (get_bsdtask_info(current_task())
3584 ? proc_name_address(get_bsdtask_info(current_task()))
3585 : "?"),
3586 __FUNCTION__,
3587 (uint64_t) map->size,
3588 (uint64_t) map->data_limit);
3589 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3590 vm_map_size_t, map->size,
3591 uint64_t, map->data_limit);
3592 vm_map_enter_RLIMIT_DATA_count++;
3593 }
3594 }
3595
3596 if (result == KERN_SUCCESS) {
3597 vm_prot_t pager_prot;
3598 memory_object_t pager;
3599
3600 #if DEBUG
3601 if (pmap_empty &&
3602 !(vmk_flags.vmkf_no_pmap_check)) {
3603 assert(pmap_is_empty(map->pmap,
3604 *address,
3605 *address + size));
3606 }
3607 #endif /* DEBUG */
3608
3609 /*
3610 * For "named" VM objects, let the pager know that the
3611 * memory object is being mapped. Some pagers need to keep
3612 * track of this, to know when they can reclaim the memory
3613 * object, for example.
3614 * VM calls memory_object_map() for each mapping (specifying
3615 * the protection of each mapping) and calls
3616 * memory_object_last_unmap() when all the mappings are gone.
3617 */
3618 pager_prot = max_protection;
3619 if (needs_copy) {
3620 /*
3621 * Copy-On-Write mapping: won't modify
3622 * the memory object.
3623 */
3624 pager_prot &= ~VM_PROT_WRITE;
3625 }
3626 if (!is_submap &&
3627 object != VM_OBJECT_NULL &&
3628 object->named &&
3629 object->pager != MEMORY_OBJECT_NULL) {
3630 vm_object_lock(object);
3631 pager = object->pager;
3632 if (object->named &&
3633 pager != MEMORY_OBJECT_NULL) {
3634 assert(object->pager_ready);
3635 vm_object_mapping_wait(object, THREAD_UNINT);
3636 vm_object_mapping_begin(object);
3637 vm_object_unlock(object);
3638
3639 kr = memory_object_map(pager, pager_prot);
3640 assert(kr == KERN_SUCCESS);
3641
3642 vm_object_lock(object);
3643 vm_object_mapping_end(object);
3644 }
3645 vm_object_unlock(object);
3646 }
3647 }
3648
3649 assert(map_locked == TRUE);
3650
3651 if (new_mapping_established) {
3652 /*
3653 * If we release the map lock for any reason below,
3654 * another thread could deallocate our new mapping,
3655 * releasing the caller's reference on "caller_object",
3656 * which was transferred to the mapping.
3657 * If this was the only reference, the object could be
3658 * destroyed.
3659 *
3660 * We need to take an extra reference on "caller_object"
3661 * to keep it alive if we need to return the caller's
3662 * reference to the caller in case of failure.
3663 */
3664 if (is_submap) {
3665 vm_map_reference((vm_map_t)caller_object);
3666 } else {
3667 vm_object_reference(caller_object);
3668 }
3669 }
3670
3671 if (!keep_map_locked) {
3672 vm_map_unlock(map);
3673 map_locked = FALSE;
3674 entry = VM_MAP_ENTRY_NULL;
3675 new_entry = VM_MAP_ENTRY_NULL;
3676 }
3677
3678 /*
3679 * We can't hold the map lock if we enter this block.
3680 */
3681
3682 if (result == KERN_SUCCESS) {
3683 /* Wire down the new entry if the user
3684 * requested all new map entries be wired.
3685 */
3686 if ((map->wiring_required) || (superpage_size)) {
3687 assert(!keep_map_locked);
3688 pmap_empty = FALSE; /* pmap won't be empty */
3689 kr = vm_map_wire_nested(map, start, end,
3690 cur_protection, VM_KERN_MEMORY_MLOCK,
3691 TRUE, PMAP_NULL, 0, NULL);
3692 result = kr;
3693 }
3694
3695 }
3696
3697 if (result != KERN_SUCCESS) {
3698 if (new_mapping_established) {
3699 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3700
3701 /*
3702 * We have to get rid of the new mappings since we
3703 * won't make them available to the user.
3704 * Try and do that atomically, to minimize the risk
3705 * that someone else create new mappings that range.
3706 */
3707 if (!map_locked) {
3708 vm_map_lock(map);
3709 map_locked = TRUE;
3710 }
3711 remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3712 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3713 if (permanent) {
3714 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3715 }
3716 (void) vm_map_delete(map,
3717 *address, *address + size,
3718 remove_flags,
3719 KMEM_GUARD_NONE, &zap_new_list);
3720 }
3721
3722 if (vm_map_zap_first_entry(&zap_old_list)) {
3723 vm_map_entry_t entry1, entry2;
3724
3725 /*
3726 * The new mapping failed. Attempt to restore
3727 * the old mappings, saved in the "zap_old_map".
3728 */
3729 if (!map_locked) {
3730 vm_map_lock(map);
3731 map_locked = TRUE;
3732 }
3733
3734 /* first check if the coast is still clear */
3735 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3736 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3737
3738 if (vm_map_lookup_entry(map, start, &entry1) ||
3739 vm_map_lookup_entry(map, end, &entry2) ||
3740 entry1 != entry2) {
3741 /*
3742 * Part of that range has already been
3743 * re-mapped: we can't restore the old
3744 * mappings...
3745 */
3746 vm_map_enter_restore_failures++;
3747 } else {
3748 /*
3749 * Transfer the saved map entries from
3750 * "zap_old_map" to the original "map",
3751 * inserting them all after "entry1".
3752 */
3753 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3754 vm_map_size_t entry_size;
3755
3756 entry_size = (entry2->vme_end -
3757 entry2->vme_start);
3758 vm_map_store_entry_link(map, entry1, entry2,
3759 VM_MAP_KERNEL_FLAGS_NONE);
3760 map->size += entry_size;
3761 entry1 = entry2;
3762 }
3763 if (map->wiring_required) {
3764 /*
3765 * XXX TODO: we should rewire the
3766 * old pages here...
3767 */
3768 }
3769 vm_map_enter_restore_successes++;
3770 }
3771 }
3772 }
3773
3774 /*
3775 * The caller is responsible for releasing the lock if it requested to
3776 * keep the map locked.
3777 */
3778 if (map_locked && !keep_map_locked) {
3779 vm_map_unlock(map);
3780 }
3781
3782 vm_map_zap_dispose(&zap_old_list);
3783 vm_map_zap_dispose(&zap_new_list);
3784
3785 if (new_mapping_established) {
3786 /*
3787 * The caller had a reference on "caller_object" and we
3788 * transferred that reference to the mapping.
3789 * We also took an extra reference on "caller_object" to keep
3790 * it alive while the map was unlocked.
3791 */
3792 if (result == KERN_SUCCESS) {
3793 /*
3794 * On success, the caller's reference on the object gets
3795 * tranferred to the mapping.
3796 * Release our extra reference.
3797 */
3798 if (is_submap) {
3799 vm_map_deallocate((vm_map_t)caller_object);
3800 } else {
3801 vm_object_deallocate(caller_object);
3802 }
3803 } else {
3804 /*
3805 * On error, the caller expects to still have a
3806 * reference on the object it gave us.
3807 * Let's use our extra reference for that.
3808 */
3809 }
3810 }
3811
3812 return result;
3813
3814 #undef RETURN
3815 }
3816
3817 /*
3818 * Counters for the prefault optimization.
3819 */
3820 int64_t vm_prefault_nb_pages = 0;
3821 int64_t vm_prefault_nb_bailout = 0;
3822
3823 static kern_return_t
vm_map_enter_adjust_offset(vm_object_offset_t * obj_offs,vm_object_offset_t * obj_end,vm_object_offset_t quantity)3824 vm_map_enter_adjust_offset(
3825 vm_object_offset_t *obj_offs,
3826 vm_object_offset_t *obj_end,
3827 vm_object_offset_t quantity)
3828 {
3829 if (os_add_overflow(*obj_offs, quantity, obj_offs) ||
3830 os_add_overflow(*obj_end, quantity, obj_end) ||
3831 vm_map_round_page_mask(*obj_end, PAGE_MASK) == 0) {
3832 return KERN_INVALID_ARGUMENT;
3833 }
3834
3835 return KERN_SUCCESS;
3836 }
3837
3838 static inline kern_return_t
vm_map_enter_mem_object_sanitize(vm_map_t target_map,vm_map_offset_ut address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_object_offset_ut offset_u,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_map_address_t * map_addr,vm_map_size_t * map_size,vm_map_offset_t * mask,vm_object_offset_t * obj_offs,vm_object_offset_t * obj_end,vm_object_size_t * obj_size,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t * inheritance)3839 vm_map_enter_mem_object_sanitize(
3840 vm_map_t target_map,
3841 vm_map_offset_ut address_u,
3842 vm_map_size_ut initial_size_u,
3843 vm_map_offset_ut mask_u,
3844 vm_object_offset_ut offset_u,
3845 vm_prot_ut cur_protection_u,
3846 vm_prot_ut max_protection_u,
3847 vm_inherit_ut inheritance_u,
3848 vm_map_kernel_flags_t vmk_flags,
3849 ipc_port_t port,
3850 vm_map_address_t *map_addr,
3851 vm_map_size_t *map_size,
3852 vm_map_offset_t *mask,
3853 vm_object_offset_t *obj_offs,
3854 vm_object_offset_t *obj_end,
3855 vm_object_size_t *obj_size,
3856 vm_prot_t *cur_protection,
3857 vm_prot_t *max_protection,
3858 vm_inherit_t *inheritance)
3859 {
3860 kern_return_t result;
3861
3862 result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
3863 VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3864 VM_PROT_IS_MASK, cur_protection,
3865 max_protection);
3866 if (__improbable(result != KERN_SUCCESS)) {
3867 return result;
3868 }
3869
3870 result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3871 inheritance);
3872 if (__improbable(result != KERN_SUCCESS)) {
3873 return result;
3874 }
3875
3876 result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ, mask);
3877 if (__improbable(result != KERN_SUCCESS)) {
3878 return result;
3879 }
3880
3881 if (vmk_flags.vmf_fixed) {
3882 vm_map_address_t map_end;
3883
3884 result = vm_sanitize_addr_size(address_u, initial_size_u,
3885 VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3886 target_map,
3887 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_REALIGN_START,
3888 map_addr, &map_end, map_size);
3889 if (__improbable(result != KERN_SUCCESS)) {
3890 return result;
3891 }
3892 } else {
3893 *map_addr = vm_sanitize_addr(target_map, address_u);
3894 result = vm_sanitize_size(0, initial_size_u,
3895 VM_SANITIZE_CALLER_ENTER_MEM_OBJ, target_map,
3896 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
3897 if (__improbable(result != KERN_SUCCESS)) {
3898 return result;
3899 }
3900 }
3901
3902 *obj_size = vm_object_round_page(*map_size);
3903 if (__improbable(*obj_size == 0)) {
3904 return KERN_INVALID_ARGUMENT;
3905 }
3906
3907 if (IP_VALID(port)) {
3908 result = vm_sanitize_addr_size(offset_u, *obj_size,
3909 VM_SANITIZE_CALLER_ENTER_MEM_OBJ,
3910 PAGE_MASK,
3911 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
3912 obj_offs, obj_end, obj_size);
3913 if (__improbable(result != KERN_SUCCESS)) {
3914 return result;
3915 }
3916 } else {
3917 *obj_offs = 0;
3918 *obj_end = *obj_size;
3919 }
3920
3921 return KERN_SUCCESS;
3922 }
3923
3924 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_ut * address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_ut offset_u,boolean_t copy,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,upl_page_list_ptr_t page_list,unsigned int page_list_count)3925 vm_map_enter_mem_object(
3926 vm_map_t target_map,
3927 vm_map_offset_ut *address_u,
3928 vm_map_size_ut initial_size_u,
3929 vm_map_offset_ut mask_u,
3930 vm_map_kernel_flags_t vmk_flags,
3931 ipc_port_t port,
3932 vm_object_offset_ut offset_u,
3933 boolean_t copy,
3934 vm_prot_ut cur_protection_u,
3935 vm_prot_ut max_protection_u,
3936 vm_inherit_ut inheritance_u,
3937 upl_page_list_ptr_t page_list,
3938 unsigned int page_list_count)
3939 {
3940 vm_map_offset_t mask, address;
3941 vm_prot_t cur_protection;
3942 vm_prot_t max_protection;
3943 vm_inherit_t inheritance;
3944 vm_map_address_t map_addr, map_mask;
3945 vm_map_size_t map_size;
3946 vm_object_t object = VM_OBJECT_NULL;
3947 vm_object_offset_t obj_offs, obj_end;
3948 vm_object_size_t obj_size;
3949 kern_return_t result;
3950 boolean_t mask_cur_protection, mask_max_protection;
3951 boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
3952 vm_map_offset_t offset_in_mapping = 0;
3953
3954 if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
3955 /* XXX TODO4K prefaulting depends on page size... */
3956 try_prefault = FALSE;
3957 }
3958
3959 /*
3960 * Check arguments for validity
3961 */
3962 if ((target_map == VM_MAP_NULL) ||
3963 (try_prefault && (copy || !page_list))) {
3964 return KERN_INVALID_ARGUMENT;
3965 }
3966
3967 map_mask = vm_map_page_mask(target_map);
3968
3969 /*
3970 * Sanitize any input parameters that are addr/size/prot/inherit
3971 */
3972 result = vm_map_enter_mem_object_sanitize(
3973 target_map,
3974 *address_u,
3975 initial_size_u,
3976 mask_u,
3977 offset_u,
3978 cur_protection_u,
3979 max_protection_u,
3980 inheritance_u,
3981 vmk_flags,
3982 port,
3983 &map_addr,
3984 &map_size,
3985 &mask,
3986 &obj_offs,
3987 &obj_end,
3988 &obj_size,
3989 &cur_protection,
3990 &max_protection,
3991 &inheritance);
3992 if (__improbable(result != KERN_SUCCESS)) {
3993 return vm_sanitize_get_kr(result);
3994 }
3995
3996 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
3997 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, map_size);
3998
3999 mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4000 mask_max_protection = max_protection & VM_PROT_IS_MASK;
4001 cur_protection &= ~VM_PROT_IS_MASK;
4002 max_protection &= ~VM_PROT_IS_MASK;
4003
4004 #if __arm64__
4005 if (cur_protection & VM_PROT_EXECUTE) {
4006 cur_protection |= VM_PROT_READ;
4007 }
4008 #endif /* __arm64__ */
4009
4010 /*
4011 * Find the vm object (if any) corresponding to this port.
4012 */
4013 if (!IP_VALID(port)) {
4014 object = VM_OBJECT_NULL;
4015 copy = FALSE;
4016 } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4017 vm_named_entry_t named_entry;
4018 vm_object_offset_t data_offset;
4019 vm_object_size_t initial_size;
4020
4021 named_entry = mach_memory_entry_from_port(port);
4022
4023 if (vmk_flags.vmf_return_data_addr ||
4024 vmk_flags.vmf_return_4k_data_addr) {
4025 data_offset = named_entry->data_offset;
4026 result = vm_map_enter_adjust_offset(&obj_offs,
4027 &obj_end, data_offset);
4028 if (__improbable(result)) {
4029 return result;
4030 }
4031 } else {
4032 data_offset = 0;
4033 }
4034
4035 /* a few checks to make sure user is obeying rules */
4036 if (mask_max_protection) {
4037 max_protection &= named_entry->protection;
4038 }
4039 if (mask_cur_protection) {
4040 cur_protection &= named_entry->protection;
4041 }
4042 if ((named_entry->protection & max_protection) !=
4043 max_protection) {
4044 return KERN_INVALID_RIGHT;
4045 }
4046 if ((named_entry->protection & cur_protection) !=
4047 cur_protection) {
4048 return KERN_INVALID_RIGHT;
4049 }
4050
4051 /*
4052 * unwrap is safe because we know obj_size is larger and doesn't
4053 * overflow
4054 */
4055 initial_size = VM_SANITIZE_UNSAFE_UNWRAP(initial_size_u);
4056 if (named_entry->size < obj_offs + initial_size) {
4057 return KERN_INVALID_ARGUMENT;
4058 }
4059
4060 /* for a vm_map_copy, we can only map it whole */
4061 if (named_entry->is_copy &&
4062 (obj_size != named_entry->size) &&
4063 (vm_map_round_page(obj_size, map_mask) == named_entry->size)) {
4064 /* XXX FBDP use the rounded size... */
4065 obj_end += named_entry->size - obj_size;
4066 obj_size = named_entry->size;
4067 }
4068
4069 if (named_entry->offset) {
4070 /*
4071 * the callers parameter offset is defined to be the
4072 * offset from beginning of named entry offset in object
4073 *
4074 * Because we checked above that
4075 * obj_offs + obj_size < named_entry_size
4076 * these overflow checks should be redundant...
4077 */
4078 result = vm_map_enter_adjust_offset(&obj_offs,
4079 &obj_end, named_entry->offset);
4080 if (__improbable(result)) {
4081 return result;
4082 }
4083 }
4084
4085 if (!VM_MAP_PAGE_ALIGNED(obj_size, map_mask)) {
4086 /*
4087 * Let's not map more than requested;
4088 * vm_map_enter() will handle this "not map-aligned"
4089 * case.
4090 */
4091 map_size = obj_size;
4092 }
4093
4094 named_entry_lock(named_entry);
4095 if (named_entry->is_sub_map) {
4096 vm_map_t submap;
4097
4098 if (vmk_flags.vmf_return_data_addr ||
4099 vmk_flags.vmf_return_4k_data_addr) {
4100 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4101 }
4102
4103 submap = named_entry->backing.map;
4104 vm_map_reference(submap);
4105 named_entry_unlock(named_entry);
4106
4107 vmk_flags.vmkf_submap = TRUE;
4108 result = vm_map_enter(target_map,
4109 &map_addr,
4110 map_size,
4111 mask,
4112 vmk_flags,
4113 (vm_object_t)(uintptr_t) submap,
4114 obj_offs,
4115 copy,
4116 cur_protection,
4117 max_protection,
4118 inheritance);
4119 if (result != KERN_SUCCESS) {
4120 vm_map_deallocate(submap);
4121 return result;
4122 }
4123 /*
4124 * No need to lock "submap" just to check its
4125 * "mapped" flag: that flag is never reset
4126 * once it's been set and if we race, we'll
4127 * just end up setting it twice, which is OK.
4128 */
4129 if (submap->mapped_in_other_pmaps == FALSE &&
4130 vm_map_pmap(submap) != PMAP_NULL &&
4131 vm_map_pmap(submap) !=
4132 vm_map_pmap(target_map)) {
4133 /*
4134 * This submap is being mapped in a map
4135 * that uses a different pmap.
4136 * Set its "mapped_in_other_pmaps" flag
4137 * to indicate that we now need to
4138 * remove mappings from all pmaps rather
4139 * than just the submap's pmap.
4140 */
4141 vm_map_lock(submap);
4142 submap->mapped_in_other_pmaps = TRUE;
4143 vm_map_unlock(submap);
4144 }
4145 address = map_addr;
4146 goto out;
4147 } else if (named_entry->is_copy) {
4148 kern_return_t kr;
4149 vm_map_copy_t copy_map;
4150 vm_map_entry_t copy_entry;
4151 vm_map_offset_t copy_addr;
4152 vm_map_copy_t target_copy_map;
4153 vm_map_offset_t overmap_start, overmap_end;
4154 vm_map_offset_t trimmed_start;
4155 vm_map_size_t target_size;
4156
4157 if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4158 (VM_FLAGS_FIXED |
4159 VM_FLAGS_ANYWHERE |
4160 VM_FLAGS_OVERWRITE |
4161 VM_FLAGS_RETURN_4K_DATA_ADDR |
4162 VM_FLAGS_RETURN_DATA_ADDR))) {
4163 named_entry_unlock(named_entry);
4164 return KERN_INVALID_ARGUMENT;
4165 }
4166
4167 copy_map = named_entry->backing.copy;
4168 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4169 if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4170 /* unsupported type; should not happen */
4171 printf("vm_map_enter_mem_object: "
4172 "memory_entry->backing.copy "
4173 "unsupported type 0x%x\n",
4174 copy_map->type);
4175 named_entry_unlock(named_entry);
4176 return KERN_INVALID_ARGUMENT;
4177 }
4178
4179 if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4180 DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, obj_offs, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4181 }
4182
4183 if (vmk_flags.vmf_return_data_addr ||
4184 vmk_flags.vmf_return_4k_data_addr) {
4185 offset_in_mapping = obj_offs & map_mask;
4186 if (vmk_flags.vmf_return_4k_data_addr) {
4187 offset_in_mapping &= ~((signed)(0xFFF));
4188 }
4189 }
4190
4191 target_copy_map = VM_MAP_COPY_NULL;
4192 target_size = copy_map->size;
4193 overmap_start = 0;
4194 overmap_end = 0;
4195 trimmed_start = 0;
4196 if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4197 DEBUG4K_ADJUST("adjusting...\n");
4198 kr = vm_map_copy_adjust_to_target(
4199 copy_map,
4200 obj_offs /* includes data_offset */,
4201 initial_size,
4202 target_map,
4203 copy,
4204 &target_copy_map,
4205 &overmap_start,
4206 &overmap_end,
4207 &trimmed_start);
4208 if (kr != KERN_SUCCESS) {
4209 named_entry_unlock(named_entry);
4210 return kr;
4211 }
4212 target_size = target_copy_map->size;
4213 if (trimmed_start >= data_offset) {
4214 data_offset = obj_offs & VM_MAP_PAGE_MASK(target_map);
4215 } else {
4216 data_offset -= trimmed_start;
4217 }
4218 } else {
4219 /*
4220 * Assert that the vm_map_copy is coming from the right
4221 * zone and hasn't been forged
4222 */
4223 vm_map_copy_require(copy_map);
4224 target_copy_map = copy_map;
4225 }
4226
4227 vm_map_kernel_flags_t rsv_flags = vmk_flags;
4228
4229 vm_map_kernel_flags_and_vmflags(&rsv_flags,
4230 (VM_FLAGS_FIXED |
4231 VM_FLAGS_ANYWHERE |
4232 VM_FLAGS_OVERWRITE |
4233 VM_FLAGS_RETURN_4K_DATA_ADDR |
4234 VM_FLAGS_RETURN_DATA_ADDR));
4235
4236 /* reserve a contiguous range */
4237 kr = vm_map_enter(target_map,
4238 &map_addr,
4239 vm_map_round_page(target_size, map_mask),
4240 mask,
4241 rsv_flags,
4242 VM_OBJECT_NULL,
4243 0,
4244 FALSE, /* copy */
4245 cur_protection,
4246 max_protection,
4247 inheritance);
4248 if (kr != KERN_SUCCESS) {
4249 DEBUG4K_ERROR("kr 0x%x\n", kr);
4250 if (target_copy_map != copy_map) {
4251 vm_map_copy_discard(target_copy_map);
4252 target_copy_map = VM_MAP_COPY_NULL;
4253 }
4254 named_entry_unlock(named_entry);
4255 return kr;
4256 }
4257
4258 copy_addr = map_addr;
4259
4260 for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4261 copy_entry != vm_map_copy_to_entry(target_copy_map);
4262 copy_entry = copy_entry->vme_next) {
4263 vm_map_t copy_submap = VM_MAP_NULL;
4264 vm_object_t copy_object = VM_OBJECT_NULL;
4265 vm_map_size_t copy_size;
4266 vm_object_offset_t copy_offset;
4267 boolean_t do_copy = false;
4268
4269 if (copy_entry->is_sub_map) {
4270 copy_submap = VME_SUBMAP(copy_entry);
4271 copy_object = (vm_object_t)copy_submap;
4272 } else {
4273 copy_object = VME_OBJECT(copy_entry);
4274 }
4275 copy_offset = VME_OFFSET(copy_entry);
4276 copy_size = (copy_entry->vme_end -
4277 copy_entry->vme_start);
4278
4279 /* sanity check */
4280 if ((copy_addr + copy_size) >
4281 (map_addr +
4282 overmap_start + overmap_end +
4283 named_entry->size /* XXX full size */)) {
4284 /* over-mapping too much !? */
4285 kr = KERN_INVALID_ARGUMENT;
4286 DEBUG4K_ERROR("kr 0x%x\n", kr);
4287 /* abort */
4288 break;
4289 }
4290
4291 /* take a reference on the object */
4292 if (copy_entry->is_sub_map) {
4293 vm_map_reference(copy_submap);
4294 } else {
4295 if (!copy &&
4296 copy_object != VM_OBJECT_NULL &&
4297 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4298 bool is_writable;
4299
4300 /*
4301 * We need to resolve our side of this
4302 * "symmetric" copy-on-write now; we
4303 * need a new object to map and share,
4304 * instead of the current one which
4305 * might still be shared with the
4306 * original mapping.
4307 *
4308 * Note: A "vm_map_copy_t" does not
4309 * have a lock but we're protected by
4310 * the named entry's lock here.
4311 */
4312 // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4313 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
4314 assert(copy_object != VME_OBJECT(copy_entry));
4315 is_writable = false;
4316 if (copy_entry->protection & VM_PROT_WRITE) {
4317 is_writable = true;
4318 #if __arm64e__
4319 } else if (copy_entry->used_for_tpro) {
4320 is_writable = true;
4321 #endif /* __arm64e__ */
4322 }
4323 if (!copy_entry->needs_copy && is_writable) {
4324 vm_prot_t prot;
4325
4326 prot = copy_entry->protection & ~VM_PROT_WRITE;
4327 vm_object_pmap_protect(copy_object,
4328 copy_offset,
4329 copy_size,
4330 PMAP_NULL,
4331 PAGE_SIZE,
4332 0,
4333 prot);
4334 }
4335 copy_entry->needs_copy = FALSE;
4336 copy_entry->is_shared = TRUE;
4337 copy_object = VME_OBJECT(copy_entry);
4338 copy_offset = VME_OFFSET(copy_entry);
4339 vm_object_lock(copy_object);
4340 /* we're about to make a shared mapping of this object */
4341 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4342 VM_OBJECT_SET_TRUE_SHARE(copy_object, TRUE);
4343 vm_object_unlock(copy_object);
4344 }
4345
4346 if (copy_object != VM_OBJECT_NULL &&
4347 copy_object->named &&
4348 copy_object->pager != MEMORY_OBJECT_NULL &&
4349 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4350 memory_object_t pager;
4351 vm_prot_t pager_prot;
4352
4353 /*
4354 * For "named" VM objects, let the pager know that the
4355 * memory object is being mapped. Some pagers need to keep
4356 * track of this, to know when they can reclaim the memory
4357 * object, for example.
4358 * VM calls memory_object_map() for each mapping (specifying
4359 * the protection of each mapping) and calls
4360 * memory_object_last_unmap() when all the mappings are gone.
4361 */
4362 pager_prot = max_protection;
4363 if (copy) {
4364 /*
4365 * Copy-On-Write mapping: won't modify the
4366 * memory object.
4367 */
4368 pager_prot &= ~VM_PROT_WRITE;
4369 }
4370 vm_object_lock(copy_object);
4371 pager = copy_object->pager;
4372 if (copy_object->named &&
4373 pager != MEMORY_OBJECT_NULL &&
4374 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4375 assert(copy_object->pager_ready);
4376 vm_object_mapping_wait(copy_object, THREAD_UNINT);
4377 vm_object_mapping_begin(copy_object);
4378 vm_object_unlock(copy_object);
4379
4380 kr = memory_object_map(pager, pager_prot);
4381 assert(kr == KERN_SUCCESS);
4382
4383 vm_object_lock(copy_object);
4384 vm_object_mapping_end(copy_object);
4385 }
4386 vm_object_unlock(copy_object);
4387 }
4388
4389 /*
4390 * Perform the copy if requested
4391 */
4392
4393 if (copy && copy_object != VM_OBJECT_NULL) {
4394 vm_object_t new_object;
4395 vm_object_offset_t new_offset;
4396
4397 result = vm_object_copy_strategically(copy_object, copy_offset,
4398 copy_size,
4399 false, /* forking */
4400 &new_object, &new_offset,
4401 &do_copy);
4402
4403
4404 if (result == KERN_MEMORY_RESTART_COPY) {
4405 boolean_t success;
4406 boolean_t src_needs_copy;
4407
4408 /*
4409 * XXX
4410 * We currently ignore src_needs_copy.
4411 * This really is the issue of how to make
4412 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4413 * non-kernel users to use. Solution forthcoming.
4414 * In the meantime, since we don't allow non-kernel
4415 * memory managers to specify symmetric copy,
4416 * we won't run into problems here.
4417 */
4418 new_object = copy_object;
4419 new_offset = copy_offset;
4420 success = vm_object_copy_quickly(new_object,
4421 new_offset,
4422 copy_size,
4423 &src_needs_copy,
4424 &do_copy);
4425 assert(success);
4426 result = KERN_SUCCESS;
4427 }
4428 if (result != KERN_SUCCESS) {
4429 kr = result;
4430 break;
4431 }
4432
4433 copy_object = new_object;
4434 copy_offset = new_offset;
4435 /*
4436 * No extra object reference for the mapping:
4437 * the mapping should be the only thing keeping
4438 * this new object alive.
4439 */
4440 } else {
4441 /*
4442 * We already have the right object
4443 * to map.
4444 */
4445 copy_object = VME_OBJECT(copy_entry);
4446 /* take an extra ref for the mapping below */
4447 vm_object_reference(copy_object);
4448 }
4449 }
4450
4451 /*
4452 * If the caller does not want a specific
4453 * tag for this new mapping: use
4454 * the tag of the original mapping.
4455 */
4456 vm_map_kernel_flags_t vmk_remap_flags = {
4457 .vmkf_submap = copy_entry->is_sub_map,
4458 };
4459
4460 vm_map_kernel_flags_set_vmflags(&vmk_remap_flags,
4461 vm_map_kernel_flags_vmflags(vmk_flags),
4462 vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4463
4464 /* over-map the object into destination */
4465 vmk_remap_flags.vmf_fixed = true;
4466 vmk_remap_flags.vmf_overwrite = true;
4467
4468 if (!copy && !copy_entry->is_sub_map) {
4469 /*
4470 * copy-on-write should have been
4471 * resolved at this point, or we would
4472 * end up sharing instead of copying.
4473 */
4474 assert(!copy_entry->needs_copy);
4475 }
4476 #if XNU_TARGET_OS_OSX
4477 if (copy_entry->used_for_jit) {
4478 vmk_remap_flags.vmkf_map_jit = TRUE;
4479 }
4480 #endif /* XNU_TARGET_OS_OSX */
4481
4482 kr = vm_map_enter(target_map,
4483 ©_addr,
4484 copy_size,
4485 (vm_map_offset_t) 0,
4486 vmk_remap_flags,
4487 copy_object,
4488 copy_offset,
4489 ((copy_object == NULL)
4490 ? FALSE
4491 : (copy || copy_entry->needs_copy)),
4492 cur_protection,
4493 max_protection,
4494 inheritance);
4495 if (kr != KERN_SUCCESS) {
4496 DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4497 if (copy_entry->is_sub_map) {
4498 vm_map_deallocate(copy_submap);
4499 } else {
4500 vm_object_deallocate(copy_object);
4501 }
4502 /* abort */
4503 break;
4504 }
4505
4506 /* next mapping */
4507 copy_addr += copy_size;
4508 }
4509
4510 if (kr == KERN_SUCCESS) {
4511 if (vmk_flags.vmf_return_data_addr ||
4512 vmk_flags.vmf_return_4k_data_addr) {
4513 address = map_addr + offset_in_mapping;
4514 } else {
4515 address = map_addr;
4516 }
4517 if (overmap_start) {
4518 address += overmap_start;
4519 DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)address);
4520 }
4521 }
4522 named_entry_unlock(named_entry);
4523 if (target_copy_map != copy_map) {
4524 vm_map_copy_discard(target_copy_map);
4525 target_copy_map = VM_MAP_COPY_NULL;
4526 }
4527
4528 if (kr != KERN_SUCCESS && !vmk_flags.vmf_overwrite) {
4529 /* deallocate the contiguous range */
4530 vm_map_remove(target_map, map_addr,
4531 map_addr + map_size);
4532 }
4533 result = kr;
4534 goto out;
4535 }
4536
4537 if (named_entry->is_object) {
4538 unsigned int access;
4539 unsigned int wimg_mode;
4540
4541 /* we are mapping a VM object */
4542
4543 access = named_entry->access;
4544
4545 if (vmk_flags.vmf_return_data_addr ||
4546 vmk_flags.vmf_return_4k_data_addr) {
4547 offset_in_mapping = obj_offs & map_mask;
4548 if (vmk_flags.vmf_return_4k_data_addr) {
4549 offset_in_mapping &= ~((signed)(0xFFF));
4550 }
4551 obj_offs = VM_MAP_TRUNC_PAGE(obj_offs, VM_MAP_PAGE_MASK(target_map));
4552 map_size = VM_MAP_ROUND_PAGE((obj_offs + offset_in_mapping + initial_size) - obj_offs, VM_MAP_PAGE_MASK(target_map));
4553 }
4554
4555 object = vm_named_entry_to_vm_object(named_entry);
4556 assert(object != VM_OBJECT_NULL);
4557 vm_object_lock(object);
4558 named_entry_unlock(named_entry);
4559
4560 vm_object_reference_locked(object);
4561
4562 wimg_mode = object->wimg_bits;
4563 vm_prot_to_wimg(access, &wimg_mode);
4564 if (object->wimg_bits != wimg_mode) {
4565 vm_object_change_wimg_mode(object, wimg_mode);
4566 }
4567
4568 vm_object_unlock(object);
4569 } else {
4570 panic("invalid VM named entry %p", named_entry);
4571 }
4572 } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4573 /*
4574 * JMM - This is temporary until we unify named entries
4575 * and raw memory objects.
4576 *
4577 * Detected fake ip_kotype for a memory object. In
4578 * this case, the port isn't really a port at all, but
4579 * instead is just a raw memory object.
4580 */
4581 if (vmk_flags.vmf_return_data_addr ||
4582 vmk_flags.vmf_return_4k_data_addr) {
4583 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4584 }
4585
4586 object = memory_object_to_vm_object((memory_object_t)port);
4587 if (object == VM_OBJECT_NULL) {
4588 return KERN_INVALID_OBJECT;
4589 }
4590 vm_object_reference(object);
4591
4592 /* wait for object (if any) to be ready */
4593 if (object != VM_OBJECT_NULL) {
4594 if (is_kernel_object(object)) {
4595 printf("Warning: Attempt to map kernel object"
4596 " by a non-private kernel entity\n");
4597 return KERN_INVALID_OBJECT;
4598 }
4599 if (!object->pager_ready) {
4600 vm_object_lock(object);
4601
4602 while (!object->pager_ready) {
4603 vm_object_sleep(object,
4604 VM_OBJECT_EVENT_PAGER_READY,
4605 THREAD_UNINT,
4606 LCK_SLEEP_EXCLUSIVE);
4607 }
4608 vm_object_unlock(object);
4609 }
4610 }
4611 } else {
4612 return KERN_INVALID_OBJECT;
4613 }
4614
4615 if (object != VM_OBJECT_NULL &&
4616 object->named &&
4617 object->pager != MEMORY_OBJECT_NULL &&
4618 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4619 memory_object_t pager;
4620 vm_prot_t pager_prot;
4621 kern_return_t kr;
4622
4623 /*
4624 * For "named" VM objects, let the pager know that the
4625 * memory object is being mapped. Some pagers need to keep
4626 * track of this, to know when they can reclaim the memory
4627 * object, for example.
4628 * VM calls memory_object_map() for each mapping (specifying
4629 * the protection of each mapping) and calls
4630 * memory_object_last_unmap() when all the mappings are gone.
4631 */
4632 pager_prot = max_protection;
4633 if (copy) {
4634 /*
4635 * Copy-On-Write mapping: won't modify the
4636 * memory object.
4637 */
4638 pager_prot &= ~VM_PROT_WRITE;
4639 }
4640 vm_object_lock(object);
4641 pager = object->pager;
4642 if (object->named &&
4643 pager != MEMORY_OBJECT_NULL &&
4644 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4645 assert(object->pager_ready);
4646 vm_object_mapping_wait(object, THREAD_UNINT);
4647 vm_object_mapping_begin(object);
4648 vm_object_unlock(object);
4649
4650 kr = memory_object_map(pager, pager_prot);
4651 assert(kr == KERN_SUCCESS);
4652
4653 vm_object_lock(object);
4654 vm_object_mapping_end(object);
4655 }
4656 vm_object_unlock(object);
4657 }
4658
4659 /*
4660 * Perform the copy if requested
4661 */
4662
4663 if (copy) {
4664 vm_object_t new_object;
4665 vm_object_offset_t new_offset;
4666
4667 result = vm_object_copy_strategically(object,
4668 obj_offs,
4669 map_size,
4670 false, /* forking */
4671 &new_object, &new_offset,
4672 ©);
4673
4674
4675 if (result == KERN_MEMORY_RESTART_COPY) {
4676 boolean_t success;
4677 boolean_t src_needs_copy;
4678
4679 /*
4680 * XXX
4681 * We currently ignore src_needs_copy.
4682 * This really is the issue of how to make
4683 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4684 * non-kernel users to use. Solution forthcoming.
4685 * In the meantime, since we don't allow non-kernel
4686 * memory managers to specify symmetric copy,
4687 * we won't run into problems here.
4688 */
4689 new_object = object;
4690 new_offset = obj_offs;
4691 success = vm_object_copy_quickly(new_object,
4692 new_offset,
4693 map_size,
4694 &src_needs_copy,
4695 ©);
4696 assert(success);
4697 result = KERN_SUCCESS;
4698 }
4699 /*
4700 * Throw away the reference to the
4701 * original object, as it won't be mapped.
4702 */
4703
4704 vm_object_deallocate(object);
4705
4706 if (result != KERN_SUCCESS) {
4707 return result;
4708 }
4709
4710 object = new_object;
4711 obj_offs = new_offset;
4712 }
4713
4714 /*
4715 * If non-kernel users want to try to prefault pages, the mapping and prefault
4716 * needs to be atomic.
4717 */
4718 kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4719 vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4720
4721 result = vm_map_enter(target_map,
4722 &map_addr, map_size,
4723 (vm_map_offset_t)mask,
4724 vmk_flags,
4725 object, obj_offs,
4726 copy,
4727 cur_protection, max_protection,
4728 inheritance);
4729 if (result != KERN_SUCCESS) {
4730 vm_object_deallocate(object);
4731 }
4732
4733 /*
4734 * Try to prefault, and do not forget to release the vm map lock.
4735 */
4736 if (result == KERN_SUCCESS && try_prefault) {
4737 mach_vm_address_t va = map_addr;
4738 kern_return_t kr = KERN_SUCCESS;
4739 unsigned int i = 0;
4740 int pmap_options;
4741
4742 pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4743 if (object->internal) {
4744 pmap_options |= PMAP_OPTIONS_INTERNAL;
4745 }
4746
4747 for (i = 0; i < page_list_count; ++i) {
4748 if (!UPL_VALID_PAGE(page_list, i)) {
4749 if (kernel_prefault) {
4750 assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4751 result = KERN_MEMORY_ERROR;
4752 break;
4753 }
4754 } else {
4755 /*
4756 * If this function call failed, we should stop
4757 * trying to optimize, other calls are likely
4758 * going to fail too.
4759 *
4760 * We are not gonna report an error for such
4761 * failure though. That's an optimization, not
4762 * something critical.
4763 */
4764 kr = pmap_enter_options(target_map->pmap,
4765 va, UPL_PHYS_PAGE(page_list, i),
4766 cur_protection, VM_PROT_NONE,
4767 0, TRUE, pmap_options, NULL, PMAP_MAPPING_TYPE_INFER);
4768 if (kr != KERN_SUCCESS) {
4769 OSIncrementAtomic64(&vm_prefault_nb_bailout);
4770 if (kernel_prefault) {
4771 result = kr;
4772 }
4773 break;
4774 }
4775 OSIncrementAtomic64(&vm_prefault_nb_pages);
4776 }
4777
4778 /* Next virtual address */
4779 va += PAGE_SIZE;
4780 }
4781 if (vmk_flags.vmkf_keep_map_locked) {
4782 vm_map_unlock(target_map);
4783 }
4784 }
4785
4786 if (vmk_flags.vmf_return_data_addr ||
4787 vmk_flags.vmf_return_4k_data_addr) {
4788 address = map_addr + offset_in_mapping;
4789 } else {
4790 address = map_addr;
4791 }
4792
4793 out:
4794 if (result == KERN_SUCCESS) {
4795 #if KASAN
4796 if (target_map->pmap == kernel_pmap) {
4797 kasan_notify_address(map_addr, map_size);
4798 }
4799 #endif
4800 *address_u = vm_sanitize_wrap_addr(address);
4801 }
4802 return result;
4803 }
4804
4805 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_ut * address,vm_map_size_ut initial_size,vm_map_offset_ut mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,vm_object_offset_ut offset,vm_prot_ut cur_protection,vm_prot_ut max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4806 vm_map_enter_mem_object_prefault(
4807 vm_map_t target_map,
4808 vm_map_offset_ut *address,
4809 vm_map_size_ut initial_size,
4810 vm_map_offset_ut mask,
4811 vm_map_kernel_flags_t vmk_flags,
4812 ipc_port_t port,
4813 vm_object_offset_ut offset,
4814 vm_prot_ut cur_protection,
4815 vm_prot_ut max_protection,
4816 upl_page_list_ptr_t page_list,
4817 unsigned int page_list_count)
4818 {
4819 /* range_id is set by vm_map_enter_mem_object */
4820 return vm_map_enter_mem_object(target_map,
4821 address,
4822 initial_size,
4823 mask,
4824 vmk_flags,
4825 port,
4826 offset,
4827 FALSE,
4828 cur_protection,
4829 max_protection,
4830 VM_INHERIT_DEFAULT,
4831 page_list,
4832 page_list_count);
4833 }
4834
4835 static inline kern_return_t
vm_map_enter_mem_object_control_sanitize(vm_map_t target_map,vm_map_offset_ut address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_object_offset_ut offset_u,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,vm_map_kernel_flags_t vmk_flags,vm_map_address_t * map_addr,vm_map_size_t * map_size,vm_map_offset_t * mask,vm_object_offset_t * obj_offs,vm_object_offset_t * obj_end,vm_object_size_t * obj_size,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t * inheritance)4836 vm_map_enter_mem_object_control_sanitize(
4837 vm_map_t target_map,
4838 vm_map_offset_ut address_u,
4839 vm_map_size_ut initial_size_u,
4840 vm_map_offset_ut mask_u,
4841 vm_object_offset_ut offset_u,
4842 vm_prot_ut cur_protection_u,
4843 vm_prot_ut max_protection_u,
4844 vm_inherit_ut inheritance_u,
4845 vm_map_kernel_flags_t vmk_flags,
4846 vm_map_address_t *map_addr,
4847 vm_map_size_t *map_size,
4848 vm_map_offset_t *mask,
4849 vm_object_offset_t *obj_offs,
4850 vm_object_offset_t *obj_end,
4851 vm_object_size_t *obj_size,
4852 vm_prot_t *cur_protection,
4853 vm_prot_t *max_protection,
4854 vm_inherit_t *inheritance)
4855 {
4856 kern_return_t kr;
4857
4858 kr = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
4859 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4860 cur_protection, max_protection);
4861 if (__improbable(kr != KERN_SUCCESS)) {
4862 return kr;
4863 }
4864
4865 kr = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL,
4866 inheritance);
4867 if (__improbable(kr != KERN_SUCCESS)) {
4868 return kr;
4869 }
4870
4871 kr = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, mask);
4872 if (__improbable(kr != KERN_SUCCESS)) {
4873 return kr;
4874 }
4875 /*
4876 * Ensure arithmetic doesn't overflow in vm_object space (kernel
4877 * pages).
4878 * We keep unaligned values for now. The call we eventually make to
4879 * vm_map_enter does guarantee that offset_u is page aligned for EITHER
4880 * target_map pages or kernel pages. But this isn't enough to guarantee
4881 * kernel space alignment.
4882 */
4883 kr = vm_sanitize_addr_size(offset_u, initial_size_u,
4884 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, PAGE_MASK,
4885 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
4886 obj_offs, obj_end, obj_size);
4887 if (__improbable(kr != KERN_SUCCESS)) {
4888 return kr;
4889 }
4890
4891 /*
4892 * There is no vm_sanitize_addr_size variant that also adjusts for
4893 * a separate offset. Rather than create one for this one-off issue,
4894 * we sanitize map_addr and map_size individually, relying on
4895 * vm_sanitize_size to incorporate the offset. Then, we perform the
4896 * overflow check manually below.
4897 */
4898 *map_addr = vm_sanitize_addr(target_map, address_u);
4899 kr = vm_sanitize_size(offset_u, initial_size_u,
4900 VM_SANITIZE_CALLER_ENTER_MEM_OBJ_CTL, target_map,
4901 VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS, map_size);
4902 if (__improbable(kr != KERN_SUCCESS)) {
4903 return kr;
4904 }
4905
4906 /*
4907 * Ensure arithmetic doesn't overflow in target_map space.
4908 * The computation of map_size above accounts for the possibility that
4909 * offset_u might be unaligned in target_map space.
4910 */
4911 if (vmk_flags.vmf_fixed) {
4912 vm_map_address_t map_end;
4913
4914 if (__improbable(os_add_overflow(*map_addr, *map_size, &map_end))) {
4915 return KERN_INVALID_ARGUMENT;
4916 }
4917 }
4918
4919 return KERN_SUCCESS;
4920 }
4921
4922 kern_return_t
vm_map_enter_mem_object_control(vm_map_t target_map,vm_map_offset_ut * address_u,vm_map_size_ut initial_size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,memory_object_control_t control,vm_object_offset_ut offset_u,boolean_t needs_copy,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u)4923 vm_map_enter_mem_object_control(
4924 vm_map_t target_map,
4925 vm_map_offset_ut *address_u,
4926 vm_map_size_ut initial_size_u,
4927 vm_map_offset_ut mask_u,
4928 vm_map_kernel_flags_t vmk_flags,
4929 memory_object_control_t control,
4930 vm_object_offset_ut offset_u,
4931 boolean_t needs_copy,
4932 vm_prot_ut cur_protection_u,
4933 vm_prot_ut max_protection_u,
4934 vm_inherit_ut inheritance_u)
4935 {
4936 vm_map_offset_t mask;
4937 vm_prot_t cur_protection;
4938 vm_prot_t max_protection;
4939 vm_inherit_t inheritance;
4940 vm_map_address_t map_addr;
4941 vm_map_size_t map_size;
4942 vm_object_t object;
4943 vm_object_offset_t obj_offs, obj_end;
4944 vm_object_size_t obj_size;
4945 kern_return_t result;
4946 memory_object_t pager;
4947 vm_prot_t pager_prot;
4948 kern_return_t kr;
4949
4950 /*
4951 * Check arguments for validity
4952 */
4953 if (target_map == VM_MAP_NULL) {
4954 return KERN_INVALID_ARGUMENT;
4955 }
4956
4957 /*
4958 * We only support vmf_return_data_addr-like behavior.
4959 */
4960 vmk_flags.vmf_return_data_addr = true;
4961
4962 /*
4963 * Sanitize any input parameters that are addr/size/prot/inherit
4964 */
4965 kr = vm_map_enter_mem_object_control_sanitize(target_map,
4966 *address_u,
4967 initial_size_u,
4968 mask_u,
4969 offset_u,
4970 cur_protection_u,
4971 max_protection_u,
4972 inheritance_u,
4973 vmk_flags,
4974 &map_addr,
4975 &map_size,
4976 &mask,
4977 &obj_offs,
4978 &obj_end,
4979 &obj_size,
4980 &cur_protection,
4981 &max_protection,
4982 &inheritance);
4983 if (__improbable(kr != KERN_SUCCESS)) {
4984 return vm_sanitize_get_kr(kr);
4985 }
4986
4987 object = memory_object_control_to_vm_object(control);
4988
4989 if (object == VM_OBJECT_NULL) {
4990 return KERN_INVALID_OBJECT;
4991 }
4992
4993 if (is_kernel_object(object)) {
4994 printf("Warning: Attempt to map kernel object"
4995 " by a non-private kernel entity\n");
4996 return KERN_INVALID_OBJECT;
4997 }
4998
4999 vm_object_lock(object);
5000 object->ref_count++;
5001
5002 /*
5003 * For "named" VM objects, let the pager know that the
5004 * memory object is being mapped. Some pagers need to keep
5005 * track of this, to know when they can reclaim the memory
5006 * object, for example.
5007 * VM calls memory_object_map() for each mapping (specifying
5008 * the protection of each mapping) and calls
5009 * memory_object_last_unmap() when all the mappings are gone.
5010 */
5011 pager_prot = max_protection;
5012 if (needs_copy) {
5013 pager_prot &= ~VM_PROT_WRITE;
5014 }
5015 pager = object->pager;
5016 if (object->named &&
5017 pager != MEMORY_OBJECT_NULL &&
5018 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5019 assert(object->pager_ready);
5020 vm_object_mapping_wait(object, THREAD_UNINT);
5021 vm_object_mapping_begin(object);
5022 vm_object_unlock(object);
5023
5024 kr = memory_object_map(pager, pager_prot);
5025 assert(kr == KERN_SUCCESS);
5026
5027 vm_object_lock(object);
5028 vm_object_mapping_end(object);
5029 }
5030 vm_object_unlock(object);
5031
5032 /*
5033 * Perform the copy if requested
5034 */
5035
5036 if (needs_copy) {
5037 vm_object_t new_object;
5038 vm_object_offset_t new_offset;
5039
5040 result = vm_object_copy_strategically(object, obj_offs, obj_size,
5041 false, /* forking */
5042 &new_object, &new_offset,
5043 &needs_copy);
5044
5045
5046 if (result == KERN_MEMORY_RESTART_COPY) {
5047 boolean_t success;
5048 boolean_t src_needs_copy;
5049
5050 /*
5051 * XXX
5052 * We currently ignore src_needs_copy.
5053 * This really is the issue of how to make
5054 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5055 * non-kernel users to use. Solution forthcoming.
5056 * In the meantime, since we don't allow non-kernel
5057 * memory managers to specify symmetric copy,
5058 * we won't run into problems here.
5059 */
5060 new_object = object;
5061 new_offset = obj_offs;
5062 success = vm_object_copy_quickly(new_object,
5063 new_offset, obj_size,
5064 &src_needs_copy,
5065 &needs_copy);
5066 assert(success);
5067 result = KERN_SUCCESS;
5068 }
5069 /*
5070 * Throw away the reference to the
5071 * original object, as it won't be mapped.
5072 */
5073
5074 vm_object_deallocate(object);
5075
5076 if (result != KERN_SUCCESS) {
5077 return result;
5078 }
5079
5080 object = new_object;
5081 obj_offs = new_offset;
5082 }
5083
5084 result = vm_map_enter(target_map,
5085 &map_addr, map_size,
5086 (vm_map_offset_t)mask,
5087 vmk_flags,
5088 object,
5089 obj_offs,
5090 needs_copy,
5091 cur_protection, max_protection,
5092 inheritance);
5093
5094 if (result == KERN_SUCCESS) {
5095 *address_u = vm_sanitize_wrap_addr(map_addr + (obj_offs & vm_map_page_mask(target_map)));
5096 } else {
5097 vm_object_deallocate(object);
5098 }
5099
5100 return result;
5101 }
5102
5103
5104 /* Not used without nested pmaps */
5105 #ifndef NO_NESTED_PMAP
5106 /*
5107 * Clip and unnest a portion of a nested submap mapping.
5108 */
5109
5110
/*
 * Clip "entry" (a nested-submap mapping) to [start_unnest, end_unnest) and
 * detach that range from the shared (nested) pmap, so the range gets its
 * own private page tables in "map" from here on.
 * Called with "map" locked; the entry must be a submap mapping that is
 * currently using the nested pmap (entry->use_pmap).
 */
static void
vm_map_clip_unnest(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t start_unnest,
	vm_map_offset_t end_unnest)
{
	vm_map_offset_t old_start_unnest = start_unnest;
	vm_map_offset_t old_end_unnest = end_unnest;

	assert(entry->is_sub_map);
	assert(VME_SUBMAP(entry) != NULL);
	assert(entry->use_pmap);

	/*
	 * Query the platform for the optimal unnest range.
	 * DRK: There's some duplication of effort here, since
	 * callers may have adjusted the range to some extent.  This
	 * routine was introduced to support 1GiB subtree nesting
	 * for x86 platforms, which can also nest on 2MiB boundaries
	 * depending on size/alignment.
	 */
	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
		/* the pmap layer widened the range: log the mismatch for telemetry */
		assert(VME_SUBMAP(entry)->is_nested_map);
		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
		log_unnest_badness(map,
		    old_start_unnest,
		    old_end_unnest,
		    VME_SUBMAP(entry)->is_nested_map,
		    (entry->vme_start +
		    VME_SUBMAP(entry)->lowest_unnestable_start -
		    VME_OFFSET(entry)));
	}

	/* the unnest range must be fully covered by this entry */
	if (entry->vme_start > start_unnest ||
	    entry->vme_end < end_unnest) {
		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
		    "bad nested entry: start=0x%llx end=0x%llx\n",
		    (long long)start_unnest, (long long)end_unnest,
		    (long long)entry->vme_start, (long long)entry->vme_end);
	}

	/* split off the portions of the entry outside the unnest range */
	if (start_unnest > entry->vme_start) {
		_vm_map_clip_start(&map->hdr,
		    entry,
		    start_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
	if (entry->vme_end > end_unnest) {
		_vm_map_clip_end(&map->hdr,
		    entry,
		    end_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}

	/* detach the (now exactly-clipped) range from the nested pmap */
	pmap_unnest(map->pmap,
	    entry->vme_start,
	    entry->vme_end - entry->vme_start);
	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
		/* clean up parent map/maps */
		vm_map_submap_pmap_clean(
			map, entry->vme_start,
			entry->vme_end,
			VME_SUBMAP(entry),
			VME_OFFSET(entry));
	}
	/* this entry no longer shares the submap's pmap */
	entry->use_pmap = FALSE;
	if ((map->pmap != kernel_pmap) &&
	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
	}
}
5191 #endif /* NO_NESTED_PMAP */
5192
5193 __abortlike
5194 static void
__vm_map_clip_atomic_entry_panic(vm_map_t map,vm_map_entry_t entry,vm_map_offset_t where)5195 __vm_map_clip_atomic_entry_panic(
5196 vm_map_t map,
5197 vm_map_entry_t entry,
5198 vm_map_offset_t where)
5199 {
5200 panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5201 "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5202 (uint64_t)entry->vme_start,
5203 (uint64_t)entry->vme_end,
5204 (uint64_t)where);
5205 }
5206
5207 /*
5208 * vm_map_clip_start: [ internal use only ]
5209 *
5210 * Asserts that the given entry begins at or after
5211 * the specified address; if necessary,
5212 * it splits the entry into two.
5213 */
void
vm_map_clip_start(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t startaddr)
{
#ifndef NO_NESTED_PMAP
	/*
	 * If "startaddr" falls inside a nested-submap mapping, the nested
	 * range around it must be unnested before the entry can be split.
	 */
	if (entry->is_sub_map &&
	    entry->use_pmap &&
	    startaddr >= entry->vme_start) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure "startaddr" is no longer in a nested range
		 * before we clip.  Unnest only the minimum range the platform
		 * can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (startaddr > entry->vme_start) {
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * Physically-contiguous objects: drop all pmap
			 * mappings for the whole entry before splitting it.
			 */
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		if (entry->vme_atomic) {
			/* atomic entries must never be split */
			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
		}

		DTRACE_VM5(
			vm_map_clip_start,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, startaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_start(&map->hdr, entry, startaddr);
		/* splitting invalidated the "first free" hint; recompute it */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5266
5267
/*
 * Clip an entry of a vm_map_copy's entry list at "startaddr".
 * Unlike vm_map_clip_start(), this operates on the copy's header only:
 * no unnesting, pmap work or first-free hint update is involved.
 */
#define vm_map_copy_clip_start(copy, entry, startaddr) \
	MACRO_BEGIN \
	if ((startaddr) > (entry)->vme_start) \
	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
	MACRO_END
5273
5274 /*
5275 * This routine is called only when it is known that
5276 * the entry must be split.
5277 */
/*
 * Split "entry" at "start": a new entry covering [vme_start, start) is
 * inserted BEFORE the original, and the original is trimmed to begin at
 * "start".  Caller guarantees vme_start < start < vme_end.
 */
static void
_vm_map_clip_start(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         start)
{
	vm_map_entry_t  new_entry;

	/*
	 * Split off the front portion --
	 * note that we must insert the new
	 * entry BEFORE this one, so that
	 * this entry has the specified starting
	 * address.
	 */

	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* the new entry is a full copy of the original, then trimmed */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	new_entry->vme_end = start;
	assert(new_entry->vme_start < new_entry->vme_end);
	/* advance the original's object offset to match its new start */
	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
	if (__improbable(start >= entry->vme_end)) {
		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
	}
	assert(start < entry->vme_end);
	entry->vme_start = start;

#if VM_BTLOG_TAGS
	/* the copied entry shares the backtrace ref; take an extra retain */
	if (new_entry->vme_kernel_object) {
		btref_retain(new_entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */

	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);

	/* both entries now reference the same submap/object: add a ref */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5325
5326
5327 /*
5328 * vm_map_clip_end: [ internal use only ]
5329 *
5330 * Asserts that the given entry ends at or before
5331 * the specified address; if necessary,
5332 * it splits the entry into two.
5333 */
void
vm_map_clip_end(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_offset_t endaddr)
{
	if (endaddr > entry->vme_end) {
		/*
		 * Within the scope of this clipping, limit "endaddr" to
		 * the end of this map entry...
		 */
		endaddr = entry->vme_end;
	}
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map && entry->use_pmap) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure the range between the start of this entry and
		 * the new "endaddr" is no longer nested before we clip.
		 * Unnest only the minimum range the platform can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = entry->vme_start;
		/* round "endaddr" up to the platform's minimum nesting granule */
		end_unnest =
		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
		    ~(pmap_shared_region_size_min(map->pmap) - 1);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (endaddr < entry->vme_end) {
		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    VME_OBJECT(entry)->phys_contiguous) {
			/*
			 * Physically-contiguous objects: drop all pmap
			 * mappings for the whole entry before splitting it.
			 */
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		if (entry->vme_atomic) {
			/* atomic entries must never be split */
			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
		}
		DTRACE_VM5(
			vm_map_clip_end,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, endaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_end(&map->hdr, entry, endaddr);
		/* splitting invalidated the "first free" hint; recompute it */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5392
5393
/*
 * Clip an entry of a vm_map_copy's entry list at "endaddr".
 * Unlike vm_map_clip_end(), this operates on the copy's header only:
 * no unnesting, pmap work or first-free hint update is involved.
 */
#define vm_map_copy_clip_end(copy, entry, endaddr) \
	MACRO_BEGIN \
	if ((endaddr) < (entry)->vme_end) \
	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
	MACRO_END
5399
5400 /*
5401 * This routine is called only when it is known that
5402 * the entry must be split.
5403 */
/*
 * Split "entry" at "end": a new entry covering [end, vme_end) is inserted
 * AFTER the original, and the original is trimmed to end at "end".
 * Caller guarantees vme_start < end < vme_end.
 */
static void
_vm_map_clip_end(
	struct vm_map_header    *map_header,
	vm_map_entry_t          entry,
	vm_map_offset_t         end)
{
	vm_map_entry_t  new_entry;

	/*
	 * Create a new entry and insert it
	 * AFTER the specified entry
	 */

	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(end,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* the new entry is a full copy of the original, then trimmed */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	if (__improbable(end <= entry->vme_start)) {
		panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
	}
	assert(entry->vme_start < end);
	new_entry->vme_start = entry->vme_end = end;
	/* advance the new entry's object offset past the retained portion */
	VME_OFFSET_SET(new_entry,
	    VME_OFFSET(new_entry) + (end - entry->vme_start));
	assert(new_entry->vme_start < new_entry->vme_end);

#if VM_BTLOG_TAGS
	/* the copied entry shares the backtrace ref; take an extra retain */
	if (new_entry->vme_kernel_object) {
		btref_retain(new_entry->vme_tag_btref);
	}
#endif /* VM_BTLOG_TAGS */

	_vm_map_store_entry_link(map_header, entry, new_entry);

	/* both entries now reference the same submap/object: add a ref */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5448
5449
5450 /*
5451 * VM_MAP_RANGE_CHECK: [ internal use only ]
5452 *
5453 * Asserts that the starting and ending region
5454 * addresses fall within the valid range of the map.
5455 */
/* NOTE: clamps "start" and "end" IN PLACE; never reports an error. */
#define VM_MAP_RANGE_CHECK(map, start, end)     \
	MACRO_BEGIN                             \
	if (start < vm_map_min(map))            \
	        start = vm_map_min(map);        \
	if (end > vm_map_max(map))              \
	        end = vm_map_max(map);          \
	if (start > end)                        \
	        start = end;                    \
	MACRO_END
5465
5466 /*
5467 * vm_map_range_check: [ internal use only ]
5468 *
5469 * Check that the region defined by the specified start and
5470 * end addresses are wholly contained within a single map
 * entry or set of adjacent map entries of the specified map,
5472 * i.e. the specified region contains no unmapped space.
5473 * If any or all of the region is unmapped, FALSE is returned.
5474 * Otherwise, TRUE is returned and if the output argument 'entry'
5475 * is not NULL it points to the map entry containing the start
5476 * of the region.
5477 *
5478 * The map is locked for reading on entry and is left locked.
5479 */
5480 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5481 vm_map_range_check(
5482 vm_map_t map,
5483 vm_map_offset_t start,
5484 vm_map_offset_t end,
5485 vm_map_entry_t *entry)
5486 {
5487 vm_map_entry_t cur;
5488 vm_map_offset_t prev;
5489
5490 /*
5491 * Basic sanity checks first
5492 */
5493 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5494 return FALSE;
5495 }
5496
5497 /*
5498 * Check first if the region starts within a valid
5499 * mapping for the map.
5500 */
5501 if (!vm_map_lookup_entry(map, start, &cur)) {
5502 return FALSE;
5503 }
5504
5505 /*
5506 * Optimize for the case that the region is contained
5507 * in a single map entry.
5508 */
5509 if (entry != (vm_map_entry_t *) NULL) {
5510 *entry = cur;
5511 }
5512 if (end <= cur->vme_end) {
5513 return TRUE;
5514 }
5515
5516 /*
5517 * If the region is not wholly contained within a
5518 * single entry, walk the entries looking for holes.
5519 */
5520 prev = cur->vme_end;
5521 cur = cur->vme_next;
5522 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5523 if (end <= cur->vme_end) {
5524 return TRUE;
5525 }
5526 prev = cur->vme_end;
5527 cur = cur->vme_next;
5528 }
5529 return FALSE;
5530 }
5531
5532 /*
5533 * vm_map_protect:
5534 *
5535 * Sets the protection of the specified address
5536 * region in the target map. If "set_max" is
5537 * specified, the maximum protection is to be set;
5538 * otherwise, only the current protection is affected.
5539 */
kern_return_t
vm_map_protect(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_prot_t       new_prot,
	boolean_t       set_max)
{
	vm_map_entry_t  current;
	vm_map_offset_t prev;
	vm_map_entry_t  entry;
	vm_prot_t       new_max;
	int             pmap_options = 0;
	kern_return_t   kr;

	/* reject wrapping / overflowing address ranges up front */
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ARGUMENT;
	}

	if (new_prot & VM_PROT_COPY) {
		/*
		 * VM_PROT_COPY: replace the range with a copy-on-write
		 * remapping of itself (with VM_PROT_WRITE added to the max
		 * protections) before applying the remaining protections.
		 */
		vm_map_offset_t         new_start;
		vm_prot_t               cur_prot, max_prot;
		vm_map_kernel_flags_t   kflags;

		/* LP64todo - see below */
		if (start >= map->max_offset) {
			return KERN_INVALID_ADDRESS;
		}

		/* W^X policy: fail early if write+exec is being requested */
		if ((new_prot & VM_PROT_ALLEXEC) &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if XNU_TARGET_OS_OSX && __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* XNU_TARGET_OS_OSX && __arm64__ */
		    ) &&
		    VM_MAP_POLICY_WX_FAIL(map)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) start,
			    uint64_t, (uint64_t) end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
			    (uint64_t)start,
			    (uint64_t)end,
#else /* DEVELOPMENT || DEBUG */
			    (uint64_t)0,
			    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
			    new_prot);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * Let vm_map_remap_extract() know that it will need to:
		 * + make a copy of the mapping
		 * + add VM_PROT_WRITE to the max protections
		 * + remove any protections that are no longer allowed from the
		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
		 *   example).
		 * Note that "max_prot" is an IN/OUT parameter only for this
		 * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
		 * only.
		 */
		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
		cur_prot = VM_PROT_NONE;
		kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
		kflags.vmkf_remap_prot_copy = true;
		kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
		new_start = start;
		kr = vm_map_remap(map,
		    vm_sanitize_wrap_addr_ref(&new_start),
		    end - start,
		    0, /* mask */
		    kflags,
		    map,
		    start,
		    TRUE, /* copy-on-write remapping! */
		    vm_sanitize_wrap_prot_ref(&cur_prot), /* IN/OUT */
		    vm_sanitize_wrap_prot_ref(&max_prot), /* IN/OUT */
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		/* the copy is done; apply the rest of the protections below */
		new_prot &= ~VM_PROT_COPY;
	}

	vm_map_lock(map);

	/* LP64todo - remove this check when vm_map_commpage64()
	 * no longer has to stuff in a map_entry for the commpage
	 * above the map's max_offset.
	 */
	if (start >= map->max_offset) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	while (1) {
		/*
		 * Lookup the entry.  If it doesn't start in a valid
		 * entry, return an error.
		 */
		if (!vm_map_lookup_entry(map, start, &entry)) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
			start = SUPERPAGE_ROUND_DOWN(start);
			continue;
		}
		break;
	}
	if (entry->superpage_size) {
		/* superpage mappings can only change protection as a whole */
		end = SUPERPAGE_ROUND_UP(end);
	}

	/*
	 * Make a first pass to check for protection and address
	 * violations.
	 */

	current = entry;
	prev = current->vme_start;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		/*
		 * If there is a hole, return an error.
		 */
		if (current->vme_start != prev) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		new_max = current->max_protection;

#if defined(__x86_64__)
		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
		}
#elif CODE_SIGNING_MONITOR
		if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
			new_max |= VM_PROT_EXECUTE;
		}
#endif
		/* requested protections must stay within the max protections */
		if ((new_prot & new_max) != new_prot) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

		/* the pmap layer may forbid protection changes on JIT regions */
		if (current->used_for_jit &&
		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

#if __arm64e__
		/* Disallow remapping hw assisted TPRO mappings */
		if (current->used_for_tpro) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}
#endif /* __arm64e__ */


		/* W^X policy: strip (or fail on) simultaneous write+exec */
		if ((new_prot & VM_PROT_WRITE) &&
		    (new_prot & VM_PROT_ALLEXEC) &&
#if XNU_TARGET_OS_OSX
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    ) &&
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
#endif
		    !(current->used_for_jit)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) current->vme_start,
			    uint64_t, (uint64_t) current->vme_end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
			    (uint64_t)current->vme_start,
			    (uint64_t)current->vme_end,
#else /* DEVELOPMENT || DEBUG */
			    (uint64_t)0,
			    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
			    new_prot);
			new_prot &= ~VM_PROT_ALLEXEC;
			if (VM_MAP_POLICY_WX_FAIL(map)) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		/*
		 * If the task has requested executable lockdown,
		 * deny both:
		 * - adding executable protections OR
		 * - adding write protections to an existing executable mapping.
		 */
		if (map->map_disallow_new_exec == TRUE) {
			if ((new_prot & VM_PROT_ALLEXEC) ||
			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		prev = current->vme_end;
		current = current->vme_next;
	}

#if __arm64__
	if (end > prev &&
	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
		vm_map_entry_t prev_entry;

		prev_entry = current->vme_prev;
		if (prev_entry != vm_map_to_entry(map) &&
		    !prev_entry->map_aligned &&
		    (vm_map_round_page(prev_entry->vme_end,
		    VM_MAP_PAGE_MASK(map))
		    == end)) {
			/*
			 * The last entry in our range is not "map-aligned"
			 * but it would have reached all the way to "end"
			 * if it had been map-aligned, so this is not really
			 * a hole in the range and we can proceed.
			 */
			prev = end;
		}
	}
#endif /* __arm64__ */

	/* the walk stopped short of "end": the tail of the range is unmapped */
	if (end > prev) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Go back and fix up protections.
	 * Clip to start here if the range starts within
	 * the entry.
	 */

	current = entry;
	if (current != vm_map_to_entry(map)) {
		/* clip and unnest if necessary */
		vm_map_clip_start(map, current, start);
	}

	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		vm_prot_t       old_prot;

		vm_map_clip_end(map, current, end);

#if DEVELOPMENT || DEBUG
		if (current->csm_associated && vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    __FUNCTION__,
			    (uint64_t)start,
			    (uint64_t)end,
			    new_prot,
			    map, current,
			    current->vme_start,
			    current->vme_end,
			    current->protection,
			    current->max_protection);
		}
#endif /* DEVELOPMENT || DEBUG */

		if (current->is_sub_map) {
			/* clipping did unnest if needed */
			assert(!current->use_pmap);
		}

		old_prot = current->protection;

		if (set_max) {
			current->max_protection = new_prot;
			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
			current->protection = (new_prot & old_prot);
		} else {
			current->protection = new_prot;
		}

#if CODE_SIGNING_MONITOR
		if (!current->vme_xnu_user_debug &&
		    /* a !csm_associated mapping becoming executable */
		    ((!current->csm_associated &&
		    !(old_prot & VM_PROT_EXECUTE) &&
		    (current->protection & VM_PROT_EXECUTE))
		    ||
		    /* a csm_associated mapping becoming writable */
		    (current->csm_associated &&
		    !(old_prot & VM_PROT_WRITE) &&
		    (current->protection & VM_PROT_WRITE)))) {
			/*
			 * This mapping has not already been marked as
			 * "user_debug" and it is either:
			 * 1. not code-signing-monitored and becoming executable
			 * 2. code-signing-monitored and becoming writable,
			 * so inform the CodeSigningMonitor and mark the
			 * mapping as "user_debug" if appropriate.
			 */
			vm_map_kernel_flags_t vmk_flags;
			vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
			/* pretend it's a vm_protect(VM_PROT_COPY)... */
			vmk_flags.vmkf_remap_prot_copy = true;
			kr = vm_map_entry_cs_associate(map, current, vmk_flags);
#if DEVELOPMENT || DEBUG
			if (vm_log_xnu_user_debug) {
				printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
				    proc_selfpid(),
				    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
				    __FUNCTION__, __LINE__,
				    map, current,
				    current->vme_start, current->vme_end,
				    old_prot, current->protection,
				    kr, current->vme_xnu_user_debug);
			}
#endif /* DEVELOPMENT || DEBUG */
		}
#endif /* CODE_SIGNING_MONITOR */

		/*
		 * Update physical map if necessary.
		 * If the request is to turn off write protection,
		 * we won't do it for real (in pmap).  This is because
		 * it would cause copy-on-write to fail.  We've already
		 * set the new protection in the map, so if a
		 * write-protect fault occurred, it will be fixed up
		 * properly, COW or not.
		 */
		if (current->protection != old_prot) {
			/* Look one level in we support nested pmaps */
			/* from mapped submaps which are direct entries */
			/* in our map */

			vm_prot_t prot;

			prot = current->protection;
			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
				/* not the compressor object: keep pmap write-protected (see COW note above) */
				prot &= ~VM_PROT_WRITE;
			} else {
				assert(!VME_OBJECT(current)->code_signed);
				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
				if (prot & VM_PROT_WRITE) {
					/*
					 * For write requests on the
					 * compressor, we will ask the
					 * pmap layer to prevent us from
					 * taking a write fault when we
					 * attempt to access the mapping
					 * next.
					 */
					pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
				}
			}

			if (override_nx(map, VME_ALIAS(current)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}

#if DEVELOPMENT || DEBUG
			if (!(old_prot & VM_PROT_EXECUTE) &&
			    (prot & VM_PROT_EXECUTE) &&
			    panic_on_unsigned_execute &&
			    (proc_selfcsflags() & CS_KILL)) {
				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
			}
#endif /* DEVELOPMENT || DEBUG */

			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
				if (current->wired_count) {
					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
				}

				/* If the pmap layer cares about this
				 * protection type, force a fault for
				 * each page so that vm_fault will
				 * repopulate the page with the full
				 * set of protections.
				 */
				/*
				 * TODO: We don't seem to need this,
				 * but this is due to an internal
				 * implementation detail of
				 * pmap_protect.  Do we want to rely
				 * on this?
				 */
				prot = VM_PROT_NONE;
			}

			if (current->is_sub_map && current->use_pmap) {
				/* nested submap: apply protection in the submap's own pmap */
				pmap_protect(VME_SUBMAP(current)->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot);
			} else {
				pmap_protect_options(map->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot,
				    pmap_options,
				    NULL);
			}
		}
		current = current->vme_next;
	}

	/* coalesce any now-identical adjacent entries created by clipping */
	current = entry;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start <= end)) {
		vm_map_simplify_entry(map, current);
		current = current->vme_next;
	}

	vm_map_unlock(map);
	return KERN_SUCCESS;
}
5984
5985 /*
5986 * vm_map_inherit:
5987 *
5988 * Sets the inheritance of the specified address
5989 * range in the target map. Inheritance
5990 * affects how the map will be shared with
5991 * child maps at the time of vm_map_fork.
5992 */
kern_return_t
vm_map_inherit(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_inherit_t    new_inheritance)
{
	vm_map_entry_t  entry;
	vm_map_entry_t  temp_entry;

	vm_map_lock(map);

	/* clamp [start, end) to the map's valid range (modifies start/end) */
	VM_MAP_RANGE_CHECK(map, start, end);

	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (vm_map_lookup_entry(map, start, &temp_entry)) {
		entry = temp_entry;
	} else {
		/* "start" is in a hole: begin at the following entry */
		temp_entry = temp_entry->vme_next;
		entry = temp_entry;
	}

	/* first check entire range for submaps which can't support the */
	/* given inheritance. */
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->is_sub_map) {
			if (new_inheritance == VM_INHERIT_COPY) {
				vm_map_unlock(map);
				return KERN_INVALID_ARGUMENT;
			}
		}

		entry = entry->vme_next;
	}

	/* second pass: clip to the range and apply the new inheritance */
	entry = temp_entry;
	if (entry != vm_map_to_entry(map)) {
		/* clip and unnest if necessary */
		vm_map_clip_start(map, entry, start);
	}

	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		vm_map_clip_end(map, entry, end);
		if (entry->is_sub_map) {
			/* clip did unnest if needed */
			assert(!entry->use_pmap);
		}

		entry->inheritance = new_inheritance;

		entry = entry->vme_next;
	}

	vm_map_unlock(map);
	return KERN_SUCCESS;
}
6053
6054 /*
6055 * Update the accounting for the amount of wired memory in this map. If the user has
6056 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6057 */
6058
static kern_return_t
add_wire_counts(
	vm_map_t        map,
	vm_map_entry_t  entry,
	boolean_t       user_wire)
{
	vm_map_size_t   size;

	/* remember whether the entry had no wirings at all before this call */
	bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;

	if (user_wire) {
		unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;

		/*
		 * We're wiring memory at the request of the user.  Check if this is the first time the user is wiring
		 * this map entry.
		 */

		if (entry->user_wired_count == 0) {
			size = entry->vme_end - entry->vme_start;

			/*
			 * Since this is the first time the user is wiring this map entry, check to see if we're
			 * exceeding the user wire limits.  There is a per map limit which is the smaller of either
			 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value.  There is also
			 * a system-wide limit on the amount of memory all users can wire.  If the user is over either
			 * limit, then we fail.
			 */

			if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
			    size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
				/* distinguish which limit was hit for the failure counters */
				if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
					os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
				} else {
					os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
#if DEVELOPMENT || DEBUG
					if (panic_on_mlock_failure) {
						panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
					}
#endif /* DEVELOPMENT || DEBUG */
				}
				return KERN_RESOURCE_SHORTAGE;
			}

			/*
			 * The first time the user wires an entry, we also increment the wired_count and add this to
			 * the total that has been wired in the map.
			 */

			if (entry->wired_count >= MAX_WIRE_COUNT) {
				return KERN_FAILURE;
			}

			entry->wired_count++;
			map->user_wire_size += size;
		}

		if (entry->user_wired_count >= MAX_WIRE_COUNT) {
			return KERN_FAILURE;
		}

		entry->user_wired_count++;
	} else {
		/*
		 * The kernel's wiring the memory.  Just bump the count and continue.
		 */

		if (entry->wired_count >= MAX_WIRE_COUNT) {
			panic("vm_map_wire: too many wirings");
		}

		entry->wired_count++;
	}

	/* first wiring of this entry: maybe record a backtrace ref for debugging */
	if (first_wire) {
		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
	}

	return KERN_SUCCESS;
}
6144
6145 /*
6146 * Update the memory wiring accounting now that the given map entry is being unwired.
6147 */
6148
6149 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6150 subtract_wire_counts(
6151 vm_map_t map,
6152 vm_map_entry_t entry,
6153 boolean_t user_wire)
6154 {
6155 if (user_wire) {
6156 /*
6157 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6158 */
6159
6160 if (entry->user_wired_count == 1) {
6161 /*
6162 * We're removing the last user wire reference. Decrement the wired_count and the total
6163 * user wired memory for this map.
6164 */
6165
6166 assert(entry->wired_count >= 1);
6167 entry->wired_count--;
6168 map->user_wire_size -= entry->vme_end - entry->vme_start;
6169 }
6170
6171 assert(entry->user_wired_count >= 1);
6172 entry->user_wired_count--;
6173 } else {
6174 /*
6175 * The kernel is unwiring the memory. Just update the count.
6176 */
6177
6178 assert(entry->wired_count >= 1);
6179 entry->wired_count--;
6180 }
6181
6182 vme_btref_consider_and_put(entry);
6183 }
6184
/* Counts wiring requests rejected because the range was executable
 * (see vm_map_wire_nested: incremented just before returning
 * KERN_PROTECTION_FAILURE to preserve code-signing guarantees). */
int cs_executable_wire = 0;
6186
/*
 * vm_map_wire_nested:
 *
 * Wire (fault in and keep resident) the range [start, end) of "map",
 * recursing into submaps where necessary.
 *
 *	caller_prot	access the caller needs on the wired pages
 *	tag		VM allocation tag to account the wiring to
 *	user_wire	TRUE for user-requested (mlock-style) wiring,
 *			which is counted per entry and subject to the
 *			limits enforced by add_wire_counts()
 *	map_pmap	when non-NULL, physical mappings are entered in
 *			this pmap at "pmap_addr" (used on the recursive
 *			calls for submaps) instead of map->pmap
 *	physpage_p	when non-NULL, "wire and extract" mode: return
 *			the physical page number of the single wired
 *			page (the range must be exactly one map page)
 *
 * On any failure, everything this call wired so far is unwired again
 * before returning.
 */
static kern_return_t
vm_map_wire_nested(
	vm_map_t                map,
	vm_map_offset_t         start,
	vm_map_offset_t         end,
	vm_prot_t               caller_prot,
	vm_tag_t                tag,
	boolean_t               user_wire,
	pmap_t                  map_pmap,
	vm_map_offset_t         pmap_addr,
	ppnum_t                 *physpage_p)
{
	vm_map_entry_t          entry;
	vm_prot_t               access_type;
	struct vm_map_entry     *first_entry, tmp_entry;
	vm_map_t                real_map;
	vm_map_offset_t         s, e;
	kern_return_t           rc;
	boolean_t               need_wakeup;
	boolean_t               main_map = FALSE;
	wait_interrupt_t        interruptible_state;
	thread_t                cur_thread;
	unsigned int            last_timestamp;
	vm_map_size_t           size;
	boolean_t               wire_and_extract;
	vm_prot_t               extra_prots;

	/*
	 * Extra protection bits passed to vm_map_lookup_and_lock_object():
	 * force copy-on-write resolution, and (where code-signing is
	 * enforced) make that copy fail for executable mappings.
	 */
	extra_prots = VM_PROT_COPY;
	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
#if XNU_TARGET_OS_OSX
	if (map->pmap == kernel_pmap ||
	    !vm_map_cs_enforcement(map)) {
		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
	}
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
	if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
	}
#endif /* CODE_SIGNING_MONITOR */

	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));

	wire_and_extract = FALSE;
	if (physpage_p != NULL) {
		/*
		 * The caller wants the physical page number of the
		 * wired page. We return only one physical page number
		 * so this works for only one page at a time.
		 *
		 * The only caller (vm_map_wire_and_extract)
		 * guarantees it.
		 */
		assert(end - start == VM_MAP_PAGE_SIZE(map));
		wire_and_extract = TRUE;
		*physpage_p = 0;
	}

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
	if (start == end) {
		/* We wired what the caller asked for, zero pages */
		return KERN_SUCCESS;
	}

	vm_map_lock(map);
	/*
	 * NOTE(review): main_map records whether this is the top-level
	 * (non-submap) invocation, but is not consulted anywhere else
	 * in this function.
	 */
	if (map_pmap == NULL) {
		main_map = TRUE;
	}
	/*
	 * The map's timestamp advances whenever the map lock is dropped
	 * and the map is changed; comparing against last_timestamp later
	 * tells us whether our entry could have been clipped/simplified
	 * while we were unlocked.
	 */
	last_timestamp = map->timestamp;

	need_wakeup = FALSE;
	cur_thread = current_thread();

	s = start;
	rc = KERN_SUCCESS;

	if (vm_map_lookup_entry(map, s, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested submaps here !
		 */
	} else {
		/* Start address is not in map */
		rc = KERN_INVALID_ADDRESS;
		goto done;
	}

	while ((entry != vm_map_to_entry(map)) && (s < end)) {
		/*
		 * At this point, we have wired from "start" to "s".
		 * We still need to wire from "s" to "end".
		 *
		 * "entry" hasn't been clipped, so it could start before "s"
		 * and/or end after "end".
		 */

		/* "e" is how far we want to wire in this entry */
		e = entry->vme_end;
		if (e > end) {
			e = end;
		}

		/*
		 * If another thread is wiring/unwiring this entry then
		 * block after informing other thread to wake us up.
		 */
		if (entry->in_transition) {
			wait_result_t wait_result;

			/*
			 * We have not clipped the entry.  Make sure that
			 * the start address is in range so that the lookup
			 * below will succeed.
			 * "s" is the current starting point: we've already
			 * wired from "start" to "s" and we still have
			 * to wire from "s" to "end".
			 */

			entry->needs_wakeup = TRUE;

			/*
			 * wake up anybody waiting on entries that we have
			 * already wired.
			 */
			if (need_wakeup) {
				vm_map_entry_wakeup(map);
				need_wakeup = FALSE;
			}
			/*
			 * User wiring is interruptible
			 */
			wait_result = vm_map_entry_wait(map,
			    (user_wire) ? THREAD_ABORTSAFE :
			    THREAD_UNINT);
			if (user_wire && wait_result == THREAD_INTERRUPTED) {
				/*
				 * undo the wirings we have done so far
				 * We do not clear the needs_wakeup flag,
				 * because we cannot tell if we were the
				 * only one waiting.
				 */
				rc = KERN_FAILURE;
				goto done;
			}

			/*
			 * Cannot avoid a lookup here. reset timestamp.
			 */
			last_timestamp = map->timestamp;

			/*
			 * The entry could have been clipped, look it up again.
			 * Worse that can happen is, it may not exist anymore.
			 */
			if (!vm_map_lookup_entry(map, s, &first_entry)) {
				/*
				 * User: undo everything upto the previous
				 * entry.  let vm_map_unwire worry about
				 * checking the validity of the range.
				 */
				rc = KERN_FAILURE;
				goto done;
			}
			entry = first_entry;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_start;
			vm_map_offset_t local_end;
			pmap_t          pmap;

			if (wire_and_extract) {
				/*
				 * Wiring would result in copy-on-write
				 * which would not be compatible with
				 * the sharing we have with the original
				 * provider of this memory.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}

			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);

			/* translate [s, e) into the submap's address space */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end;
			sub_end += VME_OFFSET(entry) - entry->vme_start;

			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				vm_object_t             object;
				vm_object_offset_t      offset;
				vm_prot_t               prot;
				boolean_t               wired;
				vm_map_entry_t          local_entry;
				vm_map_version_t        version;
				vm_map_t                lookup_map;

				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					/* ppc implementation requires that */
					/* submaps pmap address ranges line */
					/* up with parent map */
#ifdef notdef
					pmap_addr = sub_start;
#endif
					pmap_addr = s;
				} else {
					pmap = map->pmap;
					pmap_addr = s;
				}

				if (entry->wired_count) {
					/* already wired: just take another reference */
					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
						goto done;
					}

					/*
					 * The map was not unlocked:
					 * no need to goto re-lookup.
					 * Just go directly to next entry.
					 */
					entry = entry->vme_next;
					s = entry->vme_start;
					continue;
				}

				/* call vm_map_lookup_and_lock_object to */
				/* cause any needs copy to be */
				/* evaluated */
				local_start = entry->vme_start;
				lookup_map = map;
				vm_map_lock_write_to_read(map);
				rc = vm_map_lookup_and_lock_object(
					&lookup_map, local_start,
					(access_type | extra_prots),
					OBJECT_LOCK_EXCLUSIVE,
					&version, &object,
					&offset, &prot, &wired,
					NULL,
					&real_map, NULL);
				if (rc != KERN_SUCCESS) {
					vm_map_unlock_read(lookup_map);
					assert(map_pmap == NULL);
					/* undo what was wired before this entry */
					vm_map_unwire_nested(map, start,
					    s, user_wire, PMAP_NULL, 0);
					return rc;
				}
				vm_object_unlock(object);
				if (real_map != lookup_map) {
					vm_map_unlock(real_map);
				}
				vm_map_unlock_read(lookup_map);
				vm_map_lock(map);

				/* we unlocked, so must re-lookup */
				if (!vm_map_lookup_entry(map,
				    local_start,
				    &local_entry)) {
					rc = KERN_FAILURE;
					goto done;
				}

				/*
				 * entry could have been "simplified",
				 * so re-clip
				 */
				entry = local_entry;
				assert(s == local_start);
				vm_map_clip_start(map, entry, s);
				vm_map_clip_end(map, entry, end);
				/* re-compute "e" */
				e = entry->vme_end;
				if (e > end) {
					e = end;
				}

				/* did we have a change of type? */
				if (!entry->is_sub_map) {
					last_timestamp = map->timestamp;
					continue;
				}
			} else {
				local_start = entry->vme_start;
				pmap = map_pmap;
			}

			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
				goto done;
			}

			/* mark busy while we recurse without the map lock */
			entry->in_transition = TRUE;

			vm_map_unlock(map);
			rc = vm_map_wire_nested(VME_SUBMAP(entry),
			    sub_start, sub_end,
			    caller_prot, tag,
			    user_wire, pmap, pmap_addr,
			    NULL);
			vm_map_lock(map);

			/*
			 * Find the entry again.  It could have been clipped
			 * after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, local_start,
			    &first_entry)) {
				panic("vm_map_wire: re-lookup failed");
			}
			entry = first_entry;

			assert(local_start == s);
			/* re-compute "e" */
			e = entry->vme_end;
			if (e > end) {
				e = end;
			}

			last_timestamp = map->timestamp;
			/*
			 * Clear in_transition (and wake waiters) on every
			 * entry the original entry may have been split into;
			 * on failure also undo the wire counts taken above.
			 */
			while ((entry != vm_map_to_entry(map)) &&
			    (entry->vme_start < e)) {
				assert(entry->in_transition);
				entry->in_transition = FALSE;
				if (entry->needs_wakeup) {
					entry->needs_wakeup = FALSE;
					need_wakeup = TRUE;
				}
				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
					subtract_wire_counts(map, entry, user_wire);
				}
				entry = entry->vme_next;
			}
			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
				goto done;
			}

			/* no need to relookup again */
			s = entry->vme_start;
			continue;
		}

		/*
		 * If this entry is already wired then increment
		 * the appropriate wire reference count.
		 */
		if (entry->wired_count) {
			if ((entry->protection & access_type) != access_type) {
				/* found a protection problem */

				/*
				 * XXX FBDP
				 * We should always return an error
				 * in this case but since we didn't
				 * enforce it before, let's do
				 * it only for the new "wire_and_extract"
				 * code path for now...
				 */
				if (wire_and_extract) {
					rc = KERN_PROTECTION_FAILURE;
					goto done;
				}
			}

			/*
			 * entry is already wired down, get our reference
			 * after clipping to our range.
			 */
			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);

			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
				goto done;
			}

			if (wire_and_extract) {
				vm_object_t             object;
				vm_object_offset_t      offset;
				vm_page_t               m;

				/*
				 * We don't have to "wire" the page again
				 * bit we still have to "extract" its
				 * physical page number, after some sanity
				 * checks.
				 */
				assert((entry->vme_end - entry->vme_start)
				    == PAGE_SIZE);
				assert(!entry->needs_copy);
				assert(!entry->is_sub_map);
				assert(VME_OBJECT(entry));
				if (((entry->vme_end - entry->vme_start)
				    != PAGE_SIZE) ||
				    entry->needs_copy ||
				    entry->is_sub_map ||
				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
					rc = KERN_INVALID_ARGUMENT;
					goto done;
				}

				object = VME_OBJECT(entry);
				offset = VME_OFFSET(entry);
				/* need exclusive lock to update m->dirty */
				if (entry->protection & VM_PROT_WRITE) {
					vm_object_lock(object);
				} else {
					vm_object_lock_shared(object);
				}
				m = vm_page_lookup(object, offset);
				assert(m != VM_PAGE_NULL);
				assert(VM_PAGE_WIRED(m));
				/* release builds re-check what the asserts assumed */
				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
					if (entry->protection & VM_PROT_WRITE) {
						vm_object_lock_assert_exclusive(
							object);
						m->vmp_dirty = TRUE;
					}
				} else {
					/* not already wired !? */
					*physpage_p = 0;
				}
				vm_object_unlock(object);
			}

			/* map was not unlocked: no need to relookup */
			entry = entry->vme_next;
			s = entry->vme_start;
			continue;
		}

		/*
		 * Unwired entry or wire request transmitted via submap
		 */

		/*
		 * Wiring would copy the pages to the shadow object.
		 * The shadow object would not be code-signed so
		 * attempting to execute code from these copied pages
		 * would trigger a code-signing violation.
		 */

		if ((entry->protection & VM_PROT_EXECUTE)
#if XNU_TARGET_OS_OSX
		    &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    )
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
		    &&
		    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
#endif
		    ) {
#if MACH_ASSERT
			printf("pid %d[%s] wiring executable range from "
			    "0x%llx to 0x%llx: rejected to preserve "
			    "code-signing\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task())
			    ? proc_name_address(get_bsdtask_info(current_task()))
			    : "?"),
			    (uint64_t) entry->vme_start,
			    (uint64_t) entry->vme_end);
#endif /* MACH_ASSERT */
			DTRACE_VM2(cs_executable_wire,
			    uint64_t, (uint64_t)entry->vme_start,
			    uint64_t, (uint64_t)entry->vme_end);
			cs_executable_wire++;
			rc = KERN_PROTECTION_FAILURE;
			goto done;
		}

		/*
		 * Perform actions of vm_map_lookup that need the write
		 * lock on the map: create a shadow object for a
		 * copy-on-write region, or an object for a zero-fill
		 * region.
		 */
		size = entry->vme_end - entry->vme_start;
		/*
		 * If wiring a copy-on-write page, we need to copy it now
		 * even if we're only (currently) requesting read access.
		 * This is aggressive, but once it's wired we can't move it.
		 */
		if (entry->needs_copy) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should not be "needs_copy"
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}

			VME_OBJECT_SHADOW(entry, size,
			    vm_map_always_shadow(map));
			entry->needs_copy = FALSE;
		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should already have an object.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}
			/* zero-fill region: give it a fresh backing object */
			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
			assert(entry->use_pmap);
		} else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should not be COPY_SYMMETRIC.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}
			/*
			 * Force an unrequested "copy-on-write" but only for
			 * the range we're wiring.
			 */
//			printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);
			/* recompute "size" */
			size = entry->vme_end - entry->vme_start;
			/* make a shadow object */
			vm_object_t orig_object;
			vm_object_offset_t orig_offset;
			orig_object = VME_OBJECT(entry);
			orig_offset = VME_OFFSET(entry);
			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
			if (VME_OBJECT(entry) != orig_object) {
				/*
				 * This mapping has not been shared (or it would be
				 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
				 * not been copied-on-write (or it would be marked
				 * as "needs_copy" and would have been handled above
				 * and also already write-protected).
				 * We still need to write-protect here to prevent
				 * other threads from modifying these pages while
				 * we're in the process of copying and wiring
				 * the copied pages.
				 * Since the mapping is neither shared nor COWed,
				 * we only need to write-protect the PTEs for this
				 * mapping.
				 */
				vm_object_pmap_protect(orig_object,
				    orig_offset,
				    size,
				    map->pmap,
				    VM_MAP_PAGE_SIZE(map),
				    entry->vme_start,
				    entry->protection & ~VM_PROT_WRITE);
			}
		}
		if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			/*
			 * Make the object COPY_DELAY to get a stable object
			 * to wire.
			 * That should avoid creating long shadow chains while
			 * wiring/unwiring the same range repeatedly.
			 * That also prevents part of the object from being
			 * wired while another part is "needs_copy", which
			 * could result in conflicting rules wrt copy-on-write.
			 */
			vm_object_t object;

			object = VME_OBJECT(entry);
			vm_object_lock(object);
			if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
				assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
				    "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
				    object, (uint64_t)object->vo_size,
				    entry,
				    (uint64_t)entry->vme_start,
				    (uint64_t)entry->vme_end,
				    (uint64_t)VME_OFFSET(entry),
				    (uint64_t)size);
				assertf(object->ref_count == 1,
				    "object %p ref_count %d\n",
				    object, object->ref_count);
				assertf(!entry->needs_copy,
				    "entry %p\n", entry);
				object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
				VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
			}
			vm_object_unlock(object);
		}

		vm_map_clip_start(map, entry, s);
		vm_map_clip_end(map, entry, end);

		/* re-compute "e" */
		e = entry->vme_end;
		if (e > end) {
			e = end;
		}

		/*
		 * Check for holes and protection mismatch.
		 * Holes: Next entry should be contiguous unless this
		 * is the end of the region.
		 * Protection: Access requested must be allowed, unless
		 * wiring is by protection class
		 */
		if ((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end))) {
			/* found a hole */
			rc = KERN_INVALID_ADDRESS;
			goto done;
		}
		if ((entry->protection & access_type) != access_type) {
			/* found a protection problem */
			rc = KERN_PROTECTION_FAILURE;
			goto done;
		}

		assert(entry->wired_count == 0 && entry->user_wired_count == 0);

		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
			goto done;
		}

		entry->in_transition = TRUE;

		/*
		 * This entry might get split once we unlock the map.
		 * In vm_fault_wire(), we need the current range as
		 * defined by this entry.  In order for this to work
		 * along with a simultaneous clip operation, we make a
		 * temporary copy of this entry and use that for the
		 * wiring.  Note that the underlying objects do not
		 * change during a clip.
		 */
		tmp_entry = *entry;

		/*
		 * The in_transition state guarentees that the entry
		 * (or entries for this range, if split occured) will be
		 * there when the map lock is acquired for the second time.
		 */
		vm_map_unlock(map);

		/* kernel wirings must not be interrupted mid-fault */
		if (!user_wire && cur_thread != THREAD_NULL) {
			interruptible_state = thread_interrupt_level(THREAD_UNINT);
		} else {
			interruptible_state = THREAD_UNINT;
		}

		if (map_pmap) {
			rc = vm_fault_wire(map,
			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
			    physpage_p);
		} else {
			rc = vm_fault_wire(map,
			    &tmp_entry, caller_prot, tag, map->pmap,
			    tmp_entry.vme_start,
			    physpage_p);
		}

		if (!user_wire && cur_thread != THREAD_NULL) {
			thread_interrupt_level(interruptible_state);
		}

		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				panic("vm_map_wire: re-lookup failed");
			}

			entry = first_entry;
		}

		last_timestamp = map->timestamp;

		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
				subtract_wire_counts(map, entry, user_wire);
			}
			entry = entry->vme_next;
		}

		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
			goto done;
		}

		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
		    (tmp_entry.vme_end != end) &&       /* AND, we are not at the end of the requested range */
		    (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
			/* found a "new" hole */
			s = tmp_entry.vme_end;
			rc = KERN_INVALID_ADDRESS;
			goto done;
		}

		s = entry->vme_start;
	} /* end while loop through map entries */

done:
	if (rc == KERN_SUCCESS) {
		/* repair any damage we may have made to the VM map */
		vm_map_simplify_range(map, start, end);
	}

	vm_map_unlock(map);

	/*
	 * wake up anybody waiting on entries we wired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}

	if (rc != KERN_SUCCESS) {
		/* undo what has been wired so far */
		vm_map_unwire_nested(map, start, s, user_wire,
		    map_pmap, pmap_addr);
		if (physpage_p) {
			*physpage_p = 0;
		}
	}

	return rc;
}
6937
6938 static inline kern_return_t
vm_map_wire_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_size_t * size,vm_prot_t * prot)6939 vm_map_wire_sanitize(
6940 vm_map_t map,
6941 vm_map_offset_ut start_u,
6942 vm_map_offset_ut end_u,
6943 vm_prot_ut prot_u,
6944 vm_sanitize_caller_t vm_sanitize_caller,
6945 vm_map_offset_t *start,
6946 vm_map_offset_t *end,
6947 vm_map_size_t *size,
6948 vm_prot_t *prot)
6949 {
6950 kern_return_t kr;
6951
6952 kr = vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
6953 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
6954 size);
6955 if (__improbable(kr != KERN_SUCCESS)) {
6956 return kr;
6957 }
6958
6959 kr = vm_sanitize_prot(prot_u, vm_sanitize_caller, map, prot);
6960 if (__improbable(kr != KERN_SUCCESS)) {
6961 return kr;
6962 }
6963
6964 return KERN_SUCCESS;
6965 }
6966
6967 /*
6968 * Validation function for vm_map_wire_nested().
6969 */
6970 kern_return_t
vm_map_wire_impl(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_tag_t tag,boolean_t user_wire,ppnum_t * physpage_p,vm_sanitize_caller_t vm_sanitize_caller)6971 vm_map_wire_impl(
6972 vm_map_t map,
6973 vm_map_offset_ut start_u,
6974 vm_map_offset_ut end_u,
6975 vm_prot_ut prot_u,
6976 vm_tag_t tag,
6977 boolean_t user_wire,
6978 ppnum_t *physpage_p,
6979 vm_sanitize_caller_t vm_sanitize_caller)
6980 {
6981 vm_map_offset_t start, end;
6982 vm_map_size_t size;
6983 vm_prot_t prot;
6984 kern_return_t kr;
6985
6986 /*
6987 * Sanitize any input parameters that are addr/size/prot/inherit
6988 */
6989 kr = vm_map_wire_sanitize(map,
6990 start_u,
6991 end_u,
6992 prot_u,
6993 vm_sanitize_caller,
6994 &start,
6995 &end,
6996 &size,
6997 &prot);
6998 if (__improbable(kr != KERN_SUCCESS)) {
6999 if (physpage_p) {
7000 *physpage_p = 0;
7001 }
7002 return vm_sanitize_get_kr(kr);
7003 }
7004
7005 return vm_map_wire_nested(map, start, end, prot, tag, user_wire,
7006 PMAP_NULL, 0, physpage_p);
7007 }
7008
7009 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,boolean_t user_wire)7010 vm_map_wire_external(
7011 vm_map_t map,
7012 vm_map_offset_ut start_u,
7013 vm_map_offset_ut end_u,
7014 vm_prot_ut prot_u,
7015 boolean_t user_wire)
7016 {
7017 vm_tag_t tag = vm_tag_bt();
7018
7019 return vm_map_wire_kernel(map, start_u, end_u, prot_u, tag, user_wire);
7020 }
7021
7022 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_prot_ut prot_u,vm_tag_t tag,boolean_t user_wire)7023 vm_map_wire_kernel(
7024 vm_map_t map,
7025 vm_map_offset_ut start_u,
7026 vm_map_offset_ut end_u,
7027 vm_prot_ut prot_u,
7028 vm_tag_t tag,
7029 boolean_t user_wire)
7030 {
7031 return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7032 user_wire, NULL, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7033 }
7034
7035 #if XNU_PLATFORM_MacOSX
7036
7037 kern_return_t
vm_map_wire_and_extract(vm_map_t map,vm_map_offset_ut start_u,vm_prot_ut prot_u,boolean_t user_wire,ppnum_t * physpage_p)7038 vm_map_wire_and_extract(
7039 vm_map_t map,
7040 vm_map_offset_ut start_u,
7041 vm_prot_ut prot_u,
7042 boolean_t user_wire,
7043 ppnum_t *physpage_p)
7044 {
7045 vm_tag_t tag = vm_tag_bt();
7046 vm_map_size_ut size_u = vm_sanitize_wrap_size(VM_MAP_PAGE_SIZE(map));
7047 vm_map_offset_ut end_u = vm_sanitize_compute_unsafe_end(start_u, size_u);
7048
7049 return vm_map_wire_impl(map, start_u, end_u, prot_u, tag,
7050 user_wire, physpage_p, VM_SANITIZE_CALLER_VM_MAP_WIRE);
7051 }
7052
7053 #endif /* XNU_PLATFORM_MacOSX */
7054
7055 static kern_return_t
vm_map_unwire_nested(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire,pmap_t map_pmap,vm_map_offset_t pmap_addr)7056 vm_map_unwire_nested(
7057 vm_map_t map,
7058 vm_map_offset_t start,
7059 vm_map_offset_t end,
7060 boolean_t user_wire,
7061 pmap_t map_pmap,
7062 vm_map_offset_t pmap_addr)
7063 {
7064 vm_map_entry_t entry;
7065 struct vm_map_entry *first_entry, tmp_entry;
7066 boolean_t need_wakeup;
7067 boolean_t main_map = FALSE;
7068 unsigned int last_timestamp;
7069
7070 VM_MAP_RANGE_CHECK(map, start, end);
7071 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7072 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7073
7074 if (start == end) {
7075 /* We unwired what the caller asked for: zero pages */
7076 return KERN_SUCCESS;
7077 }
7078
7079 vm_map_lock(map);
7080 if (map_pmap == NULL) {
7081 main_map = TRUE;
7082 }
7083 last_timestamp = map->timestamp;
7084
7085 if (vm_map_lookup_entry(map, start, &first_entry)) {
7086 entry = first_entry;
7087 /*
7088 * vm_map_clip_start will be done later.
7089 * We don't want to unnest any nested sub maps here !
7090 */
7091 } else {
7092 if (!user_wire) {
7093 panic("vm_map_unwire: start not found");
7094 }
7095 /* Start address is not in map. */
7096 vm_map_unlock(map);
7097 return KERN_INVALID_ADDRESS;
7098 }
7099
7100 if (entry->superpage_size) {
7101 /* superpages are always wired */
7102 vm_map_unlock(map);
7103 return KERN_INVALID_ADDRESS;
7104 }
7105
7106 need_wakeup = FALSE;
7107 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7108 if (entry->in_transition) {
7109 /*
7110 * 1)
7111 * Another thread is wiring down this entry. Note
7112 * that if it is not for the other thread we would
7113 * be unwiring an unwired entry. This is not
7114 * permitted. If we wait, we will be unwiring memory
7115 * we did not wire.
7116 *
7117 * 2)
7118 * Another thread is unwiring this entry. We did not
7119 * have a reference to it, because if we did, this
7120 * entry will not be getting unwired now.
7121 */
7122 if (!user_wire) {
7123 /*
7124 * XXX FBDP
7125 * This could happen: there could be some
7126 * overlapping vslock/vsunlock operations
7127 * going on.
7128 * We should probably just wait and retry,
7129 * but then we have to be careful that this
7130 * entry could get "simplified" after
7131 * "in_transition" gets unset and before
7132 * we re-lookup the entry, so we would
7133 * have to re-clip the entry to avoid
7134 * re-unwiring what we have already unwired...
7135 * See vm_map_wire_nested().
7136 *
7137 * Or we could just ignore "in_transition"
7138 * here and proceed to decement the wired
7139 * count(s) on this entry. That should be fine
7140 * as long as "wired_count" doesn't drop all
7141 * the way to 0 (and we should panic if THAT
7142 * happens).
7143 */
7144 panic("vm_map_unwire: in_transition entry");
7145 }
7146
7147 entry = entry->vme_next;
7148 continue;
7149 }
7150
7151 if (entry->is_sub_map) {
7152 vm_map_offset_t sub_start;
7153 vm_map_offset_t sub_end;
7154 vm_map_offset_t local_end;
7155 pmap_t pmap;
7156
7157 vm_map_clip_start(map, entry, start);
7158 vm_map_clip_end(map, entry, end);
7159
7160 sub_start = VME_OFFSET(entry);
7161 sub_end = entry->vme_end - entry->vme_start;
7162 sub_end += VME_OFFSET(entry);
7163 local_end = entry->vme_end;
7164 if (map_pmap == NULL) {
7165 if (entry->use_pmap) {
7166 pmap = VME_SUBMAP(entry)->pmap;
7167 pmap_addr = sub_start;
7168 } else {
7169 pmap = map->pmap;
7170 pmap_addr = start;
7171 }
7172 if (entry->wired_count == 0 ||
7173 (user_wire && entry->user_wired_count == 0)) {
7174 if (!user_wire) {
7175 panic("vm_map_unwire: entry is unwired");
7176 }
7177 entry = entry->vme_next;
7178 continue;
7179 }
7180
7181 /*
7182 * Check for holes
7183 * Holes: Next entry should be contiguous unless
7184 * this is the end of the region.
7185 */
7186 if (((entry->vme_end < end) &&
7187 ((entry->vme_next == vm_map_to_entry(map)) ||
7188 (entry->vme_next->vme_start
7189 > entry->vme_end)))) {
7190 if (!user_wire) {
7191 panic("vm_map_unwire: non-contiguous region");
7192 }
7193 /*
7194 * entry = entry->vme_next;
7195 * continue;
7196 */
7197 }
7198
7199 subtract_wire_counts(map, entry, user_wire);
7200
7201 if (entry->wired_count != 0) {
7202 entry = entry->vme_next;
7203 continue;
7204 }
7205
7206 entry->in_transition = TRUE;
7207 tmp_entry = *entry;/* see comment in vm_map_wire() */
7208
7209 /*
7210 * We can unlock the map now. The in_transition state
7211 * guarantees existance of the entry.
7212 */
7213 vm_map_unlock(map);
7214 vm_map_unwire_nested(VME_SUBMAP(entry),
7215 sub_start, sub_end, user_wire, pmap, pmap_addr);
7216 vm_map_lock(map);
7217
7218 if (last_timestamp + 1 != map->timestamp) {
7219 /*
7220 * Find the entry again. It could have been
7221 * clipped or deleted after we unlocked the map.
7222 */
7223 if (!vm_map_lookup_entry(map,
7224 tmp_entry.vme_start,
7225 &first_entry)) {
7226 if (!user_wire) {
7227 panic("vm_map_unwire: re-lookup failed");
7228 }
7229 entry = first_entry->vme_next;
7230 } else {
7231 entry = first_entry;
7232 }
7233 }
7234 last_timestamp = map->timestamp;
7235
7236 /*
7237 * clear transition bit for all constituent entries
7238 * that were in the original entry (saved in
7239 * tmp_entry). Also check for waiters.
7240 */
7241 while ((entry != vm_map_to_entry(map)) &&
7242 (entry->vme_start < tmp_entry.vme_end)) {
7243 assert(entry->in_transition);
7244 entry->in_transition = FALSE;
7245 if (entry->needs_wakeup) {
7246 entry->needs_wakeup = FALSE;
7247 need_wakeup = TRUE;
7248 }
7249 entry = entry->vme_next;
7250 }
7251 continue;
7252 } else {
7253 tmp_entry = *entry;
7254 vm_map_unlock(map);
7255 vm_map_unwire_nested(VME_SUBMAP(entry),
7256 sub_start, sub_end, user_wire, map_pmap,
7257 pmap_addr);
7258 vm_map_lock(map);
7259
7260 if (last_timestamp + 1 != map->timestamp) {
7261 /*
7262 * Find the entry again. It could have been
7263 * clipped or deleted after we unlocked the map.
7264 */
7265 if (!vm_map_lookup_entry(map,
7266 tmp_entry.vme_start,
7267 &first_entry)) {
7268 if (!user_wire) {
7269 panic("vm_map_unwire: re-lookup failed");
7270 }
7271 entry = first_entry->vme_next;
7272 } else {
7273 entry = first_entry;
7274 }
7275 }
7276 last_timestamp = map->timestamp;
7277 }
7278 }
7279
7280
7281 if ((entry->wired_count == 0) ||
7282 (user_wire && entry->user_wired_count == 0)) {
7283 if (!user_wire) {
7284 panic("vm_map_unwire: entry is unwired");
7285 }
7286
7287 entry = entry->vme_next;
7288 continue;
7289 }
7290
7291 assert(entry->wired_count > 0 &&
7292 (!user_wire || entry->user_wired_count > 0));
7293
7294 vm_map_clip_start(map, entry, start);
7295 vm_map_clip_end(map, entry, end);
7296
7297 /*
7298 * Check for holes
7299 * Holes: Next entry should be contiguous unless
7300 * this is the end of the region.
7301 */
7302 if (((entry->vme_end < end) &&
7303 ((entry->vme_next == vm_map_to_entry(map)) ||
7304 (entry->vme_next->vme_start > entry->vme_end)))) {
7305 if (!user_wire) {
7306 panic("vm_map_unwire: non-contiguous region");
7307 }
7308 entry = entry->vme_next;
7309 continue;
7310 }
7311
7312 subtract_wire_counts(map, entry, user_wire);
7313
7314 if (entry->wired_count != 0) {
7315 entry = entry->vme_next;
7316 continue;
7317 }
7318
7319 if (entry->zero_wired_pages) {
7320 entry->zero_wired_pages = FALSE;
7321 }
7322
7323 entry->in_transition = TRUE;
7324 tmp_entry = *entry; /* see comment in vm_map_wire() */
7325
		/*
		 * We can unlock the map now. The in_transition state
		 * guarantees existence of the entry.
		 */
7330 vm_map_unlock(map);
7331 if (map_pmap) {
7332 vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7333 pmap_addr, tmp_entry.vme_end);
7334 } else {
7335 vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7336 tmp_entry.vme_start, tmp_entry.vme_end);
7337 }
7338 vm_map_lock(map);
7339
7340 if (last_timestamp + 1 != map->timestamp) {
7341 /*
7342 * Find the entry again. It could have been clipped
7343 * or deleted after we unlocked the map.
7344 */
7345 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7346 &first_entry)) {
7347 if (!user_wire) {
7348 panic("vm_map_unwire: re-lookup failed");
7349 }
7350 entry = first_entry->vme_next;
7351 } else {
7352 entry = first_entry;
7353 }
7354 }
7355 last_timestamp = map->timestamp;
7356
7357 /*
7358 * clear transition bit for all constituent entries that
7359 * were in the original entry (saved in tmp_entry). Also
7360 * check for waiters.
7361 */
7362 while ((entry != vm_map_to_entry(map)) &&
7363 (entry->vme_start < tmp_entry.vme_end)) {
7364 assert(entry->in_transition);
7365 entry->in_transition = FALSE;
7366 if (entry->needs_wakeup) {
7367 entry->needs_wakeup = FALSE;
7368 need_wakeup = TRUE;
7369 }
7370 entry = entry->vme_next;
7371 }
7372 }
7373
7374 /*
7375 * We might have fragmented the address space when we wired this
7376 * range of addresses. Attempt to re-coalesce these VM map entries
7377 * with their neighbors now that they're no longer wired.
7378 * Under some circumstances, address space fragmentation can
7379 * prevent VM object shadow chain collapsing, which can cause
7380 * swap space leaks.
7381 */
7382 vm_map_simplify_range(map, start, end);
7383
7384 vm_map_unlock(map);
7385 /*
7386 * wake up anybody waiting on entries that we have unwired.
7387 */
7388 if (need_wakeup) {
7389 vm_map_entry_wakeup(map);
7390 }
7391 return KERN_SUCCESS;
7392 }
7393
7394 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t user_wire)7395 vm_map_unwire(
7396 vm_map_t map,
7397 vm_map_offset_ut start_u,
7398 vm_map_offset_ut end_u,
7399 boolean_t user_wire)
7400 {
7401 return vm_map_unwire_impl(map, start_u, end_u, user_wire,
7402 VM_SANITIZE_CALLER_VM_MAP_UNWIRE);
7403 }
7404
7405 static inline kern_return_t
vm_map_unwire_sanitize(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * start,vm_map_offset_t * end,vm_map_size_t * size)7406 vm_map_unwire_sanitize(
7407 vm_map_t map,
7408 vm_map_offset_ut start_u,
7409 vm_map_offset_ut end_u,
7410 vm_sanitize_caller_t vm_sanitize_caller,
7411 vm_map_offset_t *start,
7412 vm_map_offset_t *end,
7413 vm_map_size_t *size)
7414 {
7415 return vm_sanitize_addr_end(start_u, end_u, vm_sanitize_caller, map,
7416 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS, start, end,
7417 size);
7418 }
7419
7420 kern_return_t
vm_map_unwire_impl(vm_map_t map,vm_map_offset_ut start_u,vm_map_offset_ut end_u,boolean_t user_wire,vm_sanitize_caller_t vm_sanitize_caller)7421 vm_map_unwire_impl(
7422 vm_map_t map,
7423 vm_map_offset_ut start_u,
7424 vm_map_offset_ut end_u,
7425 boolean_t user_wire,
7426 vm_sanitize_caller_t vm_sanitize_caller)
7427 {
7428 vm_map_offset_t start, end;
7429 vm_map_size_t size;
7430 kern_return_t kr;
7431
7432 /*
7433 * Sanitize any input parameters that are addr/size/prot/inherit
7434 */
7435 kr = vm_map_unwire_sanitize(
7436 map,
7437 start_u,
7438 end_u,
7439 vm_sanitize_caller,
7440 &start,
7441 &end,
7442 &size);
7443 if (__improbable(kr != KERN_SUCCESS)) {
7444 return vm_sanitize_get_kr(kr);
7445 }
7446
7447 return vm_map_unwire_nested(map, start, end,
7448 user_wire, (pmap_t)NULL, 0);
7449 }
7450
7451
/*
 * vm_map_entry_zap: [ internal use only ]
 *
 * Remove the entry from the target map
 * and put it on a zap list.
 *
 * The entry must already be fully unwired and must not be
 * "permanent"; the map's accounted size is reduced by the
 * entry's span.  Called with the map locked.
 */
static void
vm_map_entry_zap(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_map_zap_t    zap)
{
	vm_map_offset_t s, e;

	s = entry->vme_start;
	e = entry->vme_end;
	/* every entry is at least 4K-aligned, whatever the map's page size */
	assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
	assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
	if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
		assert(page_aligned(s));
		assert(page_aligned(e));
	}
	if (entry->map_aligned == TRUE) {
		/* map-aligned entries must honor the map's (possibly larger) page mask */
		assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
		assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
	}
	/* wired or permanent entries must never be zapped */
	assert(entry->wired_count == 0);
	assert(entry->user_wired_count == 0);
	assert(!entry->vme_permanent);

	vm_map_store_entry_unlink(map, entry, false);
	map->size -= e - s;

	vm_map_zap_append(zap, entry);
}
7487
/*
 * vm_map_submap_pmap_clean:
 *
 * Remove the physical mappings backing the range [start, end) of
 * "map", where that range is covered by "sub_map" starting at
 * "offset" within the submap's address space.  Walks the submap's
 * entries overlapping [offset, offset + (end - start)) and, for
 * each, either recurses into a nested submap or removes mappings
 * at the object or pmap level.
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t   remove_size;
	vm_map_entry_t  entry;

	/* the target range expressed in the submap's address space */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/*
		 * First entry: clip the amount to remove to the part of
		 * the entry that actually overlaps [offset, submap_end).
		 */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: recurse one level deeper */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * "map" may be mapped by several pmaps:
				 * strip the mappings at the VM object level
				 * so that every pmap mapping it is cleaned.
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				/* single pmap: remove the range directly */
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}
	}

	/*
	 * If the lookup failed, "entry" points at the entry preceding
	 * "offset" (or the map header), so vme_next is the first
	 * candidate entry either way.
	 */
	entry = entry->vme_next;

	while ((entry != vm_map_to_entry(sub_map))
	    && (entry->vme_start < submap_end)) {
		/* subsequent entries start inside the range: clip only the tail */
		remove_size = (entry->vme_end - entry->vme_start);
		if (submap_end < entry->vme_end) {
			remove_size -= entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			vm_map_submap_pmap_clean(
				sub_map,
				(start + entry->vme_start) - offset,
				((start + entry->vme_start) - offset) + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					VME_OFFSET(entry),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				/* translate the submap entry back into "map" addresses */
				pmap_remove(map->pmap,
				    (addr64_t)((start + entry->vme_start)
				    - offset),
				    (addr64_t)(((start + entry->vme_start)
				    - offset) + remove_size));
			}
		}
		entry = entry->vme_next;
	}
	vm_map_unlock_read(sub_map);
	return;
}
7587
/*
 * virt_memory_guard_ast:
 *
 * Handle the AST callout for a virtual memory guard.
 * Raise an EXC_GUARD exception and terminate the task
 * if configured to do so (per the task's task_exc_guard
 * behavior bits).
 */
void
virt_memory_guard_ast(
	thread_t thread,
	mach_exception_data_type_t code,
	mach_exception_data_type_t subcode)
{
	task_t task = get_threadtask(thread);
	assert(task != kernel_task);
	assert(task == current_task());
	kern_return_t sync_exception_result;
	uint32_t behavior;

	behavior = task->task_exc_guard;

	/* Is delivery enabled */
	if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
		return;
	}

	/* If only once, make sure we're that once */
	while (behavior & TASK_EXC_GUARD_VM_ONCE) {
		uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;

		/* CAS: atomically clear the deliver bit; winner owns this one delivery */
		if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
			break;
		}
		/* lost the race: re-read and re-check whether delivery is still enabled */
		behavior = task->task_exc_guard;
		if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
			return;
		}
	}

	const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
	/* Raise exception synchronously and see if handler claimed it */
	sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);

	if (fatal) {
		/*
		 * If Synchronous EXC_GUARD delivery was successful then
		 * kill the process and return, else kill the process
		 * and deliver the exception via EXC_CORPSE_NOTIFY.
		 */


		int flags = PX_DEBUG_NO_HONOR;
		exception_info_t info = {
			.os_reason = OS_REASON_GUARD,
			.exception_type = EXC_GUARD,
			.mx_code = code,
			.mx_subcode = subcode
		};

		if (sync_exception_result == KERN_SUCCESS) {
			flags |= PX_PSIGNAL;
		}
		exit_with_mach_exception(current_proc(), info, flags);
	} else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
		/*
		 * If the synchronous EXC_GUARD delivery was not successful,
		 * raise a simulated crash.
		 */
		if (sync_exception_result != KERN_SUCCESS) {
			task_violated_guard(code, subcode, NULL, FALSE);
		}
	}
}
7661
7662 /*
7663 * vm_map_guard_exception:
7664 *
7665 * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7666 *
7667 * Right now, we do this when we find nothing mapped, or a
7668 * gap in the mapping when a user address space deallocate
7669 * was requested. We report the address of the first gap found.
7670 */
7671 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7672 vm_map_guard_exception(
7673 vm_map_offset_t gap_start,
7674 unsigned reason)
7675 {
7676 mach_exception_code_t code = 0;
7677 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7678 unsigned int target = 0; /* should we pass in pid associated with map? */
7679 mach_exception_data_type_t subcode = (uint64_t)gap_start;
7680 boolean_t fatal = FALSE;
7681
7682 task_t task = current_task_early();
7683
7684 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7685 if (task == NULL || task == kernel_task) {
7686 return;
7687 }
7688
7689 EXC_GUARD_ENCODE_TYPE(code, guard_type);
7690 EXC_GUARD_ENCODE_FLAVOR(code, reason);
7691 EXC_GUARD_ENCODE_TARGET(code, target);
7692
7693 if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7694 fatal = TRUE;
7695 }
7696 thread_guard_violation(current_thread(), code, subcode, fatal);
7697 }
7698
7699 static kern_return_t
vm_map_delete_submap_recurse(vm_map_t submap,vm_map_offset_t submap_start,vm_map_offset_t submap_end)7700 vm_map_delete_submap_recurse(
7701 vm_map_t submap,
7702 vm_map_offset_t submap_start,
7703 vm_map_offset_t submap_end)
7704 {
7705 vm_map_entry_t submap_entry;
7706
7707 /*
7708 * Verify that the submap does not contain any "permanent" entries
7709 * within the specified range.
7710 * We do not care about gaps.
7711 */
7712
7713 vm_map_lock(submap);
7714
7715 if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
7716 submap_entry = submap_entry->vme_next;
7717 }
7718
7719 for (;
7720 submap_entry != vm_map_to_entry(submap) &&
7721 submap_entry->vme_start < submap_end;
7722 submap_entry = submap_entry->vme_next) {
7723 if (submap_entry->vme_permanent) {
7724 /* "permanent" entry -> fail */
7725 vm_map_unlock(submap);
7726 return KERN_PROTECTION_FAILURE;
7727 }
7728 }
7729 /* no "permanent" entries in the range -> success */
7730 vm_map_unlock(submap);
7731 return KERN_SUCCESS;
7732 }
7733
/*
 * Panic helper: vm_map_delete() was given a start address that is
 * not aligned to the map's page size (and the caller did not pass
 * VM_MAP_REMOVE_NO_MAP_ALIGN).
 */
__abortlike
static void
__vm_map_delete_misaligned_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
	    map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
}
7744
/*
 * Panic helper: a vm_map_delete() that was expected to succeed
 * (e.g. on a kernel map) failed with "kr".
 */
__abortlike
static void
__vm_map_delete_failed_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	kern_return_t   kr)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
	    map, (uint64_t)start, (uint64_t)end, kr);
}
7756
/*
 * Panic helper: vm_map_delete() found no map entry at "where"
 * while deleting [start, end) from a map backed by the kernel
 * pmap, where gaps are never tolerated.
 */
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t        map,
	vm_map_offset_t where,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
7768
/*
 * Panic helper: vm_map_delete() attempted to remove a "permanent"
 * VM map entry in the kernel map (or one of its submaps), which is
 * never allowed.
 */
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_entry_t  entry)
{
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
7783
/*
 * State flags accumulated by vm_map_delete() as it walks the
 * requested range.
 */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE            = 0x0000,

	VMDS_FOUND_GAP       = 0x0001, /* a gap was found in the range */
	VMDS_GAPS_OK         = 0x0002, /* map terminated or unreferenced: gaps tolerated */

	VMDS_KERNEL_PMAP     = 0x0004, /* map is backed by the kernel pmap */
	VMDS_NEEDS_LOOKUP    = 0x0008, /* map lock was dropped: entry must be re-looked-up */
	VMDS_NEEDS_WAKEUP    = 0x0010, /* wake up threads waiting on entries we touched */
	VMDS_KERNEL_KMEMPTR  = 0x0020  /* range belongs to a kmem pointer range */
});
7795
7796 /*
7797 * vm_map_delete: [ internal use only ]
7798 *
7799 * Deallocates the given address range from the target map.
7800 * Removes all user wirings. Unwires one kernel wiring if
7801 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
7802 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
7803 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7804 *
7805 *
7806 * When the map is a kernel map, then any error in removing mappings
7807 * will lead to a panic so that clients do not have to repeat the panic
7808 * code at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
7809 * is also passed, then KERN_ABORTED will not lead to a panic.
7810 *
7811 * This routine is called with map locked and leaves map locked.
7812 */
7813 static kmem_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard,vm_map_zap_t zap_list)7814 vm_map_delete(
7815 vm_map_t map,
7816 vm_map_offset_t start,
7817 vm_map_offset_t end,
7818 vmr_flags_t flags,
7819 kmem_guard_t guard,
7820 vm_map_zap_t zap_list)
7821 {
7822 vm_map_entry_t entry, next;
7823 int interruptible;
7824 vm_map_offset_t gap_start = 0;
7825 vm_map_offset_t clear_in_transition_end = 0;
7826 __unused vm_map_offset_t save_start = start;
7827 __unused vm_map_offset_t save_end = end;
7828 vm_map_delete_state_t state = VMDS_NONE;
7829 kmem_return_t ret = { };
7830 vm_map_range_id_t range_id = 0;
7831 struct kmem_page_meta *meta = NULL;
7832 uint32_t size_idx, slot_idx;
7833 struct mach_vm_range slot;
7834
7835 if (vm_map_pmap(map) == kernel_pmap) {
7836 state |= VMDS_KERNEL_PMAP;
7837 range_id = kmem_addr_get_range(start, end - start);
7838 if (kmem_is_ptr_range(range_id)) {
7839 state |= VMDS_KERNEL_KMEMPTR;
7840 slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
7841 &size_idx, &slot);
7842 }
7843 }
7844
7845 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
7846 state |= VMDS_GAPS_OK;
7847 }
7848
7849 if (map->corpse_source &&
7850 !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
7851 !map->terminated) {
7852 /*
7853 * The map is being used for corpses related diagnostics.
7854 * So skip any entry removal to avoid perturbing the map state.
7855 * The cleanup will happen in task_terminate_internal after the
7856 * call to task_port_no_senders.
7857 */
7858 goto out;
7859 }
7860
7861 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
7862 THREAD_ABORTSAFE : THREAD_UNINT;
7863
7864 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
7865 (start & VM_MAP_PAGE_MASK(map))) {
7866 __vm_map_delete_misaligned_panic(map, start, end);
7867 }
7868
7869 if ((state & VMDS_GAPS_OK) == 0) {
7870 /*
7871 * If the map isn't terminated then all deletions must have
7872 * no gaps, and be within the [min, max) of the map.
7873 *
7874 * We got here without VM_MAP_RANGE_CHECK() being called,
7875 * and hence must validate bounds manually.
7876 *
7877 * It is worth noting that because vm_deallocate() will
7878 * round_page() the deallocation size, it's possible for "end"
7879 * to be 0 here due to overflow. We hence must treat it as being
7880 * beyond vm_map_max(map).
7881 *
		 * Similarly, end < start means some wrap around happened,
		 * which should cause an error or panic.
7884 */
7885 if (end == 0 || end > vm_map_max(map)) {
7886 state |= VMDS_FOUND_GAP;
7887 gap_start = vm_map_max(map);
7888 if (state & VMDS_KERNEL_PMAP) {
7889 __vm_map_delete_gap_panic(map,
7890 gap_start, start, end);
7891 }
7892 goto out;
7893 }
7894
7895 if (end < start) {
7896 if (state & VMDS_KERNEL_PMAP) {
7897 __vm_map_delete_gap_panic(map,
7898 vm_map_max(map), start, end);
7899 }
7900 ret.kmr_return = KERN_INVALID_ARGUMENT;
7901 goto out;
7902 }
7903
7904 if (start < vm_map_min(map)) {
7905 state |= VMDS_FOUND_GAP;
7906 gap_start = start;
7907 if (state & VMDS_KERNEL_PMAP) {
7908 __vm_map_delete_gap_panic(map,
7909 gap_start, start, end);
7910 }
7911 goto out;
7912 }
7913 } else {
7914 /*
7915 * If the map is terminated, we must accept start/end
7916 * being beyond the boundaries of the map as this is
7917 * how some of the mappings like commpage mappings
7918 * can be destroyed (they're outside of those bounds).
7919 *
7920 * end < start is still something we can't cope with,
7921 * so just bail.
7922 */
7923 if (end < start) {
7924 goto out;
7925 }
7926 }
7927
7928
7929 /*
7930 * Find the start of the region.
7931 *
7932 * If in a superpage, extend the range
7933 * to include the start of the mapping.
7934 */
7935 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
7936 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
7937 start = SUPERPAGE_ROUND_DOWN(start);
7938 } else {
7939 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
7940 break;
7941 }
7942 }
7943
7944 if (entry->superpage_size) {
7945 end = SUPERPAGE_ROUND_UP(end);
7946 }
7947
7948 /*
7949 * Step through all entries in this region
7950 */
7951 for (vm_map_offset_t s = start; s < end;) {
7952 /*
7953 * At this point, we have deleted all the memory entries
7954 * in [start, s) and are proceeding with the [s, end) range.
7955 *
7956 * This loop might drop the map lock, and it is possible that
7957 * some memory was already reallocated within [start, s)
7958 * and we don't want to mess with those entries.
7959 *
7960 * Some of those entries could even have been re-assembled
7961 * with an entry after "s" (in vm_map_simplify_entry()), so
7962 * we may have to vm_map_clip_start() again.
7963 *
7964 * When clear_in_transition_end is set, the we had marked
7965 * [start, clear_in_transition_end) as "in_transition"
7966 * during a previous iteration and we need to clear it.
7967 */
7968
7969 /*
7970 * Step 1: If needed (because we dropped locks),
7971 * lookup the entry again.
7972 *
7973 * If we're coming back from unwiring (Step 5),
7974 * we also need to mark the entries as no longer
7975 * in transition after that.
7976 */
7977
7978 if (state & VMDS_NEEDS_LOOKUP) {
7979 state &= ~VMDS_NEEDS_LOOKUP;
7980
7981 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
7982 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
7983 }
7984
7985 if (state & VMDS_KERNEL_KMEMPTR) {
7986 kmem_validate_slot(s, meta, size_idx, slot_idx);
7987 }
7988 }
7989
7990 if (clear_in_transition_end) {
7991 for (vm_map_entry_t it = entry;
7992 it != vm_map_to_entry(map) &&
7993 it->vme_start < clear_in_transition_end;
7994 it = it->vme_next) {
7995 assert(it->in_transition);
7996 it->in_transition = FALSE;
7997 if (it->needs_wakeup) {
7998 it->needs_wakeup = FALSE;
7999 state |= VMDS_NEEDS_WAKEUP;
8000 }
8001 }
8002
8003 clear_in_transition_end = 0;
8004 }
8005
8006
8007 /*
8008 * Step 2: Perform various policy checks
8009 * before we do _anything_ to this entry.
8010 */
8011
8012 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8013 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8014 /*
8015 * Either we found a gap already,
8016 * or we are tearing down a map,
8017 * keep going.
8018 */
8019 } else if (state & VMDS_KERNEL_PMAP) {
8020 __vm_map_delete_gap_panic(map, s, start, end);
8021 } else if (s < end) {
8022 state |= VMDS_FOUND_GAP;
8023 gap_start = s;
8024 }
8025
8026 if (entry == vm_map_to_entry(map) ||
8027 end <= entry->vme_start) {
8028 break;
8029 }
8030
8031 s = entry->vme_start;
8032 }
8033
8034 if (state & VMDS_KERNEL_PMAP) {
8035 /*
8036 * In the kernel map and its submaps,
8037 * permanent entries never die, even
8038 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8039 */
8040 if (entry->vme_permanent) {
8041 __vm_map_delete_permanent_panic(map, start, end, entry);
8042 }
8043
8044 if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8045 end = entry->vme_end;
8046 flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8047 }
8048
8049 /*
8050 * In the kernel map and its submaps,
8051 * the removal of an atomic/guarded entry is strict.
8052 *
8053 * An atomic entry is processed only if it was
8054 * specifically targeted.
8055 *
8056 * We might have deleted non-atomic entries before
			 * we reach this point however...
8058 */
8059 kmem_entry_validate_guard(map, entry,
8060 start, end - start, guard);
8061 }
8062
8063 /*
8064 * Step 2.1: handle "permanent" and "submap" entries
8065 * *before* clipping to avoid triggering some unnecessary
8066 * un-nesting of the shared region.
8067 */
8068 if (entry->vme_permanent && entry->is_sub_map) {
8069 // printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8070 /*
8071 * Un-mapping a "permanent" mapping of a user-space
8072 * submap is not allowed unless...
8073 */
8074 if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8075 /*
8076 * a. explicitly requested by the kernel caller.
8077 */
8078 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8079 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8080 developer_mode_state()) {
8081 /*
8082 * b. we're in "developer" mode (for
8083 * breakpoints, dtrace probes, ...).
8084 */
8085 // printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8086 } else if (map->terminated) {
8087 /*
8088 * c. this is the final address space cleanup.
8089 */
8090 // printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8091 } else {
8092 vm_map_offset_t submap_start, submap_end;
8093 kern_return_t submap_kr;
8094
8095 /*
8096 * Check if there are any "permanent" mappings
8097 * in this range in the submap.
8098 */
8099 if (entry->in_transition) {
8100 /* can that even happen ? */
8101 goto in_transition;
8102 }
8103 /* compute the clipped range in the submap */
8104 submap_start = s - entry->vme_start;
8105 submap_start += VME_OFFSET(entry);
8106 submap_end = end - entry->vme_start;
8107 submap_end += VME_OFFSET(entry);
8108 submap_kr = vm_map_delete_submap_recurse(
8109 VME_SUBMAP(entry),
8110 submap_start,
8111 submap_end);
8112 if (submap_kr != KERN_SUCCESS) {
8113 /*
8114 * There are some "permanent" mappings
8115 * in the submap: we are not allowed
8116 * to remove this range.
8117 */
8118 printf("%d[%s] removing permanent submap entry "
8119 "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8120 proc_selfpid(),
8121 (get_bsdtask_info(current_task())
8122 ? proc_name_address(get_bsdtask_info(current_task()))
8123 : "?"), entry,
8124 (uint64_t)entry->vme_start,
8125 (uint64_t)entry->vme_end,
8126 entry->protection,
8127 entry->max_protection);
8128 DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8129 vm_map_entry_t, entry,
8130 vm_map_offset_t, entry->vme_start,
8131 vm_map_offset_t, entry->vme_end,
8132 vm_prot_t, entry->protection,
8133 vm_prot_t, entry->max_protection,
8134 int, VME_ALIAS(entry));
8135 ret.kmr_return = KERN_PROTECTION_FAILURE;
8136 goto out;
8137 }
8138 /* no permanent mappings: proceed */
8139 }
8140 }
8141
8142 /*
8143 * Step 3: Perform any clipping needed.
8144 *
8145 * After this, "entry" starts at "s", ends before "end"
8146 */
8147
8148 if (entry->vme_start < s) {
8149 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8150 entry->map_aligned &&
8151 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8152 /*
8153 * The entry will no longer be map-aligned
8154 * after clipping and the caller said it's OK.
8155 */
8156 entry->map_aligned = FALSE;
8157 }
8158 vm_map_clip_start(map, entry, s);
8159 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8160 }
8161
8162 if (end < entry->vme_end) {
8163 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8164 entry->map_aligned &&
8165 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8166 /*
8167 * The entry will no longer be map-aligned
8168 * after clipping and the caller said it's OK.
8169 */
8170 entry->map_aligned = FALSE;
8171 }
8172 vm_map_clip_end(map, entry, end);
8173 }
8174
8175 if (entry->vme_permanent && entry->is_sub_map) {
8176 /*
8177 * We already went through step 2.1 which did not deny
8178 * the removal of this "permanent" and "is_sub_map"
8179 * entry.
8180 * Now that we've clipped what we actually want to
8181 * delete, undo the "permanent" part to allow the
8182 * removal to proceed.
8183 */
8184 DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8185 vm_map_entry_t, entry,
8186 vm_map_offset_t, entry->vme_start,
8187 vm_map_offset_t, entry->vme_end,
8188 vm_prot_t, entry->protection,
8189 vm_prot_t, entry->max_protection,
8190 int, VME_ALIAS(entry));
8191 entry->vme_permanent = false;
8192 }
8193
8194 assert(s == entry->vme_start);
8195 assert(entry->vme_end <= end);
8196
8197
8198 /*
8199 * Step 4: If the entry is in flux, wait for this to resolve.
8200 */
8201
8202 if (entry->in_transition) {
8203 wait_result_t wait_result;
8204
8205 in_transition:
8206 /*
8207 * Another thread is wiring/unwiring this entry.
8208 * Let the other thread know we are waiting.
8209 */
8210
8211 entry->needs_wakeup = TRUE;
8212
8213 /*
8214 * wake up anybody waiting on entries that we have
8215 * already unwired/deleted.
8216 */
8217 if (state & VMDS_NEEDS_WAKEUP) {
8218 vm_map_entry_wakeup(map);
8219 state &= ~VMDS_NEEDS_WAKEUP;
8220 }
8221
8222 wait_result = vm_map_entry_wait(map, interruptible);
8223
8224 if (interruptible &&
8225 wait_result == THREAD_INTERRUPTED) {
8226 /*
8227 * We do not clear the needs_wakeup flag,
8228 * since we cannot tell if we were the only one.
8229 */
8230 ret.kmr_return = KERN_ABORTED;
8231 return ret;
8232 }
8233
8234 /*
8235 * The entry could have been clipped or it
8236 * may not exist anymore. Look it up again.
8237 */
8238 state |= VMDS_NEEDS_LOOKUP;
8239 continue;
8240 }
8241
8242
8243 /*
8244 * Step 5: Handle wiring
8245 */
8246
8247 if (entry->wired_count) {
8248 struct vm_map_entry tmp_entry;
8249 boolean_t user_wire;
8250 unsigned int last_timestamp;
8251
8252 user_wire = entry->user_wired_count > 0;
8253
8254 /*
8255 * Remove a kernel wiring if requested
8256 */
8257 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8258 entry->wired_count--;
8259 vme_btref_consider_and_put(entry);
8260 }
8261
8262 /*
8263 * Remove all user wirings for proper accounting
8264 */
8265 while (entry->user_wired_count) {
8266 subtract_wire_counts(map, entry, user_wire);
8267 }
8268
8269 /*
8270 * All our DMA I/O operations in IOKit are currently
8271 * done by wiring through the map entries of the task
8272 * requesting the I/O.
8273 *
8274 * Because of this, we must always wait for kernel wirings
8275 * to go away on the entries before deleting them.
8276 *
8277 * Any caller who wants to actually remove a kernel wiring
8278 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8279 * properly remove one wiring instead of blasting through
8280 * them all.
8281 */
8282 if (entry->wired_count != 0) {
8283 assert(map != kernel_map);
8284 /*
8285 * Cannot continue. Typical case is when
8286 * a user thread has physical io pending on
8287 * on this page. Either wait for the
8288 * kernel wiring to go away or return an
8289 * error.
8290 */
8291 wait_result_t wait_result;
8292
8293 entry->needs_wakeup = TRUE;
8294 wait_result = vm_map_entry_wait(map,
8295 interruptible);
8296
8297 if (interruptible &&
8298 wait_result == THREAD_INTERRUPTED) {
8299 /*
8300 * We do not clear the
8301 * needs_wakeup flag, since we
8302 * cannot tell if we were the
8303 * only one.
8304 */
8305 ret.kmr_return = KERN_ABORTED;
8306 return ret;
8307 }
8308
8309
8310 /*
8311 * The entry could have been clipped or
8312 * it may not exist anymore. Look it
8313 * up again.
8314 */
8315 state |= VMDS_NEEDS_LOOKUP;
8316 continue;
8317 }
8318
8319 /*
8320 * We can unlock the map now.
8321 *
8322 * The entry might be split once we unlock the map,
8323 * but we need the range as defined by this entry
8324 * to be stable. So we must make a local copy.
8325 *
8326 * The underlying objects do not change during clips,
			 * and the in_transition state guarantees existence
8328 * of the entry.
8329 */
8330 last_timestamp = map->timestamp;
8331 entry->in_transition = TRUE;
8332 tmp_entry = *entry;
8333 vm_map_unlock(map);
8334
8335 if (tmp_entry.is_sub_map) {
8336 vm_map_t sub_map;
8337 vm_map_offset_t sub_start, sub_end;
8338 pmap_t pmap;
8339 vm_map_offset_t pmap_addr;
8340
8341
8342 sub_map = VME_SUBMAP(&tmp_entry);
8343 sub_start = VME_OFFSET(&tmp_entry);
8344 sub_end = sub_start + (tmp_entry.vme_end -
8345 tmp_entry.vme_start);
8346 if (tmp_entry.use_pmap) {
8347 pmap = sub_map->pmap;
8348 pmap_addr = tmp_entry.vme_start;
8349 } else {
8350 pmap = map->pmap;
8351 pmap_addr = tmp_entry.vme_start;
8352 }
8353 (void) vm_map_unwire_nested(sub_map,
8354 sub_start, sub_end,
8355 user_wire,
8356 pmap, pmap_addr);
8357 } else {
8358 vm_map_offset_t entry_end = tmp_entry.vme_end;
8359 vm_map_offset_t max_end;
8360
8361 if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8362 max_end = end - VM_MAP_PAGE_SIZE(map);
8363 if (entry_end > max_end) {
8364 entry_end = max_end;
8365 }
8366 }
8367
8368 if (tmp_entry.vme_kernel_object) {
8369 pmap_protect_options(
8370 map->pmap,
8371 tmp_entry.vme_start,
8372 entry_end,
8373 VM_PROT_NONE,
8374 PMAP_OPTIONS_REMOVE,
8375 NULL);
8376 }
8377 vm_fault_unwire(map, &tmp_entry,
8378 tmp_entry.vme_kernel_object, map->pmap,
8379 tmp_entry.vme_start, entry_end);
8380 }
8381
8382 vm_map_lock(map);
8383
8384 /*
8385 * Unwiring happened, we can now go back to deleting
8386 * them (after we clear the in_transition bit for the range).
8387 */
8388 if (last_timestamp + 1 != map->timestamp) {
8389 state |= VMDS_NEEDS_LOOKUP;
8390 }
8391 clear_in_transition_end = tmp_entry.vme_end;
8392 continue;
8393 }
8394
8395 assert(entry->wired_count == 0);
8396 assert(entry->user_wired_count == 0);
8397
8398
8399 /*
8400 * Step 6: Entry is unwired and ready for us to delete !
8401 */
8402
8403 if (!entry->vme_permanent) {
8404 /*
8405 * Typical case: the entry really shouldn't be permanent
8406 */
8407 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8408 (entry->protection & VM_PROT_EXECUTE) &&
8409 developer_mode_state()) {
8410 /*
8411 * Allow debuggers to undo executable mappings
8412 * when developer mode is on.
8413 */
8414 #if 0
8415 printf("FBDP %d[%s] removing permanent executable entry "
8416 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8417 proc_selfpid(),
8418 (current_task()->bsd_info
8419 ? proc_name_address(current_task()->bsd_info)
8420 : "?"), entry,
8421 (uint64_t)entry->vme_start,
8422 (uint64_t)entry->vme_end,
8423 entry->protection,
8424 entry->max_protection);
8425 #endif
8426 entry->vme_permanent = FALSE;
8427 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8428 #if 0
8429 printf("FBDP %d[%s] removing permanent entry "
8430 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8431 proc_selfpid(),
8432 (current_task()->bsd_info
8433 ? proc_name_address(current_task()->bsd_info)
8434 : "?"), entry,
8435 (uint64_t)entry->vme_start,
8436 (uint64_t)entry->vme_end,
8437 entry->protection,
8438 entry->max_protection);
8439 #endif
8440 entry->vme_permanent = FALSE;
8441 #if CODE_SIGNING_MONITOR
8442 } else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8443 entry->vme_permanent = FALSE;
8444
8445 printf("%d[%s] %s(0x%llx,0x%llx): "
8446 "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8447 "prot 0x%x/0x%x\n",
8448 proc_selfpid(),
8449 (get_bsdtask_info(current_task())
8450 ? proc_name_address(get_bsdtask_info(current_task()))
8451 : "?"),
8452 __FUNCTION__,
8453 (uint64_t)start,
8454 (uint64_t)end,
8455 (uint64_t)entry->vme_start,
8456 (uint64_t)entry->vme_end,
8457 entry->protection,
8458 entry->max_protection);
8459 #endif
8460 } else {
8461 DTRACE_VM6(vm_map_delete_permanent,
8462 vm_map_entry_t, entry,
8463 vm_map_offset_t, entry->vme_start,
8464 vm_map_offset_t, entry->vme_end,
8465 vm_prot_t, entry->protection,
8466 vm_prot_t, entry->max_protection,
8467 int, VME_ALIAS(entry));
8468 }
8469
8470 if (entry->is_sub_map) {
8471 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8472 "map %p (%d) entry %p submap %p (%d)\n",
8473 map, VM_MAP_PAGE_SHIFT(map), entry,
8474 VME_SUBMAP(entry),
8475 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8476 if (entry->use_pmap) {
8477 #ifndef NO_NESTED_PMAP
8478 int pmap_flags;
8479
8480 if (map->terminated) {
8481 /*
8482 * This is the final cleanup of the
8483 * address space being terminated.
8484 * No new mappings are expected and
8485 * we don't really need to unnest the
8486 * shared region (and lose the "global"
8487 * pmap mappings, if applicable).
8488 *
8489 * Tell the pmap layer that we're
8490 * "clean" wrt nesting.
8491 */
8492 pmap_flags = PMAP_UNNEST_CLEAN;
8493 } else {
8494 /*
8495 * We're unmapping part of the nested
8496 * shared region, so we can't keep the
8497 * nested pmap.
8498 */
8499 pmap_flags = 0;
8500 }
8501 pmap_unnest_options(
8502 map->pmap,
8503 (addr64_t)entry->vme_start,
8504 entry->vme_end - entry->vme_start,
8505 pmap_flags);
8506 #endif /* NO_NESTED_PMAP */
8507 if (map->mapped_in_other_pmaps &&
8508 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8509 /* clean up parent map/maps */
8510 vm_map_submap_pmap_clean(
8511 map, entry->vme_start,
8512 entry->vme_end,
8513 VME_SUBMAP(entry),
8514 VME_OFFSET(entry));
8515 }
8516 } else {
8517 vm_map_submap_pmap_clean(
8518 map, entry->vme_start, entry->vme_end,
8519 VME_SUBMAP(entry),
8520 VME_OFFSET(entry));
8521 }
8522 } else if (entry->vme_kernel_object ||
8523 VME_OBJECT(entry) == compressor_object) {
8524 /*
8525 * nothing to do
8526 */
8527 } else if (map->mapped_in_other_pmaps &&
8528 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8529 vm_object_pmap_protect_options(
8530 VME_OBJECT(entry), VME_OFFSET(entry),
8531 entry->vme_end - entry->vme_start,
8532 PMAP_NULL,
8533 PAGE_SIZE,
8534 entry->vme_start,
8535 VM_PROT_NONE,
8536 PMAP_OPTIONS_REMOVE);
8537 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8538 (state & VMDS_KERNEL_PMAP)) {
8539 /* Remove translations associated
8540 * with this range unless the entry
8541 * does not have an object, or
8542 * it's the kernel map or a descendant
8543 * since the platform could potentially
8544 * create "backdoor" mappings invisible
8545 * to the VM. It is expected that
8546 * objectless, non-kernel ranges
8547 * do not have such VM invisible
8548 * translations.
8549 */
8550 pmap_remove_options(map->pmap,
8551 (addr64_t)entry->vme_start,
8552 (addr64_t)entry->vme_end,
8553 PMAP_OPTIONS_REMOVE);
8554 }
8555
8556 #if DEBUG
8557 /*
8558 * All pmap mappings for this map entry must have been
8559 * cleared by now.
8560 */
8561 assert(pmap_is_empty(map->pmap,
8562 entry->vme_start,
8563 entry->vme_end));
8564 #endif /* DEBUG */
8565
8566 if (entry->iokit_acct) {
8567 /* alternate accounting */
8568 DTRACE_VM4(vm_map_iokit_unmapped_region,
8569 vm_map_t, map,
8570 vm_map_offset_t, entry->vme_start,
8571 vm_map_offset_t, entry->vme_end,
8572 int, VME_ALIAS(entry));
8573 vm_map_iokit_unmapped_region(map,
8574 (entry->vme_end -
8575 entry->vme_start));
8576 entry->iokit_acct = FALSE;
8577 entry->use_pmap = FALSE;
8578 }
8579
8580 /* move "s" forward */
8581 s = entry->vme_end;
8582 next = entry->vme_next;
8583 if (!entry->map_aligned) {
8584 vm_map_offset_t rounded_s;
8585
8586 /*
8587 * Skip artificial gap due to mis-aligned entry
8588 * on devices with a page size smaller than the
8589 * map's page size (i.e. 16k task on a 4k device).
8590 */
8591 rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8592 if (next == vm_map_to_entry(map)) {
8593 s = rounded_s;
8594 } else if (s < rounded_s) {
8595 s = MIN(rounded_s, next->vme_start);
8596 }
8597 }
8598 ret.kmr_size += s - entry->vme_start;
8599
8600 if (entry->vme_permanent) {
8601 /*
8602 * A permanent entry can not be removed, so leave it
8603 * in place but remove all access permissions.
8604 */
8605 if (!entry->csm_associated) {
8606 printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8607 __FUNCTION__, __LINE__,
8608 proc_selfpid(),
8609 (get_bsdtask_info(current_task())
8610 ? proc_name_address(get_bsdtask_info(current_task()))
8611 : "?"),
8612 map,
8613 entry,
8614 (uint64_t)entry->vme_start,
8615 (uint64_t)entry->vme_end,
8616 entry->is_sub_map,
8617 entry->protection,
8618 entry->max_protection);
8619 }
8620 DTRACE_VM6(vm_map_delete_permanent_prot_none,
8621 vm_map_entry_t, entry,
8622 vm_map_offset_t, entry->vme_start,
8623 vm_map_offset_t, entry->vme_end,
8624 vm_prot_t, entry->protection,
8625 vm_prot_t, entry->max_protection,
8626 int, VME_ALIAS(entry));
8627 entry->protection = VM_PROT_NONE;
8628 entry->max_protection = VM_PROT_NONE;
8629 } else {
8630 vm_map_entry_zap(map, entry, zap_list);
8631 }
8632
8633 entry = next;
8634 next = VM_MAP_ENTRY_NULL;
8635
8636 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8637 unsigned int last_timestamp = map->timestamp++;
8638
8639 if (lck_rw_lock_yield_exclusive(&map->lock,
8640 LCK_RW_YIELD_ANY_WAITER)) {
8641 if (last_timestamp != map->timestamp + 1) {
8642 state |= VMDS_NEEDS_LOOKUP;
8643 }
8644 } else {
8645 /* we didn't yield, undo our change */
8646 map->timestamp--;
8647 }
8648 }
8649 }
8650
8651 if (map->wait_for_space) {
8652 thread_wakeup((event_t) map);
8653 }
8654
8655 if (state & VMDS_NEEDS_WAKEUP) {
8656 vm_map_entry_wakeup(map);
8657 }
8658
8659 out:
8660 if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
8661 __vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8662 }
8663
8664 if (state & VMDS_KERNEL_KMEMPTR) {
8665 kmem_free_space(start, end, range_id, &slot);
8666 }
8667
8668 if (state & VMDS_FOUND_GAP) {
8669 DTRACE_VM3(kern_vm_deallocate_gap,
8670 vm_map_offset_t, gap_start,
8671 vm_map_offset_t, save_start,
8672 vm_map_offset_t, save_end);
8673 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8674 ret.kmr_return = KERN_INVALID_VALUE;
8675 } else {
8676 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8677 }
8678 }
8679
8680 return ret;
8681 }
8682
8683 kmem_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8684 vm_map_remove_and_unlock(
8685 vm_map_t map,
8686 vm_map_offset_t start,
8687 vm_map_offset_t end,
8688 vmr_flags_t flags,
8689 kmem_guard_t guard)
8690 {
8691 kmem_return_t ret;
8692 VM_MAP_ZAP_DECLARE(zap);
8693
8694 ret = vm_map_delete(map, start, end, flags, guard, &zap);
8695 vm_map_unlock(map);
8696
8697 vm_map_zap_dispose(&zap);
8698
8699 return ret;
8700 }
8701
8702 /*
8703 * vm_map_remove_guard:
8704 *
8705 * Remove the given address range from the target map.
8706 * This is the exported form of vm_map_delete.
8707 */
8708 kmem_return_t
vm_map_remove_guard(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,kmem_guard_t guard)8709 vm_map_remove_guard(
8710 vm_map_t map,
8711 vm_map_offset_t start,
8712 vm_map_offset_t end,
8713 vmr_flags_t flags,
8714 kmem_guard_t guard)
8715 {
8716 vm_map_lock(map);
8717 return vm_map_remove_and_unlock(map, start, end, flags, guard);
8718 }
8719
8720 /*
8721 * vm_map_terminate:
8722 *
8723 * Clean out a task's map.
8724 */
kern_return_t
vm_map_terminate(
	vm_map_t map)
{
	vm_map_lock(map);
	/*
	 * Mark the map as terminated *before* deleting its contents:
	 * vm_map_delete() relaxes some restrictions for a terminated map
	 * (it allows removal of "permanent" entries and keeps nested
	 * pmaps clean, see the map->terminated checks in vm_map_delete()).
	 */
	map->terminated = TRUE;
	/* the whole address range is going away; stop tracking holes */
	vm_map_disable_hole_optimization(map);
	/* removes every entry in the map and drops the map lock */
	(void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
	return KERN_SUCCESS;
}
8736
8737 /*
8738 * Routine: vm_map_copy_allocate
8739 *
8740 * Description:
8741 * Allocates and initializes a map copy object.
8742 */
8743 static vm_map_copy_t
vm_map_copy_allocate(uint16_t type)8744 vm_map_copy_allocate(uint16_t type)
8745 {
8746 vm_map_copy_t new_copy;
8747
8748 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
8749 new_copy->type = type;
8750 if (type == VM_MAP_COPY_ENTRY_LIST) {
8751 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8752 vm_map_store_init(&new_copy->cpy_hdr);
8753 }
8754 return new_copy;
8755 }
8756
8757 /*
8758 * Routine: vm_map_copy_discard
8759 *
8760 * Description:
8761 * Dispose of a map copy object (returned by
8762 * vm_map_copyin).
8763 */
8764 void
vm_map_copy_discard(vm_map_copy_t copy)8765 vm_map_copy_discard(
8766 vm_map_copy_t copy)
8767 {
8768 if (copy == VM_MAP_COPY_NULL) {
8769 return;
8770 }
8771
8772 /*
8773 * Assert that the vm_map_copy is coming from the right
8774 * zone and hasn't been forged
8775 */
8776 vm_map_copy_require(copy);
8777
8778 switch (copy->type) {
8779 case VM_MAP_COPY_ENTRY_LIST:
8780 while (vm_map_copy_first_entry(copy) !=
8781 vm_map_copy_to_entry(copy)) {
8782 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
8783
8784 vm_map_copy_entry_unlink(copy, entry);
8785 if (entry->is_sub_map) {
8786 vm_map_deallocate(VME_SUBMAP(entry));
8787 } else {
8788 vm_object_deallocate(VME_OBJECT(entry));
8789 }
8790 vm_map_copy_entry_dispose(entry);
8791 }
8792 break;
8793 case VM_MAP_COPY_KERNEL_BUFFER:
8794
8795 /*
8796 * The vm_map_copy_t and possibly the data buffer were
8797 * allocated by a single call to kalloc_data(), i.e. the
8798 * vm_map_copy_t was not allocated out of the zone.
8799 */
8800 if (copy->size > msg_ool_size_small || copy->offset) {
8801 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
8802 (long long)copy->size, (long long)copy->offset);
8803 }
8804 kfree_data(copy->cpy_kdata, copy->size);
8805 }
8806 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
8807 }
8808
8809 #if XNU_PLATFORM_MacOSX
8810
8811 __exported
8812 extern vm_map_copy_t vm_map_copy_copy(vm_map_copy_t copy);
8813
8814 /*
8815 * Routine: vm_map_copy_copy
8816 *
8817 * Description:
8818 * Move the information in a map copy object to
8819 * a new map copy object, leaving the old one
8820 * empty.
8821 *
8822 * This is used by kernel routines that need
8823 * to look at out-of-line data (in copyin form)
8824 * before deciding whether to return SUCCESS.
8825 * If the routine returns FAILURE, the original
8826 * copy object will be deallocated; therefore,
8827 * these routines must make a copy of the copy
8828 * object and leave the original empty so that
8829 * deallocation will not fail.
8830 */
vm_map_copy_t
vm_map_copy_copy(
	vm_map_copy_t copy)
{
	vm_map_copy_t new_copy;

	if (copy == VM_MAP_COPY_NULL) {
		return VM_MAP_COPY_NULL;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 * Allocate a new copy object, and copy the information
	 * from the old one into it.
	 */

	new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
	memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
#if __has_feature(ptrauth_calls)
	/*
	 * NOTE(review): cpy_kdata is presumably a signed pointer whose
	 * signature is diversified by its storage address; memcpy() above
	 * only moved the raw bits, so this explicit field-to-field store
	 * re-signs it for its new location — TODO confirm.
	 */
	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		new_copy->cpy_kdata = copy->cpy_kdata;
	}
#endif

	if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * The links in the entry chain must be
		 * changed to point to the new copy object.
		 */
		vm_map_copy_first_entry(copy)->vme_prev
		        = vm_map_copy_to_entry(new_copy);
		vm_map_copy_last_entry(copy)->vme_next
		        = vm_map_copy_to_entry(new_copy);
	}

	/*
	 * Change the old copy object into one that contains
	 * nothing to be deallocated: a zero-size KERNEL_BUFFER copy,
	 * which vm_map_copy_discard() handles by freeing only the
	 * vm_map_copy_t itself (kfree_data(NULL, 0) is a no-op).
	 */
	bzero(copy, sizeof(struct vm_map_copy));
	copy->type = VM_MAP_COPY_KERNEL_BUFFER;

	/*
	 * Return the new object.
	 */
	return new_copy;
}
8883
8884 #endif /* XNU_PLATFORM_MacOSX */
8885
8886 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)8887 vm_map_entry_is_overwritable(
8888 vm_map_t dst_map __unused,
8889 vm_map_entry_t entry)
8890 {
8891 if (!(entry->protection & VM_PROT_WRITE)) {
8892 /* can't overwrite if not writable */
8893 return FALSE;
8894 }
8895 #if !__x86_64__
8896 if (entry->used_for_jit &&
8897 vm_map_cs_enforcement(dst_map) &&
8898 !dst_map->cs_debugged) {
8899 /*
8900 * Can't overwrite a JIT region while cs_enforced
8901 * and not cs_debugged.
8902 */
8903 return FALSE;
8904 }
8905
8906 #if __arm64e__
8907 /* Do not allow overwrite HW assisted TPRO entries */
8908 if (entry->used_for_tpro) {
8909 return FALSE;
8910 }
8911 #endif /* __arm64e__ */
8912
8913 if (entry->vme_permanent) {
8914 if (entry->is_sub_map) {
8915 /*
8916 * We can't tell if the submap contains "permanent"
8917 * entries within the range targeted by the caller.
8918 * The caller will have to check for that with
8919 * vm_map_overwrite_submap_recurse() for example.
8920 */
8921 } else {
8922 /*
8923 * Do not allow overwriting of a "permanent"
8924 * entry.
8925 */
8926 DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
8927 vm_map_entry_t, entry,
8928 vm_map_offset_t, entry->vme_start,
8929 vm_map_offset_t, entry->vme_end,
8930 vm_prot_t, entry->protection,
8931 vm_prot_t, entry->max_protection,
8932 int, VME_ALIAS(entry));
8933 return FALSE;
8934 }
8935 }
8936 #endif /* !__x86_64__ */
8937 return TRUE;
8938 }
8939
/*
 *	Routine:	vm_map_overwrite_submap_recurse
 *
 *	Description:
 *		Verify that the range [dst_addr, dst_addr + dst_size) of
 *		"dst_map" can be overwritten: every entry in the range must
 *		be writeable and overwritable (vm_map_entry_is_overwritable()),
 *		and the range must be contiguous.  Submap entries are checked
 *		by recursing into the submap.  Entries "in transition" are
 *		waited for (uninterruptibly), then the scan restarts.
 *
 *	Returns:
 *		KERN_SUCCESS		range is overwritable
 *		KERN_INVALID_ADDRESS	lookup failed / range not contiguous
 *		KERN_PROTECTION_FAILURE	entry not writable/overwritable
 *		KERN_FAILURE		permanent object found after a submap
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t   dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   result;
	boolean_t       encountered_sub_map = FALSE;

	/*
	 * Verify that the destination is all writeable
	 * initially. We have to trunc the destination
	 * address and round the copy size or we'll end up
	 * splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	for (entry = tmp_entry;;) {
		vm_map_entry_t  next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			/* remembered for the permanent-object check below */
			encountered_sub_map = TRUE;
			sub_start = VME_OFFSET(entry);

			/* clamp the recursion range to dst_end, then rebase
			 * it into the submap's address space */
			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			vm_map_unlock(dst_map);

			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			if (result != KERN_SUCCESS) {
				return result;
			}
			/*
			 * NOTE(review): entry->vme_end is read here with
			 * dst_map unlocked — presumably safe in context, but
			 * "local_end" holds the same value captured under the
			 * lock; confirm against upstream.
			 */
			if (dst_end <= entry->vme_end) {
				return KERN_SUCCESS;
			}
			vm_map_lock(dst_map);
			/*
			 * The map was unlocked during the recursion: re-lookup
			 * from the end of the submap entry we just validated.
			 */
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * If the entry is in transition, we must wait
		 * for it to exit that state.  Anything could happen
		 * when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

		/*
		 * our range is contained completely within this map entry
		 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
		/*
		 * check that range specified is contiguous region
		 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * Check for permanent objects in the destination.
		 * Only a failure here when a submap was also crossed; the
		 * top-level caller tracks permanent objects itself (see
		 * contains_permanent_objects in vm_map_copy_overwrite_nested).
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}

		entry = next;
	}/* for */
	/* NOTREACHED: the loop above only exits via return */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
9092
9093 /*
9094 * Routine: vm_map_copy_overwrite
9095 *
9096 * Description:
9097 * Copy the memory described by the map copy
9098 * object (copy; returned by vm_map_copyin) onto
9099 * the specified destination region (dst_map, dst_addr).
9100 * The destination must be writeable.
9101 *
9102 * Unlike vm_map_copyout, this routine actually
9103 * writes over previously-mapped memory. If the
9104 * previous mapping was to a permanent (user-supplied)
9105 * memory object, it is preserved.
9106 *
9107 * The attributes (protection and inheritance) of the
9108 * destination region are preserved.
9109 *
9110 * If successful, consumes the copy object.
9111 * Otherwise, the caller is responsible for it.
9112 *
9113 * Implementation notes:
9114 * To overwrite aligned temporary virtual memory, it is
9115 * sufficient to remove the previous mapping and insert
9116 * the new copy. This replacement is done either on
9117 * the whole region (if no permanent virtual memory
9118 * objects are embedded in the destination region) or
9119 * in individual map entries.
9120 *
 * To overwrite permanent virtual memory, it is necessary
9122 * to copy each page, as the external memory management
9123 * interface currently does not provide any optimizations.
9124 *
9125 * Unaligned memory also has to be copied. It is possible
9126 * to use 'vm_trickery' to copy the aligned data. This is
9127 * not done but not hard to implement.
9128 *
9129 * Once a page of permanent memory has been overwritten,
9130 * it is impossible to interrupt this function; otherwise,
9131 * the call would be neither atomic nor location-independent.
9132 * The kernel-state portion of a user thread must be
9133 * interruptible.
9134 *
9135 * It may be expensive to forward all requests that might
9136 * overwrite permanent memory (vm_write, vm_copy) to
9137 * uninterruptible kernel threads. This routine may be
9138 * called by interruptible threads; however, success is
9139 * not guaranteed -- if the request cannot be performed
9140 * atomically and interruptibly, an error indication is
9141 * returned.
9142 *
9143 * Callers of this function must call vm_map_copy_require on
9144 * previously created vm_map_copy_t or pass a newly created
9145 * one to ensure that it hasn't been forged.
9146 */
9147 static kern_return_t
vm_map_copy_overwrite_nested(vm_map_t dst_map,vm_map_address_t dst_addr,vm_map_copy_t copy,boolean_t interruptible,pmap_t pmap,boolean_t discard_on_success)9148 vm_map_copy_overwrite_nested(
9149 vm_map_t dst_map,
9150 vm_map_address_t dst_addr,
9151 vm_map_copy_t copy,
9152 boolean_t interruptible,
9153 pmap_t pmap,
9154 boolean_t discard_on_success)
9155 {
9156 vm_map_offset_t dst_end;
9157 vm_map_entry_t tmp_entry;
9158 vm_map_entry_t entry;
9159 kern_return_t kr;
9160 boolean_t aligned = TRUE;
9161 boolean_t contains_permanent_objects = FALSE;
9162 boolean_t encountered_sub_map = FALSE;
9163 vm_map_offset_t base_addr;
9164 vm_map_size_t copy_size;
9165 vm_map_size_t total_size;
9166 uint16_t copy_page_shift;
9167
9168 /*
9169 * Check for special kernel buffer allocated
9170 * by new_ipc_kmsg_copyin.
9171 */
9172
9173 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9174 kr = vm_map_copyout_kernel_buffer(
9175 dst_map, &dst_addr,
9176 copy, copy->size, TRUE, discard_on_success);
9177 return kr;
9178 }
9179
9180 /*
9181 * Only works for entry lists at the moment. Will
9182 * support page lists later.
9183 */
9184
9185 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9186
9187 if (copy->size == 0) {
9188 if (discard_on_success) {
9189 vm_map_copy_discard(copy);
9190 }
9191 return KERN_SUCCESS;
9192 }
9193
9194 copy_page_shift = copy->cpy_hdr.page_shift;
9195
9196 /*
9197 * Verify that the destination is all writeable
9198 * initially. We have to trunc the destination
9199 * address and round the copy size or we'll end up
9200 * splitting entries in strange ways.
9201 */
9202
9203 if (!VM_MAP_PAGE_ALIGNED(copy->size,
9204 VM_MAP_PAGE_MASK(dst_map)) ||
9205 !VM_MAP_PAGE_ALIGNED(copy->offset,
9206 VM_MAP_PAGE_MASK(dst_map)) ||
9207 !VM_MAP_PAGE_ALIGNED(dst_addr,
9208 VM_MAP_PAGE_MASK(dst_map)) ||
9209 copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9210 aligned = FALSE;
9211 dst_end = vm_map_round_page(dst_addr + copy->size,
9212 VM_MAP_PAGE_MASK(dst_map));
9213 } else {
9214 dst_end = dst_addr + copy->size;
9215 }
9216
9217 vm_map_lock(dst_map);
9218
9219 /* LP64todo - remove this check when vm_map_commpage64()
9220 * no longer has to stuff in a map_entry for the commpage
9221 * above the map's max_offset.
9222 */
9223 if (dst_addr >= dst_map->max_offset) {
9224 vm_map_unlock(dst_map);
9225 return KERN_INVALID_ADDRESS;
9226 }
9227
9228 start_pass_1:
9229 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9230 vm_map_unlock(dst_map);
9231 return KERN_INVALID_ADDRESS;
9232 }
9233 vm_map_clip_start(dst_map,
9234 tmp_entry,
9235 vm_map_trunc_page(dst_addr,
9236 VM_MAP_PAGE_MASK(dst_map)));
9237 for (entry = tmp_entry;;) {
9238 vm_map_entry_t next = entry->vme_next;
9239
9240 while (entry->is_sub_map) {
9241 vm_map_offset_t sub_start;
9242 vm_map_offset_t sub_end;
9243 vm_map_offset_t local_end;
9244
9245 if (entry->in_transition) {
9246 /*
9247 * Say that we are waiting, and wait for entry.
9248 */
9249 entry->needs_wakeup = TRUE;
9250 vm_map_entry_wait(dst_map, THREAD_UNINT);
9251
9252 goto start_pass_1;
9253 }
9254
9255 local_end = entry->vme_end;
9256 if (!(entry->needs_copy)) {
9257 /* if needs_copy we are a COW submap */
9258 /* in such a case we just replace so */
9259 /* there is no need for the follow- */
9260 /* ing check. */
9261 encountered_sub_map = TRUE;
9262 sub_start = VME_OFFSET(entry);
9263
9264 if (entry->vme_end < dst_end) {
9265 sub_end = entry->vme_end;
9266 } else {
9267 sub_end = dst_end;
9268 }
9269 sub_end -= entry->vme_start;
9270 sub_end += VME_OFFSET(entry);
9271 vm_map_unlock(dst_map);
9272
9273 kr = vm_map_overwrite_submap_recurse(
9274 VME_SUBMAP(entry),
9275 sub_start,
9276 sub_end - sub_start);
9277 if (kr != KERN_SUCCESS) {
9278 return kr;
9279 }
9280 vm_map_lock(dst_map);
9281 }
9282
9283 if (dst_end <= entry->vme_end) {
9284 goto start_overwrite;
9285 }
9286 if (!vm_map_lookup_entry(dst_map, local_end,
9287 &entry)) {
9288 vm_map_unlock(dst_map);
9289 return KERN_INVALID_ADDRESS;
9290 }
9291 next = entry->vme_next;
9292 }
9293
9294 if (!(entry->protection & VM_PROT_WRITE)) {
9295 vm_map_unlock(dst_map);
9296 return KERN_PROTECTION_FAILURE;
9297 }
9298
9299 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9300 vm_map_unlock(dst_map);
9301 return KERN_PROTECTION_FAILURE;
9302 }
9303
9304 /*
9305 * If the entry is in transition, we must wait
9306 * for it to exit that state. Anything could happen
9307 * when we unlock the map, so start over.
9308 */
9309 if (entry->in_transition) {
9310 /*
9311 * Say that we are waiting, and wait for entry.
9312 */
9313 entry->needs_wakeup = TRUE;
9314 vm_map_entry_wait(dst_map, THREAD_UNINT);
9315
9316 goto start_pass_1;
9317 }
9318
9319 /*
9320 * our range is contained completely within this map entry
9321 */
9322 if (dst_end <= entry->vme_end) {
9323 break;
9324 }
9325 /*
9326 * check that range specified is contiguous region
9327 */
9328 if ((next == vm_map_to_entry(dst_map)) ||
9329 (next->vme_start != entry->vme_end)) {
9330 vm_map_unlock(dst_map);
9331 return KERN_INVALID_ADDRESS;
9332 }
9333
9334
9335 /*
9336 * Check for permanent objects in the destination.
9337 */
9338 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9339 ((!VME_OBJECT(entry)->internal) ||
9340 (VME_OBJECT(entry)->true_share))) {
9341 contains_permanent_objects = TRUE;
9342 }
9343
9344 entry = next;
9345 }/* for */
9346
9347 start_overwrite:
9348 /*
9349 * If there are permanent objects in the destination, then
9350 * the copy cannot be interrupted.
9351 */
9352
9353 if (interruptible && contains_permanent_objects) {
9354 vm_map_unlock(dst_map);
9355 return KERN_FAILURE; /* XXX */
9356 }
9357
9358 /*
9359 *
9360 * Make a second pass, overwriting the data
9361 * At the beginning of each loop iteration,
9362 * the next entry to be overwritten is "tmp_entry"
9363 * (initially, the value returned from the lookup above),
9364 * and the starting address expected in that entry
9365 * is "start".
9366 */
9367
9368 total_size = copy->size;
9369 if (encountered_sub_map) {
9370 copy_size = 0;
9371 /* re-calculate tmp_entry since we've had the map */
9372 /* unlocked */
9373 if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9374 vm_map_unlock(dst_map);
9375 return KERN_INVALID_ADDRESS;
9376 }
9377 } else {
9378 copy_size = copy->size;
9379 }
9380
9381 base_addr = dst_addr;
9382 while (TRUE) {
9383 /* deconstruct the copy object and do in parts */
		/* only in sub_map, interruptible case */
9385 vm_map_entry_t copy_entry;
9386 vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL;
9387 vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL;
9388 int nentries;
9389 int remaining_entries = 0;
9390 vm_map_offset_t new_offset = 0;
9391
9392 for (entry = tmp_entry; copy_size == 0;) {
9393 vm_map_entry_t next;
9394
9395 next = entry->vme_next;
9396
9397 /* tmp_entry and base address are moved along */
9398 /* each time we encounter a sub-map. Otherwise */
			/* entry can outpace tmp_entry, and the copy_size */
9400 /* may reflect the distance between them */
9401 /* if the current entry is found to be in transition */
9402 /* we will start over at the beginning or the last */
9403 /* encounter of a submap as dictated by base_addr */
9404 /* we will zero copy_size accordingly. */
9405 if (entry->in_transition) {
9406 /*
9407 * Say that we are waiting, and wait for entry.
9408 */
9409 entry->needs_wakeup = TRUE;
9410 vm_map_entry_wait(dst_map, THREAD_UNINT);
9411
9412 if (!vm_map_lookup_entry(dst_map, base_addr,
9413 &tmp_entry)) {
9414 vm_map_unlock(dst_map);
9415 return KERN_INVALID_ADDRESS;
9416 }
9417 copy_size = 0;
9418 entry = tmp_entry;
9419 continue;
9420 }
9421 if (entry->is_sub_map) {
9422 vm_map_offset_t sub_start;
9423 vm_map_offset_t sub_end;
9424 vm_map_offset_t local_end;
9425
9426 if (entry->needs_copy) {
9427 /* if this is a COW submap */
9428 /* just back the range with a */
9429 /* anonymous entry */
9430 assert(!entry->vme_permanent);
9431 if (entry->vme_end < dst_end) {
9432 sub_end = entry->vme_end;
9433 } else {
9434 sub_end = dst_end;
9435 }
9436 if (entry->vme_start < base_addr) {
9437 sub_start = base_addr;
9438 } else {
9439 sub_start = entry->vme_start;
9440 }
9441 vm_map_clip_end(
9442 dst_map, entry, sub_end);
9443 vm_map_clip_start(
9444 dst_map, entry, sub_start);
9445 assert(!entry->use_pmap);
9446 assert(!entry->iokit_acct);
9447 entry->use_pmap = TRUE;
9448 vm_map_deallocate(VME_SUBMAP(entry));
9449 assert(!entry->vme_permanent);
9450 VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9451 VME_OFFSET_SET(entry, 0);
9452 entry->is_shared = FALSE;
9453 entry->needs_copy = FALSE;
9454 entry->protection = VM_PROT_DEFAULT;
9455 entry->max_protection = VM_PROT_ALL;
9456 entry->wired_count = 0;
9457 entry->user_wired_count = 0;
9458 if (entry->inheritance
9459 == VM_INHERIT_SHARE) {
9460 entry->inheritance = VM_INHERIT_COPY;
9461 }
9462 continue;
9463 }
9464 /* first take care of any non-sub_map */
9465 /* entries to send */
9466 if (base_addr < entry->vme_start) {
9467 /* stuff to send */
9468 copy_size =
9469 entry->vme_start - base_addr;
9470 break;
9471 }
9472 sub_start = VME_OFFSET(entry);
9473
9474 if (entry->vme_end < dst_end) {
9475 sub_end = entry->vme_end;
9476 } else {
9477 sub_end = dst_end;
9478 }
9479 sub_end -= entry->vme_start;
9480 sub_end += VME_OFFSET(entry);
9481 local_end = entry->vme_end;
9482 vm_map_unlock(dst_map);
9483 copy_size = sub_end - sub_start;
9484
9485 /* adjust the copy object */
9486 if (total_size > copy_size) {
9487 vm_map_size_t local_size = 0;
9488 vm_map_size_t entry_size;
9489
9490 nentries = 1;
9491 new_offset = copy->offset;
9492 copy_entry = vm_map_copy_first_entry(copy);
9493 while (copy_entry !=
9494 vm_map_copy_to_entry(copy)) {
9495 entry_size = copy_entry->vme_end -
9496 copy_entry->vme_start;
9497 if ((local_size < copy_size) &&
9498 ((local_size + entry_size)
9499 >= copy_size)) {
9500 vm_map_copy_clip_end(copy,
9501 copy_entry,
9502 copy_entry->vme_start +
9503 (copy_size - local_size));
9504 entry_size = copy_entry->vme_end -
9505 copy_entry->vme_start;
9506 local_size += entry_size;
9507 new_offset += entry_size;
9508 }
9509 if (local_size >= copy_size) {
9510 next_copy = copy_entry->vme_next;
9511 copy_entry->vme_next =
9512 vm_map_copy_to_entry(copy);
9513 previous_prev =
9514 copy->cpy_hdr.links.prev;
9515 copy->cpy_hdr.links.prev = copy_entry;
9516 copy->size = copy_size;
9517 remaining_entries =
9518 copy->cpy_hdr.nentries;
9519 remaining_entries -= nentries;
9520 copy->cpy_hdr.nentries = nentries;
9521 break;
9522 } else {
9523 local_size += entry_size;
9524 new_offset += entry_size;
9525 nentries++;
9526 }
9527 copy_entry = copy_entry->vme_next;
9528 }
9529 }
9530
9531 if ((entry->use_pmap) && (pmap == NULL)) {
9532 kr = vm_map_copy_overwrite_nested(
9533 VME_SUBMAP(entry),
9534 sub_start,
9535 copy,
9536 interruptible,
9537 VME_SUBMAP(entry)->pmap,
9538 TRUE);
9539 } else if (pmap != NULL) {
9540 kr = vm_map_copy_overwrite_nested(
9541 VME_SUBMAP(entry),
9542 sub_start,
9543 copy,
9544 interruptible, pmap,
9545 TRUE);
9546 } else {
9547 kr = vm_map_copy_overwrite_nested(
9548 VME_SUBMAP(entry),
9549 sub_start,
9550 copy,
9551 interruptible,
9552 dst_map->pmap,
9553 TRUE);
9554 }
9555 if (kr != KERN_SUCCESS) {
9556 if (next_copy != NULL) {
9557 copy->cpy_hdr.nentries +=
9558 remaining_entries;
9559 copy->cpy_hdr.links.prev->vme_next =
9560 next_copy;
9561 copy->cpy_hdr.links.prev
9562 = previous_prev;
9563 copy->size = total_size;
9564 }
9565 return kr;
9566 }
9567 if (dst_end <= local_end) {
9568 return KERN_SUCCESS;
9569 }
9570 /* otherwise copy no longer exists, it was */
9571 /* destroyed after successful copy_overwrite */
9572 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9573 copy->offset = new_offset;
9574 copy->cpy_hdr.page_shift = copy_page_shift;
9575
9576 total_size -= copy_size;
9577 copy_size = 0;
9578 /* put back remainder of copy in container */
9579 if (next_copy != NULL) {
9580 copy->cpy_hdr.nentries = remaining_entries;
9581 copy->cpy_hdr.links.next = next_copy;
9582 copy->cpy_hdr.links.prev = previous_prev;
9583 copy->size = total_size;
9584 next_copy->vme_prev =
9585 vm_map_copy_to_entry(copy);
9586 next_copy = NULL;
9587 }
9588 base_addr = local_end;
9589 vm_map_lock(dst_map);
9590 if (!vm_map_lookup_entry(dst_map,
9591 local_end, &tmp_entry)) {
9592 vm_map_unlock(dst_map);
9593 return KERN_INVALID_ADDRESS;
9594 }
9595 entry = tmp_entry;
9596 continue;
9597 }
9598 if (dst_end <= entry->vme_end) {
9599 copy_size = dst_end - base_addr;
9600 break;
9601 }
9602
9603 if ((next == vm_map_to_entry(dst_map)) ||
9604 (next->vme_start != entry->vme_end)) {
9605 vm_map_unlock(dst_map);
9606 return KERN_INVALID_ADDRESS;
9607 }
9608
9609 entry = next;
9610 }/* for */
9611
9612 next_copy = NULL;
9613 nentries = 1;
9614
9615 /* adjust the copy object */
9616 if (total_size > copy_size) {
9617 vm_map_size_t local_size = 0;
9618 vm_map_size_t entry_size;
9619
9620 new_offset = copy->offset;
9621 copy_entry = vm_map_copy_first_entry(copy);
9622 while (copy_entry != vm_map_copy_to_entry(copy)) {
9623 entry_size = copy_entry->vme_end -
9624 copy_entry->vme_start;
9625 if ((local_size < copy_size) &&
9626 ((local_size + entry_size)
9627 >= copy_size)) {
9628 vm_map_copy_clip_end(copy, copy_entry,
9629 copy_entry->vme_start +
9630 (copy_size - local_size));
9631 entry_size = copy_entry->vme_end -
9632 copy_entry->vme_start;
9633 local_size += entry_size;
9634 new_offset += entry_size;
9635 }
9636 if (local_size >= copy_size) {
9637 next_copy = copy_entry->vme_next;
9638 copy_entry->vme_next =
9639 vm_map_copy_to_entry(copy);
9640 previous_prev =
9641 copy->cpy_hdr.links.prev;
9642 copy->cpy_hdr.links.prev = copy_entry;
9643 copy->size = copy_size;
9644 remaining_entries =
9645 copy->cpy_hdr.nentries;
9646 remaining_entries -= nentries;
9647 copy->cpy_hdr.nentries = nentries;
9648 break;
9649 } else {
9650 local_size += entry_size;
9651 new_offset += entry_size;
9652 nentries++;
9653 }
9654 copy_entry = copy_entry->vme_next;
9655 }
9656 }
9657
9658 if (aligned) {
9659 pmap_t local_pmap;
9660
9661 if (pmap) {
9662 local_pmap = pmap;
9663 } else {
9664 local_pmap = dst_map->pmap;
9665 }
9666
9667 if ((kr = vm_map_copy_overwrite_aligned(
9668 dst_map, tmp_entry, copy,
9669 base_addr, local_pmap)) != KERN_SUCCESS) {
9670 if (next_copy != NULL) {
9671 copy->cpy_hdr.nentries +=
9672 remaining_entries;
9673 copy->cpy_hdr.links.prev->vme_next =
9674 next_copy;
9675 copy->cpy_hdr.links.prev =
9676 previous_prev;
9677 copy->size += copy_size;
9678 }
9679 return kr;
9680 }
9681 vm_map_unlock(dst_map);
9682 } else {
9683 /*
9684 * Performance gain:
9685 *
9686 * if the copy and dst address are misaligned but the same
9687 * offset within the page we can copy_not_aligned the
9688 * misaligned parts and copy aligned the rest. If they are
9689 * aligned but len is unaligned we simply need to copy
9690 * the end bit unaligned. We'll need to split the misaligned
9691 * bits of the region in this case !
9692 */
9693 /* ALWAYS UNLOCKS THE dst_map MAP */
9694 kr = vm_map_copy_overwrite_unaligned(
9695 dst_map,
9696 tmp_entry,
9697 copy,
9698 base_addr,
9699 discard_on_success);
9700 if (kr != KERN_SUCCESS) {
9701 if (next_copy != NULL) {
9702 copy->cpy_hdr.nentries +=
9703 remaining_entries;
9704 copy->cpy_hdr.links.prev->vme_next =
9705 next_copy;
9706 copy->cpy_hdr.links.prev =
9707 previous_prev;
9708 copy->size += copy_size;
9709 }
9710 return kr;
9711 }
9712 }
9713 total_size -= copy_size;
9714 if (total_size == 0) {
9715 break;
9716 }
9717 base_addr += copy_size;
9718 copy_size = 0;
9719 copy->offset = new_offset;
9720 if (next_copy != NULL) {
9721 copy->cpy_hdr.nentries = remaining_entries;
9722 copy->cpy_hdr.links.next = next_copy;
9723 copy->cpy_hdr.links.prev = previous_prev;
9724 next_copy->vme_prev = vm_map_copy_to_entry(copy);
9725 copy->size = total_size;
9726 }
9727 vm_map_lock(dst_map);
9728 while (TRUE) {
9729 if (!vm_map_lookup_entry(dst_map,
9730 base_addr, &tmp_entry)) {
9731 vm_map_unlock(dst_map);
9732 return KERN_INVALID_ADDRESS;
9733 }
9734 if (tmp_entry->in_transition) {
9735 entry->needs_wakeup = TRUE;
9736 vm_map_entry_wait(dst_map, THREAD_UNINT);
9737 } else {
9738 break;
9739 }
9740 }
9741 vm_map_clip_start(dst_map,
9742 tmp_entry,
9743 vm_map_trunc_page(base_addr,
9744 VM_MAP_PAGE_MASK(dst_map)));
9745
9746 entry = tmp_entry;
9747 } /* while */
9748
9749 /*
9750 * Throw away the vm_map_copy object
9751 */
9752 if (discard_on_success) {
9753 vm_map_copy_discard(copy);
9754 }
9755
9756 return KERN_SUCCESS;
9757 }/* vm_map_copy_overwrite */
9758
9759 static inline kern_return_t
vm_map_copy_addr_size_sanitize(vm_map_t map,vm_map_offset_ut addr_u,vm_map_size_ut size_u,vm_sanitize_caller_t vm_sanitize_caller,vm_map_offset_t * addr,vm_map_offset_t * end,vm_map_size_t * size)9760 vm_map_copy_addr_size_sanitize(
9761 vm_map_t map,
9762 vm_map_offset_ut addr_u,
9763 vm_map_size_ut size_u,
9764 vm_sanitize_caller_t vm_sanitize_caller,
9765 vm_map_offset_t *addr,
9766 vm_map_offset_t *end,
9767 vm_map_size_t *size)
9768 {
9769 return vm_sanitize_addr_size(addr_u, size_u,
9770 vm_sanitize_caller, map,
9771 VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
9772 addr, end, size);
9773 }
9774
/*
 *	Routine:	vm_map_copy_overwrite
 *
 *	Description:
 *		Overwrite the destination range of "dst_map" with the
 *		contents of the "copy" map, without deallocating the
 *		destination mappings.
 *
 *		When the copy is large enough and the source/destination
 *		share the same page mis-alignment, the copy map is split
 *		into an unaligned "head", an aligned middle and an
 *		unaligned "tail", so the bulk of the data can take the
 *		optimized aligned path inside
 *		vm_map_copy_overwrite_nested().  Otherwise the whole
 *		range is handed over in one "blunt" nested call.
 *
 *	Returns:
 *		KERN_SUCCESS on success ("copy" is consumed/discarded).
 *		On failure the original "copy" map is re-assembled from
 *		the head/middle/tail pieces, so the caller still owns a
 *		usable copy object.
 */
kern_return_t
vm_map_copy_overwrite(
	vm_map_t dst_map,
	vm_map_offset_ut dst_addr_u,
	vm_map_copy_t copy,
	vm_map_size_ut copy_size_u,
	boolean_t interruptible)
{
	vm_map_offset_t dst_addr, dst_end;
	vm_map_size_t copy_size;
	vm_map_size_t head_size, tail_size;
	vm_map_copy_t head_copy, tail_copy;
	vm_map_offset_t head_addr, tail_addr;
	vm_map_entry_t entry;
	kern_return_t kr;
	vm_map_offset_t effective_page_mask, effective_page_size;
	uint16_t copy_page_shift;

	head_size = 0;
	tail_size = 0;
	head_copy = NULL;
	tail_copy = NULL;
	head_addr = 0;
	tail_addr = 0;

	/*
	 * Check for null copy object.
	 */
	if (copy == VM_MAP_COPY_NULL) {
		return KERN_SUCCESS;
	}

	/*
	 * Sanitize any input parameters that are addr/size/prot/inherit
	 */
	kr = vm_map_copy_addr_size_sanitize(
		dst_map,
		dst_addr_u,
		copy_size_u,
		VM_SANITIZE_CALLER_VM_MAP_COPY_OVERWRITE,
		&dst_addr,
		&dst_end,
		&copy_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	if (interruptible ||
	    copy->type != VM_MAP_COPY_ENTRY_LIST) {
		/*
		 * We can't split the "copy" map if we're interruptible
		 * or if we don't have a "copy" map...
		 */
blunt_copy:
		/* single nested call covering the whole range */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    dst_addr,
		    copy,
		    interruptible,
		    (pmap_t) NULL,
		    TRUE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), kr /* arg */);
		}
		return kr;
	}

	/* sub-native page sizes don't take the optimized path */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
	if (copy_page_shift < PAGE_SHIFT ||
	    VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		goto blunt_copy;
	}

	/*
	 * NOTE(review): the "true" arm below appears unreachable -- the
	 * check just above already branched to blunt_copy whenever
	 * VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT.  Confirm before
	 * relying on (or removing) it.
	 */
	if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
		effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
	} else {
		/* use the coarsest page granule of dst map, copy and kernel */
		effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
		effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
		    effective_page_mask);
	}
	effective_page_size = effective_page_mask + 1;

	if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
		/*
		 * Too small to bother with optimizing...
		 */
		goto blunt_copy;
	}

	if ((dst_addr & effective_page_mask) !=
	    (copy->offset & effective_page_mask)) {
		/*
		 * Incompatible mis-alignment of source and destination...
		 */
		goto blunt_copy;
	}

	/*
	 * Proper alignment or identical mis-alignment at the beginning.
	 * Let's try and do a small unaligned copy first (if needed)
	 * and then an aligned copy for the rest.
	 */
	if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
		/* head: from dst_addr up to the next page boundary */
		head_addr = dst_addr;
		head_size = (effective_page_size -
		    (copy->offset & effective_page_mask));
		head_size = MIN(head_size, copy_size);
	}
	if (!vm_map_page_aligned(copy->offset + copy_size,
	    effective_page_mask)) {
		/*
		 * Mis-alignment at the end.
		 * Do an aligned copy up to the last page and
		 * then an unaligned copy for the remaining bytes.
		 */
		tail_size = ((copy->offset + copy_size) &
		    effective_page_mask);
		tail_size = MIN(tail_size, copy_size);
		tail_addr = dst_addr + copy_size - tail_size;
		assert(tail_addr >= head_addr + head_size);
	}
	assert(head_size + tail_size <= copy_size);

	if (head_size + tail_size == copy_size) {
		/*
		 * It's all unaligned, no optimization possible...
		 */
		goto blunt_copy;
	}

	/*
	 * Can't optimize if there are any submaps in the
	 * destination due to the way we free the "copy" map
	 * progressively in vm_map_copy_overwrite_nested()
	 * in that case.
	 */
	vm_map_lock_read(dst_map);
	if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
		vm_map_unlock_read(dst_map);
		goto blunt_copy;
	}
	for (;
	    (entry != vm_map_to_entry(dst_map) &&
	    entry->vme_start < dst_addr + copy_size);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			vm_map_unlock_read(dst_map);
			goto blunt_copy;
		}
	}
	vm_map_unlock_read(dst_map);

	if (head_size) {
		/*
		 * Unaligned copy of the first "head_size" bytes, to reach
		 * a page boundary.
		 */

		/*
		 * Extract "head_copy" out of "copy".
		 */
		head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		head_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		head_copy->cpy_hdr.page_shift = copy_page_shift;

		/* the head never spans more than the first copy entry */
		entry = vm_map_copy_first_entry(copy);
		if (entry->vme_end < copy->offset + head_size) {
			head_size = entry->vme_end - copy->offset;
		}

		head_copy->offset = copy->offset;
		head_copy->size = head_size;
		copy->offset += head_size;
		copy->size -= head_size;
		copy_size -= head_size;
		assert(copy_size > 0);

		/* move the clipped first entry over to head_copy */
		vm_map_copy_clip_end(copy, entry, copy->offset);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(head_copy,
		    vm_map_copy_to_entry(head_copy),
		    entry);

		/*
		 * Do the unaligned copy.
		 */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    head_addr,
		    head_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr != KERN_SUCCESS) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), kr /* arg */);
			goto done;
		}
	}

	if (tail_size) {
		/*
		 * Extract "tail_copy" out of "copy".
		 */
		tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
		tail_copy->cpy_hdr.entries_pageable =
		    copy->cpy_hdr.entries_pageable;
		tail_copy->cpy_hdr.page_shift = copy_page_shift;

		tail_copy->offset = copy->offset + copy_size - tail_size;
		tail_copy->size = tail_size;

		copy->size -= tail_size;
		copy_size -= tail_size;
		assert(copy_size > 0);

		/* move the clipped last entry over to tail_copy */
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_clip_start(copy, entry, tail_copy->offset);
		entry = vm_map_copy_last_entry(copy);
		vm_map_copy_entry_unlink(copy, entry);
		vm_map_copy_entry_link(tail_copy,
		    vm_map_copy_last_entry(tail_copy),
		    entry);
	}

	/*
	 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
	 * we want to avoid TOCTOU issues w.r.t copy->size but
	 * we don't need to change vm_map_copy_overwrite_nested()
	 * and all other vm_map_copy_overwrite variants.
	 *
	 * So we assign the original copy_size that was passed into
	 * this routine back to copy.
	 *
	 * This use of local 'copy_size' passed into this routine is
	 * to try and protect against TOCTOU attacks where the kernel
	 * has been exploited. We don't expect this to be an issue
	 * during normal system operation.
	 */
	assertf(copy->size == copy_size,
	    "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
	copy->size = copy_size;

	/*
	 * Copy most (or possibly all) of the data.
	 */
	kr = vm_map_copy_overwrite_nested(dst_map,
	    dst_addr + head_size,
	    copy,
	    interruptible,
	    (pmap_t) NULL,
	    FALSE);
	if (kr != KERN_SUCCESS) {
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), kr /* arg */);
		goto done;
	}

	if (tail_size) {
		/* finish with the unaligned tail */
		kr = vm_map_copy_overwrite_nested(dst_map,
		    tail_addr,
		    tail_copy,
		    interruptible,
		    (pmap_t) NULL,
		    FALSE);
		if (kr) {
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), kr /* arg */);
		}
	}

done:
	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
	if (kr == KERN_SUCCESS) {
		/*
		 * Discard all the copy maps.
		 */
		if (head_copy) {
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		vm_map_copy_discard(copy);
		if (tail_copy) {
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	} else {
		/*
		 * Re-assemble the original copy map: splice the head
		 * entry back at the front and the tail entry back at
		 * the end, restoring copy->offset and copy->size, so
		 * the caller gets back what it passed in.
		 */
		if (head_copy) {
			entry = vm_map_copy_first_entry(head_copy);
			vm_map_copy_entry_unlink(head_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_to_entry(copy),
			    entry);
			copy->offset -= head_size;
			copy->size += head_size;
			vm_map_copy_discard(head_copy);
			head_copy = NULL;
		}
		if (tail_copy) {
			entry = vm_map_copy_last_entry(tail_copy);
			vm_map_copy_entry_unlink(tail_copy, entry);
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    entry);
			copy->size += tail_size;
			vm_map_copy_discard(tail_copy);
			tail_copy = NULL;
		}
	}
	return kr;
}
10091
10092
10093 /*
10094 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
10095 *
 * Description:
 *	Physically copy unaligned data
 *
 * Implementation:
 *	Unaligned parts of pages have to be physically copied.  We use
 *	a modified form of vm_fault_copy (which understands non-aligned
 *	page offsets and sizes) to do the copy.  We attempt to copy as
 *	much memory in one go as possible; however, vm_fault_copy copies
 *	within 1 memory object, so we have to find the smallest of "amount left",
 *	"source object data size" and "target object data size".  With
 *	unaligned data we don't need to split regions, therefore the source
 *	(copy) object should be one map entry; the target range may be split
 *	over multiple map entries however.  In any event we are pessimistic
 *	about these assumptions.
10110 *
10111 * Callers of this function must call vm_map_copy_require on
10112 * previously created vm_map_copy_t or pass a newly created
10113 * one to ensure that it hasn't been forged.
10114 *
10115 * Assumptions:
10116 * dst_map is locked on entry and is return locked on success,
10117 * unlocked on error.
10118 */
10119
/*
 * Physically copy the (page-)unaligned contents of "copy" over the
 * destination range starting at "start" in "dst_map".
 *
 * dst_map is write-locked on entry; the lock is downgraded to a read
 * lock for the duration and is held (read) on successful return,
 * released on error.  When "discard_on_success" is set, fully-consumed
 * copy entries are unlinked and disposed of as we go.
 */
static kern_return_t
vm_map_copy_overwrite_unaligned(
	vm_map_t dst_map,
	vm_map_entry_t entry,
	vm_map_copy_t copy,
	vm_map_offset_t start,
	boolean_t discard_on_success)
{
	vm_map_entry_t copy_entry;
	vm_map_entry_t copy_entry_next;
	vm_map_version_t version;   /* detects map changes while unlocked */
	vm_object_t dst_object;
	vm_object_offset_t dst_offset;
	vm_object_offset_t src_offset;
	vm_object_offset_t entry_offset;
	vm_map_offset_t entry_end;
	vm_map_size_t src_size,
	    dst_size,
	    copy_size,
	    amount_left;
	kern_return_t kr = KERN_SUCCESS;


	copy_entry = vm_map_copy_first_entry(copy);

	/* faulting only needs the read lock; keep it for the whole loop */
	vm_map_lock_write_to_read(dst_map);

	/* offset of copy->offset within its page (sub-page residue) */
	src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
	amount_left = copy->size;
	/*
	 * unaligned so we never clipped this entry, we need the offset into
	 * the vm_object not just the data.
	 */
	while (amount_left > 0) {
		/* ran off the end of the destination map: hole */
		if (entry == vm_map_to_entry(dst_map)) {
			vm_map_unlock_read(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/* "start" must be within the current map entry */
		assert((start >= entry->vme_start) && (start < entry->vme_end));

		/*
		 * Check protection again
		 */
		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}
		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock_read(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * If the entry is in transition, we must wait
		 * for it to exit that state.  Anything could happen
		 * when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto RetryLookup;
		}

		dst_offset = start - entry->vme_start;

		dst_size = entry->vme_end - start;

		src_size = copy_entry->vme_end -
		    (copy_entry->vme_start + src_offset);

		/* copy the smaller of what's left in src entry vs dst entry */
		if (dst_size < src_size) {
			/*
			 * we can only copy dst_size bytes before
			 * we have to get the next destination entry
			 */
			copy_size = dst_size;
		} else {
			/*
			 * we can only copy src_size bytes before
			 * we have to get the next source copy entry
			 */
			copy_size = src_size;
		}

		if (copy_size > amount_left) {
			copy_size = amount_left;
		}
		/*
		 * Entry needs copy, create a shadow shadow object for
		 * Copy on write region.
		 */
		if (entry->needs_copy) {
			/* upgrading the lock can fail; if so, re-lookup */
			if (vm_map_lock_read_to_write(dst_map)) {
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			VME_OBJECT_SHADOW(entry,
			    (vm_map_size_t)(entry->vme_end
			    - entry->vme_start),
			    vm_map_always_shadow(dst_map));
			entry->needs_copy = FALSE;
			vm_map_lock_write_to_read(dst_map);
		}
		dst_object = VME_OBJECT(entry);
		/*
		 * unlike with the virtual (aligned) copy we're going
		 * to fault on it therefore we need a target object.
		 */
		if (dst_object == VM_OBJECT_NULL) {
			if (vm_map_lock_read_to_write(dst_map)) {
				vm_map_lock_read(dst_map);
				goto RetryLookup;
			}
			dst_object = vm_object_allocate((vm_map_size_t)
			    entry->vme_end - entry->vme_start);
			VME_OBJECT_SET(entry, dst_object, false, 0);
			VME_OFFSET_SET(entry, 0);
			assert(entry->use_pmap);
			vm_map_lock_write_to_read(dst_map);
		}
		/*
		 * Take an object reference and unlock map.  The "entry" may
		 * disappear or change when the map is unlocked.
		 */
		vm_object_reference(dst_object);
		version.main_timestamp = dst_map->timestamp;
		/* snapshot what we need from "entry" before unlocking */
		entry_offset = VME_OFFSET(entry);
		entry_end = entry->vme_end;
		vm_map_unlock_read(dst_map);
		/*
		 * Copy as much as possible in one pass
		 * (vm_fault_copy updates copy_size to the amount
		 * actually copied -- presumably; confirm against its
		 * contract before relying on partial-copy behavior).
		 */
		kr = vm_fault_copy(
			VME_OBJECT(copy_entry),
			VME_OFFSET(copy_entry) + src_offset,
			&copy_size,
			dst_object,
			entry_offset + dst_offset,
			dst_map,
			&version,
			THREAD_UNINT );

		start += copy_size;
		src_offset += copy_size;
		amount_left -= copy_size;
		/*
		 * Release the object reference
		 */
		vm_object_deallocate(dst_object);
		/*
		 * If a hard error occurred, return it now
		 */
		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
		    || amount_left == 0) {
			/*
			 * all done with this copy entry, dispose.
			 */
			copy_entry_next = copy_entry->vme_next;

			if (discard_on_success) {
				vm_map_copy_entry_unlink(copy, copy_entry);
				assert(!copy_entry->is_sub_map);
				vm_object_deallocate(VME_OBJECT(copy_entry));
				vm_map_copy_entry_dispose(copy_entry);
			}

			if (copy_entry_next == vm_map_copy_to_entry(copy) &&
			    amount_left) {
				/*
				 * not finished copying but run out of source
				 */
				return KERN_INVALID_ADDRESS;
			}

			copy_entry = copy_entry_next;

			/* fresh copy entry: restart at its beginning */
			src_offset = 0;
		}

		if (amount_left == 0) {
			return KERN_SUCCESS;
		}

		vm_map_lock_read(dst_map);
		if (version.main_timestamp == dst_map->timestamp) {
			/* map unchanged while unlocked: "entry" still valid */
			if (start == entry_end) {
				/*
				 * destination region is split.  Use the version
				 * information to avoid a lookup in the normal
				 * case.
				 */
				entry = entry->vme_next;
				/*
				 * should be contiguous. Fail if we encounter
				 * a hole in the destination.
				 */
				if (start != entry->vme_start) {
					vm_map_unlock_read(dst_map);
					return KERN_INVALID_ADDRESS;
				}
			}
		} else {
			/*
			 * Map version check failed.
			 * we must lookup the entry because somebody
			 * might have changed the map behind our backs.
			 */
RetryLookup:
			if (!vm_map_lookup_entry(dst_map, start, &entry)) {
				vm_map_unlock_read(dst_map);
				return KERN_INVALID_ADDRESS;
			}
		}
	}/* while */

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite_unaligned */
10347
10348 /*
10349 * Routine: vm_map_copy_overwrite_aligned [internal use only]
10350 *
10351 * Description:
10352 * Does all the vm_trickery possible for whole pages.
10353 *
10354 * Implementation:
10355 *
10356 * If there are no permanent objects in the destination,
10357 * and the source and destination map entry zones match,
10358 * and the destination map entry is not shared,
10359 * then the map entries can be deleted and replaced
10360 * with those from the copy. The following code is the
10361 * basic idea of what to do, but there are lots of annoying
10362 * little details about getting protection and inheritance
10363 * right. Should add protection, inheritance, and sharing checks
10364 * to the above pass and make sure that no wiring is involved.
10365 *
10366 * Callers of this function must call vm_map_copy_require on
10367 * previously created vm_map_copy_t or pass a newly created
10368 * one to ensure that it hasn't been forged.
10369 */
10370
/*
 * Diagnostic counters for vm_map_copy_overwrite_aligned(): each one
 * counts a reason the optimized entry-replacement path was abandoned
 * in favor of the slow (physical) copy path:
 *   - src_not_internal:  source backed by a non-anonymous (external) object
 *   - src_not_symmetric: source object is true_share or has a non-SYMMETRIC
 *                        copy strategy
 *   - src_large:         small copy out of a very large source object
 *                        (virtual-vs-physical copy tradeoff)
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
int vm_map_copy_overwrite_aligned_src_large = 0;
10374
10375 static kern_return_t
vm_map_copy_overwrite_aligned(vm_map_t dst_map,vm_map_entry_t tmp_entry,vm_map_copy_t copy,vm_map_offset_t start,__unused pmap_t pmap)10376 vm_map_copy_overwrite_aligned(
10377 vm_map_t dst_map,
10378 vm_map_entry_t tmp_entry,
10379 vm_map_copy_t copy,
10380 vm_map_offset_t start,
10381 __unused pmap_t pmap)
10382 {
10383 vm_object_t object;
10384 vm_map_entry_t copy_entry;
10385 vm_map_size_t copy_size;
10386 vm_map_size_t size;
10387 vm_map_entry_t entry;
10388
10389 while ((copy_entry = vm_map_copy_first_entry(copy))
10390 != vm_map_copy_to_entry(copy)) {
10391 copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10392
10393 entry = tmp_entry;
10394 if (entry->is_sub_map) {
10395 /* unnested when clipped earlier */
10396 assert(!entry->use_pmap);
10397 }
10398 if (entry == vm_map_to_entry(dst_map)) {
10399 vm_map_unlock(dst_map);
10400 return KERN_INVALID_ADDRESS;
10401 }
10402 size = (entry->vme_end - entry->vme_start);
10403 /*
10404 * Make sure that no holes popped up in the
10405 * address map, and that the protection is
10406 * still valid, in case the map was unlocked
10407 * earlier.
10408 */
10409
10410 if ((entry->vme_start != start) || ((entry->is_sub_map)
10411 && !entry->needs_copy)) {
10412 vm_map_unlock(dst_map);
10413 return KERN_INVALID_ADDRESS;
10414 }
10415 assert(entry != vm_map_to_entry(dst_map));
10416
10417 /*
10418 * Check protection again
10419 */
10420
10421 if (!(entry->protection & VM_PROT_WRITE)) {
10422 vm_map_unlock(dst_map);
10423 return KERN_PROTECTION_FAILURE;
10424 }
10425
10426 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10427 vm_map_unlock(dst_map);
10428 return KERN_PROTECTION_FAILURE;
10429 }
10430
10431 /*
10432 * If the entry is in transition, we must wait
10433 * for it to exit that state. Anything could happen
10434 * when we unlock the map, so start over.
10435 */
10436 if (entry->in_transition) {
10437 /*
10438 * Say that we are waiting, and wait for entry.
10439 */
10440 entry->needs_wakeup = TRUE;
10441 vm_map_entry_wait(dst_map, THREAD_UNINT);
10442
10443 goto RetryLookup;
10444 }
10445
10446 /*
10447 * Adjust to source size first
10448 */
10449
10450 if (copy_size < size) {
10451 if (entry->map_aligned &&
10452 !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10453 VM_MAP_PAGE_MASK(dst_map))) {
10454 /* no longer map-aligned */
10455 entry->map_aligned = FALSE;
10456 }
10457 vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10458 size = copy_size;
10459 }
10460
10461 /*
10462 * Adjust to destination size
10463 */
10464
10465 if (size < copy_size) {
10466 vm_map_copy_clip_end(copy, copy_entry,
10467 copy_entry->vme_start + size);
10468 copy_size = size;
10469 }
10470
10471 assert((entry->vme_end - entry->vme_start) == size);
10472 assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10473 assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10474
10475 /*
10476 * If the destination contains temporary unshared memory,
10477 * we can perform the copy by throwing it away and
10478 * installing the source data.
10479 *
10480 * Exceptions for mappings with special semantics:
10481 * + "permanent" entries,
10482 * + JIT regions,
10483 * + TPRO regions,
10484 * + pmap-specific protection policies,
10485 * + VM objects with COPY_NONE copy strategy.
10486 */
10487
10488 object = VME_OBJECT(entry);
10489 if ((!entry->is_shared &&
10490 !entry->vme_permanent &&
10491 !entry->used_for_jit &&
10492 #if __arm64e__
10493 !entry->used_for_tpro &&
10494 #endif /* __arm64e__ */
10495 !(entry->protection & VM_PROT_EXECUTE) &&
10496 !pmap_has_prot_policy(dst_map->pmap, entry->translated_allow_execute, entry->protection) &&
10497 ((object == VM_OBJECT_NULL) ||
10498 (object->internal &&
10499 !object->true_share &&
10500 object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10501 entry->needs_copy) {
10502 vm_object_t old_object = VME_OBJECT(entry);
10503 vm_object_offset_t old_offset = VME_OFFSET(entry);
10504 vm_object_offset_t offset;
10505
10506 /*
10507 * Ensure that the source and destination aren't
10508 * identical
10509 */
10510 if (old_object == VME_OBJECT(copy_entry) &&
10511 old_offset == VME_OFFSET(copy_entry)) {
10512 vm_map_copy_entry_unlink(copy, copy_entry);
10513 vm_map_copy_entry_dispose(copy_entry);
10514
10515 if (old_object != VM_OBJECT_NULL) {
10516 vm_object_deallocate(old_object);
10517 }
10518
10519 start = tmp_entry->vme_end;
10520 tmp_entry = tmp_entry->vme_next;
10521 continue;
10522 }
10523
10524 #if XNU_TARGET_OS_OSX
10525 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10526 #define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */
10527 if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10528 VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10529 copy_size <= __TRADEOFF1_COPY_SIZE) {
10530 /*
10531 * Virtual vs. Physical copy tradeoff #1.
10532 *
10533 * Copying only a few pages out of a large
10534 * object: do a physical copy instead of
10535 * a virtual copy, to avoid possibly keeping
10536 * the entire large object alive because of
10537 * those few copy-on-write pages.
10538 */
10539 vm_map_copy_overwrite_aligned_src_large++;
10540 goto slow_copy;
10541 }
10542 #endif /* XNU_TARGET_OS_OSX */
10543
10544 if ((dst_map->pmap != kernel_pmap) &&
10545 (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10546 (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10547 vm_object_t new_object, new_shadow;
10548
10549 /*
10550 * We're about to map something over a mapping
10551 * established by malloc()...
10552 */
10553 new_object = VME_OBJECT(copy_entry);
10554 if (new_object != VM_OBJECT_NULL) {
10555 vm_object_lock_shared(new_object);
10556 }
10557 while (new_object != VM_OBJECT_NULL &&
10558 #if XNU_TARGET_OS_OSX
10559 !new_object->true_share &&
10560 new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10561 #endif /* XNU_TARGET_OS_OSX */
10562 new_object->internal) {
10563 new_shadow = new_object->shadow;
10564 if (new_shadow == VM_OBJECT_NULL) {
10565 break;
10566 }
10567 vm_object_lock_shared(new_shadow);
10568 vm_object_unlock(new_object);
10569 new_object = new_shadow;
10570 }
10571 if (new_object != VM_OBJECT_NULL) {
10572 if (!new_object->internal) {
10573 /*
10574 * The new mapping is backed
10575 * by an external object. We
10576 * don't want malloc'ed memory
10577 * to be replaced with such a
10578 * non-anonymous mapping, so
10579 * let's go off the optimized
10580 * path...
10581 */
10582 vm_map_copy_overwrite_aligned_src_not_internal++;
10583 vm_object_unlock(new_object);
10584 goto slow_copy;
10585 }
10586 #if XNU_TARGET_OS_OSX
10587 if (new_object->true_share ||
10588 new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10589 /*
10590 * Same if there's a "true_share"
10591 * object in the shadow chain, or
10592 * an object with a non-default
10593 * (SYMMETRIC) copy strategy.
10594 */
10595 vm_map_copy_overwrite_aligned_src_not_symmetric++;
10596 vm_object_unlock(new_object);
10597 goto slow_copy;
10598 }
10599 #endif /* XNU_TARGET_OS_OSX */
10600 vm_object_unlock(new_object);
10601 }
10602 /*
10603 * The new mapping is still backed by
10604 * anonymous (internal) memory, so it's
10605 * OK to substitute it for the original
10606 * malloc() mapping.
10607 */
10608 }
10609
10610 if (old_object != VM_OBJECT_NULL) {
10611 assert(!entry->vme_permanent);
10612 if (entry->is_sub_map) {
10613 if (entry->use_pmap) {
10614 #ifndef NO_NESTED_PMAP
10615 pmap_unnest(dst_map->pmap,
10616 (addr64_t)entry->vme_start,
10617 entry->vme_end - entry->vme_start);
10618 #endif /* NO_NESTED_PMAP */
10619 if (dst_map->mapped_in_other_pmaps) {
10620 /* clean up parent */
10621 /* map/maps */
10622 vm_map_submap_pmap_clean(
10623 dst_map, entry->vme_start,
10624 entry->vme_end,
10625 VME_SUBMAP(entry),
10626 VME_OFFSET(entry));
10627 }
10628 } else {
10629 vm_map_submap_pmap_clean(
10630 dst_map, entry->vme_start,
10631 entry->vme_end,
10632 VME_SUBMAP(entry),
10633 VME_OFFSET(entry));
10634 }
10635 vm_map_deallocate(VME_SUBMAP(entry));
10636 } else {
10637 if (dst_map->mapped_in_other_pmaps) {
10638 vm_object_pmap_protect_options(
10639 VME_OBJECT(entry),
10640 VME_OFFSET(entry),
10641 entry->vme_end
10642 - entry->vme_start,
10643 PMAP_NULL,
10644 PAGE_SIZE,
10645 entry->vme_start,
10646 VM_PROT_NONE,
10647 PMAP_OPTIONS_REMOVE);
10648 } else {
10649 pmap_remove_options(
10650 dst_map->pmap,
10651 (addr64_t)(entry->vme_start),
10652 (addr64_t)(entry->vme_end),
10653 PMAP_OPTIONS_REMOVE);
10654 }
10655 vm_object_deallocate(old_object);
10656 }
10657 }
10658
10659 if (entry->iokit_acct) {
10660 /* keep using iokit accounting */
10661 entry->use_pmap = FALSE;
10662 } else {
10663 /* use pmap accounting */
10664 entry->use_pmap = TRUE;
10665 }
10666 assert(!entry->vme_permanent);
10667 VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10668 object = VME_OBJECT(entry);
10669 entry->needs_copy = copy_entry->needs_copy;
10670 entry->wired_count = 0;
10671 entry->user_wired_count = 0;
10672 offset = VME_OFFSET(copy_entry);
10673 VME_OFFSET_SET(entry, offset);
10674
10675 vm_map_copy_entry_unlink(copy, copy_entry);
10676 vm_map_copy_entry_dispose(copy_entry);
10677
10678 /*
10679 * we could try to push pages into the pmap at this point, BUT
10680 * this optimization only saved on average 2 us per page if ALL
10681 * the pages in the source were currently mapped
10682 * and ALL the pages in the dest were touched, if there were fewer
10683 * than 2/3 of the pages touched, this optimization actually cost more cycles
10684 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
10685 */
10686
10687 /*
10688 * Set up for the next iteration. The map
10689 * has not been unlocked, so the next
10690 * address should be at the end of this
10691 * entry, and the next map entry should be
10692 * the one following it.
10693 */
10694
10695 start = tmp_entry->vme_end;
10696 tmp_entry = tmp_entry->vme_next;
10697 } else {
10698 vm_map_version_t version;
10699 vm_object_t dst_object;
10700 vm_object_offset_t dst_offset;
10701 kern_return_t r;
10702
10703 slow_copy:
10704 if (entry->needs_copy) {
10705 VME_OBJECT_SHADOW(entry,
10706 (entry->vme_end -
10707 entry->vme_start),
10708 vm_map_always_shadow(dst_map));
10709 entry->needs_copy = FALSE;
10710 }
10711
10712 dst_object = VME_OBJECT(entry);
10713 dst_offset = VME_OFFSET(entry);
10714
10715 /*
10716 * Take an object reference, and record
10717 * the map version information so that the
10718 * map can be safely unlocked.
10719 */
10720
10721 if (dst_object == VM_OBJECT_NULL) {
10722 /*
10723 * We would usually have just taken the
10724 * optimized path above if the destination
10725 * object has not been allocated yet. But we
10726 * now disable that optimization if the copy
10727 * entry's object is not backed by anonymous
10728 * memory to avoid replacing malloc'ed
10729 * (i.e. re-usable) anonymous memory with a
10730 * not-so-anonymous mapping.
10731 * So we have to handle this case here and
10732 * allocate a new VM object for this map entry.
10733 */
10734 dst_object = vm_object_allocate(
10735 entry->vme_end - entry->vme_start);
10736 dst_offset = 0;
10737 VME_OBJECT_SET(entry, dst_object, false, 0);
10738 VME_OFFSET_SET(entry, dst_offset);
10739 assert(entry->use_pmap);
10740 }
10741
10742 vm_object_reference(dst_object);
10743
10744 /* account for unlock bumping up timestamp */
10745 version.main_timestamp = dst_map->timestamp + 1;
10746
10747 vm_map_unlock(dst_map);
10748
10749 /*
10750 * Copy as much as possible in one pass
10751 */
10752
10753 copy_size = size;
10754 r = vm_fault_copy(
10755 VME_OBJECT(copy_entry),
10756 VME_OFFSET(copy_entry),
10757 			    &copy_size,
10758 dst_object,
10759 dst_offset,
10760 dst_map,
10761 &version,
10762 THREAD_UNINT );
10763
10764 /*
10765 * Release the object reference
10766 */
10767
10768 vm_object_deallocate(dst_object);
10769
10770 /*
10771 * If a hard error occurred, return it now
10772 */
10773
10774 if (r != KERN_SUCCESS) {
10775 return r;
10776 }
10777
10778 if (copy_size != 0) {
10779 /*
10780 * Dispose of the copied region
10781 */
10782
10783 vm_map_copy_clip_end(copy, copy_entry,
10784 copy_entry->vme_start + copy_size);
10785 vm_map_copy_entry_unlink(copy, copy_entry);
10786 vm_object_deallocate(VME_OBJECT(copy_entry));
10787 vm_map_copy_entry_dispose(copy_entry);
10788 }
10789
10790 /*
10791 * Pick up in the destination map where we left off.
10792 *
10793 * Use the version information to avoid a lookup
10794 * in the normal case.
10795 */
10796
10797 start += copy_size;
10798 vm_map_lock(dst_map);
10799 if (version.main_timestamp == dst_map->timestamp &&
10800 copy_size != 0) {
10801 /* We can safely use saved tmp_entry value */
10802
10803 if (tmp_entry->map_aligned &&
10804 !VM_MAP_PAGE_ALIGNED(
10805 start,
10806 VM_MAP_PAGE_MASK(dst_map))) {
10807 /* no longer map-aligned */
10808 tmp_entry->map_aligned = FALSE;
10809 }
10810 vm_map_clip_end(dst_map, tmp_entry, start);
10811 tmp_entry = tmp_entry->vme_next;
10812 } else {
10813 /* Must do lookup of tmp_entry */
10814
10815 RetryLookup:
10816 if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
10817 vm_map_unlock(dst_map);
10818 return KERN_INVALID_ADDRESS;
10819 }
10820 if (tmp_entry->map_aligned &&
10821 !VM_MAP_PAGE_ALIGNED(
10822 start,
10823 VM_MAP_PAGE_MASK(dst_map))) {
10824 /* no longer map-aligned */
10825 tmp_entry->map_aligned = FALSE;
10826 }
10827 vm_map_clip_start(dst_map, tmp_entry, start);
10828 }
10829 }
10830 }/* while */
10831
10832 return KERN_SUCCESS;
10833 }/* vm_map_copy_overwrite_aligned */
10834
10835 /*
10836 * Routine: vm_map_copyin_kernel_buffer [internal use only]
10837 *
10838 * Description:
10839 * Copy in data to a kernel buffer from space in the
10840 * source map. The original space may be optionally
10841 * deallocated.
10842 *
10843 * If successful, returns a new copy object.
10844 */
10845 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)10846 vm_map_copyin_kernel_buffer(
10847 vm_map_t src_map,
10848 vm_map_offset_t src_addr,
10849 vm_map_size_t len,
10850 boolean_t src_destroy,
10851 vm_map_copy_t *copy_result)
10852 {
10853 kern_return_t kr;
10854 vm_map_copy_t copy;
10855 void *kdata;
10856
10857 if (len > msg_ool_size_small) {
10858 return KERN_INVALID_ARGUMENT;
10859 }
10860
10861 kdata = kalloc_data(len, Z_WAITOK);
10862 if (kdata == NULL) {
10863 return KERN_RESOURCE_SHORTAGE;
10864 }
10865 kr = copyinmap(src_map, src_addr, kdata, (vm_size_t)len);
10866 if (kr != KERN_SUCCESS) {
10867 kfree_data(kdata, len);
10868 return kr;
10869 }
10870
10871 copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
10872 copy->cpy_kdata = kdata;
10873 copy->size = len;
10874 copy->offset = 0;
10875
10876 if (src_destroy) {
10877 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
10878
10879 if (src_map == kernel_map) {
10880 flags |= VM_MAP_REMOVE_KUNWIRE;
10881 }
10882
10883 (void)vm_map_remove_guard(src_map,
10884 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
10885 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
10886 flags, KMEM_GUARD_NONE);
10887 }
10888
10889 *copy_result = copy;
10890 return KERN_SUCCESS;
10891 }
10892
10893 /*
10894 * Routine: vm_map_copyout_kernel_buffer [internal use only]
10895 *
10896 * Description:
10897 * Copy out data from a kernel buffer into space in the
10898  * destination map.  The space may be optionally dynamically
10899 * allocated.
10900 *
10901 * If successful, consumes the copy object.
10902 * Otherwise, the caller is responsible for it.
10903 *
10904 * Callers of this function must call vm_map_copy_require on
10905 * previously created vm_map_copy_t or pass a newly created
10906 * one to ensure that it hasn't been forged.
10907 */
/* Diagnostic counter: number of copyout() failures into a foreign map. */
static int vm_map_copyout_kernel_buffer_failures = 0;
static kern_return_t
vm_map_copyout_kernel_buffer(
	vm_map_t        map,
	vm_map_address_t *addr, /* IN/OUT */
	vm_map_copy_t   copy,
	vm_map_size_t   copy_size,
	boolean_t       overwrite,
	boolean_t       consume_on_success)
{
	kern_return_t kr = KERN_SUCCESS;
	thread_t thread = current_thread();

	/* caller must have validated copy_size against the copy object */
	assert(copy->size == copy_size);

	/*
	 * check for corrupted vm_map_copy structure
	 */
	if (copy_size > msg_ool_size_small || copy->offset) {
		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
		    (long long)copy->size, (long long)copy->offset);
	}

	if (!overwrite) {
		/*
		 * Allocate space in the target map for the data
		 */
		vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

		if (map == kernel_map) {
			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
		}

		*addr = 0;
		kr = vm_map_enter(map,
		    addr,
		    vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(map)),
		    (vm_map_offset_t) 0,
		    vmk_flags,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0,
		    FALSE,
		    VM_PROT_DEFAULT,
		    VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
#if KASAN
		if (map->pmap == kernel_pmap) {
			kasan_notify_address(*addr, copy->size);
		}
#endif
	}

	/*
	 * Copyout the data from the kernel buffer to the target map.
	 */
	if (thread->map == map) {
		/*
		 * If the target map is the current map, just do
		 * the copy.
		 */
		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_t oldmap;

		/*
		 * If the target map is another map, assume the
		 * target's address space identity for the duration
		 * of the copy.
		 */
		vm_map_reference(map);
		oldmap = vm_map_switch(map);

		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			vm_map_copyout_kernel_buffer_failures++;
			kr = KERN_INVALID_ADDRESS;
		}

		/* restore the original address space identity */
		(void) vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}

	if (kr != KERN_SUCCESS) {
		/* the copy failed, clean up */
		if (!overwrite) {
			/*
			 * Deallocate the space we allocated in the target map.
			 */
			(void) vm_map_remove(map,
			    vm_map_trunc_page(*addr,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page((*addr +
			    vm_map_round_page(copy_size,
			    VM_MAP_PAGE_MASK(map))),
			    VM_MAP_PAGE_MASK(map)));
			*addr = 0;
		}
	} else {
		/* copy was successful, discard the copy structure */
		if (consume_on_success) {
			kfree_data(copy->cpy_kdata, copy_size);
			zfree_id(ZONE_ID_VM_MAP_COPY, copy);
		}
	}

	return kr;
}
11022
11023 /*
11024 * Routine: vm_map_copy_insert [internal use only]
11025 *
11026 * Description:
11027 * Link a copy chain ("copy") into a map at the
11028 * specified location (after "where").
11029 *
11030 * Callers of this function must call vm_map_copy_require on
11031 * previously created vm_map_copy_t or pass a newly created
11032 * one to ensure that it hasn't been forged.
11033 * Side effects:
11034 * The copy chain is destroyed.
11035 */
11036 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)11037 vm_map_copy_insert(
11038 vm_map_t map,
11039 vm_map_entry_t after_where,
11040 vm_map_copy_t copy)
11041 {
11042 vm_map_entry_t entry;
11043
11044 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11045 entry = vm_map_copy_first_entry(copy);
11046 vm_map_copy_entry_unlink(copy, entry);
11047 vm_map_store_entry_link(map, after_where, entry,
11048 VM_MAP_KERNEL_FLAGS_NONE);
11049 after_where = entry;
11050 }
11051 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11052 }
11053
11054 /*
11055 * Callers of this function must call vm_map_copy_require on
11056 * previously created vm_map_copy_t or pass a newly created
11057 * one to ensure that it hasn't been forged.
11058 */
11059 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)11060 vm_map_copy_remap(
11061 vm_map_t map,
11062 vm_map_entry_t where,
11063 vm_map_copy_t copy,
11064 vm_map_offset_t adjustment,
11065 vm_prot_t cur_prot,
11066 vm_prot_t max_prot,
11067 vm_inherit_t inheritance)
11068 {
11069 vm_map_entry_t copy_entry, new_entry;
11070
11071 for (copy_entry = vm_map_copy_first_entry(copy);
11072 copy_entry != vm_map_copy_to_entry(copy);
11073 copy_entry = copy_entry->vme_next) {
11074 /* get a new VM map entry for the map */
11075 new_entry = vm_map_entry_create(map);
11076 /* copy the "copy entry" to the new entry */
11077 vm_map_entry_copy(map, new_entry, copy_entry);
11078 /* adjust "start" and "end" */
11079 new_entry->vme_start += adjustment;
11080 new_entry->vme_end += adjustment;
11081 /* clear some attributes */
11082 new_entry->inheritance = inheritance;
11083 new_entry->protection = cur_prot;
11084 new_entry->max_protection = max_prot;
11085 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11086 /* take an extra reference on the entry's "object" */
11087 if (new_entry->is_sub_map) {
11088 assert(!new_entry->use_pmap); /* not nested */
11089 vm_map_reference(VME_SUBMAP(new_entry));
11090 } else {
11091 vm_object_reference(VME_OBJECT(new_entry));
11092 }
11093 /* insert the new entry in the map */
11094 vm_map_store_entry_link(map, where, new_entry,
11095 VM_MAP_KERNEL_FLAGS_NONE);
11096 /* continue inserting the "copy entries" after the new entry */
11097 where = new_entry;
11098 }
11099 }
11100
11101
11102 /*
11103 * Returns true if *size matches (or is in the range of) copy->size.
11104 * Upon returning true, the *size field is updated with the actual size of the
11105 * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11106 */
11107 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)11108 vm_map_copy_validate_size(
11109 vm_map_t dst_map,
11110 vm_map_copy_t copy,
11111 vm_map_size_t *size)
11112 {
11113 if (copy == VM_MAP_COPY_NULL) {
11114 return FALSE;
11115 }
11116
11117 /*
11118 * Assert that the vm_map_copy is coming from the right
11119 * zone and hasn't been forged
11120 */
11121 vm_map_copy_require(copy);
11122
11123 vm_map_size_t copy_sz = copy->size;
11124 vm_map_size_t sz = *size;
11125 switch (copy->type) {
11126 case VM_MAP_COPY_KERNEL_BUFFER:
11127 if (sz == copy_sz) {
11128 return TRUE;
11129 }
11130 break;
11131 case VM_MAP_COPY_ENTRY_LIST:
11132 /*
11133 * potential page-size rounding prevents us from exactly
11134 * validating this flavor of vm_map_copy, but we can at least
11135 * assert that it's within a range.
11136 */
11137 if (copy_sz >= sz &&
11138 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11139 *size = copy_sz;
11140 return TRUE;
11141 }
11142 break;
11143 default:
11144 break;
11145 }
11146 return FALSE;
11147 }
11148
/*
 * Copy out a copy chain ("copy") into newly-allocated space in "dst_map".
 * If "consume_on_success" is set, the copy object is consumed (its entries
 * are moved into the destination map); otherwise its entries are cloned
 * and the copy object remains the caller's responsibility.
 */
static kern_return_t
vm_map_copyout_internal(
	vm_map_t                dst_map,
	vm_map_address_t        *dst_addr,      /* OUT */
	vm_map_copy_t           copy,
	vm_map_size_ut          copy_size_u,
	boolean_t               consume_on_success,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_size_t           size, copy_size;
	vm_map_size_t           adjustment;
	vm_map_offset_t         start;
	vm_object_offset_t      vm_copy_start;
	vm_map_entry_t          last;
	vm_map_entry_t          entry;
	vm_map_copy_t           original_copy;
	kern_return_t           kr;
	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

	/*
	 * Check for null copy object.
	 */

	if (copy == VM_MAP_COPY_NULL) {
		*dst_addr = 0;
		return KERN_SUCCESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/* reject callers whose claimed size disagrees with the copy object */
	if (!VM_SANITIZE_UNSAFE_IS_EQUAL(copy_size_u, copy->size)) {
		*dst_addr = 0;
		ktriage_record(thread_tid(current_thread()),
		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
		    KDBG_TRIAGE_RESERVED,
		    KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR),
		    KERN_FAILURE /* arg */);
		return KERN_FAILURE;
	}
	copy_size = copy->size;

	/*
	 * Check for special kernel buffer allocated
	 * by new_ipc_kmsg_copyin.
	 */

	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
		    copy, copy_size, FALSE,
		    consume_on_success);
		if (kr) {
			ktriage_record(thread_tid(current_thread()),
			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
			    KDBG_TRIAGE_RESERVED,
			    KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
		}
		return kr;
	}

	/*
	 * If the copy object's page size differs from the destination
	 * map's, re-cut the copy to the destination's page geometry;
	 * "original_copy" remembers the caller's object so it can be
	 * discarded or restored at the end.
	 */
	original_copy = copy;
	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
		vm_map_copy_t target_copy;
		vm_map_offset_t overmap_start, overmap_end, trimmed_start;

		target_copy = VM_MAP_COPY_NULL;
		DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy,
			0, /* offset */
			copy->size, /* size */
			dst_map,
			TRUE, /* copy */
			&target_copy,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
			return kr;
		}
		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
		if (target_copy != copy) {
			copy = target_copy;
		}
		copy_size = copy->size;
	}

	/*
	 * Find space for the data
	 */

	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
	    VM_MAP_COPY_PAGE_MASK(copy));
	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
	    VM_MAP_COPY_PAGE_MASK(copy))
	    - vm_copy_start;

	vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map, size);

	vm_map_lock(dst_map);
	kr = vm_map_locate_space_anywhere(dst_map, size, 0, vmk_flags,
	    &start, &last);
	if (kr != KERN_SUCCESS) {
		vm_map_unlock(dst_map);
		ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
		return kr;
	}

	/* delta between the copy's addresses and the chosen destination */
	adjustment = start - vm_copy_start;
	if (!consume_on_success) {
		/*
		 * We're not allowed to consume "copy", so we'll have to
		 * copy its map entries into the destination map below.
		 * No need to re-allocate map entries from the correct
		 * (pageable or not) zone, since we'll get new map entries
		 * during the transfer.
		 * We'll also adjust the map entries's "start" and "end"
		 * during the transfer, to keep "copy"'s entries consistent
		 * with its "offset".
		 */
		goto after_adjustments;
	}

	/*
	 * Since we're going to just drop the map
	 * entries from the copy into the destination
	 * map, they must come from the same pool.
	 */

	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
		/*
		 * Mismatches occur when dealing with the default
		 * pager.
		 */
		vm_map_entry_t next, new;

		/*
		 * Find the zone that the copies were allocated from
		 */

		entry = vm_map_copy_first_entry(copy);

		/*
		 * Reinitialize the copy so that vm_map_copy_entry_link
		 * will work.
		 */
		vm_map_store_copy_reset(copy, entry);
		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;

		/*
		 * Copy each entry.
		 */
		while (entry != vm_map_copy_to_entry(copy)) {
			new = vm_map_copy_entry_create(copy);
			vm_map_entry_copy_full(new, entry);
			new->vme_no_copy_on_read = FALSE;
			assert(!new->iokit_acct);
			if (new->is_sub_map) {
				/* clr address space specifics */
				new->use_pmap = FALSE;
			}
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    new);
			next = entry->vme_next;
			vm_map_entry_dispose(entry);
			entry = next;
		}
	}

	/*
	 * Adjust the addresses in the copy chain, and
	 * reset the region attributes.
	 */

	for (entry = vm_map_copy_first_entry(copy);
	    entry != vm_map_copy_to_entry(copy);
	    entry = entry->vme_next) {
		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
			/*
			 * We're injecting this copy entry into a map that
			 * has the standard page alignment, so clear
			 * "map_aligned" (which might have been inherited
			 * from the original map entry).
			 */
			entry->map_aligned = FALSE;
		}

		entry->vme_start += adjustment;
		entry->vme_end += adjustment;

		if (entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
			    VM_MAP_PAGE_MASK(dst_map)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
			    VM_MAP_PAGE_MASK(dst_map)));
		}

		entry->inheritance = VM_INHERIT_DEFAULT;
		entry->protection = VM_PROT_DEFAULT;
		entry->max_protection = VM_PROT_ALL;
		entry->behavior = VM_BEHAVIOR_DEFAULT;

		/*
		 * If the entry is now wired,
		 * map the pages into the destination map.
		 */
		if (entry->wired_count != 0) {
			vm_map_offset_t va;
			vm_object_offset_t offset;
			vm_object_t object;
			vm_prot_t prot;
			int type_of_fault;
			uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;

			/* TODO4K would need to use actual page size */
			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);

			object = VME_OBJECT(entry);
			offset = VME_OFFSET(entry);
			va = entry->vme_start;

			pmap_pageable(dst_map->pmap,
			    entry->vme_start,
			    entry->vme_end,
			    TRUE);

			while (va < entry->vme_end) {
				vm_page_t m;
				struct vm_object_fault_info fault_info = {};

				/*
				 * Look up the page in the object.
				 * Assert that the page will be found in the
				 * top object:
				 * either
				 * the object was newly created by
				 * vm_object_copy_slowly, and has
				 * copies of all of the pages from
				 * the source object
				 * or
				 * the object was moved from the old
				 * map entry; because the old map
				 * entry was wired, all of the pages
				 * were in the top-level object.
				 * (XXX not true if we wire pages for
				 * reading)
				 */
				vm_object_lock(object);

				m = vm_page_lookup(object, offset);
				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
				    m->vmp_absent) {
					panic("vm_map_copyout: wiring %p", m);
				}

				prot = entry->protection;

				if (override_nx(dst_map, VME_ALIAS(entry)) &&
				    prot) {
					prot |= VM_PROT_EXECUTE;
				}

				type_of_fault = DBG_CACHE_HIT_FAULT;

				fault_info.user_tag = VME_ALIAS(entry);
				fault_info.pmap_options = 0;
				if (entry->iokit_acct ||
				    (!entry->is_sub_map && !entry->use_pmap)) {
					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
				}
				if (entry->vme_xnu_user_debug &&
				    !VM_PAGE_OBJECT(m)->code_signed) {
					/*
					 * Modified code-signed executable
					 * region: this page does not belong
					 * to a code-signed VM object, so it
					 * must have been copied and should
					 * therefore be typed XNU_USER_DEBUG
					 * rather than XNU_USER_EXEC.
					 */
					fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
				}

				vm_fault_enter(m,
				    dst_map->pmap,
				    va,
				    PAGE_SIZE, 0,
				    prot,
				    prot,
				    VM_PAGE_WIRED(m),
				    FALSE, /* change_wiring */
				    VM_KERN_MEMORY_NONE, /* tag - not wiring */
				    &fault_info,
				    NULL, /* need_retry */
				    &type_of_fault,
				    &object_lock_type); /*Exclusive mode lock. Will remain unchanged.*/

				vm_object_unlock(object);

				offset += PAGE_SIZE_64;
				va += PAGE_SIZE;
			}
		}
	}

after_adjustments:

	/*
	 * Correct the page alignment for the result
	 */

	*dst_addr = start + (copy->offset - vm_copy_start);

#if KASAN
	kasan_notify_address(*dst_addr, size);
#endif

	/*
	 * Update the hints and the map size
	 */

	if (consume_on_success) {
		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
	} else {
		SAVE_HINT_MAP_WRITE(dst_map, last);
	}

	dst_map->size += size;

	/*
	 * Link in the copy
	 */

	if (consume_on_success) {
		vm_map_copy_insert(dst_map, last, copy);
		if (copy != original_copy) {
			/* the caller's copy was replaced by the adjusted one */
			vm_map_copy_discard(original_copy);
			original_copy = VM_MAP_COPY_NULL;
		}
	} else {
		vm_map_copy_remap(dst_map, last, copy, adjustment,
		    cur_protection, max_protection,
		    inheritance);
		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
			/* drop the adjusted copy; hand back the original */
			vm_map_copy_discard(copy);
			copy = original_copy;
		}
	}


	vm_map_unlock(dst_map);

	/*
	 * XXX	If wiring_required, call vm_map_pageable
	 */

	return KERN_SUCCESS;
}
11515
11516 /*
11517 * Routine: vm_map_copyout_size
11518 *
11519 * Description:
11520 * Copy out a copy chain ("copy") into newly-allocated
11521 * space in the destination map. Uses a prevalidated
11522 * size for the copy object (vm_map_copy_validate_size).
11523 *
11524 * If successful, consumes the copy object.
11525 * Otherwise, the caller is responsible for it.
11526 */
11527 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_ut copy_size)11528 vm_map_copyout_size(
11529 vm_map_t dst_map,
11530 vm_map_address_t *dst_addr, /* OUT */
11531 vm_map_copy_t copy,
11532 vm_map_size_ut copy_size)
11533 {
11534 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11535 TRUE, /* consume_on_success */
11536 VM_PROT_DEFAULT,
11537 VM_PROT_ALL,
11538 VM_INHERIT_DEFAULT);
11539 }
11540
11541 /*
11542 * Routine: vm_map_copyout
11543 *
11544 * Description:
11545 * Copy out a copy chain ("copy") into newly-allocated
11546 * space in the destination map.
11547 *
11548 * If successful, consumes the copy object.
11549 * Otherwise, the caller is responsible for it.
11550 */
11551 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)11552 vm_map_copyout(
11553 vm_map_t dst_map,
11554 vm_map_address_t *dst_addr, /* OUT */
11555 vm_map_copy_t copy)
11556 {
11557 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11558 TRUE, /* consume_on_success */
11559 VM_PROT_DEFAULT,
11560 VM_PROT_ALL,
11561 VM_INHERIT_DEFAULT);
11562 }
11563
11564 /*
11565 * Routine: vm_map_copyin
11566 *
11567 * Description:
11568 * see vm_map_copyin_common. Exported via Unsupported.exports.
11569 *
11570 */
11571
11572 #undef vm_map_copyin
11573
11574 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_ut src_addr,vm_map_size_ut len,boolean_t src_destroy,vm_map_copy_t * copy_result)11575 vm_map_copyin(
11576 vm_map_t src_map,
11577 vm_map_address_ut src_addr,
11578 vm_map_size_ut len,
11579 boolean_t src_destroy,
11580 vm_map_copy_t *copy_result) /* OUT */
11581 {
11582 return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11583 FALSE, copy_result, FALSE);
11584 }
11585
11586 /*
11587 * Routine: vm_map_copyin_common
11588 *
11589 * Description:
11590 * Copy the specified region (src_addr, len) from the
11591 * source address space (src_map), possibly removing
11592 * the region from the source address space (src_destroy).
11593 *
11594 * Returns:
11595 * A vm_map_copy_t object (copy_result), suitable for
11596 * insertion into another address space (using vm_map_copyout),
11597 * copying over another address space region (using
11598 * vm_map_copy_overwrite). If the copy is unused, it
11599 * should be destroyed (using vm_map_copy_discard).
11600 *
11601 * In/out conditions:
11602 * The source map should not be locked on entry.
11603 */
11604
/*
 * One level of the submap traversal stack used by vm_map_copyin_internal().
 * Each node records where the copy was in the parent map before descending
 * into a submap, so the walk can resume there afterwards.  Nodes are pushed
 * onto the singly-linked "parent_maps" list (innermost level at the head).
 */
typedef struct submap_map {
	vm_map_t        parent_map;     /* map we descended from */
	vm_map_offset_t base_start;     /* saved copy range start in parent_map */
	vm_map_offset_t base_end;       /* saved copy range end in parent_map */
	vm_map_size_t   base_len;       /* portion of the range handled at this level */
	struct submap_map *next;        /* next (outer) level, or NULL at base map */
} submap_map_t;
11612
11613 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_ut src_addr,vm_map_size_ut len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11614 vm_map_copyin_common(
11615 vm_map_t src_map,
11616 vm_map_address_ut src_addr,
11617 vm_map_size_ut len,
11618 boolean_t src_destroy,
11619 __unused boolean_t src_volatile,
11620 vm_map_copy_t *copy_result, /* OUT */
11621 boolean_t use_maxprot)
11622 {
11623 int flags;
11624
11625 flags = 0;
11626 if (src_destroy) {
11627 flags |= VM_MAP_COPYIN_SRC_DESTROY;
11628 }
11629 if (use_maxprot) {
11630 flags |= VM_MAP_COPYIN_USE_MAXPROT;
11631 }
11632 return vm_map_copyin_internal(src_map,
11633 src_addr,
11634 len,
11635 flags,
11636 copy_result);
11637 }
11638
/*
 * Routine:	vm_map_copyin_sanitize
 *
 * Description:
 *	Validate the caller-supplied (unsafe) address/size pair for a
 *	copyin from "src_map" and derive the working offsets:
 *	 - *src_addr_unaligned: the validated, original (unrounded) start
 *	 - *src_start / *src_end: the same range, truncated/rounded to
 *	   src_map's page boundaries
 *	 - *len: the validated length
 *	Addresses in kernel maps are additionally canonicalized.
 *
 * Returns:
 *	KERN_SUCCESS, or the (unsafe-typed) error from the sanitizer,
 *	in which case no output is meaningful.
 */
static inline kern_return_t
vm_map_copyin_sanitize(
	vm_map_t                src_map,
	vm_map_address_ut       src_addr_u,
	vm_map_size_ut          len_u,
	vm_map_offset_t         *src_start,
	vm_map_offset_t         *src_end,
	vm_map_size_t           *len,
	vm_map_offset_t         *src_addr_unaligned)
{
	kern_return_t           kr;

	/*
	 * Ask for the unaligned values so we can remember the exact start
	 * before rounding; a zero size is allowed to succeed here (callers
	 * short-circuit that case before calling us, but the sanitizer is
	 * told anyway).
	 */
	kr = vm_sanitize_addr_size(src_addr_u, len_u, VM_SANITIZE_CALLER_VM_MAP_COPYIN,
	    src_map,
	    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES |
	    (src_map->pmap == kernel_pmap ? VM_SANITIZE_FLAGS_CANONICALIZE : VM_SANITIZE_FLAGS_NONE),
	    src_start, src_end, len);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/*
	 * Compute (page aligned) start and end of region
	 */
	*src_addr_unaligned = *src_start;       /* remember unaligned value */
	*src_start = vm_map_trunc_page(*src_addr_unaligned,
	    VM_MAP_PAGE_MASK(src_map));
	*src_end = vm_map_round_page(*src_end, VM_MAP_PAGE_MASK(src_map));
	return KERN_SUCCESS;
}
11669
/*
 * Routine:	vm_map_copyin_internal
 *
 * Description:
 *	Copy the region [src_addr_u, src_addr_u + len_u) out of "src_map"
 *	into a newly allocated vm_map_copy_t, descending through nested
 *	submaps and using copy-on-write optimizations where the source
 *	permits.  "flags" is a mask of VM_MAP_COPYIN_* options (destroy
 *	source, use max protections, preserve purgeability, entry list,
 *	fork semantics).
 *
 * Returns:
 *	KERN_SUCCESS with *copy_result set, or an error with no copy
 *	object created (any partial copy is discarded via RETURN()).
 *
 * In/out conditions:
 *	The source map must not be locked on entry.
 */
kern_return_t
vm_map_copyin_internal(
	vm_map_t        src_map,
	vm_map_address_ut src_addr_u,
	vm_map_size_ut  len_u,
	int             flags,
	vm_map_copy_t   *copy_result)   /* OUT */
{
	vm_map_entry_t  tmp_entry;      /* Result of last map lookup --
	                                 * in multi-level lookup, this
	                                 * entry contains the actual
	                                 * vm_object/offset.
	                                 */
	vm_map_entry_t  new_entry = VM_MAP_ENTRY_NULL;  /* Map entry for copy */

	vm_map_offset_t src_start;      /* Start of current entry --
	                                 * where copy is taking place now
	                                 */
	vm_map_offset_t src_end;        /* End of entire region to be
	                                 * copied */
	vm_map_offset_t src_addr_unaligned;     /* original (unrounded) start */
	vm_map_offset_t src_base;
	vm_map_size_t   len;
	vm_map_t        base_map = src_map;     /* top-level map we started in */
	boolean_t       map_share = FALSE;      /* TRUE while inside a submap */
	submap_map_t    *parent_maps = NULL;    /* stack of maps we descended from */

	vm_map_copy_t   copy;           /* Resulting copy */
	vm_map_address_t copy_addr;
	vm_map_size_t   copy_size;
	boolean_t       src_destroy;
	boolean_t       use_maxprot;
	boolean_t       preserve_purgeable;
	boolean_t       entry_was_shared;
	vm_map_entry_t  saved_src_entry;
	kern_return_t   kr;


	/* Reject any flag bits outside the supported set. */
	if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Check for copies of zero bytes.
	 */
	if (VM_SANITIZE_UNSAFE_IS_ZERO(len_u)) {
		*copy_result = VM_MAP_COPY_NULL;
		return KERN_SUCCESS;
	}

	/*
	 * Sanitize any input parameters that are addr/size/prot/inherit
	 */
	kr = vm_map_copyin_sanitize(
		src_map,
		src_addr_u,
		len_u,
		&src_start,
		&src_end,
		&len,
		&src_addr_unaligned);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}

	src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
	use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
	preserve_purgeable =
	    (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;

	/*
	 * If the copy is sufficiently small, use a kernel buffer instead
	 * of making a virtual copy.  The theory being that the cost of
	 * setting up VM (and taking C-O-W faults) dominates the copy costs
	 * for small regions.
	 */
	if ((len <= msg_ool_size_small) &&
	    !use_maxprot &&
	    !preserve_purgeable &&
	    !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
	    /*
	     * Since the "msg_ool_size_small" threshold was increased and
	     * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
	     * address space limits, we revert to doing a virtual copy if the
	     * copied range goes beyond those limits.  Otherwise, mach_vm_read()
	     * of the commpage would now fail when it used to work.
	     */
	    (src_start >= vm_map_min(src_map) &&
	    src_start < vm_map_max(src_map) &&
	    src_end >= vm_map_min(src_map) &&
	    src_end < vm_map_max(src_map))) {
		return vm_map_copyin_kernel_buffer(src_map, src_addr_unaligned, len,
		           src_destroy, copy_result);
	}

	/*
	 * Allocate a header element for the list.
	 *
	 * Use the start and end in the header to
	 * remember the endpoints prior to rounding.
	 */

	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
	copy->cpy_hdr.entries_pageable = TRUE;
	copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
	copy->offset = src_addr_unaligned;
	copy->size = len;

	new_entry = vm_map_copy_entry_create(copy);

	/*
	 * Common error exit: unlock/deallocate the current (sub)map, dispose
	 * of the pending copy entry, discard the partial copy, and unwind
	 * the submap stack — then return "x" from this function.
	 */
#define	RETURN(x)						\
	MACRO_BEGIN						\
	vm_map_unlock(src_map);					\
	if(src_map != base_map)					\
		vm_map_deallocate(src_map);			\
	if (new_entry != VM_MAP_ENTRY_NULL)			\
		vm_map_copy_entry_dispose(new_entry);		\
	vm_map_copy_discard(copy);				\
	{							\
		submap_map_t *_ptr;				\
								\
		for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
			parent_maps=parent_maps->next;		\
			if (_ptr->parent_map != base_map)	\
				vm_map_deallocate(_ptr->parent_map); \
			kfree_type(submap_map_t, _ptr);		\
		}						\
	}							\
	MACRO_RETURN(x);					\
	MACRO_END

	/*
	 * Find the beginning of the region.
	 */

	vm_map_lock(src_map);

	/*
	 * Lookup the original "src_addr_unaligned" rather than the truncated
	 * "src_start", in case "src_start" falls in a non-map-aligned
	 * map entry *before* the map entry that contains "src_addr_unaligned"...
	 */
	if (!vm_map_lookup_entry(src_map, src_addr_unaligned, &tmp_entry)) {
		RETURN(KERN_INVALID_ADDRESS);
	}
	if (!tmp_entry->is_sub_map) {
		/*
		 * ... but clip to the map-rounded "src_start" rather than
		 * "src_addr_unaligned" to preserve map-alignment.  We'll adjust the
		 * first copy entry at the end, if needed.
		 */
		vm_map_clip_start(src_map, tmp_entry, src_start);
	}
	if (src_start < tmp_entry->vme_start) {
		/*
		 * Move "src_start" up to the start of the
		 * first map entry to copy.
		 */
		src_start = tmp_entry->vme_start;
	}
	/* set for later submap fix-up */
	copy_addr = src_start;

	/*
	 * Go through entries until we get to the end.
	 */

	while (TRUE) {
		vm_map_entry_t  src_entry = tmp_entry; /* Top-level entry */
		vm_map_size_t   src_size;       /* Size of source
		                                 * map entry (in both
		                                 * maps)
		                                 */

		vm_object_t     src_object;     /* Object to copy */
		vm_object_offset_t src_offset;

		vm_object_t     new_copy_object;/* vm_object_copy_* result */

		boolean_t       src_needs_copy; /* Should source map
		                                 * be made read-only
		                                 * for copy-on-write?
		                                 */

		boolean_t       new_entry_needs_copy;   /* Will new entry be COW? */

		boolean_t       was_wired;      /* Was source wired? */
		boolean_t       saved_used_for_jit;     /* Saved used_for_jit. */
		vm_map_version_t version;       /* Version before locks
		                                 * dropped to make copy
		                                 */
		kern_return_t   result;         /* Return value from
		                                 * copy_strategically.
		                                 */

		/*
		 * Descend through nested submaps until we reach the
		 * lowest-level entry, pushing each parent map onto the
		 * "parent_maps" stack so we can resume the walk there.
		 */
		while (tmp_entry->is_sub_map) {
			vm_map_size_t submap_len;
			submap_map_t *ptr;

			ptr = kalloc_type(submap_map_t, Z_WAITOK);
			ptr->next = parent_maps;
			parent_maps = ptr;
			ptr->parent_map = src_map;
			ptr->base_start = src_start;
			ptr->base_end = src_end;
			submap_len = tmp_entry->vme_end - src_start;
			if (submap_len > (src_end - src_start)) {
				submap_len = src_end - src_start;
			}
			ptr->base_len = submap_len;

			/* translate the range into submap coordinates */
			src_start -= tmp_entry->vme_start;
			src_start += VME_OFFSET(tmp_entry);
			src_end = src_start + submap_len;
			src_map = VME_SUBMAP(tmp_entry);
			vm_map_lock(src_map);
			/* keep an outstanding reference for all maps in */
			/* the parents tree except the base map */
			vm_map_reference(src_map);
			vm_map_unlock(ptr->parent_map);
			if (!vm_map_lookup_entry(
				    src_map, src_start, &tmp_entry)) {
				RETURN(KERN_INVALID_ADDRESS);
			}
			map_share = TRUE;
			if (!tmp_entry->is_sub_map) {
				vm_map_clip_start(src_map, tmp_entry, src_start);
			}
			src_entry = tmp_entry;
		}
		/* we are now in the lowest level submap... */

		if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
		    (VME_OBJECT(tmp_entry)->phys_contiguous)) {
			/*
			 * This is not supported for now.  In the future we
			 * would need to detect the phys_contiguous condition
			 * and then upgrade copy_slowly to do a physical copy
			 * from the device-memory-based object.  We could
			 * piggy-back off of the "was_wired" boolean to set up
			 * the proper handling.
			 */
			RETURN(KERN_PROTECTION_FAILURE);
		}
		/*
		 * Create a new address map entry to hold the result.
		 * Fill in the fields from the appropriate source entries.
		 * We must unlock the source map to do this if we need
		 * to allocate a map entry.
		 */
		if (new_entry == VM_MAP_ENTRY_NULL) {
			version.main_timestamp = src_map->timestamp;
			vm_map_unlock(src_map);

			new_entry = vm_map_copy_entry_create(copy);

			vm_map_lock(src_map);
			/* if the map changed while unlocked, redo the lookup */
			if ((version.main_timestamp + 1) != src_map->timestamp) {
				if (!vm_map_lookup_entry(src_map, src_start,
				    &tmp_entry)) {
					RETURN(KERN_INVALID_ADDRESS);
				}
				if (!tmp_entry->is_sub_map) {
					vm_map_clip_start(src_map, tmp_entry, src_start);
				}
				continue; /* restart w/ new tmp_entry */
			}
		}

		/*
		 * Verify that the region can be read.
		 */
		if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
		    !use_maxprot) ||
		    (src_entry->max_protection & VM_PROT_READ) == 0) {
			RETURN(KERN_PROTECTION_FAILURE);
		}

		/*
		 * Clip against the endpoints of the entire region.
		 */

		vm_map_clip_end(src_map, src_entry, src_end);

		src_size = src_entry->vme_end - src_start;
		src_object = VME_OBJECT(src_entry);
		src_offset = VME_OFFSET(src_entry);
		was_wired = (src_entry->wired_count != 0);

		vm_map_entry_copy(src_map, new_entry, src_entry);
		if (new_entry->is_sub_map) {
			/* clr address space specifics */
			new_entry->use_pmap = FALSE;
		} else {
			/*
			 * We're dealing with a copy-on-write operation,
			 * so the resulting mapping should not inherit the
			 * original mapping's accounting settings.
			 * "iokit_acct" should have been cleared in
			 * vm_map_entry_copy().
			 * "use_pmap" should be reset to its default (TRUE)
			 * so that the new mapping gets accounted for in
			 * the task's memory footprint.
			 */
			assert(!new_entry->iokit_acct);
			new_entry->use_pmap = TRUE;
		}

		/*
		 * Attempt non-blocking copy-on-write optimizations.
		 */

		/*
		 * If we are destroying the source, and the object
		 * is internal, we could move the object reference
		 * from the source to the copy.  The copy is
		 * copy-on-write only if the source is.
		 * We make another reference to the object, because
		 * destroying the source entry will deallocate it.
		 *
		 * This memory transfer has to be atomic, (to prevent
		 * the VM object from being shared or copied while
		 * it's being moved here), so we could only do this
		 * if we won't have to unlock the VM map until the
		 * original mapping has been fully removed.
		 */

RestartCopy:
		/*
		 * Fast path: try the quick (symmetric) object copy, allowed
		 * only when the source isn't wired or shared (unless the
		 * object is absent) and 4K-copyin COW isn't disabled.
		 */
		if ((src_object == VM_OBJECT_NULL ||
		    (!was_wired && !map_share && !tmp_entry->is_shared
		    && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
		    vm_object_copy_quickly(
			    VME_OBJECT(new_entry),
			    src_offset,
			    src_size,
			    &src_needs_copy,
			    &new_entry_needs_copy)) {
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 * Handle copy-on-write obligations
			 */

			if (src_needs_copy && !tmp_entry->needs_copy) {
				vm_prot_t prot;

				prot = src_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(src_map, VME_ALIAS(src_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				/* write-protect the source so it faults COW */
				vm_object_pmap_protect(
					src_object,
					src_offset,
					src_size,
					(src_entry->is_shared ?
					PMAP_NULL
					: src_map->pmap),
					VM_MAP_PAGE_SIZE(src_map),
					src_entry->vme_start,
					prot);

				assert(tmp_entry->wired_count == 0);
				tmp_entry->needs_copy = TRUE;
			}

			/*
			 * The map has never been unlocked, so it's safe
			 * to move to the next entry rather than doing
			 * another lookup.
			 */

			goto CopySuccessful;
		}

		entry_was_shared = tmp_entry->is_shared;

		/*
		 * Take an object reference, so that we may
		 * release the map lock(s).
		 */

		assert(src_object != VM_OBJECT_NULL);
		vm_object_reference(src_object);

		/*
		 * Record the timestamp for later verification.
		 * Unlock the map.
		 */

		version.main_timestamp = src_map->timestamp;
		vm_map_unlock(src_map); /* Increments timestamp once! */
		saved_src_entry = src_entry;
		/* entry pointers are stale once the lock is dropped */
		tmp_entry = VM_MAP_ENTRY_NULL;
		src_entry = VM_MAP_ENTRY_NULL;

		/*
		 * Perform the copy
		 */

		if (was_wired ||
		    (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
		    !(flags & VM_MAP_COPYIN_FORK)) ||
		    (debug4k_no_cow_copyin &&
		    VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
CopySlowly:
			/* eager physical copy: no COW possible/allowed */
			vm_object_lock(src_object);
			result = vm_object_copy_slowly(
				src_object,
				src_offset,
				src_size,
				THREAD_UNINT,
				&new_copy_object);
			/* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
			saved_used_for_jit = new_entry->used_for_jit;
			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
			new_entry->used_for_jit = saved_used_for_jit;
			VME_OFFSET_SET(new_entry,
			    src_offset - vm_object_trunc_page(src_offset));
			new_entry->needs_copy = FALSE;
		} else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
		    (entry_was_shared || map_share)) {
			vm_object_t new_object;

			/* shared symmetric source: use a delayed (COW) copy */
			vm_object_lock_shared(src_object);
			new_object = vm_object_copy_delayed(
				src_object,
				src_offset,
				src_size,
				TRUE);
			if (new_object == VM_OBJECT_NULL) {
				goto CopySlowly;
			}

			VME_OBJECT_SET(new_entry, new_object, false, 0);
			assert(new_entry->wired_count == 0);
			new_entry->needs_copy = TRUE;
			assert(!new_entry->iokit_acct);
			assert(new_object->purgable == VM_PURGABLE_DENY);
			assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
			result = KERN_SUCCESS;
		} else {
			/* let the object pick its own copy strategy */
			vm_object_offset_t new_offset;
			new_offset = VME_OFFSET(new_entry);
			result = vm_object_copy_strategically(src_object,
			    src_offset,
			    src_size,
			    (flags & VM_MAP_COPYIN_FORK),
			    &new_copy_object,
			    &new_offset,
			    &new_entry_needs_copy);
			/* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
			saved_used_for_jit = new_entry->used_for_jit;
			VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
			new_entry->used_for_jit = saved_used_for_jit;
			if (new_offset != VME_OFFSET(new_entry)) {
				VME_OFFSET_SET(new_entry, new_offset);
			}

			new_entry->needs_copy = new_entry_needs_copy;
		}

		if (result == KERN_SUCCESS &&
		    ((preserve_purgeable &&
		    src_object->purgable != VM_PURGABLE_DENY) ||
		    new_entry->used_for_jit)) {
			/*
			 * Purgeable objects should be COPY_NONE, true share;
			 * this should be propogated to the copy.
			 *
			 * Also force mappings the pmap specially protects to
			 * be COPY_NONE; trying to COW these mappings would
			 * change the effective protections, which could have
			 * side effects if the pmap layer relies on the
			 * specified protections.
			 */

			vm_object_t new_object;

			new_object = VME_OBJECT(new_entry);
			assert(new_object != src_object);
			vm_object_lock(new_object);
			assert(new_object->ref_count == 1);
			assert(new_object->shadow == VM_OBJECT_NULL);
			assert(new_object->vo_copy == VM_OBJECT_NULL);
			assert(new_object->vo_owner == NULL);

			new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

			if (preserve_purgeable &&
			    src_object->purgable != VM_PURGABLE_DENY) {
				VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);

				/* start as non-volatile with no owner... */
				VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
				vm_purgeable_nonvolatile_enqueue(new_object, NULL);
				/* ... and move to src_object's purgeable state */
				if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
					int state;
					state = src_object->purgable;
					vm_object_purgable_control(
						new_object,
						VM_PURGABLE_SET_STATE_FROM_KERNEL,
						&state);
				}
				/* no pmap accounting for purgeable objects */
				new_entry->use_pmap = FALSE;
			}

			vm_object_unlock(new_object);
			new_object = VM_OBJECT_NULL;
		}

		if (result != KERN_SUCCESS &&
		    result != KERN_MEMORY_RESTART_COPY) {
			vm_map_lock(src_map);
			RETURN(result);
		}

		/*
		 * Throw away the extra reference
		 */

		vm_object_deallocate(src_object);

		/*
		 * Verify that the map has not substantially
		 * changed while the copy was being made.
		 */

		vm_map_lock(src_map);

		if ((version.main_timestamp + 1) == src_map->timestamp) {
			/* src_map hasn't changed: src_entry is still valid */
			src_entry = saved_src_entry;
			goto VerificationSuccessful;
		}

		/*
		 * Simple version comparison failed.
		 *
		 * Retry the lookup and verify that the
		 * same object/offset are still present.
		 *
		 * [Note: a memory manager that colludes with
		 * the calling task can detect that we have
		 * cheated.  While the map was unlocked, the
		 * mapping could have been changed and restored.]
		 */

		if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
			if (result != KERN_MEMORY_RESTART_COPY) {
				vm_object_deallocate(VME_OBJECT(new_entry));
				VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
				/* reset accounting state */
				new_entry->iokit_acct = FALSE;
				new_entry->use_pmap = TRUE;
			}
			RETURN(KERN_INVALID_ADDRESS);
		}

		src_entry = tmp_entry;
		vm_map_clip_start(src_map, src_entry, src_start);

		if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
		    !use_maxprot) ||
		    ((src_entry->max_protection & VM_PROT_READ) == 0)) {
			goto VerificationFailed;
		}

		if (src_entry->vme_end < new_entry->vme_end) {
			/*
			 * This entry might have been shortened
			 * (vm_map_clip_end) or been replaced with
			 * an entry that ends closer to "src_start"
			 * than before.
			 * Adjust "new_entry" accordingly; copying
			 * less memory would be correct but we also
			 * redo the copy (see below) if the new entry
			 * no longer points at the same object/offset.
			 */
			assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
			    VM_MAP_COPY_PAGE_MASK(copy)));
			new_entry->vme_end = src_entry->vme_end;
			src_size = new_entry->vme_end - src_start;
		} else if (src_entry->vme_end > new_entry->vme_end) {
			/*
			 * This entry might have been extended
			 * (vm_map_entry_simplify() or coalesce)
			 * or been replaced with an entry that ends farther
			 * from "src_start" than before.
			 *
			 * We've called vm_object_copy_*() only on
			 * the previous <start:end> range, so we can't
			 * just extend new_entry.  We have to re-do
			 * the copy based on the new entry as if it was
			 * pointing at a different object/offset (see
			 * "Verification failed" below).
			 */
		}

		if ((VME_OBJECT(src_entry) != src_object) ||
		    (VME_OFFSET(src_entry) != src_offset) ||
		    (src_entry->vme_end > new_entry->vme_end)) {
			/*
			 * Verification failed.
			 *
			 * Start over with this top-level entry.
			 */

VerificationFailed: ;

			vm_object_deallocate(VME_OBJECT(new_entry));
			tmp_entry = src_entry;
			continue;
		}

		/*
		 * Verification succeeded.
		 */

VerificationSuccessful:;

		if (result == KERN_MEMORY_RESTART_COPY) {
			goto RestartCopy;
		}

		/*
		 * Copy succeeded.
		 */

CopySuccessful: ;

		/*
		 * Link in the new copy entry.
		 */

		vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
		    new_entry);

		/*
		 * Determine whether the entire region
		 * has been copied.
		 */
		src_base = src_start;
		src_start = new_entry->vme_end;
		new_entry = VM_MAP_ENTRY_NULL;
		/*
		 * If we've consumed the range at this submap level, pop
		 * back out to the parent map(s) and resume there.
		 */
		while ((src_start >= src_end) && (src_end != 0)) {
			submap_map_t *ptr;

			if (src_map == base_map) {
				/* back to the top */
				break;
			}

			ptr = parent_maps;
			assert(ptr != NULL);
			parent_maps = parent_maps->next;

			/* fix up the damage we did in that submap */
			vm_map_simplify_range(src_map,
			    src_base,
			    src_end);

			vm_map_unlock(src_map);
			vm_map_deallocate(src_map);
			vm_map_lock(ptr->parent_map);
			src_map = ptr->parent_map;
			src_base = ptr->base_start;
			src_start = ptr->base_start + ptr->base_len;
			src_end = ptr->base_end;
			if (!vm_map_lookup_entry(src_map,
			    src_start,
			    &tmp_entry) &&
			    (src_end > src_start)) {
				RETURN(KERN_INVALID_ADDRESS);
			}
			kfree_type(submap_map_t, ptr);
			if (parent_maps == NULL) {
				map_share = FALSE;
			}
			src_entry = tmp_entry->vme_prev;
		}

		if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
		    (src_start >= src_addr_unaligned + len) &&
		    (src_addr_unaligned + len != 0)) {
			/*
			 * Stop copying now, even though we haven't reached
			 * "src_end".  We'll adjust the end of the last copy
			 * entry at the end, if needed.
			 *
			 * If src_map's alignment is different from the
			 * system's page-alignment, there could be
			 * extra non-map-aligned map entries between
			 * the original (non-rounded) "src_addr_unaligned + len"
			 * and the rounded "src_end".
			 * We do not want to copy those map entries since
			 * they're not part of the copied range.
			 */
			break;
		}

		if ((src_start >= src_end) && (src_end != 0)) {
			break;
		}

		/*
		 * Verify that there are no gaps in the region
		 */

		tmp_entry = src_entry->vme_next;
		if ((tmp_entry->vme_start != src_start) ||
		    (tmp_entry == vm_map_to_entry(src_map))) {
			RETURN(KERN_INVALID_ADDRESS);
		}
	}

	/*
	 * If the source should be destroyed, do it now, since the
	 * copy was successful.
	 */
	if (src_destroy) {
		vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;

		if (src_map == kernel_map) {
			remove_flags |= VM_MAP_REMOVE_KUNWIRE;
		}
		(void)vm_map_remove_and_unlock(src_map,
		    vm_map_trunc_page(src_addr_unaligned, VM_MAP_PAGE_MASK(src_map)),
		    src_end,
		    remove_flags,
		    KMEM_GUARD_NONE);
	} else {
		/* fix up the damage we did in the base map */
		vm_map_simplify_range(
			src_map,
			vm_map_trunc_page(src_addr_unaligned,
			VM_MAP_PAGE_MASK(src_map)),
			vm_map_round_page(src_end,
			VM_MAP_PAGE_MASK(src_map)));
		vm_map_unlock(src_map);
	}

	tmp_entry = VM_MAP_ENTRY_NULL;

	if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
	    VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
		/*
		 * The source map uses larger pages than the copy object;
		 * re-align the first and last copy entries to the source
		 * map's page boundaries.
		 */
		vm_map_offset_t original_start, original_offset, original_end;

		assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);

		/* adjust alignment of first copy_entry's "vme_start" */
		tmp_entry = vm_map_copy_first_entry(copy);
		if (tmp_entry != vm_map_copy_to_entry(copy)) {
			vm_map_offset_t adjustment;

			original_start = tmp_entry->vme_start;
			original_offset = VME_OFFSET(tmp_entry);

			/* map-align the start of the first copy entry... */
			adjustment = (tmp_entry->vme_start -
			    vm_map_trunc_page(
				    tmp_entry->vme_start,
				    VM_MAP_PAGE_MASK(src_map)));
			tmp_entry->vme_start -= adjustment;
			VME_OFFSET_SET(tmp_entry,
			    VME_OFFSET(tmp_entry) - adjustment);
			copy_addr -= adjustment;
			assert(tmp_entry->vme_start < tmp_entry->vme_end);
			/* ... adjust for mis-aligned start of copy range */
			adjustment =
			    (vm_map_trunc_page(copy->offset,
			    PAGE_MASK) -
			    vm_map_trunc_page(copy->offset,
			    VM_MAP_PAGE_MASK(src_map)));
			if (adjustment) {
				assert(page_aligned(adjustment));
				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
				tmp_entry->vme_start += adjustment;
				VME_OFFSET_SET(tmp_entry,
				    (VME_OFFSET(tmp_entry) +
				    adjustment));
				copy_addr += adjustment;
				assert(tmp_entry->vme_start < tmp_entry->vme_end);
			}

			/*
			 * Assert that the adjustments haven't exposed
			 * more than was originally copied...
			 */
			assert(tmp_entry->vme_start >= original_start);
			assert(VME_OFFSET(tmp_entry) >= original_offset);
			/*
			 * ... and that it did not adjust outside of a
			 * a single 16K page.
			 */
			assert(vm_map_trunc_page(tmp_entry->vme_start,
			    VM_MAP_PAGE_MASK(src_map)) ==
			    vm_map_trunc_page(original_start,
			    VM_MAP_PAGE_MASK(src_map)));
		}

		/* adjust alignment of last copy_entry's "vme_end" */
		tmp_entry = vm_map_copy_last_entry(copy);
		if (tmp_entry != vm_map_copy_to_entry(copy)) {
			vm_map_offset_t adjustment;

			original_end = tmp_entry->vme_end;

			/* map-align the end of the last copy entry... */
			tmp_entry->vme_end =
			    vm_map_round_page(tmp_entry->vme_end,
			    VM_MAP_PAGE_MASK(src_map));
			/* ... adjust for mis-aligned end of copy range */
			adjustment =
			    (vm_map_round_page((copy->offset +
			    copy->size),
			    VM_MAP_PAGE_MASK(src_map)) -
			    vm_map_round_page((copy->offset +
			    copy->size),
			    PAGE_MASK));
			if (adjustment) {
				assert(page_aligned(adjustment));
				assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
				tmp_entry->vme_end -= adjustment;
				assert(tmp_entry->vme_start < tmp_entry->vme_end);
			}

			/*
			 * Assert that the adjustments haven't exposed
			 * more than was originally copied...
			 */
			assert(tmp_entry->vme_end <= original_end);
			/*
			 * ... and that it did not adjust outside of a
			 * a single 16K page.
			 */
			assert(vm_map_round_page(tmp_entry->vme_end,
			    VM_MAP_PAGE_MASK(src_map)) ==
			    vm_map_round_page(original_end,
			    VM_MAP_PAGE_MASK(src_map)));
		}
	}

	/* Fix-up start and end points in copy.  This is necessary */
	/* when the various entries in the copy object were picked */
	/* up from different sub-maps */

	tmp_entry = vm_map_copy_first_entry(copy);
	copy_size = 0; /* compute actual size */
	while (tmp_entry != vm_map_copy_to_entry(copy)) {
		assert(VM_MAP_PAGE_ALIGNED(
			    copy_addr + (tmp_entry->vme_end -
			    tmp_entry->vme_start),
			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
		assert(VM_MAP_PAGE_ALIGNED(
			    copy_addr,
			    MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));

		/*
		 * The copy_entries will be injected directly into the
		 * destination map and might not be "map aligned" there...
		 */
		tmp_entry->map_aligned = FALSE;

		/* rebase this entry to the contiguous "copy_addr" space */
		tmp_entry->vme_end = copy_addr +
		    (tmp_entry->vme_end - tmp_entry->vme_start);
		tmp_entry->vme_start = copy_addr;
		assert(tmp_entry->vme_start < tmp_entry->vme_end);
		copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
		copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
		tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
	}

	if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
	    copy_size < copy->size) {
		/*
		 * The actual size of the VM map copy is smaller than what
		 * was requested by the caller.  This must be because some
		 * PAGE_SIZE-sized pages are missing at the end of the last
		 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
		 * The caller might not have been aware of those missing
		 * pages and might not want to be aware of it, which is
		 * fine as long as they don't try to access (and crash on)
		 * those missing pages.
		 * Let's adjust the size of the "copy", to avoid failing
		 * in vm_map_copyout() or vm_map_copy_overwrite().
		 */
		assert(vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(src_map)) ==
		    vm_map_round_page(copy->size,
		    VM_MAP_PAGE_MASK(src_map)));
		copy->size = copy_size;
	}

	*copy_result = copy;
	return KERN_SUCCESS;

#undef	RETURN
}
12571
/*
 * Routine:	vm_map_copy_extract
 *
 * Description:
 *	Extract the range [src_addr, src_addr + len) from "src_map" into
 *	a new vm_map_copy_t without going through vm_map_copyin(): the
 *	entries are produced by vm_map_remap_extract(), either sharing
 *	the source ("do_copy" FALSE) or copying it ("do_copy" TRUE).
 *
 *	"cur_prot"/"max_prot" are IN/OUT: on input they name the minimum
 *	protections the caller requires (VM_PROT_NONE for "no minimum");
 *	on output they report the protections actually obtained.
 *
 * Returns:
 *	KERN_SUCCESS with *copy_result set; KERN_TERMINATED if the
 *	source address space is gone; or the extraction error, with any
 *	partial copy discarded.
 */
kern_return_t
vm_map_copy_extract(
	vm_map_t                src_map,
	vm_map_address_t        src_addr,
	vm_map_size_t           len,
	boolean_t               do_copy,
	vm_map_copy_t           *copy_result,   /* OUT */
	vm_prot_t               *cur_prot,      /* IN/OUT */
	vm_prot_t               *max_prot,      /* IN/OUT */
	vm_inherit_t            inheritance,
	vm_map_kernel_flags_t   vmk_flags)
{
	vm_map_copy_t   copy;
	kern_return_t   kr;
	vm_prot_t required_cur_prot, required_max_prot;

	/*
	 * Check for copies of zero bytes.
	 */

	if (len == 0) {
		*copy_result = VM_MAP_COPY_NULL;
		return KERN_SUCCESS;
	}

	/*
	 * Check that the end address doesn't overflow
	 */
	if (src_addr + len < src_addr) {
		return KERN_INVALID_ADDRESS;
	}
	if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
		return KERN_INVALID_ADDRESS;
	}

	if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
		DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
	}

	/* remember the caller's minimum requirements before they're overwritten */
	required_cur_prot = *cur_prot;
	required_max_prot = *max_prot;

	/*
	 * Allocate a header element for the list.
	 *
	 * Use the start and end in the header to
	 * remember the endpoints prior to rounding.
	 */

	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
	copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
	copy->offset = 0;
	copy->size = len;

	kr = vm_map_remap_extract(src_map,
	    src_addr,
	    len,
	    do_copy,                    /* copy */
	    copy,
	    cur_prot,                   /* IN/OUT */
	    max_prot,                   /* IN/OUT */
	    inheritance,
	    vmk_flags);
	if (kr != KERN_SUCCESS) {
		vm_map_copy_discard(copy);
		if ((kr == KERN_INVALID_ADDRESS ||
		    kr == KERN_INVALID_ARGUMENT) &&
		    src_map->terminated) {
			/* tell the caller that this address space is gone */
			kr = KERN_TERMINATED;
		}
		return kr;
	}
	/* the obtained protections must cover what the caller required */
	if (required_cur_prot != VM_PROT_NONE) {
		assert((*cur_prot & required_cur_prot) == required_cur_prot);
		assert((*max_prot & required_max_prot) == required_max_prot);
	}

	*copy_result = copy;
	return KERN_SUCCESS;
}
12653
12654 static void
vm_map_fork_share(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)12655 vm_map_fork_share(
12656 vm_map_t old_map,
12657 vm_map_entry_t old_entry,
12658 vm_map_t new_map)
12659 {
12660 vm_object_t object;
12661 vm_map_entry_t new_entry;
12662
12663 /*
12664 * New sharing code. New map entry
12665 * references original object. Internal
12666 * objects use asynchronous copy algorithm for
12667 * future copies. First make sure we have
12668 * the right object. If we need a shadow,
12669 * or someone else already has one, then
12670 * make a new shadow and share it.
12671 */
12672
12673 if (!old_entry->is_sub_map) {
12674 object = VME_OBJECT(old_entry);
12675 }
12676
12677 if (old_entry->is_sub_map) {
12678 assert(old_entry->wired_count == 0);
12679 #ifndef NO_NESTED_PMAP
12680 #if !PMAP_FORK_NEST
12681 if (old_entry->use_pmap) {
12682 kern_return_t result;
12683
12684 result = pmap_nest(new_map->pmap,
12685 (VME_SUBMAP(old_entry))->pmap,
12686 (addr64_t)old_entry->vme_start,
12687 (uint64_t)(old_entry->vme_end - old_entry->vme_start));
12688 if (result) {
12689 panic("vm_map_fork_share: pmap_nest failed!");
12690 }
12691 }
12692 #endif /* !PMAP_FORK_NEST */
12693 #endif /* NO_NESTED_PMAP */
12694 } else if (object == VM_OBJECT_NULL) {
12695 object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
12696 old_entry->vme_start));
12697 VME_OFFSET_SET(old_entry, 0);
12698 VME_OBJECT_SET(old_entry, object, false, 0);
12699 old_entry->use_pmap = TRUE;
12700 // assert(!old_entry->needs_copy);
12701 } else if (object->copy_strategy !=
12702 MEMORY_OBJECT_COPY_SYMMETRIC) {
12703 /*
12704 * We are already using an asymmetric
12705 * copy, and therefore we already have
12706 * the right object.
12707 */
12708
12709 assert(!old_entry->needs_copy);
12710 } else if (old_entry->needs_copy || /* case 1 */
12711 object->shadowed || /* case 2 */
12712 (!object->true_share && /* case 3 */
12713 !old_entry->is_shared &&
12714 (object->vo_size >
12715 (vm_map_size_t)(old_entry->vme_end -
12716 old_entry->vme_start)))) {
12717 bool is_writable;
12718
12719 /*
12720 * We need to create a shadow.
12721 * There are three cases here.
12722 * In the first case, we need to
12723 * complete a deferred symmetrical
12724 * copy that we participated in.
12725 * In the second and third cases,
12726 * we need to create the shadow so
12727 * that changes that we make to the
12728 * object do not interfere with
12729 * any symmetrical copies which
12730 * have occured (case 2) or which
12731 * might occur (case 3).
12732 *
12733 * The first case is when we had
12734 * deferred shadow object creation
12735 * via the entry->needs_copy mechanism.
12736 * This mechanism only works when
12737 * only one entry points to the source
12738 * object, and we are about to create
12739 * a second entry pointing to the
12740 * same object. The problem is that
12741 * there is no way of mapping from
12742 * an object to the entries pointing
12743 * to it. (Deferred shadow creation
12744 * works with one entry because occurs
12745 * at fault time, and we walk from the
12746 * entry to the object when handling
12747 * the fault.)
12748 *
12749 * The second case is when the object
12750 * to be shared has already been copied
12751 * with a symmetric copy, but we point
12752 * directly to the object without
12753 * needs_copy set in our entry. (This
12754 * can happen because different ranges
12755 * of an object can be pointed to by
12756 * different entries. In particular,
12757 * a single entry pointing to an object
12758 * can be split by a call to vm_inherit,
12759 * which, combined with task_create, can
12760 * result in the different entries
12761 * having different needs_copy values.)
12762 * The shadowed flag in the object allows
12763 * us to detect this case. The problem
12764 * with this case is that if this object
12765 * has or will have shadows, then we
12766 * must not perform an asymmetric copy
12767 * of this object, since such a copy
12768 * allows the object to be changed, which
12769 * will break the previous symmetrical
12770 * copies (which rely upon the object
12771 * not changing). In a sense, the shadowed
12772 * flag says "don't change this object".
12773 * We fix this by creating a shadow
12774 * object for this object, and sharing
12775 * that. This works because we are free
12776 * to change the shadow object (and thus
12777 * to use an asymmetric copy strategy);
12778 * this is also semantically correct,
12779 * since this object is temporary, and
12780 * therefore a copy of the object is
12781 * as good as the object itself. (This
12782 * is not true for permanent objects,
12783 * since the pager needs to see changes,
12784 * which won't happen if the changes
12785 * are made to a copy.)
12786 *
12787 * The third case is when the object
12788 * to be shared has parts sticking
12789 * outside of the entry we're working
12790 * with, and thus may in the future
12791 * be subject to a symmetrical copy.
12792 * (This is a preemptive version of
12793 * case 2.)
12794 */
12795 VME_OBJECT_SHADOW(old_entry,
12796 (vm_map_size_t) (old_entry->vme_end -
12797 old_entry->vme_start),
12798 vm_map_always_shadow(old_map));
12799
12800 /*
12801 * If we're making a shadow for other than
12802 * copy on write reasons, then we have
12803 * to remove write permission.
12804 */
12805
12806 is_writable = false;
12807 if (old_entry->protection & VM_PROT_WRITE) {
12808 is_writable = true;
12809 #if __arm64e__
12810 } else if (old_entry->used_for_tpro) {
12811 is_writable = true;
12812 #endif /* __arm64e__ */
12813 }
12814 if (!old_entry->needs_copy && is_writable) {
12815 vm_prot_t prot;
12816
12817 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
12818 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
12819 __FUNCTION__, old_map, old_map->pmap,
12820 old_entry,
12821 (uint64_t)old_entry->vme_start,
12822 (uint64_t)old_entry->vme_end,
12823 old_entry->protection);
12824 }
12825
12826 prot = old_entry->protection & ~VM_PROT_WRITE;
12827
12828 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
12829 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
12830 __FUNCTION__, old_map, old_map->pmap,
12831 old_entry,
12832 (uint64_t)old_entry->vme_start,
12833 (uint64_t)old_entry->vme_end,
12834 prot);
12835 }
12836
12837 if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
12838 prot |= VM_PROT_EXECUTE;
12839 }
12840
12841
12842 if (old_map->mapped_in_other_pmaps) {
12843 vm_object_pmap_protect(
12844 VME_OBJECT(old_entry),
12845 VME_OFFSET(old_entry),
12846 (old_entry->vme_end -
12847 old_entry->vme_start),
12848 PMAP_NULL,
12849 PAGE_SIZE,
12850 old_entry->vme_start,
12851 prot);
12852 } else {
12853 pmap_protect(old_map->pmap,
12854 old_entry->vme_start,
12855 old_entry->vme_end,
12856 prot);
12857 }
12858 }
12859
12860 old_entry->needs_copy = FALSE;
12861 object = VME_OBJECT(old_entry);
12862 }
12863
12864
12865 /*
12866 * If object was using a symmetric copy strategy,
12867 * change its copy strategy to the default
12868 * asymmetric copy strategy, which is copy_delay
12869 * in the non-norma case and copy_call in the
12870 * norma case. Bump the reference count for the
12871 * new entry.
12872 */
12873
12874 if (old_entry->is_sub_map) {
12875 vm_map_reference(VME_SUBMAP(old_entry));
12876 } else {
12877 vm_object_lock(object);
12878 vm_object_reference_locked(object);
12879 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
12880 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
12881 }
12882 vm_object_unlock(object);
12883 }
12884
12885 /*
12886 * Clone the entry, using object ref from above.
12887 * Mark both entries as shared.
12888 */
12889
12890 new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
12891 vm_map_entry_copy(old_map, new_entry, old_entry);
12892 old_entry->is_shared = TRUE;
12893 new_entry->is_shared = TRUE;
12894
12895 /*
12896 * We're dealing with a shared mapping, so the resulting mapping
12897 * should inherit some of the original mapping's accounting settings.
12898 * "iokit_acct" should have been cleared in vm_map_entry_copy().
12899 * "use_pmap" should stay the same as before (if it hasn't been reset
12900 * to TRUE when we cleared "iokit_acct").
12901 */
12902 assert(!new_entry->iokit_acct);
12903
12904 /*
12905 * If old entry's inheritence is VM_INHERIT_NONE,
12906 * the new entry is for corpse fork, remove the
12907 * write permission from the new entry.
12908 */
12909 if (old_entry->inheritance == VM_INHERIT_NONE) {
12910 new_entry->protection &= ~VM_PROT_WRITE;
12911 new_entry->max_protection &= ~VM_PROT_WRITE;
12912 }
12913
12914 /*
12915 * Insert the entry into the new map -- we
12916 * know we're inserting at the end of the new
12917 * map.
12918 */
12919
12920 vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
12921 VM_MAP_KERNEL_FLAGS_NONE);
12922
12923 /*
12924 * Update the physical map
12925 */
12926
12927 if (old_entry->is_sub_map) {
12928 /* Bill Angell pmap support goes here */
12929 } else {
12930 pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
12931 old_entry->vme_end - old_entry->vme_start,
12932 old_entry->vme_start);
12933 }
12934 }
12935
12936 static boolean_t
vm_map_fork_copy(vm_map_t old_map,vm_map_entry_t * old_entry_p,vm_map_t new_map,int vm_map_copyin_flags)12937 vm_map_fork_copy(
12938 vm_map_t old_map,
12939 vm_map_entry_t *old_entry_p,
12940 vm_map_t new_map,
12941 int vm_map_copyin_flags)
12942 {
12943 vm_map_entry_t old_entry = *old_entry_p;
12944 vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
12945 vm_map_offset_t start = old_entry->vme_start;
12946 vm_map_copy_t copy;
12947 vm_map_entry_t last = vm_map_last_entry(new_map);
12948
12949 vm_map_unlock(old_map);
12950 /*
12951 * Use maxprot version of copyin because we
12952 * care about whether this memory can ever
12953 * be accessed, not just whether it's accessible
12954 * right now.
12955 */
12956 vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
12957 if (vm_map_copyin_internal(old_map, start, entry_size,
12958 vm_map_copyin_flags, ©)
12959 != KERN_SUCCESS) {
12960 /*
12961 * The map might have changed while it
12962 * was unlocked, check it again. Skip
12963 * any blank space or permanently
12964 * unreadable region.
12965 */
12966 vm_map_lock(old_map);
12967 if (!vm_map_lookup_entry(old_map, start, &last) ||
12968 (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
12969 last = last->vme_next;
12970 }
12971 *old_entry_p = last;
12972
12973 /*
12974 * XXX For some error returns, want to
12975 * XXX skip to the next element. Note
12976 * that INVALID_ADDRESS and
12977 * PROTECTION_FAILURE are handled above.
12978 */
12979
12980 return FALSE;
12981 }
12982
12983 /*
12984 * Assert that the vm_map_copy is coming from the right
12985 * zone and hasn't been forged
12986 */
12987 vm_map_copy_require(copy);
12988
12989 /*
12990 * Insert the copy into the new map
12991 */
12992 vm_map_copy_insert(new_map, last, copy);
12993
12994 /*
12995 * Pick up the traversal at the end of
12996 * the copied region.
12997 */
12998
12999 vm_map_lock(old_map);
13000 start += entry_size;
13001 if (!vm_map_lookup_entry(old_map, start, &last)) {
13002 last = last->vme_next;
13003 } else {
13004 if (last->vme_start == start) {
13005 /*
13006 * No need to clip here and we don't
13007 * want to cause any unnecessary
13008 * unnesting...
13009 */
13010 } else {
13011 vm_map_clip_start(old_map, last, start);
13012 }
13013 }
13014 *old_entry_p = last;
13015
13016 return TRUE;
13017 }
13018
13019 #if PMAP_FORK_NEST
13020 #define PMAP_FORK_NEST_DEBUG 0
13021 static inline void
vm_map_fork_unnest(pmap_t new_pmap,vm_map_offset_t pre_nested_start,vm_map_offset_t pre_nested_end,vm_map_offset_t start,vm_map_offset_t end)13022 vm_map_fork_unnest(
13023 pmap_t new_pmap,
13024 vm_map_offset_t pre_nested_start,
13025 vm_map_offset_t pre_nested_end,
13026 vm_map_offset_t start,
13027 vm_map_offset_t end)
13028 {
13029 kern_return_t kr;
13030 vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13031
13032 assertf(pre_nested_start <= pre_nested_end,
13033 "pre_nested start 0x%llx end 0x%llx",
13034 (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13035 assertf(start <= end,
13036 "start 0x%llx end 0x%llx",
13037 (uint64_t) start, (uint64_t)end);
13038
13039 if (pre_nested_start == pre_nested_end) {
13040 /* nothing was pre-nested: done */
13041 return;
13042 }
13043 if (end <= pre_nested_start) {
13044 /* fully before pre-nested range: done */
13045 return;
13046 }
13047 if (start >= pre_nested_end) {
13048 /* fully after pre-nested range: done */
13049 return;
13050 }
13051 /* ignore parts of range outside of pre_nested range */
13052 if (start < pre_nested_start) {
13053 start = pre_nested_start;
13054 }
13055 if (end > pre_nested_end) {
13056 end = pre_nested_end;
13057 }
13058 nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13059 start_unnest = start & ~nesting_mask;
13060 end_unnest = (end + nesting_mask) & ~nesting_mask;
13061 kr = pmap_unnest(new_pmap,
13062 (addr64_t)start_unnest,
13063 (uint64_t)(end_unnest - start_unnest));
13064 #if PMAP_FORK_NEST_DEBUG
13065 printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13066 #endif /* PMAP_FORK_NEST_DEBUG */
13067 assertf(kr == KERN_SUCCESS,
13068 "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13069 (uint64_t)start, (uint64_t)end, new_pmap,
13070 (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13071 kr);
13072 }
13073 #endif /* PMAP_FORK_NEST */
13074
13075 void
vm_map_inherit_limits(vm_map_t new_map,const struct _vm_map * old_map)13076 vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13077 {
13078 new_map->size_limit = old_map->size_limit;
13079 new_map->data_limit = old_map->data_limit;
13080 new_map->user_wire_limit = old_map->user_wire_limit;
13081 new_map->reserved_regions = old_map->reserved_regions;
13082 }
13083
13084 /*
13085 * vm_map_fork:
13086 *
13087 * Create and return a new map based on the old
13088 * map, according to the inheritance values on the
13089 * regions in that map and the options.
13090 *
13091 * The source map must not be locked.
13092 */
13093 vm_map_t
vm_map_fork(ledger_t ledger,vm_map_t old_map,int options)13094 vm_map_fork(
13095 ledger_t ledger,
13096 vm_map_t old_map,
13097 int options)
13098 {
13099 pmap_t new_pmap;
13100 vm_map_t new_map;
13101 vm_map_entry_t old_entry;
13102 vm_map_size_t new_size = 0, entry_size;
13103 vm_map_entry_t new_entry;
13104 boolean_t src_needs_copy;
13105 boolean_t new_entry_needs_copy;
13106 boolean_t pmap_is64bit;
13107 int vm_map_copyin_flags;
13108 vm_inherit_t old_entry_inheritance;
13109 int map_create_options;
13110 kern_return_t footprint_collect_kr;
13111
13112 if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13113 VM_MAP_FORK_PRESERVE_PURGEABLE |
13114 VM_MAP_FORK_CORPSE_FOOTPRINT |
13115 VM_MAP_FORK_SHARE_IF_OWNED)) {
13116 /* unsupported option */
13117 return VM_MAP_NULL;
13118 }
13119
13120 pmap_is64bit =
13121 #if defined(__i386__) || defined(__x86_64__)
13122 old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13123 #elif defined(__arm64__)
13124 old_map->pmap->is_64bit;
13125 #else
13126 #error Unknown architecture.
13127 #endif
13128
13129 unsigned int pmap_flags = 0;
13130 pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13131 #if defined(HAS_APPLE_PAC)
13132 pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13133 #endif
13134 #if CONFIG_ROSETTA
13135 pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13136 #endif
13137 #if PMAP_CREATE_FORCE_4K_PAGES
13138 if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13139 PAGE_SIZE != FOURK_PAGE_SIZE) {
13140 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13141 }
13142 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
13143 new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13144 if (new_pmap == NULL) {
13145 return VM_MAP_NULL;
13146 }
13147
13148 vm_map_reference(old_map);
13149 vm_map_lock(old_map);
13150
13151 map_create_options = 0;
13152 if (old_map->hdr.entries_pageable) {
13153 map_create_options |= VM_MAP_CREATE_PAGEABLE;
13154 }
13155 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13156 map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13157 footprint_collect_kr = KERN_SUCCESS;
13158 }
13159 new_map = vm_map_create_options(new_pmap,
13160 old_map->min_offset,
13161 old_map->max_offset,
13162 map_create_options);
13163
13164 /* inherit cs_enforcement */
13165 vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13166
13167 vm_map_lock(new_map);
13168 vm_commit_pagezero_status(new_map);
13169 /* inherit the parent map's page size */
13170 vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13171
13172 /* inherit the parent rlimits */
13173 vm_map_inherit_limits(new_map, old_map);
13174
13175 #if CONFIG_MAP_RANGES
13176 /* inherit the parent map's VM ranges */
13177 vm_map_range_fork(new_map, old_map);
13178 #endif
13179
13180 #if CODE_SIGNING_MONITOR
13181 /* Prepare the monitor for the fork */
13182 csm_fork_prepare(old_map->pmap, new_pmap);
13183 #endif
13184
13185 #if PMAP_FORK_NEST
13186 /*
13187 * Pre-nest the shared region's pmap.
13188 */
13189 vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13190 pmap_fork_nest(old_map->pmap, new_pmap,
13191 &pre_nested_start, &pre_nested_end);
13192 #if PMAP_FORK_NEST_DEBUG
13193 printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13194 #endif /* PMAP_FORK_NEST_DEBUG */
13195 #endif /* PMAP_FORK_NEST */
13196
13197 for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13198 /*
13199 * Abort any corpse collection if the system is shutting down.
13200 */
13201 if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13202 get_system_inshutdown()) {
13203 #if PMAP_FORK_NEST
13204 new_entry = vm_map_last_entry(new_map);
13205 if (new_entry == vm_map_to_entry(new_map)) {
13206 /* unnest all that was pre-nested */
13207 vm_map_fork_unnest(new_pmap,
13208 pre_nested_start, pre_nested_end,
13209 vm_map_min(new_map), vm_map_max(new_map));
13210 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13211 /* unnest hole at the end, if pre-nested */
13212 vm_map_fork_unnest(new_pmap,
13213 pre_nested_start, pre_nested_end,
13214 new_entry->vme_end, vm_map_max(new_map));
13215 }
13216 #endif /* PMAP_FORK_NEST */
13217 vm_map_corpse_footprint_collect_done(new_map);
13218 vm_map_unlock(new_map);
13219 vm_map_unlock(old_map);
13220 vm_map_deallocate(new_map);
13221 vm_map_deallocate(old_map);
13222 printf("Aborting corpse map due to system shutdown\n");
13223 return VM_MAP_NULL;
13224 }
13225
13226 entry_size = old_entry->vme_end - old_entry->vme_start;
13227
13228 #if PMAP_FORK_NEST
13229 /*
13230 * Undo any unnecessary pre-nesting.
13231 */
13232 vm_map_offset_t prev_end;
13233 if (old_entry == vm_map_first_entry(old_map)) {
13234 prev_end = vm_map_min(old_map);
13235 } else {
13236 prev_end = old_entry->vme_prev->vme_end;
13237 }
13238 if (prev_end < old_entry->vme_start) {
13239 /* unnest hole before this entry, if pre-nested */
13240 vm_map_fork_unnest(new_pmap,
13241 pre_nested_start, pre_nested_end,
13242 prev_end, old_entry->vme_start);
13243 }
13244 if (old_entry->is_sub_map && old_entry->use_pmap) {
13245 /* keep this entry nested in the child */
13246 #if PMAP_FORK_NEST_DEBUG
13247 printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13248 #endif /* PMAP_FORK_NEST_DEBUG */
13249 } else {
13250 /* undo nesting for this entry, if pre-nested */
13251 vm_map_fork_unnest(new_pmap,
13252 pre_nested_start, pre_nested_end,
13253 old_entry->vme_start, old_entry->vme_end);
13254 }
13255 #endif /* PMAP_FORK_NEST */
13256
13257 old_entry_inheritance = old_entry->inheritance;
13258 /*
13259 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option
13260 * share VM_INHERIT_NONE entries that are not backed by a
13261 * device pager.
13262 */
13263 if (old_entry_inheritance == VM_INHERIT_NONE &&
13264 (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13265 (old_entry->protection & VM_PROT_READ) &&
13266 !(!old_entry->is_sub_map &&
13267 VME_OBJECT(old_entry) != NULL &&
13268 VME_OBJECT(old_entry)->pager != NULL &&
13269 is_device_pager_ops(
13270 VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13271 old_entry_inheritance = VM_INHERIT_SHARE;
13272 }
13273 if (old_entry_inheritance == VM_INHERIT_COPY &&
13274 (options & VM_MAP_FORK_SHARE_IF_OWNED) &&
13275 !old_entry->is_sub_map &&
13276 VME_OBJECT(old_entry) != VM_OBJECT_NULL) {
13277 vm_object_t object;
13278 task_t owner;
13279 object = VME_OBJECT(old_entry);
13280 owner = VM_OBJECT_OWNER(object);
13281 if (owner != TASK_NULL &&
13282 owner->map == old_map) {
13283 /*
13284 * This mapping points at a VM object owned
13285 * by the task being forked.
13286 * Some tools reporting memory accounting
13287 * info rely on the object ID, so share this
13288 * mapping instead of copying, to make the
13289 * corpse look exactly like the original
13290 * task in that respect.
13291 */
13292 assert(object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC);
13293 old_entry_inheritance = VM_INHERIT_SHARE;
13294 }
13295 }
13296
13297 if (old_entry_inheritance != VM_INHERIT_NONE &&
13298 (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13299 footprint_collect_kr == KERN_SUCCESS) {
13300 /*
13301 * The corpse won't have old_map->pmap to query
13302 * footprint information, so collect that data now
13303 * and store it in new_map->vmmap_corpse_footprint
13304 * for later autopsy.
13305 */
13306 footprint_collect_kr =
13307 vm_map_corpse_footprint_collect(old_map,
13308 old_entry,
13309 new_map);
13310 }
13311
13312 switch (old_entry_inheritance) {
13313 case VM_INHERIT_NONE:
13314 break;
13315
13316 case VM_INHERIT_SHARE:
13317 vm_map_fork_share(old_map, old_entry, new_map);
13318 new_size += entry_size;
13319 break;
13320
13321 case VM_INHERIT_COPY:
13322
13323 /*
13324 * Inline the copy_quickly case;
13325 * upon failure, fall back on call
13326 * to vm_map_fork_copy.
13327 */
13328
13329 if (old_entry->is_sub_map) {
13330 break;
13331 }
13332 if ((old_entry->wired_count != 0) ||
13333 ((VME_OBJECT(old_entry) != NULL) &&
13334 (VME_OBJECT(old_entry)->true_share))) {
13335 goto slow_vm_map_fork_copy;
13336 }
13337
13338 new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13339 vm_map_entry_copy(old_map, new_entry, old_entry);
13340 if (old_entry->vme_permanent) {
13341 /* inherit "permanent" on fork() */
13342 new_entry->vme_permanent = TRUE;
13343 }
13344
13345 if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13346 new_map->jit_entry_exists = TRUE;
13347 }
13348
13349 if (new_entry->is_sub_map) {
13350 /* clear address space specifics */
13351 new_entry->use_pmap = FALSE;
13352 } else {
13353 /*
13354 * We're dealing with a copy-on-write operation,
13355 * so the resulting mapping should not inherit
13356 * the original mapping's accounting settings.
13357 * "iokit_acct" should have been cleared in
13358 * vm_map_entry_copy().
13359 * "use_pmap" should be reset to its default
13360 * (TRUE) so that the new mapping gets
13361 * accounted for in the task's memory footprint.
13362 */
13363 assert(!new_entry->iokit_acct);
13364 new_entry->use_pmap = TRUE;
13365 }
13366
13367 if (!vm_object_copy_quickly(
13368 VME_OBJECT(new_entry),
13369 VME_OFFSET(old_entry),
13370 (old_entry->vme_end -
13371 old_entry->vme_start),
13372 &src_needs_copy,
13373 &new_entry_needs_copy)) {
13374 vm_map_entry_dispose(new_entry);
13375 goto slow_vm_map_fork_copy;
13376 }
13377
13378 /*
13379 * Handle copy-on-write obligations
13380 */
13381
13382 if (src_needs_copy && !old_entry->needs_copy) {
13383 vm_prot_t prot;
13384
13385 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13386 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13387 __FUNCTION__,
13388 old_map, old_map->pmap, old_entry,
13389 (uint64_t)old_entry->vme_start,
13390 (uint64_t)old_entry->vme_end,
13391 old_entry->protection);
13392 }
13393
13394 prot = old_entry->protection & ~VM_PROT_WRITE;
13395
13396 if (override_nx(old_map, VME_ALIAS(old_entry))
13397 && prot) {
13398 prot |= VM_PROT_EXECUTE;
13399 }
13400
13401 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13402 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13403 __FUNCTION__,
13404 old_map, old_map->pmap, old_entry,
13405 (uint64_t)old_entry->vme_start,
13406 (uint64_t)old_entry->vme_end,
13407 prot);
13408 }
13409
13410 vm_object_pmap_protect(
13411 VME_OBJECT(old_entry),
13412 VME_OFFSET(old_entry),
13413 (old_entry->vme_end -
13414 old_entry->vme_start),
13415 ((old_entry->is_shared
13416 || old_map->mapped_in_other_pmaps)
13417 ? PMAP_NULL :
13418 old_map->pmap),
13419 VM_MAP_PAGE_SIZE(old_map),
13420 old_entry->vme_start,
13421 prot);
13422
13423 assert(old_entry->wired_count == 0);
13424 old_entry->needs_copy = TRUE;
13425 }
13426 new_entry->needs_copy = new_entry_needs_copy;
13427
13428 /*
13429 * Insert the entry at the end
13430 * of the map.
13431 */
13432
13433 vm_map_store_entry_link(new_map,
13434 vm_map_last_entry(new_map),
13435 new_entry,
13436 VM_MAP_KERNEL_FLAGS_NONE);
13437 new_size += entry_size;
13438 break;
13439
13440 slow_vm_map_fork_copy:
13441 vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13442 if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13443 vm_map_copyin_flags |=
13444 VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13445 }
13446 if (vm_map_fork_copy(old_map,
13447 &old_entry,
13448 new_map,
13449 vm_map_copyin_flags)) {
13450 new_size += entry_size;
13451 }
13452 continue;
13453 }
13454 old_entry = old_entry->vme_next;
13455 }
13456
13457 #if PMAP_FORK_NEST
13458 new_entry = vm_map_last_entry(new_map);
13459 if (new_entry == vm_map_to_entry(new_map)) {
13460 /* unnest all that was pre-nested */
13461 vm_map_fork_unnest(new_pmap,
13462 pre_nested_start, pre_nested_end,
13463 vm_map_min(new_map), vm_map_max(new_map));
13464 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13465 /* unnest hole at the end, if pre-nested */
13466 vm_map_fork_unnest(new_pmap,
13467 pre_nested_start, pre_nested_end,
13468 new_entry->vme_end, vm_map_max(new_map));
13469 }
13470 #endif /* PMAP_FORK_NEST */
13471
13472 #if defined(__arm64__)
13473 pmap_insert_commpage(new_map->pmap);
13474 #endif /* __arm64__ */
13475
13476 new_map->size = new_size;
13477
13478 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13479 vm_map_corpse_footprint_collect_done(new_map);
13480 }
13481
13482 /* Propagate JIT entitlement for the pmap layer. */
13483 if (pmap_get_jit_entitled(old_map->pmap)) {
13484 /* Tell the pmap that it supports JIT. */
13485 pmap_set_jit_entitled(new_map->pmap);
13486 }
13487
13488 /* Propagate TPRO settings for the pmap layer */
13489 if (pmap_get_tpro(old_map->pmap)) {
13490 /* Tell the pmap that it supports TPRO */
13491 pmap_set_tpro(new_map->pmap);
13492 }
13493
13494
13495 vm_map_unlock(new_map);
13496 vm_map_unlock(old_map);
13497 vm_map_deallocate(old_map);
13498
13499 return new_map;
13500 }
13501
13502 /*
13503 * vm_map_exec:
13504 *
13505 * Setup the "new_map" with the proper execution environment according
13506 * to the type of executable (platform, 64bit, chroot environment).
13507 * Map the comm page and shared region, etc...
13508 */
13509 kern_return_t
vm_map_exec(vm_map_t new_map,task_t task,boolean_t is64bit,void * fsroot,cpu_type_t cpu,cpu_subtype_t cpu_subtype,boolean_t reslide,boolean_t is_driverkit,uint32_t rsr_version)13510 vm_map_exec(
13511 vm_map_t new_map,
13512 task_t task,
13513 boolean_t is64bit,
13514 void *fsroot,
13515 cpu_type_t cpu,
13516 cpu_subtype_t cpu_subtype,
13517 boolean_t reslide,
13518 boolean_t is_driverkit,
13519 uint32_t rsr_version)
13520 {
13521 SHARED_REGION_TRACE_DEBUG(
13522 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13523 (void *)VM_KERNEL_ADDRPERM(current_task()),
13524 (void *)VM_KERNEL_ADDRPERM(new_map),
13525 (void *)VM_KERNEL_ADDRPERM(task),
13526 (void *)VM_KERNEL_ADDRPERM(fsroot),
13527 cpu,
13528 cpu_subtype));
13529 (void) vm_commpage_enter(new_map, task, is64bit);
13530
13531 (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13532
13533 SHARED_REGION_TRACE_DEBUG(
13534 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13535 (void *)VM_KERNEL_ADDRPERM(current_task()),
13536 (void *)VM_KERNEL_ADDRPERM(new_map),
13537 (void *)VM_KERNEL_ADDRPERM(task),
13538 (void *)VM_KERNEL_ADDRPERM(fsroot),
13539 cpu,
13540 cpu_subtype));
13541
13542 /*
13543 * Some devices have region(s) of memory that shouldn't get allocated by
13544 * user processes. The following code creates dummy vm_map_entry_t's for each
13545 * of the regions that needs to be reserved to prevent any allocations in
13546 * those regions.
13547 */
13548 kern_return_t kr = KERN_FAILURE;
13549 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13550 vmk_flags.vmkf_beyond_max = true;
13551
13552 const struct vm_reserved_region *regions = NULL;
13553 size_t num_regions = ml_get_vm_reserved_regions(is64bit, ®ions);
13554 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13555
13556 for (size_t i = 0; i < num_regions; ++i) {
13557 vm_map_offset_t address = regions[i].vmrr_addr;
13558
13559 kr = vm_map_enter(
13560 new_map,
13561 &address,
13562 regions[i].vmrr_size,
13563 (vm_map_offset_t)0,
13564 vmk_flags,
13565 VM_OBJECT_NULL,
13566 (vm_object_offset_t)0,
13567 FALSE,
13568 VM_PROT_NONE,
13569 VM_PROT_NONE,
13570 VM_INHERIT_COPY);
13571
13572 if (kr != KERN_SUCCESS) {
13573 panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13574 }
13575 }
13576
13577 new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13578
13579 return KERN_SUCCESS;
13580 }
13581
/*
 * Statistics for the copy-on-write handling of submap entries in
 * vm_map_lookup_and_lock_object():
 *  - "copy_slowly"        : wired submap entries copied via vm_object_copy_slowly()
 *  - "copy_strategically" : non-symmetric-copy-strategy objects copied via
 *                           vm_object_copy_strategically()
 *  - "copy_shadow"        : symmetric-copy objects handled by setting up a
 *                           shadow (deferred) copy
 * "count"/"size"/"max" track how many copies, total bytes, and the largest
 * single copy; "restart"/"error" count retries and failures.
 */
uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13595 /*
13596 * vm_map_lookup_and_lock_object:
13597 *
13598 * Finds the VM object, offset, and
13599 * protection for a given virtual address in the
13600 * specified map, assuming a page fault of the
13601 * type specified.
13602 *
13603 * Returns the (object, offset, protection) for
13604 * this address, whether it is wired down, and whether
13605 * this map has the only reference to the data in question.
13606 * In order to later verify this lookup, a "version"
13607 * is returned.
13608 * If contended != NULL, *contended will be set to
13609 * true iff the thread had to spin or block to acquire
13610 * an exclusive lock.
13611 *
13612 * The map MUST be locked by the caller and WILL be
13613 * locked on exit. In order to guarantee the
13614 * existence of the returned object, it is returned
13615 * locked.
13616 *
13617 * If a lookup is requested with "write protection"
13618 * specified, the map may be changed to perform virtual
13619 * copying operations, although the data referenced will
13620 * remain the same.
13621 */
kern_return_t
vm_map_lookup_and_lock_object(
	vm_map_t                *var_map,       /* IN/OUT */
	vm_map_offset_t         vaddr,
	vm_prot_t               fault_type,
	int                     object_lock_type,
	vm_map_version_t        *out_version,   /* OUT */
	vm_object_t             *object,        /* OUT */
	vm_object_offset_t      *offset,        /* OUT */
	vm_prot_t               *out_prot,      /* OUT */
	boolean_t               *wired,         /* OUT */
	vm_object_fault_info_t  fault_info,     /* OUT */
	vm_map_t                *real_map,      /* OUT */
	bool                    *contended)     /* OUT */
{
	vm_map_entry_t                  entry;
	vm_map_t                        map = *var_map;
	vm_map_t                        old_map = *var_map; /* original top map, for override_nx() and is_shared decisions */
	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
	vm_map_offset_t                 cow_parent_vaddr = 0;
	vm_map_offset_t                 old_start = 0;
	vm_map_offset_t                 old_end = 0;
	vm_prot_t                       prot;
	boolean_t                       mask_protections;
	boolean_t                       force_copy;
	boolean_t                       no_force_copy_if_executable;
	boolean_t                       submap_needed_copy;
	vm_prot_t                       original_fault_type;
	vm_map_size_t                   fault_page_mask;

	/*
	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
	 * as a mask against the mapping's actual protections, not as an
	 * absolute value.
	 */
	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
	/* strip the modifier bits; only the real access bits drive the lookup */
	fault_type &= VM_PROT_ALL;
	original_fault_type = fault_type;
	if (contended) {
		*contended = false;
	}

	*real_map = map;

	/* truncate to the smaller of the map's and kernel's page size */
	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);

RetryLookup:
	fault_type = original_fault_type;

	/*
	 * If the map has an interesting hint, try it before calling
	 * full blown lookup routine.
	 */
	entry = map->hint;

	if ((entry == vm_map_to_entry(map)) ||
	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
		vm_map_entry_t  tmp_entry;

		/*
		 * Entry was either not a valid hint, or the vaddr
		 * was not contained in the entry, so do a full lookup.
		 */
		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
			/* drop any extra locks taken while descending submaps */
			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
				vm_map_unlock(cow_sub_map_parent);
			}
			if ((*real_map != map)
			    && (*real_map != cow_sub_map_parent)) {
				vm_map_unlock(*real_map);
			}
			return KERN_INVALID_ADDRESS;
		}

		entry = tmp_entry;
	}
	if (map == old_map) {
		/* remember the top-level entry's range for later clipping */
		old_start = entry->vme_start;
		old_end = entry->vme_end;
	}

	/*
	 * Handle submaps.  Drop lock on upper map, submap is
	 * returned locked.
	 */

	submap_needed_copy = FALSE;
submap_recurse:
	if (entry->is_sub_map) {
		vm_map_offset_t         local_vaddr;
		vm_map_offset_t         end_delta;
		vm_map_offset_t         start_delta;
		vm_map_offset_t         top_entry_saved_start;
		vm_object_offset_t      top_entry_saved_offset;
		vm_map_entry_t          submap_entry, saved_submap_entry;
		vm_object_offset_t      submap_entry_offset;
		vm_object_size_t        submap_entry_size;
		vm_prot_t               subentry_protection;
		vm_prot_t               subentry_max_protection;
		boolean_t               subentry_no_copy_on_read;
		boolean_t               subentry_permanent;
		boolean_t               subentry_csm_associated;
#if __arm64e__
		boolean_t               subentry_used_for_tpro;
#endif /* __arm64e__ */
		boolean_t               mapped_needs_copy = FALSE;
		vm_map_version_t        version;

		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
		    "map %p (%d) entry %p submap %p (%d)\n",
		    map, VM_MAP_PAGE_SHIFT(map), entry,
		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));

		local_vaddr = vaddr;
		top_entry_saved_start = entry->vme_start;
		top_entry_saved_offset = VME_OFFSET(entry);

		/*
		 * A read fault on a nested (use_pmap) submap can be satisfied
		 * directly from the submap: make it the "real" map.
		 */
		if ((entry->use_pmap &&
		    !((fault_type & VM_PROT_WRITE) ||
		    force_copy))) {
			/* if real_map equals map we unlock below */
			if ((*real_map != map) &&
			    (*real_map != cow_sub_map_parent)) {
				vm_map_unlock(*real_map);
			}
			*real_map = VME_SUBMAP(entry);
		}

		if (entry->needs_copy &&
		    ((fault_type & VM_PROT_WRITE) ||
		    force_copy)) {
			/*
			 * Write fault on a copy-on-write submap entry: keep the
			 * parent map write-locked (it will receive the new COW
			 * object) and descend into the submap read-locked.
			 */
			if (!mapped_needs_copy) {
				if (vm_map_lock_read_to_write(map)) {
					/* lost the lock during upgrade: start over */
					vm_map_lock_read(map);
					*real_map = map;
					goto RetryLookup;
				}
				vm_map_lock_read(VME_SUBMAP(entry));
				*var_map = VME_SUBMAP(entry);
				cow_sub_map_parent = map;
				/* reset base to map before cow object */
				/* this is the map which will accept */
				/* the new cow object */
				old_start = entry->vme_start;
				old_end = entry->vme_end;
				cow_parent_vaddr = vaddr;
				mapped_needs_copy = TRUE;
			} else {
				vm_map_lock_read(VME_SUBMAP(entry));
				*var_map = VME_SUBMAP(entry);
				if ((cow_sub_map_parent != map) &&
				    (*real_map != map)) {
					vm_map_unlock(map);
				}
			}
		} else {
			if (entry->needs_copy) {
				/* read fault through a needs_copy submap: remember
				 * to demote write permission at the end */
				submap_needed_copy = TRUE;
			}
			vm_map_lock_read(VME_SUBMAP(entry));
			*var_map = VME_SUBMAP(entry);
			/* leave map locked if it is a target */
			/* cow sub_map above otherwise, just  */
			/* follow the maps down to the object */
			/* here we unlock knowing we are not  */
			/* revisiting the map.                */
			if ((*real_map != map) && (map != cow_sub_map_parent)) {
				vm_map_unlock_read(map);
			}
		}

		entry = NULL;
		map = *var_map;

		/* calculate the offset in the submap for vaddr */
		local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
		    (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);

RetrySubMap:
		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
				vm_map_unlock(cow_sub_map_parent);
			}
			if ((*real_map != map)
			    && (*real_map != cow_sub_map_parent)) {
				vm_map_unlock(*real_map);
			}
			*real_map = map;
			return KERN_INVALID_ADDRESS;
		}

		/* find the attenuated shadow of the underlying object */
		/* on our target map */

		/* in english the submap object may extend beyond the     */
		/* region mapped by the entry or, may only fill a portion */
		/* of it.  For our purposes, we only care if the object   */
		/* doesn't fill.  In this case the area which will        */
		/* ultimately be clipped in the top map will only need    */
		/* to be as big as the portion of the underlying entry    */
		/* which is mapped */
		start_delta = submap_entry->vme_start > top_entry_saved_offset ?
		    submap_entry->vme_start - top_entry_saved_offset : 0;

		end_delta =
		    (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
		    submap_entry->vme_end ?
		    0 : (top_entry_saved_offset +
		    (old_end - old_start))
		    - submap_entry->vme_end;

		old_start += start_delta;
		old_end -= end_delta;

		if (submap_entry->is_sub_map) {
			/* submaps can nest: keep descending */
			entry = submap_entry;
			vaddr = local_vaddr;
			goto submap_recurse;
		}

		if (((fault_type & VM_PROT_WRITE) ||
		    force_copy)
		    && cow_sub_map_parent) {
			/*
			 * COW fault on a submap entry: copy (or shadow) the
			 * underlying object and substitute it into the parent
			 * map's entry, bypassing the submap from now on.
			 */
			vm_object_t     sub_object, copy_object;
			vm_object_offset_t copy_offset;
			vm_map_offset_t local_start;
			vm_map_offset_t local_end;
			boolean_t       object_copied = FALSE;
			vm_object_offset_t object_copied_offset = 0;
			boolean_t       object_copied_needs_copy = FALSE;
			kern_return_t   kr = KERN_SUCCESS;

			if (vm_map_lock_read_to_write(map)) {
				/* upgrade failed: restore clipping bounds and retry */
				vm_map_lock_read(map);
				old_start -= start_delta;
				old_end += end_delta;
				goto RetrySubMap;
			}


			sub_object = VME_OBJECT(submap_entry);
			if (sub_object == VM_OBJECT_NULL) {
				/* no backing object yet: create a zero-fill one */
				sub_object =
				    vm_object_allocate(
					(vm_map_size_t)
					(submap_entry->vme_end -
					submap_entry->vme_start));
				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
				VME_OFFSET_SET(submap_entry, 0);
				assert(!submap_entry->is_sub_map);
				assert(submap_entry->use_pmap);
			}
			local_start = local_vaddr -
			    (cow_parent_vaddr - old_start);
			local_end = local_vaddr +
			    (old_end - cow_parent_vaddr);
			vm_map_clip_start(map, submap_entry, local_start);
			vm_map_clip_end(map, submap_entry, local_end);
			if (submap_entry->is_sub_map) {
				/* unnesting was done when clipping */
				assert(!submap_entry->use_pmap);
			}

			/* This is the COW case, lets connect */
			/* an entry in our space to the underlying */
			/* object in the submap, bypassing the  */
			/* submap. */
			submap_entry_offset = VME_OFFSET(submap_entry);
			submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;

			if ((submap_entry->wired_count != 0 ||
			    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
			    (submap_entry->protection & VM_PROT_EXECUTE) &&
			    no_force_copy_if_executable) {
				/*
				 * Caller asked us to fail rather than force a
				 * copy of an executable mapping.
				 */
//				printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
					vm_map_unlock(cow_sub_map_parent);
				}
				if ((*real_map != map)
				    && (*real_map != cow_sub_map_parent)) {
					vm_map_unlock(*real_map);
				}
				*real_map = map;
				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
				vm_map_lock_write_to_read(map);
				kr = KERN_PROTECTION_FAILURE;
				DTRACE_VM4(submap_no_copy_executable,
				    vm_map_t, map,
				    vm_object_offset_t, submap_entry_offset,
				    vm_object_size_t, submap_entry_size,
				    int, kr);
				return kr;
			}

			if (submap_entry->wired_count != 0) {
				/*
				 * Wired mapping: can't use copy-on-write, so make
				 * an immediate physical copy of the pages.
				 */
				vm_object_reference(sub_object);

				assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
				    "submap_entry %p offset 0x%llx\n",
				    submap_entry, VME_OFFSET(submap_entry));

				DTRACE_VM6(submap_copy_slowly,
				    vm_map_t, cow_sub_map_parent,
				    vm_map_offset_t, vaddr,
				    vm_map_t, map,
				    vm_object_size_t, submap_entry_size,
				    int, submap_entry->wired_count,
				    int, sub_object->copy_strategy);

				saved_submap_entry = submap_entry;
				version.main_timestamp = map->timestamp;
				vm_map_unlock(map); /* Increments timestamp by 1 */
				submap_entry = VM_MAP_ENTRY_NULL;

				vm_object_lock(sub_object);
				kr = vm_object_copy_slowly(sub_object,
				    submap_entry_offset,
				    submap_entry_size,
				    FALSE,
				    &copy_object);
				object_copied = TRUE;
				object_copied_offset = 0;
				/* 4k: account for extra offset in physical page */
				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
				object_copied_needs_copy = FALSE;
				vm_object_deallocate(sub_object);

				vm_map_lock(map);

				if (kr != KERN_SUCCESS &&
				    kr != KERN_MEMORY_RESTART_COPY) {
					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
						vm_map_unlock(cow_sub_map_parent);
					}
					if ((*real_map != map)
					    && (*real_map != cow_sub_map_parent)) {
						vm_map_unlock(*real_map);
					}
					*real_map = map;
					vm_object_deallocate(copy_object);
					copy_object = VM_OBJECT_NULL;
					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
					vm_map_lock_write_to_read(map);
					DTRACE_VM4(submap_copy_error_slowly,
					    vm_object_t, sub_object,
					    vm_object_offset_t, submap_entry_offset,
					    vm_object_size_t, submap_entry_size,
					    int, kr);
					vm_map_lookup_and_lock_object_copy_slowly_error++;
					return kr;
				}

				if ((kr == KERN_SUCCESS) &&
				    (version.main_timestamp + 1) == map->timestamp) {
					/* map unchanged while unlocked: entry still valid */
					submap_entry = saved_submap_entry;
				} else {
					/* map changed underneath us: discard the copy and retry */
					saved_submap_entry = NULL;
					old_start -= start_delta;
					old_end += end_delta;
					vm_object_deallocate(copy_object);
					copy_object = VM_OBJECT_NULL;
					vm_map_lock_write_to_read(map);
					vm_map_lookup_and_lock_object_copy_slowly_restart++;
					goto RetrySubMap;
				}
				vm_map_lookup_and_lock_object_copy_slowly_count++;
				vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
					vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
				}
			} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
				/*
				 * Object has its own (non-symmetric) copy strategy:
				 * let it perform the copy.
				 */
				submap_entry_offset = VME_OFFSET(submap_entry);
				copy_object = VM_OBJECT_NULL;
				object_copied_offset = submap_entry_offset;
				object_copied_needs_copy = FALSE;
				DTRACE_VM6(submap_copy_strategically,
				    vm_map_t, cow_sub_map_parent,
				    vm_map_offset_t, vaddr,
				    vm_map_t, map,
				    vm_object_size_t, submap_entry_size,
				    int, submap_entry->wired_count,
				    int, sub_object->copy_strategy);
				kr = vm_object_copy_strategically(
					sub_object,
					submap_entry_offset,
					submap_entry->vme_end - submap_entry->vme_start,
					false, /* forking */
					&copy_object,
					&object_copied_offset,
					&object_copied_needs_copy);
				if (kr == KERN_MEMORY_RESTART_COPY) {
					old_start -= start_delta;
					old_end += end_delta;
					vm_object_deallocate(copy_object);
					copy_object = VM_OBJECT_NULL;
					vm_map_lock_write_to_read(map);
					vm_map_lookup_and_lock_object_copy_strategically_restart++;
					goto RetrySubMap;
				}
				if (kr != KERN_SUCCESS) {
					if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
						vm_map_unlock(cow_sub_map_parent);
					}
					if ((*real_map != map)
					    && (*real_map != cow_sub_map_parent)) {
						vm_map_unlock(*real_map);
					}
					*real_map = map;
					vm_object_deallocate(copy_object);
					copy_object = VM_OBJECT_NULL;
					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
					vm_map_lock_write_to_read(map);
					DTRACE_VM4(submap_copy_error_strategically,
					    vm_object_t, sub_object,
					    vm_object_offset_t, submap_entry_offset,
					    vm_object_size_t, submap_entry_size,
					    int, kr);
					vm_map_lookup_and_lock_object_copy_strategically_error++;
					return kr;
				}
				assert(copy_object != VM_OBJECT_NULL);
				assert(copy_object != sub_object);
				object_copied = TRUE;
				vm_map_lookup_and_lock_object_copy_strategically_count++;
				vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
					vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
				}
			} else {
				/* set up shadow object */
				object_copied = FALSE;
				copy_object = sub_object;
				vm_object_lock(sub_object);
				vm_object_reference_locked(sub_object);
				VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
				vm_object_unlock(sub_object);

				assert(submap_entry->wired_count == 0);
				submap_entry->needs_copy = TRUE;

				prot = submap_entry->protection;
				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
					    __FUNCTION__,
					    map, map->pmap, submap_entry,
					    (uint64_t)submap_entry->vme_start,
					    (uint64_t)submap_entry->vme_end,
					    prot);
				}
				/* write-protect the originals so the first write faults */
				prot = prot & ~VM_PROT_WRITE;
				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
					panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
					    __FUNCTION__,
					    map, map->pmap, submap_entry,
					    (uint64_t)submap_entry->vme_start,
					    (uint64_t)submap_entry->vme_end,
					    prot);
				}

				if (override_nx(old_map,
				    VME_ALIAS(submap_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				vm_object_pmap_protect(
					sub_object,
					VME_OFFSET(submap_entry),
					submap_entry->vme_end -
					submap_entry->vme_start,
					(submap_entry->is_shared
					|| map->mapped_in_other_pmaps) ?
					PMAP_NULL : map->pmap,
					VM_MAP_PAGE_SIZE(map),
					submap_entry->vme_start,
					prot);
				vm_map_lookup_and_lock_object_copy_shadow_count++;
				vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
				if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
					vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
				}
			}

			/*
			 * Adjust the fault offset to the submap entry.
			 */
			copy_offset = (local_vaddr -
			    submap_entry->vme_start +
			    VME_OFFSET(submap_entry));

			/* This works differently than the   */
			/* normal submap case. We go back    */
			/* to the parent of the cow map and  */
			/* clip out the target portion of    */
			/* the sub_map, substituting the     */
			/* new copy object,                  */

			/* snapshot the subentry attributes before unlocking */
			subentry_protection = submap_entry->protection;
			subentry_max_protection = submap_entry->max_protection;
			subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
			subentry_permanent = submap_entry->vme_permanent;
			subentry_csm_associated = submap_entry->csm_associated;
#if __arm64e__
			subentry_used_for_tpro = submap_entry->used_for_tpro;
#endif /* __arm64e__ */
			vm_map_unlock(map);
			submap_entry = NULL; /* not valid after map unlock */

			local_start = old_start;
			local_end = old_end;
			map = cow_sub_map_parent;
			*var_map = cow_sub_map_parent;
			vaddr = cow_parent_vaddr;
			cow_sub_map_parent = NULL;

			if (!vm_map_lookup_entry(map,
			    vaddr, &entry)) {
				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
					vm_map_unlock(cow_sub_map_parent);
				}
				if ((*real_map != map)
				    && (*real_map != cow_sub_map_parent)) {
					vm_map_unlock(*real_map);
				}
				*real_map = map;
				vm_object_deallocate(
					copy_object);
				copy_object = VM_OBJECT_NULL;
				vm_map_lock_write_to_read(map);
				DTRACE_VM4(submap_lookup_post_unlock,
				    uint64_t, (uint64_t)entry->vme_start,
				    uint64_t, (uint64_t)entry->vme_end,
				    vm_map_offset_t, vaddr,
				    int, object_copied);
				return KERN_INVALID_ADDRESS;
			}

			/* clip out the portion of space */
			/* mapped by the sub map which   */
			/* corresponds to the underlying */
			/* object */

			/*
			 * Clip (and unnest) the smallest nested chunk
			 * possible around the faulting address...
			 */
			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
			local_end = local_start + pmap_shared_region_size_min(map->pmap);
			/*
			 * ... but don't go beyond the "old_start" to "old_end"
			 * range, to avoid spanning over another VM region
			 * with a possibly different VM object and/or offset.
			 */
			if (local_start < old_start) {
				local_start = old_start;
			}
			if (local_end > old_end) {
				local_end = old_end;
			}
			/*
			 * Adjust copy_offset to the start of the range.
			 */
			copy_offset -= (vaddr - local_start);

			vm_map_clip_start(map, entry, local_start);
			vm_map_clip_end(map, entry, local_end);
			if (entry->is_sub_map) {
				/* unnesting was done when clipping */
				assert(!entry->use_pmap);
			}

			/* substitute copy object for */
			/* shared map entry           */
			vm_map_deallocate(VME_SUBMAP(entry));
			assert(!entry->iokit_acct);
			entry->use_pmap = TRUE;
			VME_OBJECT_SET(entry, copy_object, false, 0);

			/* propagate the submap entry's protections */
			if (entry->protection != VM_PROT_READ) {
				/*
				 * Someone has already altered the top entry's
				 * protections via vm_protect(VM_PROT_COPY).
				 * Respect these new values and ignore the
				 * submap entry's protections.
				 */
			} else {
				/*
				 * Regular copy-on-write: propagate the submap
				 * entry's protections to the top map entry.
				 */
				entry->protection |= subentry_protection;
			}
			entry->max_protection |= subentry_max_protection;
			/* propagate some attributes from subentry */
			entry->vme_no_copy_on_read = subentry_no_copy_on_read;
			entry->vme_permanent = subentry_permanent;
			entry->csm_associated = subentry_csm_associated;
#if __arm64e__
			/* propagate TPRO iff the destination map has TPRO enabled */
			if (subentry_used_for_tpro && vm_map_tpro(map)) {
				entry->used_for_tpro = subentry_used_for_tpro;
			}
#endif /* __arm64e__ */
			/* enforce W^X: strip execute from writable+executable entries */
			if ((entry->protection & VM_PROT_WRITE) &&
			    (entry->protection & VM_PROT_EXECUTE) &&
#if XNU_TARGET_OS_OSX
			    map->pmap != kernel_pmap &&
			    (vm_map_cs_enforcement(map)
#if __arm64__
			    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
			    ) &&
#endif /* XNU_TARGET_OS_OSX */
#if CODE_SIGNING_MONITOR
			    (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
#endif
			    !(entry->used_for_jit) &&
			    VM_MAP_POLICY_WX_STRIP_X(map)) {
				DTRACE_VM3(cs_wx,
				    uint64_t, (uint64_t)entry->vme_start,
				    uint64_t, (uint64_t)entry->vme_end,
				    vm_prot_t, entry->protection);
				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
				    proc_selfpid(),
				    (get_bsdtask_info(current_task())
				    ? proc_name_address(get_bsdtask_info(current_task()))
				    : "?"),
				    __FUNCTION__, __LINE__,
#if DEVELOPMENT || DEBUG
				    (uint64_t)entry->vme_start,
				    (uint64_t)entry->vme_end,
#else /* DEVELOPMENT || DEBUG */
				    (uint64_t)0,
				    (uint64_t)0,
#endif /* DEVELOPMENT || DEBUG */
				    entry->protection);
				entry->protection &= ~VM_PROT_EXECUTE;
			}

			if (object_copied) {
				/* real copy was made: entry owns the new pages */
				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
				entry->needs_copy = object_copied_needs_copy;
				entry->is_shared = FALSE;
			} else {
				/* shadow setup: defer the copy to the next write fault */
				assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
				assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
				assert(entry->wired_count == 0);
				VME_OFFSET_SET(entry, copy_offset);
				entry->needs_copy = TRUE;
				if (map != old_map) {
					entry->is_shared = TRUE;
				}
			}
			if (entry->inheritance == VM_INHERIT_SHARE) {
				entry->inheritance = VM_INHERIT_COPY;
			}

			vm_map_lock_write_to_read(map);
		} else {
			/* plain (non-COW) submap traversal: keep going in submap */
			if ((cow_sub_map_parent)
			    && (cow_sub_map_parent != *real_map)
			    && (cow_sub_map_parent != map)) {
				vm_map_unlock(cow_sub_map_parent);
			}
			entry = submap_entry;
			vaddr = local_vaddr;
		}
	}

	/*
	 * Check whether this task is allowed to have
	 * this page.
	 */

	prot = entry->protection;

	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
		/*
		 * HACK -- if not a stack, then allow execution
		 */
		prot |= VM_PROT_EXECUTE;
	}

#if __arm64e__
	/*
	 * If the entry we're dealing with is TPRO and we have a write
	 * fault, inject VM_PROT_WRITE into protections. This allows us
	 * to maintain RO permissions when not marked as TPRO.
	 */
	if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
		prot |= VM_PROT_WRITE;
	}
#endif /* __arm64e__ */
	if (mask_protections) {
		/* caller wants the intersection of fault_type and prot */
		fault_type &= prot;
		if (fault_type == VM_PROT_NONE) {
			goto protection_failure;
		}
	}
	if (((fault_type & prot) != fault_type)
#if __arm64__
	    /* prefetch abort in execute-only page */
	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
#elif defined(__x86_64__)
	    /* Consider the UEXEC bit when handling an EXECUTE fault */
	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
#endif
	    ) {
protection_failure:
		if (*real_map != map) {
			vm_map_unlock(*real_map);
		}
		*real_map = map;

		if ((fault_type & VM_PROT_EXECUTE) && prot) {
			log_stack_execution_failure((addr64_t)vaddr, prot);
		}

		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
		/*
		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
		 *
		 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
		 */
		return KERN_PROTECTION_FAILURE;
	}

	/*
	 * If this page is not pageable, we have to get
	 * it for all possible accesses.
	 */

	*wired = (entry->wired_count != 0);
	if (*wired) {
		fault_type = prot;
	}

	/*
	 * If the entry was copy-on-write, we either ...
	 */

	if (entry->needs_copy) {
		/*
		 * If we want to write the page, we may as well
		 * handle that now since we've got the map locked.
		 *
		 * If we don't need to write the page, we just
		 * demote the permissions allowed.
		 */

		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
			/*
			 * Make a new object, and place it in the
			 * object chain.  Note that no new references
			 * have appeared -- one just moved from the
			 * map to the new object.
			 */

			if (vm_map_lock_read_to_write(map)) {
				vm_map_lock_read(map);
				goto RetryLookup;
			}

			if (VME_OBJECT(entry)->shadowed == FALSE) {
				vm_object_lock(VME_OBJECT(entry));
				VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
				vm_object_unlock(VME_OBJECT(entry));
			}
			VME_OBJECT_SHADOW(entry,
			    (vm_map_size_t) (entry->vme_end -
			    entry->vme_start),
			    vm_map_always_shadow(map));
			entry->needs_copy = FALSE;

			vm_map_lock_write_to_read(map);
		}
		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
			/*
			 * We're attempting to read a copy-on-write
			 * page -- don't allow writes.
			 */

			prot &= (~VM_PROT_WRITE);
		}
	}

	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
		/*
		 * We went through a "needs_copy" submap without triggering
		 * a copy, so granting write access to the page would bypass
		 * that submap's "needs_copy".
		 */
		assert(!(fault_type & VM_PROT_WRITE));
		assert(!*wired);
		assert(!force_copy);
		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
		prot &= ~VM_PROT_WRITE;
	}

	/*
	 * Create an object if necessary.
	 */
	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
		if (vm_map_lock_read_to_write(map)) {
			vm_map_lock_read(map);
			goto RetryLookup;
		}

		VME_OBJECT_SET(entry,
		    vm_object_allocate(
			    (vm_map_size_t)(entry->vme_end -
			    entry->vme_start)), false, 0);
		VME_OFFSET_SET(entry, 0);
		assert(entry->use_pmap);
		vm_map_lock_write_to_read(map);
	}

	/*
	 * Return the object/offset from this entry.  If the entry
	 * was copy-on-write or empty, it has been fixed up.  Also
	 * return the protection.
	 */

	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
	*object = VME_OBJECT(entry);
	*out_prot = prot;
	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);

	if (fault_info) {
		/* ... the caller will change "interruptible" if needed */
		fault_info->user_tag = VME_ALIAS(entry);
		fault_info->pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}
		if (fault_info->behavior == VM_BEHAVIOR_DEFAULT) {
			fault_info->behavior = entry->behavior;
		}
		fault_info->lo_offset = VME_OFFSET(entry);
		fault_info->hi_offset =
		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
		fault_info->no_cache = entry->no_cache;
		fault_info->stealth = FALSE;
		fault_info->io_sync = FALSE;
		if (entry->used_for_jit ||
#if CODE_SIGNING_MONITOR
		    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
#endif
		    entry->vme_resilient_codesign) {
			fault_info->cs_bypass = TRUE;
		} else {
			fault_info->cs_bypass = FALSE;
		}
		fault_info->csm_associated = FALSE;
#if CODE_SIGNING_MONITOR
		if (entry->csm_associated) {
			/*
			 * The pmap layer will validate this page
			 * before allowing it to be executed from.
			 */
			fault_info->csm_associated = TRUE;
		}
#endif
		fault_info->mark_zf_absent = FALSE;
		fault_info->batch_pmap_op = FALSE;
		fault_info->resilient_media = entry->vme_resilient_media;
		fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
#if __arm64e__
		fault_info->fi_used_for_tpro = entry->used_for_tpro;
#else /* __arm64e__ */
		fault_info->fi_used_for_tpro = FALSE;
#endif
		if (entry->translated_allow_execute) {
			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
		}
	}

	/*
	 * Lock the object to prevent it from disappearing
	 */
	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
		if (contended == NULL) {
			vm_object_lock(*object);
		} else {
			*contended = vm_object_lock_check_contended(*object);
		}
	} else {
		vm_object_lock_shared(*object);
	}

	/*
	 * Save the version number
	 */

	out_version->main_timestamp = map->timestamp;

	return KERN_SUCCESS;
}
14529
14530
14531 /*
14532 * vm_map_verify:
14533 *
14534 * Verifies that the map in question has not changed
14535 * since the given version. The map has to be locked
14536 * ("shared" mode is fine) before calling this function
14537 * and it will be returned locked too.
14538 */
14539 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)14540 vm_map_verify(
14541 vm_map_t map,
14542 vm_map_version_t *version) /* REF */
14543 {
14544 boolean_t result;
14545
14546 vm_map_lock_assert_held(map);
14547 result = (map->timestamp == version->main_timestamp);
14548
14549 return result;
14550 }
14551
14552 /*
14553 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14554 * Goes away after regular vm_region_recurse function migrates to
14555 * 64 bits
14556 * vm_region_recurse: A form of vm_region which follows the
14557 * submaps in a target map
14558 *
14559 */
14560
/*
 * vm_map_region_recurse_64:
 *
 * Returns information about the VM region containing (or, if the
 * address is unmapped, following) "*address", descending through
 * nested submaps up to "*nesting_depth" levels. On success, updates
 * *address/*size to describe the region found, *nesting_depth to the
 * depth actually reached, and fills in either the full submap_info
 * or the short flavor, depending on the caller-supplied *count.
 */
kern_return_t
vm_map_region_recurse_64(
	vm_map_t map,
	vm_map_offset_t *address, /* IN/OUT */
	vm_map_size_t *size, /* OUT */
	natural_t *nesting_depth, /* IN/OUT */
	vm_region_submap_info_64_t submap_info, /* IN/OUT */
	mach_msg_type_number_t *count) /* IN/OUT */
{
	mach_msg_type_number_t original_count;
	vm_region_extended_info_data_t extended;
	vm_map_entry_t tmp_entry;
	vm_map_offset_t user_address;
	unsigned int user_max_depth;

	/*
	 * "curr_entry" is the VM map entry preceding or including the
	 * address we're looking for.
	 * "curr_map" is the map or sub-map containing "curr_entry".
	 * "curr_address" is the equivalent of the top map's "user_address"
	 * in the current map.
	 * "curr_offset" is the cumulated offset of "curr_map" in the
	 * target task's address space.
	 * "curr_depth" is the depth of "curr_map" in the chain of
	 * sub-maps.
	 *
	 * "curr_max_below" and "curr_max_above" limit the range (around
	 * "curr_address") we should take into account in the current (sub)map.
	 * They limit the range to what's visible through the map entries
	 * we've traversed from the top map to the current map.
	 *
	 */
	vm_map_entry_t curr_entry;
	vm_map_address_t curr_address;
	vm_map_offset_t curr_offset;
	vm_map_t curr_map;
	unsigned int curr_depth;
	vm_map_offset_t curr_max_below, curr_max_above;
	vm_map_offset_t curr_skip;

	/*
	 * "next_" is the same as "curr_" but for the VM region immediately
	 * after the address we're looking for. We need to keep track of this
	 * too because we want to return info about that region if the
	 * address we're looking for is not mapped.
	 */
	vm_map_entry_t next_entry;
	vm_map_offset_t next_offset;
	vm_map_offset_t next_address;
	vm_map_t next_map;
	unsigned int next_depth;
	vm_map_offset_t next_max_below, next_max_above;
	vm_map_offset_t next_skip;

	boolean_t look_for_pages;
	vm_region_submap_short_info_64_t short_info;
	boolean_t do_region_footprint;
	int effective_page_size, effective_page_shift;
	boolean_t submap_needed_copy;

	if (map == VM_MAP_NULL) {
		/* no address space to work on */
		return KERN_INVALID_ARGUMENT;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);

	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
		/*
		 * "info" structure is not big enough and
		 * would overflow
		 */
		return KERN_INVALID_ARGUMENT;
	}

	do_region_footprint = task_self_region_footprint();
	original_count = *count;

	/*
	 * Pick the "short" or "full" info flavor based on how much room
	 * the caller gave us, and clamp *count to the newest structure
	 * version we can completely fill in.
	 */
	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
		look_for_pages = FALSE;
		short_info = (vm_region_submap_short_info_64_t) submap_info;
		submap_info = NULL;
	} else {
		look_for_pages = TRUE;
		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
		short_info = NULL;

		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
		}
	}

	user_address = *address;
	user_max_depth = *nesting_depth;
	submap_needed_copy = FALSE;

	/* all map locking below is skipped when running in the debugger */
	if (not_in_kdp) {
		vm_map_lock_read(map);
	}

recurse_again:
	/* start (or restart) the search from the top-level map */
	curr_entry = NULL;
	curr_map = map;
	curr_address = user_address;
	curr_offset = 0;
	curr_skip = 0;
	curr_depth = 0;
	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
	curr_max_below = curr_address;

	next_entry = NULL;
	next_map = NULL;
	next_address = 0;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_above = (vm_map_offset_t) -1;
	next_max_below = (vm_map_offset_t) -1;

	for (;;) {
		if (vm_map_lookup_entry(curr_map,
		    curr_address,
		    &tmp_entry)) {
			/* tmp_entry contains the address we're looking for */
			curr_entry = tmp_entry;
		} else {
			vm_map_offset_t skip;
			/*
			 * The address is not mapped. "tmp_entry" is the
			 * map entry preceding the address. We want the next
			 * one, if it exists.
			 */
			curr_entry = tmp_entry->vme_next;

			if (curr_entry == vm_map_to_entry(curr_map) ||
			    (curr_entry->vme_start >=
			    curr_address + curr_max_above)) {
				/* no next entry at this level: stop looking */
				if (not_in_kdp) {
					vm_map_unlock_read(curr_map);
				}
				curr_entry = NULL;
				curr_map = NULL;
				curr_skip = 0;
				curr_offset = 0;
				curr_depth = 0;
				curr_max_above = 0;
				curr_max_below = 0;
				break;
			}

			/* adjust current address and offset */
			skip = curr_entry->vme_start - curr_address;
			curr_address = curr_entry->vme_start;
			curr_skip += skip;
			curr_offset += skip;
			curr_max_above -= skip;
			curr_max_below = 0;
		}

		/*
		 * Is the next entry at this level closer to the address (or
		 * deeper in the submap chain) than the one we had
		 * so far ?
		 */
		tmp_entry = curr_entry->vme_next;
		if (tmp_entry == vm_map_to_entry(curr_map)) {
			/* no next entry at this level */
		} else if (tmp_entry->vme_start >=
		    curr_address + curr_max_above) {
			/*
			 * tmp_entry is beyond the scope of what we mapped of
			 * this submap in the upper level: ignore it.
			 */
		} else if ((next_entry == NULL) ||
		    (tmp_entry->vme_start + curr_offset <=
		    next_entry->vme_start + next_offset)) {
			/*
			 * We didn't have a "next_entry" or this one is
			 * closer to the address we're looking for:
			 * use this "tmp_entry" as the new "next_entry".
			 */
			if (next_entry != NULL) {
				/* unlock the last "next_map" */
				if (next_map != curr_map && not_in_kdp) {
					vm_map_unlock_read(next_map);
				}
			}
			next_entry = tmp_entry;
			next_map = curr_map;
			next_depth = curr_depth;
			next_address = next_entry->vme_start;
			next_skip = curr_skip;
			next_skip += (next_address - curr_address);
			next_offset = curr_offset;
			next_offset += (next_address - curr_address);
			next_max_above = MIN(next_max_above, curr_max_above);
			next_max_above = MIN(next_max_above,
			    next_entry->vme_end - next_address);
			next_max_below = MIN(next_max_below, curr_max_below);
			next_max_below = MIN(next_max_below,
			    next_address - next_entry->vme_start);
		}

		/*
		 * "curr_max_{above,below}" allow us to keep track of the
		 * portion of the submap that is actually mapped at this level:
		 * the rest of that submap is irrelevant to us, since it's not
		 * mapped here.
		 * The relevant portion of the map starts at
		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
		 */
		curr_max_above = MIN(curr_max_above,
		    curr_entry->vme_end - curr_address);
		curr_max_below = MIN(curr_max_below,
		    curr_address - curr_entry->vme_start);

		if (!curr_entry->is_sub_map ||
		    curr_depth >= user_max_depth) {
			/*
			 * We hit a leaf map or we reached the maximum depth
			 * we could, so stop looking. Keep the current map
			 * locked.
			 */
			break;
		}

		/*
		 * Get down to the next submap level.
		 */

		if (curr_entry->needs_copy) {
			/* everything below this is effectively copy-on-write */
			submap_needed_copy = TRUE;
		}

		/*
		 * Lock the next level and unlock the current level,
		 * unless we need to keep it locked to access the "next_entry"
		 * later.
		 */
		if (not_in_kdp) {
			vm_map_lock_read(VME_SUBMAP(curr_entry));
		}
		if (curr_map == next_map) {
			/* keep "next_map" locked in case we need it */
		} else {
			/* release this map */
			if (not_in_kdp) {
				vm_map_unlock_read(curr_map);
			}
		}

		/*
		 * Adjust the offset. "curr_entry" maps the submap
		 * at relative address "curr_entry->vme_start" in the
		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
		 * bytes of the submap.
		 * "curr_offset" always represents the offset of a virtual
		 * address in the curr_map relative to the absolute address
		 * space (i.e. the top-level VM map).
		 */
		curr_offset +=
		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
		curr_address = user_address + curr_offset;
		/* switch to the submap */
		curr_map = VME_SUBMAP(curr_entry);
		curr_depth++;
		curr_entry = NULL;
	}

	// LP64todo: all the current tools are 32bit, obviously never worked for 64b
	// so probably should be a real 32b ID vs. ptr.
	// Current users just check for equality

	if (curr_entry == NULL) {
		/* no VM region contains the address... */

		if (do_region_footprint && /* we want footprint numbers */
		    next_entry == NULL && /* & there are no more regions */
		    /* & we haven't already provided our fake region: */
		    user_address <= vm_map_last_entry(map)->vme_end) {
			ledger_amount_t ledger_resident, ledger_compressed;

			/*
			 * Add a fake memory region to account for
			 * purgeable and/or ledger-tagged memory that
			 * counts towards this task's memory footprint,
			 * i.e. the resident/compressed pages of non-volatile
			 * objects owned by that task.
			 */
			task_ledgers_footprint(map->pmap->ledger,
			    &ledger_resident,
			    &ledger_compressed);
			if (ledger_resident + ledger_compressed == 0) {
				/* no purgeable memory usage to report */
				return KERN_INVALID_ADDRESS;
			}
			/* fake region to show nonvolatile footprint */
			if (look_for_pages) {
				submap_info->protection = VM_PROT_DEFAULT;
				submap_info->max_protection = VM_PROT_DEFAULT;
				submap_info->inheritance = VM_INHERIT_DEFAULT;
				submap_info->offset = 0;
				submap_info->user_tag = -1;
				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
				submap_info->pages_shared_now_private = 0;
				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
				submap_info->pages_dirtied = submap_info->pages_resident;
				submap_info->ref_count = 1;
				submap_info->shadow_depth = 0;
				submap_info->external_pager = 0;
				submap_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					submap_info->share_mode = SM_COW;
				}
				submap_info->is_submap = 0;
				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				submap_info->user_wired_count = 0;
				submap_info->pages_reusable = 0;
			} else {
				short_info->user_tag = -1;
				short_info->offset = 0;
				short_info->protection = VM_PROT_DEFAULT;
				short_info->inheritance = VM_INHERIT_DEFAULT;
				short_info->max_protection = VM_PROT_DEFAULT;
				short_info->behavior = VM_BEHAVIOR_DEFAULT;
				short_info->user_wired_count = 0;
				short_info->is_submap = 0;
				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				short_info->external_pager = 0;
				short_info->shadow_depth = 0;
				short_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					short_info->share_mode = SM_COW;
				}
				short_info->ref_count = 1;
			}
			*nesting_depth = 0;
			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
			// *address = user_address;
			/* report the fake region at the end of the address space */
			*address = vm_map_last_entry(map)->vme_end;
			return KERN_SUCCESS;
		}

		if (next_entry == NULL) {
			/* ... and no VM region follows it either */
			return KERN_INVALID_ADDRESS;
		}
		/* ... gather info about the next VM region */
		curr_entry = next_entry;
		curr_map = next_map; /* still locked ... */
		curr_address = next_address;
		curr_skip = next_skip;
		curr_offset = next_offset;
		curr_depth = next_depth;
		curr_max_above = next_max_above;
		curr_max_below = next_max_below;
	} else {
		/* we won't need "next_entry" after all */
		if (next_entry != NULL) {
			/* release "next_map" */
			if (next_map != curr_map && not_in_kdp) {
				vm_map_unlock_read(next_map);
			}
		}
	}
	next_entry = NULL;
	next_map = NULL;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_below = -1;
	next_max_above = -1;

	if (curr_entry->is_sub_map &&
	    curr_depth < user_max_depth) {
		/*
		 * We're not as deep as we could be: we must have
		 * gone back up after not finding anything mapped
		 * below the original top-level map entry's.
		 * Let's move "curr_address" forward and recurse again.
		 */
		user_address = curr_address;
		goto recurse_again;
	}

	*nesting_depth = curr_depth;
	*size = curr_max_above + curr_max_below;
	*address = user_address + curr_skip - curr_max_below;

	/* fill in the entry-level fields of the requested info flavor */
	if (look_for_pages) {
		submap_info->user_tag = VME_ALIAS(curr_entry);
		submap_info->offset = VME_OFFSET(curr_entry);
		submap_info->protection = curr_entry->protection;
		submap_info->inheritance = curr_entry->inheritance;
		submap_info->max_protection = curr_entry->max_protection;
		submap_info->behavior = curr_entry->behavior;
		submap_info->user_wired_count = curr_entry->user_wired_count;
		submap_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	} else {
		short_info->user_tag = VME_ALIAS(curr_entry);
		short_info->offset = VME_OFFSET(curr_entry);
		short_info->protection = curr_entry->protection;
		short_info->inheritance = curr_entry->inheritance;
		short_info->max_protection = curr_entry->max_protection;
		short_info->behavior = curr_entry->behavior;
		short_info->user_wired_count = curr_entry->user_wired_count;
		short_info->is_submap = curr_entry->is_sub_map;
		if (curr_entry->is_sub_map) {
			short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
		} else {
			short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
		}
	}

	/* page-level counters default to "empty" until the walk below */
	extended.pages_resident = 0;
	extended.pages_swapped_out = 0;
	extended.pages_shared_now_private = 0;
	extended.pages_dirtied = 0;
	extended.pages_reusable = 0;
	extended.external_pager = 0;
	extended.shadow_depth = 0;
	extended.share_mode = SM_EMPTY;
	extended.ref_count = 0;

	if (not_in_kdp) {
		if (!curr_entry->is_sub_map) {
			vm_map_offset_t range_start, range_end;
			/* clamp the walk to the visible portion of the entry */
			range_start = MAX((curr_address - curr_max_below),
			    curr_entry->vme_start);
			range_end = MIN((curr_address + curr_max_above),
			    curr_entry->vme_end);
			vm_map_region_walk(curr_map,
			    range_start,
			    curr_entry,
			    (VME_OFFSET(curr_entry) +
			    (range_start -
			    curr_entry->vme_start)),
			    range_end - range_start,
			    &extended,
			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
			if (submap_needed_copy) {
				extended.share_mode = SM_COW;
			}
		} else {
			if (curr_entry->use_pmap) {
				extended.share_mode = SM_TRUESHARED;
			} else {
				extended.share_mode = SM_PRIVATE;
			}
			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
		}
	}

	if (look_for_pages) {
		submap_info->pages_resident = extended.pages_resident;
		submap_info->pages_swapped_out = extended.pages_swapped_out;
		submap_info->pages_shared_now_private =
		    extended.pages_shared_now_private;
		submap_info->pages_dirtied = extended.pages_dirtied;
		submap_info->external_pager = extended.external_pager;
		submap_info->shadow_depth = extended.shadow_depth;
		submap_info->share_mode = extended.share_mode;
		submap_info->ref_count = extended.ref_count;

		/* fields added in later structure versions, only if requested */
		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			submap_info->pages_reusable = extended.pages_reusable;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			if (curr_entry->is_sub_map) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
			} else if (VME_OBJECT(curr_entry)) {
				submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
			} else {
				submap_info->object_id_full = 0ull;
			}
		}
	} else {
		short_info->external_pager = extended.external_pager;
		short_info->shadow_depth = extended.shadow_depth;
		short_info->share_mode = extended.share_mode;
		short_info->ref_count = extended.ref_count;
	}

	if (not_in_kdp) {
		vm_map_unlock_read(curr_map);
	}

	return KERN_SUCCESS;
}
15063
15064 /*
15065 * vm_region:
15066 *
15067 * User call to obtain information about a region in
15068 * a task's address map. Currently, only one flavor is
15069 * supported.
15070 *
15071 * XXX The reserved and behavior fields cannot be filled
15072 * in until the vm merge from the IK is completed, and
15073 * vm_reserve is implemented.
15074 */
15075
/*
 * vm_map_region:
 *
 * Returns information about the region of "map" containing (or, if
 * unmapped, following) "*address", in the flavor selected by the
 * caller: BASIC (32-bit legacy), BASIC_64, EXTENDED (current or
 * legacy-sized), or TOP. On success, *address/*size are updated to
 * the region's bounds and *count to the number of fields filled in.
 */
kern_return_t
vm_map_region(
	vm_map_t map,
	vm_map_offset_t *address, /* IN/OUT */
	vm_map_size_t *size, /* OUT */
	vm_region_flavor_t flavor, /* IN */
	vm_region_info_t info, /* OUT */
	mach_msg_type_number_t *count, /* IN/OUT */
	mach_port_t *object_name) /* OUT */
{
	vm_map_entry_t tmp_entry;
	vm_map_entry_t entry;
	vm_map_offset_t start;

	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	switch (flavor) {
	case VM_REGION_BASIC_INFO:
		/* legacy for old 32-bit objects info */
	{
		vm_region_basic_info_t basic;

		if (*count < VM_REGION_BASIC_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		basic = (vm_region_basic_info_t) info;
		*count = VM_REGION_BASIC_INFO_COUNT;

		vm_map_lock_read(map);

		start = *address;
		/* if the address is unmapped, report the next entry instead */
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}

		start = entry->vme_start;

		/* NOTE: 32-bit flavor truncates the 64-bit object offset */
		basic->offset = (uint32_t)VME_OFFSET(entry);
		basic->protection = entry->protection;
		basic->inheritance = entry->inheritance;
		basic->max_protection = entry->max_protection;
		basic->behavior = entry->behavior;
		basic->user_wired_count = entry->user_wired_count;
		basic->reserved = entry->is_sub_map;
		*address = start;
		*size = (entry->vme_end - start);

		if (object_name) {
			*object_name = IP_NULL;
		}
		if (entry->is_sub_map) {
			basic->shared = FALSE;
		} else {
			basic->shared = entry->is_shared;
		}

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}

	case VM_REGION_BASIC_INFO_64:
	{
		vm_region_basic_info_64_t basic;

		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
			return KERN_INVALID_ARGUMENT;
		}

		basic = (vm_region_basic_info_64_t) info;
		*count = VM_REGION_BASIC_INFO_COUNT_64;

		vm_map_lock_read(map);

		start = *address;
		/* if the address is unmapped, report the next entry instead */
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}

		start = entry->vme_start;

		basic->offset = VME_OFFSET(entry);
		basic->protection = entry->protection;
		basic->inheritance = entry->inheritance;
		basic->max_protection = entry->max_protection;
		basic->behavior = entry->behavior;
		basic->user_wired_count = entry->user_wired_count;
		basic->reserved = entry->is_sub_map;
		*address = start;
		*size = (entry->vme_end - start);

		if (object_name) {
			*object_name = IP_NULL;
		}
		if (entry->is_sub_map) {
			basic->shared = FALSE;
		} else {
			basic->shared = entry->is_shared;
		}

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}
	case VM_REGION_EXTENDED_INFO:
		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}
		/* deliberate: legacy check below also holds for the new flavor */
		OS_FALLTHROUGH;
	case VM_REGION_EXTENDED_INFO__legacy:
		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
			return KERN_INVALID_ARGUMENT;
		}

		{
			vm_region_extended_info_t extended;
			mach_msg_type_number_t original_count;
			int effective_page_size, effective_page_shift;

			extended = (vm_region_extended_info_t) info;

			effective_page_shift = vm_self_region_page_shift(map);
			effective_page_size = (1 << effective_page_shift);

			vm_map_lock_read(map);

			start = *address;
			/* if the address is unmapped, report the next entry instead */
			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
					vm_map_unlock_read(map);
					return KERN_INVALID_ADDRESS;
				}
			} else {
				entry = tmp_entry;
			}
			start = entry->vme_start;

			extended->protection = entry->protection;
			extended->user_tag = VME_ALIAS(entry);
			extended->pages_resident = 0;
			extended->pages_swapped_out = 0;
			extended->pages_shared_now_private = 0;
			extended->pages_dirtied = 0;
			extended->external_pager = 0;
			extended->shadow_depth = 0;

			original_count = *count;
			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
			} else {
				/* pages_reusable exists only in the non-legacy layout */
				extended->pages_reusable = 0;
				*count = VM_REGION_EXTENDED_INFO_COUNT;
			}

			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);

			if (object_name) {
				*object_name = IP_NULL;
			}
			*address = start;
			*size = (entry->vme_end - start);

			vm_map_unlock_read(map);
			return KERN_SUCCESS;
		}
	case VM_REGION_TOP_INFO:
	{
		vm_region_top_info_t top;

		if (*count < VM_REGION_TOP_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		top = (vm_region_top_info_t) info;
		*count = VM_REGION_TOP_INFO_COUNT;

		vm_map_lock_read(map);

		start = *address;
		/* if the address is unmapped, report the next entry instead */
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}
		start = entry->vme_start;

		top->private_pages_resident = 0;
		top->shared_pages_resident = 0;

		vm_map_region_top_walk(entry, top);

		if (object_name) {
			*object_name = IP_NULL;
		}
		*address = start;
		*size = (entry->vme_end - start);

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}
	default:
		return KERN_INVALID_ARGUMENT;
	}
}
15294
/*
 * OBJ_RESIDENT_COUNT: number of resident pages of "obj" attributable
 * to a mapping of "entry_size" pages. Reusable pages are excluded
 * (when the whole object is flagged all_reusable, only wired pages
 * count), and the result is capped at the size of the mapping.
 * NOTE: macro arguments may be evaluated more than once.
 */
#define OBJ_RESIDENT_COUNT(obj, entry_size) \
	MIN((entry_size), \
	((obj)->all_reusable ? \
	(obj)->wired_page_count : \
	(obj)->resident_page_count - (obj)->reusable_page_count))
15300
/*
 * vm_map_region_top_walk:
 *
 * Fills in the VM_REGION_TOP_INFO data for "entry": share mode,
 * reference count, private/shared resident page counts, and a hashed
 * object id. Walks the entry's object shadow chain, accumulating
 * shared resident pages at each level.
 */
void
vm_map_region_top_walk(
	vm_map_entry_t entry,
	vm_region_top_info_t top)
{
	/* submaps and entries with no object report as empty */
	if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
		top->share_mode = SM_EMPTY;
		top->ref_count = 0;
		top->obj_id = 0;
		return;
	}

	{
		struct vm_object *obj, *tmp_obj;
		int ref_count;
		uint32_t entry_size;

		/* size of the mapping, in pages */
		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);

		obj = VME_OBJECT(entry);

		vm_object_lock(obj);

		/*
		 * Discount the reference held by an in-progress paging
		 * operation so it doesn't look like an extra sharer.
		 */
		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}

		assert(obj->reusable_page_count <= obj->resident_page_count);
		if (obj->shadow) {
			/* shadowed object: copy-on-write */
			if (ref_count == 1) {
				top->private_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			}
			top->ref_count = ref_count;
			top->share_mode = SM_COW;

			/*
			 * Walk down the shadow chain, hand-over-hand
			 * locking, accumulating shared pages and refs.
			 */
			while ((tmp_obj = obj->shadow)) {
				vm_object_lock(tmp_obj);
				vm_object_unlock(obj);
				obj = tmp_obj;

				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
					ref_count--;
				}

				assert(obj->reusable_page_count <= obj->resident_page_count);
				top->shared_pages_resident +=
				    OBJ_RESIDENT_COUNT(obj, entry_size);
				/* don't double-count the shadow reference itself */
				top->ref_count += ref_count - 1;
			}
		} else {
			if (entry->superpage_size) {
				top->share_mode = SM_LARGE_PAGE;
				top->shared_pages_resident = 0;
				top->private_pages_resident = entry_size;
			} else if (entry->needs_copy) {
				top->share_mode = SM_COW;
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				/*
				 * ref_count == 2 with a named object still
				 * counts as private: the extra reference is
				 * the naming reference, not another mapper.
				 */
				if (ref_count == 1 ||
				    (ref_count == 2 && obj->named)) {
					top->share_mode = SM_PRIVATE;
					top->private_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				} else {
					top->share_mode = SM_SHARED;
					top->shared_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				}
			}
			top->ref_count = ref_count;
		}

		vm_object_unlock(obj);

		/* XXX K64: obj_id will be truncated */
		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
	}
}
15386
15387 void
vm_map_region_walk(vm_map_t map,vm_map_offset_t va,vm_map_entry_t entry,vm_object_offset_t offset,vm_object_size_t range,vm_region_extended_info_t extended,boolean_t look_for_pages,mach_msg_type_number_t count)15388 vm_map_region_walk(
15389 vm_map_t map,
15390 vm_map_offset_t va,
15391 vm_map_entry_t entry,
15392 vm_object_offset_t offset,
15393 vm_object_size_t range,
15394 vm_region_extended_info_t extended,
15395 boolean_t look_for_pages,
15396 mach_msg_type_number_t count)
15397 {
15398 struct vm_object *obj, *tmp_obj;
15399 vm_map_offset_t last_offset;
15400 int i;
15401 int ref_count;
15402 struct vm_object *shadow_object;
15403 unsigned short shadow_depth;
15404 boolean_t do_region_footprint;
15405 int effective_page_size, effective_page_shift;
15406 vm_map_offset_t effective_page_mask;
15407
15408 do_region_footprint = task_self_region_footprint();
15409
15410 if ((entry->is_sub_map) ||
15411 (VME_OBJECT(entry) == 0) ||
15412 (VME_OBJECT(entry)->phys_contiguous &&
15413 !entry->superpage_size)) {
15414 extended->share_mode = SM_EMPTY;
15415 extended->ref_count = 0;
15416 return;
15417 }
15418
15419 if (entry->superpage_size) {
15420 extended->shadow_depth = 0;
15421 extended->share_mode = SM_LARGE_PAGE;
15422 extended->ref_count = 1;
15423 extended->external_pager = 0;
15424
15425 /* TODO4K: Superpage in 4k mode? */
15426 extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15427 extended->shadow_depth = 0;
15428 return;
15429 }
15430
15431 effective_page_shift = vm_self_region_page_shift(map);
15432 effective_page_size = (1 << effective_page_shift);
15433 effective_page_mask = effective_page_size - 1;
15434
15435 offset = vm_map_trunc_page(offset, effective_page_mask);
15436
15437 obj = VME_OBJECT(entry);
15438
15439 vm_object_lock(obj);
15440
15441 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15442 ref_count--;
15443 }
15444
15445 if (look_for_pages) {
15446 for (last_offset = offset + range;
15447 offset < last_offset;
15448 offset += effective_page_size, va += effective_page_size) {
15449 if (do_region_footprint) {
15450 int disp;
15451
15452 disp = 0;
15453 if (map->has_corpse_footprint) {
15454 /*
15455 * Query the page info data we saved
15456 * while forking the corpse.
15457 */
15458 vm_map_corpse_footprint_query_page_info(
15459 map,
15460 va,
15461 &disp);
15462 } else {
15463 /*
15464 * Query the pmap.
15465 */
15466 vm_map_footprint_query_page_info(
15467 map,
15468 entry,
15469 va,
15470 &disp);
15471 }
15472 if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15473 extended->pages_resident++;
15474 }
15475 if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15476 extended->pages_reusable++;
15477 }
15478 if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15479 extended->pages_dirtied++;
15480 }
15481 if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15482 extended->pages_swapped_out++;
15483 }
15484 continue;
15485 }
15486
15487 vm_map_region_look_for_page(map, va, obj,
15488 vm_object_trunc_page(offset), ref_count,
15489 0, extended, count);
15490 }
15491
15492 if (do_region_footprint) {
15493 goto collect_object_info;
15494 }
15495 } else {
15496 collect_object_info:
15497 shadow_object = obj->shadow;
15498 shadow_depth = 0;
15499
15500 if (!(obj->internal)) {
15501 extended->external_pager = 1;
15502 }
15503
15504 if (shadow_object != VM_OBJECT_NULL) {
15505 vm_object_lock(shadow_object);
15506 for (;
15507 shadow_object != VM_OBJECT_NULL;
15508 shadow_depth++) {
15509 vm_object_t next_shadow;
15510
15511 if (!(shadow_object->internal)) {
15512 extended->external_pager = 1;
15513 }
15514
15515 next_shadow = shadow_object->shadow;
15516 if (next_shadow) {
15517 vm_object_lock(next_shadow);
15518 }
15519 vm_object_unlock(shadow_object);
15520 shadow_object = next_shadow;
15521 }
15522 }
15523 extended->shadow_depth = shadow_depth;
15524 }
15525
15526 if (extended->shadow_depth || entry->needs_copy) {
15527 extended->share_mode = SM_COW;
15528 } else {
15529 if (ref_count == 1) {
15530 extended->share_mode = SM_PRIVATE;
15531 } else {
15532 if (obj->true_share) {
15533 extended->share_mode = SM_TRUESHARED;
15534 } else {
15535 extended->share_mode = SM_SHARED;
15536 }
15537 }
15538 }
15539 extended->ref_count = ref_count - extended->shadow_depth;
15540
15541 for (i = 0; i < extended->shadow_depth; i++) {
15542 if ((tmp_obj = obj->shadow) == 0) {
15543 break;
15544 }
15545 vm_object_lock(tmp_obj);
15546 vm_object_unlock(obj);
15547
15548 if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
15549 ref_count--;
15550 }
15551
15552 extended->ref_count += ref_count;
15553 obj = tmp_obj;
15554 }
15555 vm_object_unlock(obj);
15556
15557 if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15558 extended->share_mode = SM_PRIVATE;
15559 } else if (extended->share_mode == SM_SHARED && !(task_self_region_info_flags() & VM_REGION_INFO_FLAGS_NO_ALIASED)) {
15560 vm_map_entry_t cur;
15561 vm_map_entry_t last;
15562 int my_refs;
15563
15564 obj = VME_OBJECT(entry);
15565 last = vm_map_to_entry(map);
15566 my_refs = 0;
15567
15568 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15569 ref_count--;
15570 }
15571 for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15572 if (vm_map_region_has_obj_ref(cur, obj)) {
15573 my_refs++;
15574 }
15575 }
15576
15577 if (my_refs == ref_count) {
15578 extended->share_mode = SM_PRIVATE_ALIASED;
15579 } else if (my_refs > 1) {
15580 extended->share_mode = SM_SHARED_ALIASED;
15581 }
15582 }
15583 }
15584
15585
/*
 * vm_map_region_look_for_page:
 *	Look for the page at "offset" in "object" and down its shadow
 *	chain, updating the extended region statistics ("extended") for
 *	the first level at which the page is found: resident/dirty/
 *	reusable counters for a resident page, or "pages_swapped_out"
 *	when the compressor pager holds it.
 *
 *	"object" (the caller's object) is locked on entry and remains
 *	locked on return; shadow objects visited along the way are
 *	locked hand-over-hand and unlocked before returning.
 */
static void
vm_map_region_look_for_page(
	__unused vm_map_t map,
	__unused vm_map_offset_t va,
	vm_object_t object,
	vm_object_offset_t offset,
	int max_refcnt,
	unsigned short depth,
	vm_region_extended_info_t extended,
	mach_msg_type_number_t count)
{
	vm_page_t p;
	vm_object_t shadow;
	int ref_count;
	vm_object_t caller_object;

	shadow = object->shadow;
	caller_object = object;


	while (TRUE) {
		if (!(object->internal)) {
			/* object is backed by an external memory manager */
			extended->external_pager = 1;
		}

		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			/* found a resident page at this shadow level */
			if (shadow && (max_refcnt == 1)) {
				extended->pages_shared_now_private++;
			}

			if (!p->vmp_fictitious &&
			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
				extended->pages_dirtied++;
			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
				/* caller's info struct is big enough for pages_reusable */
				if (p->vmp_reusable || object->all_reusable) {
					extended->pages_reusable++;
				}
			}

			extended->pages_resident++;

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			return;
		}
		if (object->internal &&
		    object->alive &&
		    !object->terminating &&
		    object->pager_ready) {
			if (vm_object_compressor_pager_state_get(object, offset)
			    == VM_EXTERNAL_STATE_EXISTS) {
				/* the pager has that page */
				extended->pages_swapped_out++;
				if (object != caller_object) {
					vm_object_unlock(object);
				}
				return;
			}
		}

		if (shadow) {
			/* not found at this level: descend into the shadow */
			vm_object_lock(shadow);

			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
				/* don't count the pager's transient reference */
				ref_count--;
			}

			if (++depth > extended->shadow_depth) {
				extended->shadow_depth = depth;
			}

			if (ref_count > max_refcnt) {
				max_refcnt = ref_count;
			}

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			/* translate "offset" into the shadow object's space */
			offset = offset + object->vo_shadow_offset;
			object = shadow;
			shadow = object->shadow;
			continue;
		}
		/* bottom of the shadow chain: page is not present anywhere */
		if (object != caller_object) {
			vm_object_unlock(object);
		}
		break;
	}
}
15681
15682 static inline boolean_t
vm_map_region_has_obj_ref(vm_map_entry_t entry,vm_object_t object)15683 vm_map_region_has_obj_ref(
15684 vm_map_entry_t entry,
15685 vm_object_t object)
15686 {
15687 vm_object_t cur_obj;
15688 vm_object_t shadow_obj;
15689
15690 if (entry->is_sub_map) {
15691 return FALSE;
15692 }
15693
15694 cur_obj = VME_OBJECT(entry);
15695 if (cur_obj == VM_OBJECT_NULL) {
15696 return FALSE;
15697 } else if (cur_obj == object) {
15698 return TRUE;
15699 }
15700
15701 /*
15702 * Avoid locks for first shadow check, otherwise diagnostic tools will
15703 * spend most of their time obtaining locks in this function when analyzing
15704 * processes with many VM entries which may commonly have no shadow chain.
15705 *
15706 * This is acceptable because:
15707 * - Shadow's fields are not accessed outside of its lock
15708 * - Objects are unlikely to be modified due to:
15709 * - Many diagnostic tools suspend the task
15710 * - VM map is locked
15711 * - The rare incorrect return from this function turns a guess into a
15712 * slightly worse guess
15713 * - Entire shadow chain is not locked as a whole, so can still change
15714 * while traversing, resulting in incorrect guess even with locking
15715 */
15716 shadow_obj = cur_obj->shadow;
15717 if (shadow_obj == VM_OBJECT_NULL) {
15718 return FALSE;
15719 } else if (shadow_obj == object) {
15720 return TRUE;
15721 }
15722
15723 vm_object_lock(cur_obj);
15724
15725 while ((shadow_obj = cur_obj->shadow)) {
15726 /* check if object was found before grabbing a lock */
15727 if (shadow_obj == object) {
15728 vm_object_unlock(cur_obj);
15729 return TRUE;
15730 }
15731
15732 vm_object_lock(shadow_obj);
15733 vm_object_unlock(cur_obj);
15734 cur_obj = shadow_obj;
15735 }
15736
15737 /* exhausted the shadow chain */
15738 vm_object_unlock(cur_obj);
15739 return FALSE;
15740 }
15741
15742
15743 /*
15744 * Routine: vm_map_simplify
15745 *
15746 * Description:
15747 * Attempt to simplify the map representation in
15748 * the vicinity of the given starting address.
15749 * Note:
15750 * This routine is intended primarily to keep the
15751 * kernel maps more compact -- they generally don't
15752 * benefit from the "expand a map entry" technology
15753 * at allocation time because the adjacent entry
15754 * is often wired down.
15755 */
/*
 * vm_map_simplify_entry:
 *	Attempt to coalesce "this_entry" with the entry immediately
 *	preceding it in "map".  The merge only happens when the two
 *	entries are virtually contiguous, map adjacent offsets of the
 *	same object (or submap), and agree on every attribute that
 *	faulting, wiring and accounting depend on.  On success the
 *	predecessor is unlinked and freed and "this_entry" is extended
 *	downward to cover both ranges.
 *
 *	The map must be locked for writing by the caller.
 */
void
vm_map_simplify_entry(
	vm_map_t map,
	vm_map_entry_t this_entry)
{
	vm_map_entry_t prev_entry;

	prev_entry = this_entry->vme_prev;

	if ((this_entry != vm_map_to_entry(map)) &&
	    (prev_entry != vm_map_to_entry(map)) &&

	    /* ranges must be contiguous in the address space */
	    (prev_entry->vme_end == this_entry->vme_start) &&

	    /* same object/submap, mapped at contiguous offsets */
	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
	    (prev_entry->vme_object_value == this_entry->vme_object_value) &&
	    (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
	    prev_entry->vme_start))
	    == VME_OFFSET(this_entry)) &&

	    /* every fault-relevant attribute must match */
	    (prev_entry->behavior == this_entry->behavior) &&
	    (prev_entry->needs_copy == this_entry->needs_copy) &&
	    (prev_entry->protection == this_entry->protection) &&
	    (prev_entry->max_protection == this_entry->max_protection) &&
	    (prev_entry->inheritance == this_entry->inheritance) &&
	    (prev_entry->use_pmap == this_entry->use_pmap) &&
	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
	    (prev_entry->no_cache == this_entry->no_cache) &&
	    (prev_entry->vme_permanent == this_entry->vme_permanent) &&
	    (prev_entry->map_aligned == this_entry->map_aligned) &&
	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
#if __arm64e__
	    (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
#endif
	    (prev_entry->csm_associated == this_entry->csm_associated) &&
	    (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
	    (prev_entry->vme_resilient_codesign ==
	    this_entry->vme_resilient_codesign) &&
	    (prev_entry->vme_resilient_media ==
	    this_entry->vme_resilient_media) &&
	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
	    (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&

	    /* wiring state must match so wired accounting stays correct */
	    (prev_entry->wired_count == this_entry->wired_count) &&
	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&

	    /* neither entry may be busy, atomic, or awaiting a wakeup */
	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
	    (prev_entry->in_transition == FALSE) &&
	    (this_entry->in_transition == FALSE) &&
	    (prev_entry->needs_wakeup == FALSE) &&
	    (this_entry->needs_wakeup == FALSE) &&
	    (prev_entry->is_shared == this_entry->is_shared) &&
	    (prev_entry->superpage_size == FALSE) &&
	    (this_entry->superpage_size == FALSE)
	    ) {
		if (prev_entry->vme_permanent) {
			assert(this_entry->vme_permanent);
			/* "permanent" ownership moves to the surviving entry */
			prev_entry->vme_permanent = false;
		}
		vm_map_store_entry_unlink(map, prev_entry, true);
		assert(prev_entry->vme_start < this_entry->vme_end);
		if (prev_entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
			    VM_MAP_PAGE_MASK(map)));
		}
		/* extend "this_entry" downward over the coalesced range */
		this_entry->vme_start = prev_entry->vme_start;
		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));

		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, this_entry, TRUE);
		}

		/* release the object/submap reference "prev_entry" held */
		if (prev_entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(prev_entry));
		} else {
			vm_object_deallocate(VME_OBJECT(prev_entry));
		}
		vm_map_entry_dispose(prev_entry);
		SAVE_HINT_MAP_WRITE(map, this_entry);
	}
}
15840
15841 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)15842 vm_map_simplify(
15843 vm_map_t map,
15844 vm_map_offset_t start)
15845 {
15846 vm_map_entry_t this_entry;
15847
15848 vm_map_lock(map);
15849 if (vm_map_lookup_entry(map, start, &this_entry)) {
15850 vm_map_simplify_entry(map, this_entry);
15851 vm_map_simplify_entry(map, this_entry->vme_next);
15852 }
15853 vm_map_unlock(map);
15854 }
15855
15856 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15857 vm_map_simplify_range(
15858 vm_map_t map,
15859 vm_map_offset_t start,
15860 vm_map_offset_t end)
15861 {
15862 vm_map_entry_t entry;
15863
15864 /*
15865 * The map should be locked (for "write") by the caller.
15866 */
15867
15868 if (start >= end) {
15869 /* invalid address range */
15870 return;
15871 }
15872
15873 start = vm_map_trunc_page(start,
15874 VM_MAP_PAGE_MASK(map));
15875 end = vm_map_round_page(end,
15876 VM_MAP_PAGE_MASK(map));
15877
15878 if (!vm_map_lookup_entry(map, start, &entry)) {
15879 /* "start" is not mapped and "entry" ends before "start" */
15880 if (entry == vm_map_to_entry(map)) {
15881 /* start with first entry in the map */
15882 entry = vm_map_first_entry(map);
15883 } else {
15884 /* start with next entry */
15885 entry = entry->vme_next;
15886 }
15887 }
15888
15889 while (entry != vm_map_to_entry(map) &&
15890 entry->vme_start <= end) {
15891 /* try and coalesce "entry" with its previous entry */
15892 vm_map_simplify_entry(map, entry);
15893 entry = entry->vme_next;
15894 }
15895 }
15896
15897
15898 /*
15899 * Routine: vm_map_machine_attribute
15900 * Purpose:
15901 * Provide machine-specific attributes to mappings,
15902 * such as cachability etc. for machines that provide
15903 * them. NUMA architectures and machines with big/strange
15904 * caches will use this.
15905 * Note:
15906 * Responsibilities for locking and checking are handled here,
15907 * everything else in the pmap module. If any non-volatile
15908 * information must be kept, the pmap module should handle
15909 * it itself. [This assumes that attributes do not
15910 * need to be inherited, which seems ok to me]
15911 */
/*
 * vm_map_machine_attribute:
 *	Apply/query a machine attribute for [start, end) in "map".
 *	Non-cache attributes are handed straight to the pmap layer.
 *	MATTR_CACHE requires physical pages, so the map is traversed
 *	entry by entry and each resident page is synced individually
 *	via pmap_attribute_cache_sync(); submaps are handled by
 *	recursion.
 *
 *	Returns KERN_INVALID_ADDRESS for out-of-range or overflowing
 *	ranges, KERN_FAILURE if the range contains a hole, otherwise
 *	the pmap layer's result.
 */
kern_return_t
vm_map_machine_attribute(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_machine_attribute_t attribute,
	vm_machine_attribute_val_t* value)              /* IN/OUT */
{
	kern_return_t ret;
	vm_map_size_t sync_size;
	vm_map_entry_t entry;

	if (start < vm_map_min(map) || end > vm_map_max(map)) {
		return KERN_INVALID_ADDRESS;
	}
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	/* Figure how much memory we need to flush (in page increments) */
	sync_size = end - start;

	vm_map_lock(map);

	if (attribute != MATTR_CACHE) {
		/* If we don't have to find physical addresses, we */
		/* don't have to do an explicit traversal here. */
		ret = pmap_attribute(map->pmap, start, end - start,
		    attribute, value);
		vm_map_unlock(map);
		return ret;
	}

	ret = KERN_SUCCESS;             /* Assume it all worked */

	while (sync_size) {
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_map_size_t sub_size;
			/* clip this pass to the end of the entry or the range */
			if ((entry->vme_end - start) > sync_size) {
				sub_size = sync_size;
				sync_size = 0;
			} else {
				sub_size = entry->vme_end - start;
				sync_size -= sub_size;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;

				/* recurse into the submap's address space */
				sub_start = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				sub_end = sub_start + sub_size;
				/*
				 * NOTE(review): the recursive call's return
				 * value is ignored here — confirm that's
				 * intentional for submap failures.
				 */
				vm_map_machine_attribute(
					VME_SUBMAP(entry),
					sub_start,
					sub_end,
					attribute, value);
			} else if (VME_OBJECT(entry)) {
				vm_page_t m;
				vm_object_t object;
				vm_object_t base_object;
				vm_object_t last_object;
				vm_object_offset_t offset;
				vm_object_offset_t base_offset;
				vm_map_size_t range;
				range = sub_size;
				offset = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				offset = vm_object_trunc_page(offset);
				base_offset = offset;
				object = VME_OBJECT(entry);
				base_object = object;
				last_object = NULL;

				vm_object_lock(object);

				while (range) {
					m = vm_page_lookup(
						object, offset);

					if (m && !m->vmp_fictitious) {
						/* resident page: sync its cache state */
						ret =
						    pmap_attribute_cache_sync(
							VM_PAGE_GET_PHYS_PAGE(m),
							PAGE_SIZE,
							attribute, value);
					} else if (object->shadow) {
						/*
						 * Not resident here: retry the
						 * same page one shadow level
						 * down (hand-over-hand locking).
						 */
						offset = offset + object->vo_shadow_offset;
						last_object = object;
						object = object->shadow;
						vm_object_lock(last_object->shadow);
						vm_object_unlock(last_object);
						continue;
					}
					if (range < PAGE_SIZE) {
						range = 0;
					} else {
						range -= PAGE_SIZE;
					}

					/* climb back to the top-level object */
					if (base_object != object) {
						vm_object_unlock(object);
						vm_object_lock(base_object);
						object = base_object;
					}
					/* Bump to the next page */
					base_offset += PAGE_SIZE;
					offset = base_offset;
				}
				vm_object_unlock(object);
			}
			start += sub_size;
		} else {
			/* hole in the range: give up */
			vm_map_unlock(map);
			return KERN_FAILURE;
		}
	}

	vm_map_unlock(map);

	return ret;
}
16034
16035 /*
16036 * vm_map_behavior_set:
16037 *
16038 * Sets the paging reference behavior of the specified address
16039 * range in the target map. Paging reference behavior affects
16040 * how pagein operations resulting from faults on the map will be
16041 * clustered.
16042 */
16043 kern_return_t
vm_map_behavior_set(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_behavior_t new_behavior)16044 vm_map_behavior_set(
16045 vm_map_t map,
16046 vm_map_offset_t start,
16047 vm_map_offset_t end,
16048 vm_behavior_t new_behavior)
16049 {
16050 vm_map_entry_t entry;
16051 vm_map_entry_t temp_entry;
16052
16053 if (start > end ||
16054 start < vm_map_min(map) ||
16055 end > vm_map_max(map)) {
16056 return KERN_NO_SPACE;
16057 }
16058 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16059 return KERN_INVALID_ADDRESS;
16060 }
16061
16062 switch (new_behavior) {
16063 /*
16064 * This first block of behaviors all set a persistent state on the specified
16065 * memory range. All we have to do here is to record the desired behavior
16066 * in the vm_map_entry_t's.
16067 */
16068
16069 case VM_BEHAVIOR_DEFAULT:
16070 case VM_BEHAVIOR_RANDOM:
16071 case VM_BEHAVIOR_SEQUENTIAL:
16072 case VM_BEHAVIOR_RSEQNTL:
16073 case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16074 vm_map_lock(map);
16075
16076 /*
16077 * The entire address range must be valid for the map.
16078 * Note that vm_map_range_check() does a
16079 * vm_map_lookup_entry() internally and returns the
16080 * entry containing the start of the address range if
16081 * the entire range is valid.
16082 */
16083 if (vm_map_range_check(map, start, end, &temp_entry)) {
16084 entry = temp_entry;
16085 vm_map_clip_start(map, entry, start);
16086 } else {
16087 vm_map_unlock(map);
16088 return KERN_INVALID_ADDRESS;
16089 }
16090
16091 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16092 vm_map_clip_end(map, entry, end);
16093 if (entry->is_sub_map) {
16094 assert(!entry->use_pmap);
16095 }
16096
16097 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16098 entry->zero_wired_pages = TRUE;
16099 } else {
16100 entry->behavior = new_behavior;
16101 }
16102 entry = entry->vme_next;
16103 }
16104
16105 vm_map_unlock(map);
16106 break;
16107
16108 /*
16109 * The rest of these are different from the above in that they cause
16110 * an immediate action to take place as opposed to setting a behavior that
16111 * affects future actions.
16112 */
16113
16114 case VM_BEHAVIOR_WILLNEED:
16115 return vm_map_willneed(map, start, end);
16116
16117 case VM_BEHAVIOR_DONTNEED:
16118 return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16119
16120 case VM_BEHAVIOR_FREE:
16121 return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16122
16123 case VM_BEHAVIOR_REUSABLE:
16124 return vm_map_reusable_pages(map, start, end);
16125
16126 case VM_BEHAVIOR_REUSE:
16127 return vm_map_reuse_pages(map, start, end);
16128
16129 case VM_BEHAVIOR_CAN_REUSE:
16130 return vm_map_can_reuse(map, start, end);
16131
16132 #if MACH_ASSERT
16133 case VM_BEHAVIOR_PAGEOUT:
16134 return vm_map_pageout(map, start, end);
16135 #endif /* MACH_ASSERT */
16136
16137 case VM_BEHAVIOR_ZERO:
16138 return vm_map_zero(map, start, end);
16139
16140 default:
16141 return KERN_INVALID_ARGUMENT;
16142 }
16143
16144 return KERN_SUCCESS;
16145 }
16146
16147
/*
 * Internals for the madvise(MADV_WILLNEED) system call.
 *
 * The implementation either:
 * a) issues read-ahead if the mapping corresponds to a mapped regular file, or
 * b) faults in the pages (zero-fill, decompress, etc.) if it is an anonymous mapping.
 */
16155
16156
/*
 * vm_map_willneed:
 *	Implement MADV_WILLNEED for [start, end) in "map".  File-backed
 *	entries get an asynchronous read-ahead request sent to their
 *	pager; anonymous entries are pre-faulted page by page.  The map
 *	is only held for read, and the lock is dropped around the pager
 *	request / pre-fault work, so the layout is re-validated between
 *	entries.
 *
 *	Returns KERN_INVALID_ADDRESS if the range is (or becomes) not
 *	fully mapped; I/O failures are deliberately swallowed since
 *	madvise is advisory.
 */
static kern_return_t
vm_map_willneed(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end
	)
{
	vm_map_entry_t entry;
	vm_object_t object;
	memory_object_t pager;
	struct vm_object_fault_info fault_info = {};
	kern_return_t kr;
	vm_object_size_t len;
	vm_object_offset_t offset;

	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_START,
	    task_pid(current_task()), start, end);
	fault_info.interruptible = THREAD_UNINT;        /* ignored value */
	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.stealth = TRUE;

	/*
	 * The MADV_WILLNEED operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
		    task_pid(current_task()), start, KERN_INVALID_ADDRESS);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && start < end;) {
		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.  After that, the offset will always be zero to
		 * correspond to the beginning of the current vm_map_entry.
		 */
		offset = (start - entry->vme_start) + VME_OFFSET(entry);

		/*
		 * Set the length so we don't go beyond the end of the
		 * map_entry or beyond the end of the range we were given.
		 * This range could span also multiple map entries all of which
		 * map different files, so make sure we only do the right amount
		 * of I/O for each object.  Note that it's possible for there
		 * to be multiple map entries all referring to the same object
		 * but with different page permissions, but it's not worth
		 * trying to optimize that case.
		 */
		len = MIN(entry->vme_end - start, end - start);

		if ((vm_size_t) len != len) {
			/* 32-bit overflow */
			len = (vm_size_t) (0 - PAGE_SIZE);
		}
		fault_info.cluster_size = (vm_size_t) len;
		fault_info.lo_offset = offset;
		fault_info.hi_offset = offset + len;
		fault_info.user_tag = VME_ALIAS(entry);
		fault_info.pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}
		fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;

		/*
		 * If the entry is a submap OR there's no read permission
		 * to this mapping, then just skip it.
		 */
		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
			entry = entry->vme_next;
			start = entry->vme_start;
			continue;
		}

		object = VME_OBJECT(entry);

		if (object == NULL ||
		    (object && object->internal)) {
			/*
			 * Memory range backed by anonymous memory.
			 * Pre-fault each page (zero-fill or decompress).
			 */
			vm_size_t region_size = 0, effective_page_size = 0;
			vm_map_offset_t addr = 0, effective_page_mask = 0;

			region_size = len;
			addr = start;

			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
			effective_page_size = effective_page_mask + 1;

			/* pre-faulting can fault, so drop the map lock first */
			vm_map_unlock_read(map);

			while (region_size) {
				vm_pre_fault(
					vm_map_trunc_page(addr, effective_page_mask),
					VM_PROT_READ | VM_PROT_WRITE);

				region_size -= effective_page_size;
				addr += effective_page_size;
			}
		} else {
			/*
			 * Find the file object backing this map entry.  If there is
			 * none, then we simply ignore the "will need" advice for this
			 * entry and go on to the next one.
			 */
			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
				entry = entry->vme_next;
				start = entry->vme_start;
				continue;
			}

			/*
			 * NOTE(review): find_vnode_object() appears to return
			 * the object locked — the unlock below relies on it;
			 * confirm against its definition.
			 */
			vm_object_paging_begin(object);
			pager = object->pager;
			vm_object_unlock(object);

			/*
			 * The data_request() could take a long time, so let's
			 * release the map lock to avoid blocking other threads.
			 */
			vm_map_unlock_read(map);

			/*
			 * Get the data from the object asynchronously.
			 *
			 * Note that memory_object_data_request() places limits on the
			 * amount of I/O it will do.  Regardless of the len we
			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
			 * silently truncates the len to that size.  This isn't
			 * necessarily bad since madvise shouldn't really be used to
			 * page in unlimited amounts of data.  Other Unix variants
			 * limit the willneed case as well.  If this turns out to be an
			 * issue for developers, then we can always adjust the policy
			 * here and still be backwards compatible since this is all
			 * just "advice".
			 */
			kr = memory_object_data_request(
				pager,
				vm_object_trunc_page(offset) + object->paging_offset,
				0,      /* ignored */
				VM_PROT_READ,
				(memory_object_fault_info_t)&fault_info);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/*
			 * If we couldn't do the I/O for some reason, just give up on
			 * the madvise.  We still return success to the user since
			 * madvise isn't supposed to fail when the advice can't be
			 * taken.
			 */

			if (kr != KERN_SUCCESS) {
				KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
				    task_pid(current_task()), start, kr);
				return KERN_SUCCESS;
			}
		}

		start += len;
		if (start >= end) {
			/* done */
			KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
			    task_pid(current_task()), start, KERN_SUCCESS);
			return KERN_SUCCESS;
		}

		/* look up next entry; the map changed while unlocked */
		vm_map_lock_read(map);
		if (!vm_map_lookup_entry(map, start, &entry)) {
			/*
			 * There's a new hole in the address range.
			 */
			vm_map_unlock_read(map);
			KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
			    task_pid(current_task()), start, KERN_INVALID_ADDRESS);
			return KERN_INVALID_ADDRESS;
		}
	}

	vm_map_unlock_read(map);
	KDBG(VMDBG_CODE(DBG_VM_MAP_WILLNEED) | DBG_FUNC_END,
	    task_pid(current_task()), start, KERN_SUCCESS);
	return KERN_SUCCESS;
}
16360
16361 static boolean_t
vm_map_entry_is_reusable(vm_map_entry_t entry)16362 vm_map_entry_is_reusable(
16363 vm_map_entry_t entry)
16364 {
16365 /* Only user map entries */
16366
16367 vm_object_t object;
16368
16369 if (entry->is_sub_map) {
16370 return FALSE;
16371 }
16372
16373 switch (VME_ALIAS(entry)) {
16374 case VM_MEMORY_MALLOC:
16375 case VM_MEMORY_MALLOC_SMALL:
16376 case VM_MEMORY_MALLOC_LARGE:
16377 case VM_MEMORY_REALLOC:
16378 case VM_MEMORY_MALLOC_TINY:
16379 case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16380 case VM_MEMORY_MALLOC_LARGE_REUSED:
16381 /*
16382 * This is a malloc() memory region: check if it's still
16383 * in its original state and can be re-used for more
16384 * malloc() allocations.
16385 */
16386 break;
16387 default:
16388 /*
16389 * Not a malloc() memory region: let the caller decide if
16390 * it's re-usable.
16391 */
16392 return TRUE;
16393 }
16394
16395 if (/*entry->is_shared ||*/
16396 entry->is_sub_map ||
16397 entry->in_transition ||
16398 entry->protection != VM_PROT_DEFAULT ||
16399 entry->max_protection != VM_PROT_ALL ||
16400 entry->inheritance != VM_INHERIT_DEFAULT ||
16401 entry->no_cache ||
16402 entry->vme_permanent ||
16403 entry->superpage_size != FALSE ||
16404 entry->zero_wired_pages ||
16405 entry->wired_count != 0 ||
16406 entry->user_wired_count != 0) {
16407 return FALSE;
16408 }
16409
16410 object = VME_OBJECT(entry);
16411 if (object == VM_OBJECT_NULL) {
16412 return TRUE;
16413 }
16414 if (
16415 #if 0
16416 /*
16417 * Let's proceed even if the VM object is potentially
16418 * shared.
16419 * We check for this later when processing the actual
16420 * VM pages, so the contents will be safe if shared.
16421 *
16422 * But we can still mark this memory region as "reusable" to
16423 * acknowledge that the caller did let us know that the memory
16424 * could be re-used and should not be penalized for holding
16425 * on to it. This allows its "resident size" to not include
16426 * the reusable range.
16427 */
16428 object->ref_count == 1 &&
16429 #endif
16430 object->vo_copy == VM_OBJECT_NULL &&
16431 object->shadow == VM_OBJECT_NULL &&
16432 object->internal &&
16433 object->purgable == VM_PURGABLE_DENY &&
16434 object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16435 !object->code_signed) {
16436 return TRUE;
16437 }
16438 return FALSE;
16439 }
16440
16441 static kern_return_t
vm_map_reuse_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16442 vm_map_reuse_pages(
16443 vm_map_t map,
16444 vm_map_offset_t start,
16445 vm_map_offset_t end)
16446 {
16447 vm_map_entry_t entry;
16448 vm_object_t object;
16449 vm_object_offset_t start_offset, end_offset;
16450
16451 /*
16452 * The MADV_REUSE operation doesn't require any changes to the
16453 * vm_map_entry_t's, so the read lock is sufficient.
16454 */
16455
16456 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16457 /*
16458 * XXX TODO4K
16459 * need to figure out what reusable means for a
16460 * portion of a native page.
16461 */
16462 return KERN_SUCCESS;
16463 }
16464
16465 vm_map_lock_read(map);
16466 assert(map->pmap != kernel_pmap); /* protect alias access */
16467
16468 /*
16469 * The madvise semantics require that the address range be fully
16470 * allocated with no holes. Otherwise, we're required to return
16471 * an error.
16472 */
16473
16474 if (!vm_map_range_check(map, start, end, &entry)) {
16475 vm_map_unlock_read(map);
16476 vm_page_stats_reusable.reuse_pages_failure++;
16477 return KERN_INVALID_ADDRESS;
16478 }
16479
16480 /*
16481 * Examine each vm_map_entry_t in the range.
16482 */
16483 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16484 entry = entry->vme_next) {
16485 /*
16486 * Sanity check on the VM map entry.
16487 */
16488 if (!vm_map_entry_is_reusable(entry)) {
16489 vm_map_unlock_read(map);
16490 vm_page_stats_reusable.reuse_pages_failure++;
16491 return KERN_INVALID_ADDRESS;
16492 }
16493
16494 /*
16495 * The first time through, the start address could be anywhere
16496 * within the vm_map_entry we found. So adjust the offset to
16497 * correspond.
16498 */
16499 if (entry->vme_start < start) {
16500 start_offset = start - entry->vme_start;
16501 } else {
16502 start_offset = 0;
16503 }
16504 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16505 start_offset += VME_OFFSET(entry);
16506 end_offset += VME_OFFSET(entry);
16507
16508 object = VME_OBJECT(entry);
16509 if (object != VM_OBJECT_NULL) {
16510 vm_object_lock(object);
16511 vm_object_reuse_pages(object, start_offset, end_offset,
16512 TRUE);
16513 vm_object_unlock(object);
16514 }
16515
16516 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16517 /*
16518 * XXX
16519 * We do not hold the VM map exclusively here.
16520 * The "alias" field is not that critical, so it's
16521 * safe to update it here, as long as it is the only
16522 * one that can be modified while holding the VM map
16523 * "shared".
16524 */
16525 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16526 }
16527 }
16528
16529 vm_map_unlock_read(map);
16530 vm_page_stats_reusable.reuse_pages_success++;
16531 return KERN_SUCCESS;
16532 }
16533
16534
/*
 * Routine:	vm_map_reusable_pages
 *
 * Purpose:
 *	Mark the resident pages backing the range [start, end) of "map" as
 *	"reusable": each qualifying entry's pages are handed to
 *	vm_object_deactivate_pages() with the "reusable" flag set so they
 *	can be reclaimed cheaply.  Entries tagged VM_MEMORY_MALLOC_LARGE
 *	or ..._REUSED are re-aliased to VM_MEMORY_MALLOC_LARGE_REUSABLE.
 *
 * Returns:
 *	KERN_SUCCESS		range processed (also returned, as a no-op,
 *				when the map's page size is smaller than the
 *				native page size -- see TODO4K below)
 *	KERN_INVALID_ADDRESS	range has holes or contains an entry that
 *				is not "reusable"
 *	KERN_PROTECTION_FAILURE	range contains a non-writable entry whose
 *				contents therefore can't be discarded
 */
static kern_return_t
vm_map_reusable_pages(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_object_t object;
	vm_object_offset_t start_offset, end_offset;
	vm_map_offset_t pmap_offset;

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a portion
		 * of a native page.
		 */
		return KERN_SUCCESS;
	}

	/*
	 * The MADV_REUSABLE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap); /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reusable_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/* kill_pages: 1 = discard contents, -1 = don't touch pages */
		int kill_pages = 0;
		boolean_t reusable_no_write = FALSE;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
#if __arm64e__
		    && !entry->used_for_tpro
#endif
		    ) {
			/* not writable: can't discard contents */
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_nonwritable++;
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
			pmap_offset = start;
		} else {
			start_offset = 0;
			pmap_offset = entry->vme_start;
		}
		/* clip the end of the range to this entry */
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		/* translate map offsets into object offsets */
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing resident to make reusable */
			continue;
		}

		if (entry->protection & VM_PROT_EXECUTE) {
			/*
			 * Executable mappings might be write-protected by
			 * hardware, so do not attempt to write to these pages.
			 */
			reusable_no_write = TRUE;
		}

		if (entry->vme_xnu_user_debug) {
			/*
			 * User debug pages might be write-protected by hardware,
			 * so do not attempt to write to these pages.
			 */
			reusable_no_write = TRUE;
		}

		/*
		 * Only discard pages of objects whose contents can't be
		 * observed through another mapping or a copy object:
		 * either we hold the only reference, or the object is
		 * shareable (non-symmetric copy strategy) with no pending
		 * copy, and it has no shadow.
		 */
		vm_object_lock(object);
		if (((object->ref_count == 1) ||
		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
		    object->vo_copy == VM_OBJECT_NULL)) &&
		    object->shadow == VM_OBJECT_NULL &&
		    /*
		     * "iokit_acct" entries are billed for their virtual size
		     * (rather than for their resident pages only), so they
		     * wouldn't benefit from making pages reusable, and it
		     * would be hard to keep track of pages that are both
		     * "iokit_acct" and "reusable" in the pmap stats and
		     * ledgers.
		     */
		    !(entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap))) {
			if (object->ref_count != 1) {
				vm_page_stats_reusable.reusable_shared++;
			}
			kill_pages = 1;
		} else {
			kill_pages = -1;
		}
		if (kill_pages != -1) {
			vm_object_deactivate_pages(object,
			    start_offset,
			    end_offset - start_offset,
			    kill_pages,
			    TRUE /*reusable_pages*/,
			    reusable_no_write,
			    map->pmap,
			    pmap_offset);
		} else {
			/* shared object: count the skip instead of discarding */
			vm_page_stats_reusable.reusable_pages_shared++;
			DTRACE_VM4(vm_map_reusable_pages_shared,
			    unsigned int, VME_ALIAS(entry),
			    vm_map_t, map,
			    vm_map_entry_t, entry,
			    vm_object_t, object);
		}
		vm_object_unlock(object);

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reusable_pages_success++;
	return KERN_SUCCESS;
}
16700
16701
16702 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16703 vm_map_can_reuse(
16704 vm_map_t map,
16705 vm_map_offset_t start,
16706 vm_map_offset_t end)
16707 {
16708 vm_map_entry_t entry;
16709
16710 /*
16711 * The MADV_REUSABLE operation doesn't require any changes to the
16712 * vm_map_entry_t's, so the read lock is sufficient.
16713 */
16714
16715 vm_map_lock_read(map);
16716 assert(map->pmap != kernel_pmap); /* protect alias access */
16717
16718 /*
16719 * The madvise semantics require that the address range be fully
16720 * allocated with no holes. Otherwise, we're required to return
16721 * an error.
16722 */
16723
16724 if (!vm_map_range_check(map, start, end, &entry)) {
16725 vm_map_unlock_read(map);
16726 vm_page_stats_reusable.can_reuse_failure++;
16727 return KERN_INVALID_ADDRESS;
16728 }
16729
16730 /*
16731 * Examine each vm_map_entry_t in the range.
16732 */
16733 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16734 entry = entry->vme_next) {
16735 /*
16736 * Sanity check on the VM map entry.
16737 */
16738 if (!vm_map_entry_is_reusable(entry)) {
16739 vm_map_unlock_read(map);
16740 vm_page_stats_reusable.can_reuse_failure++;
16741 return KERN_INVALID_ADDRESS;
16742 }
16743 }
16744
16745 vm_map_unlock_read(map);
16746 vm_page_stats_reusable.can_reuse_success++;
16747 return KERN_SUCCESS;
16748 }
16749
16750
16751 #if MACH_ASSERT
/*
 * Routine:	vm_map_pageout	(debug / MACH_ASSERT builds only)
 *
 * Purpose:
 *	Call vm_object_pageout() on the internal VM objects backing the
 *	range [start, end) of "map".  Submap entries are descended one
 *	level; nested submaps and non-internal objects are skipped.
 *
 * Returns:
 *	KERN_SUCCESS		range fully processed
 *	KERN_INVALID_ADDRESS	the range (or the equivalent range in a
 *				submap) is not fully allocated
 */
static kern_return_t
vm_map_pageout(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t entry;

	/*
	 * The MADV_PAGEOUT operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		vm_object_t object;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (entry->is_sub_map) {
			vm_map_t submap;
			vm_map_offset_t submap_start;
			vm_map_offset_t submap_end;
			vm_map_entry_t submap_entry;

			/* compute the submap range equivalent to this entry */
			submap = VME_SUBMAP(entry);
			submap_start = VME_OFFSET(entry);
			submap_end = submap_start + (entry->vme_end -
			    entry->vme_start);

			/* the map and submap read locks are held together here */
			vm_map_lock_read(submap);

			if (!vm_map_range_check(submap,
			    submap_start,
			    submap_end,
			    &submap_entry)) {
				vm_map_unlock_read(submap);
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}

			/* do not recurse into nested submaps */
			if (submap_entry->is_sub_map) {
				vm_map_unlock_read(submap);
				continue;
			}

			/*
			 * NOTE(review): only the first entry of the submap
			 * range is paged out here; subsequent submap entries
			 * covered by this top-level entry are not visited --
			 * confirm whether that is intentional.
			 */
			object = VME_OBJECT(submap_entry);
			/* only internal (anonymous) objects are paged out */
			if (object == VM_OBJECT_NULL || !object->internal) {
				vm_map_unlock_read(submap);
				continue;
			}

			vm_object_pageout(object);

			vm_map_unlock_read(submap);
			submap = VM_MAP_NULL;
			submap_entry = VM_MAP_ENTRY_NULL;
			continue;
		}

		object = VME_OBJECT(entry);
		/* only internal (anonymous) objects are paged out */
		if (object == VM_OBJECT_NULL || !object->internal) {
			continue;
		}

		vm_object_pageout(object);
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
16840 #endif /* MACH_ASSERT */
16841
16842 /*
16843 * This function determines if the zero operation can be run on the
16844 * respective entry. Additional checks on the object are in
16845 * vm_object_zero_preflight.
16846 */
16847 static kern_return_t
vm_map_zero_entry_preflight(vm_map_entry_t entry)16848 vm_map_zero_entry_preflight(vm_map_entry_t entry)
16849 {
16850 /*
16851 * Zeroing is restricted to writable non-executable entries and non-JIT
16852 * regions.
16853 */
16854 if (!(entry->protection & VM_PROT_WRITE) ||
16855 (entry->protection & VM_PROT_EXECUTE) ||
16856 entry->used_for_jit ||
16857 entry->vme_xnu_user_debug) {
16858 return KERN_PROTECTION_FAILURE;
16859 }
16860
16861 /*
16862 * Zeroing for copy on write isn't yet supported. Zeroing is also not
16863 * allowed for submaps.
16864 */
16865 if (entry->needs_copy || entry->is_sub_map) {
16866 return KERN_NO_ACCESS;
16867 }
16868
16869 return KERN_SUCCESS;
16870 }
16871
16872 /*
16873 * This function translates entry's start and end to offsets in the object
16874 */
16875 static void
vm_map_get_bounds_in_object(vm_map_entry_t entry,vm_map_offset_t start,vm_map_offset_t end,vm_map_offset_t * start_offset,vm_map_offset_t * end_offset)16876 vm_map_get_bounds_in_object(
16877 vm_map_entry_t entry,
16878 vm_map_offset_t start,
16879 vm_map_offset_t end,
16880 vm_map_offset_t *start_offset,
16881 vm_map_offset_t *end_offset)
16882 {
16883 if (entry->vme_start < start) {
16884 *start_offset = start - entry->vme_start;
16885 } else {
16886 *start_offset = 0;
16887 }
16888 *end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16889 *start_offset += VME_OFFSET(entry);
16890 *end_offset += VME_OFFSET(entry);
16891 }
16892
16893 /*
16894 * This function iterates through the entries in the requested range
16895 * and zeroes any resident pages in the corresponding objects. Compressed
16896 * pages are dropped instead of being faulted in and zeroed.
16897 */
16898 static kern_return_t
vm_map_zero(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)16899 vm_map_zero(
16900 vm_map_t map,
16901 vm_map_offset_t start,
16902 vm_map_offset_t end)
16903 {
16904 vm_map_entry_t entry;
16905 vm_map_offset_t cur = start;
16906 kern_return_t ret;
16907
16908 /*
16909 * This operation isn't supported where the map page size is less than
16910 * the hardware page size. Caller will need to handle error and
16911 * explicitly zero memory if needed.
16912 */
16913 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16914 return KERN_NO_ACCESS;
16915 }
16916
16917 /*
16918 * The MADV_ZERO operation doesn't require any changes to the
16919 * vm_map_entry_t's, so the read lock is sufficient.
16920 */
16921 vm_map_lock_read(map);
16922 assert(map->pmap != kernel_pmap); /* protect alias access */
16923
16924 /*
16925 * The madvise semantics require that the address range be fully
16926 * allocated with no holes. Otherwise, we're required to return
16927 * an error. This check needs to be redone if the map has changed.
16928 */
16929 if (!vm_map_range_check(map, cur, end, &entry)) {
16930 vm_map_unlock_read(map);
16931 return KERN_INVALID_ADDRESS;
16932 }
16933
16934 /*
16935 * Examine each vm_map_entry_t in the range.
16936 */
16937 while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
16938 vm_map_offset_t cur_offset;
16939 vm_map_offset_t end_offset;
16940 unsigned int last_timestamp = map->timestamp;
16941 vm_object_t object = VME_OBJECT(entry);
16942
16943 ret = vm_map_zero_entry_preflight(entry);
16944 if (ret != KERN_SUCCESS) {
16945 vm_map_unlock_read(map);
16946 return ret;
16947 }
16948
16949 if (object == VM_OBJECT_NULL) {
16950 entry = entry->vme_next;
16951 continue;
16952 }
16953
16954 vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
16955 vm_object_lock(object);
16956 /*
16957 * Take a reference on the object as vm_object_zero will drop the object
16958 * lock when it encounters a busy page.
16959 */
16960 vm_object_reference_locked(object);
16961 vm_map_unlock_read(map);
16962
16963 ret = vm_object_zero(object, cur_offset, end_offset);
16964 vm_object_unlock(object);
16965 vm_object_deallocate(object);
16966 if (ret != KERN_SUCCESS) {
16967 return ret;
16968 }
16969 /*
16970 * Update cur as vm_object_zero has succeeded.
16971 */
16972 cur += (end_offset - cur_offset);
16973 if (cur == end) {
16974 return KERN_SUCCESS;
16975 }
16976
16977 /*
16978 * If the map timestamp has changed, restart by relooking up cur in the
16979 * map
16980 */
16981 vm_map_lock_read(map);
16982 if (last_timestamp != map->timestamp) {
16983 /*
16984 * Relookup cur in the map
16985 */
16986 if (!vm_map_range_check(map, cur, end, &entry)) {
16987 vm_map_unlock_read(map);
16988 return KERN_INVALID_ADDRESS;
16989 }
16990 continue;
16991 }
16992 /*
16993 * If the map hasn't changed proceed with the next entry
16994 */
16995 entry = entry->vme_next;
16996 }
16997
16998 vm_map_unlock_read(map);
16999 return KERN_SUCCESS;
17000 }
17001
17002
17003 /*
17004 * Routine: vm_map_entry_insert
17005 *
17006 * Description: This routine inserts a new vm_entry in a locked map.
17007 */
17008 static vm_map_entry_t
vm_map_entry_insert(vm_map_t map,vm_map_entry_t insp_entry,vm_map_offset_t start,vm_map_offset_t end,vm_object_t object,vm_object_offset_t offset,vm_map_kernel_flags_t vmk_flags,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,boolean_t clear_map_aligned)17009 vm_map_entry_insert(
17010 vm_map_t map,
17011 vm_map_entry_t insp_entry,
17012 vm_map_offset_t start,
17013 vm_map_offset_t end,
17014 vm_object_t object,
17015 vm_object_offset_t offset,
17016 vm_map_kernel_flags_t vmk_flags,
17017 boolean_t needs_copy,
17018 vm_prot_t cur_protection,
17019 vm_prot_t max_protection,
17020 vm_inherit_t inheritance,
17021 boolean_t clear_map_aligned)
17022 {
17023 vm_map_entry_t new_entry;
17024 boolean_t map_aligned = FALSE;
17025
17026 assert(insp_entry != (vm_map_entry_t)0);
17027 vm_map_lock_assert_exclusive(map);
17028
17029 __assert_only vm_object_offset_t end_offset = 0;
17030 assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17031
17032 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17033 map_aligned = TRUE;
17034 }
17035 if (clear_map_aligned &&
17036 (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17037 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17038 map_aligned = FALSE;
17039 }
17040 if (map_aligned) {
17041 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17042 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17043 } else {
17044 assert(page_aligned(start));
17045 assert(page_aligned(end));
17046 }
17047 assert(start < end);
17048
17049 new_entry = vm_map_entry_create(map);
17050
17051 new_entry->vme_start = start;
17052 new_entry->vme_end = end;
17053
17054 if (vmk_flags.vmkf_submap) {
17055 new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17056 VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17057 } else {
17058 VME_OBJECT_SET(new_entry, object, false, 0);
17059 }
17060 VME_OFFSET_SET(new_entry, offset);
17061 VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17062
17063 new_entry->map_aligned = map_aligned;
17064 new_entry->needs_copy = needs_copy;
17065 new_entry->inheritance = inheritance;
17066 new_entry->protection = cur_protection;
17067 new_entry->max_protection = max_protection;
17068 /*
17069 * submap: "use_pmap" means "nested".
17070 * default: false.
17071 *
17072 * object: "use_pmap" means "use pmap accounting" for footprint.
17073 * default: true.
17074 */
17075 new_entry->use_pmap = !vmk_flags.vmkf_submap;
17076 new_entry->no_cache = vmk_flags.vmf_no_cache;
17077 new_entry->vme_permanent = vmk_flags.vmf_permanent;
17078 new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17079 new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17080 new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17081
17082 if (vmk_flags.vmkf_map_jit) {
17083 if (!(map->jit_entry_exists) ||
17084 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17085 new_entry->used_for_jit = TRUE;
17086 map->jit_entry_exists = TRUE;
17087 }
17088 }
17089
17090 /*
17091 * Insert the new entry into the list.
17092 */
17093
17094 vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17095 map->size += end - start;
17096
17097 /*
17098 * Update the free space hint and the lookup hint.
17099 */
17100
17101 SAVE_HINT_MAP_WRITE(map, new_entry);
17102 return new_entry;
17103 }
17104
17105 /*
17106 * Routine: vm_map_remap_extract
17107 *
17108 * Description: This routine returns a vm_entry list from a map.
17109 */
17110 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,vm_map_copy_t map_copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)17111 vm_map_remap_extract(
17112 vm_map_t map,
17113 vm_map_offset_t addr,
17114 vm_map_size_t size,
17115 boolean_t copy,
17116 vm_map_copy_t map_copy,
17117 vm_prot_t *cur_protection, /* IN/OUT */
17118 vm_prot_t *max_protection, /* IN/OUT */
17119 /* What, no behavior? */
17120 vm_inherit_t inheritance,
17121 vm_map_kernel_flags_t vmk_flags)
17122 {
17123 struct vm_map_header *map_header = &map_copy->cpy_hdr;
17124 kern_return_t result;
17125 vm_map_size_t mapped_size;
17126 vm_map_size_t tmp_size;
17127 vm_map_entry_t src_entry; /* result of last map lookup */
17128 vm_map_entry_t new_entry;
17129 vm_object_offset_t offset;
17130 vm_map_offset_t map_address;
17131 vm_map_offset_t src_start; /* start of entry to map */
17132 vm_map_offset_t src_end; /* end of region to be mapped */
17133 vm_object_t object;
17134 vm_map_version_t version;
17135 boolean_t src_needs_copy;
17136 boolean_t new_entry_needs_copy;
17137 vm_map_entry_t saved_src_entry;
17138 boolean_t src_entry_was_wired;
17139 vm_prot_t max_prot_for_prot_copy;
17140 vm_map_offset_t effective_page_mask;
17141 bool pageable, same_map;
17142 boolean_t vm_remap_legacy;
17143 vm_prot_t required_cur_prot, required_max_prot;
17144 vm_object_t new_copy_object; /* vm_object_copy_* result */
17145 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
17146
17147 pageable = vmk_flags.vmkf_copy_pageable;
17148 same_map = vmk_flags.vmkf_copy_same_map;
17149
17150 effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17151
17152 assert(map != VM_MAP_NULL);
17153 assert(size != 0);
17154 assert(size == vm_map_round_page(size, effective_page_mask));
17155 assert(inheritance == VM_INHERIT_NONE ||
17156 inheritance == VM_INHERIT_COPY ||
17157 inheritance == VM_INHERIT_SHARE);
17158 assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17159 assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17160 assert((*cur_protection & *max_protection) == *cur_protection);
17161
17162 /*
17163 * Compute start and end of region.
17164 */
17165 src_start = vm_map_trunc_page(addr, effective_page_mask);
17166 src_end = vm_map_round_page(src_start + size, effective_page_mask);
17167
17168 /*
17169 * Initialize map_header.
17170 */
17171 map_header->nentries = 0;
17172 map_header->entries_pageable = pageable;
17173 // map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17174 map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17175 map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17176 vm_map_store_init(map_header);
17177
17178 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17179 /*
17180 * Special case for vm_map_protect(VM_PROT_COPY):
17181 * we want to set the new mappings' max protection to the
17182 * specified *max_protection...
17183 */
17184 max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17185 /* ... but we want to use the vm_remap() legacy mode */
17186 *max_protection = VM_PROT_NONE;
17187 *cur_protection = VM_PROT_NONE;
17188 } else {
17189 max_prot_for_prot_copy = VM_PROT_NONE;
17190 }
17191
17192 if (*cur_protection == VM_PROT_NONE &&
17193 *max_protection == VM_PROT_NONE) {
17194 /*
17195 * vm_remap() legacy mode:
17196 * Extract all memory regions in the specified range and
17197 * collect the strictest set of protections allowed on the
17198 * entire range, so the caller knows what they can do with
17199 * the remapped range.
17200 * We start with VM_PROT_ALL and we'll remove the protections
17201 * missing from each memory region.
17202 */
17203 vm_remap_legacy = TRUE;
17204 *cur_protection = VM_PROT_ALL;
17205 *max_protection = VM_PROT_ALL;
17206 required_cur_prot = VM_PROT_NONE;
17207 required_max_prot = VM_PROT_NONE;
17208 } else {
17209 /*
17210 * vm_remap_new() mode:
17211 * Extract all memory regions in the specified range and
17212 * ensure that they have at least the protections specified
17213 * by the caller via *cur_protection and *max_protection.
17214 * The resulting mapping should have these protections.
17215 */
17216 vm_remap_legacy = FALSE;
17217 if (copy) {
17218 required_cur_prot = VM_PROT_NONE;
17219 required_max_prot = VM_PROT_READ;
17220 } else {
17221 required_cur_prot = *cur_protection;
17222 required_max_prot = *max_protection;
17223 }
17224 }
17225
17226 map_address = 0;
17227 mapped_size = 0;
17228 result = KERN_SUCCESS;
17229
17230 /*
17231 * The specified source virtual space might correspond to
17232 * multiple map entries, need to loop on them.
17233 */
17234 vm_map_lock(map);
17235
17236 if (map->pmap == kernel_pmap) {
17237 map_copy->is_kernel_range = true;
17238 map_copy->orig_range = kmem_addr_get_range(addr, size);
17239 #if CONFIG_MAP_RANGES
17240 } else if (map->uses_user_ranges) {
17241 map_copy->is_user_range = true;
17242 map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17243 #endif /* CONFIG_MAP_RANGES */
17244 }
17245
17246 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17247 /*
17248 * This address space uses sub-pages so the range might
17249 * not be re-mappable in an address space with larger
17250 * pages. Re-assemble any broken-up VM map entries to
17251 * improve our chances of making it work.
17252 */
17253 vm_map_simplify_range(map, src_start, src_end);
17254 }
17255 while (mapped_size != size) {
17256 vm_map_size_t entry_size;
17257
17258 /*
17259 * Find the beginning of the region.
17260 */
17261 if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17262 result = KERN_INVALID_ADDRESS;
17263 break;
17264 }
17265
17266 if (src_start < src_entry->vme_start ||
17267 (mapped_size && src_start != src_entry->vme_start)) {
17268 result = KERN_INVALID_ADDRESS;
17269 break;
17270 }
17271
17272 tmp_size = size - mapped_size;
17273 if (src_end > src_entry->vme_end) {
17274 tmp_size -= (src_end - src_entry->vme_end);
17275 }
17276
17277 entry_size = (vm_map_size_t)(src_entry->vme_end -
17278 src_entry->vme_start);
17279
17280 if (src_entry->is_sub_map &&
17281 vmk_flags.vmkf_copy_single_object) {
17282 vm_map_t submap;
17283 vm_map_offset_t submap_start;
17284 vm_map_size_t submap_size;
17285 boolean_t submap_needs_copy;
17286
17287 /*
17288 * No check for "required protection" on "src_entry"
17289 * because the protections that matter are the ones
17290 * on the submap's VM map entry, which will be checked
17291 * during the call to vm_map_remap_extract() below.
17292 */
17293 object = VM_OBJECT_NULL;
17294
17295 submap_size = src_entry->vme_end - src_start;
17296 if (submap_size > size) {
17297 submap_size = size;
17298 }
17299 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17300 submap = VME_SUBMAP(src_entry);
17301 if (copy) {
17302 /*
17303 * The caller wants a copy-on-write re-mapping,
17304 * so let's extract from the submap accordingly.
17305 */
17306 submap_needs_copy = TRUE;
17307 } else if (src_entry->needs_copy) {
17308 /*
17309 * The caller wants a shared re-mapping but the
17310 * submap is mapped with "needs_copy", so its
17311 * contents can't be shared as is. Extract the
17312 * contents of the submap as "copy-on-write".
17313 * The re-mapping won't be shared with the
17314 * original mapping but this is equivalent to
17315 * what happened with the original "remap from
17316 * submap" code.
17317 * The shared region is mapped "needs_copy", for
17318 * example.
17319 */
17320 submap_needs_copy = TRUE;
17321 } else {
17322 /*
17323 * The caller wants a shared re-mapping and
17324 * this mapping can be shared (no "needs_copy"),
17325 * so let's extract from the submap accordingly.
17326 * Kernel submaps are mapped without
17327 * "needs_copy", for example.
17328 */
17329 submap_needs_copy = FALSE;
17330 }
17331 vm_map_reference(submap);
17332 vm_map_unlock(map);
17333 src_entry = NULL;
17334 if (vm_remap_legacy) {
17335 *cur_protection = VM_PROT_NONE;
17336 *max_protection = VM_PROT_NONE;
17337 }
17338
17339 DTRACE_VM7(remap_submap_recurse,
17340 vm_map_t, map,
17341 vm_map_offset_t, addr,
17342 vm_map_size_t, size,
17343 boolean_t, copy,
17344 vm_map_offset_t, submap_start,
17345 vm_map_size_t, submap_size,
17346 boolean_t, submap_needs_copy);
17347
17348 result = vm_map_remap_extract(submap,
17349 submap_start,
17350 submap_size,
17351 submap_needs_copy,
17352 map_copy,
17353 cur_protection,
17354 max_protection,
17355 inheritance,
17356 vmk_flags);
17357 vm_map_deallocate(submap);
17358
17359 if (result == KERN_SUCCESS &&
17360 submap_needs_copy &&
17361 !copy) {
17362 /*
17363 * We were asked for a "shared"
17364 * re-mapping but had to ask for a
17365 * "copy-on-write" remapping of the
17366 * submap's mapping to honor the
17367 * submap's "needs_copy".
17368 * We now need to resolve that
17369 * pending "copy-on-write" to
17370 * get something we can share.
17371 */
17372 vm_map_entry_t copy_entry;
17373 vm_object_offset_t copy_offset;
17374 vm_map_size_t copy_size;
17375 vm_object_t copy_object;
17376 copy_entry = vm_map_copy_first_entry(map_copy);
17377 copy_size = copy_entry->vme_end - copy_entry->vme_start;
17378 copy_object = VME_OBJECT(copy_entry);
17379 copy_offset = VME_OFFSET(copy_entry);
17380 if (copy_object == VM_OBJECT_NULL) {
17381 assert(copy_offset == 0);
17382 assert(!copy_entry->needs_copy);
17383 if (copy_entry->max_protection == VM_PROT_NONE) {
17384 assert(copy_entry->protection == VM_PROT_NONE);
17385 /* nothing to share */
17386 } else {
17387 assert(copy_offset == 0);
17388 copy_object = vm_object_allocate(copy_size);
17389 VME_OFFSET_SET(copy_entry, 0);
17390 VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17391 assert(copy_entry->use_pmap);
17392 }
17393 } else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17394 /* already shareable */
17395 assert(!copy_entry->needs_copy);
17396 } else if (copy_entry->needs_copy ||
17397 copy_object->shadowed ||
17398 (copy_object->internal &&
17399 !copy_object->true_share &&
17400 !copy_entry->is_shared &&
17401 copy_object->vo_size > copy_size)) {
17402 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17403 assert(copy_entry->use_pmap);
17404 if (copy_entry->needs_copy) {
17405 /* already write-protected */
17406 } else {
17407 vm_prot_t prot;
17408 prot = copy_entry->protection & ~VM_PROT_WRITE;
17409 vm_object_pmap_protect(copy_object,
17410 copy_offset,
17411 copy_size,
17412 PMAP_NULL,
17413 PAGE_SIZE,
17414 0,
17415 prot);
17416 }
17417 copy_entry->needs_copy = FALSE;
17418 }
17419 copy_object = VME_OBJECT(copy_entry);
17420 copy_offset = VME_OFFSET(copy_entry);
17421 if (copy_object &&
17422 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17423 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17424 copy_object->true_share = TRUE;
17425 }
17426 }
17427
17428 return result;
17429 }
17430
17431 if (src_entry->is_sub_map) {
17432 /* protections for submap mapping are irrelevant here */
17433 } else if (((src_entry->protection & required_cur_prot) !=
17434 required_cur_prot) ||
17435 ((src_entry->max_protection & required_max_prot) !=
17436 required_max_prot)) {
17437 if (vmk_flags.vmkf_copy_single_object &&
17438 mapped_size != 0) {
17439 /*
17440 * Single object extraction.
17441 * We can't extract more with the required
17442 * protection but we've extracted some, so
17443 * stop there and declare success.
17444 * The caller should check the size of
17445 * the copy entry we've extracted.
17446 */
17447 result = KERN_SUCCESS;
17448 } else {
17449 /*
17450 * VM range extraction.
17451 * Required proctection is not available
17452 * for this part of the range: fail.
17453 */
17454 result = KERN_PROTECTION_FAILURE;
17455 }
17456 break;
17457 }
17458
17459 if (src_entry->is_sub_map) {
17460 vm_map_t submap;
17461 vm_map_offset_t submap_start;
17462 vm_map_size_t submap_size;
17463 vm_map_copy_t submap_copy;
17464 vm_prot_t submap_curprot, submap_maxprot;
17465 boolean_t submap_needs_copy;
17466
17467 /*
17468 * No check for "required protection" on "src_entry"
17469 * because the protections that matter are the ones
17470 * on the submap's VM map entry, which will be checked
17471 * during the call to vm_map_copy_extract() below.
17472 */
17473 object = VM_OBJECT_NULL;
17474 submap_copy = VM_MAP_COPY_NULL;
17475
17476 /* find equivalent range in the submap */
17477 submap = VME_SUBMAP(src_entry);
17478 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17479 submap_size = tmp_size;
17480 if (copy) {
17481 /*
17482 * The caller wants a copy-on-write re-mapping,
17483 * so let's extract from the submap accordingly.
17484 */
17485 submap_needs_copy = TRUE;
17486 } else if (src_entry->needs_copy) {
17487 /*
17488 * The caller wants a shared re-mapping but the
17489 * submap is mapped with "needs_copy", so its
17490 * contents can't be shared as is. Extract the
17491 * contents of the submap as "copy-on-write".
17492 * The re-mapping won't be shared with the
17493 * original mapping but this is equivalent to
17494 * what happened with the original "remap from
17495 * submap" code.
17496 * The shared region is mapped "needs_copy", for
17497 * example.
17498 */
17499 submap_needs_copy = TRUE;
17500 } else {
17501 /*
17502 * The caller wants a shared re-mapping and
17503 * this mapping can be shared (no "needs_copy"),
17504 * so let's extract from the submap accordingly.
17505 * Kernel submaps are mapped without
17506 * "needs_copy", for example.
17507 */
17508 submap_needs_copy = FALSE;
17509 }
17510 /* extra ref to keep submap alive */
17511 vm_map_reference(submap);
17512
17513 DTRACE_VM7(remap_submap_recurse,
17514 vm_map_t, map,
17515 vm_map_offset_t, addr,
17516 vm_map_size_t, size,
17517 boolean_t, copy,
17518 vm_map_offset_t, submap_start,
17519 vm_map_size_t, submap_size,
17520 boolean_t, submap_needs_copy);
17521
17522 /*
17523 * The map can be safely unlocked since we
17524 * already hold a reference on the submap.
17525 *
17526 * No timestamp since we don't care if the map
17527 * gets modified while we're down in the submap.
17528 * We'll resume the extraction at src_start + tmp_size
17529 * anyway.
17530 */
17531 vm_map_unlock(map);
17532 src_entry = NULL; /* not valid once map is unlocked */
17533
17534 if (vm_remap_legacy) {
17535 submap_curprot = VM_PROT_NONE;
17536 submap_maxprot = VM_PROT_NONE;
17537 if (max_prot_for_prot_copy) {
17538 submap_maxprot = max_prot_for_prot_copy;
17539 }
17540 } else {
17541 assert(!max_prot_for_prot_copy);
17542 submap_curprot = *cur_protection;
17543 submap_maxprot = *max_protection;
17544 }
17545 result = vm_map_copy_extract(submap,
17546 submap_start,
17547 submap_size,
17548 submap_needs_copy,
17549 &submap_copy,
17550 &submap_curprot,
17551 &submap_maxprot,
17552 inheritance,
17553 vmk_flags);
17554
17555 /* release extra ref on submap */
17556 vm_map_deallocate(submap);
17557 submap = VM_MAP_NULL;
17558
17559 if (result != KERN_SUCCESS) {
17560 vm_map_lock(map);
17561 break;
17562 }
17563
17564 /* transfer submap_copy entries to map_header */
17565 while (vm_map_copy_first_entry(submap_copy) !=
17566 vm_map_copy_to_entry(submap_copy)) {
17567 vm_map_entry_t copy_entry;
17568 vm_map_size_t copy_entry_size;
17569
17570 copy_entry = vm_map_copy_first_entry(submap_copy);
17571
17572 /*
17573 * Prevent kernel_object from being exposed to
17574 * user space.
17575 */
17576 if (__improbable(copy_entry->vme_kernel_object)) {
17577 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17578 proc_selfpid(),
17579 (get_bsdtask_info(current_task())
17580 ? proc_name_address(get_bsdtask_info(current_task()))
17581 : "?"));
17582 DTRACE_VM(extract_kernel_only);
17583 result = KERN_INVALID_RIGHT;
17584 vm_map_copy_discard(submap_copy);
17585 submap_copy = VM_MAP_COPY_NULL;
17586 vm_map_lock(map);
17587 break;
17588 }
17589
17590 #ifdef __arm64e__
17591 if (vmk_flags.vmkf_tpro_enforcement_override) {
17592 copy_entry->used_for_tpro = FALSE;
17593 }
17594 #endif /* __arm64e__ */
17595
17596 vm_map_copy_entry_unlink(submap_copy, copy_entry);
17597 copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17598 copy_entry->vme_start = map_address;
17599 copy_entry->vme_end = map_address + copy_entry_size;
17600 map_address += copy_entry_size;
17601 mapped_size += copy_entry_size;
17602 src_start += copy_entry_size;
17603 assert(src_start <= src_end);
17604 _vm_map_store_entry_link(map_header,
17605 map_header->links.prev,
17606 copy_entry);
17607 }
17608 /* done with submap_copy */
17609 vm_map_copy_discard(submap_copy);
17610
17611 if (vm_remap_legacy) {
17612 *cur_protection &= submap_curprot;
17613 *max_protection &= submap_maxprot;
17614 }
17615
17616 /* re-acquire the map lock and continue to next entry */
17617 vm_map_lock(map);
17618 continue;
17619 } else {
17620 object = VME_OBJECT(src_entry);
17621
17622 /*
17623 * Prevent kernel_object from being exposed to
17624 * user space.
17625 */
17626 if (__improbable(is_kernel_object(object))) {
17627 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17628 proc_selfpid(),
17629 (get_bsdtask_info(current_task())
17630 ? proc_name_address(get_bsdtask_info(current_task()))
17631 : "?"));
17632 DTRACE_VM(extract_kernel_only);
17633 result = KERN_INVALID_RIGHT;
17634 break;
17635 }
17636
17637 if (src_entry->iokit_acct) {
17638 /*
17639 * This entry uses "IOKit accounting".
17640 */
17641 } else if (object != VM_OBJECT_NULL &&
17642 object->internal &&
17643 (object->purgable != VM_PURGABLE_DENY ||
17644 object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17645 /*
17646 * Purgeable objects have their own accounting:
17647 * no pmap accounting for them.
17648 */
17649 assertf(!src_entry->use_pmap,
17650 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17651 map,
17652 src_entry,
17653 (uint64_t)src_entry->vme_start,
17654 (uint64_t)src_entry->vme_end,
17655 src_entry->protection,
17656 src_entry->max_protection,
17657 VME_ALIAS(src_entry));
17658 } else {
17659 /*
17660 * Not IOKit or purgeable:
17661 * must be accounted by pmap stats.
17662 */
17663 assertf(src_entry->use_pmap,
17664 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17665 map,
17666 src_entry,
17667 (uint64_t)src_entry->vme_start,
17668 (uint64_t)src_entry->vme_end,
17669 src_entry->protection,
17670 src_entry->max_protection,
17671 VME_ALIAS(src_entry));
17672 }
17673
17674 if (object == VM_OBJECT_NULL) {
17675 assert(!src_entry->needs_copy);
17676 if (src_entry->max_protection == VM_PROT_NONE) {
17677 assert(src_entry->protection == VM_PROT_NONE);
17678 /*
17679 * No VM object and no permissions:
17680 * this must be a reserved range with
17681 * nothing to share or copy.
17682 * There could also be all sorts of
17683 * pmap shenanigans within that reserved
17684 * range, so let's just copy the map
17685 * entry as is to remap a similar
17686 * reserved range.
17687 */
17688 offset = 0; /* no object => no offset */
17689 goto copy_src_entry;
17690 }
17691 object = vm_object_allocate(entry_size);
17692 VME_OFFSET_SET(src_entry, 0);
17693 VME_OBJECT_SET(src_entry, object, false, 0);
17694 assert(src_entry->use_pmap);
17695 assert(!map->mapped_in_other_pmaps);
17696 } else if (src_entry->wired_count ||
17697 object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17698 /*
17699 * A wired memory region should not have
17700 * any pending copy-on-write and needs to
17701 * keep pointing at the VM object that
17702 * contains the wired pages.
17703 * If we're sharing this memory (copy=false),
17704 * we'll share this VM object.
17705 * If we're copying this memory (copy=true),
17706 * we'll call vm_object_copy_slowly() below
17707 * and use the new VM object for the remapping.
17708 *
17709 * Or, we are already using an asymmetric
17710 * copy, and therefore we already have
17711 * the right object.
17712 */
17713 assert(!src_entry->needs_copy);
17714 } else if (src_entry->needs_copy || object->shadowed ||
17715 (object->internal && !object->true_share &&
17716 !src_entry->is_shared &&
17717 object->vo_size > entry_size)) {
17718 bool is_writable;
17719
17720 VME_OBJECT_SHADOW(src_entry, entry_size,
17721 vm_map_always_shadow(map));
17722 assert(src_entry->use_pmap);
17723
17724 is_writable = false;
17725 if (src_entry->protection & VM_PROT_WRITE) {
17726 is_writable = true;
17727 #if __arm64e__
17728 } else if (src_entry->used_for_tpro) {
17729 is_writable = true;
17730 #endif /* __arm64e__ */
17731 }
17732 if (!src_entry->needs_copy && is_writable) {
17733 vm_prot_t prot;
17734
17735 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
17736 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17737 __FUNCTION__,
17738 map, map->pmap,
17739 src_entry,
17740 (uint64_t)src_entry->vme_start,
17741 (uint64_t)src_entry->vme_end,
17742 src_entry->protection);
17743 }
17744
17745 prot = src_entry->protection & ~VM_PROT_WRITE;
17746
17747 if (override_nx(map,
17748 VME_ALIAS(src_entry))
17749 && prot) {
17750 prot |= VM_PROT_EXECUTE;
17751 }
17752
17753 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
17754 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17755 __FUNCTION__,
17756 map, map->pmap,
17757 src_entry,
17758 (uint64_t)src_entry->vme_start,
17759 (uint64_t)src_entry->vme_end,
17760 prot);
17761 }
17762
17763 if (map->mapped_in_other_pmaps) {
17764 vm_object_pmap_protect(
17765 VME_OBJECT(src_entry),
17766 VME_OFFSET(src_entry),
17767 entry_size,
17768 PMAP_NULL,
17769 PAGE_SIZE,
17770 src_entry->vme_start,
17771 prot);
17772 #if MACH_ASSERT
17773 } else if (__improbable(map->pmap == PMAP_NULL)) {
17774 extern boolean_t vm_tests_in_progress;
17775 assert(vm_tests_in_progress);
17776 /*
17777 * Some VM tests (in vm_tests.c)
17778 * sometimes want to use a VM
17779 * map without a pmap.
17780 * Otherwise, this should never
17781 * happen.
17782 */
17783 #endif /* MACH_ASSERT */
17784 } else {
17785 pmap_protect(vm_map_pmap(map),
17786 src_entry->vme_start,
17787 src_entry->vme_end,
17788 prot);
17789 }
17790 }
17791
17792 object = VME_OBJECT(src_entry);
17793 src_entry->needs_copy = FALSE;
17794 }
17795
17796
17797 vm_object_lock(object);
17798 vm_object_reference_locked(object); /* object ref. for new entry */
17799 assert(!src_entry->needs_copy);
17800 if (object->copy_strategy ==
17801 MEMORY_OBJECT_COPY_SYMMETRIC) {
17802 /*
17803 * If we want to share this object (copy==0),
17804 * it needs to be COPY_DELAY.
17805 * If we want to copy this object (copy==1),
17806 * we can't just set "needs_copy" on our side
17807 * and expect the other side to do the same
17808 * (symmetrically), so we can't let the object
17809 * stay COPY_SYMMETRIC.
17810 * So we always switch from COPY_SYMMETRIC to
17811 * COPY_DELAY.
17812 */
17813 object->copy_strategy =
17814 MEMORY_OBJECT_COPY_DELAY;
17815 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
17816 }
17817 vm_object_unlock(object);
17818 }
17819
17820 offset = (VME_OFFSET(src_entry) +
17821 (src_start - src_entry->vme_start));
17822
17823 copy_src_entry:
17824 new_entry = _vm_map_entry_create(map_header);
17825 vm_map_entry_copy(map, new_entry, src_entry);
17826 if (new_entry->is_sub_map) {
17827 /* clr address space specifics */
17828 new_entry->use_pmap = FALSE;
17829 } else if (copy) {
17830 /*
17831 * We're dealing with a copy-on-write operation,
17832 * so the resulting mapping should not inherit the
17833 * original mapping's accounting settings.
17834 * "use_pmap" should be reset to its default (TRUE)
17835 * so that the new mapping gets accounted for in
17836 * the task's memory footprint.
17837 */
17838 new_entry->use_pmap = TRUE;
17839 }
17840 /* "iokit_acct" was cleared in vm_map_entry_copy() */
17841 assert(!new_entry->iokit_acct);
17842
17843 new_entry->map_aligned = FALSE;
17844
17845 new_entry->vme_start = map_address;
17846 new_entry->vme_end = map_address + tmp_size;
17847 assert(new_entry->vme_start < new_entry->vme_end);
17848 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17849 /* security: keep "permanent" and "csm_associated" */
17850 new_entry->vme_permanent = src_entry->vme_permanent;
17851 new_entry->csm_associated = src_entry->csm_associated;
17852 /*
17853 * Remapping for vm_map_protect(VM_PROT_COPY)
17854 * to convert a read-only mapping into a
17855 * copy-on-write version of itself but
17856 * with write access:
17857 * keep the original inheritance but let's not
17858 * add VM_PROT_WRITE to the max protection yet
17859 * since we want to do more security checks against
17860 * the target map.
17861 */
17862 new_entry->inheritance = src_entry->inheritance;
17863 new_entry->protection &= max_prot_for_prot_copy;
17864 } else {
17865 new_entry->inheritance = inheritance;
17866 if (!vm_remap_legacy) {
17867 new_entry->protection = *cur_protection;
17868 new_entry->max_protection = *max_protection;
17869 }
17870 }
17871 #ifdef __arm64e__
17872 if (copy && vmk_flags.vmkf_tpro_enforcement_override) {
17873 new_entry->used_for_tpro = FALSE;
17874 }
17875 #endif /* __arm64e__ */
17876 VME_OFFSET_SET(new_entry, offset);
17877
17878 /*
17879 * The new region has to be copied now if required.
17880 */
17881 RestartCopy:
17882 if (!copy) {
17883 if (src_entry->used_for_jit == TRUE) {
17884 if (same_map) {
17885 } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
17886 /*
17887 * Cannot allow an entry describing a JIT
17888 * region to be shared across address spaces.
17889 */
17890 result = KERN_INVALID_ARGUMENT;
17891 vm_object_deallocate(object);
17892 vm_map_entry_dispose(new_entry);
17893 new_entry = VM_MAP_ENTRY_NULL;
17894 break;
17895 }
17896 }
17897
17898 if (!src_entry->is_sub_map &&
17899 VME_OBJECT(src_entry) == VM_OBJECT_NULL) {
17900 /* no accessible memory; nothing to share */
17901 assert(src_entry->protection == VM_PROT_NONE);
17902 assert(src_entry->max_protection == VM_PROT_NONE);
17903 src_entry->is_shared = FALSE;
17904 } else {
17905 src_entry->is_shared = TRUE;
17906 }
17907 if (!new_entry->is_sub_map &&
17908 VME_OBJECT(new_entry) == VM_OBJECT_NULL) {
17909 /* no accessible memory; nothing to share */
17910 assert(new_entry->protection == VM_PROT_NONE);
17911 assert(new_entry->max_protection == VM_PROT_NONE);
17912 new_entry->is_shared = FALSE;
17913 } else {
17914 new_entry->is_shared = TRUE;
17915 }
17916 if (!(new_entry->is_sub_map)) {
17917 new_entry->needs_copy = FALSE;
17918 }
17919 } else if (src_entry->is_sub_map) {
17920 /* make this a COW sub_map if not already */
17921 assert(new_entry->wired_count == 0);
17922 new_entry->needs_copy = TRUE;
17923 object = VM_OBJECT_NULL;
17924 } else if (src_entry->wired_count == 0 &&
17925 !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
17926 vm_object_copy_quickly(VME_OBJECT(new_entry),
17927 VME_OFFSET(new_entry),
17928 (new_entry->vme_end -
17929 new_entry->vme_start),
17930 &src_needs_copy,
17931 &new_entry_needs_copy)) {
17932 new_entry->needs_copy = new_entry_needs_copy;
17933 new_entry->is_shared = FALSE;
17934 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17935
17936 /*
17937 * Handle copy_on_write semantics.
17938 */
17939 if (src_needs_copy && !src_entry->needs_copy) {
17940 vm_prot_t prot;
17941
17942 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
17943 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17944 __FUNCTION__,
17945 map, map->pmap, src_entry,
17946 (uint64_t)src_entry->vme_start,
17947 (uint64_t)src_entry->vme_end,
17948 src_entry->protection);
17949 }
17950
17951 prot = src_entry->protection & ~VM_PROT_WRITE;
17952
17953 if (override_nx(map,
17954 VME_ALIAS(src_entry))
17955 && prot) {
17956 prot |= VM_PROT_EXECUTE;
17957 }
17958
17959 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
17960 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17961 __FUNCTION__,
17962 map, map->pmap, src_entry,
17963 (uint64_t)src_entry->vme_start,
17964 (uint64_t)src_entry->vme_end,
17965 prot);
17966 }
17967
17968 vm_object_pmap_protect(object,
17969 offset,
17970 entry_size,
17971 ((src_entry->is_shared
17972 || map->mapped_in_other_pmaps) ?
17973 PMAP_NULL : map->pmap),
17974 VM_MAP_PAGE_SIZE(map),
17975 src_entry->vme_start,
17976 prot);
17977
17978 assert(src_entry->wired_count == 0);
17979 src_entry->needs_copy = TRUE;
17980 }
17981 /*
17982 * Throw away the old object reference of the new entry.
17983 */
17984 vm_object_deallocate(object);
17985 } else {
17986 new_entry->is_shared = FALSE;
17987 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
17988
17989 src_entry_was_wired = (src_entry->wired_count > 0);
17990 saved_src_entry = src_entry;
17991 src_entry = VM_MAP_ENTRY_NULL;
17992
17993 /*
17994 * The map can be safely unlocked since we
17995 * already hold a reference on the object.
17996 *
17997 * Record the timestamp of the map for later
17998 * verification, and unlock the map.
17999 */
18000 version.main_timestamp = map->timestamp;
18001 vm_map_unlock(map); /* Increments timestamp once! */
18002
18003 /*
18004 * Perform the copy.
18005 */
18006 if (src_entry_was_wired > 0 ||
18007 (debug4k_no_cow_copyin &&
18008 VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18009 vm_object_lock(object);
18010 result = vm_object_copy_slowly(
18011 object,
18012 offset,
18013 (new_entry->vme_end -
18014 new_entry->vme_start),
18015 THREAD_UNINT,
18016 &new_copy_object);
18017 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18018 saved_used_for_jit = new_entry->used_for_jit;
18019 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18020 new_entry->used_for_jit = saved_used_for_jit;
18021 VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18022 new_entry->needs_copy = FALSE;
18023 } else {
18024 vm_object_offset_t new_offset;
18025
18026 new_offset = VME_OFFSET(new_entry);
18027 result = vm_object_copy_strategically(
18028 object,
18029 offset,
18030 (new_entry->vme_end -
18031 new_entry->vme_start),
18032 false, /* forking */
18033 &new_copy_object,
18034 &new_offset,
18035 &new_entry_needs_copy);
18036 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18037 saved_used_for_jit = new_entry->used_for_jit;
18038 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18039 new_entry->used_for_jit = saved_used_for_jit;
18040 if (new_offset != VME_OFFSET(new_entry)) {
18041 VME_OFFSET_SET(new_entry, new_offset);
18042 }
18043
18044 new_entry->needs_copy = new_entry_needs_copy;
18045 }
18046
18047 /*
18048 * Throw away the old object reference of the new entry.
18049 */
18050 vm_object_deallocate(object);
18051
18052 if (result != KERN_SUCCESS &&
18053 result != KERN_MEMORY_RESTART_COPY) {
18054 vm_map_entry_dispose(new_entry);
18055 vm_map_lock(map);
18056 break;
18057 }
18058
18059 /*
18060 * Verify that the map has not substantially
18061 * changed while the copy was being made.
18062 */
18063
18064 vm_map_lock(map);
18065 if (version.main_timestamp + 1 != map->timestamp) {
18066 /*
18067 * Simple version comparison failed.
18068 *
18069 * Retry the lookup and verify that the
18070 * same object/offset are still present.
18071 */
18072 saved_src_entry = VM_MAP_ENTRY_NULL;
18073 vm_object_deallocate(VME_OBJECT(new_entry));
18074 vm_map_entry_dispose(new_entry);
18075 if (result == KERN_MEMORY_RESTART_COPY) {
18076 result = KERN_SUCCESS;
18077 }
18078 continue;
18079 }
18080 /* map hasn't changed: src_entry is still valid */
18081 src_entry = saved_src_entry;
18082 saved_src_entry = VM_MAP_ENTRY_NULL;
18083
18084 if (result == KERN_MEMORY_RESTART_COPY) {
18085 vm_object_reference(object);
18086 goto RestartCopy;
18087 }
18088 }
18089
18090 _vm_map_store_entry_link(map_header,
18091 map_header->links.prev, new_entry);
18092
18093 /* protections for submap mapping are irrelevant here */
18094 if (vm_remap_legacy && !src_entry->is_sub_map) {
18095 *cur_protection &= src_entry->protection;
18096 *max_protection &= src_entry->max_protection;
18097 }
18098
18099 map_address += tmp_size;
18100 mapped_size += tmp_size;
18101 src_start += tmp_size;
18102
18103 if (vmk_flags.vmkf_copy_single_object) {
18104 if (mapped_size != size) {
18105 DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18106 map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18107 if (src_entry->vme_next != vm_map_to_entry(map) &&
18108 src_entry->vme_next->vme_object_value ==
18109 src_entry->vme_object_value) {
18110 /* XXX TODO4K */
18111 DEBUG4K_ERROR("could have extended copy to next entry...\n");
18112 }
18113 }
18114 break;
18115 }
18116 } /* end while */
18117
18118 vm_map_unlock(map);
18119 if (result != KERN_SUCCESS) {
18120 /*
18121 * Free all allocated elements.
18122 */
18123 for (src_entry = map_header->links.next;
18124 src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18125 src_entry = new_entry) {
18126 new_entry = src_entry->vme_next;
18127 _vm_map_store_entry_unlink(map_header, src_entry, false);
18128 if (src_entry->is_sub_map) {
18129 vm_map_deallocate(VME_SUBMAP(src_entry));
18130 } else {
18131 vm_object_deallocate(VME_OBJECT(src_entry));
18132 }
18133 vm_map_entry_dispose(src_entry);
18134 }
18135 }
18136 return result;
18137 }
18138
18139 bool
vm_map_is_exotic(vm_map_t map)18140 vm_map_is_exotic(
18141 vm_map_t map)
18142 {
18143 return VM_MAP_IS_EXOTIC(map);
18144 }
18145
18146 bool
vm_map_is_alien(vm_map_t map)18147 vm_map_is_alien(
18148 vm_map_t map)
18149 {
18150 return VM_MAP_IS_ALIEN(map);
18151 }
18152
18153 #if XNU_TARGET_OS_OSX
/*
 * vm_map_mark_alien:
 *	Set the "is_alien" flag on "map", under the exclusive map lock.
 *	The exact semantics of "is_alien" are defined by the users of
 *	VM_MAP_IS_ALIEN(); presumably it marks a foreign/translated
 *	address space — confirm against the consumers of that predicate.
 *	Only compiled on macOS targets (XNU_TARGET_OS_OSX).
 */
void
vm_map_mark_alien(
	vm_map_t map)
{
	vm_map_lock(map);
	map->is_alien = true;
	vm_map_unlock(map);
}
18162
/*
 * vm_map_single_jit:
 *	Set the "single_jit" flag on "map", under the exclusive map lock.
 *	NOTE(review): this presumably restricts the map to a single JIT
 *	region — confirm against the code that tests map->single_jit.
 *	Only compiled on macOS targets (XNU_TARGET_OS_OSX).
 */
void
vm_map_single_jit(
	vm_map_t map)
{
	vm_map_lock(map);
	map->single_jit = true;
	vm_map_unlock(map);
}
18171 #endif /* XNU_TARGET_OS_OSX */
18172
18173
18174
18175 /*
18176 * Callers of this function must call vm_map_copy_require on
18177 * previously created vm_map_copy_t or pass a newly created
18178 * one to ensure that it hasn't been forged.
18179 */
18180 static kern_return_t
vm_map_copy_to_physcopy(vm_map_copy_t copy_map,vm_map_t target_map)18181 vm_map_copy_to_physcopy(
18182 vm_map_copy_t copy_map,
18183 vm_map_t target_map)
18184 {
18185 vm_map_size_t size;
18186 vm_map_entry_t entry;
18187 vm_map_entry_t new_entry;
18188 vm_object_t new_object;
18189 unsigned int pmap_flags;
18190 pmap_t new_pmap;
18191 vm_map_t new_map;
18192 vm_map_address_t src_start, src_end, src_cur;
18193 vm_map_address_t dst_start, dst_end, dst_cur;
18194 kern_return_t kr;
18195 void *kbuf;
18196
18197 /*
18198 * Perform the equivalent of vm_allocate() and memcpy().
18199 * Replace the mappings in "copy_map" with the newly allocated mapping.
18200 */
18201 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18202
18203 assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18204
18205 /* create a new pmap to map "copy_map" */
18206 pmap_flags = 0;
18207 assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18208 #if PMAP_CREATE_FORCE_4K_PAGES
18209 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18210 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
18211 pmap_flags |= PMAP_CREATE_64BIT;
18212 new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18213 if (new_pmap == NULL) {
18214 return KERN_RESOURCE_SHORTAGE;
18215 }
18216
18217 /* allocate new VM object */
18218 size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18219 new_object = vm_object_allocate(size);
18220 assert(new_object);
18221
18222 /* allocate new VM map entry */
18223 new_entry = vm_map_copy_entry_create(copy_map);
18224 assert(new_entry);
18225
18226 /* finish initializing new VM map entry */
18227 new_entry->protection = VM_PROT_DEFAULT;
18228 new_entry->max_protection = VM_PROT_DEFAULT;
18229 new_entry->use_pmap = TRUE;
18230
18231 /* make new VM map entry point to new VM object */
18232 new_entry->vme_start = 0;
18233 new_entry->vme_end = size;
18234 VME_OBJECT_SET(new_entry, new_object, false, 0);
18235 VME_OFFSET_SET(new_entry, 0);
18236
18237 /* create a new pageable VM map to map "copy_map" */
18238 new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18239 VM_MAP_CREATE_PAGEABLE);
18240 assert(new_map);
18241 vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18242
18243 /* map "copy_map" in the new VM map */
18244 src_start = 0;
18245 kr = vm_map_copyout_internal(
18246 new_map,
18247 &src_start,
18248 copy_map,
18249 copy_map->size,
18250 FALSE, /* consume_on_success */
18251 VM_PROT_DEFAULT,
18252 VM_PROT_DEFAULT,
18253 VM_INHERIT_DEFAULT);
18254 assert(kr == KERN_SUCCESS);
18255 src_end = src_start + copy_map->size;
18256
18257 /* map "new_object" in the new VM map */
18258 vm_object_reference(new_object);
18259 dst_start = 0;
18260 kr = vm_map_enter(new_map,
18261 &dst_start,
18262 size,
18263 0, /* mask */
18264 VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18265 new_object,
18266 0, /* offset */
18267 FALSE, /* needs copy */
18268 VM_PROT_DEFAULT,
18269 VM_PROT_DEFAULT,
18270 VM_INHERIT_DEFAULT);
18271 assert(kr == KERN_SUCCESS);
18272 dst_end = dst_start + size;
18273
18274 /* get a kernel buffer */
18275 kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18276
18277 /* physically copy "copy_map" mappings to new VM object */
18278 for (src_cur = src_start, dst_cur = dst_start;
18279 src_cur < src_end;
18280 src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18281 vm_size_t bytes;
18282
18283 bytes = PAGE_SIZE;
18284 if (src_cur + PAGE_SIZE > src_end) {
18285 /* partial copy for last page */
18286 bytes = src_end - src_cur;
18287 assert(bytes > 0 && bytes < PAGE_SIZE);
18288 /* rest of dst page should be zero-filled */
18289 }
18290 /* get bytes from src mapping */
18291 kr = copyinmap(new_map, src_cur, kbuf, bytes);
18292 if (kr != KERN_SUCCESS) {
18293 DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18294 }
18295 /* put bytes in dst mapping */
18296 assert(dst_cur < dst_end);
18297 assert(dst_cur + bytes <= dst_end);
18298 kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18299 if (kr != KERN_SUCCESS) {
18300 DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18301 }
18302 }
18303
18304 /* free kernel buffer */
18305 kfree_data(kbuf, PAGE_SIZE);
18306
18307 /* destroy new map */
18308 vm_map_destroy(new_map);
18309 new_map = VM_MAP_NULL;
18310
18311 /* dispose of the old map entries in "copy_map" */
18312 while (vm_map_copy_first_entry(copy_map) !=
18313 vm_map_copy_to_entry(copy_map)) {
18314 entry = vm_map_copy_first_entry(copy_map);
18315 vm_map_copy_entry_unlink(copy_map, entry);
18316 if (entry->is_sub_map) {
18317 vm_map_deallocate(VME_SUBMAP(entry));
18318 } else {
18319 vm_object_deallocate(VME_OBJECT(entry));
18320 }
18321 vm_map_copy_entry_dispose(entry);
18322 }
18323
18324 /* change "copy_map"'s page_size to match "target_map" */
18325 copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18326 copy_map->offset = 0;
18327 copy_map->size = size;
18328
18329 /* insert new map entry in "copy_map" */
18330 assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18331 vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18332
18333 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18334 return KERN_SUCCESS;
18335 }
18336
void
vm_map_copy_adjust_get_target_copy_map(
	vm_map_copy_t copy_map,
	vm_map_copy_t *target_copy_map_p);
/*
 * vm_map_copy_adjust_get_target_copy_map:
 *	Ensure *target_copy_map_p points at a vm_map_copy_t that the
 *	caller may modify.  If the caller already supplied one
 *	(non-NULL), leave it alone.  Otherwise build a full clone of
 *	"copy_map": same offset, size and page_shift, with each entry
 *	duplicated via vm_map_entry_copy_full() and an extra reference
 *	taken on its backing submap or VM object, so the clone owns its
 *	own references and can be discarded independently.
 *	Only entry-list copies are supported (asserted).
 */
void
vm_map_copy_adjust_get_target_copy_map(
	vm_map_copy_t copy_map,
	vm_map_copy_t *target_copy_map_p)
{
	vm_map_copy_t target_copy_map;
	vm_map_entry_t entry, target_entry;

	if (*target_copy_map_p != VM_MAP_COPY_NULL) {
		/* the caller already has a "target_copy_map": use it */
		return;
	}

	/* the caller wants us to create a new copy of "copy_map" */
	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
	target_copy_map = vm_map_copy_allocate(copy_map->type);
	target_copy_map->offset = copy_map->offset;
	target_copy_map->size = copy_map->size;
	target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
	/* clone each entry, taking a reference on what backs it */
	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = entry->vme_next) {
		target_entry = vm_map_copy_entry_create(target_copy_map);
		vm_map_entry_copy_full(target_entry, entry);
		if (target_entry->is_sub_map) {
			vm_map_reference(VME_SUBMAP(target_entry));
		} else {
			vm_object_reference(VME_OBJECT(target_entry));
		}
		/* append at the tail to preserve the original ordering */
		vm_map_copy_entry_link(
			target_copy_map,
			vm_map_copy_last_entry(target_copy_map),
			target_entry);
	}
	entry = VM_MAP_ENTRY_NULL;
	*target_copy_map_p = target_copy_map;
}
18378
18379 /*
18380 * Callers of this function must call vm_map_copy_require on
18381 * previously created vm_map_copy_t or pass a newly created
18382 * one to ensure that it hasn't been forged.
18383 */
18384 static void
vm_map_copy_trim(vm_map_copy_t copy_map,uint16_t new_page_shift,vm_map_offset_t trim_start,vm_map_offset_t trim_end)18385 vm_map_copy_trim(
18386 vm_map_copy_t copy_map,
18387 uint16_t new_page_shift,
18388 vm_map_offset_t trim_start,
18389 vm_map_offset_t trim_end)
18390 {
18391 uint16_t copy_page_shift;
18392 vm_map_entry_t entry, next_entry;
18393
18394 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18395 assert(copy_map->cpy_hdr.nentries > 0);
18396
18397 trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18398 trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18399
18400 /* use the new page_shift to do the clipping */
18401 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18402 copy_map->cpy_hdr.page_shift = new_page_shift;
18403
18404 for (entry = vm_map_copy_first_entry(copy_map);
18405 entry != vm_map_copy_to_entry(copy_map);
18406 entry = next_entry) {
18407 next_entry = entry->vme_next;
18408 if (entry->vme_end <= trim_start) {
18409 /* entry fully before trim range: skip */
18410 continue;
18411 }
18412 if (entry->vme_start >= trim_end) {
18413 /* entry fully after trim range: done */
18414 break;
18415 }
18416 /* clip entry if needed */
18417 vm_map_copy_clip_start(copy_map, entry, trim_start);
18418 vm_map_copy_clip_end(copy_map, entry, trim_end);
18419 /* dispose of entry */
18420 copy_map->size -= entry->vme_end - entry->vme_start;
18421 vm_map_copy_entry_unlink(copy_map, entry);
18422 if (entry->is_sub_map) {
18423 vm_map_deallocate(VME_SUBMAP(entry));
18424 } else {
18425 vm_object_deallocate(VME_OBJECT(entry));
18426 }
18427 vm_map_copy_entry_dispose(entry);
18428 entry = VM_MAP_ENTRY_NULL;
18429 }
18430
18431 /* restore copy_map's original page_shift */
18432 copy_map->cpy_hdr.page_shift = copy_page_shift;
18433 }
18434
18435 /*
18436 * Make any necessary adjustments to "copy_map" to allow it to be
18437 * mapped into "target_map".
18438 * If no changes were necessary, "target_copy_map" points to the
18439 * untouched "copy_map".
18440 * If changes are necessary, changes will be made to "target_copy_map".
18441 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18442 * copy the original "copy_map" to it before applying the changes.
18443 * The caller should discard "target_copy_map" if it's not the same as
18444 * the original "copy_map".
18445 */
18446 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
18447 kern_return_t
/*
 * vm_map_copy_adjust_to_target:
 *
 * Adjust "src_copy_map" (a VM_MAP_COPY_ENTRY_LIST copy map) so that the
 * sub-range described by [offset_u, offset_u + size_u) can be mapped in
 * "target_map", whose page size may differ from the copy map's page size.
 *
 * The adjustment can involve:
 *  - trimming the start and/or end of the copy map to target-page
 *    boundaries (the amount trimmed from the start is reported via
 *    *trimmed_start_p),
 *  - over-mapping a mis-aligned first and/or last entry to re-align it
 *    (amounts reported via *overmap_start_p / *overmap_end_p),
 *  - when "copy" is TRUE and interior entries are mis-aligned, replacing
 *    the mappings with a physical copy (vm_map_copy_to_physcopy).
 *
 * "copy" == FALSE means the mappings are to be shared; interior
 * mis-alignments cannot be fixed in that case and KERN_NOT_SUPPORTED
 * is returned.
 *
 * On success, *target_copy_map_p points at the adjusted copy map: either
 * "src_copy_map" itself (when no adjustment was needed) or a map obtained
 * via vm_map_copy_adjust_get_target_copy_map().
 */
vm_map_copy_adjust_to_target(
	vm_map_copy_t           src_copy_map,
	vm_map_offset_ut        offset_u,
	vm_map_size_ut          size_u,
	vm_map_t                target_map,
	boolean_t               copy,
	vm_map_copy_t           *target_copy_map_p,
	vm_map_offset_t         *overmap_start_p,
	vm_map_offset_t         *overmap_end_p,
	vm_map_offset_t         *trimmed_start_p)
{
	vm_map_copy_t           copy_map, target_copy_map;
	vm_map_size_t           target_size;
	vm_map_size_t           src_copy_map_size;
	vm_map_size_t           overmap_start, overmap_end;
	int                     misalignments;
	vm_map_entry_t          entry, target_entry;
	vm_map_offset_t         addr_adjustment;
	vm_map_offset_t         new_start, new_end;
	int                     copy_page_mask, target_page_mask;
	uint16_t                copy_page_shift, target_page_shift;
	vm_map_offset_t         trimmed_end;
	vm_map_size_t           map_size;
	kern_return_t           kr;

	/*
	 * Sanitize any input parameters that are addr/size/prot/inherit
	 */
	kr = vm_map_copy_addr_size_sanitize(
		target_map,
		offset_u,
		size_u,
		VM_SANITIZE_CALLER_MACH_MEMORY_ENTRY_MAP_SIZE,
		&new_start,
		&new_end,
		&map_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return vm_sanitize_get_kr(kr);
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(src_copy_map);
	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);

	/*
	 * Start working with "src_copy_map" but we'll switch
	 * to "target_copy_map" as soon as we start making adjustments.
	 */
	copy_map = src_copy_map;
	src_copy_map_size = src_copy_map->size;

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), *target_copy_map_p);

	target_copy_map = *target_copy_map_p;
	if (target_copy_map != VM_MAP_COPY_NULL) {
		vm_map_copy_require(target_copy_map);
	}

	/* the requested range must fit within the copy map */
	if (new_end > copy_map->size) {
		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u));
		return KERN_INVALID_ARGUMENT;
	}

	/* trim the end */
	trimmed_end = 0;
	new_end = VM_MAP_ROUND_PAGE(new_end, target_page_mask);
	if (new_end < copy_map->size) {
		trimmed_end = src_copy_map_size - new_end;
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    new_end, copy_map->size);
	}

	/* trim the start */
	new_start = VM_MAP_TRUNC_PAGE(new_start, target_page_mask);
	if (new_start != 0) {
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(offset_u), (uint64_t)VM_SANITIZE_UNSAFE_UNWRAP(size_u), target_copy_map, (uint64_t)0, (uint64_t)new_start);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    0, new_start);
	}
	*trimmed_start_p = new_start;

	/* target_size starts with what's left after trimming */
	target_size = copy_map->size;
	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);

	/* check for misalignments but don't adjust yet */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	if (copy_page_shift < target_page_shift) {
		/*
		 * Remapping from 4K to 16K: check the VM object alignments
		 * throughout the range.
		 * If the start and end of the range are mis-aligned, we can
		 * over-map to re-align, and adjust the "overmap" start/end
		 * and "target_size" of the range accordingly.
		 * If there is any mis-alignment within the range:
		 *	if "copy":
		 *		we can do immediate-copy instead of copy-on-write,
		 *	else:
		 *		no way to remap and share; fail.
		 */
		for (entry = vm_map_copy_first_entry(copy_map);
		    entry != vm_map_copy_to_entry(copy_map);
		    entry = entry->vme_next) {
			vm_object_offset_t object_offset_start, object_offset_end;

			object_offset_start = VME_OFFSET(entry);
			object_offset_end = object_offset_start;
			object_offset_end += entry->vme_end - entry->vme_start;
			if (object_offset_start & target_page_mask) {
				/*
				 * Only a mis-aligned *first* entry can be fixed
				 * by over-mapping, and only when sharing.
				 */
				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
					overmap_start++;
				} else {
					misalignments++;
				}
			}
			if (object_offset_end & target_page_mask) {
				/* same logic for a mis-aligned *last* entry */
				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
					overmap_end++;
				} else {
					misalignments++;
				}
			}
		}
	}
	entry = VM_MAP_ENTRY_NULL;

	/* decide how to deal with misalignments */
	assert(overmap_start <= 1);
	assert(overmap_end <= 1);
	if (!overmap_start && !overmap_end && !misalignments) {
		/* copy_map is properly aligned for target_map ... */
		if (*trimmed_start_p) {
			/* ... but we trimmed it, so still need to adjust */
		} else {
			/* ... and we didn't trim anything: we're done */
			if (target_copy_map == VM_MAP_COPY_NULL) {
				target_copy_map = copy_map;
			}
			*target_copy_map_p = target_copy_map;
			*overmap_start_p = 0;
			*overmap_end_p = 0;
			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
			return KERN_SUCCESS;
		}
	} else if (misalignments && !copy) {
		/* can't "share" if misaligned */
		DEBUG4K_ADJUST("unsupported sharing\n");
#if MACH_ASSERT
		if (debug4k_panic_on_misaligned_sharing) {
			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
		}
#endif /* MACH_ASSERT */
		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
		return KERN_NOT_SUPPORTED;
	} else {
		/* can't virtual-copy if misaligned (but can physical-copy) */
		DEBUG4K_ADJUST("mis-aligned copying\n");
	}

	/* get a "target_copy_map" if needed and switch to it */
	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
	copy_map = target_copy_map;

	if (misalignments && copy) {
		vm_map_size_t target_copy_map_size;

		/*
		 * Can't do copy-on-write with misaligned mappings.
		 * Replace the mappings with a physical copy of the original
		 * mappings' contents.
		 */
		target_copy_map_size = target_copy_map->size;
		kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		*target_copy_map_p = target_copy_map;
		*overmap_start_p = 0;
		/* the physcopy may have grown the map to target page size */
		*overmap_end_p = target_copy_map->size - target_copy_map_size;
		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
		return KERN_SUCCESS;
	}

	/* apply the adjustments */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	/* remove copy_map->offset, so that everything starts at offset 0 */
	addr_adjustment = copy_map->offset;
	/* also remove whatever we trimmed from the start */
	addr_adjustment += *trimmed_start_p;
	for (target_entry = vm_map_copy_first_entry(target_copy_map);
	    target_entry != vm_map_copy_to_entry(target_copy_map);
	    target_entry = target_entry->vme_next) {
		vm_object_offset_t object_offset_start, object_offset_end;

		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
		object_offset_start = VME_OFFSET(target_entry);
		if (object_offset_start & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
				/*
				 * start of 1st entry is mis-aligned:
				 * re-adjust by over-mapping.
				 */
				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}

		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
			target_size += overmap_start;
		} else {
			/* shift subsequent entries by the start over-mapping */
			target_entry->vme_start += overmap_start;
		}
		target_entry->vme_end += overmap_start;

		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
		if (object_offset_end & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
				/*
				 * end of last entry is mis-aligned: re-adjust by over-mapping.
				 */
				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
				target_entry->vme_end += overmap_end;
				target_size += overmap_end;
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				assert(copy);
			}
		}
		/* rebase the entry so the copy map starts at address 0 */
		target_entry->vme_start -= addr_adjustment;
		target_entry->vme_end -= addr_adjustment;
		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
	}

	/* the copy map now uses the target map's page size */
	target_copy_map->size = target_size;
	target_copy_map->offset += overmap_start;
	target_copy_map->offset -= addr_adjustment;
	target_copy_map->cpy_hdr.page_shift = target_page_shift;

//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));

	*target_copy_map_p = target_copy_map;
	*overmap_start_p = overmap_start;
	*overmap_end_p = overmap_end;

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
	return KERN_SUCCESS;
}
18731
18732 kern_return_t
vm_map_range_physical_size(vm_map_t map,vm_map_address_t start,mach_vm_size_t size,mach_vm_size_t * phys_size)18733 vm_map_range_physical_size(
18734 vm_map_t map,
18735 vm_map_address_t start,
18736 mach_vm_size_t size,
18737 mach_vm_size_t * phys_size)
18738 {
18739 kern_return_t kr;
18740 vm_map_copy_t copy_map, target_copy_map;
18741 vm_map_offset_t adjusted_start, adjusted_end;
18742 vm_map_size_t adjusted_size;
18743 vm_prot_t cur_prot, max_prot;
18744 vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
18745 vm_map_kernel_flags_t vmk_flags;
18746
18747 if (size == 0) {
18748 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
18749 *phys_size = 0;
18750 return KERN_SUCCESS;
18751 }
18752
18753 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
18754 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
18755 if (__improbable(os_add_overflow(start, size, &end) ||
18756 adjusted_end <= adjusted_start)) {
18757 /* wraparound */
18758 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
18759 *phys_size = 0;
18760 return KERN_INVALID_ARGUMENT;
18761 }
18762 if (__improbable(vm_map_range_overflows(map, start, size))) {
18763 *phys_size = 0;
18764 return KERN_INVALID_ADDRESS;
18765 }
18766 assert(adjusted_end > adjusted_start);
18767 adjusted_size = adjusted_end - adjusted_start;
18768 *phys_size = adjusted_size;
18769 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
18770 return KERN_SUCCESS;
18771 }
18772 if (start == 0) {
18773 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
18774 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
18775 if (__improbable(adjusted_end <= adjusted_start)) {
18776 /* wraparound */
18777 printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
18778 *phys_size = 0;
18779 return KERN_INVALID_ARGUMENT;
18780 }
18781 assert(adjusted_end > adjusted_start);
18782 adjusted_size = adjusted_end - adjusted_start;
18783 *phys_size = adjusted_size;
18784 return KERN_SUCCESS;
18785 }
18786
18787 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
18788 vmk_flags.vmkf_copy_pageable = TRUE;
18789 vmk_flags.vmkf_copy_same_map = TRUE;
18790 assert(adjusted_size != 0);
18791 cur_prot = VM_PROT_NONE; /* legacy mode */
18792 max_prot = VM_PROT_NONE; /* legacy mode */
18793 kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
18794 FALSE /* copy */,
18795 ©_map,
18796 &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
18797 vmk_flags);
18798 if (kr != KERN_SUCCESS) {
18799 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18800 //assert(0);
18801 *phys_size = 0;
18802 return kr;
18803 }
18804 assert(copy_map != VM_MAP_COPY_NULL);
18805 target_copy_map = copy_map;
18806 DEBUG4K_ADJUST("adjusting...\n");
18807 kr = vm_map_copy_adjust_to_target(
18808 copy_map,
18809 start - adjusted_start, /* offset */
18810 size, /* size */
18811 kernel_map,
18812 FALSE, /* copy */
18813 &target_copy_map,
18814 &overmap_start,
18815 &overmap_end,
18816 &trimmed_start);
18817 if (kr == KERN_SUCCESS) {
18818 if (target_copy_map->size != *phys_size) {
18819 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
18820 }
18821 *phys_size = target_copy_map->size;
18822 } else {
18823 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18824 //assert(0);
18825 *phys_size = 0;
18826 }
18827 vm_map_copy_discard(copy_map);
18828 copy_map = VM_MAP_COPY_NULL;
18829
18830 return kr;
18831 }
18832
18833 static inline kern_return_t
vm_map_remap_sanitize(vm_map_t src_map,vm_map_t target_map,vm_map_address_ut address_u,vm_map_size_ut size_u,vm_map_offset_ut mask_u,vm_map_offset_ut memory_address_u,vm_prot_ut cur_protection_u,vm_prot_ut max_protection_u,vm_inherit_ut inheritance_u,vm_map_kernel_flags_t vmk_flags,vm_map_address_t * target_addr,vm_map_address_t * mask,vm_map_offset_t * memory_address,vm_map_offset_t * memory_end,vm_map_size_t * memory_size,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t * inheritance)18834 vm_map_remap_sanitize(
18835 vm_map_t src_map,
18836 vm_map_t target_map,
18837 vm_map_address_ut address_u,
18838 vm_map_size_ut size_u,
18839 vm_map_offset_ut mask_u,
18840 vm_map_offset_ut memory_address_u,
18841 vm_prot_ut cur_protection_u,
18842 vm_prot_ut max_protection_u,
18843 vm_inherit_ut inheritance_u,
18844 vm_map_kernel_flags_t vmk_flags,
18845 vm_map_address_t *target_addr,
18846 vm_map_address_t *mask,
18847 vm_map_offset_t *memory_address,
18848 vm_map_offset_t *memory_end,
18849 vm_map_size_t *memory_size,
18850 vm_prot_t *cur_protection,
18851 vm_prot_t *max_protection,
18852 vm_inherit_t *inheritance)
18853 {
18854 kern_return_t result;
18855 vm_sanitize_flags_t vm_sanitize_flags;
18856
18857 result = vm_sanitize_inherit(inheritance_u, VM_SANITIZE_CALLER_VM_MAP_REMAP,
18858 inheritance);
18859 if (__improbable(result != KERN_SUCCESS)) {
18860 return result;
18861 }
18862
18863 result = vm_sanitize_cur_and_max_prots(cur_protection_u, max_protection_u,
18864 VM_SANITIZE_CALLER_VM_MAP_REMAP, target_map,
18865 cur_protection, max_protection);
18866 if (__improbable(result != KERN_SUCCESS)) {
18867 return result;
18868 }
18869
18870 result = vm_sanitize_mask(mask_u, VM_SANITIZE_CALLER_VM_MAP_REMAP, mask);
18871 if (__improbable(result != KERN_SUCCESS)) {
18872 return result;
18873 }
18874
18875 /*
18876 * If the user is requesting that we return the address of the
18877 * first byte of the data (rather than the base of the page),
18878 * then we use different rounding semantics: specifically,
18879 * we assume that (memory_address, size) describes a region
18880 * all of whose pages we must cover, rather than a base to be truncated
18881 * down and a size to be added to that base. So we figure out
18882 * the highest page that the requested region includes and make
18883 * sure that the size will cover it.
18884 *
18885 * The key example we're worried about it is of the form:
18886 *
18887 * memory_address = 0x1ff0, size = 0x20
18888 *
18889 * With the old semantics, we round down the memory_address to 0x1000
18890 * and round up the size to 0x1000, resulting in our covering *only*
18891 * page 0x1000. With the new semantics, we'd realize that the region covers
18892 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
18893 * 0x1000 and page 0x2000 in the region we remap.
18894 *
18895 * VM_SANITIZE_FLAGS_REALIGN_START asks for the old (broken) semantics.
18896 */
18897 vm_sanitize_flags = VM_SANITIZE_FLAGS_SIZE_ZERO_FAILS;
18898 if (!vmk_flags.vmf_return_data_addr) {
18899 vm_sanitize_flags |= VM_SANITIZE_FLAGS_REALIGN_START;
18900 }
18901
18902 result = vm_sanitize_addr_size(memory_address_u, size_u,
18903 VM_SANITIZE_CALLER_VM_MAP_REMAP, src_map,
18904 vm_sanitize_flags, memory_address, memory_end,
18905 memory_size);
18906 if (__improbable(result != KERN_SUCCESS)) {
18907 return result;
18908 }
18909
18910 *target_addr = vm_sanitize_addr(target_map, address_u);
18911 return KERN_SUCCESS;
18912 }
18913
18914 /*
18915 * Routine: vm_remap
18916 *
18917 * Map portion of a task's address space.
18918 * Mapped region must not overlap more than
18919 * one vm memory object. Protections and
18920 * inheritance attributes remain the same
18921 * as in the original task and are out parameters.
18922 * Source and Target task can be identical
18923 * Other attributes are identical as for vm_map()
18924 */
18925 kern_return_t
vm_map_remap(vm_map_t target_map,vm_map_address_ut * address_u,vm_map_size_ut size_u,vm_map_offset_ut mask_u,vm_map_kernel_flags_t vmk_flags,vm_map_t src_map,vm_map_offset_ut memory_address_u,boolean_t copy,vm_prot_ut * cur_protection_u,vm_prot_ut * max_protection_u,vm_inherit_ut inheritance_u)18926 vm_map_remap(
18927 vm_map_t target_map,
18928 vm_map_address_ut *address_u,
18929 vm_map_size_ut size_u,
18930 vm_map_offset_ut mask_u,
18931 vm_map_kernel_flags_t vmk_flags,
18932 vm_map_t src_map,
18933 vm_map_offset_ut memory_address_u,
18934 boolean_t copy,
18935 vm_prot_ut *cur_protection_u, /* IN/OUT */
18936 vm_prot_ut *max_protection_u, /* IN/OUT */
18937 vm_inherit_ut inheritance_u)
18938 {
18939 vm_map_address_t target_addr, mask;
18940 vm_map_size_t target_size;
18941 vm_map_offset_t memory_address, memory_end;
18942 vm_map_size_t memory_size;
18943 vm_prot_t cur_protection, max_protection;
18944 vm_inherit_t inheritance;
18945 kern_return_t result;
18946 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
18947 vm_map_copy_t copy_map;
18948 vm_map_offset_t offset_in_mapping;
18949 vm_map_size_t src_page_mask, target_page_mask;
18950 vm_map_size_t initial_size;
18951 VM_MAP_ZAP_DECLARE(zap_list);
18952
18953 if (target_map == VM_MAP_NULL || src_map == VM_MAP_NULL) {
18954 return KERN_INVALID_ARGUMENT;
18955 }
18956 src_page_mask = VM_MAP_PAGE_MASK(src_map);
18957 target_page_mask = VM_MAP_PAGE_MASK(target_map);
18958
18959 if (src_page_mask != target_page_mask) {
18960 if (copy) {
18961 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18962 } else {
18963 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), VM_SANITIZE_UNSAFE_UNWRAP(memory_address_u), VM_SANITIZE_UNSAFE_UNWRAP(size_u), copy, target_map, VM_MAP_PAGE_SIZE(target_map));
18964 }
18965 }
18966
18967 /*
18968 * Sanitize any input parameters that are addr/size/prot/inherit
18969 */
18970 result = vm_map_remap_sanitize(src_map,
18971 target_map,
18972 *address_u,
18973 size_u,
18974 mask_u,
18975 memory_address_u,
18976 *cur_protection_u,
18977 *max_protection_u,
18978 inheritance_u,
18979 vmk_flags,
18980 &target_addr,
18981 &mask,
18982 &memory_address,
18983 &memory_end,
18984 &memory_size,
18985 &cur_protection,
18986 &max_protection,
18987 &inheritance);
18988 if (__improbable(result != KERN_SUCCESS)) {
18989 return vm_sanitize_get_kr(result);
18990 }
18991
18992 if (vmk_flags.vmf_return_data_addr) {
18993 /*
18994 * This is safe to unwrap now that the quantities
18995 * have been validated and rounded up normally.
18996 */
18997 offset_in_mapping = vm_sanitize_offset_in_page(src_map,
18998 memory_address_u);
18999 initial_size = VM_SANITIZE_UNSAFE_UNWRAP(size_u);
19000 } else {
19001 /*
19002 * IMPORTANT:
19003 * This legacy code path is broken: for the range mentioned
19004 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
19005 * two 4k pages, it yields [ memory_address = 0x1000,
19006 * size = 0x1000 ], which covers only the first 4k page.
19007 * BUT some code unfortunately depends on this bug, so we
19008 * can't fix it without breaking something.
19009 * New code should get automatically opted in the new
19010 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
19011 */
19012 offset_in_mapping = 0;
19013 initial_size = memory_size;
19014 }
19015
19016 if (vmk_flags.vmf_resilient_media) {
19017 /* must be copy-on-write to be "media resilient" */
19018 if (!copy) {
19019 return KERN_INVALID_ARGUMENT;
19020 }
19021 }
19022
19023 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19024 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19025
19026 assert(memory_size != 0);
19027 result = vm_map_copy_extract(src_map,
19028 memory_address,
19029 memory_size,
19030 copy, ©_map,
19031 &cur_protection, /* IN/OUT */
19032 &max_protection, /* IN/OUT */
19033 inheritance,
19034 vmk_flags);
19035 if (result != KERN_SUCCESS) {
19036 return result;
19037 }
19038 assert(copy_map != VM_MAP_COPY_NULL);
19039
19040 /*
19041 * Handle the policy for vm map ranges
19042 *
19043 * If the maps differ, the target_map policy applies like for vm_map()
19044 * For same mapping remaps, we preserve the range.
19045 */
19046 if (vmk_flags.vmkf_copy_same_map) {
19047 vmk_flags.vmkf_range_id = copy_map->orig_range;
19048 } else {
19049 vm_map_kernel_flags_update_range_id(&vmk_flags, target_map, memory_size);
19050 }
19051
19052 target_size = memory_size;
19053 if (src_page_mask != target_page_mask) {
19054 vm_map_copy_t target_copy_map;
19055 vm_map_offset_t overmap_start = 0;
19056 vm_map_offset_t overmap_end = 0;
19057 vm_map_offset_t trimmed_start = 0;
19058
19059 target_copy_map = copy_map; /* can modify "copy_map" itself */
19060 DEBUG4K_ADJUST("adjusting...\n");
19061 result = vm_map_copy_adjust_to_target(
19062 copy_map,
19063 offset_in_mapping, /* offset */
19064 initial_size,
19065 target_map,
19066 copy,
19067 &target_copy_map,
19068 &overmap_start,
19069 &overmap_end,
19070 &trimmed_start);
19071 if (result != KERN_SUCCESS) {
19072 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
19073 vm_map_copy_discard(copy_map);
19074 return result;
19075 }
19076 if (trimmed_start == 0) {
19077 /* nothing trimmed: no adjustment needed */
19078 } else if (trimmed_start >= offset_in_mapping) {
19079 /* trimmed more than offset_in_mapping: nothing left */
19080 assert(overmap_start == 0);
19081 assert(overmap_end == 0);
19082 offset_in_mapping = 0;
19083 } else {
19084 /* trimmed some of offset_in_mapping: adjust */
19085 assert(overmap_start == 0);
19086 assert(overmap_end == 0);
19087 offset_in_mapping -= trimmed_start;
19088 }
19089 offset_in_mapping += overmap_start;
19090 target_size = target_copy_map->size;
19091 }
19092
19093 /*
19094 * Allocate/check a range of free virtual address
19095 * space for the target
19096 */
19097 target_size = vm_map_round_page(target_size, target_page_mask);
19098
19099 if (target_size == 0) {
19100 vm_map_copy_discard(copy_map);
19101 return KERN_INVALID_ARGUMENT;
19102 }
19103
19104 vm_map_lock(target_map);
19105
19106 if (!vmk_flags.vmf_fixed) {
19107 result = vm_map_locate_space_anywhere(target_map, target_size,
19108 mask, vmk_flags, &target_addr, &insp_entry);
19109 } else {
19110 /*
19111 * vm_map_locate_space_fixed will reject overflowing
19112 * target_addr + target_size values
19113 */
19114 result = vm_map_locate_space_fixed(target_map, target_addr,
19115 target_size, mask, vmk_flags, &insp_entry, &zap_list);
19116
19117 if (result == KERN_MEMORY_PRESENT) {
19118 assert(!vmk_flags.vmkf_already);
19119 insp_entry = VM_MAP_ENTRY_NULL;
19120 result = KERN_NO_SPACE;
19121 }
19122 }
19123
19124 if (result == KERN_SUCCESS) {
19125 while (vm_map_copy_first_entry(copy_map) !=
19126 vm_map_copy_to_entry(copy_map)) {
19127 vm_map_entry_t entry = vm_map_copy_first_entry(copy_map);
19128
19129 vm_map_copy_entry_unlink(copy_map, entry);
19130
19131 if (vmk_flags.vmkf_remap_prot_copy) {
19132 /*
19133 * This vm_map_remap() is for a
19134 * vm_protect(VM_PROT_COPY), so the caller
19135 * expects to be allowed to add write access
19136 * to this new mapping. This is done by
19137 * adding VM_PROT_WRITE to each entry's
19138 * max_protection... unless some security
19139 * settings disallow it.
19140 */
19141 bool allow_write = false;
19142 if (entry->vme_permanent) {
19143 /* immutable mapping... */
19144 if ((entry->max_protection & VM_PROT_EXECUTE) &&
19145 developer_mode_state()) {
19146 /*
19147 * ... but executable and
19148 * possibly being debugged,
19149 * so let's allow it to become
19150 * writable, for breakpoints
19151 * and dtrace probes, for
19152 * example.
19153 */
19154 allow_write = true;
19155 } else {
19156 printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
19157 proc_selfpid(),
19158 (get_bsdtask_info(current_task())
19159 ? proc_name_address(get_bsdtask_info(current_task()))
19160 : "?"),
19161 (uint64_t)memory_address,
19162 (uint64_t)memory_size,
19163 entry->protection,
19164 entry->max_protection,
19165 developer_mode_state());
19166 DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19167 vm_map_entry_t, entry,
19168 vm_map_offset_t, entry->vme_start,
19169 vm_map_offset_t, entry->vme_end,
19170 vm_prot_t, entry->protection,
19171 vm_prot_t, entry->max_protection,
19172 int, VME_ALIAS(entry));
19173 }
19174 } else {
19175 allow_write = true;
19176 }
19177
19178 /*
19179 * VM_PROT_COPY: allow this mapping to become
19180 * writable, unless it was "permanent".
19181 */
19182 if (allow_write) {
19183 entry->max_protection |= VM_PROT_WRITE;
19184 }
19185 }
19186 if (vmk_flags.vmf_resilient_codesign) {
19187 /* no codesigning -> read-only access */
19188 entry->max_protection = VM_PROT_READ;
19189 entry->protection = VM_PROT_READ;
19190 entry->vme_resilient_codesign = TRUE;
19191 }
19192 entry->vme_start += target_addr;
19193 entry->vme_end += target_addr;
19194 assert(!entry->map_aligned);
19195 if (vmk_flags.vmf_resilient_media &&
19196 !entry->is_sub_map &&
19197 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19198 VME_OBJECT(entry)->internal)) {
19199 entry->vme_resilient_media = TRUE;
19200 }
19201 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19202 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19203 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
19204 vm_map_store_entry_link(target_map, insp_entry, entry,
19205 vmk_flags);
19206 insp_entry = entry;
19207 }
19208 }
19209
19210 if (vmk_flags.vmf_resilient_codesign) {
19211 cur_protection = VM_PROT_READ;
19212 max_protection = VM_PROT_READ;
19213 }
19214
19215 if (result == KERN_SUCCESS) {
19216 target_map->size += target_size;
19217 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19218 }
19219 vm_map_unlock(target_map);
19220
19221 vm_map_zap_dispose(&zap_list);
19222
19223 if (result == KERN_SUCCESS && target_map->wiring_required) {
19224 result = vm_map_wire_nested(target_map, target_addr,
19225 target_addr + target_size, cur_protection, VM_KERN_MEMORY_MLOCK,
19226 TRUE, PMAP_NULL, 0, NULL);
19227 }
19228
19229 if (result == KERN_SUCCESS) {
19230 #if KASAN
19231 if (target_map->pmap == kernel_pmap) {
19232 kasan_notify_address(target_addr, target_size);
19233 }
19234 #endif
19235 /*
19236 * If requested, return the address of the data pointed to by the
19237 * request, rather than the base of the resulting page.
19238 */
19239 if (vmk_flags.vmf_return_data_addr) {
19240 target_addr += offset_in_mapping;
19241 }
19242
19243 /*
19244 * Update OUT parameters.
19245 */
19246 *address_u = vm_sanitize_wrap_addr(target_addr);
19247
19248 *cur_protection_u = vm_sanitize_wrap_prot(cur_protection);
19249 *max_protection_u = vm_sanitize_wrap_prot(max_protection);
19250 }
19251
19252 if (src_page_mask != target_page_mask) {
19253 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)target_size, copy, target_map, (uint64_t)target_addr, (uint64_t)offset_in_mapping, result);
19254 }
19255 vm_map_copy_discard(copy_map);
19256 copy_map = VM_MAP_COPY_NULL;
19257
19258 return result;
19259 }
19260
19261 /*
19262 * vm_map_switch:
19263 *
19264 * Set the address map for the current thread to the specified map
19265 */
19266
19267 vm_map_t
vm_map_switch(vm_map_t map)19268 vm_map_switch(
19269 vm_map_t map)
19270 {
19271 thread_t thread = current_thread();
19272 vm_map_t oldmap = thread->map;
19273
19274
19275 /*
19276 * Deactivate the current map and activate the requested map
19277 */
19278 mp_disable_preemption();
19279 PMAP_SWITCH_USER(thread, map, cpu_number());
19280 mp_enable_preemption();
19281 return oldmap;
19282 }
19283
/*
 * Common sanitization for vm_map_write_user() / vm_map_read_user():
 * validate the untrusted user address and size against "map" and
 * produce the checked start address, end address and size.
 * A zero size falls through as success, and the returned values are
 * deliberately left unaligned (the copyin/copyout callers operate on
 * byte granularity).
 */
static inline kern_return_t
vm_map_rw_user_sanitize(
	vm_map_t                map,
	vm_map_address_ut       addr_u,
	vm_size_ut              size_u,
	vm_sanitize_caller_t    vm_sanitize_caller,
	vm_map_address_t        *addr,
	vm_map_address_t        *end,
	vm_map_size_t           *size)
{
	return vm_sanitize_addr_size(addr_u, size_u,
	           vm_sanitize_caller, map,
	           VM_SANITIZE_FLAGS_SIZE_ZERO_FALLTHROUGH | VM_SANITIZE_FLAGS_GET_UNALIGNED_VALUES,
	           addr, end, size);
}
19299
19300 /*
19301 * Routine: vm_map_write_user
19302 *
19303 * Description:
19304 * Copy out data from a kernel space into space in the
19305 * destination map. The space must already exist in the
19306 * destination map.
19307 * NOTE: This routine should only be called by threads
19308 * which can block on a page fault. i.e. kernel mode user
19309 * threads.
19310 *
19311 */
19312 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_ut dst_addr_u,vm_size_ut size_u)19313 vm_map_write_user(
19314 vm_map_t map,
19315 void *src_p,
19316 vm_map_address_ut dst_addr_u,
19317 vm_size_ut size_u)
19318 {
19319 kern_return_t kr;
19320 vm_map_address_t dst_addr, dst_end;
19321 vm_map_size_t size;
19322
19323 /*
19324 * src_p isn't validated: [src_p, src_p + size_u)
19325 * is trusted kernel input.
19326 *
19327 * dst_addr_u and size_u are untrusted and need to be sanitized.
19328 */
19329 kr = vm_map_rw_user_sanitize(map,
19330 dst_addr_u,
19331 size_u,
19332 VM_SANITIZE_CALLER_VM_MAP_WRITE_USER,
19333 &dst_addr,
19334 &dst_end,
19335 &size);
19336 if (__improbable(kr != KERN_SUCCESS)) {
19337 return vm_sanitize_get_kr(kr);
19338 }
19339
19340 if (current_map() == map) {
19341 if (copyout(src_p, dst_addr, size)) {
19342 kr = KERN_INVALID_ADDRESS;
19343 }
19344 } else {
19345 vm_map_t oldmap;
19346
19347 /* take on the identity of the target map while doing */
19348 /* the transfer */
19349
19350 vm_map_reference(map);
19351 oldmap = vm_map_switch(map);
19352 if (copyout(src_p, dst_addr, size)) {
19353 kr = KERN_INVALID_ADDRESS;
19354 }
19355 vm_map_switch(oldmap);
19356 vm_map_deallocate(map);
19357 }
19358 return kr;
19359 }
19360
19361 /*
19362 * Routine: vm_map_read_user
19363 *
19364 * Description:
19365 * Copy in data from a user space source map into the
19366 * kernel map. The space must already exist in the
19367 * kernel map.
19368 * NOTE: This routine should only be called by threads
19369 * which can block on a page fault. i.e. kernel mode user
19370 * threads.
19371 *
19372 */
19373 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_ut src_addr_u,void * dst_p,vm_size_ut size_u)19374 vm_map_read_user(
19375 vm_map_t map,
19376 vm_map_address_ut src_addr_u,
19377 void *dst_p,
19378 vm_size_ut size_u)
19379 {
19380 kern_return_t kr;
19381 vm_map_address_t src_addr, src_end;
19382 vm_map_size_t size;
19383
19384 /*
19385 * dst_p isn't validated: [dst_p, dst_p + size_u)
19386 * is trusted kernel input.
19387 *
19388 * src_addr_u and size_u are untrusted and need to be sanitized.
19389 */
19390 kr = vm_map_rw_user_sanitize(map,
19391 src_addr_u,
19392 size_u,
19393 VM_SANITIZE_CALLER_VM_MAP_READ_USER,
19394 &src_addr,
19395 &src_end,
19396 &size);
19397 if (__improbable(kr != KERN_SUCCESS)) {
19398 return vm_sanitize_get_kr(kr);
19399 }
19400
19401 if (current_map() == map) {
19402 if (copyin(src_addr, dst_p, size)) {
19403 kr = KERN_INVALID_ADDRESS;
19404 }
19405 } else {
19406 vm_map_t oldmap;
19407
19408 /* take on the identity of the target map while doing */
19409 /* the transfer */
19410
19411 vm_map_reference(map);
19412 oldmap = vm_map_switch(map);
19413 if (copyin(src_addr, dst_p, size)) {
19414 kr = KERN_INVALID_ADDRESS;
19415 }
19416 vm_map_switch(oldmap);
19417 vm_map_deallocate(map);
19418 }
19419 return kr;
19420 }
19421
19422
19423 /*
19424 * vm_map_check_protection:
19425 *
19426 * Assert that the target map allows the specified
19427 * privilege on the entire address region given.
19428 * The entire region must be allocated.
19429 */
19430 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t protection)19431 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
19432 vm_map_offset_t end, vm_prot_t protection)
19433 {
19434 vm_map_entry_t entry;
19435 vm_map_entry_t tmp_entry;
19436
19437 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
19438 return FALSE;
19439 }
19440
19441 vm_map_lock(map);
19442
19443 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
19444 vm_map_unlock(map);
19445 return FALSE;
19446 }
19447
19448 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19449 vm_map_unlock(map);
19450 return FALSE;
19451 }
19452
19453 entry = tmp_entry;
19454
19455 while (start < end) {
19456 if (entry == vm_map_to_entry(map)) {
19457 vm_map_unlock(map);
19458 return FALSE;
19459 }
19460
19461 /*
19462 * No holes allowed!
19463 */
19464
19465 if (start < entry->vme_start) {
19466 vm_map_unlock(map);
19467 return FALSE;
19468 }
19469
19470 /*
19471 * Check protection associated with entry.
19472 */
19473
19474 if ((entry->protection & protection) != protection) {
19475 vm_map_unlock(map);
19476 return FALSE;
19477 }
19478
19479 /* go to next entry */
19480
19481 start = entry->vme_end;
19482 entry = entry->vme_next;
19483 }
19484 vm_map_unlock(map);
19485 return TRUE;
19486 }
19487
/*
 * vm_map_purgable_control:
 *
 * Get, set, or purge the "purgable" state of the VM object backing the
 * mapping at "address" in "map".  VM_PURGABLE_PURGE_ALL purges every
 * volatile purgeable object system-wide and ignores "address"/"state".
 *
 * Returns:
 *	KERN_INVALID_ARGUMENT	bad map/control/state word, or mapping is
 *				not backed by a purgeable object
 *	KERN_INVALID_ADDRESS	no entry at "address", or it is a submap
 *	KERN_PROTECTION_FAILURE	attempt to change the state through a
 *				non-writable mapping
 *	otherwise		result of vm_object_purgable_control()
 */
kern_return_t
vm_map_purgable_control(
	vm_map_t                map,
	vm_map_offset_t         address,
	vm_purgable_t           control,
	int                     *state)
{
	vm_map_entry_t          entry;
	vm_object_t             object;
	kern_return_t           kr;
	boolean_t               was_nonvolatile;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlaying object. Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control != VM_PURGABLE_SET_STATE &&
	    control != VM_PURGABLE_GET_STATE &&
	    control != VM_PURGABLE_PURGE_ALL &&
	    control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (control == VM_PURGABLE_PURGE_ALL) {
		/* system-wide purge: no lookup needed */
		vm_purgeable_object_purge_all();
		return KERN_SUCCESS;
	}

	/*
	 * For SET_STATE requests, reject state words with bits outside
	 * the defined masks.
	 * NOTE(review): the second sub-test can never be true -- a value
	 * masked with VM_PURGABLE_STATE_MASK cannot exceed that mask;
	 * kept as-is to preserve behavior.
	 */
	if ((control == VM_PURGABLE_SET_STATE ||
	    control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
	    (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
	    ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->protection & VM_PROT_WRITE) == 0 &&
	    control != VM_PURGABLE_GET_STATE) {
		/*
		 * Can't apply purgable controls to something you can't write.
		 */
		vm_map_unlock_read(map);
		return KERN_PROTECTION_FAILURE;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL ||
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Object must already be present and be purgeable.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	vm_object_lock(object);

#if 00
	if (VME_OFFSET(entry) != 0 ||
	    entry->vme_end - entry->vme_start != object->vo_size) {
		/*
		 * Can only apply purgable controls to the whole (existing)
		 * object at once.
		 */
		vm_map_unlock_read(map);
		vm_object_unlock(object);
		return KERN_INVALID_ARGUMENT;
	}
#endif

	assert(!entry->is_sub_map);
	assert(!entry->use_pmap); /* purgeable has its own accounting */

	/* the object lock is enough to keep the object stable from here */
	vm_map_unlock_read(map);

	was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);

	kr = vm_object_purgable_control(object, control, state);

	/*
	 * Debug builds record the kernel as the "volatilizer" when a
	 * kernel-pmap mapping transitions away from non-volatile.
	 */
	if (was_nonvolatile &&
	    object->purgable != VM_PURGABLE_NONVOLATILE &&
	    map->pmap == kernel_pmap) {
#if DEBUG
		object->vo_purgeable_volatilizer = kernel_task;
#endif /* DEBUG */
	}

	vm_object_unlock(object);

	return kr;
}
19592
/*
 * vm_map_footprint_query_page_info:
 *
 * Compute the "footprint" disposition (VM_PAGE_QUERY_PAGE_* bits) of
 * the page at "curr_s_offset" within "map_entry" of "map", combining
 * the live pmap's view with the ownership/accounting attributes of the
 * backing object.  The result is stored through "disposition_p".
 *
 * Caller must hold the map lock; the map must not be using corpse
 * footprint data (that case is handled by
 * vm_map_corpse_footprint_query_page_info() instead).
 */
void
vm_map_footprint_query_page_info(
	vm_map_t        map,
	vm_map_entry_t  map_entry,
	vm_map_offset_t curr_s_offset,
	int             *disposition_p)
{
	int             pmap_disp;
	vm_object_t     object = VM_OBJECT_NULL;
	int             disposition;
	int             effective_page_size;

	vm_map_lock_assert_held(map);
	assert(!map->has_corpse_footprint);
	assert(curr_s_offset >= map_entry->vme_start);
	assert(curr_s_offset < map_entry->vme_end);

	if (map_entry->is_sub_map) {
		if (!map_entry->use_pmap) {
			/* nested pmap: no footprint */
			*disposition_p = 0;
			return;
		}
	} else {
		object = VME_OBJECT(map_entry);
		if (object == VM_OBJECT_NULL) {
			/* nothing mapped here: no need to ask */
			*disposition_p = 0;
			return;
		}
	}

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

	pmap_disp = 0;

	/*
	 * Query the pmap.
	 */
	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);

	/*
	 * Compute this page's disposition.
	 */
	disposition = 0;

	/* deal with "alternate accounting" first */
	if (!map_entry->is_sub_map &&
	    object->vo_no_footprint) {
		/* does not count in footprint */
		// assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
	} else if (!map_entry->is_sub_map &&
	    !object->internal &&
	    object->vo_ledger_tag &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		/* owned external object: wired pages count in footprint */
		assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		/* index of this page within the object, in effective pages */
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    object->wired_page_count)) {
			/*
			 * External object owned by this task: report the first
			 * "#wired" pages as "resident" (to show that they
			 * contribute to the footprint) but not "dirty"
			 * (to avoid double-counting with the fake "owned"
			 * region we'll report at the end of the address space
			 * to account for all (mapped or not) owned memory
			 * owned by this task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    object->internal &&
	    (object->purgable == VM_PURGABLE_NONVOLATILE ||
	    (object->purgable == VM_PURGABLE_DENY &&
	    object->vo_ledger_tag)) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    (object->resident_page_count +
		    vm_compressor_pager_get_count(object->pager)))) {
			/*
			 * Non-volatile purgeable object owned
			 * by this task: report the first
			 * "#resident + #compressed" pages as
			 * "resident" (to show that they
			 * contribute to the footprint) but not
			 * "dirty" (to avoid double-counting
			 * with the fake "non-volatile" region
			 * we'll report at the end of the
			 * address space to account for all
			 * (mapped or not) non-volatile memory
			 * owned by this task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    object->internal &&
	    (object->purgable == VM_PURGABLE_VOLATILE ||
	    object->purgable == VM_PURGABLE_EMPTY) &&
	    VM_OBJECT_OWNER(object) != NULL &&
	    VM_OBJECT_OWNER(object)->map == map) {
		/*
		 * NOTE(review): this inner test is always true -- the
		 * enclosing branch already requires object->internal.
		 */
		if (object->internal) {
			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		}
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    object->wired_page_count)) {
			/*
			 * Volatile|empty purgeable object owned
			 * by this task: report the first
			 * "#wired" pages as "resident" (to
			 * show that they contribute to the
			 * footprint) but not "dirty" (to avoid
			 * double-counting with the fake
			 * "non-volatile" region we'll report
			 * at the end of the address space to
			 * account for all (mapped or not)
			 * non-volatile memory owned by this
			 * task.
			 */
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		}
	} else if (!map_entry->is_sub_map &&
	    map_entry->iokit_acct &&
	    object->internal &&
	    object->purgable == VM_PURGABLE_DENY) {
		/*
		 * Non-purgeable IOKit memory: phys_footprint
		 * includes the entire virtual mapping.
		 */
		assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
		disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
	} else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
	    PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
		/* alternate accounting */
#if __arm64__ && (DEVELOPMENT || DEBUG)
		if (map->pmap->footprint_was_suspended) {
			/*
			 * The assertion below can fail if dyld
			 * suspended footprint accounting
			 * while doing some adjustments to
			 * this page; the mapping would say
			 * "use pmap accounting" but the page
			 * would be marked "alternate
			 * accounting".
			 */
		} else
#endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
		{
			assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		}
		disposition = 0;
	} else {
		/* normal pmap accounting: translate the pmap's answer */
		if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
			disposition |= VM_PAGE_QUERY_PAGE_REF;
			if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
			} else {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}
			if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
				disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
			}
		} else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
			assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
			disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
		}
	}

	*disposition_p = disposition;
}
19777
19778 kern_return_t
vm_map_page_query_internal(vm_map_t target_map,vm_map_offset_t offset,int * disposition,int * ref_count)19779 vm_map_page_query_internal(
19780 vm_map_t target_map,
19781 vm_map_offset_t offset,
19782 int *disposition,
19783 int *ref_count)
19784 {
19785 kern_return_t kr;
19786 vm_page_info_basic_data_t info;
19787 mach_msg_type_number_t count;
19788
19789 count = VM_PAGE_INFO_BASIC_COUNT;
19790 kr = vm_map_page_info(target_map,
19791 offset,
19792 VM_PAGE_INFO_BASIC,
19793 (vm_page_info_t) &info,
19794 &count);
19795 if (kr == KERN_SUCCESS) {
19796 *disposition = info.disposition;
19797 *ref_count = info.ref_count;
19798 } else {
19799 *disposition = 0;
19800 *ref_count = 0;
19801 }
19802
19803 return kr;
19804 }
19805
/*
 * vm_map_page_info:
 *
 * Return page info for the single page containing "offset" in "map".
 * Thin wrapper around vm_map_page_range_info_internal() for a
 * one-page range, using the map's native page size.
 */
kern_return_t
vm_map_page_info(
	vm_map_t                map,
	vm_map_offset_t         offset,
	vm_page_info_flavor_t   flavor,
	vm_page_info_t          info,
	mach_msg_type_number_t  *count)
{
	return vm_map_page_range_info_internal(map,
	           offset, /* start of range */
	           (offset + 1), /* this will get rounded in the call to the page boundary */
	           (int)-1, /* effective_page_shift: unspecified */
	           flavor,
	           info,
	           count);
}
19822
/*
 * vm_map_page_range_info_internal:
 *
 * Fill "info" with one vm_page_info record per effective page of the
 * range [start_offset, end_offset) in "map", recursing into submaps.
 * "effective_page_shift" selects the reporting granularity; -1 means
 * "derive it from the map/task".  Only the VM_PAGE_INFO_BASIC flavor
 * is supported; "*count" must be VM_PAGE_INFO_BASIC_COUNT (or one
 * less, for binary compatibility).
 *
 * For tasks inspecting their own "footprint" region, pages beyond the
 * last map entry and pages within owned/purgeable objects are reported
 * with synthesized dispositions instead of real object/pmap state.
 */
kern_return_t
vm_map_page_range_info_internal(
	vm_map_t                map,
	vm_map_offset_t         start_offset,
	vm_map_offset_t         end_offset,
	int                     effective_page_shift,
	vm_page_info_flavor_t   flavor,
	vm_page_info_t          info,
	mach_msg_type_number_t  *count)
{
	vm_map_entry_t          map_entry = VM_MAP_ENTRY_NULL;
	vm_object_t             object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
	vm_page_t               m = VM_PAGE_NULL;
	kern_return_t           retval = KERN_SUCCESS;
	int                     disposition = 0;
	int                     ref_count = 0;
	int                     depth = 0, info_idx = 0;
	vm_page_info_basic_t    basic_info = 0;
	vm_map_offset_t         offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
	vm_map_offset_t         start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
	boolean_t               do_region_footprint;
	ledger_amount_t         ledger_resident, ledger_compressed;
	int                     effective_page_size;
	vm_map_offset_t         effective_page_mask;

	switch (flavor) {
	case VM_PAGE_INFO_BASIC:
		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
			/*
			 * The "vm_page_info_basic_data" structure was not
			 * properly padded, so allow the size to be off by
			 * one to maintain backwards binary compatibility...
			 */
			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
				return KERN_INVALID_ARGUMENT;
			}
		}
		break;
	default:
		return KERN_INVALID_ARGUMENT;
	}

	if (effective_page_shift == -1) {
		/* caller didn't specify a granularity: use the map's own */
		effective_page_shift = vm_self_region_page_shift_safely(map);
		if (effective_page_shift == -1) {
			return KERN_INVALID_ARGUMENT;
		}
	}
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	do_region_footprint = task_self_region_footprint();
	disposition = 0;
	ref_count = 0;
	depth = 0;
	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
	retval = KERN_SUCCESS;

	if (__improbable(vm_map_range_overflows(map, start_offset, end_offset - start_offset))) {
		return KERN_INVALID_ADDRESS;
	}

	/* remember the sub-page offset so the first record can report it */
	offset_in_page = start_offset & effective_page_mask;
	start = vm_map_trunc_page(start_offset, effective_page_mask);
	end = vm_map_round_page(end_offset, effective_page_mask);

	if (end < start) {
		return KERN_INVALID_ARGUMENT;
	}

	assert((end - start) <= MAX_PAGE_RANGE_QUERY);

	vm_map_lock_read(map);

	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);

	for (curr_s_offset = start; curr_s_offset < end;) {
		/*
		 * New lookup needs reset of these variables.
		 */
		curr_object = object = VM_OBJECT_NULL;
		offset_in_object = 0;
		ref_count = 0;
		depth = 0;

		if (do_region_footprint &&
		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
			/*
			 * Request for "footprint" info about a page beyond
			 * the end of address space: this must be for
			 * the fake region vm_map_region_recurse_64()
			 * reported to account for non-volatile purgeable
			 * memory owned by this task.
			 */
			disposition = 0;

			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
			    (unsigned) ledger_compressed) {
				/*
				 * We haven't reported all the "non-volatile
				 * compressed" pages yet, so report this fake
				 * page as "compressed".
				 */
				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
			} else {
				/*
				 * We've reported all the non-volatile
				 * compressed page but not all the non-volatile
				 * pages , so report this fake page as
				 * "resident dirty".
				 */
				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
				disposition |= VM_PAGE_QUERY_PAGE_REF;
			}
			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = 1;
				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				basic_info->offset = 0;
				basic_info->depth = 0;

				info_idx++;
				break;
			}
			curr_s_offset += effective_page_size;
			continue;
		}

		/*
		 * First, find the map entry covering "curr_s_offset", going down
		 * submaps if necessary.
		 */
		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
			/* no entry -> no object -> no page */

			if (curr_s_offset < vm_map_min(map)) {
				/*
				 * Illegal address that falls below map min.
				 */
				curr_e_offset = MIN(end, vm_map_min(map));
			} else if (curr_s_offset >= vm_map_max(map)) {
				/*
				 * Illegal address that falls on/after map max.
				 */
				curr_e_offset = end;
			} else if (map_entry == vm_map_to_entry(map)) {
				/*
				 * Hit a hole.
				 */
				if (map_entry->vme_next == vm_map_to_entry(map)) {
					/*
					 * Empty map.
					 */
					curr_e_offset = MIN(map->max_offset, end);
				} else {
					/*
					 * Hole at start of the map.
					 */
					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
				}
			} else {
				if (map_entry->vme_next == vm_map_to_entry(map)) {
					/*
					 * Hole at the end of the map.
					 */
					curr_e_offset = MIN(map->max_offset, end);
				} else {
					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
				}
			}

			assert(curr_e_offset >= curr_s_offset);

			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;

			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			/* report the whole hole as zeroed records in one go */
			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));

			curr_s_offset = curr_e_offset;

			info_idx += num_pages;

			continue;
		}

		/* compute offset from this map entry's start */
		offset_in_object = curr_s_offset - map_entry->vme_start;

		/* compute offset into this map entry's object (or submap) */
		offset_in_object += VME_OFFSET(map_entry);

		if (map_entry->is_sub_map) {
			vm_map_t sub_map = VM_MAP_NULL;
			vm_page_info_t submap_info = 0;
			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;

			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;

			submap_s_offset = offset_in_object;
			submap_e_offset = submap_s_offset + range_len;

			sub_map = VME_SUBMAP(map_entry);

			/* keep the submap alive while the parent map is unlocked */
			vm_map_reference(sub_map);
			vm_map_unlock_read(map);

			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));

			retval = vm_map_page_range_info_internal(sub_map,
			    submap_s_offset,
			    submap_e_offset,
			    effective_page_shift,
			    VM_PAGE_INFO_BASIC,
			    (vm_page_info_t) submap_info,
			    count);

			assert(retval == KERN_SUCCESS);

			vm_map_lock_read(map);
			vm_map_deallocate(sub_map);

			/* Move the "info" index by the number of pages we inspected.*/
			info_idx += range_len >> effective_page_shift;

			/* Move our current offset by the size of the range we inspected.*/
			curr_s_offset += range_len;

			continue;
		}

		object = VME_OBJECT(map_entry);

		if (object == VM_OBJECT_NULL) {
			/*
			 * We don't have an object here and, hence,
			 * no pages to inspect. We'll fill up the
			 * info structure appropriately.
			 */

			curr_e_offset = MIN(map_entry->vme_end, end);

			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;

			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));

			curr_s_offset = curr_e_offset;

			info_idx += num_pages;

			continue;
		}

		if (do_region_footprint) {
			disposition = 0;
			if (map->has_corpse_footprint) {
				/*
				 * Query the page info data we saved
				 * while forking the corpse.
				 */
				vm_map_corpse_footprint_query_page_info(
					map,
					curr_s_offset,
					&disposition);
			} else {
				/*
				 * Query the live pmap for footprint info
				 * about this page.
				 */
				vm_map_footprint_query_page_info(
					map,
					map_entry,
					curr_s_offset,
					&disposition);
			}
			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = 1;
				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				basic_info->offset = 0;
				basic_info->depth = 0;

				info_idx++;
				break;
			}
			curr_s_offset += effective_page_size;
			continue;
		}

		vm_object_reference(object);
		/*
		 * Shared mode -- so we can allow other readers
		 * to grab the lock too.
		 */
		vm_object_lock_shared(object);

		curr_e_offset = MIN(map_entry->vme_end, end);

		vm_map_unlock_read(map);

		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */

		curr_object = object;

		/* walk each page of this entry's range within the object */
		for (; curr_s_offset < curr_e_offset;) {
			if (object == curr_object) {
				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
			} else {
				ref_count = curr_object->ref_count;
			}

			curr_offset_in_object = offset_in_object;

			for (;;) {
				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));

				if (m != VM_PAGE_NULL) {
					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
					break;
				} else {
					if (curr_object->internal &&
					    curr_object->alive &&
					    !curr_object->terminating &&
					    curr_object->pager_ready) {
						if (vm_object_compressor_pager_state_get(curr_object, vm_object_trunc_page(curr_offset_in_object))
						    == VM_EXTERNAL_STATE_EXISTS) {
							/* the pager has that page */
							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
							break;
						}
					}

					/*
					 * Go down the VM object shadow chain until we find the page
					 * we're looking for.
					 */

					if (curr_object->shadow != VM_OBJECT_NULL) {
						vm_object_t shadow = VM_OBJECT_NULL;

						curr_offset_in_object += curr_object->vo_shadow_offset;
						shadow = curr_object->shadow;

						/* lock the shadow before dropping its parent */
						vm_object_lock_shared(shadow);
						vm_object_unlock(curr_object);

						curr_object = shadow;
						depth++;
						continue;
					} else {
						break;
					}
				}
			}

			/* The ref_count is not strictly accurate, it measures the number */
			/* of entities holding a ref on the object, they may not be mapping */
			/* the object or may not be mapping the section holding the */
			/* target page but its still a ball park number and though an over- */
			/* count, it picks up the copy-on-write cases */

			/* We could also get a picture of page sharing from pmap_attributes */
			/* but this would under count as only faulted-in mappings would */
			/* show up. */

			if ((curr_object == object) && curr_object->shadow) {
				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
			}

			if (!curr_object->internal) {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}

			if (m != VM_PAGE_NULL) {
				if (m->vmp_fictitious) {
					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
				} else {
					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
					}

					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
						disposition |= VM_PAGE_QUERY_PAGE_REF;
					}

					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
					}

					/*
					 * XXX TODO4K:
					 * when this routine deals with 4k
					 * pages, check the appropriate CS bit
					 * here.
					 */
					if (m->vmp_cs_validated) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
					}
					if (m->vmp_cs_tainted) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
					}
					if (m->vmp_cs_nx) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
					}
					if (m->vmp_reusable || curr_object->all_reusable) {
						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
					}
				}
			}

			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = ref_count;
				basic_info->object_id = (vm_object_id_t) (uintptr_t)
				    VM_KERNEL_ADDRHASH(curr_object);
				basic_info->offset =
				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
				basic_info->depth = depth;

				info_idx++;
				break;
			}

			disposition = 0;
			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.

			/*
			 * Move to next offset in the range and in our object.
			 */
			curr_s_offset += effective_page_size;
			offset_in_object += effective_page_size;
			curr_offset_in_object = offset_in_object;

			if (curr_object != object) {
				/* restart the shadow walk from the top object */
				vm_object_unlock(curr_object);

				curr_object = object;

				vm_object_lock_shared(curr_object);
			} else {
				vm_object_lock_yield_shared(curr_object);
			}
		}

		vm_object_unlock(curr_object);
		vm_object_deallocate(curr_object);

		vm_map_lock_read(map);
	}

	vm_map_unlock_read(map);
	return retval;
}
20288
20289 /*
20290 * vm_map_msync
20291 *
20292 * Synchronises the memory range specified with its backing store
20293 * image by either flushing or cleaning the contents to the appropriate
20294 * memory manager engaging in a memory object synchronize dialog with
20295 * the manager. The client doesn't return until the manager issues
20296 * m_o_s_completed message. MIG Magically converts user task parameter
20297 * to the task's address map.
20298 *
20299 * interpretation of sync_flags
20300 * VM_SYNC_INVALIDATE - discard pages, only return precious
20301 * pages to manager.
20302 *
20303 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20304 * - discard pages, write dirty or precious
20305 * pages back to memory manager.
20306 *
20307 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20308 * - write dirty or precious pages back to
20309 * the memory manager.
20310 *
20311 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
20312 * is a hole in the region, and we would
20313 * have returned KERN_SUCCESS, return
20314 * KERN_INVALID_ADDRESS instead.
20315 *
20316 * NOTE
20317 * The memory object attributes have not yet been implemented, this
20318 * function will have to deal with the invalidate attribute
20319 *
20320 * RETURNS
20321 * KERN_INVALID_TASK Bad task parameter
20322 * KERN_INVALID_ARGUMENT both sync and async were specified.
20323 * KERN_SUCCESS The usual.
20324 * KERN_INVALID_ADDRESS There was a hole in the region.
20325 */
20326
20327 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_t address,vm_map_size_t size,vm_sync_t sync_flags)20328 vm_map_msync(
20329 vm_map_t map,
20330 vm_map_address_t address,
20331 vm_map_size_t size,
20332 vm_sync_t sync_flags)
20333 {
20334 vm_map_entry_t entry;
20335 vm_map_size_t amount_left;
20336 vm_object_offset_t offset;
20337 vm_object_offset_t start_offset, end_offset;
20338 boolean_t do_sync_req;
20339 boolean_t had_hole = FALSE;
20340 vm_map_offset_t pmap_offset;
20341
20342 if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20343 (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20344 return KERN_INVALID_ARGUMENT;
20345 }
20346
20347 if (__improbable(vm_map_range_overflows(map, address, size))) {
20348 return KERN_INVALID_ADDRESS;
20349 }
20350
20351 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20352 DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20353 }
20354
20355 /*
20356 * align address and size on page boundaries
20357 */
20358 size = (vm_map_round_page(address + size,
20359 VM_MAP_PAGE_MASK(map)) -
20360 vm_map_trunc_page(address,
20361 VM_MAP_PAGE_MASK(map)));
20362 address = vm_map_trunc_page(address,
20363 VM_MAP_PAGE_MASK(map));
20364
20365 if (map == VM_MAP_NULL) {
20366 return KERN_INVALID_TASK;
20367 }
20368
20369 if (size == 0) {
20370 return KERN_SUCCESS;
20371 }
20372
20373 amount_left = size;
20374
20375 while (amount_left > 0) {
20376 vm_object_size_t flush_size;
20377 vm_object_t object;
20378
20379 vm_map_lock(map);
20380 if (!vm_map_lookup_entry(map,
20381 address,
20382 &entry)) {
20383 vm_map_size_t skip;
20384
20385 /*
20386 * hole in the address map.
20387 */
20388 had_hole = TRUE;
20389
20390 if (sync_flags & VM_SYNC_KILLPAGES) {
20391 /*
20392 * For VM_SYNC_KILLPAGES, there should be
20393 * no holes in the range, since we couldn't
20394 * prevent someone else from allocating in
20395 * that hole and we wouldn't want to "kill"
20396 * their pages.
20397 */
20398 vm_map_unlock(map);
20399 break;
20400 }
20401
20402 /*
20403 * Check for empty map.
20404 */
20405 if (entry == vm_map_to_entry(map) &&
20406 entry->vme_next == entry) {
20407 vm_map_unlock(map);
20408 break;
20409 }
20410 /*
20411 * Check that we don't wrap and that
20412 * we have at least one real map entry.
20413 */
20414 if ((map->hdr.nentries == 0) ||
20415 (entry->vme_next->vme_start < address)) {
20416 vm_map_unlock(map);
20417 break;
20418 }
20419 /*
20420 * Move up to the next entry if needed
20421 */
20422 skip = (entry->vme_next->vme_start - address);
20423 if (skip >= amount_left) {
20424 amount_left = 0;
20425 } else {
20426 amount_left -= skip;
20427 }
20428 address = entry->vme_next->vme_start;
20429 vm_map_unlock(map);
20430 continue;
20431 }
20432
20433 offset = address - entry->vme_start;
20434 pmap_offset = address;
20435
20436 /*
20437 * do we have more to flush than is contained in this
20438 * entry ?
20439 */
20440 if (amount_left + entry->vme_start + offset > entry->vme_end) {
20441 flush_size = entry->vme_end -
20442 (entry->vme_start + offset);
20443 } else {
20444 flush_size = amount_left;
20445 }
20446 amount_left -= flush_size;
20447 address += flush_size;
20448
20449 if (entry->is_sub_map == TRUE) {
20450 vm_map_t local_map;
20451 vm_map_offset_t local_offset;
20452
20453 local_map = VME_SUBMAP(entry);
20454 local_offset = VME_OFFSET(entry);
20455 vm_map_reference(local_map);
20456 vm_map_unlock(map);
20457 if (vm_map_msync(
20458 local_map,
20459 local_offset,
20460 flush_size,
20461 sync_flags) == KERN_INVALID_ADDRESS) {
20462 had_hole = TRUE;
20463 }
20464 vm_map_deallocate(local_map);
20465 continue;
20466 }
20467 object = VME_OBJECT(entry);
20468
20469 /*
20470 * We can't sync this object if the object has not been
20471 * created yet
20472 */
20473 if (object == VM_OBJECT_NULL) {
20474 vm_map_unlock(map);
20475 continue;
20476 }
20477 offset += VME_OFFSET(entry);
20478
20479 vm_object_lock(object);
20480
20481 if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20482 int kill_pages = 0;
20483
20484 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20485 /*
20486 * This is a destructive operation and so we
20487 * err on the side of limiting the range of
20488 * the operation.
20489 */
20490 start_offset = vm_object_round_page(offset);
20491 end_offset = vm_object_trunc_page(offset + flush_size);
20492
20493 if (end_offset <= start_offset) {
20494 vm_object_unlock(object);
20495 vm_map_unlock(map);
20496 continue;
20497 }
20498
20499 pmap_offset += start_offset - offset;
20500 } else {
20501 start_offset = offset;
20502 end_offset = offset + flush_size;
20503 }
20504
20505 if (sync_flags & VM_SYNC_KILLPAGES) {
20506 if (((object->ref_count == 1) ||
20507 ((object->copy_strategy !=
20508 MEMORY_OBJECT_COPY_SYMMETRIC) &&
20509 (object->vo_copy == VM_OBJECT_NULL))) &&
20510 (object->shadow == VM_OBJECT_NULL)) {
20511 if (object->ref_count != 1) {
20512 vm_page_stats_reusable.free_shared++;
20513 }
20514 kill_pages = 1;
20515 } else {
20516 kill_pages = -1;
20517 }
20518 }
20519 if (kill_pages != -1) {
20520 vm_object_deactivate_pages(
20521 object,
20522 start_offset,
20523 (vm_object_size_t) (end_offset - start_offset),
20524 kill_pages,
20525 FALSE, /* reusable_pages */
20526 FALSE, /* reusable_no_write */
20527 map->pmap,
20528 pmap_offset);
20529 }
20530 vm_object_unlock(object);
20531 vm_map_unlock(map);
20532 continue;
20533 }
20534 /*
20535 * We can't sync this object if there isn't a pager.
20536 * Don't bother to sync internal objects, since there can't
20537 * be any "permanent" storage for these objects anyway.
20538 */
20539 if ((object->pager == MEMORY_OBJECT_NULL) ||
20540 (object->internal) || (object->private)) {
20541 vm_object_unlock(object);
20542 vm_map_unlock(map);
20543 continue;
20544 }
20545 /*
20546 * keep reference on the object until syncing is done
20547 */
20548 vm_object_reference_locked(object);
20549 vm_object_unlock(object);
20550
20551 vm_map_unlock(map);
20552
20553 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20554 start_offset = vm_object_trunc_page(offset);
20555 end_offset = vm_object_round_page(offset + flush_size);
20556 } else {
20557 start_offset = offset;
20558 end_offset = offset + flush_size;
20559 }
20560
20561 do_sync_req = vm_object_sync(object,
20562 start_offset,
20563 (end_offset - start_offset),
20564 sync_flags & VM_SYNC_INVALIDATE,
20565 ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20566 (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20567 sync_flags & VM_SYNC_SYNCHRONOUS);
20568
20569 if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20570 /*
20571 * clear out the clustering and read-ahead hints
20572 */
20573 vm_object_lock(object);
20574
20575 object->pages_created = 0;
20576 object->pages_used = 0;
20577 object->sequential = 0;
20578 object->last_alloc = 0;
20579
20580 vm_object_unlock(object);
20581 }
20582 vm_object_deallocate(object);
20583 } /* while */
20584
20585 /* for proper msync() behaviour */
20586 if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20587 return KERN_INVALID_ADDRESS;
20588 }
20589
20590 return KERN_SUCCESS;
20591 }/* vm_msync */
20592
20593 void
vm_named_entry_associate_vm_object(vm_named_entry_t named_entry,vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_prot_t prot)20594 vm_named_entry_associate_vm_object(
20595 vm_named_entry_t named_entry,
20596 vm_object_t object,
20597 vm_object_offset_t offset,
20598 vm_object_size_t size,
20599 vm_prot_t prot)
20600 {
20601 vm_map_copy_t copy;
20602 vm_map_entry_t copy_entry;
20603
20604 assert(!named_entry->is_sub_map);
20605 assert(!named_entry->is_copy);
20606 assert(!named_entry->is_object);
20607 assert(!named_entry->internal);
20608 assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
20609
20610 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
20611 copy->offset = offset;
20612 copy->size = size;
20613 copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
20614
20615 copy_entry = vm_map_copy_entry_create(copy);
20616 copy_entry->protection = prot;
20617 copy_entry->max_protection = prot;
20618 copy_entry->use_pmap = TRUE;
20619 copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
20620 copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
20621 VME_OBJECT_SET(copy_entry, object, false, 0);
20622 VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
20623 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
20624
20625 named_entry->backing.copy = copy;
20626 named_entry->is_object = TRUE;
20627 if (object->internal) {
20628 named_entry->internal = TRUE;
20629 }
20630
20631 DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
20632 named_entry, copy, object, offset, size, prot);
20633 }
20634
20635 vm_object_t
vm_named_entry_to_vm_object(vm_named_entry_t named_entry)20636 vm_named_entry_to_vm_object(
20637 vm_named_entry_t named_entry)
20638 {
20639 vm_map_copy_t copy;
20640 vm_map_entry_t copy_entry;
20641 vm_object_t object;
20642
20643 assert(!named_entry->is_sub_map);
20644 assert(!named_entry->is_copy);
20645 assert(named_entry->is_object);
20646 copy = named_entry->backing.copy;
20647 assert(copy != VM_MAP_COPY_NULL);
20648 /*
20649 * Assert that the vm_map_copy is coming from the right
20650 * zone and hasn't been forged
20651 */
20652 vm_map_copy_require(copy);
20653 assert(copy->cpy_hdr.nentries == 1);
20654 copy_entry = vm_map_copy_first_entry(copy);
20655 object = VME_OBJECT(copy_entry);
20656
20657 DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
20658
20659 return object;
20660 }
20661
20662 /*
20663 * Routine: convert_port_entry_to_map
20664 * Purpose:
20665 * Convert from a port specifying an entry or a task
20666 * to a map. Doesn't consume the port ref; produces a map ref,
20667 * which may be null. Unlike convert_port_to_map, the
20668 * port may be task or a named entry backed.
20669 * Conditions:
20670 * Nothing locked.
20671 */
20672
20673 vm_map_t
convert_port_entry_to_map(ipc_port_t port)20674 convert_port_entry_to_map(
20675 ipc_port_t port)
20676 {
20677 vm_map_t map = VM_MAP_NULL;
20678 vm_named_entry_t named_entry;
20679
20680 if (!IP_VALID(port)) {
20681 return VM_MAP_NULL;
20682 }
20683
20684 if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20685 return convert_port_to_map(port);
20686 }
20687
20688 named_entry = mach_memory_entry_from_port(port);
20689
20690 if ((named_entry->is_sub_map) &&
20691 (named_entry->protection & VM_PROT_WRITE)) {
20692 map = named_entry->backing.map;
20693 if (map->pmap != PMAP_NULL) {
20694 if (map->pmap == kernel_pmap) {
20695 panic("userspace has access "
20696 "to a kernel map %p", map);
20697 }
20698 pmap_require(map->pmap);
20699 }
20700 vm_map_reference(map);
20701 }
20702
20703 return map;
20704 }
20705
20706 /*
20707 * Export routines to other components for the things we access locally through
20708 * macros.
20709 */
20710 #undef current_map
vm_map_t
current_map(void)
{
	/* Out-of-line version of the current_map() macro (#undef'ed above). */
	return current_map_fast();
}
20716
20717 /*
20718 * vm_map_reference:
20719 *
20720 * Takes a reference on the specified map.
20721 */
20722 void
vm_map_reference(vm_map_t map)20723 vm_map_reference(
20724 vm_map_t map)
20725 {
20726 if (__probable(map != VM_MAP_NULL)) {
20727 vm_map_require(map);
20728 os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
20729 }
20730 }
20731
20732 /*
20733 * vm_map_deallocate:
20734 *
20735 * Removes a reference from the specified map,
20736 * destroying it if no references remain.
20737 * The map should not be locked.
20738 */
20739 void
vm_map_deallocate(vm_map_t map)20740 vm_map_deallocate(
20741 vm_map_t map)
20742 {
20743 if (__probable(map != VM_MAP_NULL)) {
20744 vm_map_require(map);
20745 if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
20746 vm_map_destroy(map);
20747 }
20748 }
20749 }
20750
void
vm_map_inspect_deallocate(
	vm_map_inspect_t        map)
{
	/* An inspect right holds a regular map reference; release it. */
	vm_map_deallocate((vm_map_t)map);
}
20757
void
vm_map_read_deallocate(
	vm_map_read_t           map)
{
	/* A read right holds a regular map reference; release it. */
	vm_map_deallocate((vm_map_t)map);
}
20764
20765
20766 void
vm_map_disable_NX(vm_map_t map)20767 vm_map_disable_NX(vm_map_t map)
20768 {
20769 if (map == NULL) {
20770 return;
20771 }
20772 if (map->pmap == NULL) {
20773 return;
20774 }
20775
20776 pmap_disable_NX(map->pmap);
20777 }
20778
20779 void
vm_map_disallow_data_exec(vm_map_t map)20780 vm_map_disallow_data_exec(vm_map_t map)
20781 {
20782 if (map == NULL) {
20783 return;
20784 }
20785
20786 map->map_disallow_data_exec = TRUE;
20787 }
20788
20789 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
20790 * more descriptive.
20791 */
void
vm_map_set_32bit(vm_map_t map)
{
	/*
	 * Cap the map at the 32-bit user VA maximum.
	 * NOTE(review): no locking here — presumably called before the
	 * map is in active use; confirm against callers.
	 */
#if defined(__arm64__)
	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
20801
20802
void
vm_map_set_64bit(vm_map_t map)
{
	/*
	 * Cap the map at the 64-bit user VA maximum.
	 * NOTE(review): no locking here — presumably called before the
	 * map is in active use; confirm against callers.
	 */
#if defined(__arm64__)
	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
#endif
}
20812
20813 /*
20814 * Expand the maximum size of an existing map to 64GB.
20815 */
void
vm_map_set_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/*
	 * Request the largest possible ceiling; vm_map_set_max_addr()
	 * clamps ~0 down to what the pmap actually supports.
	 */
	vm_map_set_max_addr(map, ~0, false);
#else /* arm64 */
	(void) map;
#endif
}
20825
20826 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
20827 /*
20828 * Expand the maximum size of an existing map to the maximum supported.
20829 */
void
vm_map_set_extra_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !XNU_TARGET_OS_OSX
	/*
	 * Like vm_map_set_jumbo() but passes extra_jumbo=true so
	 * vm_map_set_max_addr() may use the extended VA limit.
	 */
	vm_map_set_max_addr(map, ~0, true);
#else /* arm64 */
	(void) map;
#endif
}
20839 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
20840
20841 /*
20842 * This map has a JIT entitlement
20843 */
void
vm_map_set_jit_entitled(vm_map_t map)
{
#if defined (__arm64__)
	/* Record the JIT entitlement in the pmap layer. */
	pmap_set_jit_entitled(map->pmap);
#else /* arm64 */
	(void) map;
#endif
}
20853
20854 /*
20855 * Get status of this maps TPRO flag
20856 */
boolean_t
vm_map_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* TPRO state is tracked by the pmap on arm64e. */
	return pmap_get_tpro(map->pmap);
#else /* arm64e */
	/* TPRO does not exist on other architectures. */
	(void) map;
	return FALSE;
#endif
}
20867
20868 /*
20869 * This map has TPRO enabled
20870 */
void
vm_map_set_tpro(vm_map_t map)
{
#if defined (__arm64e__)
	/* TPRO state is tracked by the pmap on arm64e. */
	pmap_set_tpro(map->pmap);
#else /* arm64e */
	/* No-op where TPRO is unsupported. */
	(void) map;
#endif
}
20880
20881 /*
20882 * Does this map have TPRO enforcement enabled
20883 */
boolean_t
vm_map_tpro_enforcement(vm_map_t map)
{
	/* Unlocked read of a single flag. */
	return map->tpro_enforcement;
}
20889
20890 /*
20891 * Set TPRO enforcement for this map
20892 */
void
vm_map_set_tpro_enforcement(vm_map_t map)
{
	/* Enforcement is only meaningful if the map has TPRO at all. */
	if (vm_map_tpro(map)) {
		vm_map_lock(map);
		map->tpro_enforcement = TRUE;
		vm_map_unlock(map);
	}
}
20902
20903 /*
20904 * Enable TPRO on the requested region
20905 *
20906 * Note:
20907 * This routine is primarily intended to be called during/soon after map
20908 * creation before the associated task has been released to run. It is only
20909 * currently safe when we have no resident pages.
20910 */
boolean_t
vm_map_set_tpro_range(
	__unused vm_map_t map,
	__unused vm_map_address_t start,
	__unused vm_map_address_t end)
{
	/* No per-range work required in this configuration. */
	return TRUE;
}
20919
20920 /*
20921 * Expand the maximum size of an existing map.
20922 */
void
vm_map_set_max_addr(
	vm_map_t map,
	vm_map_offset_t new_max_offset,
	__unused bool extra_jumbo)
{
#if defined(__arm64__)
	vm_map_offset_t max_supported_offset;
	vm_map_offset_t old_max_offset;
	unsigned int option = ARM_PMAP_MAX_OFFSET_JUMBO;

	vm_map_lock(map);

	old_max_offset = map->max_offset;
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	if (extra_jumbo) {
		option = ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO;
	}
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), option);

	new_max_offset = trunc_page(new_max_offset);

	/* The address space cannot be shrunk using this routine. */
	if (old_max_offset >= new_max_offset) {
		vm_map_unlock(map);
		return;
	}

	/* Clamp the request to what the pmap can actually support. */
	if (max_supported_offset < new_max_offset) {
		new_max_offset = max_supported_offset;
	}

	map->max_offset = new_max_offset;

	/*
	 * Disable the following chunk of code that extends the "holes" list
	 * to accomodate a larger VM map.
	 * In `vm_map_create_options()`, we now set the end of the "holes" list to
	 * max(map->max_offset, MACH_VM_MAX_ADDRESS) for all platforms.
	 * MACH_VM_MAX_ADDRESS is the largest virtual address a userspace process
	 * can map, so any `new_max_offset` value will be <= MACH_VM_MAX_ADDRESS.
	 * The "holes" list does not need to be adjusted.
	 */
#if 0
	if (map->holelistenabled) {
		if (map->holes_list->prev->vme_end == old_max_offset) {
			/*
			 * There is already a hole at the end of the map; simply make it bigger.
			 */
			map->holes_list->prev->vme_end = map->max_offset;
		} else {
			/*
			 * There is no hole at the end, so we need to create a new hole
			 * for the new empty space we're creating.
			 */
			struct vm_map_links *new_hole;

			new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
			new_hole->start = old_max_offset;
			new_hole->end = map->max_offset;
			new_hole->prev = map->holes_list->prev;
			new_hole->next = (struct vm_map_entry *)map->holes_list;
			map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
			map->holes_list->prev = (struct vm_map_entry *)new_hole;
		}
	}
#endif

	vm_map_unlock(map);
#else
	(void)map;
	(void)new_max_offset;
#endif
}
20998
vm_map_offset_t
vm_compute_max_offset(boolean_t is64)
{
	/* Highest user VA for a 32-bit or 64-bit address space. */
#if defined(__arm64__)
	return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
21008
void
vm_map_get_max_aslr_slide_section(
	vm_map_t                map __unused,
	int64_t                 *max_sections,
	int64_t                 *section_size)
{
	/*
	 * Report how ASLR slide entropy is partitioned: the number of
	 * independently slid sections and the size of each section.
	 */
#if defined(__arm64__)
	*max_sections = 3;
	*section_size = ARM_TT_TWIG_SIZE;
#else
	/* A single section; size 0 means "no sectioning". */
	*max_sections = 1;
	*section_size = 0;
#endif
}
21023
uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)
{
	/* Returns the slide limit expressed in pages of the map's page size. */
#if defined(__arm64__)
	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
	 * limited embedded address space; this is also meant to minimize pmap
	 * memory usage on 16KB page systems.
	 */
	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
#else
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
21037
uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
{
	/* Returns the loader slide limit in pages of the map's page size. */
#if defined(__arm64__)
	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
	 * of independent entropy on 16KB page systems.
	 */
	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
#else
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
21050
boolean_t
vm_map_is_64bit(
	vm_map_t map)
{
	/* A map is 64-bit iff its ceiling exceeds the 32-bit user maximum. */
	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
}
21057
boolean_t
vm_map_has_hard_pagezero(
	vm_map_t        map,
	vm_map_offset_t pagezero_size)
{
	/*
	 * TRUE when the map's min_offset already covers at least
	 * "pagezero_size" bytes, i.e. that low range can never be mapped.
	 */
	/*
	 * XXX FBDP
	 * We should lock the VM map (for read) here but we can get away
	 * with it for now because there can't really be any race condition:
	 * the VM map's min_offset is changed only when the VM map is created
	 * and when the zero page is established (when the binary gets loaded),
	 * and this routine gets called only when the task terminates and the
	 * VM map is being torn down, and when a new map is created via
	 * load_machfile()/execve().
	 */
	return map->min_offset >= pagezero_size;
}
21075
21076 /*
21077 * Raise a VM map's maximun offset.
21078 */
21079 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)21080 vm_map_raise_max_offset(
21081 vm_map_t map,
21082 vm_map_offset_t new_max_offset)
21083 {
21084 kern_return_t ret;
21085
21086 vm_map_lock(map);
21087 ret = KERN_INVALID_ADDRESS;
21088
21089 if (new_max_offset >= map->max_offset) {
21090 if (!vm_map_is_64bit(map)) {
21091 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21092 map->max_offset = new_max_offset;
21093 ret = KERN_SUCCESS;
21094 }
21095 } else {
21096 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21097 map->max_offset = new_max_offset;
21098 ret = KERN_SUCCESS;
21099 }
21100 }
21101 }
21102
21103 vm_map_unlock(map);
21104 return ret;
21105 }
21106
21107
21108 /*
21109 * Raise a VM map's minimum offset.
21110 * To strictly enforce "page zero" reservation.
21111 */
kern_return_t
vm_map_raise_min_offset(
	vm_map_t        map,
	vm_map_offset_t new_min_offset)
{
	vm_map_entry_t  first_entry;

	/* Round up to the map's page size before any comparison. */
	new_min_offset = vm_map_round_page(new_min_offset,
	    VM_MAP_PAGE_MASK(map));

	vm_map_lock(map);

	if (new_min_offset < map->min_offset) {
		/*
		 * Can't move min_offset backwards, as that would expose
		 * a part of the address space that was previously, and for
		 * possibly good reasons, inaccessible.
		 */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}
	if (new_min_offset >= map->max_offset) {
		/* can't go beyond the end of the address space */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	first_entry = vm_map_first_entry(map);
	if (first_entry != vm_map_to_entry(map) &&
	    first_entry->vme_start < new_min_offset) {
		/*
		 * Some memory was already allocated below the new
		 * minimun offset. It's too late to change it now...
		 */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	map->min_offset = new_min_offset;

	/* Keep the hole list's lower bound in sync with the new floor. */
	if (map->holelistenabled) {
		assert(map->holes_list);
		map->holes_list->start = new_min_offset;
		assert(new_min_offset < map->holes_list->end);
	}

	vm_map_unlock(map);

	return KERN_SUCCESS;
}
21162
21163 /*
21164 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21165 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21166 * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21167 * have to reach over to the BSD data structures.
21168 */
21169
/* Count of times a finite (non-RLIM_INFINITY) size limit was set. */
uint64_t vm_map_set_size_limit_count = 0;
kern_return_t
vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
{
	kern_return_t kr;

	vm_map_lock(map);
	if (new_size_limit < map->size) {
		/* new limit should not be lower than its current size */
		DTRACE_VM2(vm_map_set_size_limit_fail,
		    vm_map_size_t, map->size,
		    uint64_t, new_size_limit);
		kr = KERN_FAILURE;
	} else if (new_size_limit == map->size_limit) {
		/* no change */
		kr = KERN_SUCCESS;
	} else {
		/* set new limit */
		DTRACE_VM2(vm_map_set_size_limit,
		    vm_map_size_t, map->size,
		    uint64_t, new_size_limit);
		if (new_size_limit != RLIM_INFINITY) {
			vm_map_set_size_limit_count++;
		}
		map->size_limit = new_size_limit;
		kr = KERN_SUCCESS;
	}
	vm_map_unlock(map);
	return kr;
}
21200
/* Count of times a finite (non-RLIM_INFINITY) data limit was set. */
uint64_t vm_map_set_data_limit_count = 0;
kern_return_t
vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
{
	kern_return_t kr;

	vm_map_lock(map);
	if (new_data_limit < map->size) {
		/* new limit should not be lower than its current size */
		DTRACE_VM2(vm_map_set_data_limit_fail,
		    vm_map_size_t, map->size,
		    uint64_t, new_data_limit);
		kr = KERN_FAILURE;
	} else if (new_data_limit == map->data_limit) {
		/* no change */
		kr = KERN_SUCCESS;
	} else {
		/* set new limit */
		DTRACE_VM2(vm_map_set_data_limit,
		    vm_map_size_t, map->size,
		    uint64_t, new_data_limit);
		if (new_data_limit != RLIM_INFINITY) {
			vm_map_set_data_limit_count++;
		}
		map->data_limit = new_data_limit;
		kr = KERN_SUCCESS;
	}
	vm_map_unlock(map);
	return kr;
}
21231
void
vm_map_set_user_wire_limit(vm_map_t map,
    vm_size_t limit)
{
	/* Mirror of the BSD RLIMIT_MEMLOCK value, checked on the Mach side. */
	vm_map_lock(map);
	map->user_wire_limit = limit;
	vm_map_unlock(map);
}
21240
21241
void
vm_map_switch_protect(vm_map_t map,
    boolean_t val)
{
	/* Set the map's switch_protect flag under the map lock. */
	vm_map_lock(map);
	map->switch_protect = val;
	vm_map_unlock(map);
}
21250
21251 extern int cs_process_enforcement_enable;
21252 boolean_t
vm_map_cs_enforcement(vm_map_t map)21253 vm_map_cs_enforcement(
21254 vm_map_t map)
21255 {
21256 if (cs_process_enforcement_enable) {
21257 return TRUE;
21258 }
21259 return map->cs_enforcement;
21260 }
21261
kern_return_t
vm_map_cs_wx_enable(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	/* KERN_NOT_SUPPORTED means no monitor policy applies: treat as success. */
	kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
	if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
		return KERN_SUCCESS;
	}
	return ret;
#else
	/* The VM manages WX memory entirely on its own */
	return KERN_SUCCESS;
#endif
}
21277
kern_return_t
vm_map_csm_allow_jit(
	__unused vm_map_t map)
{
#if CODE_SIGNING_MONITOR
	/* Ask the code-signing monitor whether JIT regions are permitted. */
	return csm_allow_jit_region(vm_map_pmap(map));
#else
	/* No code signing monitor to enforce JIT policy */
	return KERN_SUCCESS;
#endif
}
21289
void
vm_map_cs_debugged_set(
	vm_map_t map,
	boolean_t val)
{
	/* Set the map's cs_debugged flag under the map lock. */
	vm_map_lock(map);
	map->cs_debugged = val;
	vm_map_unlock(map);
}
21299
void
vm_map_cs_enforcement_set(
	vm_map_t map,
	boolean_t val)
{
	/* Keep the map flag and the pmap's view of it in sync, atomically
	 * with respect to the map lock. */
	vm_map_lock(map);
	map->cs_enforcement = val;
	pmap_set_vm_map_cs_enforced(map->pmap, val);
	vm_map_unlock(map);
}
21310
21311 /*
21312 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21313 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21314 * bump both counters.
21315 */
21316 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)21317 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21318 {
21319 pmap_t pmap = vm_map_pmap(map);
21320
21321 ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21322 ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21323 }
21324
21325 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)21326 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21327 {
21328 pmap_t pmap = vm_map_pmap(map);
21329
21330 ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21331 ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21332 }
21333
21334 /* Add (generate) code signature for memory range */
21335 #if CONFIG_DYNAMIC_CODE_SIGNING
/*
 * vm_map_sign:
 * Mark every resident page of [start, end) as code-signing validated and
 * disconnect its pmap mappings so any later write attempt is noticed.
 * The range must be covered by a single non-submap entry whose VM object
 * already exists; pages must be resident, not busy and not in an unusual
 * error/restart/private/absent state.
 */
kern_return_t
vm_map_sign(vm_map_t map,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlying object. Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/* reject ranges that would wrap or collapse once page-rounded */
	if (__improbable(vm_map_range_overflows(map, start, end - start))) {
		return KERN_INVALID_ADDRESS;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Take the object lock before dropping the map lock; "entry" is
	 * still dereferenced below (vme_start, VME_OFFSET) after the map
	 * lock is released — NOTE(review): relies on the object lock
	 * keeping that entry's fields stable, confirm.
	 */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* should we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
21432 #endif
21433
/*
 * vm_map_partial_reap:
 * Delete every entry of "map" backed by an internal VM object that is
 * referenced nowhere else (ref_count == 1), accumulating the resident
 * and compressed page counts of the deleted objects into the out
 * parameters.  NOTE(review): the counters are only added to here — the
 * caller is expected to zero them beforehand.
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t next_entry;
	kern_return_t kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		/* capture the successor first: vm_map_delete unlinks "entry" */
		next_entry = entry->vme_next;

		if (!entry->is_sub_map &&
		    VME_OBJECT(entry) &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (VME_OBJECT(entry)->ref_count == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
			    KMEM_GUARD_NONE, &zap_list);
		}
	}

	vm_map_unlock(map);

	/* free the entries collected by vm_map_delete, outside the map lock */
	vm_map_zap_dispose(&zap_list);

	return kr;
}
21468
21469
21470 #if DEVELOPMENT || DEBUG
21471
/*
 * vm_map_disconnect_page_mappings:
 * (DEVELOPMENT || DEBUG only)  Remove all pmap translations for every
 * entry of "map" whose object exists and is not physically contiguous.
 * If do_unnest is TRUE, nested sub-map regions are un-nested first so
 * only this task's pmap is modified.  Returns the task's phys_mem ledger
 * balance — sampled under the lock, before the disconnect — converted to
 * map pages.
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by
				 * this task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	/* read lock suffices below: only the pmap is modified, not the map */
	vm_map_lock_read(map);

	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		if (entry->is_sub_map) {
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
21521
/*
 * vm_map_inject_error:
 * (DEVELOPMENT || DEBUG only)  Inject an error into the compressor pager
 * slot backing the page at "vaddr".  Returns KERN_MEMORY_ERROR if no
 * object backs that address, KERN_MEMORY_PRESENT if the object has no
 * pager, otherwise the result of vm_compressor_pager_inject_error().
 */
kern_return_t
vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_object_t object = NULL;
	vm_object_offset_t offset;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_version_t version;
	vm_map_t real_map;
	int result = KERN_FAILURE;

	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
	vm_map_lock(map);

	/*
	 * Look up and exclusively lock the object backing vaddr.
	 * NOTE(review): the call's return value is immediately overwritten
	 * below — object == NULL is used as the failure signal instead.
	 * Assumes real_map is set even on lookup failure; confirm.
	 */
	result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
	    NULL, &real_map, NULL);
	if (object == NULL) {
		result = KERN_MEMORY_ERROR;
	} else if (object->pager) {
		result = vm_compressor_pager_inject_error(object->pager,
		    offset);
	} else {
		result = KERN_MEMORY_PRESENT;
	}

	if (object != NULL) {
		vm_object_unlock(object);
	}

	/* drop the submap lock if the lookup descended into one */
	if (real_map != map) {
		vm_map_unlock(real_map);
	}
	vm_map_unlock(map);

	return result;
}
21559
/* Iterate over the map's entries. Calls the first block once with the number of entries,
 * then calls the second block once for every entry.
 * returns: KERN_SUCCESS if the iteration completed ok,
 *          the error code if a callback returned an error,
 *          KERN_FAILURE if entries were added/removed during the iteration, so that the number of entries
 *          iterated differs from the number reported in the first call
 */
/*
 * vm_map_entries_foreach_locked:
 * Iterate the entries of an already-locked map.  "count_handler" runs
 * once up front with hdr.nentries; "entry_handler" runs once per entry
 * (passed as an opaque pointer).  A non-KERN_SUCCESS result from either
 * block aborts the iteration and is returned as-is; KERN_FAILURE is
 * returned if the walk visits more or fewer entries than reported.
 */
static kern_return_t
vm_map_entries_foreach_locked(vm_map_t map, kern_return_t (^count_handler)(int nentries),
    kern_return_t (^entry_handler)(void* entry))
{
	vm_map_lock_assert_held(map);
	int nentries = map->hdr.nentries;
	kern_return_t error = count_handler(nentries);
	if (error) {
		return error;
	}

	/* iterate until we loop back to the map, see get_vmmap_entries() */
	vm_map_entry_t entry = vm_map_first_entry(map);
	int count = 0;
	while (entry != vm_map_to_entry(map)) {
		error = entry_handler(entry);
		if (error != KERN_SUCCESS) {
			return error;
		}
		entry = entry->vme_next;
		++count;
		if (count > nentries) {
			/* nentries and entries iteration don't agree on how many entries there are, shouldn't really happen */
			return KERN_FAILURE;
		}
	}
	if (count < nentries) {
		/* fewer entries than reported: same disagreement, other direction */
		return KERN_FAILURE;
	}
	return KERN_SUCCESS;
}
21597
21598 kern_return_t
21599 vm_map_entries_foreach(vm_map_t map, kern_return_t (^count_handler)(int nentries),
21600 kern_return_t (^entry_handler)(void* entry))
21601 {
21602 vm_map_lock_read(map);
21603 kern_return_t error = vm_map_entries_foreach_locked(map, count_handler, entry_handler);
21604 vm_map_unlock_read(map);
21605 return error;
21606 }
21607
/*
 * Dump info about the entry (and its compressor pager, if any) into the given buffer.
 * Returns KERN_SUCCESS on success, or an error (e.g. KERN_NO_SPACE) if there was not
 * enough space in the given buffer.
 * argument size in: bytes free in the given buffer, out: bytes written
 */
21613 kern_return_t
vm_map_dump_entry_and_compressor_pager(void * pentry,char * buf,size_t * size)21614 vm_map_dump_entry_and_compressor_pager(void* pentry, char *buf, size_t *size)
21615 {
21616 size_t insize = *size;
21617 kern_return_t kr;
21618 size_t offset = 0;
21619
21620 *size = 0;
21621 if (sizeof(struct vm_map_entry_info) > insize) {
21622 return KERN_NO_SPACE;
21623 }
21624
21625 vm_map_entry_t entry = (vm_map_entry_t)pentry;
21626 struct vm_map_entry_info *out_entry = (struct vm_map_entry_info*)buf;
21627 out_entry->vmei_start = entry->vme_start;
21628 out_entry->vmei_end = entry->vme_end;
21629 out_entry->vmei_alias = VME_ALIAS(entry);
21630 out_entry->vmei_offset = VME_OFFSET(entry);
21631 out_entry->vmei_is_sub_map = entry->is_sub_map;
21632 out_entry->vmei_protection = entry->protection;
21633 offset += sizeof(struct vm_map_entry_info);
21634
21635 out_entry->vmei_slot_mapping_count = 0;
21636 out_entry->vmei_is_compressor_pager = false;
21637 *size = offset;
21638 if (out_entry->vmei_is_sub_map) {
21639 return KERN_SUCCESS; // TODO: sub_map interrogation not supported yet
21640 }
21641 /* have a vm_object? */
21642 vm_object_t object = VME_OBJECT(entry);
21643 if (object == VM_OBJECT_NULL || !object->internal) {
21644 return KERN_SUCCESS;
21645 }
21646 /* objects has a pager? */
21647 memory_object_t pager = object->pager;
21648 if (pager != MEMORY_OBJECT_NULL) {
21649 return KERN_SUCCESS;
21650 }
21651 bool is_compressor = false;
21652 unsigned int slot_mapping_count = 0;
21653 size_t pager_info_size = insize - offset;
21654 kr = vm_compressor_pager_dump(pager, buf + offset, &pager_info_size, &is_compressor, &slot_mapping_count);
21655 if (kr != KERN_SUCCESS) {
21656 /* didn't have enough space for everything we want to write, caller needs to retry */
21657 return kr;
21658 }
21659 offset += pager_info_size;
21660 /* if we got here, is_compressor should be true due to the object->internal check above, so this assignment
21661 * is just for sanity sake */
21662 out_entry->vmei_is_compressor_pager = is_compressor;
21663 out_entry->vmei_slot_mapping_count = slot_mapping_count;
21664 *size = offset;
21665 return KERN_SUCCESS;
21666 }
21667
21668
21669 #endif
21670
21671
21672 #if CONFIG_FREEZE
21673
21674
/* Shared freezer compression context, defined elsewhere in the VM code. */
extern struct freezer_context freezer_context_global;
/* Uptime of the last freezer yield; reset by vm_map_freeze() below. */
AbsoluteTime c_freezer_last_yield_ts = 0;

/* Freezer policy limits consulted in vm_map_freeze()'s evaluation phase. */
extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
21680
/*
 * vm_map_freeze:
 * Compress the eligible anonymous memory of "task" into the compressor,
 * stopping once "dirty_budget" pages have been paged out.  With a
 * swap-backed freezer, a first "evaluation phase" walks the map to size
 * the private vs. shared dirty footprint and enforce the memorystatus
 * policy limits before any page is moved; if eval_only is TRUE the
 * function stops after that phase.  All count out-parameters are zeroed
 * on entry; *wired_count and *shared_count are filled in by the scan
 * below.  On failure, a FREEZER_ERROR_* code is stored in
 * *freezer_error_code.
 */
kern_return_t
vm_map_freeze(
	task_t task,
	unsigned int *purgeable_count,
	unsigned int *wired_count,
	unsigned int *clean_count,
	unsigned int *dirty_count,
	unsigned int dirty_budget,
	unsigned int *shared_count,
	int *freezer_error_code,
	boolean_t eval_only)
{
	vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
	kern_return_t kr = KERN_SUCCESS;
	boolean_t evaluation_phase = TRUE;
	vm_object_t cur_shared_object = NULL;
	int cur_shared_obj_ref_cnt = 0;
	unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;

	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;

	/*
	 * We need the exclusive lock here so that we can
	 * block any page faults or lookups while we are
	 * in the middle of freezing this vm map.
	 */
	vm_map_t map = task->map;

	vm_map_lock(map);

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	/* bail early if there is nowhere to compress/swap into */
	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
		if (vm_compressor_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
		}

		if (vm_swap_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
		}

		kr = KERN_NO_SPACE;
		goto done;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
		/*
		 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
again:
	/* first pass evaluates; the second pass (via "goto again") freezes */
	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t src_object;

		if (entry2->is_sub_map) {
			continue;
		}

		/* only anonymous (internal), non-contiguous objects qualify */
		src_object = VME_OBJECT(entry2);
		if (!src_object ||
		    src_object->phys_contiguous ||
		    !src_object->internal) {
			continue;
		}

		/* If eligible, scan the entry, moving eligible pages over to our parent object */

		if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
			/*
			 * We skip purgeable objects during evaluation phase only.
			 * If we decide to freeze this process, we'll explicitly
			 * purge these objects before we go around again with
			 * 'evaluation_phase' set to FALSE.
			 */

			if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
				/*
				 * We want to purge objects that may not belong to this task but are mapped
				 * in this task alone. Since we already purged this task's purgeable memory
				 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
				 * on this task's purgeable objects. Hence the check for only volatile objects.
				 */
				if (evaluation_phase ||
				    src_object->purgable != VM_PURGABLE_VOLATILE ||
				    src_object->ref_count != 1) {
					continue;
				}
				vm_object_lock(src_object);
				/* re-check state now that the object lock is held */
				if (src_object->purgable == VM_PURGABLE_VOLATILE &&
				    src_object->ref_count == 1) {
					purgeable_q_t old_queue;

					/* object should be on a purgeable queue */
					assert(src_object->objq.next != NULL &&
					    src_object->objq.prev != NULL);
					/* move object from its volatile queue to the nonvolatile queue */
					old_queue = vm_purgeable_object_remove(src_object);
					assert(old_queue);
					if (src_object->purgeable_when_ripe) {
						/* remove a token from that volatile queue */
						vm_page_lock_queues();
						vm_purgeable_token_delete_first(old_queue);
						vm_page_unlock_queues();
					}
					/* purge the object */
					vm_object_purge(src_object, 0);
				}
				vm_object_unlock(src_object);
				continue;
			}

			/*
			 * Pages belonging to this object could be swapped to disk.
			 * Make sure it's not a shared object because we could end
			 * up just bringing it back in again.
			 *
			 * We try to optimize somewhat by checking for objects that are mapped
			 * more than once within our own map. But we don't do full searches,
			 * we just look at the entries following our current entry.
			 */

			if (src_object->ref_count > 1) {
				if (src_object != cur_shared_object) {
					/* first sighting: count it as shared for now */
					obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					dirty_shared_count += obj_pages_snapshot;

					cur_shared_object = src_object;
					cur_shared_obj_ref_cnt = 1;
					continue;
				} else {
					cur_shared_obj_ref_cnt++;
					if (src_object->ref_count == cur_shared_obj_ref_cnt) {
						/*
						 * Fall through to below and treat this object as private.
						 * So deduct its pages from our shared total and add it to the
						 * private total.
						 */

						dirty_shared_count -= obj_pages_snapshot;
						dirty_private_count += obj_pages_snapshot;
					} else {
						continue;
					}
				}
			}


			if (src_object->ref_count == 1) {
				dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
			}

			if (evaluation_phase == TRUE) {
				/* evaluation pass only gathers counts; no pageout */
				continue;
			}
		}

		uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
		*wired_count += src_object->wired_page_count;

		if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
			if (vm_compressor_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
			}

			if (vm_swap_low_on_space()) {
				*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
			}

			kr = KERN_NO_SPACE;
			break;
		}
		if (paged_out_count >= dirty_budget) {
			break;
		}
		dirty_budget -= paged_out_count;
	}

	/* report the shared footprint in MB */
	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		/* purge this task's own purgeable memory before the freeze pass */
		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}
21919
21920 #endif
21921
21922 /*
21923 * vm_map_entry_should_cow_for_true_share:
21924 *
21925 * Determines if the map entry should be clipped and setup for copy-on-write
21926 * to avoid applying "true_share" to a large VM object when only a subset is
21927 * targeted.
21928 *
21929 * For now, we target only the map entries created for the Objective C
21930 * Garbage Collector, which initially have the following properties:
21931 * - alias == VM_MEMORY_MALLOC
21932 * - wired_count == 0
21933 * - !needs_copy
21934 * and a VM object with:
21935 * - internal
21936 * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
21937 * - !true_share
21938 * - vo_size == ANON_CHUNK_SIZE
21939 *
21940 * Only non-kernel map entries.
21941 */
boolean_t
vm_map_entry_should_cow_for_true_share(
	vm_map_entry_t entry)
{
	vm_object_t object;

	/* each guard below disqualifies the entry; all must pass to COW */
	if (entry->is_sub_map) {
		/* entry does not point at a VM object */
		return FALSE;
	}

	if (entry->needs_copy) {
		/* already set for copy_on_write: done! */
		return FALSE;
	}

	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
		/* not a malloc heap or Obj-C Garbage Collector heap */
		return FALSE;
	}

	if (entry->wired_count) {
		/* wired: can't change the map entry... */
		vm_counters.should_cow_but_wired++;
		return FALSE;
	}

	object = VME_OBJECT(entry);

	if (object == VM_OBJECT_NULL) {
		/* no object yet... */
		return FALSE;
	}

	if (!object->internal) {
		/* not an internal object */
		return FALSE;
	}

	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
		/* not the default copy strategy */
		return FALSE;
	}

	if (object->true_share) {
		/* already true_share: too late to avoid it */
		return FALSE;
	}

	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
	    object->vo_size != ANON_CHUNK_SIZE) {
		/* ... not an object created for the ObjC Garbage Collector */
		return FALSE;
	}

	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
	    object->vo_size != 2048 * 4096) {
		/* ... not a "MALLOC_SMALL" heap (2048 * 4096 bytes = 8MB) */
		return FALSE;
	}

	/*
	 * All the criteria match: we have a large object being targeted for "true_share".
	 * To limit the adverse side-effects linked with "true_share", tell the caller to
	 * try and avoid setting up the entire object for "true_share" by clipping the
	 * targeted range and setting it up for copy-on-write.
	 */
	return TRUE;
}
22012
/* Diagnostics for rejected ranges: a hit counter and an opt-in log knob. */
uint64_t vm_map_range_overflows_count = 0;
TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
/*
 * vm_map_range_overflows:
 * Returns true when [addr, addr + size) — after rounding outward to the
 * map's page boundaries — wraps around the address space or collapses to
 * an empty/inverted range.  size == 0 is always accepted.  Rejections
 * bump vm_map_range_overflows_count and fire a DTrace probe.
 */
bool
vm_map_range_overflows(
	vm_map_t map,
	vm_map_offset_t addr,
	vm_map_size_t size)
{
	vm_map_offset_t start, end, sum;
	vm_map_offset_t pgmask;

	if (size == 0) {
		/* empty range -> no overflow */
		return false;
	}
	pgmask = vm_map_page_mask(map);
	start = vm_map_trunc_page_mask(addr, pgmask);
	end = vm_map_round_page_mask(addr + size, pgmask);
	/* os_add_overflow catches addr+size wrap; end <= start catches rounding wrap */
	if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
		vm_map_range_overflows_count++;
		if (vm_map_range_overflows_log) {
			printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
			    proc_selfpid(),
			    proc_best_name(current_proc()),
			    (uint64_t)addr,
			    (uint64_t)size,
			    (uint64_t)pgmask);
		}
		DTRACE_VM4(vm_map_range_overflows,
		    vm_map_t, map,
		    uint32_t, pgmask,
		    uint64_t, (uint64_t)addr,
		    uint64_t, (uint64_t)size);
		return true;
	}
	return false;
}
22050
22051 vm_map_offset_t
vm_map_round_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)22052 vm_map_round_page_mask(
22053 vm_map_offset_t offset,
22054 vm_map_offset_t mask)
22055 {
22056 return VM_MAP_ROUND_PAGE(offset, mask);
22057 }
22058
22059 vm_map_offset_t
vm_map_trunc_page_mask(vm_map_offset_t offset,vm_map_offset_t mask)22060 vm_map_trunc_page_mask(
22061 vm_map_offset_t offset,
22062 vm_map_offset_t mask)
22063 {
22064 return VM_MAP_TRUNC_PAGE(offset, mask);
22065 }
22066
22067 boolean_t
vm_map_page_aligned(vm_map_offset_t offset,vm_map_offset_t mask)22068 vm_map_page_aligned(
22069 vm_map_offset_t offset,
22070 vm_map_offset_t mask)
22071 {
22072 return ((offset) & mask) == 0;
22073 }
22074
22075 int
vm_map_page_shift(vm_map_t map)22076 vm_map_page_shift(
22077 vm_map_t map)
22078 {
22079 return VM_MAP_PAGE_SHIFT(map);
22080 }
22081
22082 int
vm_map_page_size(vm_map_t map)22083 vm_map_page_size(
22084 vm_map_t map)
22085 {
22086 return VM_MAP_PAGE_SIZE(map);
22087 }
22088
22089 vm_map_offset_t
vm_map_page_mask(vm_map_t map)22090 vm_map_page_mask(
22091 vm_map_t map)
22092 {
22093 return VM_MAP_PAGE_MASK(map);
22094 }
22095
22096 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)22097 vm_map_set_page_shift(
22098 vm_map_t map,
22099 int pageshift)
22100 {
22101 if (map->hdr.nentries != 0) {
22102 /* too late to change page size */
22103 return KERN_FAILURE;
22104 }
22105
22106 map->hdr.page_shift = (uint16_t)pageshift;
22107
22108 return KERN_SUCCESS;
22109 }
22110
/*
 * vm_map_query_volatile:
 * For every writable entry of "map" backed by a volatile or empty
 * purgeable object, tally the virtual span, resident pages, compressed
 * pages, and the pmap-level resident/compressed footprints.  Results are
 * returned in bytes through the out parameters.  Caller must hold the
 * map lock; it is still held on return.
 */
kern_return_t
vm_map_query_volatile(
	vm_map_t map,
	mach_vm_size_t *volatile_virtual_size_p,
	mach_vm_size_t *volatile_resident_size_p,
	mach_vm_size_t *volatile_compressed_size_p,
	mach_vm_size_t *volatile_pmap_size_p,
	mach_vm_size_t *volatile_compressed_pmap_size_p)
{
	mach_vm_size_t volatile_virtual_size;
	mach_vm_size_t volatile_resident_count;
	mach_vm_size_t volatile_compressed_count;
	mach_vm_size_t volatile_pmap_count;
	mach_vm_size_t volatile_compressed_pmap_count;
	mach_vm_size_t resident_count;
	vm_map_entry_t entry;
	vm_object_t object;

	/* map should be locked by caller */

	volatile_virtual_size = 0;
	volatile_resident_count = 0;
	volatile_compressed_count = 0;
	volatile_pmap_count = 0;
	volatile_compressed_pmap_count = 0;

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes;

		if (entry->is_sub_map) {
			continue;
		}
		if (!(entry->protection & VM_PROT_WRITE)) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}
		if (object->purgable != VM_PURGABLE_VOLATILE &&
		    object->purgable != VM_PURGABLE_EMPTY) {
			continue;
		}
		if (VME_OFFSET(entry)) {
			/*
			 * If the map entry has been split and the object now
			 * appears several times in the VM map, we don't want
			 * to count the object's resident_page_count more than
			 * once. We count it only for the first one, starting
			 * at offset 0 and ignore the other VM map entries.
			 */
			continue;
		}
		resident_count = object->resident_page_count;
		/*
		 * NOTE(review): VME_OFFSET(entry) is always 0 past the
		 * "continue" above, so this adjustment can never trigger in
		 * the current code — confirm before relying on it.
		 */
		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
			resident_count = 0;
		} else {
			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
		}

		volatile_virtual_size += entry->vme_end - entry->vme_start;
		volatile_resident_count += resident_count;
		if (object->pager) {
			volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
		}
		pmap_compressed_bytes = 0;
		pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
		volatile_compressed_pmap_count += (pmap_compressed_bytes
		    / PAGE_SIZE);
	}

	/* map is still locked on return */

	/* convert the page counts to byte sizes for the caller */
	*volatile_virtual_size_p = volatile_virtual_size;
	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;

	return KERN_SUCCESS;
}
22200
/*
 * vm_map_sizes:
 * Report the map's virtual span (*psize), the total free virtual space
 * (*pfree) and the largest contiguous free hole (*plargest_free),
 * computed from the gaps between consecutive entries.  The out pointers
 * are optional (NULL-checked) in the normal path.
 * NOTE(review): the map == NULL path stores through all three pointers
 * unconditionally, unlike the rest of the function — confirm callers
 * never combine a NULL map with NULL out pointers.
 */
void
vm_map_sizes(vm_map_t map,
    vm_map_size_t * psize,
    vm_map_size_t * pfree,
    vm_map_size_t * plargest_free)
{
	vm_map_entry_t entry;
	vm_map_offset_t prev;
	vm_map_size_t free, total_free, largest_free;
	boolean_t end;

	if (!map) {
		*psize = *pfree = *plargest_free = 0;
		return;
	}
	total_free = largest_free = 0;

	vm_map_lock_read(map);
	if (psize) {
		*psize = map->max_offset - map->min_offset;
	}

	prev = map->min_offset;
	for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
		/* the list is circular: hitting the header sentinel ends the walk */
		end = (entry == vm_map_to_entry(map));

		if (end) {
			/* final gap: sentinel's vme_end is presumably the map's end bound */
			free = entry->vme_end - prev;
		} else {
			free = entry->vme_start - prev;
		}

		total_free += free;
		if (free > largest_free) {
			largest_free = free;
		}

		if (end) {
			break;
		}
		prev = entry->vme_end;
	}
	vm_map_unlock_read(map);
	if (pfree) {
		*pfree = total_free;
	}
	if (plargest_free) {
		*plargest_free = largest_free;
	}
}
22251
22252 #if VM_SCAN_FOR_SHADOW_CHAIN
/*
 * vm_map_shadow_max:
 * Return the length of the longest shadow chain among all VM objects
 * mapped by "map" (0 for a NULL map).  Each chain is walked with
 * hand-over-hand shared object locks so the next link stays valid while
 * the previous one is dropped.
 */
int
vm_map_shadow_max(
	vm_map_t map)
{
	int shadows, shadows_max;
	vm_map_entry_t entry;
	vm_object_t object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		vm_object_lock_shared(object);
		/* lock the next link before unlocking the current one */
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
22297 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22298
/*
 * vm_commit_pagezero_status:
 * Inform the pmap layer of the map's lowest usable address (min_offset),
 * via pmap_advise_pagezero_range().
 */
void
vm_commit_pagezero_status(vm_map_t lmap)
{
	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
}
22304
22305 #if __x86_64__
/*
 * vm_map_set_high_start:
 * (x86_64 only)  Record the map's "high start" hint (vmmap_high_start);
 * the allocation code elsewhere interprets this field.
 * NOTE(review): no lock is taken here — caller presumably serializes
 * with other map setup; confirm.
 */
void
vm_map_set_high_start(
	vm_map_t map,
	vm_map_offset_t high_start)
{
	map->vmmap_high_start = high_start;
}
22314
22315 #if CODE_SIGNING_MONITOR
22316
22317 kern_return_t
/*
 * vm_map_entry_cs_associate:
 *	Tell the code-signing monitor (CSM) about this mapping so that it,
 *	rather than the VM, can enforce code-signing on pages faulted in
 *	through it.  Depending on the entry, this either:
 *	- registers a "debug" association (entry remapped writable via
 *	  vm_protect(VM_PROT_COPY), i.e. a debugger/dtrace at work),
 *	- registers a JIT region, or
 *	- walks down the entry's VM object shadow/pager chain to find the
 *	  code-signed vnode and associates its code-signing blobs with the
 *	  mapping.
 *
 *	Returns KERN_SUCCESS when the monitor accepted the association
 *	(the entry is then marked "csm_associated" so vm_fault() skips
 *	VM-side code-signing validation), KERN_NOT_SUPPORTED when the
 *	monitor won't handle it (the VM keeps enforcing code-signing), or
 *	an error, in which case execute permissions are stripped from the
 *	entry (unless vmkf_overwrite_immutable is set).
 *
 *	The map must be locked exclusively by the caller.
 */
vm_map_entry_cs_associate(
	vm_map_t map,
	vm_map_entry_t entry,
	vm_map_kernel_flags_t vmk_flags)
{
	vm_object_t cs_object, cs_shadow, backing_object;
	vm_object_offset_t cs_offset, backing_offset;
	void *cs_blobs;
	struct vnode *cs_vnode;
	kern_return_t cs_ret;

	/* nothing to associate: no pmap, submap, exempt address space, or no object */
	if (map->pmap == NULL ||
	    entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
	    (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
	    VME_OBJECT(entry) == VM_OBJECT_NULL) {
		return KERN_SUCCESS;
	}

	if (!(entry->protection & VM_PROT_EXECUTE)) {
		/*
		 * This memory region is not executable, so the code-signing
		 * monitor would usually not care about it...
		 */
		if (vmk_flags.vmkf_remap_prot_copy &&
		    (entry->max_protection & VM_PROT_EXECUTE)) {
			/*
			 * ... except if the memory region is being remapped
			 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
			 * which is what a debugger or dtrace would be doing
			 * to prepare to modify an executable page to insert
			 * a breakpoint or activate a probe.
			 * In that case, fall through so that we can mark
			 * this region as being "debugged" and no longer
			 * strictly code-signed.
			 */
		} else {
			/*
			 * Really not executable, so no need to tell the
			 * code-signing monitor.
			 */
			return KERN_SUCCESS;
		}
	}

	vm_map_lock_assert_exclusive(map);

	/*
	 * Check for a debug association mapping before we check for used_for_jit. This
	 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
	 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
	 * since they are mapped with RW or RX permissions, which the page table monitor
	 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
	 * they will be mapped as USER_EXEC, and that will cause another page table monitor
	 * violation when those USER_EXEC pages are mapped as RW.
	 *
	 * Since these pages switch between RW and RX through mprotect, they mimic what
	 * we expect a debugger to do. As the code signing monitor does not enforce mappings
	 * on macOS systems, this works in our favor here and allows us to continue to
	 * support these legacy-programmed applications without sacrificing security on
	 * the page table or the code signing monitor. We don't need to explicitly check
	 * for entry_for_jit here and the mapping permissions. If the initial mapping is
	 * created with RX, then the application must map it as RW in order to first write
	 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
	 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
	 * Similarly, if the mapping was created as RW, and then switched to RX,
	 * vm_map_protect will again mark the entry as a copy, and both these cases
	 * lead to this if-statement being entered.
	 *
	 * For more information: rdar://115313336.
	 */
	if (vmk_flags.vmkf_remap_prot_copy) {
		cs_ret = csm_associate_debug_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);

		/*
		 * csm_associate_debug_region returns not supported when the code signing
		 * monitor is disabled. This is intentional, since cs_ret is checked towards
		 * the end of the function, and if it is not supported, then we still want the
		 * VM to perform code-signing enforcement on this entry. That said, if we don't
		 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
		 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
		 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
		 * cases, which will cause a violation when attempted to be mapped as writable).
		 */
		if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
			entry->vme_xnu_user_debug = TRUE;
		}
#if DEVELOPMENT || DEBUG
		if (vm_log_xnu_user_debug) {
			printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug=%d cs_ret %d\n",
			    proc_selfpid(),
			    (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
			    __FUNCTION__, __LINE__,
			    map, entry,
			    (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
			    entry->vme_xnu_user_debug,
			    cs_ret);
		}
#endif /* DEVELOPMENT || DEBUG */
		goto done;
	}

	if (entry->used_for_jit) {
		cs_ret = csm_associate_jit_region(
			map->pmap,
			entry->vme_start,
			entry->vme_end - entry->vme_start);
		goto done;
	}

	cs_object = VME_OBJECT(entry);
	vm_object_lock_shared(cs_object);
	cs_offset = VME_OFFSET(entry);

	/* find the VM object backed by the code-signed vnode */
	for (;;) {
		/*
		 * Go to the bottom of cs_object's shadow chain, taking the
		 * shared locks hand-over-hand and accumulating the shadow
		 * offsets into cs_offset.
		 */
		for (;
		    cs_object->shadow != VM_OBJECT_NULL;
		    cs_object = cs_shadow) {
			cs_shadow = cs_object->shadow;
			cs_offset += cs_object->vo_shadow_offset;
			vm_object_lock_shared(cs_shadow);
			vm_object_unlock(cs_object);
		}
		if (cs_object->internal ||
		    cs_object->pager == MEMORY_OBJECT_NULL) {
			/* anonymous memory or no pager: nothing to associate */
			vm_object_unlock(cs_object);
			return KERN_SUCCESS;
		}

		cs_offset += cs_object->paging_offset;

		/*
		 * cs_object could be backed by a:
		 *   vnode_pager
		 *   apple_protect_pager
		 *   shared_region_pager
		 *   fourk_pager (multiple backing objects -> fail?)
		 * ask the pager if it has a backing VM object
		 */
		if (!memory_object_backing_object(cs_object->pager,
		    cs_offset,
		    &backing_object,
		    &backing_offset)) {
			/* no backing object: cs_object is it */
			break;
		}

		/* look down the backing object's shadow chain */
		vm_object_lock_shared(backing_object);
		vm_object_unlock(cs_object);
		cs_object = backing_object;
		cs_offset = backing_offset;
	}

	cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
	if (cs_vnode == NULL) {
		/* no vnode, no code signatures to associate */
		cs_ret = KERN_SUCCESS;
	} else {
		cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
		    &cs_blobs);
		assert(cs_ret == KERN_SUCCESS);
		cs_ret = cs_associate_blob_with_mapping(map->pmap,
		    entry->vme_start,
		    (entry->vme_end - entry->vme_start),
		    cs_offset,
		    cs_blobs);
	}
	vm_object_unlock(cs_object);
	cs_object = VM_OBJECT_NULL;

done:
	if (cs_ret == KERN_SUCCESS) {
		DTRACE_VM2(vm_map_entry_cs_associate_success,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end);
		if (vm_map_executable_immutable) {
			/*
			 * Prevent this executable
			 * mapping from being unmapped
			 * or modified.
			 */
			entry->vme_permanent = TRUE;
		}
		/*
		 * pmap says it will validate the
		 * code-signing validity of pages
		 * faulted in via this mapping, so
		 * this map entry should be marked so
		 * that vm_fault() bypasses code-signing
		 * validation for faults coming through
		 * this mapping.
		 */
		entry->csm_associated = TRUE;
	} else if (cs_ret == KERN_NOT_SUPPORTED) {
		/*
		 * pmap won't check the code-signing
		 * validity of pages faulted in via
		 * this mapping, so VM should keep
		 * doing it.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_off,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
	} else {
		/*
		 * A real error: do not allow
		 * execution in this mapping.
		 */
		DTRACE_VM3(vm_map_entry_cs_associate_failure,
		    vm_map_offset_t, entry->vme_start,
		    vm_map_offset_t, entry->vme_end,
		    int, cs_ret);
		if (vmk_flags.vmkf_overwrite_immutable) {
			/*
			 * We can get here when we remap an apple_protect pager
			 * on top of an already cs_associated executable mapping
			 * with the same code signatures, so we don't want to
			 * lose VM_PROT_EXECUTE in that case...
			 */
		} else {
			entry->protection &= ~VM_PROT_ALLEXEC;
			entry->max_protection &= ~VM_PROT_ALLEXEC;
		}
	}

	return cs_ret;
}
22551
22552 #endif /* CODE_SIGNING_MONITOR */
22553
22554 inline bool
vm_map_is_corpse_source(vm_map_t map)22555 vm_map_is_corpse_source(vm_map_t map)
22556 {
22557 bool status = false;
22558 if (map) {
22559 vm_map_lock_read(map);
22560 status = map->corpse_source;
22561 vm_map_unlock_read(map);
22562 }
22563 return status;
22564 }
22565
22566 inline void
vm_map_set_corpse_source(vm_map_t map)22567 vm_map_set_corpse_source(vm_map_t map)
22568 {
22569 if (map) {
22570 vm_map_lock(map);
22571 map->corpse_source = true;
22572 vm_map_unlock(map);
22573 }
22574 }
22575
22576 inline void
vm_map_unset_corpse_source(vm_map_t map)22577 vm_map_unset_corpse_source(vm_map_t map)
22578 {
22579 if (map) {
22580 vm_map_lock(map);
22581 map->corpse_source = false;
22582 vm_map_unlock(map);
22583 }
22584 }
22585 /*
22586 * FORKED CORPSE FOOTPRINT
22587 *
22588 * A forked corpse gets a copy of the original VM map but its pmap is mostly
22589 * empty since it never ran and never got to fault in any pages.
22590 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
22591 * a forked corpse would therefore return very little information.
22592 *
22593 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
22594 * to vm_map_fork() to collect footprint information from the original VM map
22595 * and its pmap, and store it in the forked corpse's VM map. That information
22596 * is stored in place of the VM map's "hole list" since we'll never need to
 * look up holes in the corpse's map.
22598 *
22599 * The corpse's footprint info looks like this:
22600 *
22601 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
22602 * as follows:
22603 * +---------------------------------------+
22604 * header-> | cf_size |
22605 * +-------------------+-------------------+
22606 * | cf_last_region | cf_last_zeroes |
22607 * +-------------------+-------------------+
22608 * region1-> | cfr_vaddr |
22609 * +-------------------+-------------------+
22610 * | cfr_num_pages | d0 | d1 | d2 | d3 |
22611 * +---------------------------------------+
22612 * | d4 | d5 | ... |
22613 * +---------------------------------------+
22614 * | ... |
22615 * +-------------------+-------------------+
22616 * | dy | dz | na | na | cfr_vaddr... | <-region2
22617 * +-------------------+-------------------+
22618 * | cfr_vaddr (ctd) | cfr_num_pages |
22619 * +---------------------------------------+
22620 * | d0 | d1 ... |
22621 * +---------------------------------------+
22622 * ...
22623 * +---------------------------------------+
22624 * last region-> | cfr_vaddr |
22625 * +---------------------------------------+
22626 * + cfr_num_pages | d0 | d1 | d2 | d3 |
22627 * +---------------------------------------+
22628 * ...
22629 * +---------------------------------------+
22630 * | dx | dy | dz | na | na | na | na | na |
22631 * +---------------------------------------+
22632 *
22633 * where:
22634 * cf_size: total size of the buffer (rounded to page size)
22635 * cf_last_region: offset in the buffer of the last "region" sub-header
22636 * cf_last_zeroes: number of trailing "zero" dispositions at the end
22637 * of last region
22638 * cfr_vaddr: virtual address of the start of the covered "region"
22639 * cfr_num_pages: number of pages in the covered "region"
22640 * d*: disposition of the page at that virtual address
22641 * Regions in the buffer are word-aligned.
22642 *
22643 * We estimate the size of the buffer based on the number of memory regions
22644 * and the virtual size of the address space. While copying each memory region
22645 * during vm_map_fork(), we also collect the footprint info for that region
22646 * and store it in the buffer, packing it as much as possible (coalescing
22647 * contiguous memory regions to avoid having too many region headers and
22648 * avoiding long streaks of "zero" page dispositions by splitting footprint
 * "regions"), so the number of regions in the footprint buffer might not match
22650 * the number of memory regions in the address space.
22651 *
22652 * We also have to copy the original task's "nonvolatile" ledgers since that's
22653 * part of the footprint and will need to be reported to any tool asking for
22654 * the footprint information of the forked corpse.
22655 */
22656
/*
 * Statistics about corpse footprint buffers, updated by the collection
 * code below (not protected by any lock; updated racily).
 */
uint64_t vm_map_corpse_footprint_count = 0;     /* footprints collected so far */
uint64_t vm_map_corpse_footprint_size_avg = 0;  /* running average of final buffer size */
uint64_t vm_map_corpse_footprint_size_max = 0;  /* largest final buffer size seen */
uint64_t vm_map_corpse_footprint_full = 0;      /* collections aborted: buffer full */
uint64_t vm_map_corpse_footprint_no_buf = 0;    /* collections aborted: buffer allocation failed */
22662
/*
 * Header at the start of a corpse footprint buffer (see the layout
 * description in the comment block above).  The "cfu" union field is
 * used as "cf_last_zeroes" while the footprint is being collected and
 * is recycled as "cf_hint_region" once the buffer is only consulted
 * for lookups.
 */
struct vm_map_corpse_footprint_header {
	vm_size_t cf_size; /* allocated buffer size */
	uint32_t cf_last_region; /* offset of last region in buffer */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
/* compact 8-bit page disposition (see vm_page_disposition_to_cf_disp()) */
typedef uint8_t cf_disp_t;
/*
 * Variable-length record describing one contiguous run of pages:
 * a start address, a page count, and one cf_disp_t per page.
 */
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr; /* region start virtual address */
	uint32_t cfr_num_pages; /* number of pages in this "region" */
	cf_disp_t cfr_disposition[0]; /* disposition of each page */
} __attribute__((packed));
22682
22683 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)22684 vm_page_disposition_to_cf_disp(
22685 int disposition)
22686 {
22687 assert(sizeof(cf_disp_t) == 1);
22688 /* relocate bits that don't fit in a "uint8_t" */
22689 if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
22690 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
22691 }
22692 /* cast gets rid of extra bits */
22693 return (cf_disp_t) disposition;
22694 }
22695
22696 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)22697 vm_page_cf_disp_to_disposition(
22698 cf_disp_t cf_disp)
22699 {
22700 int disposition;
22701
22702 assert(sizeof(cf_disp_t) == 1);
22703 disposition = (int) cf_disp;
22704 /* move relocated bits back in place */
22705 if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
22706 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
22707 disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
22708 }
22709 return disposition;
22710 }
22711
22712 /*
22713 * vm_map_corpse_footprint_new_region:
22714 * closes the current footprint "region" and creates a new one
22715 *
22716 * Returns NULL if there's not enough space in the buffer for a new region.
22717 */
22718 static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(struct vm_map_corpse_footprint_header * footprint_header)22719 vm_map_corpse_footprint_new_region(
22720 struct vm_map_corpse_footprint_header *footprint_header)
22721 {
22722 uintptr_t footprint_edge;
22723 uint32_t new_region_offset;
22724 struct vm_map_corpse_footprint_region *footprint_region;
22725 struct vm_map_corpse_footprint_region *new_footprint_region;
22726
22727 footprint_edge = ((uintptr_t)footprint_header +
22728 footprint_header->cf_size);
22729 footprint_region = ((struct vm_map_corpse_footprint_region *)
22730 ((char *)footprint_header +
22731 footprint_header->cf_last_region));
22732 assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
22733 footprint_edge);
22734
22735 /* get rid of trailing zeroes in the last region */
22736 assert(footprint_region->cfr_num_pages >=
22737 footprint_header->cf_last_zeroes);
22738 footprint_region->cfr_num_pages -=
22739 footprint_header->cf_last_zeroes;
22740 footprint_header->cf_last_zeroes = 0;
22741
22742 /* reuse this region if it's now empty */
22743 if (footprint_region->cfr_num_pages == 0) {
22744 return footprint_region;
22745 }
22746
22747 /* compute offset of new region */
22748 new_region_offset = footprint_header->cf_last_region;
22749 new_region_offset += sizeof(*footprint_region);
22750 new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
22751 new_region_offset = roundup(new_region_offset, sizeof(int));
22752
22753 /* check if we're going over the edge */
22754 if (((uintptr_t)footprint_header +
22755 new_region_offset +
22756 sizeof(*footprint_region)) >=
22757 footprint_edge) {
22758 /* over the edge: no new region */
22759 return NULL;
22760 }
22761
22762 /* adjust offset of last region in header */
22763 footprint_header->cf_last_region = new_region_offset;
22764
22765 new_footprint_region = (struct vm_map_corpse_footprint_region *)
22766 ((char *)footprint_header +
22767 footprint_header->cf_last_region);
22768 new_footprint_region->cfr_vaddr = 0;
22769 new_footprint_region->cfr_num_pages = 0;
22770 /* caller needs to initialize new region */
22771
22772 return new_footprint_region;
22773 }
22774
22775 /*
22776 * vm_map_corpse_footprint_collect:
22777 * collect footprint information for "old_entry" in "old_map" and
22778 * stores it in "new_map"'s vmmap_footprint_info.
22779 */
kern_return_t
vm_map_corpse_footprint_collect(
	vm_map_t old_map,
	vm_map_entry_t old_entry,
	vm_map_t new_map)
{
	vm_map_offset_t va;
	kern_return_t kr;
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	struct vm_map_corpse_footprint_region *new_footprint_region;
	cf_disp_t *next_disp_p;
	uintptr_t footprint_edge;   /* first byte past the end of the buffer */
	uint32_t num_pages_tmp;     /* scratch for overflow checks only */
	int effective_page_size;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));

	va = old_entry->vme_start;

	/* both maps must be locked exclusively by the caller */
	vm_map_lock_assert_exclusive(old_map);
	vm_map_lock_assert_exclusive(new_map);

	assert(new_map->has_corpse_footprint);
	assert(!old_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    old_map->has_corpse_footprint) {
		/*
		 * This can only transfer footprint info from a
		 * map with a live pmap to a map with a corpse footprint.
		 */
		return KERN_NOT_SUPPORTED;
	}

	if (new_map->vmmap_corpse_footprint == NULL) {
		/* first entry collected: allocate the footprint buffer */
		vm_offset_t buf;
		vm_size_t buf_size;

		buf = 0;
		/*
		 * Estimate the buffer size: one header, one region header
		 * (plus up to 3 bytes of alignment padding) per map entry,
		 * and one disposition byte per page of the address space.
		 */
		buf_size = (sizeof(*footprint_header) +
		    (old_map->hdr.nentries
		    *
		    (sizeof(*footprint_region) +
		    +3)) /* potential alignment for each region */
		    +
		    ((old_map->size / effective_page_size)
		    *
		    sizeof(cf_disp_t))); /* disposition for each page */
//		printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
		buf_size = round_page(buf_size);

		/* limit buffer to 1 page to validate overflow detection */
//		buf_size = PAGE_SIZE;

		/* limit size to a somewhat sane amount */
#if XNU_TARGET_OS_OSX
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (8*1024*1024)   /* 8MB */
#else /* XNU_TARGET_OS_OSX */
#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE   (256*1024)      /* 256KB */
#endif /* XNU_TARGET_OS_OSX */
		if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
			buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
		}

		/*
		 * Allocate the pageable buffer (with a trailing guard page).
		 * It will be zero-filled on demand.
		 */
		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
		    VM_KERN_MEMORY_DIAG);
		if (kr != KERN_SUCCESS) {
			vm_map_corpse_footprint_no_buf++;
			return kr;
		}

		/* initialize header and 1st region */
		footprint_header = (struct vm_map_corpse_footprint_header *)buf;
		new_map->vmmap_corpse_footprint = footprint_header;

		footprint_header->cf_size = buf_size;
		footprint_header->cf_last_region =
		    sizeof(*footprint_header);
		footprint_header->cf_last_zeroes = 0;

		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
		footprint_region->cfr_vaddr = 0;
		footprint_region->cfr_num_pages = 0;
	} else {
		/* retrieve header and last region */
		footprint_header = (struct vm_map_corpse_footprint_header *)
		    new_map->vmmap_corpse_footprint;
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header +
		    footprint_header->cf_last_region);
	}
	footprint_edge = ((uintptr_t)footprint_header +
	    footprint_header->cf_size);

	/* does this entry start right where the last region left off? */
	if ((footprint_region->cfr_vaddr +
	    (((vm_map_offset_t)footprint_region->cfr_num_pages) *
	    effective_page_size))
	    != old_entry->vme_start) {
		uint64_t num_pages_delta, num_pages_delta_size;
		uint32_t region_offset_delta_size;

		/*
		 * Not the next contiguous virtual address:
		 * start a new region or store "zero" dispositions for
		 * the missing pages?
		 */
		/* size of gap in actual page dispositions */
		num_pages_delta = ((old_entry->vme_start -
		    footprint_region->cfr_vaddr) / effective_page_size)
		    - footprint_region->cfr_num_pages;
		num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
		/* size of gap as a new footprint region header */
		region_offset_delta_size =
		    (sizeof(*footprint_region) +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
		    sizeof(int)) -
		    ((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
//		printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
		if (region_offset_delta_size < num_pages_delta_size ||
		    os_add3_overflow(footprint_region->cfr_num_pages,
		    (uint32_t) num_pages_delta,
		    1,
		    &num_pages_tmp)) {
			/*
			 * Storing data for this gap would take more space
			 * than inserting a new footprint region header:
			 * let's start a new region and save space. If it's a
			 * tie, let's avoid using a new region, since that
			 * would require more region hops to find the right
			 * range during lookups.
			 *
			 * If the current region's cfr_num_pages would overflow
			 * if we added "zero" page dispositions for the gap,
			 * no choice but to start a new region.
			 */
//			printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(footprint_header);
			/* check that we're not going over the edge */
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			/* initialize new region as empty */
			footprint_region->cfr_vaddr = old_entry->vme_start;
			footprint_region->cfr_num_pages = 0;
		} else {
			/*
			 * Store "zero" page dispositions for the missing
			 * pages.
			 */
//			printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
			for (; num_pages_delta > 0; num_pages_delta--) {
				next_disp_p = (cf_disp_t *)
				    ((uintptr_t) footprint_region +
				    sizeof(*footprint_region));
				next_disp_p += footprint_region->cfr_num_pages;
				/* check that we're not going over the edge */
				if ((uintptr_t)next_disp_p >= footprint_edge) {
					goto over_the_edge;
				}
				/* store "zero" disposition for this gap page */
				footprint_region->cfr_num_pages++;
				*next_disp_p = (cf_disp_t) 0;
				footprint_header->cf_last_zeroes++;
			}
		}
	}

	/* record the disposition of each page of the entry */
	for (va = old_entry->vme_start;
	    va < old_entry->vme_end;
	    va += effective_page_size) {
		int disposition;
		cf_disp_t cf_disp;

		vm_map_footprint_query_page_info(old_map,
		    old_entry,
		    va,
		    &disposition);
		cf_disp = vm_page_disposition_to_cf_disp(disposition);

//		if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);

		if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
			/*
			 * Ignore "zero" dispositions at start of
			 * region: just move start of region.
			 */
			footprint_region->cfr_vaddr += effective_page_size;
			continue;
		}

		/* would region's cfr_num_pages overflow? */
		if (os_add_overflow(footprint_region->cfr_num_pages, 1,
		    &num_pages_tmp)) {
			/* overflow: create a new region */
			new_footprint_region =
			    vm_map_corpse_footprint_new_region(
				footprint_header);
			if (new_footprint_region == NULL) {
				goto over_the_edge;
			}
			footprint_region = new_footprint_region;
			footprint_region->cfr_vaddr = va;
			footprint_region->cfr_num_pages = 0;
		}

		next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
		    sizeof(*footprint_region));
		next_disp_p += footprint_region->cfr_num_pages;
		/* check that we're not going over the edge */
		if ((uintptr_t)next_disp_p >= footprint_edge) {
			goto over_the_edge;
		}
		/* store this disposition */
		*next_disp_p = cf_disp;
		footprint_region->cfr_num_pages++;

		if (cf_disp != 0) {
			/* non-zero disp: break the current zero streak */
			footprint_header->cf_last_zeroes = 0;
			/* done */
			continue;
		}

		/* zero disp: add to the current streak of zeroes */
		footprint_header->cf_last_zeroes++;
		if ((footprint_header->cf_last_zeroes +
		    roundup(((footprint_region->cfr_num_pages -
		    footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
		    (sizeof(int) - 1),
		    sizeof(int))) <
		    (sizeof(*footprint_header))) {
			/*
			 * There are not enough trailing "zero" dispositions
			 * (+ the extra padding we would need for the previous
			 * region); creating a new region would not save space
			 * at this point, so let's keep this "zero" disposition
			 * in this region and reconsider later.
			 */
			continue;
		}
		/*
		 * Create a new region to avoid having too many consecutive
		 * "zero" dispositions.
		 */
		new_footprint_region =
		    vm_map_corpse_footprint_new_region(footprint_header);
		if (new_footprint_region == NULL) {
			goto over_the_edge;
		}
		footprint_region = new_footprint_region;
		/* initialize the new region as empty ... */
		footprint_region->cfr_num_pages = 0;
		/* ... and skip this "zero" disp */
		footprint_region->cfr_vaddr = va + effective_page_size;
	}

	return KERN_SUCCESS;

over_the_edge:
	/* ran out of buffer space: the footprint will be incomplete */
//	printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
	vm_map_corpse_footprint_full++;
	return KERN_RESOURCE_SHORTAGE;
}
23054
23055 /*
23056 * vm_map_corpse_footprint_collect_done:
23057 * completes the footprint collection by getting rid of any remaining
23058 * trailing "zero" dispositions and trimming the unused part of the
23059 * kernel buffer
23060 */
void
vm_map_corpse_footprint_collect_done(
	vm_map_t new_map)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	vm_size_t buf_size, actual_size;
	kern_return_t kr;

	assert(new_map->has_corpse_footprint);
	if (!new_map->has_corpse_footprint ||
	    new_map->vmmap_corpse_footprint == NULL) {
		/* no footprint buffer was collected: nothing to finalize */
		return;
	}

	footprint_header = (struct vm_map_corpse_footprint_header *)
	    new_map->vmmap_corpse_footprint;
	buf_size = footprint_header->cf_size;

	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header +
	    footprint_header->cf_last_region);

	/* get rid of trailing zeroes in last region */
	assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
	footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
	footprint_header->cf_last_zeroes = 0;

	/* number of bytes actually used in the buffer */
	actual_size = (vm_size_t)(footprint_header->cf_last_region +
	    sizeof(*footprint_region) +
	    (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));

//	printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
	/* update the (racy) global footprint-size statistics */
	vm_map_corpse_footprint_size_avg =
	    (((vm_map_corpse_footprint_size_avg *
	    vm_map_corpse_footprint_count) +
	    actual_size) /
	    (vm_map_corpse_footprint_count + 1));
	vm_map_corpse_footprint_count++;
	if (actual_size > vm_map_corpse_footprint_size_max) {
		vm_map_corpse_footprint_size_max = actual_size;
	}

	actual_size = round_page(actual_size);
	if (buf_size > actual_size) {
		/*
		 * Trim the unused tail of the buffer: free everything past
		 * the first page beyond "actual_size" (including the old
		 * trailing guard page), then turn the page kept at
		 * [actual_size, actual_size + PAGE_SIZE) into the new
		 * guard page by making it inaccessible.
		 */
		kr = vm_deallocate(kernel_map,
		    vm_sanitize_wrap_addr((vm_address_t)footprint_header +
		    actual_size + PAGE_SIZE), /* trailing guard page */
		    vm_sanitize_wrap_size(buf_size - actual_size));
		assertf(kr == KERN_SUCCESS,
		    "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
		kr = vm_protect(kernel_map,
		    (vm_address_t)footprint_header + actual_size,
		    PAGE_SIZE,
		    FALSE, /* set_maximum */
		    VM_PROT_NONE);
		assertf(kr == KERN_SUCCESS,
		    "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
		    footprint_header,
		    (uint64_t) buf_size,
		    (uint64_t) actual_size,
		    kr);
	}

	footprint_header->cf_size = actual_size;
}
23131
23132 /*
23133 * vm_map_corpse_footprint_query_page_info:
23134 * retrieves the disposition of the page at virtual address "vaddr"
23135 * in the forked corpse's VM map
23136 *
23137 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23138 */
kern_return_t
vm_map_corpse_footprint_query_page_info(
	vm_map_t map,
	vm_map_offset_t va,
	int *disposition_p)
{
	struct vm_map_corpse_footprint_header *footprint_header;
	struct vm_map_corpse_footprint_region *footprint_region;
	uint32_t footprint_region_offset;
	vm_map_offset_t region_start, region_end;
	int disp_idx;
	kern_return_t kr;
	int effective_page_size;
	cf_disp_t cf_disp;

	if (!map->has_corpse_footprint) {
		/* not a map with collected footprint info */
		*disposition_p = 0;
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	footprint_header = map->vmmap_corpse_footprint;
	if (footprint_header == NULL) {
		*disposition_p = 0;
		// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_INVALID_ARGUMENT;
		goto done;
	}

	/* start looking at the hint ("cf_hint_region") */
	footprint_region_offset = footprint_header->cf_hint_region;

	effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));

lookup_again:
	if (footprint_region_offset < sizeof(*footprint_header)) {
		/* hint too low: start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	if (footprint_region_offset > footprint_header->cf_last_region) {
		/* hint too high: re-start from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
	}
	footprint_region = (struct vm_map_corpse_footprint_region *)
	    ((char *)footprint_header + footprint_region_offset);
	region_start = footprint_region->cfr_vaddr;
	region_end = (region_start +
	    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
	    effective_page_size));
	if (va < region_start &&
	    footprint_region_offset != sizeof(*footprint_header)) {
		/* our range starts before the hint region */

		/* reset the hint (in a racy way...) */
		footprint_header->cf_hint_region = sizeof(*footprint_header);
		/* lookup "va" again from 1st region */
		footprint_region_offset = sizeof(*footprint_header);
		goto lookup_again;
	}

	/* walk the regions forward until one covers "va" (or we run out) */
	while (va >= region_end) {
		if (footprint_region_offset >= footprint_header->cf_last_region) {
			break;
		}
		/* skip the region's header */
		footprint_region_offset += sizeof(*footprint_region);
		/* skip the region's page dispositions */
		footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
		/* align to next word boundary */
		footprint_region_offset =
		    roundup(footprint_region_offset,
		    sizeof(int));
		footprint_region = (struct vm_map_corpse_footprint_region *)
		    ((char *)footprint_header + footprint_region_offset);
		region_start = footprint_region->cfr_vaddr;
		region_end = (region_start +
		    ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
		    effective_page_size));
	}
	if (va < region_start || va >= region_end) {
		/* page not found: report a "zero" disposition, not an error */
		*disposition_p = 0;
		// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
		kr = KERN_SUCCESS;
		goto done;
	}

	/* "va" found: set the lookup hint for next lookup (in a racy way...) */
	footprint_header->cf_hint_region = footprint_region_offset;

	/* get page disposition for "va" in this region */
	disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
	cf_disp = footprint_region->cfr_disposition[disp_idx];
	*disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
	kr = KERN_SUCCESS;
done:
	// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
	/* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
	DTRACE_VM4(footprint_query_page_info,
	    vm_map_t, map,
	    vm_map_offset_t, va,
	    int, *disposition_p,
	    kern_return_t, kr);

	return kr;
}
23245
23246 void
vm_map_corpse_footprint_destroy(vm_map_t map)23247 vm_map_corpse_footprint_destroy(
23248 vm_map_t map)
23249 {
23250 if (map->has_corpse_footprint &&
23251 map->vmmap_corpse_footprint != 0) {
23252 struct vm_map_corpse_footprint_header *footprint_header;
23253 vm_size_t buf_size;
23254 kern_return_t kr;
23255
23256 footprint_header = map->vmmap_corpse_footprint;
23257 buf_size = footprint_header->cf_size;
23258 kr = vm_deallocate(kernel_map,
23259 vm_sanitize_wrap_addr((vm_offset_t) map->vmmap_corpse_footprint),
23260 vm_sanitize_wrap_size(buf_size + PAGE_SIZE)); /* trailing guard page */
23261 assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
23262 map->vmmap_corpse_footprint = 0;
23263 map->has_corpse_footprint = FALSE;
23264 }
23265 }
23266
23267 /*
23268 * vm_map_copy_footprint_ledgers:
23269 * copies any ledger that's relevant to the memory footprint of "old_task"
23270 * into the forked corpse's task ("new_task")
23271 */
23272 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)23273 vm_map_copy_footprint_ledgers(
23274 task_t old_task,
23275 task_t new_task)
23276 {
23277 vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
23278 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
23279 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
23280 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
23281 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
23282 vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
23283 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
23284 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
23285 vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
23286 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
23287 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
23288 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
23289 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
23290 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
23291 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
23292 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
23293 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
23294 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
23295 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
23296 vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23297 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_nofootprint_total);
23298 }
23299
23300 /*
23301 * vm_map_copy_ledger:
23302 * copy a single ledger from "old_task" to "new_task"
23303 */
23304 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)23305 vm_map_copy_ledger(
23306 task_t old_task,
23307 task_t new_task,
23308 int ledger_entry)
23309 {
23310 ledger_amount_t old_balance, new_balance, delta;
23311
23312 assert(new_task->map->has_corpse_footprint);
23313 if (!new_task->map->has_corpse_footprint) {
23314 return;
23315 }
23316
23317 /* turn off sanity checks for the ledger we're about to mess with */
23318 ledger_disable_panic_on_negative(new_task->ledger,
23319 ledger_entry);
23320
23321 /* adjust "new_task" to match "old_task" */
23322 ledger_get_balance(old_task->ledger,
23323 ledger_entry,
23324 &old_balance);
23325 ledger_get_balance(new_task->ledger,
23326 ledger_entry,
23327 &new_balance);
23328 if (new_balance == old_balance) {
23329 /* new == old: done */
23330 } else if (new_balance > old_balance) {
23331 /* new > old ==> new -= new - old */
23332 delta = new_balance - old_balance;
23333 ledger_debit(new_task->ledger,
23334 ledger_entry,
23335 delta);
23336 } else {
23337 /* new < old ==> new += old - new */
23338 delta = old_balance - new_balance;
23339 ledger_credit(new_task->ledger,
23340 ledger_entry,
23341 delta);
23342 }
23343 }
23344
23345 /*
23346 * vm_map_get_pmap:
23347 * returns the pmap associated with the vm_map
23348 */
23349 pmap_t
vm_map_get_pmap(vm_map_t map)23350 vm_map_get_pmap(vm_map_t map)
23351 {
23352 return vm_map_pmap(map);
23353 }
23354
/*
 * vm_map_get_phys_page:
 *	returns the physical page number backing "addr" in "map",
 *	or 0 if no entry / no object / no resident page is found.
 *	Walks down submaps and object shadow chains as needed.
 */
ppnum_t
vm_map_get_phys_page(
	vm_map_t map,
	vm_offset_t addr)
{
	vm_object_offset_t offset;
	vm_object_t object;
	vm_map_offset_t map_offset;
	vm_map_entry_t entry;
	ppnum_t phys_page = 0;

	map_offset = vm_map_trunc_page(addr, PAGE_MASK);

	vm_map_lock(map);
	while (vm_map_lookup_entry(map, map_offset, &entry)) {
		if (entry->is_sub_map) {
			vm_map_t old_map;
			/* descend into the submap: lock child before unlocking parent */
			vm_map_lock(VME_SUBMAP(entry));
			old_map = map;
			map = VME_SUBMAP(entry);
			/* translate the lookup offset into the submap's address space */
			map_offset = (VME_OFFSET(entry) +
			    (map_offset - entry->vme_start));
			vm_map_unlock(old_map);
			continue;
		}
		if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			/* no memory object: no physical page */
			vm_map_unlock(map);
			return (ppnum_t) 0;
		}
		if (VME_OBJECT(entry)->phys_contiguous) {
			/* These are not standard pageable memory mappings */
			/* If they are not present in the object they will */
			/* have to be picked up from the pager through the */
			/* fault mechanism. */
			if (VME_OBJECT(entry)->vo_shadow_offset == 0) {
				/* need to call vm_fault */
				vm_map_unlock(map);
				vm_fault(map, map_offset, VM_PROT_NONE,
				    FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
				    THREAD_UNINT, NULL, 0);
				vm_map_lock(map);
				continue;
			}
			/* physically contiguous: page number is a direct computation */
			offset = (VME_OFFSET(entry) +
			    (map_offset - entry->vme_start));
			phys_page = (ppnum_t)
			    ((VME_OBJECT(entry)->vo_shadow_offset
			    + offset) >> PAGE_SHIFT);
			break;
		}
		offset = (VME_OFFSET(entry) + (map_offset - entry->vme_start));
		object = VME_OBJECT(entry);
		vm_object_lock(object);
		/* walk down the shadow chain looking for a resident page */
		while (TRUE) {
			vm_page_t dst_page = vm_page_lookup(object, offset);
			if (dst_page == VM_PAGE_NULL) {
				if (object->shadow) {
					vm_object_t old_object;
					/* lock the shadow object before dropping the current one */
					vm_object_lock(object->shadow);
					old_object = object;
					offset = offset + object->vo_shadow_offset;
					object = object->shadow;
					vm_object_unlock(old_object);
				} else {
					/* end of chain: page not resident, return 0 */
					vm_object_unlock(object);
					break;
				}
			} else {
				phys_page = (ppnum_t)(VM_PAGE_GET_PHYS_PAGE(dst_page));
				vm_object_unlock(object);
				break;
			}
		}
		break;
	}

	vm_map_unlock(map);
	return phys_page;
}
23434
23435 #if CONFIG_MAP_RANGES
/* bitmaps indexed by VM tag: which tags are steered into which user range */
static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
static bitmap_t vm_map_user_range_large_file_map[BITMAP_LEN(VM_MEMORY_COUNT)];

/* the user range IDs must alias the mach_vm_range constants they mirror */
static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23441
23442 /*
23443 * vm_map_range_map_init:
23444 * initializes the VM range ID map to enable index lookup
23445 * of user VM ranges based on VM tag from userspace.
23446 */
23447 static void
vm_map_range_map_init(void)23448 vm_map_range_map_init(void)
23449 {
23450 /*
23451 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
23452 * - the former is malloc metadata which should be kept separate
23453 * - the latter has its own ranges
23454 */
23455 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
23456 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
23457 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
23458 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
23459 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
23460 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
23461 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
23462 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
23463 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
23464 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
23465 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
23466 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
23467 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
23468 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
23469 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
23470 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
23471 bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOACCELERATOR);
23472 bitmap_set(vm_map_user_range_large_file_map, VM_MEMORY_IOSURFACE);
23473 }
23474
23475 static struct mach_vm_range
vm_map_range_random_uniform(vm_map_size_t req_size,vm_map_offset_t min_addr,vm_map_offset_t max_addr,vm_map_offset_t offmask)23476 vm_map_range_random_uniform(
23477 vm_map_size_t req_size,
23478 vm_map_offset_t min_addr,
23479 vm_map_offset_t max_addr,
23480 vm_map_offset_t offmask)
23481 {
23482 vm_map_offset_t random_addr;
23483 struct mach_vm_range alloc;
23484
23485 req_size = (req_size + offmask) & ~offmask;
23486 min_addr = (min_addr + offmask) & ~offmask;
23487 max_addr = max_addr & ~offmask;
23488
23489 read_random(&random_addr, sizeof(random_addr));
23490 random_addr %= (max_addr - req_size - min_addr);
23491 random_addr &= ~offmask;
23492
23493 alloc.min_address = min_addr + random_addr;
23494 alloc.max_address = min_addr + random_addr + req_size;
23495 return alloc;
23496 }
23497
/*
 * vm_map_range_offmask:
 *	returns the alignment mask (alignment - 1) that user VM ranges
 *	should be carved on so that page-table pages are used fully,
 *	or 0 when user ranges should be disabled on this configuration.
 */
static vm_map_offset_t
vm_map_range_offmask(void)
{
	uint32_t pte_depth;

	/*
	 * PTE optimizations
	 *
	 *
	 * 16k pages systems
	 * ~~~~~~~~~~~~~~~~~
	 *
	 * A single L1 (sub-)page covers the address space.
	 * - L2 pages cover 64G,
	 * - L3 pages cover 32M.
	 *
	 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
	 * As a result, we really only need to align the ranges to 32M to avoid
	 * partial L3 pages.
	 *
	 * On macOS, the usage of L2 pages will increase, so as a result we will
	 * want to align ranges to 64G in order to utilize them fully.
	 *
	 *
	 * 4k pages systems
	 * ~~~~~~~~~~~~~~~~
	 *
	 * A single L0 (sub-)page covers the address space.
	 * - L1 pages cover 512G,
	 * - L2 pages cover 1G,
	 * - L3 pages cover 2M.
	 *
	 * The long tail of processes on a system will tend to have a VA usage
	 * (ignoring the shared regions) in the 100s of MB order of magnitude.
	 * This is achievable with a single L1 and a few L2s without
	 * randomization.
	 *
	 * However once randomization is introduced, the system will immediately
	 * need several L1s and many more L2s. As a result:
	 *
	 * - on embedded devices, the cost of these extra pages isn't
	 *   sustainable, and we just disable the feature entirely,
	 *
	 * - on macOS we align ranges to a 512G boundary so that the extra L1
	 *   pages can be used to their full potential.
	 */

	/*
	 * note, this function assumes _non exotic mappings_
	 * which is why it uses the native kernel's PAGE_SHIFT.
	 */
#if XNU_PLATFORM_MacOSX
	pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
#else /* !XNU_PLATFORM_MacOSX */
	pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
#endif /* !XNU_PLATFORM_MacOSX */

	if (pte_depth == 0) {
		/* ranges disabled on this configuration */
		return 0;
	}

	/* alignment = span of a PTE page "pte_depth" levels above leaf, minus 1 */
	return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
}
23561
23562 /*
23563 * vm_map_range_configure:
23564 * configures the user vm_map ranges by increasing the maximum VA range of
23565 * the map and carving out a range at the end of VA space (searching backwards
23566 * in the newly expanded map).
23567 */
kern_return_t
vm_map_range_configure(vm_map_t map, __unused bool needs_extra_jumbo_va)
{
	const vm_map_offset_t offmask = vm_map_range_offmask();
	struct mach_vm_range data_range;
	vm_map_offset_t default_end;
	kern_return_t kr;

	if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
		/*
		 * No point doing vm ranges in a 32bit address space.
		 */
		return KERN_NOT_SUPPORTED;
	}

	/* Should not be applying ranges to kernel map or kernel map submaps */
	assert(vm_map_pmap(map) != kernel_pmap);

#if XNU_PLATFORM_MacOSX

	/*
	 * on macOS, the address space is a massive 47 bits (128T),
	 * with several carve outs that processes can't use:
	 * - the shared region
	 * - the commpage region
	 * - the GPU carve out (if applicable)
	 *
	 * and when nano-malloc is in use it desires memory at the 96T mark.
	 *
	 * However, their location is architecture dependent:
	 * - On intel, the shared region and commpage are
	 *   at the very end of the usable address space (above +127T),
	 *   and there is no GPU carve out, and pthread wants to place
	 *   threads at the 112T mark (0x70T).
	 *
	 * - On arm64, these are in the same spot as on embedded devices:
	 *   o shared region:   [ 6G, 10G)  [ will likely grow over time ]
	 *   o commpage region: [63G, 64G)
	 *   o GPU carve out:   [64G, 448G)
	 *
	 * This is convenient because the mappings at the end of the address
	 * space (when they exist) are made by the kernel.
	 *
	 * The policy is to allocate a random 1T for the data heap
	 * in the end of the address-space in the:
	 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
	 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
	 */

	/* see NANOZONE_SIGNATURE in libmalloc */
#if __x86_64__
	default_end = 0x71ull << 40;
#else
	default_end = 0x61ull << 40;
#endif
	data_range = vm_map_range_random_uniform(1ull << 40,
	    default_end, 0x7full << 40, offmask);

#else /* !XNU_PLATFORM_MacOSX */

	/*
	 * Embedded devices:
	 *
	 * The default VA Size scales with the device physical memory.
	 *
	 * Out of that:
	 * - the "zero" page typically uses 4G + some slide
	 * - the shared region uses SHARED_REGION_SIZE bytes (4G)
	 *
	 * Without the use of jumbo or any adjustment to the address space,
	 * a default VM map typically looks like this:
	 *
	 *        0G -->╒════════════╕
	 *              │  pagezero  │
	 *              │  + slide   │
	 *       ~4G -->╞════════════╡<-- vm_map_min(map)
	 *              │            │
	 *        6G -->├────────────┤
	 *              │   shared   │
	 *              │   region   │
	 *       10G -->├────────────┤
	 *              │            │
	 *    max_va -->├────────────┤<-- vm_map_max(map)
	 *              │            │
	 *              ╎   jumbo    ╎
	 *              ╎            ╎
	 *              │            │
	 *       63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
	 *              │  commpage  │
	 *       64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
	 *              │            │
	 *              ╎    GPU     ╎
	 *              ╎  carveout  ╎
	 *              │            │
	 *      448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
	 *              │            │
	 *              ╎            ╎
	 *              ╎            ╎
	 *              │            │
	 *      512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
	 *
	 * When this drawing was made, "max_va" was smaller than
	 * ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
	 * 12G of address space for the zero-page, slide, files,
	 * binaries, heap ...
	 *
	 * We will want to make a "heap/data" carve out inside
	 * the jumbo range of half of that usable space, assuming
	 * that this is less than a fourth of the jumbo range.
	 *
	 * The assert below intends to catch when max_va grows
	 * too large for this heuristic.
	 */

	vm_map_lock_read(map);
	default_end = vm_map_max(map);
	vm_map_unlock_read(map);

	/*
	 * Check that we're not already jumbo'd,
	 * or our address space was somehow modified.
	 *
	 * If so we cannot guarantee that we can set up the ranges
	 * safely without interfering with the existing map.
	 */
	if (default_end > vm_compute_max_offset(true)) {
		return KERN_NO_SPACE;
	}

	if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
		/*
		 * an override boot-arg was set, disable user-ranges
		 *
		 * XXX: this is problematic because it means these boot-args
		 *      no longer test the behavior changing the value
		 *      of ARM64_MAX_OFFSET_DEVICE_* would have.
		 */
		return KERN_NOT_SUPPORTED;
	}

	/* expand the default VM space to 64GB */
	vm_map_set_jumbo(map);

	/* heuristic guard: 3.5x the carve-out size must fit past default_end */
	assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
	data_range = vm_map_range_random_uniform(GiB(10),
	    default_end + PAGE_SIZE, vm_map_max(map), offmask);

#endif /* !XNU_PLATFORM_MacOSX */

	/*
	 * Poke holes so that ASAN or people listing regions
	 * do not think this space is free.
	 */

	if (default_end != data_range.min_address) {
		/* permanent PROT_NONE entry between old map end and the data range */
		kr = vm_map_enter(map, &default_end,
		    data_range.min_address - default_end,
		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
		assert(kr == KERN_SUCCESS);
	}

	if (data_range.max_address != vm_map_max(map)) {
		vm_map_entry_t entry;
		vm_size_t size;

		/*
		 * Extend the end of the hole to the next VM entry or the end of the map,
		 * whichever comes first.
		 */
		vm_map_lock_read(map);
		vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
		if (entry == vm_map_to_entry(map) || entry->vme_start > vm_map_max(map)) {
			size = vm_map_max(map) - data_range.max_address;
		} else {
			size = entry->vme_start - data_range.max_address;
		}
		vm_map_unlock_read(map);

		kr = vm_map_enter(map, &data_range.max_address, size,
		    0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
		    0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
		assert(kr == KERN_SUCCESS);
	}

#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	if (needs_extra_jumbo_va) {
		/* This will grow the address space to MACH_VM_MAX_ADDRESS */
		vm_map_set_extra_jumbo(map);
	}
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */

	/* publish the ranges under the map lock */
	vm_map_lock(map);
	map->default_range.min_address = vm_map_min(map);
	map->default_range.max_address = default_end;
	map->data_range = data_range;
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	/* If process has "extra jumbo" entitlement, enable large file range */
	if (needs_extra_jumbo_va) {
		map->large_file_range = vm_map_range_random_uniform(TiB(1),
		    MACH_VM_JUMBO_ADDRESS, MACH_VM_MAX_ADDRESS, offmask);
	}
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
	map->uses_user_ranges = true;
	vm_map_unlock(map);

	return KERN_SUCCESS;
}
23776
23777 /*
23778 * vm_map_range_fork:
23779 * clones the array of ranges from old_map to new_map in support
23780 * of a VM map fork.
23781 */
23782 void
vm_map_range_fork(vm_map_t new_map,vm_map_t old_map)23783 vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
23784 {
23785 if (!old_map->uses_user_ranges) {
23786 /* nothing to do */
23787 return;
23788 }
23789
23790 new_map->default_range = old_map->default_range;
23791 new_map->data_range = old_map->data_range;
23792
23793 if (old_map->extra_ranges_count) {
23794 vm_map_user_range_t otable, ntable;
23795 uint16_t count;
23796
23797 otable = old_map->extra_ranges;
23798 count = old_map->extra_ranges_count;
23799 ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
23800 Z_WAITOK | Z_ZERO | Z_NOFAIL);
23801 memcpy(ntable, otable,
23802 count * sizeof(struct vm_map_user_range));
23803
23804 new_map->extra_ranges_count = count;
23805 new_map->extra_ranges = ntable;
23806 }
23807
23808 new_map->uses_user_ranges = true;
23809 }
23810
23811 /*
23812 * vm_map_get_user_range:
23813 * copy the VM user range for the given VM map and range ID.
23814 */
23815 kern_return_t
vm_map_get_user_range(vm_map_t map,vm_map_range_id_t range_id,mach_vm_range_t range)23816 vm_map_get_user_range(
23817 vm_map_t map,
23818 vm_map_range_id_t range_id,
23819 mach_vm_range_t range)
23820 {
23821 if (map == NULL || !map->uses_user_ranges || range == NULL) {
23822 return KERN_INVALID_ARGUMENT;
23823 }
23824
23825 switch (range_id) {
23826 case UMEM_RANGE_ID_DEFAULT:
23827 *range = map->default_range;
23828 return KERN_SUCCESS;
23829
23830 case UMEM_RANGE_ID_HEAP:
23831 *range = map->data_range;
23832 return KERN_SUCCESS;
23833
23834 case UMEM_RANGE_ID_LARGE_FILE:
23835 /*
23836 * Because this function tells a user-space process about the user
23837 * ranges in its VM map, this case communicates whether the large file
23838 * range is in use. Note that this is different from how the large file
23839 * range ID is handled in `vm_map_get_range()`: there, we "resolve" the
23840 * VA policy and return either the large file range or data range,
23841 * depending on whether the large file range is enabled.
23842 */
23843 if (map->large_file_range.min_address != map->large_file_range.max_address) {
23844 /* large file range is configured and should be used */
23845 *range = map->large_file_range;
23846 } else {
23847 return KERN_INVALID_ARGUMENT;
23848 }
23849 return KERN_SUCCESS;
23850
23851 default:
23852 return KERN_INVALID_ARGUMENT;
23853 }
23854 }
23855
23856 static vm_map_range_id_t
vm_map_user_range_resolve(vm_map_t map,mach_vm_address_t addr,mach_vm_size_t size,mach_vm_range_t range)23857 vm_map_user_range_resolve(
23858 vm_map_t map,
23859 mach_vm_address_t addr,
23860 mach_vm_size_t size,
23861 mach_vm_range_t range)
23862 {
23863 struct mach_vm_range tmp;
23864
23865 vm_map_lock_assert_held(map);
23866
23867 static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23868 static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23869
23870 if (mach_vm_range_contains(&map->default_range, addr, size)) {
23871 if (range) {
23872 *range = map->default_range;
23873 }
23874 return UMEM_RANGE_ID_DEFAULT;
23875 }
23876
23877 if (mach_vm_range_contains(&map->data_range, addr, size)) {
23878 if (range) {
23879 *range = map->data_range;
23880 }
23881 return UMEM_RANGE_ID_HEAP;
23882 }
23883
23884 if (mach_vm_range_contains(&map->large_file_range, addr, size)) {
23885 if (range) {
23886 *range = map->large_file_range;
23887 }
23888 return UMEM_RANGE_ID_LARGE_FILE;
23889 }
23890
23891 for (size_t i = 0; i < map->extra_ranges_count; i++) {
23892 vm_map_user_range_t r = &map->extra_ranges[i];
23893
23894 tmp.min_address = r->vmur_min_address;
23895 tmp.max_address = r->vmur_max_address;
23896
23897 if (mach_vm_range_contains(&tmp, addr, size)) {
23898 if (range) {
23899 *range = tmp;
23900 }
23901 return r->vmur_range_id;
23902 }
23903 }
23904
23905 if (range) {
23906 range->min_address = range->max_address = 0;
23907 }
23908 return UMEM_RANGE_ID_DEFAULT;
23909 }
23910 #endif /* CONFIG_MAP_RANGES */
23911
/*
 * vm_map_kernel_flags_update_range_id:
 *	picks a range ID for an allocation that didn't specify one,
 *	based on the target map (kernel vs user) and, for user maps
 *	with CONFIG_MAP_RANGES, the VM tag and allocation size.
 */
void
vm_map_kernel_flags_update_range_id(
	vm_map_kernel_flags_t *vmkf,
	vm_map_t map,
	__unused vm_map_size_t size)
{
	if (map == kernel_map) {
		/* kernel allocations with no explicit range go to the data range */
		if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
			vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
		}
#if CONFIG_MAP_RANGES
	} else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
	    vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT) {
		/* steer by tag (see vm_map_range_map_init) or by size threshold */
		if (bitmap_test(vm_map_user_range_large_file_map, vmkf->vm_tag)
		    || size >= VM_LARGE_FILE_THRESHOLD) {
			/*
			 * if the map doesn't have the large file range configured,
			 * the range will get resolved to the heap range in `vm_map_get_range`
			 */
			vmkf->vmkf_range_id = UMEM_RANGE_ID_LARGE_FILE;
		} else if (bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
			vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
		}
#endif /* CONFIG_MAP_RANGES */
	}
}
23938
23939 /*
23940 * vm_map_entry_has_device_pager:
23941 * Check if the vm map entry specified by the virtual address has a device pager.
23942 * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
23943 */
23944 boolean_t
vm_map_entry_has_device_pager(vm_map_t map,vm_map_offset_t vaddr)23945 vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
23946 {
23947 vm_map_entry_t entry;
23948 vm_object_t object;
23949 boolean_t result;
23950
23951 if (map == NULL) {
23952 return FALSE;
23953 }
23954
23955 vm_map_lock(map);
23956 while (TRUE) {
23957 if (!vm_map_lookup_entry(map, vaddr, &entry)) {
23958 result = FALSE;
23959 break;
23960 }
23961 if (entry->is_sub_map) {
23962 // Check the submap
23963 vm_map_t submap = VME_SUBMAP(entry);
23964 assert(submap != NULL);
23965 vm_map_lock(submap);
23966 vm_map_unlock(map);
23967 map = submap;
23968 continue;
23969 }
23970 object = VME_OBJECT(entry);
23971 if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
23972 result = TRUE;
23973 break;
23974 }
23975 result = FALSE;
23976 break;
23977 }
23978
23979 vm_map_unlock(map);
23980 return result;
23981 }
23982
23983
23984 #if MACH_ASSERT
23985
/* boot-arg-controlled policy for imbalanced pmap ledgers (defined in pmap code) */
extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

/*
 * LEDGER_DRIFT:
 *	declares the over/under drift counters (count, running total, max)
 *	tracked for one ledger inside pmap_ledgers_drift below.
 */
#define LEDGER_DRIFT(__LEDGER)                  \
	int __LEDGER##_over;                    \
	ledger_amount_t __LEDGER##_over_total;  \
	ledger_amount_t __LEDGER##_over_max;    \
	int __LEDGER##_under;                   \
	ledger_amount_t __LEDGER##_under_total; \
	ledger_amount_t __LEDGER##_under_max

/* global accumulator of ledger drift observed across all checked pmaps */
struct {
	uint64_t num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint_total);
} pmap_ledgers_drift;
24035
/*
 * vm_map_pmap_check_ledgers:
 *	(debug) checks every footprint-related ledger of a dying pmap for a
 *	non-zero balance, logs and accumulates the drift into
 *	pmap_ledgers_drift, and panics if pmap_ledgers_panic policy says so.
 */
void
vm_map_pmap_check_ledgers(
	pmap_t pmap,
	ledger_t ledger,
	int pid,
	char *procname)
{
	ledger_amount_t bal;
	boolean_t do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * LEDGER_CHECK_BALANCE:
 *	checks one task ledger: a non-zero balance is logged and recorded
 *	in the matching pmap_ledgers_drift counters; "do_panic" is raised
 *	when the ledger demands panic-on-negative or the imbalance exceeds
 *	the configured leeway.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER)                                  \
MACRO_BEGIN                                                             \
	int panic_on_negative = TRUE;                                   \
	ledger_get_balance(ledger,                                      \
	    task_ledgers.__LEDGER,                                      \
	    &bal);                                                      \
	ledger_get_panic_on_negative(ledger,                            \
	    task_ledgers.__LEDGER,                                      \
	    &panic_on_negative);                                        \
	if (bal != 0) {                                                 \
	        if (panic_on_negative ||                                \
	            (pmap_ledgers_panic &&                              \
	            pmap_ledgers_panic_leeway > 0 &&                    \
	            (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||   \
	            bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
	                do_panic = TRUE;                                \
	        }                                                       \
	        printf("LEDGER BALANCE proc %d (%s) "                   \
	            "\"%s\" = %lld\n",                                  \
	            pid, procname, #__LEDGER, bal);                     \
	        if (bal > 0) {                                          \
	                pmap_ledgers_drift.__LEDGER##_over++;           \
	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
	                }                                               \
	        } else if (bal < 0) {                                   \
	                pmap_ledgers_drift.__LEDGER##_under++;          \
	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
	                }                                               \
	        }                                                       \
	}                                                               \
MACRO_END

	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint_total);

	if (do_panic) {
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
24131
/*
 * vm_map_pmap_set_process:
 *	(debug) tags the map's pmap with the owning process's pid and name,
 *	for ledger/panic diagnostics.
 */
void
vm_map_pmap_set_process(
	vm_map_t map,
	int pid,
	char *procname)
{
	pmap_set_process(vm_map_pmap(map), pid, procname);
}
24140
24141 #endif /* MACH_ASSERT */
24142