1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_map.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Virtual memory mapping module.
64 */
65
66 #include <mach_assert.h>
67
68 #include <vm/vm_options.h>
69
70 #include <libkern/OSAtomic.h>
71
72 #include <mach/kern_return.h>
73 #include <mach/port.h>
74 #include <mach/vm_attributes.h>
75 #include <mach/vm_param.h>
76 #include <mach/vm_behavior.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/memory_object.h>
79 #include <mach/mach_vm.h>
80 #include <machine/cpu_capabilities.h>
81 #include <mach/sdt.h>
82
83 #include <kern/assert.h>
84 #include <kern/backtrace.h>
85 #include <kern/counter.h>
86 #include <kern/exc_guard.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89
90 #include <vm/cpm.h>
91 #include <vm/vm_compressor.h>
92 #include <vm/vm_compressor_pager.h>
93 #include <vm/vm_init.h>
94 #include <vm/vm_fault.h>
95 #include <vm/vm_map_internal.h>
96 #include <vm/vm_object.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_pageout.h>
99 #include <vm/pmap.h>
100 #include <vm/vm_kern.h>
101 #include <ipc/ipc_port.h>
102 #include <kern/sched_prim.h>
103 #include <kern/misc_protos.h>
104
105 #include <mach/vm_map_server.h>
106 #include <mach/mach_host_server.h>
107 #include <vm/vm_protos.h>
108 #include <vm/vm_purgeable_internal.h>
109
111 #include <vm/vm_shared_region.h>
112 #include <vm/vm_map_store.h>
113
114 #include <san/kasan.h>
115
116 #include <sys/resource.h>
117 #include <sys/codesign.h>
118 #include <sys/mman.h>
119 #include <sys/reboot.h>
120 #include <sys/kdebug_triage.h>
121
122 #if __LP64__
123 #define HAVE_VM_MAP_RESERVED_ENTRY_ZONE 0
124 #else
125 #define HAVE_VM_MAP_RESERVED_ENTRY_ZONE 1
126 #endif
127
128 #include <libkern/section_keywords.h>
129 #if DEVELOPMENT || DEBUG
130 extern int proc_selfcsflags(void);
131 int panic_on_unsigned_execute = 0;
132 int panic_on_mlock_failure = 0;
133 #endif /* DEVELOPMENT || DEBUG */
134
135 #if MACH_ASSERT
136 int debug4k_filter = 0;
137 char debug4k_proc_name[1024] = "";
138 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
139 int debug4k_panic_on_misaligned_sharing = 0;
140 const char *debug4k_category_name[] = {
141 "error", /* 0 */
142 "life", /* 1 */
143 "load", /* 2 */
144 "fault", /* 3 */
145 "copy", /* 4 */
146 "share", /* 5 */
147 "adjust", /* 6 */
148 "pmap", /* 7 */
149 "mementry", /* 8 */
150 "iokit", /* 9 */
151 "upl", /* 10 */
152 "exc", /* 11 */
153 "vfs" /* 12 */
154 };
155 #endif /* MACH_ASSERT */
156 int debug4k_no_cow_copyin = 0;
157
158
159 #if __arm64__
160 extern const int fourk_binary_compatibility_unsafe;
161 extern const int fourk_binary_compatibility_allow_wx;
162 #endif /* __arm64__ */
163 extern int proc_selfpid(void);
164 extern char *proc_name_address(void *p);
165
166 #if VM_MAP_DEBUG_APPLE_PROTECT
167 int vm_map_debug_apple_protect = 0;
168 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
169 #if VM_MAP_DEBUG_FOURK
170 int vm_map_debug_fourk = 0;
171 #endif /* VM_MAP_DEBUG_FOURK */
172
173 #if DEBUG || DEVELOPMENT
174 static TUNABLE(bool, vm_map_executable_immutable,
175 "vm_map_executable_immutable", true);
176 #else
177 #define vm_map_executable_immutable true
178 #endif
179
180 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
181
182 extern u_int32_t random(void); /* from <libkern/libkern.h> */
183 /* Internal prototypes
184 */
185
186 typedef struct vm_map_zap {
187 vm_map_entry_t vmz_head;
188 vm_map_entry_t *vmz_tail;
189 } *vm_map_zap_t;
190
191 #define VM_MAP_ZAP_DECLARE(zap) \
192 struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
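/*
 * Illustrative usage sketch (added for clarity, not from the original
 * source): a "zap" list is a singly-linked list with a tail pointer that
 * collects map entries removed while the map lock is held, so that their
 * object/submap references can be dropped after the map is unlocked.
 * A typical caller, modeled on vm_map_destroy() below, looks roughly like:
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, KMEM_GUARD_NONE, &zap);
 *	vm_map_unlock(map);
 *	vm_map_zap_dispose(&zap);	// releases objects/submaps, frees entries
 */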
193
194 static vm_map_entry_t vm_map_entry_insert(
195 vm_map_t map,
196 vm_map_entry_t insp_entry,
197 vm_map_offset_t start,
198 vm_map_offset_t end,
199 vm_object_t object,
200 vm_object_offset_t offset,
201 vm_map_kernel_flags_t vmk_flags,
202 boolean_t needs_copy,
203 vm_prot_t cur_protection,
204 vm_prot_t max_protection,
205 vm_inherit_t inheritance,
206 boolean_t no_cache,
207 boolean_t permanent,
208 unsigned int superpage_size,
209 boolean_t clear_map_aligned,
210 int alias);
211
212 static void vm_map_simplify_range(
213 vm_map_t map,
214 vm_map_offset_t start,
215 vm_map_offset_t end); /* forward */
216
217 static boolean_t vm_map_range_check(
218 vm_map_t map,
219 vm_map_offset_t start,
220 vm_map_offset_t end,
221 vm_map_entry_t *entry);
222
223 static void vm_map_submap_pmap_clean(
224 vm_map_t map,
225 vm_map_offset_t start,
226 vm_map_offset_t end,
227 vm_map_t sub_map,
228 vm_map_offset_t offset);
229
230 static void vm_map_pmap_enter(
231 vm_map_t map,
232 vm_map_offset_t addr,
233 vm_map_offset_t end_addr,
234 vm_object_t object,
235 vm_object_offset_t offset,
236 vm_prot_t protection);
237
238 static void _vm_map_clip_end(
239 struct vm_map_header *map_header,
240 vm_map_entry_t entry,
241 vm_map_offset_t end);
242
243 static void _vm_map_clip_start(
244 struct vm_map_header *map_header,
245 vm_map_entry_t entry,
246 vm_map_offset_t start);
247
248 static kmem_return_t vm_map_delete(
249 vm_map_t map,
250 vm_map_offset_t start,
251 vm_map_offset_t end,
252 vmr_flags_t flags,
253 kmem_guard_t guard,
254 vm_map_zap_t zap);
255
256 static void vm_map_copy_insert(
257 vm_map_t map,
258 vm_map_entry_t after_where,
259 vm_map_copy_t copy);
260
261 static kern_return_t vm_map_copy_overwrite_unaligned(
262 vm_map_t dst_map,
263 vm_map_entry_t entry,
264 vm_map_copy_t copy,
265 vm_map_address_t start,
266 boolean_t discard_on_success);
267
268 static kern_return_t vm_map_copy_overwrite_aligned(
269 vm_map_t dst_map,
270 vm_map_entry_t tmp_entry,
271 vm_map_copy_t copy,
272 vm_map_offset_t start,
273 pmap_t pmap);
274
275 static kern_return_t vm_map_copyin_kernel_buffer(
276 vm_map_t src_map,
277 vm_map_address_t src_addr,
278 vm_map_size_t len,
279 boolean_t src_destroy,
280 vm_map_copy_t *copy_result); /* OUT */
281
282 static kern_return_t vm_map_copyout_kernel_buffer(
283 vm_map_t map,
284 vm_map_address_t *addr, /* IN/OUT */
285 vm_map_copy_t copy,
286 vm_map_size_t copy_size,
287 boolean_t overwrite,
288 boolean_t consume_on_success);
289
290 static void vm_map_fork_share(
291 vm_map_t old_map,
292 vm_map_entry_t old_entry,
293 vm_map_t new_map);
294
295 static boolean_t vm_map_fork_copy(
296 vm_map_t old_map,
297 vm_map_entry_t *old_entry_p,
298 vm_map_t new_map,
299 int vm_map_copyin_flags);
300
301 static kern_return_t vm_map_wire_nested(
302 vm_map_t map,
303 vm_map_offset_t start,
304 vm_map_offset_t end,
305 vm_prot_t caller_prot,
306 vm_tag_t tag,
307 boolean_t user_wire,
308 pmap_t map_pmap,
309 vm_map_offset_t pmap_addr,
310 ppnum_t *physpage_p);
311
312 static kern_return_t vm_map_unwire_nested(
313 vm_map_t map,
314 vm_map_offset_t start,
315 vm_map_offset_t end,
316 boolean_t user_wire,
317 pmap_t map_pmap,
318 vm_map_offset_t pmap_addr);
319
320 static kern_return_t vm_map_overwrite_submap_recurse(
321 vm_map_t dst_map,
322 vm_map_offset_t dst_addr,
323 vm_map_size_t dst_size);
324
325 static kern_return_t vm_map_copy_overwrite_nested(
326 vm_map_t dst_map,
327 vm_map_offset_t dst_addr,
328 vm_map_copy_t copy,
329 boolean_t interruptible,
330 pmap_t pmap,
331 boolean_t discard_on_success);
332
333 static kern_return_t vm_map_remap_extract(
334 vm_map_t map,
335 vm_map_offset_t addr,
336 vm_map_size_t size,
337 boolean_t copy,
338 struct vm_map_header *map_header,
339 vm_prot_t *cur_protection,
340 vm_prot_t *max_protection,
341 vm_inherit_t inheritance,
342 vm_map_kernel_flags_t vmk_flags);
343
344 static kern_return_t vm_map_remap_range_allocate(
345 vm_map_t map,
346 vm_map_address_t *address,
347 vm_map_size_t size,
348 vm_map_offset_t mask,
349 int flags,
350 vm_map_kernel_flags_t vmk_flags,
351 vm_tag_t tag,
352 vm_map_entry_t *map_entry,
353 vm_map_zap_t zap_list);
354
355 static void vm_map_region_look_for_page(
356 vm_map_t map,
357 vm_map_offset_t va,
358 vm_object_t object,
359 vm_object_offset_t offset,
360 int max_refcnt,
361 unsigned short depth,
362 vm_region_extended_info_t extended,
363 mach_msg_type_number_t count);
364
365 static int vm_map_region_count_obj_refs(
366 vm_map_entry_t entry,
367 vm_object_t object);
368
369
370 static kern_return_t vm_map_willneed(
371 vm_map_t map,
372 vm_map_offset_t start,
373 vm_map_offset_t end);
374
375 static kern_return_t vm_map_reuse_pages(
376 vm_map_t map,
377 vm_map_offset_t start,
378 vm_map_offset_t end);
379
380 static kern_return_t vm_map_reusable_pages(
381 vm_map_t map,
382 vm_map_offset_t start,
383 vm_map_offset_t end);
384
385 static kern_return_t vm_map_can_reuse(
386 vm_map_t map,
387 vm_map_offset_t start,
388 vm_map_offset_t end);
389
390 #if MACH_ASSERT
391 static kern_return_t vm_map_pageout(
392 vm_map_t map,
393 vm_map_offset_t start,
394 vm_map_offset_t end);
395 #endif /* MACH_ASSERT */
396
397 kern_return_t vm_map_corpse_footprint_collect(
398 vm_map_t old_map,
399 vm_map_entry_t old_entry,
400 vm_map_t new_map);
401 void vm_map_corpse_footprint_collect_done(
402 vm_map_t new_map);
403 void vm_map_corpse_footprint_destroy(
404 vm_map_t map);
405 kern_return_t vm_map_corpse_footprint_query_page_info(
406 vm_map_t map,
407 vm_map_offset_t va,
408 int *disposition_p);
409 void vm_map_footprint_query_page_info(
410 vm_map_t map,
411 vm_map_entry_t map_entry,
412 vm_map_offset_t curr_s_offset,
413 int *disposition_p);
414
415 pid_t find_largest_process_vm_map_entries(void);
416
417 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
418 mach_exception_data_type_t subcode);
419
420 /*
421 * Macros to copy a vm_map_entry. We must be careful to correctly
422 * manage the wired page count. vm_map_entry_copy() creates a new
 423  * map entry referring to the same memory; the wired count in the new entry
424 * must be set to zero. vm_map_entry_copy_full() creates a new
425 * entry that is identical to the old entry. This preserves the
426 * wire count; it's used for map splitting and zone changing in
427 * vm_map_copyout.
428 */
429
430 static inline void
 431 vm_map_entry_copy_pmap_cs_assoc(
432 vm_map_t map __unused,
433 vm_map_entry_t new __unused,
434 vm_map_entry_t old __unused)
435 {
436 /* when pmap_cs is not enabled, assert as a sanity check */
437 assert(new->pmap_cs_associated == FALSE);
438 }
439
440 /*
441 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
442 * But for security reasons on some platforms, we don't want the
443 * new mapping to be "used for jit", so we reset the flag here.
444 */
445 static inline void
 446 vm_map_entry_copy_code_signing(
447 vm_map_t map,
448 vm_map_entry_t new,
449 vm_map_entry_t old __unused)
450 {
451 if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
452 assert(new->used_for_jit == old->used_for_jit);
453 } else {
454 new->used_for_jit = FALSE;
455 }
456 }
457
458 static inline void
 459 vm_map_entry_copy_full(
460 vm_map_entry_t new,
461 vm_map_entry_t old)
462 {
463 #if MAP_ENTRY_CREATION_DEBUG
464 btref_put(new->vme_creation_bt);
465 btref_retain(old->vme_creation_bt);
466 #endif
467 #if MAP_ENTRY_INSERTION_DEBUG
468 btref_put(new->vme_insertion_bt);
469 btref_retain(old->vme_insertion_bt);
470 #endif
471 *new = *old;
472 }
473
474 static inline void
 475 vm_map_entry_copy(
476 vm_map_t map,
477 vm_map_entry_t new,
478 vm_map_entry_t old)
479 {
480 vm_map_entry_copy_full(new, old);
481
482 new->is_shared = FALSE;
483 new->needs_wakeup = FALSE;
484 new->in_transition = FALSE;
485 new->wired_count = 0;
486 new->user_wired_count = 0;
487 new->permanent = FALSE;
488 vm_map_entry_copy_code_signing(map, new, old);
489 vm_map_entry_copy_pmap_cs_assoc(map, new, old);
490 if (new->iokit_acct) {
491 assertf(!new->use_pmap, "old %p new %p\n", old, new);
492 new->iokit_acct = FALSE;
493 new->use_pmap = TRUE;
494 }
495 new->vme_resilient_codesign = FALSE;
496 new->vme_resilient_media = FALSE;
497 new->vme_atomic = FALSE;
498 new->vme_no_copy_on_read = FALSE;
499 }
500
501 /*
502 * Normal lock_read_to_write() returns FALSE/0 on failure.
 503  * These functions evaluate to zero on success and a non-zero value on failure.
504 */
505 __attribute__((always_inline))
506 int
 507 vm_map_lock_read_to_write(vm_map_t map)
508 {
509 if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
510 DTRACE_VM(vm_map_lock_upgrade);
511 return 0;
512 }
513 return 1;
514 }
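/*
 * Illustrative usage sketch (assumption, not from the original source):
 * lck_rw_lock_shared_to_exclusive() drops the shared hold when the upgrade
 * fails, so a caller that sees a non-zero return must re-acquire the lock
 * and re-validate anything it looked up under the read lock, e.g.:
 *
 *	vm_map_lock_read(map);
 *	// ... lookup under the read lock ...
 *	if (vm_map_lock_read_to_write(map)) {
 *		// upgrade failed: the read lock was released
 *		vm_map_lock(map);
 *		// ... re-lookup / re-validate before modifying the map ...
 *	}
 *	// ... modify the map ...
 *	vm_map_unlock(map);
 */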
515
516 __attribute__((always_inline))
517 boolean_t
 518 vm_map_try_lock(vm_map_t map)
519 {
520 if (lck_rw_try_lock_exclusive(&(map)->lock)) {
521 DTRACE_VM(vm_map_lock_w);
522 return TRUE;
523 }
524 return FALSE;
525 }
526
527 __attribute__((always_inline))
528 boolean_t
 529 vm_map_try_lock_read(vm_map_t map)
530 {
531 if (lck_rw_try_lock_shared(&(map)->lock)) {
532 DTRACE_VM(vm_map_lock_r);
533 return TRUE;
534 }
535 return FALSE;
536 }
537
538 /*
539 * Routines to get the page size the caller should
540 * use while inspecting the target address space.
541 * Use the "_safely" variant if the caller is dealing with a user-provided
542 * array whose size depends on the page size, to avoid any overflow or
543 * underflow of a user-allocated buffer.
544 */
545 int
 546 vm_self_region_page_shift_safely(
547 vm_map_t target_map)
548 {
549 int effective_page_shift = 0;
550
551 if (PAGE_SIZE == (4096)) {
552 /* x86_64 and 4k watches: always use 4k */
553 return PAGE_SHIFT;
554 }
555 /* did caller provide an explicit page size for this thread to use? */
556 effective_page_shift = thread_self_region_page_shift();
557 if (effective_page_shift) {
558 /* use the explicitly-provided page size */
559 return effective_page_shift;
560 }
561 /* no explicit page size: use the caller's page size... */
562 effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
563 if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
564 /* page size match: safe to use */
565 return effective_page_shift;
566 }
567 /* page size mismatch */
568 return -1;
569 }
570 int
 571 vm_self_region_page_shift(
572 vm_map_t target_map)
573 {
574 int effective_page_shift;
575
576 effective_page_shift = vm_self_region_page_shift_safely(target_map);
577 if (effective_page_shift == -1) {
578 /* no safe value but OK to guess for caller */
579 effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
580 VM_MAP_PAGE_SHIFT(target_map));
581 }
582 return effective_page_shift;
583 }
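/*
 * Hypothetical caller sketch (added for illustration): when sizing a
 * user-visible buffer whose length depends on the target map's page size,
 * prefer the "_safely" variant and treat a mismatch as an error instead of
 * guessing:
 *
 *	int shift = vm_self_region_page_shift_safely(target_map);
 *	if (shift == -1) {
 *		return KERN_INVALID_ARGUMENT;	// page size mismatch
 *	}
 *	count = (unsigned int)(size >> shift);	// entries needed for "size" bytes
 */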
584
585
586 /*
587 * Decide if we want to allow processes to execute from their data or stack areas.
588 * override_nx() returns true if we do. Data/stack execution can be enabled independently
589 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
590 * or allow_stack_exec to enable data execution for that type of data area for that particular
591 * ABI (or both by or'ing the flags together). These are initialized in the architecture
592 * specific pmap files since the default behavior varies according to architecture. The
593 * main reason it varies is because of the need to provide binary compatibility with old
594 * applications that were written before these restrictions came into being. In the old
595 * days, an app could execute anything it could read, but this has slowly been tightened
596 * up over time. The default behavior is:
597 *
598 * 32-bit PPC apps may execute from both stack and data areas
 599  * 32-bit Intel apps may execute from data areas but not stack
600 * 64-bit PPC/Intel apps may not execute from either data or stack
601 *
602 * An application on any architecture may override these defaults by explicitly
603 * adding PROT_EXEC permission to the page in question with the mprotect(2)
604 * system call. This code here just determines what happens when an app tries to
605 * execute from a page that lacks execute permission.
606 *
607 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
608 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
609 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
610 * execution from data areas for a particular binary even if the arch normally permits it. As
611 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
612 * to support some complicated use cases, notably browsers with out-of-process plugins that
613 * are not all NX-safe.
614 */
615
616 extern int allow_data_exec, allow_stack_exec;
617
618 int
 619 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
620 {
621 int current_abi;
622
623 if (map->pmap == kernel_pmap) {
624 return FALSE;
625 }
626
627 /*
628 * Determine if the app is running in 32 or 64 bit mode.
629 */
630
631 if (vm_map_is_64bit(map)) {
632 current_abi = VM_ABI_64;
633 } else {
634 current_abi = VM_ABI_32;
635 }
636
637 /*
638 * Determine if we should allow the execution based on whether it's a
639 * stack or data area and the current architecture.
640 */
641
642 if (user_tag == VM_MEMORY_STACK) {
643 return allow_stack_exec & current_abi;
644 }
645
646 return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
647 }
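/*
 * Illustrative sketch of the caller side (assumption, not from this file):
 * the fault/lookup path consults override_nx() only when a mapping lacks
 * execute permission, roughly:
 *
 *	if (!(prot & VM_PROT_EXECUTE) &&
 *	    override_nx(map, VME_ALIAS(entry))) {
 *		prot |= VM_PROT_EXECUTE;	// legacy data/stack execution allowed
 *	}
 */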
648
649
650 /*
651 * Virtual memory maps provide for the mapping, protection,
652 * and sharing of virtual memory objects. In addition,
653 * this module provides for an efficient virtual copy of
654 * memory from one map to another.
655 *
656 * Synchronization is required prior to most operations.
657 *
658 * Maps consist of an ordered doubly-linked list of simple
659 * entries; a single hint is used to speed up lookups.
660 *
661 * Sharing maps have been deleted from this version of Mach.
662 * All shared objects are now mapped directly into the respective
663 * maps. This requires a change in the copy on write strategy;
664 * the asymmetric (delayed) strategy is used for shared temporary
665 * objects instead of the symmetric (shadow) strategy. All maps
666 * are now "top level" maps (either task map, kernel map or submap
667 * of the kernel map).
668 *
 669  * Since portions of maps are specified by start/end addresses,
670 * which may not align with existing map entries, all
671 * routines merely "clip" entries to these start/end values.
672 * [That is, an entry is split into two, bordering at a
673 * start or end value.] Note that these clippings may not
674 * always be necessary (as the two resulting entries are then
675 * not changed); however, the clipping is done for convenience.
676 * No attempt is currently made to "glue back together" two
677 * abutting entries.
678 *
679 * The symmetric (shadow) copy strategy implements virtual copy
680 * by copying VM object references from one map to
681 * another, and then marking both regions as copy-on-write.
682 * It is important to note that only one writeable reference
683 * to a VM object region exists in any map when this strategy
684 * is used -- this means that shadow object creation can be
 685  * delayed until a write operation occurs. The asymmetric (delayed)
686 * strategy allows multiple maps to have writeable references to
687 * the same region of a vm object, and hence cannot delay creating
688 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
689 * Copying of permanent objects is completely different; see
690 * vm_object_copy_strategically() in vm_object.c.
691 */
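/*
 * Clipping, conceptually (illustration added for clarity): splitting an
 * entry at an address A strictly inside [vme_start, vme_end) yields two
 * entries that together cover the same range:
 *
 *	before:  [vme_start ........................ vme_end)
 *	after:   [vme_start ...... A) [A ........... vme_end)
 *
 * Both halves keep referring to the same object at suitably adjusted
 * offsets, so the split itself does not change what is mapped.
 */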
692
693 static SECURITY_READ_ONLY_LATE(zone_t) vm_map_zone; /* zone for vm_map structures */
694 static SECURITY_READ_ONLY_LATE(zone_t) vm_map_copy_zone; /* zone for vm_map_copy structures */
695
696 SECURITY_READ_ONLY_LATE(zone_t) vm_map_entry_zone; /* zone for vm_map_entry structures */
697 SECURITY_READ_ONLY_LATE(zone_t) vm_map_holes_zone; /* zone for vm map holes (vm_map_links) structures */
698 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
699 SECURITY_READ_ONLY_LATE(zone_t) vm_map_entry_reserved_zone;
700 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
701
702 #define VM_MAP_ZONE_NAME "maps"
703 #define VM_MAP_ZFLAGS ( \
704 ZC_NOENCRYPT | \
705 ZC_NOGZALLOC | \
706 ZC_VM_LP64)
707
708 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
709 #define VM_MAP_ENTRY_ZFLAGS ( \
710 ZC_NOENCRYPT | \
711 ZC_CACHING | \
712 ZC_NOGZALLOC | \
713 ZC_KASAN_NOQUARANTINE | \
714 ZC_VM_LP64)
715
716 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
717 #define VM_MAP_ENTRY_RESERVED_ZONE_NAME "Reserved VM map entries"
718 #define VM_MAP_ENTRY_RESERVED_ZFLAGS ( \
719 ZC_NOENCRYPT | \
720 ZC_NOCACHING | \
721 ZC_NOGZALLOC | \
722 ZC_KASAN_NOQUARANTINE | \
723 ZC_VM)
724 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
725
726 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
727 #define VM_MAP_HOLES_ZFLAGS ( \
728 ZC_NOENCRYPT | \
729 ZC_CACHING | \
730 ZC_NOGZALLOC | \
731 ZC_KASAN_NOQUARANTINE | \
732 ZC_VM_LP64)
733
734 /*
735 * Asserts that a vm_map_copy object is coming from the
736 * vm_map_copy_zone to ensure that it isn't a fake constructed
737 * anywhere else.
738 */
739 void
 740 vm_map_copy_require(struct vm_map_copy *copy)
741 {
742 zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
743 }
744
745 /*
746 * vm_map_require:
747 *
748 * Ensures that the argument is memory allocated from the genuine
749 * vm map zone. (See zone_id_require_allow_foreign).
750 */
751 void
 752 vm_map_require(vm_map_t map)
753 {
754 zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
755 }
756
757 #define VM_MAP_EARLY_COUNT_MAX 16
758 static __startup_data vm_offset_t map_data;
759 static __startup_data vm_size_t map_data_size;
760 static __startup_data vm_offset_t kentry_data;
761 static __startup_data vm_size_t kentry_data_size;
762 static __startup_data vm_offset_t map_holes_data;
763 static __startup_data vm_size_t map_holes_data_size;
764 static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
765 static __startup_data uint32_t early_map_count;
766
767 #if XNU_TARGET_OS_OSX
768 #define NO_COALESCE_LIMIT ((1024 * 128) - 1)
769 #else /* XNU_TARGET_OS_OSX */
770 #define NO_COALESCE_LIMIT 0
771 #endif /* XNU_TARGET_OS_OSX */
772
773 /* Skip acquiring locks if we're in the midst of a kernel core dump */
774 unsigned int not_in_kdp = 1;
775
776 unsigned int vm_map_set_cache_attr_count = 0;
777
778 kern_return_t
 779 vm_map_set_cache_attr(
780 vm_map_t map,
781 vm_map_offset_t va)
782 {
783 vm_map_entry_t map_entry;
784 vm_object_t object;
785 kern_return_t kr = KERN_SUCCESS;
786
787 vm_map_lock_read(map);
788
789 if (!vm_map_lookup_entry(map, va, &map_entry) ||
790 map_entry->is_sub_map) {
791 /*
792 * that memory is not properly mapped
793 */
794 kr = KERN_INVALID_ARGUMENT;
795 goto done;
796 }
797 object = VME_OBJECT(map_entry);
798
799 if (object == VM_OBJECT_NULL) {
800 /*
801 * there should be a VM object here at this point
802 */
803 kr = KERN_INVALID_ARGUMENT;
804 goto done;
805 }
806 vm_object_lock(object);
807 object->set_cache_attr = TRUE;
808 vm_object_unlock(object);
809
810 vm_map_set_cache_attr_count++;
811 done:
812 vm_map_unlock_read(map);
813
814 return kr;
815 }
816
817
818 #if CONFIG_CODE_DECRYPTION
819 /*
820 * vm_map_apple_protected:
821 * This remaps the requested part of the object with an object backed by
822 * the decrypting pager.
823 * crypt_info contains entry points and session data for the crypt module.
824 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
825 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
826 */
827 kern_return_t
 828 vm_map_apple_protected(
829 vm_map_t map,
830 vm_map_offset_t start,
831 vm_map_offset_t end,
832 vm_object_offset_t crypto_backing_offset,
833 struct pager_crypt_info *crypt_info,
834 uint32_t cryptid)
835 {
836 boolean_t map_locked;
837 kern_return_t kr;
838 vm_map_entry_t map_entry;
839 struct vm_map_entry tmp_entry;
840 memory_object_t unprotected_mem_obj;
841 vm_object_t protected_object;
842 vm_map_offset_t map_addr;
843 vm_map_offset_t start_aligned, end_aligned;
844 vm_object_offset_t crypto_start, crypto_end;
845 int vm_flags;
846 vm_map_kernel_flags_t vmk_flags;
847 boolean_t cache_pager;
848
849 vm_flags = 0;
850 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
851
852 map_locked = FALSE;
853 unprotected_mem_obj = MEMORY_OBJECT_NULL;
854
855 start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
856 end_aligned = vm_map_round_page(end, PAGE_MASK_64);
857 start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
858 end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
859
860 #if __arm64__
861 /*
862 * "start" and "end" might be 4K-aligned but not 16K-aligned,
863 * so we might have to loop and establish up to 3 mappings:
864 *
865 * + the first 16K-page, which might overlap with the previous
866 * 4K-aligned mapping,
867 * + the center,
868 * + the last 16K-page, which might overlap with the next
869 * 4K-aligned mapping.
870 * Each of these mapping might be backed by a vnode pager (if
871 * properly page-aligned) or a "fourk_pager", itself backed by a
872 * vnode pager (if 4K-aligned but not page-aligned).
873 */
874 #endif /* __arm64__ */
875
876 map_addr = start_aligned;
877 for (map_addr = start_aligned;
878 map_addr < end;
879 map_addr = tmp_entry.vme_end) {
880 vm_map_lock(map);
881 map_locked = TRUE;
882
883 /* lookup the protected VM object */
884 if (!vm_map_lookup_entry(map,
885 map_addr,
886 &map_entry) ||
887 map_entry->is_sub_map ||
888 VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
889 /* that memory is not properly mapped */
890 kr = KERN_INVALID_ARGUMENT;
891 goto done;
892 }
893
 894  /* ensure mapped memory is mapped as executable, except
 895  * for the model decryption flow */
896 if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
897 !(map_entry->protection & VM_PROT_EXECUTE)) {
898 kr = KERN_INVALID_ARGUMENT;
899 goto done;
900 }
901
902 /* get the protected object to be decrypted */
903 protected_object = VME_OBJECT(map_entry);
904 if (protected_object == VM_OBJECT_NULL) {
905 /* there should be a VM object here at this point */
906 kr = KERN_INVALID_ARGUMENT;
907 goto done;
908 }
909 /* ensure protected object stays alive while map is unlocked */
910 vm_object_reference(protected_object);
911
912 /* limit the map entry to the area we want to cover */
913 vm_map_clip_start(map, map_entry, start_aligned);
914 vm_map_clip_end(map, map_entry, end_aligned);
915
916 tmp_entry = *map_entry;
917 map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
918 vm_map_unlock(map);
919 map_locked = FALSE;
920
921 /*
922 * This map entry might be only partially encrypted
923 * (if not fully "page-aligned").
924 */
925 crypto_start = 0;
926 crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
927 if (tmp_entry.vme_start < start) {
928 if (tmp_entry.vme_start != start_aligned) {
929 kr = KERN_INVALID_ADDRESS;
930 }
931 crypto_start += (start - tmp_entry.vme_start);
932 }
933 if (tmp_entry.vme_end > end) {
934 if (tmp_entry.vme_end != end_aligned) {
935 kr = KERN_INVALID_ADDRESS;
936 }
937 crypto_end -= (tmp_entry.vme_end - end);
938 }
939
940 /*
941 * This "extra backing offset" is needed to get the decryption
942 * routine to use the right key. It adjusts for the possibly
943 * relative offset of an interposed "4K" pager...
944 */
945 if (crypto_backing_offset == (vm_object_offset_t) -1) {
946 crypto_backing_offset = VME_OFFSET(&tmp_entry);
947 }
948
949 cache_pager = TRUE;
950 #if XNU_TARGET_OS_OSX
951 if (vm_map_is_alien(map)) {
952 cache_pager = FALSE;
953 }
954 #endif /* XNU_TARGET_OS_OSX */
955
956 /*
957 * Lookup (and create if necessary) the protected memory object
958 * matching that VM object.
959 * If successful, this also grabs a reference on the memory object,
960 * to guarantee that it doesn't go away before we get a chance to map
961 * it.
962 */
963 unprotected_mem_obj = apple_protect_pager_setup(
964 protected_object,
965 VME_OFFSET(&tmp_entry),
966 crypto_backing_offset,
967 crypt_info,
968 crypto_start,
969 crypto_end,
970 cache_pager);
971
972 /* release extra ref on protected object */
973 vm_object_deallocate(protected_object);
974
975 if (unprotected_mem_obj == NULL) {
976 kr = KERN_FAILURE;
977 goto done;
978 }
979
980 vm_flags = VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE;
981 /* can overwrite an immutable mapping */
982 vmk_flags.vmkf_overwrite_immutable = TRUE;
983 #if __arm64__
984 if (tmp_entry.used_for_jit &&
985 (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
986 PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
987 fourk_binary_compatibility_unsafe &&
988 fourk_binary_compatibility_allow_wx) {
989 printf("** FOURK_COMPAT [%d]: "
990 "allowing write+execute at 0x%llx\n",
991 proc_selfpid(), tmp_entry.vme_start);
992 vmk_flags.vmkf_map_jit = TRUE;
993 }
994 #endif /* __arm64__ */
995
996 /* map this memory object in place of the current one */
997 map_addr = tmp_entry.vme_start;
998 kr = vm_map_enter_mem_object(map,
999 &map_addr,
1000 (tmp_entry.vme_end -
1001 tmp_entry.vme_start),
1002 (mach_vm_offset_t) 0,
1003 vm_flags,
1004 vmk_flags,
1005 VM_KERN_MEMORY_NONE,
1006 (ipc_port_t)(uintptr_t) unprotected_mem_obj,
1007 0,
1008 TRUE,
1009 tmp_entry.protection,
1010 tmp_entry.max_protection,
1011 tmp_entry.inheritance);
1012 assertf(kr == KERN_SUCCESS,
1013 "kr = 0x%x\n", kr);
1014 assertf(map_addr == tmp_entry.vme_start,
1015 "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1016 (uint64_t)map_addr,
1017 (uint64_t) tmp_entry.vme_start,
1018 &tmp_entry);
1019
1020 #if VM_MAP_DEBUG_APPLE_PROTECT
1021 if (vm_map_debug_apple_protect) {
1022 printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1023 " backing:[object:%p,offset:0x%llx,"
1024 "crypto_backing_offset:0x%llx,"
1025 "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1026 map,
1027 (uint64_t) map_addr,
1028 (uint64_t) (map_addr + (tmp_entry.vme_end -
1029 tmp_entry.vme_start)),
1030 unprotected_mem_obj,
1031 protected_object,
1032 VME_OFFSET(&tmp_entry),
1033 crypto_backing_offset,
1034 crypto_start,
1035 crypto_end);
1036 }
1037 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1038
1039 /*
1040 * Release the reference obtained by
1041 * apple_protect_pager_setup().
1042 * The mapping (if it succeeded) is now holding a reference on
1043 * the memory object.
1044 */
1045 memory_object_deallocate(unprotected_mem_obj);
1046 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1047
1048 /* continue with next map entry */
1049 crypto_backing_offset += (tmp_entry.vme_end -
1050 tmp_entry.vme_start);
1051 crypto_backing_offset -= crypto_start;
1052 }
1053 kr = KERN_SUCCESS;
1054
1055 done:
1056 if (map_locked) {
1057 vm_map_unlock(map);
1058 }
1059 return kr;
1060 }
1061 #endif /* CONFIG_CODE_DECRYPTION */
1062
1063
1064 LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1065 LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1066 LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1067
1068 #if XNU_TARGET_OS_OSX
1069 int malloc_no_cow = 0;
1070 #else /* XNU_TARGET_OS_OSX */
1071 int malloc_no_cow = 1;
1072 #endif /* XNU_TARGET_OS_OSX */
1073 uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1074 #if DEBUG
1075 int vm_check_map_sanity = 0;
1076 #endif
1077
1078 /*
1079 * vm_map_init:
1080 *
1081 * Initialize the vm_map module. Must be called before
1082 * any other vm_map routines.
1083 *
1084 * Map and entry structures are allocated from zones -- we must
1085 * initialize those zones.
1086 *
1087 * There are three zones of interest:
1088 *
1089 * vm_map_zone: used to allocate maps.
1090 * vm_map_entry_zone: used to allocate map entries.
1091 *
1092 * LP32:
1093 * vm_map_entry_reserved_zone: fallback zone for kernel map entries
1094 *
1095 * The kernel allocates map entries from a special zone that is initially
1096 * "crammed" with memory. It would be difficult (perhaps impossible) for
 1097  * the kernel to allocate more memory to an entry zone when it becomes
1098 * empty since the very act of allocating memory implies the creation
1099 * of a new entry.
1100 */
1101 __startup_func
1102 void
 1103 vm_map_init(void)
1104 {
1105
1106 #if MACH_ASSERT
1107 PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1108 sizeof(debug4k_filter));
1109 #endif /* MACH_ASSERT */
1110
1111 vm_map_zone = zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1112 VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1113
1114 /*
1115 * Don't quarantine because we always need elements available
1116 * Disallow GC on this zone... to aid the GC.
1117 */
1118 vm_map_entry_zone = zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1119 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1120 ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1121 z->z_elems_rsv = (uint16_t)(32 *
1122 (ml_early_cpu_max_number() + 1));
1123 });
1124 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
1125 vm_map_entry_reserved_zone = zone_create(VM_MAP_ENTRY_RESERVED_ZONE_NAME,
1126 sizeof(struct vm_map_entry), VM_MAP_ENTRY_RESERVED_ZFLAGS);
1127 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
1128
1129 vm_map_holes_zone = zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1130 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1131 ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1132 z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_size(z));
1133 });
1134
1135 vm_map_copy_zone = zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1136 ZC_NOENCRYPT | ZC_CACHING, ZONE_ID_VM_MAP_COPY, NULL);
1137
1138 /*
1139 * Add the stolen memory to zones, adjust zone size and stolen counts.
1140 */
1141 zone_cram_early(vm_map_zone, map_data, map_data_size);
1142 zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1143 zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
 1144 printf("VM bootstrap: %d maps, %d entries and %d holes available\n",
1145 vm_map_zone->z_elems_free,
1146 vm_map_entry_zone->z_elems_free,
1147 vm_map_holes_zone->z_elems_free);
1148
1149 /*
1150 * Since these are covered by zones, remove them from stolen page accounting.
1151 */
1152 VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1153
1154 #if VM_MAP_DEBUG_APPLE_PROTECT
1155 PE_parse_boot_argn("vm_map_debug_apple_protect",
1156 &vm_map_debug_apple_protect,
1157 sizeof(vm_map_debug_apple_protect));
1158 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
 1159 #if VM_MAP_DEBUG_FOURK
1160 PE_parse_boot_argn("vm_map_debug_fourk",
1161 &vm_map_debug_fourk,
1162 sizeof(vm_map_debug_fourk));
1163 #endif /* VM_MAP_DEBUG_FOURK */
1164
1165 PE_parse_boot_argn("malloc_no_cow",
1166 &malloc_no_cow,
1167 sizeof(malloc_no_cow));
1168 if (malloc_no_cow) {
1169 vm_memory_malloc_no_cow_mask = 0ULL;
1170 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1171 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1172 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1173 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1174 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1175 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1176 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1177 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1178 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1179 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1180 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1181 PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1182 &vm_memory_malloc_no_cow_mask,
1183 sizeof(vm_memory_malloc_no_cow_mask));
1184 }
1185
1186 #if DEBUG
1187 PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1188 if (vm_check_map_sanity) {
1189 kprintf("VM sanity checking enabled\n");
1190 } else {
1191 kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1192 }
1193 #endif /* DEBUG */
1194
1195 #if DEVELOPMENT || DEBUG
1196 PE_parse_boot_argn("panic_on_unsigned_execute",
1197 &panic_on_unsigned_execute,
1198 sizeof(panic_on_unsigned_execute));
1199 PE_parse_boot_argn("panic_on_mlock_failure",
1200 &panic_on_mlock_failure,
1201 sizeof(panic_on_mlock_failure));
1202 #endif /* DEVELOPMENT || DEBUG */
1203 }
1204
1205 __startup_func
1206 static void
 1207 vm_map_steal_memory(void)
1208 {
1209 /*
 1210  * We need to reserve enough memory to support bootstrapping VM maps
1211 * and the zone subsystem.
1212 *
1213 * The VM Maps that need to function before zones can support them
1214 * are the ones registered with vm_map_will_allocate_early_map(),
1215 * which are:
1216 * - the kernel map
1217 * - the various submaps used by zones (pgz, meta, ...)
1218 *
1219 * We also need enough entries and holes to support them
1220 * until zone_metadata_init() is called, which is when
1221 * the zone allocator becomes capable of expanding dynamically.
1222 *
1223 * We need:
1224 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1225 * - To allow for 3-4 entries per map, but the kernel map
1226 * needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1227 * to describe the submaps, so double it (and make it 8x too)
1228 * - To allow for holes between entries,
1229 * hence needs the same budget as entries
1230 */
1231 map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
1232 sizeof(struct _vm_map), VM_MAP_ZFLAGS,
1233 VM_MAP_EARLY_COUNT_MAX);
1234
1235 kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
1236 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1237 8 * VM_MAP_EARLY_COUNT_MAX);
1238
1239 map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
1240 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1241 8 * VM_MAP_EARLY_COUNT_MAX);
1242
1243 /*
1244 * Steal a contiguous range of memory so that a simple range check
1245 * can validate early addresses being freed/crammed to these
1246 * zones
1247 */
1248 map_data = zone_early_mem_init(map_data_size + kentry_data_size +
1249 map_holes_data_size);
1250 kentry_data = map_data + map_data_size;
1251 map_holes_data = kentry_data + kentry_data_size;
1252 }
1253 STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1254
1255 __startup_func
1256 static void
 1257 vm_kernel_boostraped(void)
1258 {
1259 printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1260 vm_map_zone->z_elems_free,
1261 vm_map_entry_zone->z_elems_free,
1262 vm_map_holes_zone->z_elems_free);
1263 }
1264 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1265
1266 void
 1267 vm_map_disable_hole_optimization(vm_map_t map)
1268 {
1269 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1270
1271 if (map->holelistenabled) {
1272 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1273
1274 while (hole_entry != NULL) {
1275 next_hole_entry = hole_entry->vme_next;
1276
1277 hole_entry->vme_next = NULL;
1278 hole_entry->vme_prev = NULL;
1279 zfree(vm_map_holes_zone, hole_entry);
1280
1281 if (next_hole_entry == head_entry) {
1282 hole_entry = NULL;
1283 } else {
1284 hole_entry = next_hole_entry;
1285 }
1286 }
1287
1288 map->holes_list = NULL;
1289 map->holelistenabled = FALSE;
1290
1291 map->first_free = vm_map_first_entry(map);
1292 SAVE_HINT_HOLE_WRITE(map, NULL);
1293 }
1294 }
1295
1296 boolean_t
 1297 vm_kernel_map_is_kernel(vm_map_t map)
1298 {
1299 return map->pmap == kernel_pmap;
1300 }
1301
1302 /*
1303 * vm_map_create:
1304 *
1305 * Creates and returns a new empty VM map with
1306 * the given physical map structure, and having
1307 * the given lower and upper address bounds.
1308 */
1309
1310 extern vm_map_t vm_map_create_external(
1311 pmap_t pmap,
1312 vm_map_offset_t min_off,
1313 vm_map_offset_t max_off,
1314 boolean_t pageable);
1315
1316 vm_map_t
 1317 vm_map_create_external(
1318 pmap_t pmap,
1319 vm_map_offset_t min,
1320 vm_map_offset_t max,
1321 boolean_t pageable)
1322 {
1323 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1324
1325 if (pageable) {
1326 options |= VM_MAP_CREATE_PAGEABLE;
1327 }
1328 return vm_map_create_options(pmap, min, max, options);
1329 }
1330
1331 __startup_func
1332 void
 1333 vm_map_will_allocate_early_map(vm_map_t *owner)
1334 {
1335 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1336 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1337 }
1338
1339 early_map_owners[early_map_count++] = owner;
1340 }
1341
1342 __startup_func
1343 void
 1344 vm_map_relocate_early_maps(vm_offset_t delta)
1345 {
1346 for (uint32_t i = 0; i < early_map_count; i++) {
1347 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1348
1349 *early_map_owners[i] = (vm_map_t)(addr + delta);
1350 }
1351
1352 early_map_count = ~0u;
1353 }
1354
1355 /*
1356 * Routine: vm_map_relocate_early_elem
1357 *
1358 * Purpose:
1359 * Early zone elements are allocated in a temporary part
1360 * of the address space.
1361 *
1362 * Once the zones live in their final place, the early
1363 * VM maps, map entries and map holes need to be relocated.
1364 *
1365 * It involves rewriting any vm_map_t, vm_map_entry_t or
1366 * pointers to vm_map_links. Other pointers to other types
1367 * are fine.
1368 *
1369 * Fortunately, pointers to those types are self-contained
1370 * in those zones, _except_ for pointers to VM maps,
1371 * which are tracked during early boot and fixed with
1372 * vm_map_relocate_early_maps().
1373 */
1374 __startup_func
1375 void
 1376 vm_map_relocate_early_elem(
1377 uint32_t zone_id,
1378 vm_offset_t new_addr,
1379 vm_offset_t delta)
1380 {
1381 #define relocate(type_t, field) ({ \
1382 typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
1383 if (*__field) { \
1384 *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
1385 } \
1386 })
1387
1388 switch (zone_id) {
1389 case ZONE_ID_VM_MAP:
1390 case ZONE_ID_VM_MAP_ENTRY:
1391 case ZONE_ID_VM_MAP_HOLES:
1392 break;
1393
1394 default:
1395 panic("Unexpected zone ID %d", zone_id);
1396 }
1397
1398 if (zone_id == ZONE_ID_VM_MAP) {
1399 relocate(vm_map_t, hdr.links.prev);
1400 relocate(vm_map_t, hdr.links.next);
1401 ((vm_map_t)new_addr)->pmap = kernel_pmap;
1402 #ifdef VM_MAP_STORE_USE_RB
1403 relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1404 #endif /* VM_MAP_STORE_USE_RB */
1405 relocate(vm_map_t, hint);
1406 relocate(vm_map_t, hole_hint);
1407 relocate(vm_map_t, first_free);
1408 return;
1409 }
1410
1411 relocate(struct vm_map_links *, prev);
1412 relocate(struct vm_map_links *, next);
1413
1414 if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1415 #ifdef VM_MAP_STORE_USE_RB
1416 relocate(vm_map_entry_t, store.entry.rbe_left);
1417 relocate(vm_map_entry_t, store.entry.rbe_right);
1418 relocate(vm_map_entry_t, store.entry.rbe_parent);
1419 #endif /* VM_MAP_STORE_USE_RB */
1420 if (((vm_map_entry_t)new_addr)->is_sub_map) {
1421 /* no object to relocate because we haven't made any */
1422 ((vm_map_entry_t)new_addr)->vme_submap +=
1423 delta >> VME_SUBMAP_SHIFT;
1424 }
1425 #if MAP_ENTRY_CREATION_DEBUG
1426 relocate(vm_map_entry_t, vme_creation_maphdr);
1427 #endif /* MAP_ENTRY_CREATION_DEBUG */
1428 }
1429
1430 #undef relocate
1431 }
1432
1433 vm_map_t
 1434 vm_map_create_options(
1435 pmap_t pmap,
1436 vm_map_offset_t min,
1437 vm_map_offset_t max,
1438 vm_map_create_options_t options)
1439 {
1440 vm_map_t result;
1441
1442 #if DEBUG || DEVELOPMENT
1443 if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1444 if (early_map_count != ~0u && early_map_count !=
1445 zone_count_allocated(vm_map_zone) + 1) {
1446 panic("allocating %dth early map, owner not known",
1447 zone_count_allocated(vm_map_zone) + 1);
1448 }
1449 if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1450 panic("allocating %dth early map for non kernel pmap",
1451 early_map_count);
1452 }
1453 }
1454 #endif /* DEBUG || DEVELOPMENT */
1455
1456 result = zalloc_flags(vm_map_zone, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1457
1458 vm_map_first_entry(result) = vm_map_to_entry(result);
1459 vm_map_last_entry(result) = vm_map_to_entry(result);
1460
1461 vm_map_store_init(&result->hdr);
1462 result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
1463 vm_map_set_page_shift(result, PAGE_SHIFT);
1464
1465 result->size_limit = RLIM_INFINITY; /* default unlimited */
1466 result->data_limit = RLIM_INFINITY; /* default unlimited */
1467 result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */
1468 os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1469 result->pmap = pmap;
1470 result->min_offset = min;
1471 result->max_offset = max;
1472 result->first_free = vm_map_to_entry(result);
1473 result->hint = vm_map_to_entry(result);
1474
1475 if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1476 assert(pmap == kernel_pmap);
1477 result->never_faults = true;
1478 }
1479
1480 /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1481 if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1482 result->has_corpse_footprint = true;
1483 } else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1484 struct vm_map_links *hole_entry = zalloc(vm_map_holes_zone);
1485
1486 hole_entry->start = min;
1487 #if defined(__arm__) || defined(__arm64__)
1488 hole_entry->end = result->max_offset;
1489 #else
1490 hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1491 #endif
1492 result->holes_list = result->hole_hint = hole_entry;
1493 hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1494 result->holelistenabled = true;
1495 }
1496
1497 vm_map_lock_init(result);
1498
1499 return result;
1500 }
1501
1502 /*
1503 * Adjusts a submap that was made by kmem_suballoc()
1504 * before it knew where it would be mapped,
1505 * so that it has the right min/max offsets.
1506 *
1507 * We do not need to hold any locks:
1508 * only the caller knows about this map,
1509 * and it is not published on any entry yet.
1510 */
1511 static void
 1512 vm_map_adjust_offsets(
1513 vm_map_t map,
1514 vm_map_offset_t min_off,
1515 vm_map_offset_t max_off)
1516 {
1517 assert(map->min_offset == 0);
1518 assert(map->max_offset == max_off - min_off);
1519 assert(map->hdr.nentries == 0);
1520 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1521
1522 map->min_offset = min_off;
1523 map->max_offset = max_off;
1524
1525 if (map->holelistenabled) {
1526 struct vm_map_links *hole = map->holes_list;
1527
1528 hole->start = min_off;
1529 #if defined(__arm__) || defined(__arm64__)
1530 hole->end = max_off;
1531 #else
1532 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1533 #endif
1534 }
1535 }
1536
1537
1538 vm_map_size_t
 1539 vm_map_adjusted_size(vm_map_t map)
1540 {
1541 struct vm_reserved_region *regions = NULL;
1542 size_t num_regions = 0;
1543 mach_vm_size_t reserved_size = 0, map_size = 0;
1544
1545 if (map == NULL || (map->size == 0)) {
1546 return 0;
1547 }
1548
1549 map_size = map->size;
1550
1551 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1552 /*
1553 * No special reserved regions or not an exotic map or the task
1554 * is terminating and these special regions might have already
1555 * been deallocated.
1556 */
1557 return map_size;
1558 }
1559
 1560 num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1561 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1562
1563 while (num_regions) {
1564 reserved_size += regions[--num_regions].vmrr_size;
1565 }
1566
1567 /*
1568 * There are a few places where the map is being switched out due to
1569 * 'termination' without that bit being set (e.g. exec and corpse purging).
1570 * In those cases, we could have the map's regions being deallocated on
1571 * a core while some accounting process is trying to get the map's size.
1572 * So this assert can't be enabled till all those places are uniform in
1573 * their use of the 'map->terminated' bit.
1574 *
1575 * assert(map_size >= reserved_size);
1576 */
1577
1578 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1579 }
1580
1581 /*
1582 * vm_map_entry_create: [ internal use only ]
1583 *
1584 * Allocates a VM map entry for insertion in the
1585 * given map (or map copy). No fields are filled.
1586 *
1587 * The VM entry will be zero initialized, except for:
1588 * - behavior set to VM_BEHAVIOR_DEFAULT
1589 * - inheritance set to VM_INHERIT_DEFAULT
1590 */
1591 #define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)
1592
1593 #define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1594
1595 static vm_map_entry_t
 1596 _vm_map_entry_create(
1597 struct vm_map_header *map_header __unused)
1598 {
1599 vm_map_entry_t entry = NULL;
1600 zone_t zone = vm_map_entry_zone;
1601
1602 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
1603 zone_security_flags_t zsflags = zone_security_array[ZONE_ID_VM_MAP_ENTRY];
1604 if (map_header == &zone_submap(zsflags)->hdr) {
1605 /*
1606 * If we are trying to allocate an entry for the submap
1607 * of the vm_map_entry_zone, then this can cause recursive
1608 * locking of this map.
1609 *
1610 * Try to allocate _without blocking_ from this zone,
1611 * but if it is depleted, we need to go to the
1612 * vm_map_entry_reserved_zone which is in the zalloc
1613 * "VM" submap, which can grow without taking any map lock.
1614 *
1615 * Note: the vm_map_entry_zone has a rather high "reserve"
1616 * setup in order to minimize usage of the reserved one.
1617 */
1618 entry = zalloc_flags(vm_map_entry_zone, Z_NOWAIT | Z_ZERO);
1619 zone = vm_map_entry_reserved_zone;
1620 }
1621 #endif
1622 if (entry == NULL) {
1623 entry = zalloc_flags(zone, Z_WAITOK | Z_ZERO);
1624 }
1625
1626 /*
1627 * Help the compiler with what we know to be true,
1628 * so that the further bitfields inits have good codegen.
1629 *
1630 * See rdar://87041299
1631 */
1632 __builtin_assume(entry->vme_object_value == 0);
1633 #if __LP64__
1634 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1635 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1636 #else
1637 __builtin_assume(*(uint32_t *)(&entry->vme_object_value + 1) == 0);
1638 __builtin_assume(*(uint32_t *)(&entry->vme_object_value + 2) == 0);
1639 __builtin_assume(*(uint32_t *)(&entry->vme_object_value + 3) == 0);
1640 __builtin_assume(*(uint32_t *)(&entry->vme_object_value + 4) == 0);
1641 #endif
1642
1643 static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1644 "VME_ALIAS_MASK covers tags");
1645
1646 static_assert(VM_BEHAVIOR_DEFAULT == 0,
1647 "can skip zeroing of the behavior field");
1648 entry->inheritance = VM_INHERIT_DEFAULT;
1649
1650 vm_map_store_update((vm_map_t) NULL, entry, VM_MAP_ENTRY_CREATE);
1651
1652 #if MAP_ENTRY_CREATION_DEBUG
1653 entry->vme_creation_maphdr = map_header;
1654 entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1655 BTREF_GET_NOWAIT);
1656 #endif
1657 return entry;
1658 }
1659
1660 /*
1661 * vm_map_entry_dispose: [ internal use only ]
1662 *
1663 * Inverse of vm_map_entry_create.
1664 *
1665 * write map lock held so no need to
 1666  * do anything special to ensure correctness
1667 * of the stores
1668 */
1669 static void
 1670 vm_map_entry_dispose(
1671 vm_map_entry_t entry)
1672 {
1673 #if MAP_ENTRY_CREATION_DEBUG
1674 btref_put(entry->vme_creation_bt);
1675 #endif
1676 #if MAP_ENTRY_INSERTION_DEBUG
1677 btref_put(entry->vme_insertion_bt);
1678 #endif
1679 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
1680 if (zone_id_for_element(entry, sizeof(*entry)) != ZONE_ID_VM_MAP_ENTRY) {
1681 zfree(vm_map_entry_reserved_zone, entry);
1682 return;
1683 }
1684 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
1685 zfree(vm_map_entry_zone, entry);
1686 }
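/*
 * Illustrative pairing sketch (assumption, not from the original source):
 * an entry obtained from vm_map_entry_create() is either linked into the
 * map's store while the map lock is held, or handed back to
 * vm_map_entry_dispose() on an error path, e.g.:
 *
 *	vm_map_entry_t new_entry = vm_map_entry_create(map);
 *	// ... fill in vme_start / vme_end, object, protections ...
 *	if (some_error) {			// hypothetical condition
 *		vm_map_entry_dispose(new_entry);
 *	}
 */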
1687
1688 #define vm_map_copy_entry_dispose(copy_entry) \
1689 vm_map_entry_dispose(copy_entry)
1690
1691 static vm_map_entry_t
 1692 vm_map_zap_first_entry(
1693 vm_map_zap_t list)
1694 {
1695 return list->vmz_head;
1696 }
1697
1698 static vm_map_entry_t
 1699 vm_map_zap_last_entry(
1700 vm_map_zap_t list)
1701 {
1702 assert(vm_map_zap_first_entry(list));
1703 return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1704 }
1705
1706 static void
 1707 vm_map_zap_append(
1708 vm_map_zap_t list,
1709 vm_map_entry_t entry)
1710 {
1711 entry->vme_next = VM_MAP_ENTRY_NULL;
1712 *list->vmz_tail = entry;
1713 list->vmz_tail = &entry->vme_next;
1714 }
1715
1716 static vm_map_entry_t
 1717 vm_map_zap_pop(
1718 vm_map_zap_t list)
1719 {
1720 vm_map_entry_t head = list->vmz_head;
1721
1722 if (head != VM_MAP_ENTRY_NULL &&
1723 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1724 list->vmz_tail = &list->vmz_head;
1725 }
1726
1727 return head;
1728 }
1729
1730 static void
 1731 vm_map_zap_dispose(
1732 vm_map_zap_t list)
1733 {
1734 vm_map_entry_t entry;
1735
1736 while ((entry = vm_map_zap_pop(list))) {
1737 if (entry->is_sub_map) {
1738 vm_map_deallocate(VME_SUBMAP(entry));
1739 } else {
1740 vm_object_deallocate(VME_OBJECT(entry));
1741 }
1742
1743 vm_map_entry_dispose(entry);
1744 }
1745 }
1746
1747 #if MACH_ASSERT
1748 static boolean_t first_free_check = FALSE;
1749 boolean_t
 1750 first_free_is_valid(
1751 vm_map_t map)
1752 {
1753 if (!first_free_check) {
1754 return TRUE;
1755 }
1756
1757 return first_free_is_valid_store( map );
1758 }
1759 #endif /* MACH_ASSERT */
1760
1761
1762 #define vm_map_copy_entry_link(copy, after_where, entry) \
1763 _vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1764
1765 #define vm_map_copy_entry_unlink(copy, entry) \
1766 _vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry))
1767
1768 /*
1769 * vm_map_destroy:
1770 *
1771 * Actually destroy a map.
1772 */
1773 void
1774 vm_map_destroy(
1775 vm_map_t map)
1776 {
1777 /* final cleanup: this is not allowed to fail */
1778 vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1779
1780 VM_MAP_ZAP_DECLARE(zap);
1781
1782 vm_map_lock(map);
1783
1784 map->terminated = true;
1785 /* clean up regular map entries */
1786 (void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
1787 KMEM_GUARD_NONE, &zap);
1788 /* clean up leftover special mappings (commpage, GPU carveout, etc...) */
1789 #if !defined(__arm__)
1790 (void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
1791 KMEM_GUARD_NONE, &zap);
1792 #endif /* !__arm__ */
1793
1794 vm_map_disable_hole_optimization(map);
1795 vm_map_corpse_footprint_destroy(map);
1796
1797 vm_map_unlock(map);
1798
1799 vm_map_zap_dispose(&zap);
1800
1801 assert(map->hdr.nentries == 0);
1802
1803 if (map->pmap) {
1804 pmap_destroy(map->pmap);
1805 }
1806
1807 #if LOCKS_INDIRECT_ALLOW
1808 if (vm_map_lck_attr.lck_attr_val & LCK_ATTR_DEBUG) {
1809 /*
1810 * If lock debugging is enabled the mutexes get tagged as LCK_MTX_TAG_INDIRECT.
1811 * And this is regardless of whether the lck_mtx_ext_t is embedded in the
1812 * structure or kalloc'ed via lck_mtx_init.
1813 * An example is s_lock_ext within struct _vm_map.
1814 *
1815 * A lck_mtx_destroy on such a mutex will attempt a kfree and panic. We
1816 * can add another tag to detect embedded vs alloc'ed indirect external
1817 * mutexes but that'll be additional checks in the lock path and require
1818 * updating dependencies for the old vs new tag.
1819 *
1820 * Since the kfree() is for LCK_MTX_TAG_INDIRECT mutexes and that tag is applied
1821 * just when lock debugging is ON, we choose to forego explicitly destroying
1822 * the vm_map mutex and rw lock. Because the vm_map_lck_grp is
1823 * permanent, this has no serious side-effect.
1824 */
1825 } else
1826 #endif /* LOCKS_INDIRECT_ALLOW */
1827 {
1828 lck_rw_destroy(&(map)->lock, &vm_map_lck_grp);
1829 }
1830
1831 zfree(vm_map_zone, map);
1832 }
1833
1834 /*
1835 * Returns pid of the task with the largest number of VM map entries.
1836 * Used in the zone-map-exhaustion jetsam path.
1837 */
1838 pid_t
1839 find_largest_process_vm_map_entries(void)
1840 {
1841 pid_t victim_pid = -1;
1842 int max_vm_map_entries = 0;
1843 task_t task = TASK_NULL;
1844 queue_head_t *task_list = &tasks;
1845
1846 lck_mtx_lock(&tasks_threads_lock);
1847 queue_iterate(task_list, task, task_t, tasks) {
1848 if (task == kernel_task || !task->active) {
1849 continue;
1850 }
1851
1852 vm_map_t task_map = task->map;
1853 if (task_map != VM_MAP_NULL) {
1854 int task_vm_map_entries = task_map->hdr.nentries;
1855 if (task_vm_map_entries > max_vm_map_entries) {
1856 max_vm_map_entries = task_vm_map_entries;
1857 victim_pid = pid_from_task(task);
1858 }
1859 }
1860 }
1861 lck_mtx_unlock(&tasks_threads_lock);
1862
1863 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1864 return victim_pid;
1865 }
1866
1867
1868 /*
1869 * vm_map_lookup_entry: [ internal use only ]
1870 *
1871 * Calls into the vm map store layer to find the map
1872 * entry containing (or immediately preceding) the
1873 * specified address in the given map; the entry is returned
1874 * in the "entry" parameter. The boolean
1875 * result indicates whether the address is
1876 * actually contained in the map.
1877 */
1878 boolean_t
1879 vm_map_lookup_entry(
1880 vm_map_t map,
1881 vm_map_offset_t address,
1882 vm_map_entry_t *entry) /* OUT */
1883 {
1884 #if CONFIG_KERNEL_TBI
1885 if (VM_KERNEL_ADDRESS(address)) {
1886 address = VM_KERNEL_STRIP_UPTR(address);
1887 }
1888 #endif /* CONFIG_KERNEL_TBI */
1889 #if CONFIG_PROB_GZALLOC
1890 if (map->pmap == kernel_pmap) {
1891 assertf(!pgz_owned(address),
1892 "it is the responsibility of callers to unguard PGZ addresses");
1893 }
1894 #endif /* CONFIG_PROB_GZALLOC */
1895 return vm_map_store_lookup_entry( map, address, entry );
1896 }
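
/*
 * Illustrative sketch (not part of the original source): how a caller
 * typically consumes vm_map_lookup_entry() above. The map lock (here a
 * read lock) must be held across the lookup and any use of the returned
 * entry. The helper name is hypothetical.
 */
__unused static boolean_t
vm_map_lookup_entry_usage_sketch(
	vm_map_t        map,
	vm_map_offset_t addr)
{
	vm_map_entry_t  entry;
	boolean_t       mapped;

	vm_map_lock_read(map);
	if (vm_map_lookup_entry(map, addr, &entry)) {
		/* "addr" lies within [entry->vme_start, entry->vme_end) */
		mapped = TRUE;
	} else {
		/* "entry" precedes the hole that contains "addr" */
		mapped = FALSE;
	}
	vm_map_unlock_read(map);

	return mapped;
}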
1897
1898 boolean_t
1899 vm_map_lookup_entry_or_next(
1900 vm_map_t map,
1901 vm_map_offset_t address,
1902 vm_map_entry_t *entry) /* OUT */
1903 {
1904 if (vm_map_lookup_entry(map, address, entry)) {
1905 return true;
1906 }
1907
1908 *entry = (*entry)->vme_next;
1909 return false;
1910 }
1911
1912 #if CONFIG_PROB_GZALLOC
1913 boolean_t
1914 vm_map_lookup_entry_allow_pgz(
1915 vm_map_t map,
1916 vm_map_offset_t address,
1917 vm_map_entry_t *entry) /* OUT */
1918 {
1919 #if CONFIG_KERNEL_TBI
1920 if (VM_KERNEL_ADDRESS(address)) {
1921 address = VM_KERNEL_STRIP_UPTR(address);
1922 }
1923 #endif /* CONFIG_KERNEL_TBI */
1924 return vm_map_store_lookup_entry( map, address, entry );
1925 }
1926 #endif /* CONFIG_PROB_GZALLOC */
1927
1928 #if !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
1929 /*
1930 * Routine: vm_map_adjust_direction
1931 * Purpose:
1932 * Overrides the allocation direction to reduce fragmentation: small
1933 * allocations are placed from the end of the range and large ones from the front.
1934 */
1935 static void
1936 vm_map_adjust_direction(
1937 vm_map_kernel_flags_t *vmk_flags,
1938 vm_map_size_t size)
1939 {
1940 if (size < KMEM_SMALLMAP_THRESHOLD) {
1941 vmk_flags->vmkf_last_free = true;
1942 } else {
1943 vmk_flags->vmkf_last_free = false;
1944 }
1945 }
1946 #endif /* !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
1947
1948 /*
1949 * Routine: vm_map_get_range
1950 * Purpose:
1951 * Adjust bounds based on security policy.
1952 */
1953 static struct kmem_range
1954 vm_map_get_range(
1955 vm_map_t map,
1956 vm_map_offset_t *address,
1957 vm_map_kernel_flags_t *vmk_flags,
1958 vm_map_size_t size)
1959 {
1960 struct kmem_range effective_range = {};
1961 if (map == kernel_map) {
1962 kmem_range_id_t range_id = vmk_flags->vmkf_range_id;
1963 effective_range = kmem_ranges[range_id];
1964
1965 if (startup_phase >= STARTUP_SUB_KMEM) {
1966 /*
1967 * Hint provided by caller is zeroed as the range is restricted to a
1968 * subset of the entire kernel_map VA, which could put the hint outside
1969 * the range, causing vm_map_store_find_space to fail.
1970 */
1971 *address = 0ull;
1972 assert(range_id != 0);
1973 #if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
1974 /*
1975 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
1976 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
1977 * use the entire range. Two small allocations from different fronts
1978 * (left and right) can only meet when memory in that range is
1979 * entirely exhausted.
1980 */
1981 if (size >= KMEM_SMALLMAP_THRESHOLD) {
1982 effective_range = kmem_large_ranges[range_id];
1983 }
1984 #else /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
1985 vm_map_adjust_direction(vmk_flags, size);
1986 #endif /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
1987 }
1988 } else {
1989 /*
1990 * If the minimum is 0, bump it up by PAGE_SIZE. We want to limit
1991 * allocations of PAGEZERO to explicit requests, since its
1992 * normal use is to catch dereferences of NULL. Many
1993 * applications also treat pointers with a value of 0 as
1994 * special, and suddenly having address 0 contain usable
1995 * memory would tend to confuse those applications.
1996 */
1997 effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
1998 effective_range.max_address = map->max_offset;
1999 }
2000
2001 return effective_range;
2002 }
2003
2004 /*
2005 * Routine: vm_map_locate_space
2006 * Purpose:
2007 * Finds a range in the specified virtual address map,
2008 * returning the start of that range,
2009 * as well as the entry right before it.
2010 */
2011 kern_return_t
2012 vm_map_locate_space(
2013 vm_map_t map,
2014 vm_map_size_t size,
2015 vm_map_offset_t mask,
2016 vm_map_kernel_flags_t vmk_flags,
2017 vm_map_offset_t *start_inout,
2018 vm_map_entry_t *entry_out)
2019 {
2020 struct kmem_range effective_range = {};
2021 vm_map_size_t guard_offset;
2022 vm_map_offset_t hint, limit;
2023 vm_map_entry_t entry;
2024
2025 /*
2026 * Only supported by vm_map_enter() with a fixed address.
2027 */
2028 assert(!vmk_flags.vmkf_beyond_max);
2029
2030 if (__improbable(map->wait_for_space)) {
2031 /*
2032 * support for "wait_for_space" is minimal;
2033 * its only consumer is the ipc_kernel_copy_map.
2034 */
2035 assert(!map->holelistenabled &&
2036 !vmk_flags.vmkf_last_free &&
2037 !vmk_flags.vmkf_keep_map_locked &&
2038 !vmk_flags.vmkf_map_jit &&
2039 !vmk_flags.vmkf_random_address &&
2040 *start_inout <= map->min_offset);
2041 } else if (vmk_flags.vmkf_last_free) {
2042 assert(!vmk_flags.vmkf_map_jit &&
2043 !vmk_flags.vmkf_random_address);
2044 }
2045
2046 if (vmk_flags.vmkf_guard_before) {
2047 guard_offset = VM_MAP_PAGE_SIZE(map);
2048 assert(size > guard_offset);
2049 size -= guard_offset;
2050 } else {
2051 assert(size != 0);
2052 guard_offset = 0;
2053 }
2054
2055 effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size);
2056 #if XNU_TARGET_OS_OSX
2057 if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2058 assert(map != kernel_map);
2059 effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2060 }
2061 #endif /* XNU_TARGET_OS_OSX */
2062
2063 again:
2064 if (vmk_flags.vmkf_last_free) {
2065 hint = *start_inout;
2066
2067 if (hint == 0 || hint > effective_range.max_address) {
2068 hint = effective_range.max_address;
2069 }
2070 if (hint <= effective_range.min_address) {
2071 return KERN_NO_SPACE;
2072 }
2073 limit = effective_range.min_address;
2074 } else {
2075 hint = *start_inout;
2076
2077 if (vmk_flags.vmkf_map_jit) {
2078 if (map->jit_entry_exists &&
2079 !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2080 return KERN_INVALID_ARGUMENT;
2081 }
2082 if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2083 vmk_flags.vmkf_random_address = true;
2084 }
2085 }
2086
2087 if (vmk_flags.vmkf_random_address) {
2088 kern_return_t kr;
2089
2090 kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2091 if (kr != KERN_SUCCESS) {
2092 return kr;
2093 }
2094 }
2095 #if XNU_TARGET_OS_OSX
2096 else if ((hint == 0 || hint == vm_map_min(map)) &&
2097 !map->disable_vmentry_reuse &&
2098 map->vmmap_high_start != 0) {
2099 hint = map->vmmap_high_start;
2100 }
2101 #endif /* XNU_TARGET_OS_OSX */
2102
2103 if (hint < effective_range.min_address) {
2104 hint = effective_range.min_address;
2105 }
2106 if (effective_range.max_address <= hint) {
2107 return KERN_NO_SPACE;
2108 }
2109
2110 limit = effective_range.max_address;
2111 }
2112 entry = vm_map_store_find_space(map,
2113 hint, limit, vmk_flags.vmkf_last_free,
2114 guard_offset, size, mask,
2115 start_inout);
2116
2117 if (__improbable(entry == NULL)) {
2118 if (map->wait_for_space &&
2119 guard_offset + size <=
2120 effective_range.max_address - effective_range.min_address) {
2121 assert_wait((event_t)map, THREAD_ABORTSAFE);
2122 vm_map_unlock(map);
2123 thread_block(THREAD_CONTINUE_NULL);
2124 vm_map_lock(map);
2125 goto again;
2126 }
2127 return KERN_NO_SPACE;
2128 }
2129
2130 if (entry_out) {
2131 *entry_out = entry;
2132 }
2133 return KERN_SUCCESS;
2134 }
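
/*
 * Illustrative sketch (not part of the original source): vm_map_locate_space()
 * is called with the map write-locked, as vm_map_find_space() and
 * vm_map_enter() do below. For the kernel_map, callers are expected to set a
 * kmem range id in vmk_flags first (see vm_map_enter()). The helper name and
 * the one-page size are hypothetical.
 */
__unused static kern_return_t
vm_map_locate_space_usage_sketch(
	vm_map_t         map,
	vm_map_offset_t *addr_inout)
{
	vm_map_entry_t prev_entry;
	kern_return_t  kr;

	vm_map_lock(map);
	kr = vm_map_locate_space(map, PAGE_SIZE, 0 /* mask */,
	    VM_MAP_KERNEL_FLAGS_NONE, addr_inout, &prev_entry);
	/*
	 * On KERN_SUCCESS, *addr_inout is the start of a free range large
	 * enough for the request and prev_entry is the entry right before
	 * it; a real caller would insert its new entry here before unlocking.
	 */
	vm_map_unlock(map);

	return kr;
}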
2135
2136
2137 /*
2138 * Routine: vm_map_find_space
2139 * Purpose:
2140 * Allocate a range in the specified virtual address map,
2141 * returning the entry allocated for that range.
2142 * Used by kmem_alloc, etc.
2143 *
2144 * The map must NOT be locked. It will be returned locked
2145 * on KERN_SUCCESS, unlocked on failure.
2146 *
2147 * If an entry is allocated, the object/offset fields
2148 * are initialized to zero.
2149 */
2150 kern_return_t
2151 vm_map_find_space(
2152 vm_map_t map,
2153 vm_map_offset_t hint_address,
2154 vm_map_size_t size,
2155 vm_map_offset_t mask,
2156 vm_map_kernel_flags_t vmk_flags,
2157 vm_map_entry_t *o_entry) /* OUT */
2158 {
2159 vm_map_entry_t new_entry, entry;
2160 kern_return_t kr;
2161
2162 if (size == 0) {
2163 return KERN_INVALID_ARGUMENT;
2164 }
2165
2166 new_entry = vm_map_entry_create(map);
2167 new_entry->use_pmap = true;
2168 new_entry->protection = VM_PROT_DEFAULT;
2169 new_entry->max_protection = VM_PROT_ALL;
2170
2171 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2172 new_entry->map_aligned = true;
2173 }
2174 if (vmk_flags.vmkf_permanent) {
2175 new_entry->permanent = true;
2176 }
2177
2178 vm_map_lock(map);
2179
2180 kr = vm_map_locate_space(map, size, mask, vmk_flags,
2181 &hint_address, &entry);
2182 if (kr != KERN_SUCCESS) {
2183 vm_map_unlock(map);
2184 vm_map_entry_dispose(new_entry);
2185 return kr;
2186 }
2187 new_entry->vme_start = hint_address;
2188 new_entry->vme_end = hint_address + size;
2189
2190 /*
2191 * At this point,
2192 *
2193 * - new_entry's "vme_start" and "vme_end" should define
2194 * the endpoints of the available new range,
2195 *
2196 * - and "entry" should refer to the region before
2197 * the new range,
2198 *
2199 * - and the map should still be locked.
2200 */
2201
2202 assert(page_aligned(new_entry->vme_start));
2203 assert(page_aligned(new_entry->vme_end));
2204 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2205 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2206
2207 /*
2208 * Insert the new entry into the list
2209 */
2210
2211 vm_map_store_entry_link(map, entry, new_entry, VM_MAP_KERNEL_FLAGS_NONE);
2212 map->size += size;
2213
2214 /*
2215 * Update the lookup hint
2216 */
2217 SAVE_HINT_MAP_WRITE(map, new_entry);
2218
2219 *o_entry = new_entry;
2220 return KERN_SUCCESS;
2221 }
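
/*
 * Illustrative sketch (not part of the original source): the calling
 * convention of vm_map_find_space() above, as used by kmem_alloc-style
 * callers. The map is entered unlocked and, on KERN_SUCCESS, comes back
 * locked with a fresh entry whose object/offset fields are zeroed. For the
 * kernel_map, a kmem range id is expected in vmk_flags. The helper name is
 * hypothetical.
 */
__unused static kern_return_t
vm_map_find_space_usage_sketch(
	vm_map_t         map,
	vm_map_size_t    size,
	vm_map_offset_t *addr_out)
{
	vm_map_entry_t entry;
	kern_return_t  kr;

	kr = vm_map_find_space(map, 0 /* hint */, size, 0 /* mask */,
	    VM_MAP_KERNEL_FLAGS_NONE, &entry);
	if (kr == KERN_SUCCESS) {
		*addr_out = entry->vme_start;
		/* a real caller would set up the entry's object here */
		vm_map_unlock(map);
	}
	return kr;
}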
2222
2223 int vm_map_pmap_enter_print = FALSE;
2224 int vm_map_pmap_enter_enable = FALSE;
2225
2226 /*
2227 * Routine: vm_map_pmap_enter [internal only]
2228 *
2229 * Description:
2230 * Force pages from the specified object to be entered into
2231 * the pmap at the specified address if they are present.
2232 * As soon as a page is not found in the object, the scan ends.
2233 *
2234 * Returns:
2235 * Nothing.
2236 *
2237 * In/out conditions:
2238 * The source map should not be locked on entry.
2239 */
2240 __unused static void
2241 vm_map_pmap_enter(
2242 vm_map_t map,
2243 vm_map_offset_t addr,
2244 vm_map_offset_t end_addr,
2245 vm_object_t object,
2246 vm_object_offset_t offset,
2247 vm_prot_t protection)
2248 {
2249 int type_of_fault;
2250 kern_return_t kr;
2251 struct vm_object_fault_info fault_info = {};
2252
2253 if (map->pmap == 0) {
2254 return;
2255 }
2256
2257 assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2258
2259 while (addr < end_addr) {
2260 vm_page_t m;
2261
2262
2263 /*
2264 * TODO:
2265 * From vm_map_enter(), we come into this function without the map
2266 * lock held or the object lock held.
2267 * We haven't taken a reference on the object either.
2268 * We should do a proper lookup on the map to make sure
2269 * that things are sane before we go locking objects that
2270 * could have been deallocated from under us.
2271 */
2272
2273 vm_object_lock(object);
2274
2275 m = vm_page_lookup(object, offset);
2276
2277 if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
2278 (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) {
2279 vm_object_unlock(object);
2280 return;
2281 }
2282
2283 if (vm_map_pmap_enter_print) {
2284 printf("vm_map_pmap_enter:");
2285 printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2286 map, (unsigned long long)addr, object, (unsigned long long)offset);
2287 }
2288 type_of_fault = DBG_CACHE_HIT_FAULT;
2289 kr = vm_fault_enter(m, map->pmap,
2290 addr,
2291 PAGE_SIZE, 0,
2292 protection, protection,
2293 VM_PAGE_WIRED(m),
2294 FALSE, /* change_wiring */
2295 VM_KERN_MEMORY_NONE, /* tag - not wiring */
2296 &fault_info,
2297 NULL, /* need_retry */
2298 &type_of_fault);
2299
2300 vm_object_unlock(object);
2301
2302 offset += PAGE_SIZE_64;
2303 addr += PAGE_SIZE;
2304 }
2305 }
2306
2307 #define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
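/*
 * Routine:	vm_map_random_address_for_size
 * Purpose:
 *	Pick a page-aligned address at random within the map's effective
 *	range (see vm_map_get_range()) such that the requested size fits
 *	in the hole starting at that address, returning it in *address.
 *	Retries up to MAX_TRIES_TO_GET_RANDOM_ADDRESS times before giving
 *	up with KERN_NO_SPACE.
 */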
2308 kern_return_t
2309 vm_map_random_address_for_size(
2310 vm_map_t map,
2311 vm_map_offset_t *address,
2312 vm_map_size_t size,
2313 vm_map_kernel_flags_t vmk_flags)
2314 {
2315 kern_return_t kr = KERN_SUCCESS;
2316 int tries = 0;
2317 vm_map_offset_t random_addr = 0;
2318 vm_map_offset_t hole_end;
2319
2320 vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL;
2321 vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL;
2322 vm_map_size_t vm_hole_size = 0;
2323 vm_map_size_t addr_space_size;
2324 struct kmem_range effective_range = vm_map_get_range(map, address, &vmk_flags, size);
2325
2326 addr_space_size = effective_range.max_address - effective_range.min_address;
2327 if (size >= addr_space_size) {
2328 return KERN_NO_SPACE;
2329 }
2330 addr_space_size -= size;
2331
2332 assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2333
2334 while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2335 if (startup_phase < STARTUP_SUB_ZALLOC) {
2336 random_addr = (vm_map_offset_t)early_random();
2337 } else {
2338 random_addr = (vm_map_offset_t)random();
2339 }
2340 random_addr <<= VM_MAP_PAGE_SHIFT(map);
2341 random_addr = vm_map_trunc_page(
2342 effective_range.min_address + (random_addr % addr_space_size),
2343 VM_MAP_PAGE_MASK(map));
2344
2345 #if CONFIG_PROB_GZALLOC
2346 if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2347 continue;
2348 }
2349 #endif /* CONFIG_PROB_GZALLOC */
2350
2351 if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2352 if (prev_entry == vm_map_to_entry(map)) {
2353 next_entry = vm_map_first_entry(map);
2354 } else {
2355 next_entry = prev_entry->vme_next;
2356 }
2357 if (next_entry == vm_map_to_entry(map)) {
2358 hole_end = vm_map_max(map);
2359 } else {
2360 hole_end = next_entry->vme_start;
2361 }
2362 vm_hole_size = hole_end - random_addr;
2363 if (vm_hole_size >= size) {
2364 *address = random_addr;
2365 break;
2366 }
2367 }
2368 tries++;
2369 }
2370
2371 if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2372 kr = KERN_NO_SPACE;
2373 }
2374 return kr;
2375 }
2376
2377 static boolean_t
2378 vm_memory_malloc_no_cow(
2379 int alias)
2380 {
2381 uint64_t alias_mask;
2382
2383 if (alias > 63) {
2384 return FALSE;
2385 }
2386
2387 alias_mask = 1ULL << alias;
2388 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2389 return TRUE;
2390 }
2391 return FALSE;
2392 }
2393
2394 uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2395 uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2396 /*
2397 * Routine: vm_map_enter
2398 *
2399 * Description:
2400 * Allocate a range in the specified virtual address map.
2401 * The resulting range will refer to memory defined by
2402 * the given memory object and offset into that object.
2403 *
2404 * Arguments are as defined in the vm_map call.
2405 */
2406 static unsigned int vm_map_enter_restore_successes = 0;
2407 static unsigned int vm_map_enter_restore_failures = 0;
2408 kern_return_t
2409 vm_map_enter(
2410 vm_map_t map,
2411 vm_map_offset_t *address, /* IN/OUT */
2412 vm_map_size_t size,
2413 vm_map_offset_t mask,
2414 int flags,
2415 vm_map_kernel_flags_t vmk_flags,
2416 vm_tag_t alias,
2417 vm_object_t object,
2418 vm_object_offset_t offset,
2419 boolean_t needs_copy,
2420 vm_prot_t cur_protection,
2421 vm_prot_t max_protection,
2422 vm_inherit_t inheritance)
2423 {
2424 vm_map_entry_t entry, new_entry;
2425 vm_map_offset_t start, tmp_start, tmp_offset;
2426 vm_map_offset_t end, tmp_end;
2427 vm_map_offset_t tmp2_start, tmp2_end;
2428 vm_map_offset_t step;
2429 kern_return_t result = KERN_SUCCESS;
2430 boolean_t map_locked = FALSE;
2431 boolean_t pmap_empty = TRUE;
2432 boolean_t new_mapping_established = FALSE;
2433 boolean_t keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2434 boolean_t anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0);
2435 boolean_t purgable = ((flags & VM_FLAGS_PURGABLE) != 0);
2436 boolean_t overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
2437 boolean_t no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
2438 const boolean_t is_submap = vmk_flags.vmkf_submap;
2439 boolean_t permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
2440 const boolean_t no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2441 const boolean_t entry_for_jit = vmk_flags.vmkf_map_jit;
2442 boolean_t iokit_acct = vmk_flags.vmkf_iokit_acct;
2443 boolean_t resilient_codesign = ((flags & VM_FLAGS_RESILIENT_CODESIGN) != 0);
2444 boolean_t resilient_media = ((flags & VM_FLAGS_RESILIENT_MEDIA) != 0);
2445 unsigned int superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
2446 vm_tag_t user_alias;
2447 kern_return_t kr;
2448 boolean_t clear_map_aligned = FALSE;
2449 vm_map_size_t chunk_size = 0;
2450 vm_object_t caller_object;
2451 VM_MAP_ZAP_DECLARE(zap_old_list);
2452 VM_MAP_ZAP_DECLARE(zap_new_list);
2453
2454 caller_object = object;
2455
2456 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2457
2458 if (flags & VM_FLAGS_4GB_CHUNK) {
2459 #if defined(__LP64__)
2460 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2461 #else /* __LP64__ */
2462 chunk_size = ANON_CHUNK_SIZE;
2463 #endif /* __LP64__ */
2464 } else {
2465 chunk_size = ANON_CHUNK_SIZE;
2466 }
2467
2468 if (superpage_size) {
2469 switch (superpage_size) {
2470 /*
2471 * Note that the current implementation only supports
2472 * a single size for superpages, SUPERPAGE_SIZE, per
2473 * architecture. As soon as more sizes are to be
2474 * supported, SUPERPAGE_SIZE has to be replaced
2475 * with a lookup of the size depending on superpage_size.
2476 */
2477 #ifdef __x86_64__
2478 case SUPERPAGE_SIZE_ANY:
2479 /* handle it like 2 MB and round up to page size */
2480 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2481 OS_FALLTHROUGH;
2482 case SUPERPAGE_SIZE_2MB:
2483 break;
2484 #endif
2485 default:
2486 return KERN_INVALID_ARGUMENT;
2487 }
2488 mask = SUPERPAGE_SIZE - 1;
2489 if (size & (SUPERPAGE_SIZE - 1)) {
2490 return KERN_INVALID_ARGUMENT;
2491 }
2492 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2493 }
2494
2495
2496 if ((cur_protection & VM_PROT_WRITE) &&
2497 (cur_protection & VM_PROT_EXECUTE) &&
2498 #if XNU_TARGET_OS_OSX
2499 map->pmap != kernel_pmap &&
2500 (cs_process_global_enforcement() ||
2501 (vmk_flags.vmkf_cs_enforcement_override
2502 ? vmk_flags.vmkf_cs_enforcement
2503 : (vm_map_cs_enforcement(map)
2504 #if __arm64__
2505 || !VM_MAP_IS_EXOTIC(map)
2506 #endif /* __arm64__ */
2507 ))) &&
2508 #endif /* XNU_TARGET_OS_OSX */
2509 (VM_MAP_POLICY_WX_FAIL(map) ||
2510 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2511 !entry_for_jit) {
2512 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2513
2514 DTRACE_VM3(cs_wx,
2515 uint64_t, 0,
2516 uint64_t, 0,
2517 vm_prot_t, cur_protection);
2518 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2519 proc_selfpid(),
2520 (current_task()->bsd_info
2521 ? proc_name_address(current_task()->bsd_info)
2522 : "?"),
2523 __FUNCTION__,
2524 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2525 cur_protection &= ~VM_PROT_EXECUTE;
2526 if (vm_protect_wx_fail) {
2527 return KERN_PROTECTION_FAILURE;
2528 }
2529 }
2530
2531 /*
2532 * If the task has requested executable lockdown,
2533 * deny any new executable mapping.
2534 */
2535 if (map->map_disallow_new_exec == TRUE) {
2536 if (cur_protection & VM_PROT_EXECUTE) {
2537 return KERN_PROTECTION_FAILURE;
2538 }
2539 }
2540
2541 if (resilient_codesign) {
2542 assert(!is_submap);
2543 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2544 if ((cur_protection | max_protection) & reject_prot) {
2545 return KERN_PROTECTION_FAILURE;
2546 }
2547 }
2548
2549 if (resilient_media) {
2550 assert(!is_submap);
2551 // assert(!needs_copy);
2552 if (object != VM_OBJECT_NULL &&
2553 !object->internal) {
2554 /*
2555 * This mapping is directly backed by an external
2556 * memory manager (e.g. a vnode pager for a file):
2557 * we would not have any safe place to inject
2558 * a zero-filled page if an actual page is not
2559 * available, without possibly impacting the actual
2560 * contents of the mapped object (e.g. the file),
2561 * so we can't provide any media resiliency here.
2562 */
2563 return KERN_INVALID_ARGUMENT;
2564 }
2565 }
2566
2567 if (is_submap) {
2568 if (purgable) {
2569 /* submaps can not be purgeable */
2570 return KERN_INVALID_ARGUMENT;
2571 }
2572 if (object == VM_OBJECT_NULL) {
2573 /* submaps can not be created lazily */
2574 return KERN_INVALID_ARGUMENT;
2575 }
2576 }
2577 if (vmk_flags.vmkf_already) {
2578 /*
2579 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2580 * is already present. For it to be meaningful, the requested
2581 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2582 * we shouldn't try to remove what was mapped there first
2583 * (!VM_FLAGS_OVERWRITE).
2584 */
2585 if ((flags & VM_FLAGS_ANYWHERE) ||
2586 (flags & VM_FLAGS_OVERWRITE)) {
2587 return KERN_INVALID_ARGUMENT;
2588 }
2589 }
2590
2591 if (size == 0 ||
2592 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2593 *address = 0;
2594 return KERN_INVALID_ARGUMENT;
2595 }
2596
2597 if (map->pmap == kernel_pmap) {
2598 user_alias = VM_KERN_MEMORY_NONE;
2599 } else {
2600 user_alias = alias;
2601 }
2602
2603 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2604 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2605 }
2606
2607 #define RETURN(value) { result = value; goto BailOut; }
2608
2609 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2610 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2611 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2612 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2613 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2614 }
2615
2616 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2617 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2618 /*
2619 * In most cases, the caller rounds the size up to the
2620 * map's page size.
2621 * If we get a size that is explicitly not map-aligned here,
2622 * we'll have to respect the caller's wish and mark the
2623 * mapping as "not map-aligned" to avoid tripping the
2624 * map alignment checks later.
2625 */
2626 clear_map_aligned = TRUE;
2627 }
2628 if (!anywhere &&
2629 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2630 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2631 /*
2632 * We've been asked to map at a fixed address and that
2633 * address is not aligned to the map's specific alignment.
2634 * The caller should know what it's doing (i.e. most likely
2635 * mapping some fragmented copy map, transferring memory from
2636 * a VM map with a different alignment), so clear map_aligned
2637 * for this new VM map entry and proceed.
2638 */
2639 clear_map_aligned = TRUE;
2640 }
2641
2642 /*
2643 * Only zero-fill objects are allowed to be purgable.
2644 * LP64todo - limit purgable objects to 32-bits for now
2645 */
2646 if (purgable &&
2647 (offset != 0 ||
2648 (object != VM_OBJECT_NULL &&
2649 (object->vo_size != size ||
2650 object->purgable == VM_PURGABLE_DENY))
2651 || size > ANON_MAX_SIZE)) { /* LP64todo: remove when dp capable */
2652 return KERN_INVALID_ARGUMENT;
2653 }
2654
2655 start = *address;
2656
2657 if (anywhere) {
2658 vm_map_lock(map);
2659 map_locked = TRUE;
2660
2661 if (flags & VM_FLAGS_RANDOM_ADDR) {
2662 vmk_flags.vmkf_random_address = true;
2663 }
2664
2665 /*
2666 * Default to data range when an explicit range id isn't specified
2667 */
2668 if ((vmk_flags.vmkf_range_id == KMEM_RANGE_ID_NONE) &&
2669 (map == kernel_map)) {
2670 vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
2671 }
2672
2673 result = vm_map_locate_space(map, size, mask, vmk_flags,
2674 &start, &entry);
2675 if (result != KERN_SUCCESS) {
2676 goto BailOut;
2677 }
2678
2679 *address = start;
2680 end = start + size;
2681 assert(VM_MAP_PAGE_ALIGNED(*address,
2682 VM_MAP_PAGE_MASK(map)));
2683 } else {
2684 vm_map_offset_t effective_min_offset, effective_max_offset;
2685
2686 effective_min_offset = map->min_offset;
2687 effective_max_offset = map->max_offset;
2688
2689 if (vmk_flags.vmkf_beyond_max) {
2690 /*
2691 * Allow an insertion beyond the map's max offset.
2692 */
2693 effective_max_offset = 0x00000000FFFFF000ULL;
2694 #if !defined(__arm__)
2695 if (vm_map_is_64bit(map)) {
2696 effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2697 }
2698 #endif /* __arm__ */
2699 #if XNU_TARGET_OS_OSX
2700 } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2701 effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2702 #endif /* XNU_TARGET_OS_OSX */
2703 }
2704
2705 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2706 !overwrite &&
2707 user_alias == VM_MEMORY_REALLOC) {
2708 /*
2709 * Force realloc() to switch to a new allocation,
2710 * to prevent 4k-fragmented virtual ranges.
2711 */
2712 // DEBUG4K_ERROR("no realloc in place");
2713 return KERN_NO_SPACE;
2714 }
2715
2716 /*
2717 * Verify that:
2718 * the address doesn't itself violate
2719 * the mask requirement.
2720 */
2721
2722 vm_map_lock(map);
2723 map_locked = TRUE;
2724 if ((start & mask) != 0) {
2725 RETURN(KERN_NO_SPACE);
2726 }
2727
2728 /*
2729 * ... the address is within bounds
2730 */
2731
2732 end = start + size;
2733
2734 if ((start < effective_min_offset) ||
2735 (end > effective_max_offset) ||
2736 (start >= end)) {
2737 RETURN(KERN_INVALID_ADDRESS);
2738 }
2739
2740 if (overwrite) {
2741 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
2742
2743 /*
2744 * Fixed mapping and "overwrite" flag: attempt to
2745 * remove all existing mappings in the specified
2746 * address range, saving them in our "zap_old_list".
2747 *
2748 * This avoids releasing the VM map lock in
2749 * vm_map_entry_delete() and allows atomicity
2750 * when we want to replace some mappings with a new one.
2751 * It also allows us to restore the old VM mappings if the
2752 * new mapping fails.
2753 */
2754 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2755
2756 if (vmk_flags.vmkf_overwrite_immutable) {
2757 /* we can overwrite immutable mappings */
2758 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2759 }
2760 (void)vm_map_delete(map, start, end, remove_flags,
2761 KMEM_GUARD_NONE, &zap_old_list);
2762 }
2763
2764 /*
2765 * ... the starting address isn't allocated
2766 */
2767
2768 if (vm_map_lookup_entry(map, start, &entry)) {
2769 if (!(vmk_flags.vmkf_already)) {
2770 RETURN(KERN_NO_SPACE);
2771 }
2772 /*
2773 * Check if what's already there is what we want.
2774 */
2775 tmp_start = start;
2776 tmp_offset = offset;
2777 if (entry->vme_start < start) {
2778 tmp_start -= start - entry->vme_start;
2779 tmp_offset -= start - entry->vme_start;
2780 }
2781 for (; entry->vme_start < end;
2782 entry = entry->vme_next) {
2783 /*
2784 * Check if the mapping's attributes
2785 * match the existing map entry.
2786 */
2787 if (entry == vm_map_to_entry(map) ||
2788 entry->vme_start != tmp_start ||
2789 entry->is_sub_map != is_submap ||
2790 VME_OFFSET(entry) != tmp_offset ||
2791 entry->needs_copy != needs_copy ||
2792 entry->protection != cur_protection ||
2793 entry->max_protection != max_protection ||
2794 entry->inheritance != inheritance ||
2795 entry->iokit_acct != iokit_acct ||
2796 VME_ALIAS(entry) != alias) {
2797 /* not the same mapping ! */
2798 RETURN(KERN_NO_SPACE);
2799 }
2800 /*
2801 * Check if the same object is being mapped.
2802 */
2803 if (is_submap) {
2804 if (VME_SUBMAP(entry) !=
2805 (vm_map_t) object) {
2806 /* not the same submap */
2807 RETURN(KERN_NO_SPACE);
2808 }
2809 } else {
2810 if (VME_OBJECT(entry) != object) {
2811 /* not the same VM object... */
2812 vm_object_t obj2;
2813
2814 obj2 = VME_OBJECT(entry);
2815 if ((obj2 == VM_OBJECT_NULL ||
2816 obj2->internal) &&
2817 (object == VM_OBJECT_NULL ||
2818 object->internal)) {
2819 /*
2820 * ... but both are
2821 * anonymous memory,
2822 * so equivalent.
2823 */
2824 } else {
2825 RETURN(KERN_NO_SPACE);
2826 }
2827 }
2828 }
2829
2830 tmp_offset += entry->vme_end - entry->vme_start;
2831 tmp_start += entry->vme_end - entry->vme_start;
2832 if (entry->vme_end >= end) {
2833 /* reached the end of our mapping */
2834 break;
2835 }
2836 }
2837 /* it all matches: let's use what's already there ! */
2838 RETURN(KERN_MEMORY_PRESENT);
2839 }
2840
2841 /*
2842 * ... the next region doesn't overlap the
2843 * end point.
2844 */
2845
2846 if ((entry->vme_next != vm_map_to_entry(map)) &&
2847 (entry->vme_next->vme_start < end)) {
2848 RETURN(KERN_NO_SPACE);
2849 }
2850 }
2851
2852 /*
2853 * At this point,
2854 * "start" and "end" should define the endpoints of the
2855 * available new range, and
2856 * "entry" should refer to the region before the new
2857 * range, and
2858 *
2859 * the map should be locked.
2860 */
2861
2862 /*
2863 * See whether we can avoid creating a new entry (and object) by
2864 * extending one of our neighbors. [So far, we only attempt to
2865 * extend from below.] Note that we can never extend/join
2866 * purgable objects because they need to remain distinct
2867 * entities in order to implement their "volatile object"
2868 * semantics.
2869 */
2870
2871 if (purgable ||
2872 entry_for_jit ||
2873 vm_memory_malloc_no_cow(user_alias)) {
2874 if (object == VM_OBJECT_NULL) {
2875 object = vm_object_allocate(size);
2876 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2877 object->true_share = FALSE;
2878 if (purgable) {
2879 task_t owner;
2880 object->purgable = VM_PURGABLE_NONVOLATILE;
2881 if (map->pmap == kernel_pmap) {
2882 /*
2883 * Purgeable mappings made in a kernel
2884 * map are "owned" by the kernel itself
2885 * rather than the current user task
2886 * because they're likely to be used by
2887 * more than this user task (see
2888 * execargs_purgeable_allocate(), for
2889 * example).
2890 */
2891 owner = kernel_task;
2892 } else {
2893 owner = current_task();
2894 }
2895 assert(object->vo_owner == NULL);
2896 assert(object->resident_page_count == 0);
2897 assert(object->wired_page_count == 0);
2898 vm_object_lock(object);
2899 vm_purgeable_nonvolatile_enqueue(object, owner);
2900 vm_object_unlock(object);
2901 }
2902 offset = (vm_object_offset_t)0;
2903 }
2904 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
2905 /* no coalescing if address space uses sub-pages */
2906 } else if ((is_submap == FALSE) &&
2907 (object == VM_OBJECT_NULL) &&
2908 (entry != vm_map_to_entry(map)) &&
2909 (entry->vme_end == start) &&
2910 (!entry->is_shared) &&
2911 (!entry->is_sub_map) &&
2912 (!entry->in_transition) &&
2913 (!entry->needs_wakeup) &&
2914 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
2915 (entry->protection == cur_protection) &&
2916 (entry->max_protection == max_protection) &&
2917 (entry->inheritance == inheritance) &&
2918 ((user_alias == VM_MEMORY_REALLOC) ||
2919 (VME_ALIAS(entry) == alias)) &&
2920 (entry->no_cache == no_cache) &&
2921 (entry->permanent == permanent) &&
2922 /* no coalescing for immutable executable mappings */
2923 !((entry->protection & VM_PROT_EXECUTE) &&
2924 entry->permanent) &&
2925 (!entry->superpage_size && !superpage_size) &&
2926 /*
2927 * No coalescing if not map-aligned, to avoid propagating
2928 * that condition any further than needed:
2929 */
2930 (!entry->map_aligned || !clear_map_aligned) &&
2931 (!entry->zero_wired_pages) &&
2932 (!entry->used_for_jit && !entry_for_jit) &&
2933 (!entry->pmap_cs_associated) &&
2934 (entry->iokit_acct == iokit_acct) &&
2935 (!entry->vme_resilient_codesign) &&
2936 (!entry->vme_resilient_media) &&
2937 (!entry->vme_atomic) &&
2938 (entry->vme_no_copy_on_read == no_copy_on_read) &&
2939
2940 ((entry->vme_end - entry->vme_start) + size <=
2941 (user_alias == VM_MEMORY_REALLOC ?
2942 ANON_CHUNK_SIZE :
2943 NO_COALESCE_LIMIT)) &&
2944
2945 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
2946 if (vm_object_coalesce(VME_OBJECT(entry),
2947 VM_OBJECT_NULL,
2948 VME_OFFSET(entry),
2949 (vm_object_offset_t) 0,
2950 (vm_map_size_t)(entry->vme_end - entry->vme_start),
2951 (vm_map_size_t)(end - entry->vme_end))) {
2952 /*
2953 * Coalesced the two objects - can extend
2954 * the previous map entry to include the
2955 * new range.
2956 */
2957 map->size += (end - entry->vme_end);
2958 assert(entry->vme_start < end);
2959 assert(VM_MAP_PAGE_ALIGNED(end,
2960 VM_MAP_PAGE_MASK(map)));
2961 if (__improbable(vm_debug_events)) {
2962 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
2963 }
2964 entry->vme_end = end;
2965 if (map->holelistenabled) {
2966 vm_map_store_update_first_free(map, entry, TRUE);
2967 } else {
2968 vm_map_store_update_first_free(map, map->first_free, TRUE);
2969 }
2970 new_mapping_established = TRUE;
2971 RETURN(KERN_SUCCESS);
2972 }
2973 }
2974
2975 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
2976 new_entry = NULL;
2977
2978 if (vmk_flags.vmkf_submap_adjust) {
2979 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
2980 offset = start;
2981 }
2982
2983 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
2984 tmp2_end = tmp2_start + step;
2985 /*
2986 * Create a new entry
2987 *
2988 * XXX FBDP
2989 * The reserved "page zero" in each process's address space can
2990 * be arbitrarily large. Splitting it into separate objects and
2991 * therefore different VM map entries serves no purpose and just
2992 * slows down operations on the VM map, so let's not split the
2993 * allocation into chunks if the max protection is NONE. That
2994 * memory should never be accessible, so it will never get to the
2995 * default pager.
2996 */
2997 tmp_start = tmp2_start;
2998 if (object == VM_OBJECT_NULL &&
2999 size > chunk_size &&
3000 max_protection != VM_PROT_NONE &&
3001 superpage_size == 0) {
3002 tmp_end = tmp_start + chunk_size;
3003 } else {
3004 tmp_end = tmp2_end;
3005 }
3006 do {
3007 if (!is_submap &&
3008 object != VM_OBJECT_NULL &&
3009 object->internal &&
3010 offset + (tmp_end - tmp_start) > object->vo_size) {
3011 // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3012 DTRACE_VM5(vm_map_enter_overmap,
3013 vm_map_t, map,
3014 vm_map_address_t, tmp_start,
3015 vm_map_address_t, tmp_end,
3016 vm_object_offset_t, offset,
3017 vm_object_size_t, object->vo_size);
3018 }
3019 new_entry = vm_map_entry_insert(map,
3020 entry, tmp_start, tmp_end,
3021 object, offset, vmk_flags,
3022 needs_copy,
3023 cur_protection, max_protection,
3024 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3025 VM_INHERIT_NONE : inheritance),
3026 no_cache,
3027 permanent,
3028 superpage_size,
3029 clear_map_aligned,
3030 alias);
3031
3032 assert((object != kernel_object) || (VM_KERN_MEMORY_NONE != alias));
3033
3034 if (resilient_codesign) {
3035 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3036 if (!((cur_protection | max_protection) & reject_prot)) {
3037 new_entry->vme_resilient_codesign = TRUE;
3038 }
3039 }
3040
3041 if (resilient_media &&
3042 (object == VM_OBJECT_NULL ||
3043 object->internal)) {
3044 new_entry->vme_resilient_media = TRUE;
3045 }
3046
3047 assert(!new_entry->iokit_acct);
3048 if (!is_submap &&
3049 object != VM_OBJECT_NULL &&
3050 (object->purgable != VM_PURGABLE_DENY ||
3051 object->vo_ledger_tag)) {
3052 assert(new_entry->use_pmap);
3053 assert(!new_entry->iokit_acct);
3054 /*
3055 * Turn off pmap accounting since
3056 * purgeable (or tagged) objects have their
3057 * own ledgers.
3058 */
3059 new_entry->use_pmap = FALSE;
3060 } else if (!is_submap &&
3061 iokit_acct &&
3062 object != VM_OBJECT_NULL &&
3063 object->internal) {
3064 /* alternate accounting */
3065 assert(!new_entry->iokit_acct);
3066 assert(new_entry->use_pmap);
3067 new_entry->iokit_acct = TRUE;
3068 new_entry->use_pmap = FALSE;
3069 DTRACE_VM4(
3070 vm_map_iokit_mapped_region,
3071 vm_map_t, map,
3072 vm_map_offset_t, new_entry->vme_start,
3073 vm_map_offset_t, new_entry->vme_end,
3074 int, VME_ALIAS(new_entry));
3075 vm_map_iokit_mapped_region(
3076 map,
3077 (new_entry->vme_end -
3078 new_entry->vme_start));
3079 } else if (!is_submap) {
3080 assert(!new_entry->iokit_acct);
3081 assert(new_entry->use_pmap);
3082 }
3083
3084 if (is_submap) {
3085 vm_map_t submap;
3086 boolean_t submap_is_64bit;
3087 boolean_t use_pmap;
3088
3089 assert(new_entry->is_sub_map);
3090 assert(!new_entry->use_pmap);
3091 assert(!new_entry->iokit_acct);
3092 submap = (vm_map_t) object;
3093 submap_is_64bit = vm_map_is_64bit(submap);
3094 use_pmap = vmk_flags.vmkf_nested_pmap;
3095 #ifndef NO_NESTED_PMAP
3096 if (use_pmap && submap->pmap == NULL) {
3097 ledger_t ledger = map->pmap->ledger;
3098 /* we need a sub pmap to nest... */
3099 submap->pmap = pmap_create_options(ledger, 0,
3100 submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3101 if (submap->pmap == NULL) {
3102 /* let's proceed without nesting... */
3103 }
3104 #if defined(__arm__) || defined(__arm64__)
3105 else {
3106 pmap_set_nested(submap->pmap);
3107 }
3108 #endif
3109 }
3110 if (use_pmap && submap->pmap != NULL) {
3111 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3112 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3113 kr = KERN_FAILURE;
3114 } else {
3115 kr = pmap_nest(map->pmap,
3116 submap->pmap,
3117 tmp_start,
3118 tmp_end - tmp_start);
3119 }
3120 if (kr != KERN_SUCCESS) {
3121 printf("vm_map_enter: "
3122 "pmap_nest(0x%llx,0x%llx) "
3123 "error 0x%x\n",
3124 (long long)tmp_start,
3125 (long long)tmp_end,
3126 kr);
3127 } else {
3128 /* we're now nested ! */
3129 new_entry->use_pmap = TRUE;
3130 pmap_empty = FALSE;
3131 }
3132 }
3133 #endif /* NO_NESTED_PMAP */
3134 }
3135 entry = new_entry;
3136
3137 if (superpage_size) {
3138 vm_page_t pages, m;
3139 vm_object_t sp_object;
3140 vm_object_offset_t sp_offset;
3141
3142 VME_OFFSET_SET(entry, 0);
3143
3144 /* allocate one superpage */
3145 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3146 if (kr != KERN_SUCCESS) {
3147 /* deallocate whole range... */
3148 new_mapping_established = TRUE;
3149 /* ... but only up to "tmp_end" */
3150 size -= end - tmp_end;
3151 RETURN(kr);
3152 }
3153
3154 /* create one vm_object per superpage */
3155 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3156 sp_object->phys_contiguous = TRUE;
3157 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3158 VME_OBJECT_SET(entry, sp_object, false, 0);
3159 assert(entry->use_pmap);
3160
3161 /* enter the base pages into the object */
3162 vm_object_lock(sp_object);
3163 for (sp_offset = 0;
3164 sp_offset < SUPERPAGE_SIZE;
3165 sp_offset += PAGE_SIZE) {
3166 m = pages;
3167 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3168 pages = NEXT_PAGE(m);
3169 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3170 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3171 }
3172 vm_object_unlock(sp_object);
3173 }
3174 } while (tmp_end != tmp2_end &&
3175 (tmp_start = tmp_end) &&
3176 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3177 tmp_end + chunk_size : tmp2_end));
3178 }
3179
3180 new_mapping_established = TRUE;
3181
3182 BailOut:
3183 assert(map_locked == TRUE);
3184
3185 /*
3186 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3187 * If we have identified and possibly established the new mapping(s),
3188 * make sure we did not go beyond the address space limit.
3189 */
3190 if (result == KERN_SUCCESS) {
3191 if (map->size_limit != RLIM_INFINITY &&
3192 map->size > map->size_limit) {
3193 /*
3194 * Establishing the requested mappings would exceed
3195 * the process's RLIMIT_AS limit: fail with
3196 * KERN_NO_SPACE.
3197 */
3198 result = KERN_NO_SPACE;
3199 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3200 proc_selfpid(),
3201 (current_task()->bsd_info
3202 ? proc_name_address(current_task()->bsd_info)
3203 : "?"),
3204 __FUNCTION__,
3205 (uint64_t) map->size,
3206 (uint64_t) map->size_limit);
3207 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3208 vm_map_size_t, map->size,
3209 uint64_t, map->size_limit);
3210 vm_map_enter_RLIMIT_AS_count++;
3211 } else if (map->data_limit != RLIM_INFINITY &&
3212 map->size > map->data_limit) {
3213 /*
3214 * Establishing the requested mappings would exceed
3215 * the process's RLIMIT_DATA limit: fail with
3216 * KERN_NO_SPACE.
3217 */
3218 result = KERN_NO_SPACE;
3219 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3220 proc_selfpid(),
3221 (current_task()->bsd_info
3222 ? proc_name_address(current_task()->bsd_info)
3223 : "?"),
3224 __FUNCTION__,
3225 (uint64_t) map->size,
3226 (uint64_t) map->data_limit);
3227 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3228 vm_map_size_t, map->size,
3229 uint64_t, map->data_limit);
3230 vm_map_enter_RLIMIT_DATA_count++;
3231 }
3232 }
3233
3234 if (result == KERN_SUCCESS) {
3235 vm_prot_t pager_prot;
3236 memory_object_t pager;
3237
3238 #if DEBUG
3239 if (pmap_empty &&
3240 !(vmk_flags.vmkf_no_pmap_check)) {
3241 assert(pmap_is_empty(map->pmap,
3242 *address,
3243 *address + size));
3244 }
3245 #endif /* DEBUG */
3246
3247 /*
3248 * For "named" VM objects, let the pager know that the
3249 * memory object is being mapped. Some pagers need to keep
3250 * track of this, to know when they can reclaim the memory
3251 * object, for example.
3252 * VM calls memory_object_map() for each mapping (specifying
3253 * the protection of each mapping) and calls
3254 * memory_object_last_unmap() when all the mappings are gone.
3255 */
3256 pager_prot = max_protection;
3257 if (needs_copy) {
3258 /*
3259 * Copy-On-Write mapping: won't modify
3260 * the memory object.
3261 */
3262 pager_prot &= ~VM_PROT_WRITE;
3263 }
3264 if (!is_submap &&
3265 object != VM_OBJECT_NULL &&
3266 object->named &&
3267 object->pager != MEMORY_OBJECT_NULL) {
3268 vm_object_lock(object);
3269 pager = object->pager;
3270 if (object->named &&
3271 pager != MEMORY_OBJECT_NULL) {
3272 assert(object->pager_ready);
3273 vm_object_mapping_wait(object, THREAD_UNINT);
3274 vm_object_mapping_begin(object);
3275 vm_object_unlock(object);
3276
3277 kr = memory_object_map(pager, pager_prot);
3278 assert(kr == KERN_SUCCESS);
3279
3280 vm_object_lock(object);
3281 vm_object_mapping_end(object);
3282 }
3283 vm_object_unlock(object);
3284 }
3285 }
3286
3287 assert(map_locked == TRUE);
3288
3289 if (!keep_map_locked) {
3290 vm_map_unlock(map);
3291 map_locked = FALSE;
3292 }
3293
3294 /*
3295 * We can't hold the map lock if we enter this block.
3296 */
3297
3298 if (result == KERN_SUCCESS) {
3299 /* Wire down the new entry if the user
3300 * requested all new map entries be wired.
3301 */
3302 if ((map->wiring_required) || (superpage_size)) {
3303 assert(!keep_map_locked);
3304 pmap_empty = FALSE; /* pmap won't be empty */
3305 kr = vm_map_wire_kernel(map, start, end,
3306 new_entry->protection, VM_KERN_MEMORY_MLOCK,
3307 TRUE);
3308 result = kr;
3309 }
3310
3311 }
3312
3313 if (result != KERN_SUCCESS) {
3314 if (new_mapping_established) {
3315 /*
3316 * The caller had an extra reference on the VM object
3317 * it gave us.
3318 * We've transferred that reference to the mapping we
3319 * just established but we're about to undo that mapping
3320 * and release that reference.
3321 * The caller expects its reference to be consumed on
3322 * success only, so we have to get the extra reference
3323 * back for the caller.
3324 */
3325 vm_object_reference(caller_object);
3326
3327 /*
3328 * We have to get rid of the new mappings since we
3329 * won't make them available to the user.
3330 * Try to do that atomically, to minimize the risk
3331 * that someone else creates new mappings in that range.
3332 */
3333
3334 if (!map_locked) {
3335 vm_map_lock(map);
3336 map_locked = TRUE;
3337 }
3338 (void)vm_map_delete(map, *address, *address + size,
3339 VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
3340 KMEM_GUARD_NONE, &zap_new_list);
3341 }
3342
3343 if (vm_map_zap_first_entry(&zap_old_list)) {
3344 vm_map_entry_t entry1, entry2;
3345
3346 /*
3347 * The new mapping failed. Attempt to restore
3348 * the old mappings, saved in the "zap_old_list".
3349 */
3350 if (!map_locked) {
3351 vm_map_lock(map);
3352 map_locked = TRUE;
3353 }
3354
3355 /* first check if the coast is still clear */
3356 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3357 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3358
3359 if (vm_map_lookup_entry(map, start, &entry1) ||
3360 vm_map_lookup_entry(map, end, &entry2) ||
3361 entry1 != entry2) {
3362 /*
3363 * Part of that range has already been
3364 * re-mapped: we can't restore the old
3365 * mappings...
3366 */
3367 vm_map_enter_restore_failures++;
3368 } else {
3369 /*
3370 * Transfer the saved map entries from
3371 * "zap_old_map" to the original "map",
3372 * inserting them all after "entry1".
3373 */
3374 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3375 vm_map_size_t entry_size;
3376
3377 entry_size = (entry2->vme_end -
3378 entry2->vme_start);
3379 vm_map_store_entry_link(map, entry1, entry2,
3380 VM_MAP_KERNEL_FLAGS_NONE);
3381 map->size += entry_size;
3382 entry1 = entry2;
3383 }
3384 if (map->wiring_required) {
3385 /*
3386 * XXX TODO: we should rewire the
3387 * old pages here...
3388 */
3389 }
3390 vm_map_enter_restore_successes++;
3391 }
3392 }
3393 }
3394
3395 /*
3396 * The caller is responsible for releasing the lock if it requested to
3397 * keep the map locked.
3398 */
3399 if (map_locked && !keep_map_locked) {
3400 vm_map_unlock(map);
3401 }
3402
3403 vm_map_zap_dispose(&zap_old_list);
3404 vm_map_zap_dispose(&zap_new_list);
3405
3406 return result;
3407
3408 #undef RETURN
3409 }
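
/*
 * Illustrative sketch (not part of the original source): an anonymous,
 * pageable VM_FLAGS_ANYWHERE mapping established through vm_map_enter()
 * above. Real callers usually reach this routine through higher-level
 * entry points such as mach_vm_map(); the helper name and the choice of
 * tag/protections here are hypothetical.
 */
__unused static kern_return_t
vm_map_enter_usage_sketch(
	vm_map_t         map,
	vm_map_size_t    size,
	vm_map_offset_t *addr_out)
{
	*addr_out = 0;

	return vm_map_enter(map,
	           addr_out,                   /* IN/OUT: chosen address */
	           size,
	           (vm_map_offset_t)0,         /* mask: no alignment constraint */
	           VM_FLAGS_ANYWHERE,
	           VM_MAP_KERNEL_FLAGS_NONE,
	           VM_MEMORY_MALLOC,           /* alias / tag */
	           VM_OBJECT_NULL,             /* anonymous, allocated lazily */
	           (vm_object_offset_t)0,
	           FALSE,                      /* needs_copy */
	           VM_PROT_DEFAULT,
	           VM_PROT_ALL,
	           VM_INHERIT_DEFAULT);
}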
3410
3411 #if __arm64__
3412 extern const struct memory_object_pager_ops fourk_pager_ops;
3413 kern_return_t
3414 vm_map_enter_fourk(
3415 vm_map_t map,
3416 vm_map_offset_t *address, /* IN/OUT */
3417 vm_map_size_t size,
3418 vm_map_offset_t mask,
3419 int flags,
3420 vm_map_kernel_flags_t vmk_flags,
3421 vm_tag_t alias,
3422 vm_object_t object,
3423 vm_object_offset_t offset,
3424 boolean_t needs_copy,
3425 vm_prot_t cur_protection,
3426 vm_prot_t max_protection,
3427 vm_inherit_t inheritance)
3428 {
3429 vm_map_entry_t entry, new_entry;
3430 vm_map_offset_t start, fourk_start;
3431 vm_map_offset_t end, fourk_end;
3432 vm_map_size_t fourk_size;
3433 kern_return_t result = KERN_SUCCESS;
3434 boolean_t map_locked = FALSE;
3435 boolean_t pmap_empty = TRUE;
3436 boolean_t new_mapping_established = FALSE;
3437 boolean_t keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3438 boolean_t anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0);
3439 boolean_t purgable = ((flags & VM_FLAGS_PURGABLE) != 0);
3440 boolean_t overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
3441 boolean_t no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
3442 const boolean_t is_submap = vmk_flags.vmkf_submap;
3443 boolean_t permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
3444 const boolean_t entry_for_jit = vmk_flags.vmkf_map_jit;
3445 // boolean_t iokit_acct = vmk_flags.vmkf_iokit_acct;
3446 unsigned int superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
3447 vm_map_offset_t effective_min_offset, effective_max_offset;
3448 kern_return_t kr;
3449 boolean_t clear_map_aligned = FALSE;
3450 memory_object_t fourk_mem_obj;
3451 vm_object_t fourk_object;
3452 vm_map_offset_t fourk_pager_offset;
3453 int fourk_pager_index_start, fourk_pager_index_num;
3454 int cur_idx;
3455 boolean_t fourk_copy;
3456 vm_object_t copy_object;
3457 vm_object_offset_t copy_offset;
3458 VM_MAP_ZAP_DECLARE(zap_list);
3459
3460 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3461 panic("%s:%d", __FUNCTION__, __LINE__);
3462 }
3463 fourk_mem_obj = MEMORY_OBJECT_NULL;
3464 fourk_object = VM_OBJECT_NULL;
3465
3466 if (superpage_size) {
3467 return KERN_NOT_SUPPORTED;
3468 }
3469
3470 if ((cur_protection & VM_PROT_WRITE) &&
3471 (cur_protection & VM_PROT_EXECUTE) &&
3472 #if XNU_TARGET_OS_OSX
3473 map->pmap != kernel_pmap &&
3474 (vm_map_cs_enforcement(map)
3475 #if __arm64__
3476 || !VM_MAP_IS_EXOTIC(map)
3477 #endif /* __arm64__ */
3478 ) &&
3479 #endif /* XNU_TARGET_OS_OSX */
3480 !entry_for_jit) {
3481 DTRACE_VM3(cs_wx,
3482 uint64_t, 0,
3483 uint64_t, 0,
3484 vm_prot_t, cur_protection);
3485 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3486 "turning off execute\n",
3487 proc_selfpid(),
3488 (current_task()->bsd_info
3489 ? proc_name_address(current_task()->bsd_info)
3490 : "?"),
3491 __FUNCTION__);
3492 cur_protection &= ~VM_PROT_EXECUTE;
3493 }
3494
3495 /*
3496 * If the task has requested executable lockdown,
3497 * deny any new executable mapping.
3498 */
3499 if (map->map_disallow_new_exec == TRUE) {
3500 if (cur_protection & VM_PROT_EXECUTE) {
3501 return KERN_PROTECTION_FAILURE;
3502 }
3503 }
3504
3505 if (is_submap) {
3506 return KERN_NOT_SUPPORTED;
3507 }
3508 if (vmk_flags.vmkf_already) {
3509 return KERN_NOT_SUPPORTED;
3510 }
3511 if (purgable || entry_for_jit) {
3512 return KERN_NOT_SUPPORTED;
3513 }
3514
3515 effective_min_offset = map->min_offset;
3516
3517 if (vmk_flags.vmkf_beyond_max) {
3518 return KERN_NOT_SUPPORTED;
3519 } else {
3520 effective_max_offset = map->max_offset;
3521 }
3522
3523 if (size == 0 ||
3524 (offset & FOURK_PAGE_MASK) != 0) {
3525 *address = 0;
3526 return KERN_INVALID_ARGUMENT;
3527 }
3528
3529 #define RETURN(value) { result = value; goto BailOut; }
3530
3531 assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3532 assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3533
3534 if (!anywhere && overwrite) {
3535 return KERN_NOT_SUPPORTED;
3536 }
3537
3538 fourk_start = *address;
3539 fourk_size = size;
3540 fourk_end = fourk_start + fourk_size;
3541
3542 start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3543 end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3544 size = end - start;
3545
3546 if (anywhere) {
3547 return KERN_NOT_SUPPORTED;
3548 } else {
3549 /*
3550 * Verify that:
3551 * the address doesn't itself violate
3552 * the mask requirement.
3553 */
3554
3555 vm_map_lock(map);
3556 map_locked = TRUE;
3557 if ((start & mask) != 0) {
3558 RETURN(KERN_NO_SPACE);
3559 }
3560
3561 /*
3562 * ... the address is within bounds
3563 */
3564
3565 end = start + size;
3566
3567 if ((start < effective_min_offset) ||
3568 (end > effective_max_offset) ||
3569 (start >= end)) {
3570 RETURN(KERN_INVALID_ADDRESS);
3571 }
3572
3573 /*
3574 * ... the starting address isn't allocated
3575 */
3576 if (vm_map_lookup_entry(map, start, &entry)) {
3577 vm_object_t cur_object, shadow_object;
3578
3579 /*
3580 * We might already have some 4K mappings
3581 * in a 16K page here.
3582 */
3583
3584 if (entry->vme_end - entry->vme_start
3585 != SIXTEENK_PAGE_SIZE) {
3586 RETURN(KERN_NO_SPACE);
3587 }
3588 if (entry->is_sub_map) {
3589 RETURN(KERN_NO_SPACE);
3590 }
3591 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3592 RETURN(KERN_NO_SPACE);
3593 }
3594
3595 /* go all the way down the shadow chain */
3596 cur_object = VME_OBJECT(entry);
3597 vm_object_lock(cur_object);
3598 while (cur_object->shadow != VM_OBJECT_NULL) {
3599 shadow_object = cur_object->shadow;
3600 vm_object_lock(shadow_object);
3601 vm_object_unlock(cur_object);
3602 cur_object = shadow_object;
3603 shadow_object = VM_OBJECT_NULL;
3604 }
3605 if (cur_object->internal ||
3606 cur_object->pager == NULL) {
3607 vm_object_unlock(cur_object);
3608 RETURN(KERN_NO_SPACE);
3609 }
3610 if (cur_object->pager->mo_pager_ops
3611 != &fourk_pager_ops) {
3612 vm_object_unlock(cur_object);
3613 RETURN(KERN_NO_SPACE);
3614 }
3615 fourk_object = cur_object;
3616 fourk_mem_obj = fourk_object->pager;
3617
3618 /* keep the "4K" object alive */
3619 vm_object_reference_locked(fourk_object);
3620 memory_object_reference(fourk_mem_obj);
3621 vm_object_unlock(fourk_object);
3622
3623 /* merge permissions */
3624 entry->protection |= cur_protection;
3625 entry->max_protection |= max_protection;
3626
3627 if ((entry->protection & VM_PROT_WRITE) &&
3628 (entry->protection & VM_PROT_ALLEXEC) &&
3629 fourk_binary_compatibility_unsafe &&
3630 fourk_binary_compatibility_allow_wx) {
3631 /* write+execute: need to be "jit" */
3632 entry->used_for_jit = TRUE;
3633 }
3634 goto map_in_fourk_pager;
3635 }
3636
3637 /*
3638 * ... the next region doesn't overlap the
3639 * end point.
3640 */
3641
3642 if ((entry->vme_next != vm_map_to_entry(map)) &&
3643 (entry->vme_next->vme_start < end)) {
3644 RETURN(KERN_NO_SPACE);
3645 }
3646 }
3647
3648 /*
3649 * At this point,
3650 * "start" and "end" should define the endpoints of the
3651 * available new range, and
3652 * "entry" should refer to the region before the new
3653 * range, and
3654 *
3655 * the map should be locked.
3656 */
3657
3658 /* create a new "4K" pager */
3659 fourk_mem_obj = fourk_pager_create();
3660 fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
3661 assert(fourk_object);
3662
3663 /* keep the "4K" object alive */
3664 vm_object_reference(fourk_object);
3665
3666 /* create a "copy" object, to map the "4K" object copy-on-write */
3667 fourk_copy = TRUE;
3668 result = vm_object_copy_strategically(fourk_object,
3669 0,
3670 end - start,
3671 &copy_object,
3672 &copy_offset,
3673 &fourk_copy);
3674 assert(result == KERN_SUCCESS);
3675 assert(copy_object != VM_OBJECT_NULL);
3676 assert(copy_offset == 0);
3677
3678 /* map the "4K" pager's copy object */
3679 new_entry = vm_map_entry_insert(map,
3680 entry,
3681 vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3682 vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3683 copy_object,
3684 0, /* offset */
3685 vmk_flags,
3686 FALSE, /* needs_copy */
3687 cur_protection, max_protection,
3688 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3689 VM_INHERIT_NONE : inheritance),
3690 no_cache,
3691 permanent,
3692 superpage_size,
3693 clear_map_aligned,
3694 alias);
3695 entry = new_entry;
3696
3697 #if VM_MAP_DEBUG_FOURK
3698 if (vm_map_debug_fourk) {
3699 printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3700 map,
3701 (uint64_t) entry->vme_start,
3702 (uint64_t) entry->vme_end,
3703 fourk_mem_obj);
3704 }
3705 #endif /* VM_MAP_DEBUG_FOURK */
3706
3707 new_mapping_established = TRUE;
3708
3709 map_in_fourk_pager:
3710 /* "map" the original "object" where it belongs in the "4K" pager */
3711 fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3712 fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3713 if (fourk_size > SIXTEENK_PAGE_SIZE) {
3714 fourk_pager_index_num = 4;
3715 } else {
3716 fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3717 }
3718 if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3719 fourk_pager_index_num = 4 - fourk_pager_index_start;
3720 }
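/*
 * A native 16K page covers four 4K sub-pages; populate the
 * requested slots of the "4K" pager with the backing object,
 * one FOURK_PAGE_SIZE chunk at a time.
 */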
3721 for (cur_idx = 0;
3722 cur_idx < fourk_pager_index_num;
3723 cur_idx++) {
3724 vm_object_t old_object;
3725 vm_object_offset_t old_offset;
3726
3727 kr = fourk_pager_populate(fourk_mem_obj,
3728 TRUE, /* overwrite */
3729 fourk_pager_index_start + cur_idx,
3730 object,
3731 (object
3732 ? (offset +
3733 (cur_idx * FOURK_PAGE_SIZE))
3734 : 0),
3735 &old_object,
3736 &old_offset);
3737 #if VM_MAP_DEBUG_FOURK
3738 if (vm_map_debug_fourk) {
3739 if (old_object == (vm_object_t) -1 &&
3740 old_offset == (vm_object_offset_t) -1) {
3741 printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3742 "pager [%p:0x%llx] "
3743 "populate[%d] "
3744 "[object:%p,offset:0x%llx]\n",
3745 map,
3746 (uint64_t) entry->vme_start,
3747 (uint64_t) entry->vme_end,
3748 fourk_mem_obj,
3749 VME_OFFSET(entry),
3750 fourk_pager_index_start + cur_idx,
3751 object,
3752 (object
3753 ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3754 : 0));
3755 } else {
3756 printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3757 "pager [%p:0x%llx] "
3758 "populate[%d] [object:%p,offset:0x%llx] "
3759 "old [%p:0x%llx]\n",
3760 map,
3761 (uint64_t) entry->vme_start,
3762 (uint64_t) entry->vme_end,
3763 fourk_mem_obj,
3764 VME_OFFSET(entry),
3765 fourk_pager_index_start + cur_idx,
3766 object,
3767 (object
3768 ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3769 : 0),
3770 old_object,
3771 old_offset);
3772 }
3773 }
3774 #endif /* VM_MAP_DEBUG_FOURK */
3775
3776 assert(kr == KERN_SUCCESS);
3777 if (object != old_object &&
3778 object != VM_OBJECT_NULL &&
3779 object != (vm_object_t) -1) {
3780 vm_object_reference(object);
3781 }
3782 if (object != old_object &&
3783 old_object != VM_OBJECT_NULL &&
3784 old_object != (vm_object_t) -1) {
3785 vm_object_deallocate(old_object);
3786 }
3787 }
3788
3789 BailOut:
3790 assert(map_locked == TRUE);
3791
3792 if (result == KERN_SUCCESS) {
3793 vm_prot_t pager_prot;
3794 memory_object_t pager;
3795
3796 #if DEBUG
3797 if (pmap_empty &&
3798 !(vmk_flags.vmkf_no_pmap_check)) {
3799 assert(pmap_is_empty(map->pmap,
3800 *address,
3801 *address + size));
3802 }
3803 #endif /* DEBUG */
3804
3805 /*
3806 * For "named" VM objects, let the pager know that the
3807 * memory object is being mapped. Some pagers need to keep
3808 * track of this, to know when they can reclaim the memory
3809 * object, for example.
3810 * VM calls memory_object_map() for each mapping (specifying
3811 * the protection of each mapping) and calls
3812 * memory_object_last_unmap() when all the mappings are gone.
3813 */
3814 pager_prot = max_protection;
3815 if (needs_copy) {
3816 /*
3817 * Copy-On-Write mapping: won't modify
3818 * the memory object.
3819 */
3820 pager_prot &= ~VM_PROT_WRITE;
3821 }
3822 if (!is_submap &&
3823 object != VM_OBJECT_NULL &&
3824 object->named &&
3825 object->pager != MEMORY_OBJECT_NULL) {
3826 vm_object_lock(object);
3827 pager = object->pager;
3828 if (object->named &&
3829 pager != MEMORY_OBJECT_NULL) {
3830 assert(object->pager_ready);
3831 vm_object_mapping_wait(object, THREAD_UNINT);
3832 vm_object_mapping_begin(object);
3833 vm_object_unlock(object);
3834
3835 kr = memory_object_map(pager, pager_prot);
3836 assert(kr == KERN_SUCCESS);
3837
3838 vm_object_lock(object);
3839 vm_object_mapping_end(object);
3840 }
3841 vm_object_unlock(object);
3842 }
3843 if (!is_submap &&
3844 fourk_object != VM_OBJECT_NULL &&
3845 fourk_object->named &&
3846 fourk_object->pager != MEMORY_OBJECT_NULL) {
3847 vm_object_lock(fourk_object);
3848 pager = fourk_object->pager;
3849 if (fourk_object->named &&
3850 pager != MEMORY_OBJECT_NULL) {
3851 assert(fourk_object->pager_ready);
3852 vm_object_mapping_wait(fourk_object,
3853 THREAD_UNINT);
3854 vm_object_mapping_begin(fourk_object);
3855 vm_object_unlock(fourk_object);
3856
3857 kr = memory_object_map(pager, VM_PROT_READ);
3858 assert(kr == KERN_SUCCESS);
3859
3860 vm_object_lock(fourk_object);
3861 vm_object_mapping_end(fourk_object);
3862 }
3863 vm_object_unlock(fourk_object);
3864 }
3865 }
3866
3867 if (fourk_object != VM_OBJECT_NULL) {
3868 vm_object_deallocate(fourk_object);
3869 fourk_object = VM_OBJECT_NULL;
3870 memory_object_deallocate(fourk_mem_obj);
3871 fourk_mem_obj = MEMORY_OBJECT_NULL;
3872 }
3873
3874 assert(map_locked == TRUE);
3875
3876 if (!keep_map_locked) {
3877 vm_map_unlock(map);
3878 map_locked = FALSE;
3879 }
3880
3881 /*
3882 * We can't hold the map lock if we enter this block.
3883 */
3884
3885 if (result == KERN_SUCCESS) {
3886 /* Wire down the new entry if the user
3887 * requested all new map entries be wired.
3888 */
3889 if ((map->wiring_required) || (superpage_size)) {
3890 assert(!keep_map_locked);
3891 pmap_empty = FALSE; /* pmap won't be empty */
3892 kr = vm_map_wire_kernel(map, start, end,
3893 new_entry->protection, VM_KERN_MEMORY_MLOCK,
3894 TRUE);
3895 result = kr;
3896 }
3897
3898 }
3899
3900 if (result != KERN_SUCCESS) {
3901 if (new_mapping_established) {
3902 /*
3903 * We have to get rid of the new mappings since we
3904 * won't make them available to the user.
3905 * Try to do that atomically, to minimize the risk
3906 * that someone else creates new mappings in that range.
3907 */
3908
3909 if (!map_locked) {
3910 vm_map_lock(map);
3911 map_locked = TRUE;
3912 }
3913 (void)vm_map_delete(map, *address, *address + size,
3914 VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
3915 KMEM_GUARD_NONE, &zap_list);
3916 }
3917 }
3918
3919 /*
3920 * The caller is responsible for releasing the lock if it requested to
3921 * keep the map locked.
3922 */
3923 if (map_locked && !keep_map_locked) {
3924 vm_map_unlock(map);
3925 }
3926
3927 vm_map_zap_dispose(&zap_list);
3928
3929 return result;
3930
3931 #undef RETURN
3932 }
3933 #endif /* __arm64__ */
3934
3935 /*
3936 * Counters for the prefault optimization.
3937 */
3938 int64_t vm_prefault_nb_pages = 0;
3939 int64_t vm_prefault_nb_bailout = 0;
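/* both counters are bumped with OSIncrementAtomic64() from the prefault loop below */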
3940
3941 static kern_return_t
3942 vm_map_enter_mem_object_helper(
3943 vm_map_t target_map,
3944 vm_map_offset_t *address,
3945 vm_map_size_t initial_size,
3946 vm_map_offset_t mask,
3947 int flags,
3948 vm_map_kernel_flags_t vmk_flags,
3949 vm_tag_t tag,
3950 ipc_port_t port,
3951 vm_object_offset_t offset,
3952 boolean_t copy,
3953 vm_prot_t cur_protection,
3954 vm_prot_t max_protection,
3955 vm_inherit_t inheritance,
3956 upl_page_list_ptr_t page_list,
3957 unsigned int page_list_count)
3958 {
3959 vm_map_address_t map_addr;
3960 vm_map_size_t map_size;
3961 vm_object_t object;
3962 vm_object_size_t size;
3963 kern_return_t result;
3964 boolean_t mask_cur_protection, mask_max_protection;
3965 boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
3966 vm_map_offset_t offset_in_mapping = 0;
3967 #if __arm64__
3968 boolean_t fourk = vmk_flags.vmkf_fourk;
3969 #endif /* __arm64__ */
3970
3971 if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
3972 /* XXX TODO4K prefaulting depends on page size... */
3973 try_prefault = FALSE;
3974 }
3975
3976 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
3977
3978 mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
3979 mask_max_protection = max_protection & VM_PROT_IS_MASK;
3980 cur_protection &= ~VM_PROT_IS_MASK;
3981 max_protection &= ~VM_PROT_IS_MASK;
3982
3983 /*
3984 * Check arguments for validity
3985 */
3986 if ((target_map == VM_MAP_NULL) ||
3987 (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
3988 (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
3989 (inheritance > VM_INHERIT_LAST_VALID) ||
3990 (try_prefault && (copy || !page_list)) ||
3991 initial_size == 0) {
3992 return KERN_INVALID_ARGUMENT;
3993 }
3994
3995 /*
3996 * Redirect to kmem_ranges[data]
3997 */
3998 if (target_map == kernel_map) {
3999 vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
4000 }
4001
4002 #if __arm64__
4003 if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4004 /* no "fourk" if map is using a sub-page page size */
4005 fourk = FALSE;
4006 }
4007 if (fourk) {
4008 map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4009 map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4010 } else
4011 #endif /* __arm64__ */
4012 {
4013 map_addr = vm_map_trunc_page(*address,
4014 VM_MAP_PAGE_MASK(target_map));
4015 map_size = vm_map_round_page(initial_size,
4016 VM_MAP_PAGE_MASK(target_map));
4017 }
4018 size = vm_object_round_page(initial_size);
4019
4020 /*
4021 * Find the vm object (if any) corresponding to this port.
4022 */
4023 if (!IP_VALID(port)) {
4024 object = VM_OBJECT_NULL;
4025 offset = 0;
4026 copy = FALSE;
4027 } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4028 vm_named_entry_t named_entry;
4029 vm_object_offset_t data_offset;
4030
4031 named_entry = mach_memory_entry_from_port(port);
4032
4033 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4034 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4035 data_offset = named_entry->data_offset;
4036 offset += named_entry->data_offset;
4037 } else {
4038 data_offset = 0;
4039 }
4040
4041 /* a few checks to make sure user is obeying rules */
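/* a zero "size" here means: map the rest of the named entry, from "offset" to its end */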
4042 if (size == 0) {
4043 if (offset >= named_entry->size) {
4044 return KERN_INVALID_RIGHT;
4045 }
4046 size = named_entry->size - offset;
4047 }
4048 if (mask_max_protection) {
4049 max_protection &= named_entry->protection;
4050 }
4051 if (mask_cur_protection) {
4052 cur_protection &= named_entry->protection;
4053 }
4054 if ((named_entry->protection & max_protection) !=
4055 max_protection) {
4056 return KERN_INVALID_RIGHT;
4057 }
4058 if ((named_entry->protection & cur_protection) !=
4059 cur_protection) {
4060 return KERN_INVALID_RIGHT;
4061 }
4062 if (offset + size < offset) {
4063 /* overflow */
4064 return KERN_INVALID_ARGUMENT;
4065 }
4066 if (named_entry->size < (offset + initial_size)) {
4067 return KERN_INVALID_ARGUMENT;
4068 }
4069
4070 if (named_entry->is_copy) {
4071 /* for a vm_map_copy, we can only map it whole */
4072 if ((size != named_entry->size) &&
4073 (vm_map_round_page(size,
4074 VM_MAP_PAGE_MASK(target_map)) ==
4075 named_entry->size)) {
4076 /* XXX FBDP use the rounded size... */
4077 size = vm_map_round_page(
4078 size,
4079 VM_MAP_PAGE_MASK(target_map));
4080 }
4081 }
4082
4083 /* the caller's parameter "offset" is defined to be the */
4084 /* offset from the beginning of the named entry's offset in the object */
4085 offset = offset + named_entry->offset;
4086
4087 if (!VM_MAP_PAGE_ALIGNED(size,
4088 VM_MAP_PAGE_MASK(target_map))) {
4089 /*
4090 * Let's not map more than requested;
4091 * vm_map_enter() will handle this "not map-aligned"
4092 * case.
4093 */
4094 map_size = size;
4095 }
4096
4097 named_entry_lock(named_entry);
4098 if (named_entry->is_sub_map) {
4099 vm_map_t submap;
4100
4101 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4102 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4103 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4104 }
4105
4106 submap = named_entry->backing.map;
4107 vm_map_reference(submap);
4108 named_entry_unlock(named_entry);
4109
4110 vmk_flags.vmkf_submap = TRUE;
4111
4112 result = vm_map_enter(target_map,
4113 &map_addr,
4114 map_size,
4115 mask,
4116 flags,
4117 vmk_flags,
4118 tag,
4119 (vm_object_t)(uintptr_t) submap,
4120 offset,
4121 copy,
4122 cur_protection,
4123 max_protection,
4124 inheritance);
4125 if (result != KERN_SUCCESS) {
4126 vm_map_deallocate(submap);
4127 } else {
4128 /*
4129 * No need to lock "submap" just to check its
4130 * "mapped" flag: that flag is never reset
4131 * once it's been set and if we race, we'll
4132 * just end up setting it twice, which is OK.
4133 */
4134 if (submap->mapped_in_other_pmaps == FALSE &&
4135 vm_map_pmap(submap) != PMAP_NULL &&
4136 vm_map_pmap(submap) !=
4137 vm_map_pmap(target_map)) {
4138 /*
4139 * This submap is being mapped in a map
4140 * that uses a different pmap.
4141 * Set its "mapped_in_other_pmaps" flag
4142 * to indicate that we now need to
4143 * remove mappings from all pmaps rather
4144 * than just the submap's pmap.
4145 */
4146 vm_map_lock(submap);
4147 submap->mapped_in_other_pmaps = TRUE;
4148 vm_map_unlock(submap);
4149 }
4150 *address = map_addr;
4151 }
4152 return result;
4153 } else if (named_entry->is_copy) {
4154 kern_return_t kr;
4155 vm_map_copy_t copy_map;
4156 vm_map_entry_t copy_entry;
4157 vm_map_offset_t copy_addr;
4158 vm_map_copy_t target_copy_map;
4159 vm_map_offset_t overmap_start, overmap_end;
4160 vm_map_offset_t trimmed_start;
4161 vm_map_size_t target_size;
4162
4163 if (flags & ~(VM_FLAGS_FIXED |
4164 VM_FLAGS_ANYWHERE |
4165 VM_FLAGS_OVERWRITE |
4166 VM_FLAGS_RETURN_4K_DATA_ADDR |
4167 VM_FLAGS_RETURN_DATA_ADDR |
4168 VM_FLAGS_ALIAS_MASK)) {
4169 named_entry_unlock(named_entry);
4170 return KERN_INVALID_ARGUMENT;
4171 }
4172
4173 copy_map = named_entry->backing.copy;
4174 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4175 if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4176 /* unsupported type; should not happen */
4177 printf("vm_map_enter_mem_object: "
4178 "memory_entry->backing.copy "
4179 "unsupported type 0x%x\n",
4180 copy_map->type);
4181 named_entry_unlock(named_entry);
4182 return KERN_INVALID_ARGUMENT;
4183 }
4184
4185 if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4186 DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4187 }
4188
4189 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4190 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4191 offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4192 if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) {
4193 offset_in_mapping &= ~((signed)(0xFFF));
4194 }
4195 }
4196
4197 target_copy_map = VM_MAP_COPY_NULL;
4198 target_size = copy_map->size;
4199 overmap_start = 0;
4200 overmap_end = 0;
4201 trimmed_start = 0;
4202 if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4203 DEBUG4K_ADJUST("adjusting...\n");
4204 kr = vm_map_copy_adjust_to_target(
4205 copy_map,
4206 offset /* includes data_offset */,
4207 initial_size,
4208 target_map,
4209 copy,
4210 &target_copy_map,
4211 &overmap_start,
4212 &overmap_end,
4213 &trimmed_start);
4214 if (kr != KERN_SUCCESS) {
4215 named_entry_unlock(named_entry);
4216 return kr;
4217 }
4218 target_size = target_copy_map->size;
4219 if (trimmed_start >= data_offset) {
4220 data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4221 } else {
4222 data_offset -= trimmed_start;
4223 }
4224 } else {
4225 /*
4226 * Assert that the vm_map_copy is coming from the right
4227 * zone and hasn't been forged
4228 */
4229 vm_map_copy_require(copy_map);
4230 target_copy_map = copy_map;
4231 }
4232
4233 /* reserve a contiguous range */
4234 kr = vm_map_enter(target_map,
4235 &map_addr,
4236 vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4237 mask,
4238 flags & (VM_FLAGS_ANYWHERE |
4239 VM_FLAGS_OVERWRITE |
4240 VM_FLAGS_RETURN_4K_DATA_ADDR |
4241 VM_FLAGS_RETURN_DATA_ADDR),
4242 vmk_flags,
4243 tag,
4244 VM_OBJECT_NULL,
4245 0,
4246 FALSE, /* copy */
4247 cur_protection,
4248 max_protection,
4249 inheritance);
4250 if (kr != KERN_SUCCESS) {
4251 DEBUG4K_ERROR("kr 0x%x\n", kr);
4252 if (target_copy_map != copy_map) {
4253 vm_map_copy_discard(target_copy_map);
4254 target_copy_map = VM_MAP_COPY_NULL;
4255 }
4256 named_entry_unlock(named_entry);
4257 return kr;
4258 }
4259
4260 copy_addr = map_addr;
4261
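/*
 * Overmap each entry of the (possibly adjusted) vm_map_copy into
 * the range reserved above, advancing "copy_addr" one entry at a time.
 */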
4262 for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4263 copy_entry != vm_map_copy_to_entry(target_copy_map);
4264 copy_entry = copy_entry->vme_next) {
4265 int remap_flags;
4266 vm_map_kernel_flags_t vmk_remap_flags;
4267 vm_map_t copy_submap = VM_MAP_NULL;
4268 vm_object_t copy_object = VM_OBJECT_NULL;
4269 vm_map_size_t copy_size;
4270 vm_object_offset_t copy_offset;
4271 int copy_vm_alias;
4272 boolean_t do_copy;
4273
4274 do_copy = FALSE;
4275 remap_flags = 0;
4276 vmk_remap_flags = VM_MAP_KERNEL_FLAGS_NONE;
4277
4278 if (copy_entry->is_sub_map) {
4279 copy_submap = VME_SUBMAP(copy_entry);
4280 copy_object = (vm_object_t)copy_submap;
4281 } else {
4282 copy_object = VME_OBJECT(copy_entry);
4283 }
4284 copy_offset = VME_OFFSET(copy_entry);
4285 copy_size = (copy_entry->vme_end -
4286 copy_entry->vme_start);
4287 VM_GET_FLAGS_ALIAS(flags, copy_vm_alias);
4288 if (copy_vm_alias == 0) {
4289 /*
4290 * Caller does not want a specific
4291 * alias for this new mapping: use
4292 * the alias of the original mapping.
4293 */
4294 copy_vm_alias = VME_ALIAS(copy_entry);
4295 }
4296
4297 /* sanity check */
4298 if ((copy_addr + copy_size) >
4299 (map_addr +
4300 overmap_start + overmap_end +
4301 named_entry->size /* XXX full size */)) {
4302 /* over-mapping too much !? */
4303 kr = KERN_INVALID_ARGUMENT;
4304 DEBUG4K_ERROR("kr 0x%x\n", kr);
4305 /* abort */
4306 break;
4307 }
4308
4309 /* take a reference on the object */
4310 if (copy_entry->is_sub_map) {
4311 vmk_remap_flags.vmkf_submap = TRUE;
4312 vm_map_reference(copy_submap);
4313 copy_object = (vm_object_t)(uintptr_t) copy_submap;
4314 } else {
4315 if (!copy &&
4316 copy_object != VM_OBJECT_NULL &&
4317 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4318 /*
4319 * We need to resolve our side of this
4320 * "symmetric" copy-on-write now; we
4321 * need a new object to map and share,
4322 * instead of the current one which
4323 * might still be shared with the
4324 * original mapping.
4325 *
4326 * Note: A "vm_map_copy_t" does not
4327 * have a lock but we're protected by
4328 * the named entry's lock here.
4329 */
4330 // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4331 VME_OBJECT_SHADOW(copy_entry, copy_size);
4332 assert(copy_object != VME_OBJECT(copy_entry));
4333 if (!copy_entry->needs_copy &&
4334 copy_entry->protection & VM_PROT_WRITE) {
4335 vm_prot_t prot;
4336
4337 prot = copy_entry->protection & ~VM_PROT_WRITE;
4338 vm_object_pmap_protect(copy_object,
4339 copy_offset,
4340 copy_size,
4341 PMAP_NULL,
4342 PAGE_SIZE,
4343 0,
4344 prot);
4345 }
4346
4347 copy_entry->needs_copy = FALSE;
4348 copy_entry->is_shared = TRUE;
4349 copy_object = VME_OBJECT(copy_entry);
4350 copy_offset = VME_OFFSET(copy_entry);
4351 vm_object_lock(copy_object);
4352 /* we're about to make a shared mapping of this object */
4353 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4354 copy_object->true_share = TRUE;
4355 vm_object_unlock(copy_object);
4356 }
4357
4358 if (copy_object != VM_OBJECT_NULL &&
4359 copy_object->named &&
4360 copy_object->pager != MEMORY_OBJECT_NULL &&
4361 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4362 memory_object_t pager;
4363 vm_prot_t pager_prot;
4364
4365 /*
4366 * For "named" VM objects, let the pager know that the
4367 * memory object is being mapped. Some pagers need to keep
4368 * track of this, to know when they can reclaim the memory
4369 * object, for example.
4370 * VM calls memory_object_map() for each mapping (specifying
4371 * the protection of each mapping) and calls
4372 * memory_object_last_unmap() when all the mappings are gone.
4373 */
4374 pager_prot = max_protection;
4375 if (copy) {
4376 /*
4377 * Copy-On-Write mapping: won't modify the
4378 * memory object.
4379 */
4380 pager_prot &= ~VM_PROT_WRITE;
4381 }
4382 vm_object_lock(copy_object);
4383 pager = copy_object->pager;
4384 if (copy_object->named &&
4385 pager != MEMORY_OBJECT_NULL &&
4386 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4387 assert(copy_object->pager_ready);
4388 vm_object_mapping_wait(copy_object, THREAD_UNINT);
4389 vm_object_mapping_begin(copy_object);
4390 vm_object_unlock(copy_object);
4391
4392 kr = memory_object_map(pager, pager_prot);
4393 assert(kr == KERN_SUCCESS);
4394
4395 vm_object_lock(copy_object);
4396 vm_object_mapping_end(copy_object);
4397 }
4398 vm_object_unlock(copy_object);
4399 }
4400
4401 /*
4402 * Perform the copy if requested
4403 */
4404
4405 if (copy && copy_object != VM_OBJECT_NULL) {
4406 vm_object_t new_object;
4407 vm_object_offset_t new_offset;
4408
4409 result = vm_object_copy_strategically(copy_object, copy_offset,
4410 copy_size,
4411 &new_object, &new_offset,
4412 &do_copy);
4413
4414
4415 if (result == KERN_MEMORY_RESTART_COPY) {
4416 boolean_t success;
4417 boolean_t src_needs_copy;
4418
4419 /*
4420 * XXX
4421 * We currently ignore src_needs_copy.
4422 * This really is the issue of how to make
4423 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4424 * non-kernel users to use. Solution forthcoming.
4425 * In the meantime, since we don't allow non-kernel
4426 * memory managers to specify symmetric copy,
4427 * we won't run into problems here.
4428 */
4429 new_object = copy_object;
4430 new_offset = copy_offset;
4431 success = vm_object_copy_quickly(new_object,
4432 new_offset,
4433 copy_size,
4434 &src_needs_copy,
4435 &do_copy);
4436 assert(success);
4437 result = KERN_SUCCESS;
4438 }
4439 if (result != KERN_SUCCESS) {
4440 kr = result;
4441 break;
4442 }
4443
4444 copy_object = new_object;
4445 copy_offset = new_offset;
4446 /*
4447 * No extra object reference for the mapping:
4448 * the mapping should be the only thing keeping
4449 * this new object alive.
4450 */
4451 } else {
4452 /*
4453 * We already have the right object
4454 * to map.
4455 */
4456 copy_object = VME_OBJECT(copy_entry);
4457 /* take an extra ref for the mapping below */
4458 vm_object_reference(copy_object);
4459 }
4460 }
4461
4462 /* over-map the object into destination */
4463 remap_flags |= flags;
4464 remap_flags |= VM_FLAGS_FIXED;
4465 remap_flags |= VM_FLAGS_OVERWRITE;
4466 remap_flags &= ~VM_FLAGS_ANYWHERE;
4467 if (!copy && !copy_entry->is_sub_map) {
4468 /*
4469 * copy-on-write should have been
4470 * resolved at this point, or we would
4471 * end up sharing instead of copying.
4472 */
4473 assert(!copy_entry->needs_copy);
4474 }
4475 #if XNU_TARGET_OS_OSX
4476 if (copy_entry->used_for_jit) {
4477 vmk_remap_flags.vmkf_map_jit = TRUE;
4478 }
4479 #endif /* XNU_TARGET_OS_OSX */
4480
4481 assertf((copy_vm_alias & VME_ALIAS_MASK) == copy_vm_alias,
4482 "VM Tag truncated from 0x%x to 0x%x\n", copy_vm_alias, (copy_vm_alias & VME_ALIAS_MASK));
4483 kr = vm_map_enter(target_map,
4484 &copy_addr,
4485 copy_size,
4486 (vm_map_offset_t) 0,
4487 remap_flags,
4488 vmk_remap_flags,
4489 (vm_tag_t) copy_vm_alias, /* see comment at end of vm_fault_unwire re. cast*/
4490 copy_object,
4491 copy_offset,
4492 ((copy_object == NULL)
4493 ? FALSE
4494 : (copy || copy_entry->needs_copy)),
4495 cur_protection,
4496 max_protection,
4497 inheritance);
4498 if (kr != KERN_SUCCESS) {
4499 DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4500 if (copy_entry->is_sub_map) {
4501 vm_map_deallocate(copy_submap);
4502 } else {
4503 vm_object_deallocate(copy_object);
4504 }
4505 /* abort */
4506 break;
4507 }
4508
4509 /* next mapping */
4510 copy_addr += copy_size;
4511 }
4512
4513 if (kr == KERN_SUCCESS) {
4514 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4515 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4516 *address = map_addr + offset_in_mapping;
4517 } else {
4518 *address = map_addr;
4519 }
4520 if (overmap_start) {
4521 *address += overmap_start;
4522 DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4523 }
4524 }
4525 named_entry_unlock(named_entry);
4526 if (target_copy_map != copy_map) {
4527 vm_map_copy_discard(target_copy_map);
4528 target_copy_map = VM_MAP_COPY_NULL;
4529 }
4530
4531 if (kr != KERN_SUCCESS) {
4532 if (!(flags & VM_FLAGS_OVERWRITE)) {
4533 /* deallocate the contiguous range */
4534 (void) vm_deallocate(target_map,
4535 map_addr,
4536 map_size);
4537 }
4538 }
4539
4540 return kr;
4541 }
4542
4543 if (named_entry->is_object) {
4544 unsigned int access;
4545 vm_prot_t protections;
4546 unsigned int wimg_mode;
4547
4548 /* we are mapping a VM object */
4549
4550 protections = named_entry->protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
4551 access = GET_MAP_MEM(named_entry->protection);
4552
4553 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4554 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4555 offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4556 if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) {
4557 offset_in_mapping &= ~((signed)(0xFFF));
4558 }
4559 offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4560 map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4561 }
4562
4563 object = vm_named_entry_to_vm_object(named_entry);
4564 assert(object != VM_OBJECT_NULL);
4565 vm_object_lock(object);
4566 named_entry_unlock(named_entry);
4567
4568 vm_object_reference_locked(object);
4569
4570 wimg_mode = object->wimg_bits;
4571 vm_prot_to_wimg(access, &wimg_mode);
4572 if (object->wimg_bits != wimg_mode) {
4573 vm_object_change_wimg_mode(object, wimg_mode);
4574 }
4575
4576 vm_object_unlock(object);
4577 } else {
4578 panic("invalid VM named entry %p", named_entry);
4579 }
4580 } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4581 /*
4582 * JMM - This is temporary until we unify named entries
4583 * and raw memory objects.
4584 *
4585 * Detected fake ip_kotype for a memory object. In
4586 * this case, the port isn't really a port at all, but
4587 * instead is just a raw memory object.
4588 */
4589 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4590 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4591 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4592 }
4593
4594 object = memory_object_to_vm_object((memory_object_t)port);
4595 if (object == VM_OBJECT_NULL) {
4596 return KERN_INVALID_OBJECT;
4597 }
4598 vm_object_reference(object);
4599
4600 /* wait for object (if any) to be ready */
4601 if (object != VM_OBJECT_NULL) {
4602 if (object == kernel_object) {
4603 printf("Warning: Attempt to map kernel object"
4604 " by a non-private kernel entity\n");
4605 return KERN_INVALID_OBJECT;
4606 }
4607 if (!object->pager_ready) {
4608 vm_object_lock(object);
4609
4610 while (!object->pager_ready) {
4611 vm_object_wait(object,
4612 VM_OBJECT_EVENT_PAGER_READY,
4613 THREAD_UNINT);
4614 vm_object_lock(object);
4615 }
4616 vm_object_unlock(object);
4617 }
4618 }
4619 } else {
4620 return KERN_INVALID_OBJECT;
4621 }
4622
4623 if (object != VM_OBJECT_NULL &&
4624 object->named &&
4625 object->pager != MEMORY_OBJECT_NULL &&
4626 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4627 memory_object_t pager;
4628 vm_prot_t pager_prot;
4629 kern_return_t kr;
4630
4631 /*
4632 * For "named" VM objects, let the pager know that the
4633 * memory object is being mapped. Some pagers need to keep
4634 * track of this, to know when they can reclaim the memory
4635 * object, for example.
4636 * VM calls memory_object_map() for each mapping (specifying
4637 * the protection of each mapping) and calls
4638 * memory_object_last_unmap() when all the mappings are gone.
4639 */
4640 pager_prot = max_protection;
4641 if (copy) {
4642 /*
4643 * Copy-On-Write mapping: won't modify the
4644 * memory object.
4645 */
4646 pager_prot &= ~VM_PROT_WRITE;
4647 }
4648 vm_object_lock(object);
4649 pager = object->pager;
4650 if (object->named &&
4651 pager != MEMORY_OBJECT_NULL &&
4652 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4653 assert(object->pager_ready);
4654 vm_object_mapping_wait(object, THREAD_UNINT);
4655 vm_object_mapping_begin(object);
4656 vm_object_unlock(object);
4657
4658 kr = memory_object_map(pager, pager_prot);
4659 assert(kr == KERN_SUCCESS);
4660
4661 vm_object_lock(object);
4662 vm_object_mapping_end(object);
4663 }
4664 vm_object_unlock(object);
4665 }
4666
4667 /*
4668 * Perform the copy if requested
4669 */
4670
4671 if (copy) {
4672 vm_object_t new_object;
4673 vm_object_offset_t new_offset;
4674
4675 result = vm_object_copy_strategically(object, offset,
4676 map_size,
4677 &new_object, &new_offset,
4678 &copy);
4679
4680
4681 if (result == KERN_MEMORY_RESTART_COPY) {
4682 boolean_t success;
4683 boolean_t src_needs_copy;
4684
4685 /*
4686 * XXX
4687 * We currently ignore src_needs_copy.
4688 * This really is the issue of how to make
4689 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4690 * non-kernel users to use. Solution forthcoming.
4691 * In the meantime, since we don't allow non-kernel
4692 * memory managers to specify symmetric copy,
4693 * we won't run into problems here.
4694 */
4695 new_object = object;
4696 new_offset = offset;
4697 success = vm_object_copy_quickly(new_object,
4698 new_offset,
4699 map_size,
4700 &src_needs_copy,
4701 &copy);
4702 assert(success);
4703 result = KERN_SUCCESS;
4704 }
4705 /*
4706 * Throw away the reference to the
4707 * original object, as it won't be mapped.
4708 */
4709
4710 vm_object_deallocate(object);
4711
4712 if (result != KERN_SUCCESS) {
4713 return result;
4714 }
4715
4716 object = new_object;
4717 offset = new_offset;
4718 }
4719
4720 /*
4721 * If non-kernel users want to try to prefault pages, the mapping and prefault
4722 * need to be atomic.
4723 */
4724 kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4725 vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4726
4727 #if __arm64__
4728 if (fourk) {
4729 /* map this object in a "4K" pager */
4730 result = vm_map_enter_fourk(target_map,
4731 &map_addr,
4732 map_size,
4733 (vm_map_offset_t) mask,
4734 flags,
4735 vmk_flags,
4736 tag,
4737 object,
4738 offset,
4739 copy,
4740 cur_protection,
4741 max_protection,
4742 inheritance);
4743 } else
4744 #endif /* __arm64__ */
4745 {
4746 result = vm_map_enter(target_map,
4747 &map_addr, map_size,
4748 (vm_map_offset_t)mask,
4749 flags,
4750 vmk_flags,
4751 tag,
4752 object, offset,
4753 copy,
4754 cur_protection, max_protection,
4755 inheritance);
4756 }
4757 if (result != KERN_SUCCESS) {
4758 vm_object_deallocate(object);
4759 }
4760
4761 /*
4762 * Try to prefault, and do not forget to release the vm map lock.
4763 */
4764 if (result == KERN_SUCCESS && try_prefault) {
4765 mach_vm_address_t va = map_addr;
4766 kern_return_t kr = KERN_SUCCESS;
4767 unsigned int i = 0;
4768 int pmap_options;
4769
4770 pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4771 if (object->internal) {
4772 pmap_options |= PMAP_OPTIONS_INTERNAL;
4773 }
4774
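/*
 * Pre-enter each valid UPL page into the pmap so the new mapping
 * does not take a soft fault on first touch.
 */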
4775 for (i = 0; i < page_list_count; ++i) {
4776 if (!UPL_VALID_PAGE(page_list, i)) {
4777 if (kernel_prefault) {
4778 assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4779 result = KERN_MEMORY_ERROR;
4780 break;
4781 }
4782 } else {
4783 /*
4784 * If this call fails, we should stop
4785 * trying to optimize: other calls are likely
4786 * going to fail too.
4787 *
4788 * We are not going to report an error for such
4789 * a failure though. That's an optimization, not
4790 * something critical.
4791 */
4792 kr = pmap_enter_options(target_map->pmap,
4793 va, UPL_PHYS_PAGE(page_list, i),
4794 cur_protection, VM_PROT_NONE,
4795 0, TRUE, pmap_options, NULL);
4796 if (kr != KERN_SUCCESS) {
4797 OSIncrementAtomic64(&vm_prefault_nb_bailout);
4798 if (kernel_prefault) {
4799 result = kr;
4800 }
4801 break;
4802 }
4803 OSIncrementAtomic64(&vm_prefault_nb_pages);
4804 }
4805
4806 /* Next virtual address */
4807 va += PAGE_SIZE;
4808 }
4809 if (vmk_flags.vmkf_keep_map_locked) {
4810 vm_map_unlock(target_map);
4811 }
4812 }
4813
4814 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4815 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4816 *address = map_addr + offset_in_mapping;
4817 } else {
4818 *address = map_addr;
4819 }
4820 return result;
4821 }
4822
4823 kern_return_t
4824 vm_map_enter_mem_object(
4825 vm_map_t target_map,
4826 vm_map_offset_t *address,
4827 vm_map_size_t initial_size,
4828 vm_map_offset_t mask,
4829 int flags,
4830 vm_map_kernel_flags_t vmk_flags,
4831 vm_tag_t tag,
4832 ipc_port_t port,
4833 vm_object_offset_t offset,
4834 boolean_t copy,
4835 vm_prot_t cur_protection,
4836 vm_prot_t max_protection,
4837 vm_inherit_t inheritance)
4838 {
4839 kern_return_t ret;
4840
4841 ret = vm_map_enter_mem_object_helper(target_map,
4842 address,
4843 initial_size,
4844 mask,
4845 flags,
4846 vmk_flags,
4847 tag,
4848 port,
4849 offset,
4850 copy,
4851 cur_protection,
4852 max_protection,
4853 inheritance,
4854 NULL,
4855 0);
4856
4857 #if KASAN
4858 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4859 kasan_notify_address(*address, initial_size);
4860 }
4861 #endif
4862
4863 return ret;
4864 }
4865
4866 kern_return_t
4867 vm_map_enter_mem_object_prefault(
4868 vm_map_t target_map,
4869 vm_map_offset_t *address,
4870 vm_map_size_t initial_size,
4871 vm_map_offset_t mask,
4872 int flags,
4873 vm_map_kernel_flags_t vmk_flags,
4874 vm_tag_t tag,
4875 ipc_port_t port,
4876 vm_object_offset_t offset,
4877 vm_prot_t cur_protection,
4878 vm_prot_t max_protection,
4879 upl_page_list_ptr_t page_list,
4880 unsigned int page_list_count)
4881 {
4882 kern_return_t ret;
4883
4884 ret = vm_map_enter_mem_object_helper(target_map,
4885 address,
4886 initial_size,
4887 mask,
4888 flags,
4889 vmk_flags,
4890 tag,
4891 port,
4892 offset,
4893 FALSE,
4894 cur_protection,
4895 max_protection,
4896 VM_INHERIT_DEFAULT,
4897 page_list,
4898 page_list_count);
4899
4900 #if KASAN
4901 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4902 kasan_notify_address(*address, initial_size);
4903 }
4904 #endif
4905
4906 return ret;
4907 }
4908
4909
4910 kern_return_t
4911 vm_map_enter_mem_object_control(
4912 vm_map_t target_map,
4913 vm_map_offset_t *address,
4914 vm_map_size_t initial_size,
4915 vm_map_offset_t mask,
4916 int flags,
4917 vm_map_kernel_flags_t vmk_flags,
4918 vm_tag_t tag,
4919 memory_object_control_t control,
4920 vm_object_offset_t offset,
4921 boolean_t copy,
4922 vm_prot_t cur_protection,
4923 vm_prot_t max_protection,
4924 vm_inherit_t inheritance)
4925 {
4926 vm_map_address_t map_addr;
4927 vm_map_size_t map_size;
4928 vm_object_t object;
4929 vm_object_size_t size;
4930 kern_return_t result;
4931 memory_object_t pager;
4932 vm_prot_t pager_prot;
4933 kern_return_t kr;
4934 #if __arm64__
4935 boolean_t fourk = vmk_flags.vmkf_fourk;
4936 #endif /* __arm64__ */
4937
4938 /*
4939 * Check arguments for validity
4940 */
4941 if ((target_map == VM_MAP_NULL) ||
4942 (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4943 (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4944 (inheritance > VM_INHERIT_LAST_VALID) ||
4945 initial_size == 0) {
4946 return KERN_INVALID_ARGUMENT;
4947 }
4948
4949 #if __arm64__
4950 if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
4951 fourk = FALSE;
4952 }
4953
4954 if (fourk) {
4955 map_addr = vm_map_trunc_page(*address,
4956 FOURK_PAGE_MASK);
4957 map_size = vm_map_round_page(initial_size,
4958 FOURK_PAGE_MASK);
4959 } else
4960 #endif /* __arm64__ */
4961 {
4962 map_addr = vm_map_trunc_page(*address,
4963 VM_MAP_PAGE_MASK(target_map));
4964 map_size = vm_map_round_page(initial_size,
4965 VM_MAP_PAGE_MASK(target_map));
4966 }
4967 size = vm_object_round_page(initial_size);
4968
4969 object = memory_object_control_to_vm_object(control);
4970
4971 if (object == VM_OBJECT_NULL) {
4972 return KERN_INVALID_OBJECT;
4973 }
4974
4975 if (object == kernel_object) {
4976 printf("Warning: Attempt to map kernel object"
4977 " by a non-private kernel entity\n");
4978 return KERN_INVALID_OBJECT;
4979 }
4980
4981 vm_object_lock(object);
4982 object->ref_count++;
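/* take an extra reference on the object while it is locked; released via vm_object_deallocate() on failure or when a copy replaces it below */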
4983
4984 /*
4985 * For "named" VM objects, let the pager know that the
4986 * memory object is being mapped. Some pagers need to keep
4987 * track of this, to know when they can reclaim the memory
4988 * object, for example.
4989 * VM calls memory_object_map() for each mapping (specifying
4990 * the protection of each mapping) and calls
4991 * memory_object_last_unmap() when all the mappings are gone.
4992 */
4993 pager_prot = max_protection;
4994 if (copy) {
4995 pager_prot &= ~VM_PROT_WRITE;
4996 }
4997 pager = object->pager;
4998 if (object->named &&
4999 pager != MEMORY_OBJECT_NULL &&
5000 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5001 assert(object->pager_ready);
5002 vm_object_mapping_wait(object, THREAD_UNINT);
5003 vm_object_mapping_begin(object);
5004 vm_object_unlock(object);
5005
5006 kr = memory_object_map(pager, pager_prot);
5007 assert(kr == KERN_SUCCESS);
5008
5009 vm_object_lock(object);
5010 vm_object_mapping_end(object);
5011 }
5012 vm_object_unlock(object);
5013
5014 /*
5015 * Perform the copy if requested
5016 */
5017
5018 if (copy) {
5019 vm_object_t new_object;
5020 vm_object_offset_t new_offset;
5021
5022 result = vm_object_copy_strategically(object, offset, size,
5023 &new_object, &new_offset,
5024 &copy);
5025
5026
5027 if (result == KERN_MEMORY_RESTART_COPY) {
5028 boolean_t success;
5029 boolean_t src_needs_copy;
5030
5031 /*
5032 * XXX
5033 * We currently ignore src_needs_copy.
5034 * This really is the issue of how to make
5035 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5036 * non-kernel users to use. Solution forthcoming.
5037 * In the meantime, since we don't allow non-kernel
5038 * memory managers to specify symmetric copy,
5039 * we won't run into problems here.
5040 */
5041 new_object = object;
5042 new_offset = offset;
5043 success = vm_object_copy_quickly(new_object,
5044 new_offset, size,
5045 &src_needs_copy,
5046 &copy);
5047 assert(success);
5048 result = KERN_SUCCESS;
5049 }
5050 /*
5051 * Throw away the reference to the
5052 * original object, as it won't be mapped.
5053 */
5054
5055 vm_object_deallocate(object);
5056
5057 if (result != KERN_SUCCESS) {
5058 return result;
5059 }
5060
5061 object = new_object;
5062 offset = new_offset;
5063 }
5064
5065 #if __arm64__
5066 if (fourk) {
5067 result = vm_map_enter_fourk(target_map,
5068 &map_addr,
5069 map_size,
5070 (vm_map_offset_t)mask,
5071 flags,
5072 vmk_flags,
5073 tag,
5074 object, offset,
5075 copy,
5076 cur_protection, max_protection,
5077 inheritance);
5078 } else
5079 #endif /* __arm64__ */
5080 {
5081 result = vm_map_enter(target_map,
5082 &map_addr, map_size,
5083 (vm_map_offset_t)mask,
5084 flags,
5085 vmk_flags,
5086 tag,
5087 object, offset,
5088 copy,
5089 cur_protection, max_protection,
5090 inheritance);
5091 }
5092 if (result != KERN_SUCCESS) {
5093 vm_object_deallocate(object);
5094 }
5095 *address = map_addr;
5096
5097 return result;
5098 }
5099
5100
5101 #if VM_CPM
5102
5103 #ifdef MACH_ASSERT
5104 extern pmap_paddr_t avail_start, avail_end;
5105 #endif
5106
5107 /*
5108 * Allocate memory in the specified map, with the caveat that
5109 * the memory is physically contiguous. This call may fail
5110 * if the system can't find sufficient contiguous memory.
5111 * This call may cause or lead to heart-stopping amounts of
5112 * paging activity.
5113 *
5114 * Memory obtained from this call should be freed in the
5115 * normal way, viz., via vm_deallocate.
5116 */
5117 kern_return_t
5118 vm_map_enter_cpm(
5119 vm_map_t map,
5120 vm_map_offset_t *addr,
5121 vm_map_size_t size,
5122 int flags)
5123 {
5124 vm_object_t cpm_obj;
5125 pmap_t pmap;
5126 vm_page_t m, pages;
5127 kern_return_t kr;
5128 vm_map_offset_t va, start, end, offset;
5129 #if MACH_ASSERT
5130 vm_map_offset_t prev_addr = 0;
5131 #endif /* MACH_ASSERT */
5132
5133 boolean_t anywhere = ((VM_FLAGS_ANYWHERE & flags) != 0);
5134 vm_tag_t tag;
5135
5136 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5137 /* XXX TODO4K do we need to support this? */
5138 *addr = 0;
5139 return KERN_NOT_SUPPORTED;
5140 }
5141
5142 VM_GET_FLAGS_ALIAS(flags, tag);
5143
5144 if (size == 0) {
5145 *addr = 0;
5146 return KERN_SUCCESS;
5147 }
5148 if (anywhere) {
5149 *addr = vm_map_min(map);
5150 } else {
5151 *addr = vm_map_trunc_page(*addr,
5152 VM_MAP_PAGE_MASK(map));
5153 }
5154 size = vm_map_round_page(size,
5155 VM_MAP_PAGE_MASK(map));
5156
5157 /*
5158 * LP64todo - cpm_allocate should probably allow
5159 * allocations of >4GB, but not with the current
5160 * algorithm, so just cast down the size for now.
5161 */
5162 if (size > VM_MAX_ADDRESS) {
5163 return KERN_RESOURCE_SHORTAGE;
5164 }
5165 if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5166 &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5167 return kr;
5168 }
5169
5170 cpm_obj = vm_object_allocate((vm_object_size_t)size);
5171 assert(cpm_obj != VM_OBJECT_NULL);
5172 assert(cpm_obj->internal);
5173 assert(cpm_obj->vo_size == (vm_object_size_t)size);
5174 assert(cpm_obj->can_persist == FALSE);
5175 assert(cpm_obj->pager_created == FALSE);
5176 assert(cpm_obj->pageout == FALSE);
5177 assert(cpm_obj->shadow == VM_OBJECT_NULL);
5178
5179 /*
5180 * Insert pages into object.
5181 */
5182
5183 vm_object_lock(cpm_obj);
5184 for (offset = 0; offset < size; offset += PAGE_SIZE) {
5185 m = pages;
5186 pages = NEXT_PAGE(m);
5187 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5188
5189 assert(!m->vmp_gobbled);
5190 assert(!m->vmp_wanted);
5191 assert(!m->vmp_pageout);
5192 assert(!m->vmp_tabled);
5193 assert(VM_PAGE_WIRED(m));
5194 assert(m->vmp_busy);
5195 assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5196
5197 m->vmp_busy = FALSE;
5198 vm_page_insert(m, cpm_obj, offset);
5199 }
5200 assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5201 vm_object_unlock(cpm_obj);
5202
5203 /*
5204 * Hang onto a reference on the object in case a
5205 * multi-threaded application for some reason decides
5206 * to deallocate the portion of the address space into
5207 * which we will insert this object.
5208 *
5209 * Unfortunately, we must insert the object now before
5210 * we can talk to the pmap module about which addresses
5211 * must be wired down. Hence, the race with a multi-
5212 * threaded app.
5213 */
5214 vm_object_reference(cpm_obj);
5215
5216 /*
5217 * Insert object into map.
5218 */
5219
5220 kr = vm_map_enter(
5221 map,
5222 addr,
5223 size,
5224 (vm_map_offset_t)0,
5225 flags,
5226 VM_MAP_KERNEL_FLAGS_NONE,
5227 cpm_obj,
5228 (vm_object_offset_t)0,
5229 FALSE,
5230 VM_PROT_ALL,
5231 VM_PROT_ALL,
5232 VM_INHERIT_DEFAULT);
5233
5234 if (kr != KERN_SUCCESS) {
5235 /*
5236 * A CPM object doesn't have can_persist set,
5237 * so all we have to do is deallocate it to
5238 * free up these pages.
5239 */
5240 assert(cpm_obj->pager_created == FALSE);
5241 assert(cpm_obj->can_persist == FALSE);
5242 assert(cpm_obj->pageout == FALSE);
5243 assert(cpm_obj->shadow == VM_OBJECT_NULL);
5244 vm_object_deallocate(cpm_obj); /* kill acquired ref */
5245 vm_object_deallocate(cpm_obj); /* kill creation ref */
5246 }
5247
5248 /*
5249 * Inform the physical mapping system that the
5250 * range of addresses may not fault, so that
5251 * page tables and such can be locked down as well.
5252 */
5253 start = *addr;
5254 end = start + size;
5255 pmap = vm_map_pmap(map);
5256 pmap_pageable(pmap, start, end, FALSE);
5257
5258 /*
5259 * Enter each page into the pmap, to avoid faults.
5260 * Note that this loop could be coded more efficiently,
5261 * if the need arose, rather than looking up each page
5262 * again.
5263 */
5264 for (offset = 0, va = start; offset < size;
5265 va += PAGE_SIZE, offset += PAGE_SIZE) {
5266 int type_of_fault;
5267
5268 vm_object_lock(cpm_obj);
5269 m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5270 assert(m != VM_PAGE_NULL);
5271
5272 vm_page_zero_fill(m);
5273
5274 type_of_fault = DBG_ZERO_FILL_FAULT;
5275
5276 vm_fault_enter(m, pmap, va,
5277 PAGE_SIZE, 0,
5278 VM_PROT_ALL, VM_PROT_WRITE,
5279 VM_PAGE_WIRED(m),
5280 FALSE, /* change_wiring */
5281 VM_KERN_MEMORY_NONE, /* tag - not wiring */
5282 FALSE, /* no_cache */
5283 FALSE, /* cs_bypass */
5284 0, /* user_tag */
5285 0, /* pmap_options */
5286 NULL, /* need_retry */
5287 &type_of_fault);
5288
5289 vm_object_unlock(cpm_obj);
5290 }
5291
5292 #if MACH_ASSERT
5293 /*
5294 * Verify ordering in address space.
5295 */
5296 for (offset = 0; offset < size; offset += PAGE_SIZE) {
5297 vm_object_lock(cpm_obj);
5298 m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5299 vm_object_unlock(cpm_obj);
5300 if (m == VM_PAGE_NULL) {
5301 panic("vm_allocate_cpm: obj %p off 0x%llx no page",
5302 cpm_obj, (uint64_t)offset);
5303 }
5304 assert(m->vmp_tabled);
5305 assert(!m->vmp_busy);
5306 assert(!m->vmp_wanted);
5307 assert(!m->vmp_fictitious);
5308 assert(!m->vmp_private);
5309 assert(!m->vmp_absent);
5310 assert(!m->vmp_error);
5311 assert(!m->vmp_cleaning);
5312 assert(!m->vmp_laundry);
5313 assert(!m->vmp_precious);
5314 assert(!m->vmp_clustered);
5315 if (offset != 0) {
5316 if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5317 printf("start 0x%llx end 0x%llx va 0x%llx\n",
5318 (uint64_t)start, (uint64_t)end, (uint64_t)va);
5319 printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5320 printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5321 panic("vm_allocate_cpm: pages not contig!");
5322 }
5323 }
5324 prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5325 }
5326 #endif /* MACH_ASSERT */
5327
5328 vm_object_deallocate(cpm_obj); /* kill extra ref */
5329
5330 return kr;
5331 }
5332
5333
5334 #else /* VM_CPM */
5335
5336 /*
5337 * Interface is defined in all cases, but unless the kernel
5338 * is built explicitly for this option, the interface does
5339 * nothing.
5340 */
5341
5342 kern_return_t
5343 vm_map_enter_cpm(
5344 __unused vm_map_t map,
5345 __unused vm_map_offset_t *addr,
5346 __unused vm_map_size_t size,
5347 __unused int flags)
5348 {
5349 return KERN_FAILURE;
5350 }
5351 #endif /* VM_CPM */
5352
5353 /* Not used without nested pmaps */
5354 #ifndef NO_NESTED_PMAP
5355 /*
5356 * Clip and unnest a portion of a nested submap mapping.
5357 */
5358
5359
5360 static void
5361 vm_map_clip_unnest(
5362 vm_map_t map,
5363 vm_map_entry_t entry,
5364 vm_map_offset_t start_unnest,
5365 vm_map_offset_t end_unnest)
5366 {
5367 vm_map_offset_t old_start_unnest = start_unnest;
5368 vm_map_offset_t old_end_unnest = end_unnest;
5369
5370 assert(entry->is_sub_map);
5371 assert(VME_SUBMAP(entry) != NULL);
5372 assert(entry->use_pmap);
5373
5374 /*
5375 * Query the platform for the optimal unnest range.
5376 * DRK: There's some duplication of effort here, since
5377 * callers may have adjusted the range to some extent. This
5378 * routine was introduced to support 1GiB subtree nesting
5379 * for x86 platforms, which can also nest on 2MiB boundaries
5380 * depending on size/alignment.
5381 */
5382 if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5383 assert(VME_SUBMAP(entry)->is_nested_map);
5384 assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5385 log_unnest_badness(map,
5386 old_start_unnest,
5387 old_end_unnest,
5388 VME_SUBMAP(entry)->is_nested_map,
5389 (entry->vme_start +
5390 VME_SUBMAP(entry)->lowest_unnestable_start -
5391 VME_OFFSET(entry)));
5392 }
5393
5394 if (entry->vme_start > start_unnest ||
5395 entry->vme_end < end_unnest) {
5396 panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5397 "bad nested entry: start=0x%llx end=0x%llx\n",
5398 (long long)start_unnest, (long long)end_unnest,
5399 (long long)entry->vme_start, (long long)entry->vme_end);
5400 }
5401
5402 if (start_unnest > entry->vme_start) {
5403 _vm_map_clip_start(&map->hdr,
5404 entry,
5405 start_unnest);
5406 if (map->holelistenabled) {
5407 vm_map_store_update_first_free(map, NULL, FALSE);
5408 } else {
5409 vm_map_store_update_first_free(map, map->first_free, FALSE);
5410 }
5411 }
5412 if (entry->vme_end > end_unnest) {
5413 _vm_map_clip_end(&map->hdr,
5414 entry,
5415 end_unnest);
5416 if (map->holelistenabled) {
5417 vm_map_store_update_first_free(map, NULL, FALSE);
5418 } else {
5419 vm_map_store_update_first_free(map, map->first_free, FALSE);
5420 }
5421 }
5422
5423 pmap_unnest(map->pmap,
5424 entry->vme_start,
5425 entry->vme_end - entry->vme_start);
5426 if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
5427 /* clean up parent map/maps */
5428 vm_map_submap_pmap_clean(
5429 map, entry->vme_start,
5430 entry->vme_end,
5431 VME_SUBMAP(entry),
5432 VME_OFFSET(entry));
5433 }
5434 entry->use_pmap = FALSE;
5435 if ((map->pmap != kernel_pmap) &&
5436 (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5437 VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5438 }
5439 }
5440 #endif /* NO_NESTED_PMAP */
5441
5442 __abortlike
5443 static void
5444 __vm_map_clip_atomic_entry_panic(
5445 vm_map_t map,
5446 vm_map_entry_t entry,
5447 vm_map_offset_t where)
5448 {
5449 panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5450 "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5451 (uint64_t)entry->vme_start,
5452 (uint64_t)entry->vme_end,
5453 (uint64_t)where);
5454 }
5455
5456 /*
5457 * vm_map_clip_start: [ internal use only ]
5458 *
5459 * Asserts that the given entry begins at or after
5460 * the specified address; if necessary,
5461 * it splits the entry into two.
5462 */
5463 void
5464 vm_map_clip_start(
5465 vm_map_t map,
5466 vm_map_entry_t entry,
5467 vm_map_offset_t startaddr)
5468 {
5469 #ifndef NO_NESTED_PMAP
5470 if (entry->is_sub_map &&
5471 entry->use_pmap &&
5472 startaddr >= entry->vme_start) {
5473 vm_map_offset_t start_unnest, end_unnest;
5474
5475 /*
5476 * Make sure "startaddr" is no longer in a nested range
5477 * before we clip. Unnest only the minimum range the platform
5478 * can handle.
5479 * vm_map_clip_unnest may perform additional adjustments to
5480 * the unnest range.
5481 */
5482 start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5483 end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5484 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5485 }
5486 #endif /* NO_NESTED_PMAP */
5487 if (startaddr > entry->vme_start) {
5488 if (!entry->is_sub_map &&
5489 VME_OBJECT(entry) &&
5490 VME_OBJECT(entry)->phys_contiguous) {
5491 pmap_remove(map->pmap,
5492 (addr64_t)(entry->vme_start),
5493 (addr64_t)(entry->vme_end));
5494 }
5495 if (entry->vme_atomic) {
5496 __vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5497 }
5498
5499 DTRACE_VM5(
5500 vm_map_clip_start,
5501 vm_map_t, map,
5502 vm_map_offset_t, entry->vme_start,
5503 vm_map_offset_t, entry->vme_end,
5504 vm_map_offset_t, startaddr,
5505 int, VME_ALIAS(entry));
5506
5507 _vm_map_clip_start(&map->hdr, entry, startaddr);
5508 if (map->holelistenabled) {
5509 vm_map_store_update_first_free(map, NULL, FALSE);
5510 } else {
5511 vm_map_store_update_first_free(map, map->first_free, FALSE);
5512 }
5513 }
5514 }
5515
5516
5517 #define vm_map_copy_clip_start(copy, entry, startaddr) \
5518 MACRO_BEGIN \
5519 if ((startaddr) > (entry)->vme_start) \
5520 _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5521 MACRO_END
5522
5523 /*
5524 * This routine is called only when it is known that
5525 * the entry must be split.
5526 */
5527 static void
5528 _vm_map_clip_start(
5529 struct vm_map_header *map_header,
5530 vm_map_entry_t entry,
5531 vm_map_offset_t start)
5532 {
5533 vm_map_entry_t new_entry;
5534
5535 /*
5536 * Split off the front portion --
5537 * note that we must insert the new
5538 * entry BEFORE this one, so that
5539 * this entry has the specified starting
5540 * address.
5541 */
5542
5543 if (entry->map_aligned) {
5544 assert(VM_MAP_PAGE_ALIGNED(start,
5545 VM_MAP_HDR_PAGE_MASK(map_header)));
5546 }
5547
5548 new_entry = _vm_map_entry_create(map_header);
5549 vm_map_entry_copy_full(new_entry, entry);
5550
5551 new_entry->vme_end = start;
5552 assert(new_entry->vme_start < new_entry->vme_end);
5553 VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5554 assert(start < entry->vme_end);
5555 entry->vme_start = start;
5556
5557 _vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5558
5559 if (entry->is_sub_map) {
5560 vm_map_reference(VME_SUBMAP(new_entry));
5561 } else {
5562 vm_object_reference(VME_OBJECT(new_entry));
5563 }
5564 }
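/*
 * Illustrative sketch (not part of the original source): clipping an
 * entry [0x1000, 0x5000) at start 0x3000 links a new entry
 * [0x1000, 0x3000) in front of it and shrinks the original entry to
 * [0x3000, 0x5000), advancing the original entry's VME_OFFSET by
 * 0x2000 so it still maps the same portion of its object or submap.
 */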
5565
5566
5567 /*
5568 * vm_map_clip_end: [ internal use only ]
5569 *
5570 * Asserts that the given entry ends at or before
5571 * the specified address; if necessary,
5572 * it splits the entry into two.
5573 */
5574 void
5575 vm_map_clip_end(
5576 vm_map_t map,
5577 vm_map_entry_t entry,
5578 vm_map_offset_t endaddr)
5579 {
5580 if (endaddr > entry->vme_end) {
5581 /*
5582 * Within the scope of this clipping, limit "endaddr" to
5583 * the end of this map entry...
5584 */
5585 endaddr = entry->vme_end;
5586 }
5587 #ifndef NO_NESTED_PMAP
5588 if (entry->is_sub_map && entry->use_pmap) {
5589 vm_map_offset_t start_unnest, end_unnest;
5590
5591 /*
5592 * Make sure the range between the start of this entry and
5593 * the new "endaddr" is no longer nested before we clip.
5594 * Unnest only the minimum range the platform can handle.
5595 * vm_map_clip_unnest may perform additional adjustments to
5596 * the unnest range.
5597 */
5598 start_unnest = entry->vme_start;
5599 end_unnest =
5600 (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5601 ~(pmap_shared_region_size_min(map->pmap) - 1);
5602 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5603 }
5604 #endif /* NO_NESTED_PMAP */
5605 if (endaddr < entry->vme_end) {
5606 if (!entry->is_sub_map &&
5607 VME_OBJECT(entry) &&
5608 VME_OBJECT(entry)->phys_contiguous) {
5609 pmap_remove(map->pmap,
5610 (addr64_t)(entry->vme_start),
5611 (addr64_t)(entry->vme_end));
5612 }
5613 if (entry->vme_atomic) {
5614 __vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5615 }
5616 DTRACE_VM5(
5617 vm_map_clip_end,
5618 vm_map_t, map,
5619 vm_map_offset_t, entry->vme_start,
5620 vm_map_offset_t, entry->vme_end,
5621 vm_map_offset_t, endaddr,
5622 int, VME_ALIAS(entry));
5623
5624 _vm_map_clip_end(&map->hdr, entry, endaddr);
5625 if (map->holelistenabled) {
5626 vm_map_store_update_first_free(map, NULL, FALSE);
5627 } else {
5628 vm_map_store_update_first_free(map, map->first_free, FALSE);
5629 }
5630 }
5631 }
5632
5633
5634 #define vm_map_copy_clip_end(copy, entry, endaddr) \
5635 MACRO_BEGIN \
5636 if ((endaddr) < (entry)->vme_end) \
5637 _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5638 MACRO_END
5639
5640 /*
5641 * This routine is called only when it is known that
5642 * the entry must be split.
5643 */
5644 static void
5645 _vm_map_clip_end(
5646 struct vm_map_header *map_header,
5647 vm_map_entry_t entry,
5648 vm_map_offset_t end)
5649 {
5650 vm_map_entry_t new_entry;
5651
5652 /*
5653 * Create a new entry and insert it
5654 * AFTER the specified entry
5655 */
5656
5657 if (entry->map_aligned) {
5658 assert(VM_MAP_PAGE_ALIGNED(end,
5659 VM_MAP_HDR_PAGE_MASK(map_header)));
5660 }
5661
5662 new_entry = _vm_map_entry_create(map_header);
5663 vm_map_entry_copy_full(new_entry, entry);
5664
5665 assert(entry->vme_start < end);
5666 new_entry->vme_start = entry->vme_end = end;
5667 VME_OFFSET_SET(new_entry,
5668 VME_OFFSET(new_entry) + (end - entry->vme_start));
5669 assert(new_entry->vme_start < new_entry->vme_end);
5670
5671 _vm_map_store_entry_link(map_header, entry, new_entry);
5672
5673 if (entry->is_sub_map) {
5674 vm_map_reference(VME_SUBMAP(new_entry));
5675 } else {
5676 vm_object_reference(VME_OBJECT(new_entry));
5677 }
5678 }
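/*
 * Illustrative sketch (not part of the original source): clipping an
 * entry [0x1000, 0x5000) at end 0x3000 shrinks the original entry to
 * [0x1000, 0x3000) and links a new entry [0x3000, 0x5000) after it,
 * with the new entry's VME_OFFSET advanced by 0x2000.
 */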
5679
5680
5681 /*
5682 * VM_MAP_RANGE_CHECK: [ internal use only ]
5683 *
5684 * Asserts that the starting and ending region
5685 * addresses fall within the valid range of the map.
5686 */
5687 #define VM_MAP_RANGE_CHECK(map, start, end) \
5688 MACRO_BEGIN \
5689 if (start < vm_map_min(map)) \
5690 start = vm_map_min(map); \
5691 if (end > vm_map_max(map)) \
5692 end = vm_map_max(map); \
5693 if (start > end) \
5694 start = end; \
5695 MACRO_END
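/*
 * Illustrative sketch (not part of the original source): for a map
 * whose valid range is [0x100000, 0x700000), VM_MAP_RANGE_CHECK turns
 * (start = 0x0, end = 0x800000) into (0x100000, 0x700000), and a range
 * entirely above the map, such as (0x800000, 0x900000), collapses to
 * the empty range (0x700000, 0x700000).
 */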
5696
5697 /*
5698 * vm_map_range_check: [ internal use only ]
5699 *
5700 * Check that the region defined by the specified start and
5701 * end addresses is wholly contained within a single map
5702 * entry or set of adjacent map entries of the specified map,
5703 * i.e. the specified region contains no unmapped space.
5704 * If any or all of the region is unmapped, FALSE is returned.
5705 * Otherwise, TRUE is returned and if the output argument 'entry'
5706 * is not NULL it points to the map entry containing the start
5707 * of the region.
5708 *
5709 * The map is locked for reading on entry and is left locked.
5710 */
5711 static boolean_t
5712 vm_map_range_check(
5713 vm_map_t map,
5714 vm_map_offset_t start,
5715 vm_map_offset_t end,
5716 vm_map_entry_t *entry)
5717 {
5718 vm_map_entry_t cur;
5719 vm_map_offset_t prev;
5720
5721 /*
5722 * Basic sanity checks first
5723 */
5724 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5725 return FALSE;
5726 }
5727
5728 /*
5729 * Check first if the region starts within a valid
5730 * mapping for the map.
5731 */
5732 if (!vm_map_lookup_entry(map, start, &cur)) {
5733 return FALSE;
5734 }
5735
5736 /*
5737 * Optimize for the case that the region is contained
5738 * in a single map entry.
5739 */
5740 if (entry != (vm_map_entry_t *) NULL) {
5741 *entry = cur;
5742 }
5743 if (end <= cur->vme_end) {
5744 return TRUE;
5745 }
5746
5747 /*
5748 * If the region is not wholly contained within a
5749 * single entry, walk the entries looking for holes.
5750 */
5751 prev = cur->vme_end;
5752 cur = cur->vme_next;
5753 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5754 if (end <= cur->vme_end) {
5755 return TRUE;
5756 }
5757 prev = cur->vme_end;
5758 cur = cur->vme_next;
5759 }
5760 return FALSE;
5761 }
5762
5763 /*
5764 * vm_map_protect:
5765 *
5766 * Sets the protection of the specified address
5767 * region in the target map. If "set_max" is
5768 * specified, the maximum protection is to be set;
5769 * otherwise, only the current protection is affected.
5770 */
5771 kern_return_t
5772 vm_map_protect(
5773 vm_map_t map,
5774 vm_map_offset_t start,
5775 vm_map_offset_t end,
5776 vm_prot_t new_prot,
5777 boolean_t set_max)
5778 {
5779 vm_map_entry_t current;
5780 vm_map_offset_t prev;
5781 vm_map_entry_t entry;
5782 vm_prot_t new_max;
5783 int pmap_options = 0;
5784 kern_return_t kr;
5785
5786 if (new_prot & VM_PROT_COPY) {
5787 vm_map_offset_t new_start;
5788 vm_prot_t cur_prot, max_prot;
5789 vm_map_kernel_flags_t kflags;
5790
5791 /* LP64todo - see below */
5792 if (start >= map->max_offset) {
5793 return KERN_INVALID_ADDRESS;
5794 }
5795
5796 if ((new_prot & VM_PROT_ALLEXEC) &&
5797 map->pmap != kernel_pmap &&
5798 (vm_map_cs_enforcement(map)
5799 #if XNU_TARGET_OS_OSX && __arm64__
5800 || !VM_MAP_IS_EXOTIC(map)
5801 #endif /* XNU_TARGET_OS_OSX && __arm64__ */
5802 ) &&
5803 VM_MAP_POLICY_WX_FAIL(map)) {
5804 DTRACE_VM3(cs_wx,
5805 uint64_t, (uint64_t) start,
5806 uint64_t, (uint64_t) end,
5807 vm_prot_t, new_prot);
5808 printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
5809 proc_selfpid(),
5810 (current_task()->bsd_info
5811 ? proc_name_address(current_task()->bsd_info)
5812 : "?"),
5813 __FUNCTION__);
5814 return KERN_PROTECTION_FAILURE;
5815 }
5816
5817 /*
5818 * Let vm_map_remap_extract() know that it will need to:
5819 * + make a copy of the mapping
5820 * + add VM_PROT_WRITE to the max protections
5821 * + remove any protections that are no longer allowed from the
5822 * max protections (to avoid any WRITE/EXECUTE conflict, for
5823 * example).
5824 * Note that "max_prot" is an IN/OUT parameter only for this
5825 * specific (VM_PROT_COPY) case. It's usually an OUT parameter
5826 * only.
5827 */
5828 max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
5829 cur_prot = VM_PROT_NONE;
5830 kflags = VM_MAP_KERNEL_FLAGS_NONE;
5831 kflags.vmkf_remap_prot_copy = TRUE;
5832 kflags.vmkf_overwrite_immutable = TRUE;
5833 new_start = start;
5834 kr = vm_map_remap(map,
5835 &new_start,
5836 end - start,
5837 0, /* mask */
5838 VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
5839 kflags,
5840 0,
5841 map,
5842 start,
5843 TRUE, /* copy-on-write remapping! */
5844 &cur_prot, /* IN/OUT */
5845 &max_prot, /* IN/OUT */
5846 VM_INHERIT_DEFAULT);
5847 if (kr != KERN_SUCCESS) {
5848 return kr;
5849 }
5850 new_prot &= ~VM_PROT_COPY;
5851 }
5852
5853 vm_map_lock(map);
5854
5855 /* LP64todo - remove this check when vm_map_commpage64()
5856 * no longer has to stuff in a map_entry for the commpage
5857 * above the map's max_offset.
5858 */
5859 if (start >= map->max_offset) {
5860 vm_map_unlock(map);
5861 return KERN_INVALID_ADDRESS;
5862 }
5863
5864 while (1) {
5865 /*
5866 * Lookup the entry. If it doesn't start in a valid
5867 * entry, return an error.
5868 */
5869 if (!vm_map_lookup_entry(map, start, &entry)) {
5870 vm_map_unlock(map);
5871 return KERN_INVALID_ADDRESS;
5872 }
5873
5874 if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
5875 start = SUPERPAGE_ROUND_DOWN(start);
5876 continue;
5877 }
5878 break;
5879 }
5880 if (entry->superpage_size) {
5881 end = SUPERPAGE_ROUND_UP(end);
5882 }
5883
5884 /*
5885 * Make a first pass to check for protection and address
5886 * violations.
5887 */
5888
5889 current = entry;
5890 prev = current->vme_start;
5891 while ((current != vm_map_to_entry(map)) &&
5892 (current->vme_start < end)) {
5893 /*
5894 * If there is a hole, return an error.
5895 */
5896 if (current->vme_start != prev) {
5897 vm_map_unlock(map);
5898 return KERN_INVALID_ADDRESS;
5899 }
5900
5901 new_max = current->max_protection;
5902
5903 #if defined(__x86_64__)
5904 /* Allow max mask to include execute prot bits if this map doesn't enforce CS */
5905 if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
5906 new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
5907 }
5908 #endif
5909 if ((new_prot & new_max) != new_prot) {
5910 vm_map_unlock(map);
5911 return KERN_PROTECTION_FAILURE;
5912 }
5913
5914 if (current->used_for_jit &&
5915 pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
5916 vm_map_unlock(map);
5917 return KERN_PROTECTION_FAILURE;
5918 }
5919
5920 if ((new_prot & VM_PROT_WRITE) &&
5921 (new_prot & VM_PROT_ALLEXEC) &&
5922 #if XNU_TARGET_OS_OSX
5923 map->pmap != kernel_pmap &&
5924 (vm_map_cs_enforcement(map)
5925 #if __arm64__
5926 || !VM_MAP_IS_EXOTIC(map)
5927 #endif /* __arm64__ */
5928 ) &&
5929 #endif /* XNU_TARGET_OS_OSX */
5930 !(current->used_for_jit)) {
5931 DTRACE_VM3(cs_wx,
5932 uint64_t, (uint64_t) current->vme_start,
5933 uint64_t, (uint64_t) current->vme_end,
5934 vm_prot_t, new_prot);
5935 printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
5936 proc_selfpid(),
5937 (current_task()->bsd_info
5938 ? proc_name_address(current_task()->bsd_info)
5939 : "?"),
5940 __FUNCTION__);
5941 new_prot &= ~VM_PROT_ALLEXEC;
5942 if (VM_MAP_POLICY_WX_FAIL(map)) {
5943 vm_map_unlock(map);
5944 return KERN_PROTECTION_FAILURE;
5945 }
5946 }
5947
5948 /*
5949 * If the task has requested executable lockdown,
5950 * deny both:
5951 * - adding executable protections OR
5952 * - adding write protections to an existing executable mapping.
5953 */
5954 if (map->map_disallow_new_exec == TRUE) {
5955 if ((new_prot & VM_PROT_ALLEXEC) ||
5956 ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
5957 vm_map_unlock(map);
5958 return KERN_PROTECTION_FAILURE;
5959 }
5960 }
5961
5962 prev = current->vme_end;
5963 current = current->vme_next;
5964 }
5965
5966 #if __arm64__
5967 if (end > prev &&
5968 end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
5969 vm_map_entry_t prev_entry;
5970
5971 prev_entry = current->vme_prev;
5972 if (prev_entry != vm_map_to_entry(map) &&
5973 !prev_entry->map_aligned &&
5974 (vm_map_round_page(prev_entry->vme_end,
5975 VM_MAP_PAGE_MASK(map))
5976 == end)) {
5977 /*
5978 * The last entry in our range is not "map-aligned"
5979 * but it would have reached all the way to "end"
5980 * if it had been map-aligned, so this is not really
5981 * a hole in the range and we can proceed.
5982 */
5983 prev = end;
5984 }
5985 }
5986 #endif /* __arm64__ */
5987
5988 if (end > prev) {
5989 vm_map_unlock(map);
5990 return KERN_INVALID_ADDRESS;
5991 }
5992
5993 /*
5994 * Go back and fix up protections.
5995 * Clip to start here if the range starts within
5996 * the entry.
5997 */
5998
5999 current = entry;
6000 if (current != vm_map_to_entry(map)) {
6001 /* clip and unnest if necessary */
6002 vm_map_clip_start(map, current, start);
6003 }
6004
6005 while ((current != vm_map_to_entry(map)) &&
6006 (current->vme_start < end)) {
6007 vm_prot_t old_prot;
6008
6009 vm_map_clip_end(map, current, end);
6010
6011 if (current->is_sub_map) {
6012 /* clipping did unnest if needed */
6013 assert(!current->use_pmap);
6014 }
6015
6016 old_prot = current->protection;
6017
6018 if (set_max) {
6019 current->max_protection = new_prot;
6020 /* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6021 current->protection = (new_prot & old_prot);
6022 } else {
6023 current->protection = new_prot;
6024 }
6025
6026 /*
6027 * Update physical map if necessary.
6028 * If the request is to turn off write protection,
6029 * we won't do it for real (in pmap). This is because
6030 * it would cause copy-on-write to fail. We've already
6031 * set the new protection in the map, so if a
6032 * write-protect fault occurred, it will be fixed up
6033 * properly, COW or not.
6034 */
6035 if (current->protection != old_prot) {
6036 /* Look one level in, since we support nested pmaps */
6037 /* from mapped submaps which are direct entries */
6038 /* in our map */
6039
6040 vm_prot_t prot;
6041
6042 prot = current->protection;
6043 if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6044 prot &= ~VM_PROT_WRITE;
6045 } else {
6046 assert(!VME_OBJECT(current)->code_signed);
6047 assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6048 if (prot & VM_PROT_WRITE) {
6049 /*
6050 * For write requests on the
6051 * compressor, we will ask the
6052 * pmap layer to prevent us from
6053 * taking a write fault when we
6054 * attempt to access the mapping
6055 * next.
6056 */
6057 pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6058 }
6059 }
6060
6061 if (override_nx(map, VME_ALIAS(current)) && prot) {
6062 prot |= VM_PROT_EXECUTE;
6063 }
6064
6065 #if DEVELOPMENT || DEBUG
6066 if (!(old_prot & VM_PROT_EXECUTE) &&
6067 (prot & VM_PROT_EXECUTE) &&
6068 panic_on_unsigned_execute &&
6069 (proc_selfcsflags() & CS_KILL)) {
6070 panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6071 }
6072 #endif /* DEVELOPMENT || DEBUG */
6073
6074 if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6075 if (current->wired_count) {
6076 panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6077 map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6078 }
6079
6080 /* If the pmap layer cares about this
6081 * protection type, force a fault for
6082 * each page so that vm_fault will
6083 * repopulate the page with the full
6084 * set of protections.
6085 */
6086 /*
6087 * TODO: We don't seem to need this,
6088 * but this is due to an internal
6089 * implementation detail of
6090 * pmap_protect. Do we want to rely
6091 * on this?
6092 */
6093 prot = VM_PROT_NONE;
6094 }
6095
6096 if (current->is_sub_map && current->use_pmap) {
6097 pmap_protect(VME_SUBMAP(current)->pmap,
6098 current->vme_start,
6099 current->vme_end,
6100 prot);
6101 } else {
6102 pmap_protect_options(map->pmap,
6103 current->vme_start,
6104 current->vme_end,
6105 prot,
6106 pmap_options,
6107 NULL);
6108 }
6109 }
6110 current = current->vme_next;
6111 }
6112
6113 current = entry;
6114 while ((current != vm_map_to_entry(map)) &&
6115 (current->vme_start <= end)) {
6116 vm_map_simplify_entry(map, current);
6117 current = current->vme_next;
6118 }
6119
6120 vm_map_unlock(map);
6121 return KERN_SUCCESS;
6122 }
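/*
 * Usage sketch (hypothetical caller, not from the original source):
 * making the range [start, end) read-only without touching the maximum
 * protection would look roughly like:
 *
 *	kr = vm_map_protect(map, start, end, VM_PROT_READ, FALSE);
 *
 * Passing set_max == TRUE updates max_protection instead and masks the
 * current protection with the new value, as the set_max branch above
 * does; including VM_PROT_COPY in new_prot first forces a
 * copy-on-write remap of the range via vm_map_remap().
 */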
6123
6124 /*
6125 * vm_map_inherit:
6126 *
6127 * Sets the inheritance of the specified address
6128 * range in the target map. Inheritance
6129 * affects how the map will be shared with
6130 * child maps at the time of vm_map_fork.
6131 */
6132 kern_return_t
6133 vm_map_inherit(
6134 vm_map_t map,
6135 vm_map_offset_t start,
6136 vm_map_offset_t end,
6137 vm_inherit_t new_inheritance)
6138 {
6139 vm_map_entry_t entry;
6140 vm_map_entry_t temp_entry;
6141
6142 vm_map_lock(map);
6143
6144 VM_MAP_RANGE_CHECK(map, start, end);
6145
6146 if (vm_map_lookup_entry(map, start, &temp_entry)) {
6147 entry = temp_entry;
6148 } else {
6149 temp_entry = temp_entry->vme_next;
6150 entry = temp_entry;
6151 }
6152
6153 /* first check entire range for submaps which can't support the */
6154 /* given inheritance. */
6155 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6156 if (entry->is_sub_map) {
6157 if (new_inheritance == VM_INHERIT_COPY) {
6158 vm_map_unlock(map);
6159 return KERN_INVALID_ARGUMENT;
6160 }
6161 }
6162
6163 entry = entry->vme_next;
6164 }
6165
6166 entry = temp_entry;
6167 if (entry != vm_map_to_entry(map)) {
6168 /* clip and unnest if necessary */
6169 vm_map_clip_start(map, entry, start);
6170 }
6171
6172 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6173 vm_map_clip_end(map, entry, end);
6174 if (entry->is_sub_map) {
6175 /* clip did unnest if needed */
6176 assert(!entry->use_pmap);
6177 }
6178
6179 entry->inheritance = new_inheritance;
6180
6181 entry = entry->vme_next;
6182 }
6183
6184 vm_map_unlock(map);
6185 return KERN_SUCCESS;
6186 }
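/*
 * Usage sketch (hypothetical caller, not from the original source):
 * marking a range so that a child map created at vm_map_fork() time
 * gets no copy of it:
 *
 *	kr = vm_map_inherit(map, start, end, VM_INHERIT_NONE);
 *
 * As checked above, VM_INHERIT_COPY is refused with
 * KERN_INVALID_ARGUMENT if the range contains submap entries.
 */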
6187
6188 /*
6189 * Update the accounting for the amount of wired memory in this map. If the user has
6190 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6191 */
6192
6193 static kern_return_t
6194 add_wire_counts(
6195 vm_map_t map,
6196 vm_map_entry_t entry,
6197 boolean_t user_wire)
6198 {
6199 vm_map_size_t size;
6200
6201 if (user_wire) {
6202 unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;
6203
6204 /*
6205 * We're wiring memory at the request of the user. Check if this is the first time the user is wiring
6206 * this map entry.
6207 */
6208
6209 if (entry->user_wired_count == 0) {
6210 size = entry->vme_end - entry->vme_start;
6211
6212 /*
6213 * Since this is the first time the user is wiring this map entry, check to see if we're
6214 * exceeding the user wire limits. There is a per map limit which is the smaller of either
6215 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also
6216 * a system-wide limit on the amount of memory all users can wire. If the user is over either
6217 * limit, then we fail.
6218 */
6219
6220 if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6221 size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6222 if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6223 #if DEVELOPMENT || DEBUG
6224 if (panic_on_mlock_failure) {
6225 panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6226 }
6227 #endif /* DEVELOPMENT || DEBUG */
6228 os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6229 } else {
6230 os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6231 #if DEVELOPMENT || DEBUG
6232 if (panic_on_mlock_failure) {
6233 panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6234 }
6235 #endif /* DEVELOPMENT || DEBUG */
6236 }
6237 return KERN_RESOURCE_SHORTAGE;
6238 }
6239
6240 /*
6241 * The first time the user wires an entry, we also increment the wired_count and add this to
6242 * the total that has been wired in the map.
6243 */
6244
6245 if (entry->wired_count >= MAX_WIRE_COUNT) {
6246 return KERN_FAILURE;
6247 }
6248
6249 entry->wired_count++;
6250 map->user_wire_size += size;
6251 }
6252
6253 if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6254 return KERN_FAILURE;
6255 }
6256
6257 entry->user_wired_count++;
6258 } else {
6259 /*
6260 * The kernel's wiring the memory. Just bump the count and continue.
6261 */
6262
6263 if (entry->wired_count >= MAX_WIRE_COUNT) {
6264 panic("vm_map_wire: too many wirings");
6265 }
6266
6267 entry->wired_count++;
6268 }
6269
6270 return KERN_SUCCESS;
6271 }
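/*
 * Worked example (illustrative numbers, not from the original source):
 * with vm_per_task_user_wire_limit at 64 MB, map->user_wire_limit at
 * 32 MB and map->user_wire_size already at 30 MB, a first-time user
 * wire of a 4 MB entry fails: 4 MB + 30 MB exceeds MIN(32 MB, 64 MB),
 * so KERN_RESOURCE_SHORTAGE is returned and, assuming the system-wide
 * limit is not also exceeded, vm_add_wire_count_over_user_limit is
 * incremented.
 */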
6272
6273 /*
6274 * Update the memory wiring accounting now that the given map entry is being unwired.
6275 */
6276
6277 static void
6278 subtract_wire_counts(
6279 vm_map_t map,
6280 vm_map_entry_t entry,
6281 boolean_t user_wire)
6282 {
6283 if (user_wire) {
6284 /*
6285 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6286 */
6287
6288 if (entry->user_wired_count == 1) {
6289 /*
6290 * We're removing the last user wire reference. Decrement the wired_count and the total
6291 * user wired memory for this map.
6292 */
6293
6294 assert(entry->wired_count >= 1);
6295 entry->wired_count--;
6296 map->user_wire_size -= entry->vme_end - entry->vme_start;
6297 }
6298
6299 assert(entry->user_wired_count >= 1);
6300 entry->user_wired_count--;
6301 } else {
6302 /*
6303 * The kernel is unwiring the memory. Just update the count.
6304 */
6305
6306 assert(entry->wired_count >= 1);
6307 entry->wired_count--;
6308 }
6309 }
6310
6311 int cs_executable_wire = 0;
6312
6313 /*
6314 * vm_map_wire:
6315 *
6316 * Sets the pageability of the specified address range in the
6317 * target map as wired. Regions specified as not pageable require
6318 * locked-down physical memory and physical page maps. The
6319 * access_type variable indicates types of accesses that must not
6320 * generate page faults. This is checked against protection of
6321 * memory being locked-down.
6322 *
6323 * The map must not be locked, but a reference must remain to the
6324 * map throughout the call.
6325 */
6326 static kern_return_t
6327 vm_map_wire_nested(
6328 vm_map_t map,
6329 vm_map_offset_t start,
6330 vm_map_offset_t end,
6331 vm_prot_t caller_prot,
6332 vm_tag_t tag,
6333 boolean_t user_wire,
6334 pmap_t map_pmap,
6335 vm_map_offset_t pmap_addr,
6336 ppnum_t *physpage_p)
6337 {
6338 vm_map_entry_t entry;
6339 vm_prot_t access_type;
6340 struct vm_map_entry *first_entry, tmp_entry;
6341 vm_map_t real_map;
6342 vm_map_offset_t s, e;
6343 kern_return_t rc;
6344 boolean_t need_wakeup;
6345 boolean_t main_map = FALSE;
6346 wait_interrupt_t interruptible_state;
6347 thread_t cur_thread;
6348 unsigned int last_timestamp;
6349 vm_map_size_t size;
6350 boolean_t wire_and_extract;
6351 vm_prot_t extra_prots;
6352
6353 extra_prots = VM_PROT_COPY;
6354 extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6355 #if XNU_TARGET_OS_OSX
6356 if (map->pmap == kernel_pmap ||
6357 !vm_map_cs_enforcement(map)) {
6358 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6359 }
6360 #endif /* XNU_TARGET_OS_OSX */
6361
6362 access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6363
6364 wire_and_extract = FALSE;
6365 if (physpage_p != NULL) {
6366 /*
6367 * The caller wants the physical page number of the
6368 * wired page. We return only one physical page number
6369 * so this works for only one page at a time.
6370 */
6371 if ((end - start) != PAGE_SIZE) {
6372 return KERN_INVALID_ARGUMENT;
6373 }
6374 wire_and_extract = TRUE;
6375 *physpage_p = 0;
6376 }
6377
6378 vm_map_lock(map);
6379 if (map_pmap == NULL) {
6380 main_map = TRUE;
6381 }
6382 last_timestamp = map->timestamp;
6383
6384 VM_MAP_RANGE_CHECK(map, start, end);
6385 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6386 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6387
6388 if (start == end) {
6389 /* We wired what the caller asked for, zero pages */
6390 vm_map_unlock(map);
6391 return KERN_SUCCESS;
6392 }
6393
6394 need_wakeup = FALSE;
6395 cur_thread = current_thread();
6396
6397 s = start;
6398 rc = KERN_SUCCESS;
6399
6400 if (vm_map_lookup_entry(map, s, &first_entry)) {
6401 entry = first_entry;
6402 /*
6403 * vm_map_clip_start will be done later.
6404 * We don't want to unnest any nested submaps here !
6405 */
6406 } else {
6407 /* Start address is not in map */
6408 rc = KERN_INVALID_ADDRESS;
6409 goto done;
6410 }
6411
6412 while ((entry != vm_map_to_entry(map)) && (s < end)) {
6413 /*
6414 * At this point, we have wired from "start" to "s".
6415 * We still need to wire from "s" to "end".
6416 *
6417 * "entry" hasn't been clipped, so it could start before "s"
6418 * and/or end after "end".
6419 */
6420
6421 /* "e" is how far we want to wire in this entry */
6422 e = entry->vme_end;
6423 if (e > end) {
6424 e = end;
6425 }
6426
6427 /*
6428 * If another thread is wiring/unwiring this entry then
6429 * block after informing other thread to wake us up.
6430 */
6431 if (entry->in_transition) {
6432 wait_result_t wait_result;
6433
6434 /*
6435 * We have not clipped the entry. Make sure that
6436 * the start address is in range so that the lookup
6437 * below will succeed.
6438 * "s" is the current starting point: we've already
6439 * wired from "start" to "s" and we still have
6440 * to wire from "s" to "end".
6441 */
6442
6443 entry->needs_wakeup = TRUE;
6444
6445 /*
6446 * wake up anybody waiting on entries that we have
6447 * already wired.
6448 */
6449 if (need_wakeup) {
6450 vm_map_entry_wakeup(map);
6451 need_wakeup = FALSE;
6452 }
6453 /*
6454 * User wiring is interruptible
6455 */
6456 wait_result = vm_map_entry_wait(map,
6457 (user_wire) ? THREAD_ABORTSAFE :
6458 THREAD_UNINT);
6459 if (user_wire && wait_result == THREAD_INTERRUPTED) {
6460 /*
6461 * undo the wirings we have done so far
6462 * We do not clear the needs_wakeup flag,
6463 * because we cannot tell if we were the
6464 * only one waiting.
6465 */
6466 rc = KERN_FAILURE;
6467 goto done;
6468 }
6469
6470 /*
6471 * Cannot avoid a lookup here. reset timestamp.
6472 */
6473 last_timestamp = map->timestamp;
6474
6475 /*
6476 * The entry could have been clipped, look it up again.
6477 * Worst that can happen is that it may not exist anymore.
6478 */
6479 if (!vm_map_lookup_entry(map, s, &first_entry)) {
6480 /*
6481 * User: undo everything up to the previous
6482 * entry. Let vm_map_unwire worry about
6483 * checking the validity of the range.
6484 */
6485 rc = KERN_FAILURE;
6486 goto done;
6487 }
6488 entry = first_entry;
6489 continue;
6490 }
6491
6492 if (entry->is_sub_map) {
6493 vm_map_offset_t sub_start;
6494 vm_map_offset_t sub_end;
6495 vm_map_offset_t local_start;
6496 vm_map_offset_t local_end;
6497 pmap_t pmap;
6498
6499 if (wire_and_extract) {
6500 /*
6501 * Wiring would result in copy-on-write
6502 * which would not be compatible with
6503 * the sharing we have with the original
6504 * provider of this memory.
6505 */
6506 rc = KERN_INVALID_ARGUMENT;
6507 goto done;
6508 }
6509
6510 vm_map_clip_start(map, entry, s);
6511 vm_map_clip_end(map, entry, end);
6512
6513 sub_start = VME_OFFSET(entry);
6514 sub_end = entry->vme_end;
6515 sub_end += VME_OFFSET(entry) - entry->vme_start;
6516
6517 local_end = entry->vme_end;
6518 if (map_pmap == NULL) {
6519 vm_object_t object;
6520 vm_object_offset_t offset;
6521 vm_prot_t prot;
6522 boolean_t wired;
6523 vm_map_entry_t local_entry;
6524 vm_map_version_t version;
6525 vm_map_t lookup_map;
6526
6527 if (entry->use_pmap) {
6528 pmap = VME_SUBMAP(entry)->pmap;
6529 /* the ppc implementation requires that */
6530 /* submap pmap address ranges line */
6531 /* up with the parent map */
6532 #ifdef notdef
6533 pmap_addr = sub_start;
6534 #endif
6535 pmap_addr = s;
6536 } else {
6537 pmap = map->pmap;
6538 pmap_addr = s;
6539 }
6540
6541 if (entry->wired_count) {
6542 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6543 goto done;
6544 }
6545
6546 /*
6547 * The map was not unlocked:
6548 * no need to goto re-lookup.
6549 * Just go directly to next entry.
6550 */
6551 entry = entry->vme_next;
6552 s = entry->vme_start;
6553 continue;
6554 }
6555
6556 /* call vm_map_lookup_locked to */
6557 /* cause any needs copy to be */
6558 /* evaluated */
6559 local_start = entry->vme_start;
6560 lookup_map = map;
6561 vm_map_lock_write_to_read(map);
6562 rc = vm_map_lookup_locked(
6563 &lookup_map, local_start,
6564 (access_type | extra_prots),
6565 OBJECT_LOCK_EXCLUSIVE,
6566 &version, &object,
6567 &offset, &prot, &wired,
6568 NULL,
6569 &real_map, NULL);
6570 if (rc != KERN_SUCCESS) {
6571 vm_map_unlock_read(lookup_map);
6572 assert(map_pmap == NULL);
6573 vm_map_unwire(map, start,
6574 s, user_wire);
6575 return rc;
6576 }
6577 vm_object_unlock(object);
6578 if (real_map != lookup_map) {
6579 vm_map_unlock(real_map);
6580 }
6581 vm_map_unlock_read(lookup_map);
6582 vm_map_lock(map);
6583
6584 /* we unlocked, so must re-lookup */
6585 if (!vm_map_lookup_entry(map,
6586 local_start,
6587 &local_entry)) {
6588 rc = KERN_FAILURE;
6589 goto done;
6590 }
6591
6592 /*
6593 * entry could have been "simplified",
6594 * so re-clip
6595 */
6596 entry = local_entry;
6597 assert(s == local_start);
6598 vm_map_clip_start(map, entry, s);
6599 vm_map_clip_end(map, entry, end);
6600 /* re-compute "e" */
6601 e = entry->vme_end;
6602 if (e > end) {
6603 e = end;
6604 }
6605
6606 /* did we have a change of type? */
6607 if (!entry->is_sub_map) {
6608 last_timestamp = map->timestamp;
6609 continue;
6610 }
6611 } else {
6612 local_start = entry->vme_start;
6613 pmap = map_pmap;
6614 }
6615
6616 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6617 goto done;
6618 }
6619
6620 entry->in_transition = TRUE;
6621
6622 vm_map_unlock(map);
6623 rc = vm_map_wire_nested(VME_SUBMAP(entry),
6624 sub_start, sub_end,
6625 caller_prot, tag,
6626 user_wire, pmap, pmap_addr,
6627 NULL);
6628 vm_map_lock(map);
6629
6630 /*
6631 * Find the entry again. It could have been clipped
6632 * after we unlocked the map.
6633 */
6634 if (!vm_map_lookup_entry(map, local_start,
6635 &first_entry)) {
6636 panic("vm_map_wire: re-lookup failed");
6637 }
6638 entry = first_entry;
6639
6640 assert(local_start == s);
6641 /* re-compute "e" */
6642 e = entry->vme_end;
6643 if (e > end) {
6644 e = end;
6645 }
6646
6647 last_timestamp = map->timestamp;
6648 while ((entry != vm_map_to_entry(map)) &&
6649 (entry->vme_start < e)) {
6650 assert(entry->in_transition);
6651 entry->in_transition = FALSE;
6652 if (entry->needs_wakeup) {
6653 entry->needs_wakeup = FALSE;
6654 need_wakeup = TRUE;
6655 }
6656 if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6657 subtract_wire_counts(map, entry, user_wire);
6658 }
6659 entry = entry->vme_next;
6660 }
6661 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
6662 goto done;
6663 }
6664
6665 /* no need to relookup again */
6666 s = entry->vme_start;
6667 continue;
6668 }
6669
6670 /*
6671 * If this entry is already wired then increment
6672 * the appropriate wire reference count.
6673 */
6674 if (entry->wired_count) {
6675 if ((entry->protection & access_type) != access_type) {
6676 /* found a protection problem */
6677
6678 /*
6679 * XXX FBDP
6680 * We should always return an error
6681 * in this case but since we didn't
6682 * enforce it before, let's do
6683 * it only for the new "wire_and_extract"
6684 * code path for now...
6685 */
6686 if (wire_and_extract) {
6687 rc = KERN_PROTECTION_FAILURE;
6688 goto done;
6689 }
6690 }
6691
6692 /*
6693 * entry is already wired down, get our reference
6694 * after clipping to our range.
6695 */
6696 vm_map_clip_start(map, entry, s);
6697 vm_map_clip_end(map, entry, end);
6698
6699 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6700 goto done;
6701 }
6702
6703 if (wire_and_extract) {
6704 vm_object_t object;
6705 vm_object_offset_t offset;
6706 vm_page_t m;
6707
6708 /*
6709 * We don't have to "wire" the page again
6710 * but we still have to "extract" its
6711 * physical page number, after some sanity
6712 * checks.
6713 */
6714 assert((entry->vme_end - entry->vme_start)
6715 == PAGE_SIZE);
6716 assert(!entry->needs_copy);
6717 assert(!entry->is_sub_map);
6718 assert(VME_OBJECT(entry));
6719 if (((entry->vme_end - entry->vme_start)
6720 != PAGE_SIZE) ||
6721 entry->needs_copy ||
6722 entry->is_sub_map ||
6723 VME_OBJECT(entry) == VM_OBJECT_NULL) {
6724 rc = KERN_INVALID_ARGUMENT;
6725 goto done;
6726 }
6727
6728 object = VME_OBJECT(entry);
6729 offset = VME_OFFSET(entry);
6730 /* need exclusive lock to update m->dirty */
6731 if (entry->protection & VM_PROT_WRITE) {
6732 vm_object_lock(object);
6733 } else {
6734 vm_object_lock_shared(object);
6735 }
6736 m = vm_page_lookup(object, offset);
6737 assert(m != VM_PAGE_NULL);
6738 assert(VM_PAGE_WIRED(m));
6739 if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
6740 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6741 if (entry->protection & VM_PROT_WRITE) {
6742 vm_object_lock_assert_exclusive(
6743 object);
6744 m->vmp_dirty = TRUE;
6745 }
6746 } else {
6747 /* not already wired !? */
6748 *physpage_p = 0;
6749 }
6750 vm_object_unlock(object);
6751 }
6752
6753 /* map was not unlocked: no need to relookup */
6754 entry = entry->vme_next;
6755 s = entry->vme_start;
6756 continue;
6757 }
6758
6759 /*
6760 * Unwired entry or wire request transmitted via submap
6761 */
6762
6763 /*
6764 * Wiring would copy the pages to the shadow object.
6765 * The shadow object would not be code-signed so
6766 * attempting to execute code from these copied pages
6767 * would trigger a code-signing violation.
6768 */
6769
6770 if ((entry->protection & VM_PROT_EXECUTE)
6771 #if XNU_TARGET_OS_OSX
6772 &&
6773 map->pmap != kernel_pmap &&
6774 (vm_map_cs_enforcement(map)
6775 #if __arm64__
6776 || !VM_MAP_IS_EXOTIC(map)
6777 #endif /* __arm64__ */
6778 )
6779 #endif /* XNU_TARGET_OS_OSX */
6780 ) {
6781 #if MACH_ASSERT
6782 printf("pid %d[%s] wiring executable range from "
6783 "0x%llx to 0x%llx: rejected to preserve "
6784 "code-signing\n",
6785 proc_selfpid(),
6786 (current_task()->bsd_info
6787 ? proc_name_address(current_task()->bsd_info)
6788 : "?"),
6789 (uint64_t) entry->vme_start,
6790 (uint64_t) entry->vme_end);
6791 #endif /* MACH_ASSERT */
6792 DTRACE_VM2(cs_executable_wire,
6793 uint64_t, (uint64_t)entry->vme_start,
6794 uint64_t, (uint64_t)entry->vme_end);
6795 cs_executable_wire++;
6796 rc = KERN_PROTECTION_FAILURE;
6797 goto done;
6798 }
6799
6800 /*
6801 * Perform actions of vm_map_lookup that need the write
6802 * lock on the map: create a shadow object for a
6803 * copy-on-write region, or an object for a zero-fill
6804 * region.
6805 */
6806 size = entry->vme_end - entry->vme_start;
6807 /*
6808 * If wiring a copy-on-write page, we need to copy it now
6809 * even if we're only (currently) requesting read access.
6810 * This is aggressive, but once it's wired we can't move it.
6811 */
6812 if (entry->needs_copy) {
6813 if (wire_and_extract) {
6814 /*
6815 * We're supposed to share with the original
6816 * provider so should not be "needs_copy"
6817 */
6818 rc = KERN_INVALID_ARGUMENT;
6819 goto done;
6820 }
6821
6822 VME_OBJECT_SHADOW(entry, size);
6823 entry->needs_copy = FALSE;
6824 } else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6825 if (wire_and_extract) {
6826 /*
6827 * We're supposed to share with the original
6828 * provider so should already have an object.
6829 */
6830 rc = KERN_INVALID_ARGUMENT;
6831 goto done;
6832 }
6833 VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
6834 VME_OFFSET_SET(entry, (vm_object_offset_t)0);
6835 assert(entry->use_pmap);
6836 }
6837
6838 vm_map_clip_start(map, entry, s);
6839 vm_map_clip_end(map, entry, end);
6840
6841 /* re-compute "e" */
6842 e = entry->vme_end;
6843 if (e > end) {
6844 e = end;
6845 }
6846
6847 /*
6848 * Check for holes and protection mismatch.
6849 * Holes: Next entry should be contiguous unless this
6850 * is the end of the region.
6851 * Protection: Access requested must be allowed, unless
6852 * wiring is by protection class
6853 */
6854 if ((entry->vme_end < end) &&
6855 ((entry->vme_next == vm_map_to_entry(map)) ||
6856 (entry->vme_next->vme_start > entry->vme_end))) {
6857 /* found a hole */
6858 rc = KERN_INVALID_ADDRESS;
6859 goto done;
6860 }
6861 if ((entry->protection & access_type) != access_type) {
6862 /* found a protection problem */
6863 rc = KERN_PROTECTION_FAILURE;
6864 goto done;
6865 }
6866
6867 assert(entry->wired_count == 0 && entry->user_wired_count == 0);
6868
6869 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6870 goto done;
6871 }
6872
6873 entry->in_transition = TRUE;
6874
6875 /*
6876 * This entry might get split once we unlock the map.
6877 * In vm_fault_wire(), we need the current range as
6878 * defined by this entry. In order for this to work
6879 * along with a simultaneous clip operation, we make a
6880 * temporary copy of this entry and use that for the
6881 * wiring. Note that the underlying objects do not
6882 * change during a clip.
6883 */
6884 tmp_entry = *entry;
6885
6886 /*
6887 * The in_transition state guarantees that the entry
6888 * (or entries for this range, if a split occurred) will be
6889 * there when the map lock is acquired for the second time.
6890 */
6891 vm_map_unlock(map);
6892
6893 if (!user_wire && cur_thread != THREAD_NULL) {
6894 interruptible_state = thread_interrupt_level(THREAD_UNINT);
6895 } else {
6896 interruptible_state = THREAD_UNINT;
6897 }
6898
6899 if (map_pmap) {
6900 rc = vm_fault_wire(map,
6901 &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
6902 physpage_p);
6903 } else {
6904 rc = vm_fault_wire(map,
6905 &tmp_entry, caller_prot, tag, map->pmap,
6906 tmp_entry.vme_start,
6907 physpage_p);
6908 }
6909
6910 if (!user_wire && cur_thread != THREAD_NULL) {
6911 thread_interrupt_level(interruptible_state);
6912 }
6913
6914 vm_map_lock(map);
6915
6916 if (last_timestamp + 1 != map->timestamp) {
6917 /*
6918 * Find the entry again. It could have been clipped
6919 * after we unlocked the map.
6920 */
6921 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
6922 &first_entry)) {
6923 panic("vm_map_wire: re-lookup failed");
6924 }
6925
6926 entry = first_entry;
6927 }
6928
6929 last_timestamp = map->timestamp;
6930
6931 while ((entry != vm_map_to_entry(map)) &&
6932 (entry->vme_start < tmp_entry.vme_end)) {
6933 assert(entry->in_transition);
6934 entry->in_transition = FALSE;
6935 if (entry->needs_wakeup) {
6936 entry->needs_wakeup = FALSE;
6937 need_wakeup = TRUE;
6938 }
6939 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
6940 subtract_wire_counts(map, entry, user_wire);
6941 }
6942 entry = entry->vme_next;
6943 }
6944
6945 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
6946 goto done;
6947 }
6948
6949 if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
6950 (tmp_entry.vme_end != end) && /* AND, we are not at the end of the requested range */
6951 (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
6952 /* found a "new" hole */
6953 s = tmp_entry.vme_end;
6954 rc = KERN_INVALID_ADDRESS;
6955 goto done;
6956 }
6957
6958 s = entry->vme_start;
6959 } /* end while loop through map entries */
6960
6961 done:
6962 if (rc == KERN_SUCCESS) {
6963 /* repair any damage we may have made to the VM map */
6964 vm_map_simplify_range(map, start, end);
6965 }
6966
6967 vm_map_unlock(map);
6968
6969 /*
6970 * wake up anybody waiting on entries we wired.
6971 */
6972 if (need_wakeup) {
6973 vm_map_entry_wakeup(map);
6974 }
6975
6976 if (rc != KERN_SUCCESS) {
6977 /* undo what has been wired so far */
6978 vm_map_unwire_nested(map, start, s, user_wire,
6979 map_pmap, pmap_addr);
6980 if (physpage_p) {
6981 *physpage_p = 0;
6982 }
6983 }
6984
6985 return rc;
6986 }
6987
6988 kern_return_t
6989 vm_map_wire_external(
6990 vm_map_t map,
6991 vm_map_offset_t start,
6992 vm_map_offset_t end,
6993 vm_prot_t caller_prot,
6994 boolean_t user_wire)
6995 {
6996 kern_return_t kret;
6997
6998 kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
6999 user_wire, (pmap_t)NULL, 0, NULL);
7000 return kret;
7001 }
7002
7003 kern_return_t
7004 vm_map_wire_kernel(
7005 vm_map_t map,
7006 vm_map_offset_t start,
7007 vm_map_offset_t end,
7008 vm_prot_t caller_prot,
7009 vm_tag_t tag,
7010 boolean_t user_wire)
7011 {
7012 kern_return_t kret;
7013
7014 kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7015 user_wire, (pmap_t)NULL, 0, NULL);
7016 return kret;
7017 }
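/*
 * Usage sketch (hypothetical caller, not from the original source):
 * a kernel-initiated wire of [start, end) for read/write access, with
 * the wired pages charged to an explicit allocation tag:
 *
 *	kr = vm_map_wire_kernel(map, start, end,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_OSFMK,
 *	    FALSE);
 *
 * where the last argument is user_wire == FALSE and VM_KERN_MEMORY_OSFMK
 * stands in for whatever vm_tag_t the caller accounts against.
 * vm_map_wire_external() performs the same operation but derives the
 * tag from the caller's backtrace via vm_tag_bt().
 */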
7018
7019 kern_return_t
7020 vm_map_wire_and_extract_external(
7021 vm_map_t map,
7022 vm_map_offset_t start,
7023 vm_prot_t caller_prot,
7024 boolean_t user_wire,
7025 ppnum_t *physpage_p)
7026 {
7027 kern_return_t kret;
7028
7029 kret = vm_map_wire_nested(map,
7030 start,
7031 start + VM_MAP_PAGE_SIZE(map),
7032 caller_prot,
7033 vm_tag_bt(),
7034 user_wire,
7035 (pmap_t)NULL,
7036 0,
7037 physpage_p);
7038 if (kret != KERN_SUCCESS &&
7039 physpage_p != NULL) {
7040 *physpage_p = 0;
7041 }
7042 return kret;
7043 }
7044
7045 kern_return_t
7046 vm_map_wire_and_extract_kernel(
7047 vm_map_t map,
7048 vm_map_offset_t start,
7049 vm_prot_t caller_prot,
7050 vm_tag_t tag,
7051 boolean_t user_wire,
7052 ppnum_t *physpage_p)
7053 {
7054 kern_return_t kret;
7055
7056 kret = vm_map_wire_nested(map,
7057 start,
7058 start + VM_MAP_PAGE_SIZE(map),
7059 caller_prot,
7060 tag,
7061 user_wire,
7062 (pmap_t)NULL,
7063 0,
7064 physpage_p);
7065 if (kret != KERN_SUCCESS &&
7066 physpage_p != NULL) {
7067 *physpage_p = 0;
7068 }
7069 return kret;
7070 }
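/*
 * Usage sketch (hypothetical caller, not from the original source):
 * the "wire_and_extract" variants wire exactly one map page starting
 * at "start" and report its physical page number:
 *
 *	ppnum_t ppnum = 0;
 *	kr = vm_map_wire_and_extract_kernel(map, start,
 *	    VM_PROT_READ | VM_PROT_WRITE, tag, FALSE, &ppnum);
 *
 * On any failure the wrappers reset *physpage_p to 0, and
 * vm_map_wire_nested() rejects the request outright if the range
 * covers more than a single page.
 */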
7071
7072 /*
7073 * vm_map_unwire:
7074 *
7075 * Sets the pageability of the specified address range in the target
7076 * as pageable. Regions specified must have been wired previously.
7077 *
7078 * The map must not be locked, but a reference must remain to the map
7079 * throughout the call.
7080 *
7081 * Kernel will panic on failures. User unwire ignores holes and
7082 * unwired and in-transition entries to avoid losing memory by leaving
7083 * it unwired.
7084 */
7085 static kern_return_t
7086 vm_map_unwire_nested(
7087 vm_map_t map,
7088 vm_map_offset_t start,
7089 vm_map_offset_t end,
7090 boolean_t user_wire,
7091 pmap_t map_pmap,
7092 vm_map_offset_t pmap_addr)
7093 {
7094 vm_map_entry_t entry;
7095 struct vm_map_entry *first_entry, tmp_entry;
7096 boolean_t need_wakeup;
7097 boolean_t main_map = FALSE;
7098 unsigned int last_timestamp;
7099
7100 vm_map_lock(map);
7101 if (map_pmap == NULL) {
7102 main_map = TRUE;
7103 }
7104 last_timestamp = map->timestamp;
7105
7106 VM_MAP_RANGE_CHECK(map, start, end);
7107 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7108 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7109
7110 if (start == end) {
7111 /* We unwired what the caller asked for: zero pages */
7112 vm_map_unlock(map);
7113 return KERN_SUCCESS;
7114 }
7115
7116 if (vm_map_lookup_entry(map, start, &first_entry)) {
7117 entry = first_entry;
7118 /*
7119 * vm_map_clip_start will be done later.
7120 * We don't want to unnest any nested sub maps here !
7121 */
7122 } else {
7123 if (!user_wire) {
7124 panic("vm_map_unwire: start not found");
7125 }
7126 /* Start address is not in map. */
7127 vm_map_unlock(map);
7128 return KERN_INVALID_ADDRESS;
7129 }
7130
7131 if (entry->superpage_size) {
7132 /* superpages are always wired */
7133 vm_map_unlock(map);
7134 return KERN_INVALID_ADDRESS;
7135 }
7136
7137 need_wakeup = FALSE;
7138 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7139 if (entry->in_transition) {
7140 /*
7141 * 1)
7142 * Another thread is wiring down this entry. Note
7143 * that, were it not for the other thread, we would
7144 * be unwiring an unwired entry. This is not
7145 * permitted. If we wait, we will be unwiring memory
7146 * we did not wire.
7147 *
7148 * 2)
7149 * Another thread is unwiring this entry. We did not
7150 * have a reference to it, because if we did, this
7151 * entry would not be getting unwired now.
7152 */
7153 if (!user_wire) {
7154 /*
7155 * XXX FBDP
7156 * This could happen: there could be some
7157 * overlapping vslock/vsunlock operations
7158 * going on.
7159 * We should probably just wait and retry,
7160 * but then we have to be careful that this
7161 * entry could get "simplified" after
7162 * "in_transition" gets unset and before
7163 * we re-lookup the entry, so we would
7164 * have to re-clip the entry to avoid
7165 * re-unwiring what we have already unwired...
7166 * See vm_map_wire_nested().
7167 *
7168 * Or we could just ignore "in_transition"
7169 * here and proceed to decrement the wired
7170 * count(s) on this entry. That should be fine
7171 * as long as "wired_count" doesn't drop all
7172 * the way to 0 (and we should panic if THAT
7173 * happens).
7174 */
7175 panic("vm_map_unwire: in_transition entry");
7176 }
7177
7178 entry = entry->vme_next;
7179 continue;
7180 }
7181
7182 if (entry->is_sub_map) {
7183 vm_map_offset_t sub_start;
7184 vm_map_offset_t sub_end;
7185 vm_map_offset_t local_end;
7186 pmap_t pmap;
7187
7188 vm_map_clip_start(map, entry, start);
7189 vm_map_clip_end(map, entry, end);
7190
7191 sub_start = VME_OFFSET(entry);
7192 sub_end = entry->vme_end - entry->vme_start;
7193 sub_end += VME_OFFSET(entry);
7194 local_end = entry->vme_end;
7195 if (map_pmap == NULL) {
7196 if (entry->use_pmap) {
7197 pmap = VME_SUBMAP(entry)->pmap;
7198 pmap_addr = sub_start;
7199 } else {
7200 pmap = map->pmap;
7201 pmap_addr = start;
7202 }
7203 if (entry->wired_count == 0 ||
7204 (user_wire && entry->user_wired_count == 0)) {
7205 if (!user_wire) {
7206 panic("vm_map_unwire: entry is unwired");
7207 }
7208 entry = entry->vme_next;
7209 continue;
7210 }
7211
7212 /*
7213 * Check for holes
7214 * Holes: Next entry should be contiguous unless
7215 * this is the end of the region.
7216 */
7217 if (((entry->vme_end < end) &&
7218 ((entry->vme_next == vm_map_to_entry(map)) ||
7219 (entry->vme_next->vme_start
7220 > entry->vme_end)))) {
7221 if (!user_wire) {
7222 panic("vm_map_unwire: non-contiguous region");
7223 }
7224 /*
7225 * entry = entry->vme_next;
7226 * continue;
7227 */
7228 }
7229
7230 subtract_wire_counts(map, entry, user_wire);
7231
7232 if (entry->wired_count != 0) {
7233 entry = entry->vme_next;
7234 continue;
7235 }
7236
7237 entry->in_transition = TRUE;
7238 tmp_entry = *entry;/* see comment in vm_map_wire() */
7239
7240 /*
7241 * We can unlock the map now. The in_transition state
7242 * guarantees existence of the entry.
7243 */
7244 vm_map_unlock(map);
7245 vm_map_unwire_nested(VME_SUBMAP(entry),
7246 sub_start, sub_end, user_wire, pmap, pmap_addr);
7247 vm_map_lock(map);
7248
7249 if (last_timestamp + 1 != map->timestamp) {
7250 /*
7251 * Find the entry again. It could have been
7252 * clipped or deleted after we unlocked the map.
7253 */
7254 if (!vm_map_lookup_entry(map,
7255 tmp_entry.vme_start,
7256 &first_entry)) {
7257 if (!user_wire) {
7258 panic("vm_map_unwire: re-lookup failed");
7259 }
7260 entry = first_entry->vme_next;
7261 } else {
7262 entry = first_entry;
7263 }
7264 }
7265 last_timestamp = map->timestamp;
7266
7267 /*
7268 * clear transition bit for all constituent entries
7269 * that were in the original entry (saved in
7270 * tmp_entry). Also check for waiters.
7271 */
7272 while ((entry != vm_map_to_entry(map)) &&
7273 (entry->vme_start < tmp_entry.vme_end)) {
7274 assert(entry->in_transition);
7275 entry->in_transition = FALSE;
7276 if (entry->needs_wakeup) {
7277 entry->needs_wakeup = FALSE;
7278 need_wakeup = TRUE;
7279 }
7280 entry = entry->vme_next;
7281 }
7282 continue;
7283 } else {
7284 tmp_entry = *entry;
7285 vm_map_unlock(map);
7286 vm_map_unwire_nested(VME_SUBMAP(entry),
7287 sub_start, sub_end, user_wire, map_pmap,
7288 pmap_addr);
7289 vm_map_lock(map);
7290
7291 if (last_timestamp + 1 != map->timestamp) {
7292 /*
7293 * Find the entry again. It could have been
7294 * clipped or deleted after we unlocked the map.
7295 */
7296 if (!vm_map_lookup_entry(map,
7297 tmp_entry.vme_start,
7298 &first_entry)) {
7299 if (!user_wire) {
7300 panic("vm_map_unwire: re-lookup failed");
7301 }
7302 entry = first_entry->vme_next;
7303 } else {
7304 entry = first_entry;
7305 }
7306 }
7307 last_timestamp = map->timestamp;
7308 }
7309 }
7310
7311
7312 if ((entry->wired_count == 0) ||
7313 (user_wire && entry->user_wired_count == 0)) {
7314 if (!user_wire) {
7315 panic("vm_map_unwire: entry is unwired");
7316 }
7317
7318 entry = entry->vme_next;
7319 continue;
7320 }
7321
7322 assert(entry->wired_count > 0 &&
7323 (!user_wire || entry->user_wired_count > 0));
7324
7325 vm_map_clip_start(map, entry, start);
7326 vm_map_clip_end(map, entry, end);
7327
7328 /*
7329 * Check for holes
7330 * Holes: Next entry should be contiguous unless
7331 * this is the end of the region.
7332 */
7333 if (((entry->vme_end < end) &&
7334 ((entry->vme_next == vm_map_to_entry(map)) ||
7335 (entry->vme_next->vme_start > entry->vme_end)))) {
7336 if (!user_wire) {
7337 panic("vm_map_unwire: non-contiguous region");
7338 }
7339 entry = entry->vme_next;
7340 continue;
7341 }
7342
7343 subtract_wire_counts(map, entry, user_wire);
7344
7345 if (entry->wired_count != 0) {
7346 entry = entry->vme_next;
7347 continue;
7348 }
7349
7350 if (entry->zero_wired_pages) {
7351 entry->zero_wired_pages = FALSE;
7352 }
7353
7354 entry->in_transition = TRUE;
7355 tmp_entry = *entry; /* see comment in vm_map_wire() */
7356
7357 /*
7358 * We can unlock the map now. The in_transition state
7359 * guarantees existence of the entry.
7360 */
7361 vm_map_unlock(map);
7362 if (map_pmap) {
7363 vm_fault_unwire(map,
7364 &tmp_entry, FALSE, map_pmap, pmap_addr);
7365 } else {
7366 vm_fault_unwire(map,
7367 &tmp_entry, FALSE, map->pmap,
7368 tmp_entry.vme_start);
7369 }
7370 vm_map_lock(map);
7371
7372 if (last_timestamp + 1 != map->timestamp) {
7373 /*
7374 * Find the entry again. It could have been clipped
7375 * or deleted after we unlocked the map.
7376 */
7377 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7378 &first_entry)) {
7379 if (!user_wire) {
7380 panic("vm_map_unwire: re-lookup failed");
7381 }
7382 entry = first_entry->vme_next;
7383 } else {
7384 entry = first_entry;
7385 }
7386 }
7387 last_timestamp = map->timestamp;
7388
7389 /*
7390 * clear transition bit for all constituent entries that
7391 * were in the original entry (saved in tmp_entry). Also
7392 * check for waiters.
7393 */
7394 while ((entry != vm_map_to_entry(map)) &&
7395 (entry->vme_start < tmp_entry.vme_end)) {
7396 assert(entry->in_transition);
7397 entry->in_transition = FALSE;
7398 if (entry->needs_wakeup) {
7399 entry->needs_wakeup = FALSE;
7400 need_wakeup = TRUE;
7401 }
7402 entry = entry->vme_next;
7403 }
7404 }
7405
7406 /*
7407 * We might have fragmented the address space when we wired this
7408 * range of addresses. Attempt to re-coalesce these VM map entries
7409 * with their neighbors now that they're no longer wired.
7410 * Under some circumstances, address space fragmentation can
7411 * prevent VM object shadow chain collapsing, which can cause
7412 * swap space leaks.
7413 */
7414 vm_map_simplify_range(map, start, end);
7415
7416 vm_map_unlock(map);
7417 /*
7418 * wake up anybody waiting on entries that we have unwired.
7419 */
7420 if (need_wakeup) {
7421 vm_map_entry_wakeup(map);
7422 }
7423 return KERN_SUCCESS;
7424 }
7425
7426 kern_return_t
7427 vm_map_unwire(
7428 vm_map_t map,
7429 vm_map_offset_t start,
7430 vm_map_offset_t end,
7431 boolean_t user_wire)
7432 {
7433 return vm_map_unwire_nested(map, start, end,
7434 user_wire, (pmap_t)NULL, 0);
7435 }
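/*
 * Usage sketch (hypothetical caller, not from the original source):
 * memory wired with user_wire == TRUE must be unwired the same way:
 *
 *	kr = vm_map_unwire(map, start, end, TRUE);
 *
 * This drops user_wired_count on each entry and, when the last user
 * reference goes away, the underlying wired_count as well (see
 * subtract_wire_counts() above).
 */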
7436
7437
7438 /*
7439 * vm_map_entry_zap: [ internal use only ]
7440 *
7441 * Remove the entry from the target map
7442 * and put it on a zap list.
7443 */
7444 static void
7445 vm_map_entry_zap(
7446 vm_map_t map,
7447 vm_map_entry_t entry,
7448 vm_map_zap_t zap)
7449 {
7450 vm_map_offset_t s, e;
7451
7452 s = entry->vme_start;
7453 e = entry->vme_end;
7454 assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7455 assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7456 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7457 assert(page_aligned(s));
7458 assert(page_aligned(e));
7459 }
7460 if (entry->map_aligned == TRUE) {
7461 assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7462 assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7463 }
7464 assert(entry->wired_count == 0);
7465 assert(entry->user_wired_count == 0);
7466 assert(!entry->permanent);
7467
7468 vm_map_store_entry_unlink(map, entry);
7469 map->size -= e - s;
7470
7471 vm_map_zap_append(zap, entry);
7472 }
7473
7474 static void
7475 vm_map_submap_pmap_clean(
7476 vm_map_t map,
7477 vm_map_offset_t start,
7478 vm_map_offset_t end,
7479 vm_map_t sub_map,
7480 vm_map_offset_t offset)
7481 {
7482 vm_map_offset_t submap_start;
7483 vm_map_offset_t submap_end;
7484 vm_map_size_t remove_size;
7485 vm_map_entry_t entry;
7486
7487 submap_end = offset + (end - start);
7488 submap_start = offset;
7489
7490 vm_map_lock_read(sub_map);
7491 if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7492 remove_size = (entry->vme_end - entry->vme_start);
7493 if (offset > entry->vme_start) {
7494 remove_size -= offset - entry->vme_start;
7495 }
7496
7497
7498 if (submap_end < entry->vme_end) {
7499 remove_size -=
7500 entry->vme_end - submap_end;
7501 }
7502 if (entry->is_sub_map) {
7503 vm_map_submap_pmap_clean(
7504 sub_map,
7505 start,
7506 start + remove_size,
7507 VME_SUBMAP(entry),
7508 VME_OFFSET(entry));
7509 } else {
7510 if (map->mapped_in_other_pmaps &&
7511 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7512 VME_OBJECT(entry) != NULL) {
7513 vm_object_pmap_protect_options(
7514 VME_OBJECT(entry),
7515 (VME_OFFSET(entry) +
7516 offset -
7517 entry->vme_start),
7518 remove_size,
7519 PMAP_NULL,
7520 PAGE_SIZE,
7521 entry->vme_start,
7522 VM_PROT_NONE,
7523 PMAP_OPTIONS_REMOVE);
7524 } else {
7525 pmap_remove(map->pmap,
7526 (addr64_t)start,
7527 (addr64_t)(start + remove_size));
7528 }
7529 }
7530 }
7531
7532 entry = entry->vme_next;
7533
7534 while ((entry != vm_map_to_entry(sub_map))
7535 && (entry->vme_start < submap_end)) {
7536 remove_size = (entry->vme_end - entry->vme_start);
7537 if (submap_end < entry->vme_end) {
7538 remove_size -= entry->vme_end - submap_end;
7539 }
7540 if (entry->is_sub_map) {
7541 vm_map_submap_pmap_clean(
7542 sub_map,
7543 (start + entry->vme_start) - offset,
7544 ((start + entry->vme_start) - offset) + remove_size,
7545 VME_SUBMAP(entry),
7546 VME_OFFSET(entry));
7547 } else {
7548 if (map->mapped_in_other_pmaps &&
7549 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7550 VME_OBJECT(entry) != NULL) {
7551 vm_object_pmap_protect_options(
7552 VME_OBJECT(entry),
7553 VME_OFFSET(entry),
7554 remove_size,
7555 PMAP_NULL,
7556 PAGE_SIZE,
7557 entry->vme_start,
7558 VM_PROT_NONE,
7559 PMAP_OPTIONS_REMOVE);
7560 } else {
7561 pmap_remove(map->pmap,
7562 (addr64_t)((start + entry->vme_start)
7563 - offset),
7564 (addr64_t)(((start + entry->vme_start)
7565 - offset) + remove_size));
7566 }
7567 }
7568 entry = entry->vme_next;
7569 }
7570 vm_map_unlock_read(sub_map);
7571 return;
7572 }
7573
7574 /*
7575 * virt_memory_guard_ast:
7576 *
7577 * Handle the AST callout for a virtual memory guard:
7578 * raise an EXC_GUARD exception and terminate the task
7579 * if configured to do so.
7580 */
7581 void
7582 virt_memory_guard_ast(
7583 thread_t thread,
7584 mach_exception_data_type_t code,
7585 mach_exception_data_type_t subcode)
7586 {
7587 task_t task = get_threadtask(thread);
7588 assert(task != kernel_task);
7589 assert(task == current_task());
7590 kern_return_t sync_exception_result;
7591 uint32_t behavior;
7592
7593 behavior = task->task_exc_guard;
7594
7595 /* Is delivery enabled */
7596 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7597 return;
7598 }
7599
7600 /* If only once, make sure we're that once */
7601 while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7602 uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7603
7604 if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7605 break;
7606 }
7607 behavior = task->task_exc_guard;
7608 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7609 return;
7610 }
7611 }
7612
7613 /* Raise exception synchronously and see if handler claimed it */
7614 sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode);
7615
7616 if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7617 /*
7618 * If Synchronous EXC_GUARD delivery was successful then
7619 * kill the process and return, else kill the process
7620 * and deliver the exception via EXC_CORPSE_NOTIFY.
7621 */
7622 if (sync_exception_result == KERN_SUCCESS) {
7623 task_bsdtask_kill(current_task());
7624 } else {
7625 exit_with_guard_exception(current_proc(), code, subcode);
7626 }
7627 } else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7628 /*
7629 * If the synchronous EXC_GUARD delivery was not successful,
7630 * raise a simulated crash.
7631 */
7632 if (sync_exception_result != KERN_SUCCESS) {
7633 task_violated_guard(code, subcode, NULL);
7634 }
7635 }
7636 }
7637
7638 /*
7639 * vm_map_guard_exception:
7640 *
7641 * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7642 *
7643 * Right now, we do this when we find nothing mapped, or a
7644 * gap in the mapping, when a user address space deallocation
7645 * was requested. We report the address of the first gap found.
7646 */
7647 static void
7648 vm_map_guard_exception(
7649 vm_map_offset_t gap_start,
7650 unsigned reason)
7651 {
7652 mach_exception_code_t code = 0;
7653 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7654 unsigned int target = 0; /* should we pass in pid associated with map? */
7655 mach_exception_data_type_t subcode = (uint64_t)gap_start;
7656 boolean_t fatal = FALSE;
7657
7658 task_t task = current_task_early();
7659
7660 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7661 if (task == NULL || task == kernel_task) {
7662 return;
7663 }
7664
7665 EXC_GUARD_ENCODE_TYPE(code, guard_type);
7666 EXC_GUARD_ENCODE_FLAVOR(code, reason);
7667 EXC_GUARD_ENCODE_TARGET(code, target);
7668
7669 if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7670 fatal = TRUE;
7671 }
7672 thread_guard_violation(current_thread(), code, subcode, fatal);
7673 }
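/*
 * For illustration: a vm_deallocate() that crosses an unmapped hole in
 * the address space typically surfaces as an EXC_GUARD exception whose
 * code encodes GUARD_TYPE_VIRT_MEMORY as the guard type and
 * kGUARD_EXC_DEALLOC_GAP as the flavor, with the subcode carrying the
 * address of the first gap found, e.g.:
 *
 *	code    = { type = GUARD_TYPE_VIRT_MEMORY,
 *	            flavor = kGUARD_EXC_DEALLOC_GAP, target = 0 }
 *	subcode = 0x104aec000   (start of the gap, made-up address)
 *
 * Whether this is fatal depends on the task's task_exc_guard settings
 * (see virt_memory_guard_ast() above).
 */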
7674
7675 __abortlike
7676 static void
7677 __vm_map_delete_misaligned_panic(
7678 vm_map_t map,
7679 vm_map_offset_t start,
7680 vm_map_offset_t end)
7681 {
7682 panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
7683 map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
7684 }
7685
7686 __abortlike
7687 static void
7688 __vm_map_delete_failed_panic(
7689 vm_map_t map,
7690 vm_map_offset_t start,
7691 vm_map_offset_t end,
7692 kern_return_t kr)
7693 {
7694 panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
7695 map, (uint64_t)start, (uint64_t)end, kr);
7696 }
7697
7698 __abortlike
7699 static void
7700 __vm_map_delete_gap_panic(
7701 vm_map_t map,
7702 vm_map_offset_t where,
7703 vm_map_offset_t start,
7704 vm_map_offset_t end)
7705 {
7706 panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
7707 map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
7708 }
7709
7710 __abortlike
7711 static void
7712 __vm_map_delete_permanent_panic(
7713 vm_map_t map,
7714 vm_map_offset_t start,
7715 vm_map_offset_t end,
7716 vm_map_entry_t entry)
7717 {
7718 panic("vm_map_delete(%p,0x%llx,0x%llx): "
7719 "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
7720 map, (uint64_t)start, (uint64_t)end, entry,
7721 (uint64_t)entry->vme_start,
7722 (uint64_t)entry->vme_end);
7723 }
7724
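/*
 * State carried across iterations of the vm_map_delete() loop below,
 * roughly:
 *
 *	VMDS_FOUND_GAP:    an unmapped gap was found in [start, end)
 *	VMDS_GAPS_OK:      the map is terminated or unreferenced,
 *	                   gaps are tolerated
 *	VMDS_KERNEL_PMAP:  operating on the kernel pmap, where gaps and
 *	                   permanent entries are fatal
 *	VMDS_NEEDS_LOOKUP: the map lock was dropped, the entry must be
 *	                   looked up again
 *	VMDS_NEEDS_WAKEUP: waiters on "in transition" entries must be
 *	                   woken up before returning
 */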
7725 __options_decl(vm_map_delete_state_t, uint32_t, {
7726 VMDS_NONE = 0x0000,
7727
7728 VMDS_FOUND_GAP = 0x0001,
7729 VMDS_GAPS_OK = 0x0002,
7730
7731 VMDS_KERNEL_PMAP = 0x0004,
7732 VMDS_NEEDS_LOOKUP = 0x0008,
7733 VMDS_NEEDS_WAKEUP = 0x0010,
7734 });
7735
7736 /*
7737 * vm_map_delete: [ internal use only ]
7738 *
7739 * Deallocates the given address range from the target map.
7740 * Removes all user wirings. Unwires one kernel wiring if
7741 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
7742 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
7743 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7744 *
7745 *
7746 * When VM_MAP_REMOVE_RETURN_ERRORS is not passed,
7747 * then any error in removing mappings will lead to a panic
7748 * so that clients do not have to repeat the panic code
7749 * at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
7750 * is also passed, then KERN_ABORTED will not lead to a panic.
7751 *
7752 * This routine is called with map locked and leaves map locked.
7753 */
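/*
 * A minimal sketch of an interruptible caller that wants errors back
 * instead of a panic, assuming it already holds the map lock:
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *	kmem_return_t kmr;
 *
 *	kmr = vm_map_delete(map, start, end,
 *	    VM_MAP_REMOVE_INTERRUPTIBLE | VM_MAP_REMOVE_RETURN_ERRORS,
 *	    KMEM_GUARD_NONE, &zap);
 *
 * A KERN_ABORTED result means the wait was interrupted and the range
 * may be only partially deleted; kmr.kmr_size covers what was removed.
 */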
7754 static kmem_return_t
7755 vm_map_delete(
7756 vm_map_t map,
7757 vm_map_offset_t start,
7758 vm_map_offset_t end,
7759 vmr_flags_t flags,
7760 kmem_guard_t guard,
7761 vm_map_zap_t zap_list)
7762 {
7763 vm_map_entry_t entry, next;
7764 int interruptible;
7765 vm_map_offset_t gap_start = 0;
7766 vm_map_offset_t clear_in_transition_end = 0;
7767 __unused vm_map_offset_t save_start = start;
7768 __unused vm_map_offset_t save_end = end;
7769 vm_map_delete_state_t state = VMDS_NONE;
7770 kmem_return_t ret = { };
7771
7772 if (vm_map_pmap(map) == kernel_pmap) {
7773 state |= VMDS_KERNEL_PMAP;
7774 }
7775
7776 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
7777 state |= VMDS_GAPS_OK;
7778 }
7779
7780 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
7781 THREAD_ABORTSAFE : THREAD_UNINT;
7782
7783 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
7784 (start & VM_MAP_PAGE_MASK(map))) {
7785 __vm_map_delete_misaligned_panic(map, start, end);
7786 }
7787
7788 if ((state & VMDS_GAPS_OK) == 0) {
7789 /*
7790 * If the map isn't terminated then all deletions must have
7791 * no gaps, and be within the [min, max) of the map.
7792 *
7793 * We got here without VM_MAP_RANGE_CHECK() being called,
7794 * and hence must validate bounds manually.
7795 *
7796 * It is worth noting that because vm_deallocate() will
7797 * round_page() the deallocation size, it's possible for "end"
7798 * to be 0 here due to overflow. We hence must treat it as being
7799 * beyond vm_map_max(map).
7800 *
7801 * Similarly, end < start means some wrap-around happened,
7802 * which should cause an error or panic.
7803 */
7804 if (end == 0 || end > vm_map_max(map)) {
7805 state |= VMDS_FOUND_GAP;
7806 gap_start = vm_map_max(map);
7807 if (state & VMDS_KERNEL_PMAP) {
7808 __vm_map_delete_gap_panic(map,
7809 gap_start, start, end);
7810 }
7811 goto out;
7812 }
7813
7814 if (end < start) {
7815 if (state & VMDS_KERNEL_PMAP) {
7816 __vm_map_delete_gap_panic(map,
7817 vm_map_max(map), start, end);
7818 }
7819 ret.kmr_return = KERN_INVALID_ARGUMENT;
7820 goto out;
7821 }
7822
7823 if (start < vm_map_min(map)) {
7824 state |= VMDS_FOUND_GAP;
7825 gap_start = start;
7826 if (state & VMDS_KERNEL_PMAP) {
7827 __vm_map_delete_gap_panic(map,
7828 gap_start, start, end);
7829 }
7830 goto out;
7831 }
7832 } else {
7833 /*
7834 * If the map is terminated, we must accept start/end
7835 * being beyond the boundaries of the map as this is
7836 * how some of the mappings like commpage mappings
7837 * can be destroyed (they're outside of those bounds).
7838 *
7839 * end < start is still something we can't cope with,
7840 * so just bail.
7841 */
7842 if (end < start) {
7843 goto out;
7844 }
7845 }
7846
7847
7848 /*
7849 * Find the start of the region.
7850 *
7851 * If in a superpage, extend the range
7852 * to include the start of the mapping.
7853 */
7854 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
7855 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
7856 start = SUPERPAGE_ROUND_DOWN(start);
7857 } else {
7858 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
7859 break;
7860 }
7861 }
7862
7863 if (entry->superpage_size) {
7864 end = SUPERPAGE_ROUND_UP(end);
7865 }
7866
7867 /*
7868 * Step through all entries in this region
7869 */
7870 for (vm_map_offset_t s = start; s < end;) {
7871 /*
7872 * At this point, we have deleted all the memory entries
7873 * in [start, s) and are proceeding with the [s, end) range.
7874 *
7875 * This loop might drop the map lock, and it is possible that
7876 * some memory was already reallocated within [start, s)
7877 * and we don't want to mess with those entries.
7878 *
7879 * Some of those entries could even have been re-assembled
7880 * with an entry after "s" (in vm_map_simplify_entry()), so
7881 * we may have to vm_map_clip_start() again.
7882 *
7883 * When clear_in_transition_end is set, we had marked
7884 * [start, clear_in_transition_end) as "in_transition"
7885 * during a previous iteration and we need to clear it.
7886 */
7887
7888 /*
7889 * Step 1: If needed (because we dropped locks),
7890 * lookup the entry again.
7891 *
7892 * If we're coming back from unwiring (Step 5),
7893 * we also need to mark the entries as no longer
7894 * in transition after that.
7895 */
7896
7897 if (state & VMDS_NEEDS_LOOKUP) {
7898 state &= ~VMDS_NEEDS_LOOKUP;
7899
7900 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
7901 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
7902 }
7903 }
7904
7905 if (clear_in_transition_end) {
7906 for (vm_map_entry_t it = entry;
7907 it != vm_map_to_entry(map) &&
7908 it->vme_start < clear_in_transition_end;
7909 it = it->vme_next) {
7910 assert(it->in_transition);
7911 it->in_transition = FALSE;
7912 if (it->needs_wakeup) {
7913 it->needs_wakeup = FALSE;
7914 state |= VMDS_NEEDS_WAKEUP;
7915 }
7916 }
7917
7918 clear_in_transition_end = 0;
7919 }
7920
7921
7922 /*
7923 * Step 2: Perform various policy checks
7924 * before we do _anything_ to this entry.
7925 */
7926
7927 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
7928 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
7929 /*
7930 * Either we found a gap already,
7931 * or we are tearing down a map,
7932 * keep going.
7933 */
7934 } else if (state & VMDS_KERNEL_PMAP) {
7935 __vm_map_delete_gap_panic(map, s, start, end);
7936 } else if (vm_map_round_page(s, VM_MAP_PAGE_MASK(map)) < end) {
7937 /*
7938 * The vm_map_round_page() is needed since an entry
7939 * can be less than VM_MAP_PAGE_MASK() sized.
7940 *
7941 * For example, devices which have h/w 4K pages,
7942 * but entry sizes are all now 16K.
7943 */
7944 state |= VMDS_FOUND_GAP;
7945 gap_start = s;
7946 }
7947
7948 if (entry == vm_map_to_entry(map) ||
7949 end <= entry->vme_start) {
7950 break;
7951 }
7952
7953 s = entry->vme_start;
7954 }
7955
7956 if (state & VMDS_KERNEL_PMAP) {
7957 /*
7958 * In the kernel map and its submaps,
7959 * permanent entries never die, even
7960 * if VM_MAP_REMOVE_IMMUTABLE is passed.
7961 */
7962 if (entry->permanent) {
7963 __vm_map_delete_permanent_panic(map, start, end, entry);
7964 }
7965
7966 if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
7967 end = entry->vme_end;
7968 flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
7969 }
7970
7971 /*
7972 * In the kernel map and its submaps,
7973 * the removal of an atomic/guarded entry is strict.
7974 *
7975 * An atomic entry is processed only if it was
7976 * specifically targeted.
7977 *
7978 * We might have deleted non-atomic entries before
7979 * we reach this point however...
7980 */
7981 kmem_entry_validate_guard(map, entry,
7982 start, end - start, guard);
7983 }
7984
7985
7986 /*
7987 * Step 3: Perform any clipping needed.
7988 *
7989 * After this, "entry" starts at "s", ends before "end"
7990 */
7991
7992 if (entry->vme_start < s) {
7993 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
7994 entry->map_aligned &&
7995 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
7996 /*
7997 * The entry will no longer be map-aligned
7998 * after clipping and the caller said it's OK.
7999 */
8000 entry->map_aligned = FALSE;
8001 }
8002 vm_map_clip_start(map, entry, s);
8003 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8004 }
8005
8006 if (end < entry->vme_end) {
8007 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8008 entry->map_aligned &&
8009 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8010 /*
8011 * The entry will no longer be map-aligned
8012 * after clipping and the caller said it's OK.
8013 */
8014 entry->map_aligned = FALSE;
8015 }
8016 vm_map_clip_end(map, entry, end);
8017 }
8018
8019 assert(s == entry->vme_start);
8020 assert(entry->vme_end <= end);
8021
8022
8023 /*
8024 * Step 4: If the entry is in flux, wait for this to resolve.
8025 */
8026
8027 if (entry->in_transition) {
8028 wait_result_t wait_result;
8029
8030 /*
8031 * Another thread is wiring/unwiring this entry.
8032 * Let the other thread know we are waiting.
8033 */
8034
8035 entry->needs_wakeup = TRUE;
8036
8037 /*
8038 * wake up anybody waiting on entries that we have
8039 * already unwired/deleted.
8040 */
8041 if (state & VMDS_NEEDS_WAKEUP) {
8042 vm_map_entry_wakeup(map);
8043 state &= ~VMDS_NEEDS_WAKEUP;
8044 }
8045
8046 wait_result = vm_map_entry_wait(map, interruptible);
8047
8048 if (interruptible &&
8049 wait_result == THREAD_INTERRUPTED) {
8050 /*
8051 * We do not clear the needs_wakeup flag,
8052 * since we cannot tell if we were the only one.
8053 */
8054 ret.kmr_return = KERN_ABORTED;
8055 return ret;
8056 }
8057
8058 /*
8059 * The entry could have been clipped or it
8060 * may not exist anymore. Look it up again.
8061 */
8062 state |= VMDS_NEEDS_LOOKUP;
8063 continue;
8064 }
8065
8066
8067 /*
8068 * Step 5: Handle wiring
8069 */
8070
8071 if (entry->wired_count) {
8072 struct vm_map_entry tmp_entry;
8073 boolean_t user_wire;
8074 unsigned int last_timestamp;
8075
8076 user_wire = entry->user_wired_count > 0;
8077
8078 /*
8079 * Remove a kernel wiring if requested
8080 */
8081 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8082 entry->wired_count--;
8083 }
8084
8085 /*
8086 * Remove all user wirings for proper accounting
8087 */
8088 while (entry->user_wired_count) {
8089 subtract_wire_counts(map, entry, user_wire);
8090 }
8091
8092 /*
8093 * All our DMA I/O operations in IOKit are currently
8094 * done by wiring through the map entries of the task
8095 * requesting the I/O.
8096 *
8097 * Because of this, we must always wait for kernel wirings
8098 * to go away on the entries before deleting them.
8099 *
8100 * Any caller who wants to actually remove a kernel wiring
8101 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8102 * properly remove one wiring instead of blasting through
8103 * them all.
8104 */
8105 if (entry->wired_count != 0) {
8106 assert(map != kernel_map);
8107 /*
8108 * Cannot continue. Typical case is when
8109 * a user thread has physical io pending
8110 * on this page. Either wait for the
8111 * kernel wiring to go away or return an
8112 * error.
8113 */
8114 wait_result_t wait_result;
8115
8116 entry->needs_wakeup = TRUE;
8117 wait_result = vm_map_entry_wait(map,
8118 interruptible);
8119
8120 if (interruptible &&
8121 wait_result == THREAD_INTERRUPTED) {
8122 /*
8123 * We do not clear the
8124 * needs_wakeup flag, since we
8125 * cannot tell if we were the
8126 * only one.
8127 */
8128 ret.kmr_return = KERN_ABORTED;
8129 return ret;
8130 }
8131
8132
8133 /*
8134 * The entry could have been clipped or
8135 * it may not exist anymore. Look it
8136 * up again.
8137 */
8138 state |= VMDS_NEEDS_LOOKUP;
8139 continue;
8140 }
8141
8142 /*
8143 * We can unlock the map now.
8144 *
8145 * The entry might be split once we unlock the map,
8146 * but we need the range as defined by this entry
8147 * to be stable. So we must make a local copy.
8148 *
8149 * The underlying objects do not change during clips,
8150 * and the in_transition state guarantees existence
8151 * of the entry.
8152 */
8153 last_timestamp = map->timestamp;
8154 entry->in_transition = TRUE;
8155 tmp_entry = *entry;
8156 vm_map_unlock(map);
8157
8158 if (tmp_entry.is_sub_map) {
8159 vm_map_t sub_map;
8160 vm_map_offset_t sub_start, sub_end;
8161 pmap_t pmap;
8162 vm_map_offset_t pmap_addr;
8163
8164
8165 sub_map = VME_SUBMAP(&tmp_entry);
8166 sub_start = VME_OFFSET(&tmp_entry);
8167 sub_end = sub_start + (tmp_entry.vme_end -
8168 tmp_entry.vme_start);
8169 if (tmp_entry.use_pmap) {
8170 pmap = sub_map->pmap;
8171 pmap_addr = tmp_entry.vme_start;
8172 } else {
8173 pmap = map->pmap;
8174 pmap_addr = tmp_entry.vme_start;
8175 }
8176 (void) vm_map_unwire_nested(sub_map,
8177 sub_start, sub_end,
8178 user_wire,
8179 pmap, pmap_addr);
8180 } else {
8181 if (tmp_entry.vme_kernel_object) {
8182 pmap_protect_options(
8183 map->pmap,
8184 tmp_entry.vme_start,
8185 tmp_entry.vme_end,
8186 VM_PROT_NONE,
8187 PMAP_OPTIONS_REMOVE,
8188 NULL);
8189 }
8190 vm_fault_unwire(map, &tmp_entry,
8191 tmp_entry.vme_kernel_object,
8192 map->pmap, tmp_entry.vme_start);
8193 }
8194
8195 vm_map_lock(map);
8196
8197 /*
8198 * Unwiring happened, we can now go back to deleting
8199 * them (after we clear the in_transition bit for the range).
8200 */
8201 if (last_timestamp + 1 != map->timestamp) {
8202 state |= VMDS_NEEDS_LOOKUP;
8203 }
8204 clear_in_transition_end = tmp_entry.vme_end;
8205 continue;
8206 }
8207
8208 assert(entry->wired_count == 0);
8209 assert(entry->user_wired_count == 0);
8210
8211
8212 /*
8213 * Step 6: Entry is unwired and ready for us to delete !
8214 */
8215
8216 if (!entry->permanent) {
8217 /*
8218 * Typical case: the entry really shouldn't be permanent
8219 */
8220 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8221 #if 0
8222 printf("FBDP %d[%s] removing permanent entry "
8223 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8224 proc_selfpid(),
8225 (current_task()->bsd_info
8226 ? proc_name_address(current_task()->bsd_info)
8227 : "?"), entry,
8228 (uint64_t)entry->vme_start,
8229 (uint64_t)entry->vme_end,
8230 entry->protection,
8231 entry->max_protection);
8232 #endif
8233 entry->permanent = FALSE;
8234 } else {
8235 /*
8236 * dtrace -n 'vm_map_delete_permanent {
8237 * print("start=0x%llx end=0x%llx prot=0x%x/0x%x\n", arg0, arg1, arg2, arg3);
8238 * stack();
8239 * ustack();
8240 * }'
8241 */
8242 DTRACE_VM5(vm_map_delete_permanent,
8243 vm_map_offset_t, entry->vme_start,
8244 vm_map_offset_t, entry->vme_end,
8245 vm_prot_t, entry->protection,
8246 vm_prot_t, entry->max_protection,
8247 int, VME_ALIAS(entry));
8248 }
8249
8250 if (entry->is_sub_map) {
8251 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8252 "map %p (%d) entry %p submap %p (%d)\n",
8253 map, VM_MAP_PAGE_SHIFT(map), entry,
8254 VME_SUBMAP(entry),
8255 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8256 if (entry->use_pmap) {
8257 #ifndef NO_NESTED_PMAP
8258 int pmap_flags;
8259
8260 if (map->terminated) {
8261 /*
8262 * This is the final cleanup of the
8263 * address space being terminated.
8264 * No new mappings are expected and
8265 * we don't really need to unnest the
8266 * shared region (and lose the "global"
8267 * pmap mappings, if applicable).
8268 *
8269 * Tell the pmap layer that we're
8270 * "clean" wrt nesting.
8271 */
8272 pmap_flags = PMAP_UNNEST_CLEAN;
8273 } else {
8274 /*
8275 * We're unmapping part of the nested
8276 * shared region, so we can't keep the
8277 * nested pmap.
8278 */
8279 pmap_flags = 0;
8280 }
8281 pmap_unnest_options(
8282 map->pmap,
8283 (addr64_t)entry->vme_start,
8284 entry->vme_end - entry->vme_start,
8285 pmap_flags);
8286 #endif /* NO_NESTED_PMAP */
8287 if (map->mapped_in_other_pmaps &&
8288 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8289 /* clean up parent map/maps */
8290 vm_map_submap_pmap_clean(
8291 map, entry->vme_start,
8292 entry->vme_end,
8293 VME_SUBMAP(entry),
8294 VME_OFFSET(entry));
8295 }
8296 } else {
8297 vm_map_submap_pmap_clean(
8298 map, entry->vme_start, entry->vme_end,
8299 VME_SUBMAP(entry),
8300 VME_OFFSET(entry));
8301 }
8302 } else if (entry->vme_kernel_object ||
8303 VME_OBJECT(entry) == compressor_object) {
8304 /*
8305 * nothing to do
8306 */
8307 } else if (map->mapped_in_other_pmaps &&
8308 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8309 vm_object_pmap_protect_options(
8310 VME_OBJECT(entry), VME_OFFSET(entry),
8311 entry->vme_end - entry->vme_start,
8312 PMAP_NULL,
8313 PAGE_SIZE,
8314 entry->vme_start,
8315 VM_PROT_NONE,
8316 PMAP_OPTIONS_REMOVE);
8317 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8318 (state & VMDS_KERNEL_PMAP)) {
8319 /* Remove translations associated
8320 * with this range unless the entry
8321 * does not have an object, or
8322 * it's the kernel map or a descendant
8323 * since the platform could potentially
8324 * create "backdoor" mappings invisible
8325 * to the VM. It is expected that
8326 * objectless, non-kernel ranges
8327 * do not have such VM invisible
8328 * translations.
8329 */
8330 pmap_remove_options(map->pmap,
8331 (addr64_t)entry->vme_start,
8332 (addr64_t)entry->vme_end,
8333 PMAP_OPTIONS_REMOVE);
8334 }
8335
8336 #if DEBUG
8337 /*
8338 * All pmap mappings for this map entry must have been
8339 * cleared by now.
8340 */
8341 assert(pmap_is_empty(map->pmap,
8342 entry->vme_start,
8343 entry->vme_end));
8344 #endif /* DEBUG */
8345
8346 if (entry->iokit_acct) {
8347 /* alternate accounting */
8348 DTRACE_VM4(vm_map_iokit_unmapped_region,
8349 vm_map_t, map,
8350 vm_map_offset_t, entry->vme_start,
8351 vm_map_offset_t, entry->vme_end,
8352 int, VME_ALIAS(entry));
8353 vm_map_iokit_unmapped_region(map,
8354 (entry->vme_end -
8355 entry->vme_start));
8356 entry->iokit_acct = FALSE;
8357 entry->use_pmap = FALSE;
8358 }
8359
8360 s = entry->vme_end;
8361 next = entry->vme_next;
8362 ret.kmr_size += entry->vme_end - entry->vme_start;
8363
8364 if (entry->permanent) {
8365 /*
8366 * A permanent entry cannot be removed, so leave it
8367 * in place but remove all access permissions.
8368 */
8369 entry->protection = VM_PROT_NONE;
8370 entry->max_protection = VM_PROT_NONE;
8371 } else {
8372 vm_map_entry_zap(map, entry, zap_list);
8373 }
8374
8375 entry = next;
8376
8377 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8378 unsigned int last_timestamp = map->timestamp++;
8379
8380 if (lck_rw_lock_yield_exclusive(&map->lock,
8381 LCK_RW_YIELD_ANY_WAITER)) {
8382 if (last_timestamp + 1 != map->timestamp) {
8383 state |= VMDS_NEEDS_LOOKUP;
8384 }
8385 } else {
8386 /* we didn't yield, undo our change */
8387 map->timestamp--;
8388 }
8389 }
8390 }
8391
8392 if (map->wait_for_space) {
8393 thread_wakeup((event_t) map);
8394 }
8395
8396 if (state & VMDS_NEEDS_WAKEUP) {
8397 vm_map_entry_wakeup(map);
8398 }
8399
8400 out:
8401 if ((flags & VM_MAP_REMOVE_RETURN_ERRORS) == 0 && ret.kmr_return) {
8402 __vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
8403 }
8404
8405 if (state & VMDS_FOUND_GAP) {
8406 DTRACE_VM3(kern_vm_deallocate_gap,
8407 vm_map_offset_t, gap_start,
8408 vm_map_offset_t, save_start,
8409 vm_map_offset_t, save_end);
8410 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8411 ret.kmr_return = KERN_INVALID_VALUE;
8412 } else {
8413 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8414 }
8415 }
8416
8417 return ret;
8418 }
8419
8420 kmem_return_t
8421 vm_map_remove_and_unlock(
8422 vm_map_t map,
8423 vm_map_offset_t start,
8424 vm_map_offset_t end,
8425 vmr_flags_t flags,
8426 kmem_guard_t guard)
8427 {
8428 kmem_return_t ret;
8429 VM_MAP_ZAP_DECLARE(zap);
8430
8431 ret = vm_map_delete(map, start, end, flags, guard, &zap);
8432 vm_map_unlock(map);
8433
8434 vm_map_zap_dispose(&zap);
8435
8436 return ret;
8437 }
8438
8439 /*
8440 * vm_map_remove_guard:
8441 *
8442 * Remove the given address range from the target map.
8443 * This is the exported form of vm_map_delete.
8444 */
8445 kmem_return_t
8446 vm_map_remove_guard(
8447 vm_map_t map,
8448 vm_map_offset_t start,
8449 vm_map_offset_t end,
8450 vmr_flags_t flags,
8451 kmem_guard_t guard)
8452 {
8453 vm_map_lock(map);
8454 return vm_map_remove_and_unlock(map, start, end, flags, guard);
8455 }
8456
8457 /*
8458 * vm_map_terminate:
8459 *
8460 * Clean out a task's map.
8461 */
8462 kern_return_t
8463 vm_map_terminate(
8464 vm_map_t map)
8465 {
8466 vm_map_lock(map);
8467 map->terminated = TRUE;
8468 vm_map_disable_hole_optimization(map);
8469 (void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8470 VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
8471 return KERN_SUCCESS;
8472 }
8473
8474
8475 /*
8476 * Routine: vm_map_copy_allocate
8477 *
8478 * Description:
8479 * Allocates and initializes a map copy object.
8480 */
8481 static vm_map_copy_t
8482 vm_map_copy_allocate(void)
8483 {
8484 vm_map_copy_t new_copy;
8485
8486 new_copy = zalloc_flags(vm_map_copy_zone, Z_WAITOK | Z_ZERO);
8487 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8488 vm_map_copy_first_entry(new_copy) = vm_map_copy_to_entry(new_copy);
8489 vm_map_copy_last_entry(new_copy) = vm_map_copy_to_entry(new_copy);
8490 return new_copy;
8491 }
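/*
 * The zone returns a zeroed copy object with an empty entry list;
 * callers fill in the rest.  A representative sketch, along the lines
 * of the "head_copy" extraction in vm_map_copy_overwrite() below:
 *
 *	copy = vm_map_copy_allocate();
 *	copy->type = VM_MAP_COPY_ENTRY_LIST;
 *	copy->cpy_hdr.entries_pageable = ...;
 *	vm_map_store_init(&copy->cpy_hdr);
 *	copy->cpy_hdr.page_shift = ...;
 */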
8492
8493 /*
8494 * Routine: vm_map_copy_discard
8495 *
8496 * Description:
8497 * Dispose of a map copy object (returned by
8498 * vm_map_copyin).
8499 */
8500 void
8501 vm_map_copy_discard(
8502 vm_map_copy_t copy)
8503 {
8504 if (copy == VM_MAP_COPY_NULL) {
8505 return;
8506 }
8507
8508 /*
8509 * Assert that the vm_map_copy is coming from the right
8510 * zone and hasn't been forged
8511 */
8512 vm_map_copy_require(copy);
8513
8514 switch (copy->type) {
8515 case VM_MAP_COPY_ENTRY_LIST:
8516 while (vm_map_copy_first_entry(copy) !=
8517 vm_map_copy_to_entry(copy)) {
8518 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
8519
8520 vm_map_copy_entry_unlink(copy, entry);
8521 if (entry->is_sub_map) {
8522 vm_map_deallocate(VME_SUBMAP(entry));
8523 } else {
8524 vm_object_deallocate(VME_OBJECT(entry));
8525 }
8526 vm_map_copy_entry_dispose(entry);
8527 }
8528 break;
8529 case VM_MAP_COPY_OBJECT:
8530 vm_object_deallocate(copy->cpy_object);
8531 break;
8532 case VM_MAP_COPY_KERNEL_BUFFER:
8533
8534 /*
8535 * The vm_map_copy_t and possibly the data buffer were
8536 * allocated by a single call to kalloc_data(), i.e. the
8537 * vm_map_copy_t was not allocated out of the zone.
8538 */
8539 if (copy->size > msg_ool_size_small || copy->offset) {
8540 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
8541 (long long)copy->size, (long long)copy->offset);
8542 }
8543 kfree_data(copy->cpy_kdata, copy->size);
8544 }
8545 zfree(vm_map_copy_zone, copy);
8546 }
8547
8548 /*
8549 * Routine: vm_map_copy_copy
8550 *
8551 * Description:
8552 * Move the information in a map copy object to
8553 * a new map copy object, leaving the old one
8554 * empty.
8555 *
8556 * This is used by kernel routines that need
8557 * to look at out-of-line data (in copyin form)
8558 * before deciding whether to return SUCCESS.
8559 * If the routine returns FAILURE, the original
8560 * copy object will be deallocated; therefore,
8561 * these routines must make a copy of the copy
8562 * object and leave the original empty so that
8563 * deallocation will not fail.
8564 */
8565 vm_map_copy_t
8566 vm_map_copy_copy(
8567 vm_map_copy_t copy)
8568 {
8569 vm_map_copy_t new_copy;
8570
8571 if (copy == VM_MAP_COPY_NULL) {
8572 return VM_MAP_COPY_NULL;
8573 }
8574
8575 /*
8576 * Assert that the vm_map_copy is coming from the right
8577 * zone and hasn't been forged
8578 */
8579 vm_map_copy_require(copy);
8580
8581 /*
8582 * Allocate a new copy object, and copy the information
8583 * from the old one into it.
8584 */
8585
8586 new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
8587 memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
8588 #if __has_feature(ptrauth_calls)
8589 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
8590 new_copy->cpy_kdata = copy->cpy_kdata;
8591 }
8592 #endif
8593
8594 if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
8595 /*
8596 * The links in the entry chain must be
8597 * changed to point to the new copy object.
8598 */
8599 vm_map_copy_first_entry(copy)->vme_prev
8600 = vm_map_copy_to_entry(new_copy);
8601 vm_map_copy_last_entry(copy)->vme_next
8602 = vm_map_copy_to_entry(new_copy);
8603 }
8604
8605 /*
8606 * Change the old copy object into one that contains
8607 * nothing to be deallocated.
8608 */
8609 copy->type = VM_MAP_COPY_OBJECT;
8610 copy->cpy_object = VM_OBJECT_NULL;
8611
8612 /*
8613 * Return the new object.
8614 */
8615 return new_copy;
8616 }
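/*
 * A minimal sketch of the intended usage described above, where
 * examine_data() stands in for whatever inspection the kernel routine
 * needs to perform before committing:
 *
 *	vm_map_copy_t mine;
 *
 *	mine = vm_map_copy_copy(copy);
 *	(the original "copy" is now empty and safe to deallocate)
 *	if (examine_data(mine) != KERN_SUCCESS) {
 *		vm_map_copy_discard(mine);
 *		return KERN_FAILURE;
 *	}
 */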
8617
8618 static boolean_t
8619 vm_map_entry_is_overwritable(
8620 vm_map_t dst_map __unused,
8621 vm_map_entry_t entry)
8622 {
8623 if (!(entry->protection & VM_PROT_WRITE)) {
8624 /* can't overwrite if not writable */
8625 return FALSE;
8626 }
8627 #if !__x86_64__
8628 if (entry->used_for_jit &&
8629 vm_map_cs_enforcement(dst_map) &&
8630 !dst_map->cs_debugged) {
8631 /*
8632 * Can't overwrite a JIT region while cs_enforced
8633 * and not cs_debugged.
8634 */
8635 return FALSE;
8636 }
8637 #endif /* !__x86_64__ */
8638 return TRUE;
8639 }
8640
8641 static kern_return_t
8642 vm_map_overwrite_submap_recurse(
8643 vm_map_t dst_map,
8644 vm_map_offset_t dst_addr,
8645 vm_map_size_t dst_size)
8646 {
8647 vm_map_offset_t dst_end;
8648 vm_map_entry_t tmp_entry;
8649 vm_map_entry_t entry;
8650 kern_return_t result;
8651 boolean_t encountered_sub_map = FALSE;
8652
8653
8654
8655 /*
8656 * Verify that the destination is all writeable
8657 * initially. We have to trunc the destination
8658 * address and round the copy size or we'll end up
8659 * splitting entries in strange ways.
8660 */
8661
8662 dst_end = vm_map_round_page(dst_addr + dst_size,
8663 VM_MAP_PAGE_MASK(dst_map));
8664 vm_map_lock(dst_map);
8665
8666 start_pass_1:
8667 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
8668 vm_map_unlock(dst_map);
8669 return KERN_INVALID_ADDRESS;
8670 }
8671
8672 vm_map_clip_start(dst_map,
8673 tmp_entry,
8674 vm_map_trunc_page(dst_addr,
8675 VM_MAP_PAGE_MASK(dst_map)));
8676 if (tmp_entry->is_sub_map) {
8677 /* clipping did unnest if needed */
8678 assert(!tmp_entry->use_pmap);
8679 }
8680
8681 for (entry = tmp_entry;;) {
8682 vm_map_entry_t next;
8683
8684 next = entry->vme_next;
8685 while (entry->is_sub_map) {
8686 vm_map_offset_t sub_start;
8687 vm_map_offset_t sub_end;
8688 vm_map_offset_t local_end;
8689
8690 if (entry->in_transition) {
8691 /*
8692 * Say that we are waiting, and wait for entry.
8693 */
8694 entry->needs_wakeup = TRUE;
8695 vm_map_entry_wait(dst_map, THREAD_UNINT);
8696
8697 goto start_pass_1;
8698 }
8699
8700 encountered_sub_map = TRUE;
8701 sub_start = VME_OFFSET(entry);
8702
8703 if (entry->vme_end < dst_end) {
8704 sub_end = entry->vme_end;
8705 } else {
8706 sub_end = dst_end;
8707 }
8708 sub_end -= entry->vme_start;
8709 sub_end += VME_OFFSET(entry);
8710 local_end = entry->vme_end;
8711 vm_map_unlock(dst_map);
8712
8713 result = vm_map_overwrite_submap_recurse(
8714 VME_SUBMAP(entry),
8715 sub_start,
8716 sub_end - sub_start);
8717
8718 if (result != KERN_SUCCESS) {
8719 return result;
8720 }
8721 if (dst_end <= entry->vme_end) {
8722 return KERN_SUCCESS;
8723 }
8724 vm_map_lock(dst_map);
8725 if (!vm_map_lookup_entry(dst_map, local_end,
8726 &tmp_entry)) {
8727 vm_map_unlock(dst_map);
8728 return KERN_INVALID_ADDRESS;
8729 }
8730 entry = tmp_entry;
8731 next = entry->vme_next;
8732 }
8733
8734 if (!(entry->protection & VM_PROT_WRITE)) {
8735 vm_map_unlock(dst_map);
8736 return KERN_PROTECTION_FAILURE;
8737 }
8738
8739 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
8740 vm_map_unlock(dst_map);
8741 return KERN_PROTECTION_FAILURE;
8742 }
8743
8744 /*
8745 * If the entry is in transition, we must wait
8746 * for it to exit that state. Anything could happen
8747 * when we unlock the map, so start over.
8748 */
8749 if (entry->in_transition) {
8750 /*
8751 * Say that we are waiting, and wait for entry.
8752 */
8753 entry->needs_wakeup = TRUE;
8754 vm_map_entry_wait(dst_map, THREAD_UNINT);
8755
8756 goto start_pass_1;
8757 }
8758
8759 /*
8760 * our range is contained completely within this map entry
8761 */
8762 if (dst_end <= entry->vme_end) {
8763 vm_map_unlock(dst_map);
8764 return KERN_SUCCESS;
8765 }
8766 /*
8767 * check that range specified is contiguous region
8768 */
8769 if ((next == vm_map_to_entry(dst_map)) ||
8770 (next->vme_start != entry->vme_end)) {
8771 vm_map_unlock(dst_map);
8772 return KERN_INVALID_ADDRESS;
8773 }
8774
8775 /*
8776 * Check for permanent objects in the destination.
8777 */
8778 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
8779 ((!VME_OBJECT(entry)->internal) ||
8780 (VME_OBJECT(entry)->true_share))) {
8781 if (encountered_sub_map) {
8782 vm_map_unlock(dst_map);
8783 return KERN_FAILURE;
8784 }
8785 }
8786
8787
8788 entry = next;
8789 }/* for */
8790 vm_map_unlock(dst_map);
8791 return KERN_SUCCESS;
8792 }
8793
8794 /*
8795 * Routine: vm_map_copy_overwrite
8796 *
8797 * Description:
8798 * Copy the memory described by the map copy
8799 * object (copy; returned by vm_map_copyin) onto
8800 * the specified destination region (dst_map, dst_addr).
8801 * The destination must be writeable.
8802 *
8803 * Unlike vm_map_copyout, this routine actually
8804 * writes over previously-mapped memory. If the
8805 * previous mapping was to a permanent (user-supplied)
8806 * memory object, it is preserved.
8807 *
8808 * The attributes (protection and inheritance) of the
8809 * destination region are preserved.
8810 *
8811 * If successful, consumes the copy object.
8812 * Otherwise, the caller is responsible for it.
8813 *
8814 * Implementation notes:
8815 * To overwrite aligned temporary virtual memory, it is
8816 * sufficient to remove the previous mapping and insert
8817 * the new copy. This replacement is done either on
8818 * the whole region (if no permanent virtual memory
8819 * objects are embedded in the destination region) or
8820 * in individual map entries.
8821 *
8822 * To overwrite permanent virtual memory, it is necessary
8823 * to copy each page, as the external memory management
8824 * interface currently does not provide any optimizations.
8825 *
8826 * Unaligned memory also has to be copied. It is possible
8827 * to use 'vm_trickery' to copy the aligned data. This is
8828 * not done but not hard to implement.
8829 *
8830 * Once a page of permanent memory has been overwritten,
8831 * it is impossible to interrupt this function; otherwise,
8832 * the call would be neither atomic nor location-independent.
8833 * The kernel-state portion of a user thread must be
8834 * interruptible.
8835 *
8836 * It may be expensive to forward all requests that might
8837 * overwrite permanent memory (vm_write, vm_copy) to
8838 * uninterruptible kernel threads. This routine may be
8839 * called by interruptible threads; however, success is
8840 * not guaranteed -- if the request cannot be performed
8841 * atomically and interruptibly, an error indication is
8842 * returned.
8843 *
8844 * Callers of this function must call vm_map_copy_require on
8845 * previously created vm_map_copy_t or pass a newly created
8846 * one to ensure that it hasn't been forged.
8847 */
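/*
 * The typical pairing, assuming the usual vm_map_copyin() entry point,
 * looks something like:
 *
 *	kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
 *	if (kr == KERN_SUCCESS) {
 *		kr = vm_map_copy_overwrite(dst_map, dst_addr, copy,
 *		    len, interruptible);
 *		if (kr != KERN_SUCCESS) {
 *			vm_map_copy_discard(copy);
 *		}
 *	}
 *
 * i.e. the copy object is consumed on success and remains the caller's
 * responsibility on failure.
 */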
8848
8849 static kern_return_t
8850 vm_map_copy_overwrite_nested(
8851 vm_map_t dst_map,
8852 vm_map_address_t dst_addr,
8853 vm_map_copy_t copy,
8854 boolean_t interruptible,
8855 pmap_t pmap,
8856 boolean_t discard_on_success)
8857 {
8858 vm_map_offset_t dst_end;
8859 vm_map_entry_t tmp_entry;
8860 vm_map_entry_t entry;
8861 kern_return_t kr;
8862 boolean_t aligned = TRUE;
8863 boolean_t contains_permanent_objects = FALSE;
8864 boolean_t encountered_sub_map = FALSE;
8865 vm_map_offset_t base_addr;
8866 vm_map_size_t copy_size;
8867 vm_map_size_t total_size;
8868 uint16_t copy_page_shift;
8869
8870 /*
8871 * Check for special kernel buffer allocated
8872 * by new_ipc_kmsg_copyin.
8873 */
8874
8875 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
8876 return vm_map_copyout_kernel_buffer(
8877 dst_map, &dst_addr,
8878 copy, copy->size, TRUE, discard_on_success);
8879 }
8880
8881 /*
8882 * Only works for entry lists at the moment. Will
8883 * support page lists later.
8884 */
8885
8886 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
8887
8888 if (copy->size == 0) {
8889 if (discard_on_success) {
8890 vm_map_copy_discard(copy);
8891 }
8892 return KERN_SUCCESS;
8893 }
8894
8895 copy_page_shift = copy->cpy_hdr.page_shift;
8896
8897 /*
8898 * Verify that the destination is all writeable
8899 * initially. We have to trunc the destination
8900 * address and round the copy size or we'll end up
8901 * splitting entries in strange ways.
8902 */
8903
8904 if (!VM_MAP_PAGE_ALIGNED(copy->size,
8905 VM_MAP_PAGE_MASK(dst_map)) ||
8906 !VM_MAP_PAGE_ALIGNED(copy->offset,
8907 VM_MAP_PAGE_MASK(dst_map)) ||
8908 !VM_MAP_PAGE_ALIGNED(dst_addr,
8909 VM_MAP_PAGE_MASK(dst_map)) ||
8910 copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
8911 aligned = FALSE;
8912 dst_end = vm_map_round_page(dst_addr + copy->size,
8913 VM_MAP_PAGE_MASK(dst_map));
8914 } else {
8915 dst_end = dst_addr + copy->size;
8916 }
8917
8918 vm_map_lock(dst_map);
8919
8920 /* LP64todo - remove this check when vm_map_commpage64()
8921 * no longer has to stuff in a map_entry for the commpage
8922 * above the map's max_offset.
8923 */
8924 if (dst_addr >= dst_map->max_offset) {
8925 vm_map_unlock(dst_map);
8926 return KERN_INVALID_ADDRESS;
8927 }
8928
8929 start_pass_1:
8930 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
8931 vm_map_unlock(dst_map);
8932 return KERN_INVALID_ADDRESS;
8933 }
8934 vm_map_clip_start(dst_map,
8935 tmp_entry,
8936 vm_map_trunc_page(dst_addr,
8937 VM_MAP_PAGE_MASK(dst_map)));
8938 for (entry = tmp_entry;;) {
8939 vm_map_entry_t next = entry->vme_next;
8940
8941 while (entry->is_sub_map) {
8942 vm_map_offset_t sub_start;
8943 vm_map_offset_t sub_end;
8944 vm_map_offset_t local_end;
8945
8946 if (entry->in_transition) {
8947 /*
8948 * Say that we are waiting, and wait for entry.
8949 */
8950 entry->needs_wakeup = TRUE;
8951 vm_map_entry_wait(dst_map, THREAD_UNINT);
8952
8953 goto start_pass_1;
8954 }
8955
8956 local_end = entry->vme_end;
8957 if (!(entry->needs_copy)) {
8958 /* if needs_copy we are a COW submap */
8959 /* in such a case we just replace so */
8960 /* there is no need for the follow- */
8961 /* ing check. */
8962 encountered_sub_map = TRUE;
8963 sub_start = VME_OFFSET(entry);
8964
8965 if (entry->vme_end < dst_end) {
8966 sub_end = entry->vme_end;
8967 } else {
8968 sub_end = dst_end;
8969 }
8970 sub_end -= entry->vme_start;
8971 sub_end += VME_OFFSET(entry);
8972 vm_map_unlock(dst_map);
8973
8974 kr = vm_map_overwrite_submap_recurse(
8975 VME_SUBMAP(entry),
8976 sub_start,
8977 sub_end - sub_start);
8978 if (kr != KERN_SUCCESS) {
8979 return kr;
8980 }
8981 vm_map_lock(dst_map);
8982 }
8983
8984 if (dst_end <= entry->vme_end) {
8985 goto start_overwrite;
8986 }
8987 if (!vm_map_lookup_entry(dst_map, local_end,
8988 &entry)) {
8989 vm_map_unlock(dst_map);
8990 return KERN_INVALID_ADDRESS;
8991 }
8992 next = entry->vme_next;
8993 }
8994
8995 if (!(entry->protection & VM_PROT_WRITE)) {
8996 vm_map_unlock(dst_map);
8997 return KERN_PROTECTION_FAILURE;
8998 }
8999
9000 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9001 vm_map_unlock(dst_map);
9002 return KERN_PROTECTION_FAILURE;
9003 }
9004
9005 /*
9006 * If the entry is in transition, we must wait
9007 * for it to exit that state. Anything could happen
9008 * when we unlock the map, so start over.
9009 */
9010 if (entry->in_transition) {
9011 /*
9012 * Say that we are waiting, and wait for entry.
9013 */
9014 entry->needs_wakeup = TRUE;
9015 vm_map_entry_wait(dst_map, THREAD_UNINT);
9016
9017 goto start_pass_1;
9018 }
9019
9020 /*
9021 * our range is contained completely within this map entry
9022 */
9023 if (dst_end <= entry->vme_end) {
9024 break;
9025 }
9026 /*
9027 * check that range specified is contiguous region
9028 */
9029 if ((next == vm_map_to_entry(dst_map)) ||
9030 (next->vme_start != entry->vme_end)) {
9031 vm_map_unlock(dst_map);
9032 return KERN_INVALID_ADDRESS;
9033 }
9034
9035
9036 /*
9037 * Check for permanent objects in the destination.
9038 */
9039 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9040 ((!VME_OBJECT(entry)->internal) ||
9041 (VME_OBJECT(entry)->true_share))) {
9042 contains_permanent_objects = TRUE;
9043 }
9044
9045 entry = next;
9046 }/* for */
9047
9048 start_overwrite:
9049 /*
9050 * If there are permanent objects in the destination, then
9051 * the copy cannot be interrupted.
9052 */
9053
9054 if (interruptible && contains_permanent_objects) {
9055 vm_map_unlock(dst_map);
9056 return KERN_FAILURE; /* XXX */
9057 }
9058
9059 /*
9060 *
9061 * Make a second pass, overwriting the data.
9062 * At the beginning of each loop iteration,
9063 * the next entry to be overwritten is "tmp_entry"
9064 * (initially, the value returned from the lookup above),
9065 * and the starting address expected in that entry
9066 * is "start".
9067 */
9068
9069 total_size = copy->size;
9070 if (encountered_sub_map) {
9071 copy_size = 0;
9072 /* re-calculate tmp_entry since we've had the map */
9073 /* unlocked */
9074 if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
9075 vm_map_unlock(dst_map);
9076 return KERN_INVALID_ADDRESS;
9077 }
9078 } else {
9079 copy_size = copy->size;
9080 }
9081
9082 base_addr = dst_addr;
9083 while (TRUE) {
9084 /* deconstruct the copy object and do in parts */
9085 /* only in sub_map, interruptible case */
9086 vm_map_entry_t copy_entry;
9087 vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL;
9088 vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL;
9089 int nentries;
9090 int remaining_entries = 0;
9091 vm_map_offset_t new_offset = 0;
9092
9093 for (entry = tmp_entry; copy_size == 0;) {
9094 vm_map_entry_t next;
9095
9096 next = entry->vme_next;
9097
9098 /* tmp_entry and base address are moved along */
9099 /* each time we encounter a sub-map. Otherwise */
9100 /* entry can outpace tmp_entry, and the copy_size */
9101 /* may reflect the distance between them. */
9102 /* If the current entry is found to be in transition */
9103 /* we will start over at the beginning or the last */
9104 /* encountered submap, as dictated by base_addr, */
9105 /* and we will zero copy_size accordingly. */
9106 if (entry->in_transition) {
9107 /*
9108 * Say that we are waiting, and wait for entry.
9109 */
9110 entry->needs_wakeup = TRUE;
9111 vm_map_entry_wait(dst_map, THREAD_UNINT);
9112
9113 if (!vm_map_lookup_entry(dst_map, base_addr,
9114 &tmp_entry)) {
9115 vm_map_unlock(dst_map);
9116 return KERN_INVALID_ADDRESS;
9117 }
9118 copy_size = 0;
9119 entry = tmp_entry;
9120 continue;
9121 }
9122 if (entry->is_sub_map) {
9123 vm_map_offset_t sub_start;
9124 vm_map_offset_t sub_end;
9125 vm_map_offset_t local_end;
9126
9127 if (entry->needs_copy) {
9128 /* if this is a COW submap */
9129 /* just back the range with an */
9130 /* anonymous entry */
9131 if (entry->vme_end < dst_end) {
9132 sub_end = entry->vme_end;
9133 } else {
9134 sub_end = dst_end;
9135 }
9136 if (entry->vme_start < base_addr) {
9137 sub_start = base_addr;
9138 } else {
9139 sub_start = entry->vme_start;
9140 }
9141 vm_map_clip_end(
9142 dst_map, entry, sub_end);
9143 vm_map_clip_start(
9144 dst_map, entry, sub_start);
9145 assert(!entry->use_pmap);
9146 assert(!entry->iokit_acct);
9147 entry->use_pmap = TRUE;
9148 vm_map_deallocate(VME_SUBMAP(entry));
9149 VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, 0);
9150 VME_OFFSET_SET(entry, 0);
9151 entry->is_shared = FALSE;
9152 entry->needs_copy = FALSE;
9153 entry->protection = VM_PROT_DEFAULT;
9154 entry->max_protection = VM_PROT_ALL;
9155 entry->wired_count = 0;
9156 entry->user_wired_count = 0;
9157 if (entry->inheritance
9158 == VM_INHERIT_SHARE) {
9159 entry->inheritance = VM_INHERIT_COPY;
9160 }
9161 continue;
9162 }
9163 /* first take care of any non-sub_map */
9164 /* entries to send */
9165 if (base_addr < entry->vme_start) {
9166 /* stuff to send */
9167 copy_size =
9168 entry->vme_start - base_addr;
9169 break;
9170 }
9171 sub_start = VME_OFFSET(entry);
9172
9173 if (entry->vme_end < dst_end) {
9174 sub_end = entry->vme_end;
9175 } else {
9176 sub_end = dst_end;
9177 }
9178 sub_end -= entry->vme_start;
9179 sub_end += VME_OFFSET(entry);
9180 local_end = entry->vme_end;
9181 vm_map_unlock(dst_map);
9182 copy_size = sub_end - sub_start;
9183
9184 /* adjust the copy object */
9185 if (total_size > copy_size) {
9186 vm_map_size_t local_size = 0;
9187 vm_map_size_t entry_size;
9188
9189 nentries = 1;
9190 new_offset = copy->offset;
9191 copy_entry = vm_map_copy_first_entry(copy);
9192 while (copy_entry !=
9193 vm_map_copy_to_entry(copy)) {
9194 entry_size = copy_entry->vme_end -
9195 copy_entry->vme_start;
9196 if ((local_size < copy_size) &&
9197 ((local_size + entry_size)
9198 >= copy_size)) {
9199 vm_map_copy_clip_end(copy,
9200 copy_entry,
9201 copy_entry->vme_start +
9202 (copy_size - local_size));
9203 entry_size = copy_entry->vme_end -
9204 copy_entry->vme_start;
9205 local_size += entry_size;
9206 new_offset += entry_size;
9207 }
9208 if (local_size >= copy_size) {
9209 next_copy = copy_entry->vme_next;
9210 copy_entry->vme_next =
9211 vm_map_copy_to_entry(copy);
9212 previous_prev =
9213 copy->cpy_hdr.links.prev;
9214 copy->cpy_hdr.links.prev = copy_entry;
9215 copy->size = copy_size;
9216 remaining_entries =
9217 copy->cpy_hdr.nentries;
9218 remaining_entries -= nentries;
9219 copy->cpy_hdr.nentries = nentries;
9220 break;
9221 } else {
9222 local_size += entry_size;
9223 new_offset += entry_size;
9224 nentries++;
9225 }
9226 copy_entry = copy_entry->vme_next;
9227 }
9228 }
9229
9230 if ((entry->use_pmap) && (pmap == NULL)) {
9231 kr = vm_map_copy_overwrite_nested(
9232 VME_SUBMAP(entry),
9233 sub_start,
9234 copy,
9235 interruptible,
9236 VME_SUBMAP(entry)->pmap,
9237 TRUE);
9238 } else if (pmap != NULL) {
9239 kr = vm_map_copy_overwrite_nested(
9240 VME_SUBMAP(entry),
9241 sub_start,
9242 copy,
9243 interruptible, pmap,
9244 TRUE);
9245 } else {
9246 kr = vm_map_copy_overwrite_nested(
9247 VME_SUBMAP(entry),
9248 sub_start,
9249 copy,
9250 interruptible,
9251 dst_map->pmap,
9252 TRUE);
9253 }
9254 if (kr != KERN_SUCCESS) {
9255 if (next_copy != NULL) {
9256 copy->cpy_hdr.nentries +=
9257 remaining_entries;
9258 copy->cpy_hdr.links.prev->vme_next =
9259 next_copy;
9260 copy->cpy_hdr.links.prev
9261 = previous_prev;
9262 copy->size = total_size;
9263 }
9264 return kr;
9265 }
9266 if (dst_end <= local_end) {
9267 return KERN_SUCCESS;
9268 }
9269 /* otherwise copy no longer exists, it was */
9270 /* destroyed after successful copy_overwrite */
9271 copy = vm_map_copy_allocate();
9272 copy->type = VM_MAP_COPY_ENTRY_LIST;
9273 copy->offset = new_offset;
9274 copy->cpy_hdr.page_shift = copy_page_shift;
9275
9276 /*
9277 * XXX FBDP
9278 * this does not seem to deal with
9279 * the VM map store (R&B tree)
9280 */
9281
9282 total_size -= copy_size;
9283 copy_size = 0;
9284 /* put back remainder of copy in container */
9285 if (next_copy != NULL) {
9286 copy->cpy_hdr.nentries = remaining_entries;
9287 copy->cpy_hdr.links.next = next_copy;
9288 copy->cpy_hdr.links.prev = previous_prev;
9289 copy->size = total_size;
9290 next_copy->vme_prev =
9291 vm_map_copy_to_entry(copy);
9292 next_copy = NULL;
9293 }
9294 base_addr = local_end;
9295 vm_map_lock(dst_map);
9296 if (!vm_map_lookup_entry(dst_map,
9297 local_end, &tmp_entry)) {
9298 vm_map_unlock(dst_map);
9299 return KERN_INVALID_ADDRESS;
9300 }
9301 entry = tmp_entry;
9302 continue;
9303 }
9304 if (dst_end <= entry->vme_end) {
9305 copy_size = dst_end - base_addr;
9306 break;
9307 }
9308
9309 if ((next == vm_map_to_entry(dst_map)) ||
9310 (next->vme_start != entry->vme_end)) {
9311 vm_map_unlock(dst_map);
9312 return KERN_INVALID_ADDRESS;
9313 }
9314
9315 entry = next;
9316 }/* for */
9317
9318 next_copy = NULL;
9319 nentries = 1;
9320
9321 /* adjust the copy object */
9322 if (total_size > copy_size) {
9323 vm_map_size_t local_size = 0;
9324 vm_map_size_t entry_size;
9325
9326 new_offset = copy->offset;
9327 copy_entry = vm_map_copy_first_entry(copy);
9328 while (copy_entry != vm_map_copy_to_entry(copy)) {
9329 entry_size = copy_entry->vme_end -
9330 copy_entry->vme_start;
9331 if ((local_size < copy_size) &&
9332 ((local_size + entry_size)
9333 >= copy_size)) {
9334 vm_map_copy_clip_end(copy, copy_entry,
9335 copy_entry->vme_start +
9336 (copy_size - local_size));
9337 entry_size = copy_entry->vme_end -
9338 copy_entry->vme_start;
9339 local_size += entry_size;
9340 new_offset += entry_size;
9341 }
9342 if (local_size >= copy_size) {
9343 next_copy = copy_entry->vme_next;
9344 copy_entry->vme_next =
9345 vm_map_copy_to_entry(copy);
9346 previous_prev =
9347 copy->cpy_hdr.links.prev;
9348 copy->cpy_hdr.links.prev = copy_entry;
9349 copy->size = copy_size;
9350 remaining_entries =
9351 copy->cpy_hdr.nentries;
9352 remaining_entries -= nentries;
9353 copy->cpy_hdr.nentries = nentries;
9354 break;
9355 } else {
9356 local_size += entry_size;
9357 new_offset += entry_size;
9358 nentries++;
9359 }
9360 copy_entry = copy_entry->vme_next;
9361 }
9362 }
9363
9364 if (aligned) {
9365 pmap_t local_pmap;
9366
9367 if (pmap) {
9368 local_pmap = pmap;
9369 } else {
9370 local_pmap = dst_map->pmap;
9371 }
9372
9373 if ((kr = vm_map_copy_overwrite_aligned(
9374 dst_map, tmp_entry, copy,
9375 base_addr, local_pmap)) != KERN_SUCCESS) {
9376 if (next_copy != NULL) {
9377 copy->cpy_hdr.nentries +=
9378 remaining_entries;
9379 copy->cpy_hdr.links.prev->vme_next =
9380 next_copy;
9381 copy->cpy_hdr.links.prev =
9382 previous_prev;
9383 copy->size += copy_size;
9384 }
9385 return kr;
9386 }
9387 vm_map_unlock(dst_map);
9388 } else {
9389 /*
9390 * Performance gain:
9391 *
9392 * if the copy and dst address are misaligned but the same
9393 * offset within the page we can copy_not_aligned the
9394 * misaligned parts and copy aligned the rest. If they are
9395 * aligned but len is unaligned we simply need to copy
9396 * the end bit unaligned. We'll need to split the misaligned
9397 * bits of the region in this case !
9398 */
9399 /* ALWAYS UNLOCKS THE dst_map MAP */
9400 kr = vm_map_copy_overwrite_unaligned(
9401 dst_map,
9402 tmp_entry,
9403 copy,
9404 base_addr,
9405 discard_on_success);
9406 if (kr != KERN_SUCCESS) {
9407 if (next_copy != NULL) {
9408 copy->cpy_hdr.nentries +=
9409 remaining_entries;
9410 copy->cpy_hdr.links.prev->vme_next =
9411 next_copy;
9412 copy->cpy_hdr.links.prev =
9413 previous_prev;
9414 copy->size += copy_size;
9415 }
9416 return kr;
9417 }
9418 }
9419 total_size -= copy_size;
9420 if (total_size == 0) {
9421 break;
9422 }
9423 base_addr += copy_size;
9424 copy_size = 0;
9425 copy->offset = new_offset;
9426 if (next_copy != NULL) {
9427 copy->cpy_hdr.nentries = remaining_entries;
9428 copy->cpy_hdr.links.next = next_copy;
9429 copy->cpy_hdr.links.prev = previous_prev;
9430 next_copy->vme_prev = vm_map_copy_to_entry(copy);
9431 copy->size = total_size;
9432 }
9433 vm_map_lock(dst_map);
9434 while (TRUE) {
9435 if (!vm_map_lookup_entry(dst_map,
9436 base_addr, &tmp_entry)) {
9437 vm_map_unlock(dst_map);
9438 return KERN_INVALID_ADDRESS;
9439 }
9440 if (tmp_entry->in_transition) {
9441 entry->needs_wakeup = TRUE;
9442 vm_map_entry_wait(dst_map, THREAD_UNINT);
9443 } else {
9444 break;
9445 }
9446 }
9447 vm_map_clip_start(dst_map,
9448 tmp_entry,
9449 vm_map_trunc_page(base_addr,
9450 VM_MAP_PAGE_MASK(dst_map)));
9451
9452 entry = tmp_entry;
9453 } /* while */
9454
9455 /*
9456 * Throw away the vm_map_copy object
9457 */
9458 if (discard_on_success) {
9459 vm_map_copy_discard(copy);
9460 }
9461
9462 return KERN_SUCCESS;
9463 }/* vm_map_copy_overwrite */
9464
9465 kern_return_t
9466 vm_map_copy_overwrite(
9467 vm_map_t dst_map,
9468 vm_map_offset_t dst_addr,
9469 vm_map_copy_t copy,
9470 vm_map_size_t copy_size,
9471 boolean_t interruptible)
9472 {
9473 vm_map_size_t head_size, tail_size;
9474 vm_map_copy_t head_copy, tail_copy;
9475 vm_map_offset_t head_addr, tail_addr;
9476 vm_map_entry_t entry;
9477 kern_return_t kr;
9478 vm_map_offset_t effective_page_mask, effective_page_size;
9479 uint16_t copy_page_shift;
9480
9481 head_size = 0;
9482 tail_size = 0;
9483 head_copy = NULL;
9484 tail_copy = NULL;
9485 head_addr = 0;
9486 tail_addr = 0;
9487
9488 /*
9489 * Check for null copy object.
9490 */
9491 if (copy == VM_MAP_COPY_NULL) {
9492 return KERN_SUCCESS;
9493 }
9494
9495 /*
9496 * Assert that the vm_map_copy is coming from the right
9497 * zone and hasn't been forged
9498 */
9499 vm_map_copy_require(copy);
9500
9501 if (interruptible ||
9502 copy->type != VM_MAP_COPY_ENTRY_LIST) {
9503 /*
9504 * We can't split the "copy" map if we're interruptible
9505 * or if we don't have a "copy" map...
9506 */
9507 blunt_copy:
9508 return vm_map_copy_overwrite_nested(dst_map,
9509 dst_addr,
9510 copy,
9511 interruptible,
9512 (pmap_t) NULL,
9513 TRUE);
9514 }
9515
9516 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
9517 if (copy_page_shift < PAGE_SHIFT ||
9518 VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
9519 goto blunt_copy;
9520 }
9521
9522 if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
9523 effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
9524 } else {
9525 effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
9526 effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
9527 effective_page_mask);
9528 }
9529 effective_page_size = effective_page_mask + 1;
9530
9531 if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
9532 /*
9533 * Too small to bother with optimizing...
9534 */
9535 goto blunt_copy;
9536 }
9537
9538 if ((dst_addr & effective_page_mask) !=
9539 (copy->offset & effective_page_mask)) {
9540 /*
9541 * Incompatible mis-alignment of source and destination...
9542 */
9543 goto blunt_copy;
9544 }
9545
9546 /*
9547 * Proper alignment or identical mis-alignment at the beginning.
9548 * Let's try and do a small unaligned copy first (if needed)
9549 * and then an aligned copy for the rest.
9550 */
9551 if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
9552 head_addr = dst_addr;
9553 head_size = (effective_page_size -
9554 (copy->offset & effective_page_mask));
9555 head_size = MIN(head_size, copy_size);
9556 }
9557 if (!vm_map_page_aligned(copy->offset + copy_size,
9558 effective_page_mask)) {
9559 /*
9560 * Mis-alignment at the end.
9561 * Do an aligned copy up to the last page and
9562 * then an unaligned copy for the remaining bytes.
9563 */
9564 tail_size = ((copy->offset + copy_size) &
9565 effective_page_mask);
9566 tail_size = MIN(tail_size, copy_size);
9567 tail_addr = dst_addr + copy_size - tail_size;
9568 assert(tail_addr >= head_addr + head_size);
9569 }
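/*
 * For illustration, with 16K effective pages (mask 0x3fff), a copy
 * with copy->offset == 0x6000, dst_addr == 0x102000 and copy_size ==
 * 0x10000 (assuming the size clears the optimization threshold above)
 * splits as:
 *
 *	head:  [0x102000, 0x104000)   head_size == 0x2000  (unaligned)
 *	body:  [0x104000, 0x110000)   three aligned 16K pages
 *	tail:  [0x110000, 0x112000)   tail_size == 0x2000  (unaligned)
 */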
9570 assert(head_size + tail_size <= copy_size);
9571
9572 if (head_size + tail_size == copy_size) {
9573 /*
9574 * It's all unaligned, no optimization possible...
9575 */
9576 goto blunt_copy;
9577 }
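/*
 * Worked example of the decomposition above (illustrative numbers only,
 * assuming effective_page_size == 0x4000, i.e. 16K pages, and a copy
 * assumed here to clear the optimization threshold):
 *
 *	dst_addr     == 0x10003000
 *	copy->offset == 0x20003000   (same mis-alignment: 0x3000)
 *	copy_size    == 0x41800
 *
 *	head_addr == 0x10003000, head_size == 0x4000 - 0x3000 == 0x1000
 *	tail_size == (0x20003000 + 0x41800) & 0x3fff == 0x0800
 *	tail_addr == 0x10003000 + 0x41800 - 0x0800 == 0x10044000
 *
 * leaving an aligned middle copy of 0x40000 bytes at 0x10004000.
 */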
9578
9579 /*
9580 * Can't optimize if there are any submaps in the
9581 * destination due to the way we free the "copy" map
9582 * progressively in vm_map_copy_overwrite_nested()
9583 * in that case.
9584 */
9585 vm_map_lock_read(dst_map);
9586 if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
9587 vm_map_unlock_read(dst_map);
9588 goto blunt_copy;
9589 }
9590 for (;
9591 (entry != vm_map_copy_to_entry(copy) &&
9592 entry->vme_start < dst_addr + copy_size);
9593 entry = entry->vme_next) {
9594 if (entry->is_sub_map) {
9595 vm_map_unlock_read(dst_map);
9596 goto blunt_copy;
9597 }
9598 }
9599 vm_map_unlock_read(dst_map);
9600
9601 if (head_size) {
9602 /*
9603 * Unaligned copy of the first "head_size" bytes, to reach
9604 * a page boundary.
9605 */
9606
9607 /*
9608 * Extract "head_copy" out of "copy".
9609 */
9610 head_copy = vm_map_copy_allocate();
9611 head_copy->type = VM_MAP_COPY_ENTRY_LIST;
9612 head_copy->cpy_hdr.entries_pageable =
9613 copy->cpy_hdr.entries_pageable;
9614 vm_map_store_init(&head_copy->cpy_hdr);
9615 head_copy->cpy_hdr.page_shift = copy_page_shift;
9616
9617 entry = vm_map_copy_first_entry(copy);
9618 if (entry->vme_end < copy->offset + head_size) {
9619 head_size = entry->vme_end - copy->offset;
9620 }
9621
9622 head_copy->offset = copy->offset;
9623 head_copy->size = head_size;
9624 copy->offset += head_size;
9625 copy->size -= head_size;
9626 copy_size -= head_size;
9627 assert(copy_size > 0);
9628
9629 vm_map_copy_clip_end(copy, entry, copy->offset);
9630 vm_map_copy_entry_unlink(copy, entry);
9631 vm_map_copy_entry_link(head_copy,
9632 vm_map_copy_to_entry(head_copy),
9633 entry);
9634
9635 /*
9636 * Do the unaligned copy.
9637 */
9638 kr = vm_map_copy_overwrite_nested(dst_map,
9639 head_addr,
9640 head_copy,
9641 interruptible,
9642 (pmap_t) NULL,
9643 FALSE);
9644 if (kr != KERN_SUCCESS) {
9645 goto done;
9646 }
9647 }
9648
9649 if (tail_size) {
9650 /*
9651 * Extract "tail_copy" out of "copy".
9652 */
9653 tail_copy = vm_map_copy_allocate();
9654 tail_copy->type = VM_MAP_COPY_ENTRY_LIST;
9655 tail_copy->cpy_hdr.entries_pageable =
9656 copy->cpy_hdr.entries_pageable;
9657 vm_map_store_init(&tail_copy->cpy_hdr);
9658 tail_copy->cpy_hdr.page_shift = copy_page_shift;
9659
9660 tail_copy->offset = copy->offset + copy_size - tail_size;
9661 tail_copy->size = tail_size;
9662
9663 copy->size -= tail_size;
9664 copy_size -= tail_size;
9665 assert(copy_size > 0);
9666
9667 entry = vm_map_copy_last_entry(copy);
9668 vm_map_copy_clip_start(copy, entry, tail_copy->offset);
9669 entry = vm_map_copy_last_entry(copy);
9670 vm_map_copy_entry_unlink(copy, entry);
9671 vm_map_copy_entry_link(tail_copy,
9672 vm_map_copy_last_entry(tail_copy),
9673 entry);
9674 }
9675
9676 /*
9677 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
9678 * we want to avoid TOCTOU issues w.r.t copy->size but
9679 * we don't need to change vm_map_copy_overwrite_nested()
9680 * and all other vm_map_copy_overwrite variants.
9681 *
9682 * So we assign the original copy_size that was passed into
9683 * this routine back to copy.
9684 *
9685 * This use of local 'copy_size' passed into this routine is
9686 * to try and protect against TOCTOU attacks where the kernel
9687 * has been exploited. We don't expect this to be an issue
9688 * during normal system operation.
9689 */
9690 assertf(copy->size == copy_size,
9691 "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
9692 copy->size = copy_size;
9693
9694 /*
9695 * Copy most (or possibly all) of the data.
9696 */
9697 kr = vm_map_copy_overwrite_nested(dst_map,
9698 dst_addr + head_size,
9699 copy,
9700 interruptible,
9701 (pmap_t) NULL,
9702 FALSE);
9703 if (kr != KERN_SUCCESS) {
9704 goto done;
9705 }
9706
9707 if (tail_size) {
9708 kr = vm_map_copy_overwrite_nested(dst_map,
9709 tail_addr,
9710 tail_copy,
9711 interruptible,
9712 (pmap_t) NULL,
9713 FALSE);
9714 }
9715
9716 done:
9717 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9718 if (kr == KERN_SUCCESS) {
9719 /*
9720 * Discard all the copy maps.
9721 */
9722 if (head_copy) {
9723 vm_map_copy_discard(head_copy);
9724 head_copy = NULL;
9725 }
9726 vm_map_copy_discard(copy);
9727 if (tail_copy) {
9728 vm_map_copy_discard(tail_copy);
9729 tail_copy = NULL;
9730 }
9731 } else {
9732 /*
9733 * Re-assemble the original copy map.
9734 */
9735 if (head_copy) {
9736 entry = vm_map_copy_first_entry(head_copy);
9737 vm_map_copy_entry_unlink(head_copy, entry);
9738 vm_map_copy_entry_link(copy,
9739 vm_map_copy_to_entry(copy),
9740 entry);
9741 copy->offset -= head_size;
9742 copy->size += head_size;
9743 vm_map_copy_discard(head_copy);
9744 head_copy = NULL;
9745 }
9746 if (tail_copy) {
9747 entry = vm_map_copy_last_entry(tail_copy);
9748 vm_map_copy_entry_unlink(tail_copy, entry);
9749 vm_map_copy_entry_link(copy,
9750 vm_map_copy_last_entry(copy),
9751 entry);
9752 copy->size += tail_size;
9753 vm_map_copy_discard(tail_copy);
9754 tail_copy = NULL;
9755 }
9756 }
9757 return kr;
9758 }
9759
9760
9761 /*
9762 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
9763 *
9764 * Description:
9765 * Physically copy unaligned data
9766 *
9767 * Implementation:
9768 * Unaligned parts of pages have to be physically copied. We use
9769 * a modified form of vm_fault_copy (which understands non-aligned
9770 * page offsets and sizes) to do the copy. We attempt to copy as
9771 * much memory in one go as possible; however, vm_fault_copy copies
9772 * within one memory object, so we have to take the smallest of "amount
9773 * left", "source object data size" and "target object data size". With
9774 * unaligned data we don't need to split regions, so the source
9775 * (copy) object should be one map entry; the target range may be
9776 * split over multiple map entries, however. In any event we are
9777 * pessimistic about these assumptions.
9778 *
9779 * Callers of this function must call vm_map_copy_require on
9780 * previously created vm_map_copy_t or pass a newly created
9781 * one to ensure that it hasn't been forged.
9782 *
9783 * Assumptions:
9784 * dst_map is locked on entry and is return locked on success,
9785 * unlocked on error.
9786 */
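/*
 * Per-iteration sizing, in effect (a summary sketch of the loop below,
 * not additional code):
 *
 *	dst_size  = entry->vme_end - start;
 *	src_size  = copy_entry->vme_end -
 *	            (copy_entry->vme_start + src_offset);
 *	copy_size = MIN(amount_left, MIN(dst_size, src_size));
 *
 * so each vm_fault_copy() call stays within a single source copy entry
 * and a single destination map entry.
 */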
9787
9788 static kern_return_t
9789 vm_map_copy_overwrite_unaligned(
9790 vm_map_t dst_map,
9791 vm_map_entry_t entry,
9792 vm_map_copy_t copy,
9793 vm_map_offset_t start,
9794 boolean_t discard_on_success)
9795 {
9796 vm_map_entry_t copy_entry;
9797 vm_map_entry_t copy_entry_next;
9798 vm_map_version_t version;
9799 vm_object_t dst_object;
9800 vm_object_offset_t dst_offset;
9801 vm_object_offset_t src_offset;
9802 vm_object_offset_t entry_offset;
9803 vm_map_offset_t entry_end;
9804 vm_map_size_t src_size,
9805 dst_size,
9806 copy_size,
9807 amount_left;
9808 kern_return_t kr = KERN_SUCCESS;
9809
9810
9811 copy_entry = vm_map_copy_first_entry(copy);
9812
9813 vm_map_lock_write_to_read(dst_map);
9814
9815 src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
9816 amount_left = copy->size;
9817 /*
9818 * Unaligned, so we never clipped this entry; we need the offset into
9819 * the vm_object, not just the data.
9820 */
9821 while (amount_left > 0) {
9822 if (entry == vm_map_to_entry(dst_map)) {
9823 vm_map_unlock_read(dst_map);
9824 return KERN_INVALID_ADDRESS;
9825 }
9826
9827 /* "start" must be within the current map entry */
9828 assert((start >= entry->vme_start) && (start < entry->vme_end));
9829
9830 dst_offset = start - entry->vme_start;
9831
9832 dst_size = entry->vme_end - start;
9833
9834 src_size = copy_entry->vme_end -
9835 (copy_entry->vme_start + src_offset);
9836
9837 if (dst_size < src_size) {
9838 /*
9839 * we can only copy dst_size bytes before
9840 * we have to get the next destination entry
9841 */
9842 copy_size = dst_size;
9843 } else {
9844 /*
9845 * we can only copy src_size bytes before
9846 * we have to get the next source copy entry
9847 */
9848 copy_size = src_size;
9849 }
9850
9851 if (copy_size > amount_left) {
9852 copy_size = amount_left;
9853 }
9854 /*
9855 * Entry needs copy: create a shadow object for the
9856 * copy-on-write region.
9857 */
9858 if (entry->needs_copy &&
9859 ((entry->protection & VM_PROT_WRITE) != 0)) {
9860 if (vm_map_lock_read_to_write(dst_map)) {
9861 vm_map_lock_read(dst_map);
9862 goto RetryLookup;
9863 }
9864 VME_OBJECT_SHADOW(entry,
9865 (vm_map_size_t)(entry->vme_end
9866 - entry->vme_start));
9867 entry->needs_copy = FALSE;
9868 vm_map_lock_write_to_read(dst_map);
9869 }
9870 dst_object = VME_OBJECT(entry);
9871 /*
9872 * Unlike with the virtual (aligned) copy, we're going
9873 * to fault on it, therefore we need a target object.
9874 */
9875 if (dst_object == VM_OBJECT_NULL) {
9876 if (vm_map_lock_read_to_write(dst_map)) {
9877 vm_map_lock_read(dst_map);
9878 goto RetryLookup;
9879 }
9880 dst_object = vm_object_allocate((vm_map_size_t)
9881 entry->vme_end - entry->vme_start);
9882 VME_OBJECT_SET(entry, dst_object, false, 0);
9883 VME_OFFSET_SET(entry, 0);
9884 assert(entry->use_pmap);
9885 vm_map_lock_write_to_read(dst_map);
9886 }
9887 /*
9888 * Take an object reference and unlock map. The "entry" may
9889 * disappear or change when the map is unlocked.
9890 */
9891 vm_object_reference(dst_object);
9892 version.main_timestamp = dst_map->timestamp;
9893 entry_offset = VME_OFFSET(entry);
9894 entry_end = entry->vme_end;
9895 vm_map_unlock_read(dst_map);
9896 /*
9897 * Copy as much as possible in one pass
9898 */
9899 kr = vm_fault_copy(
9900 VME_OBJECT(copy_entry),
9901 VME_OFFSET(copy_entry) + src_offset,
9902 &copy_size,
9903 dst_object,
9904 entry_offset + dst_offset,
9905 dst_map,
9906 &version,
9907 THREAD_UNINT );
9908
9909 start += copy_size;
9910 src_offset += copy_size;
9911 amount_left -= copy_size;
9912 /*
9913 * Release the object reference
9914 */
9915 vm_object_deallocate(dst_object);
9916 /*
9917 * If a hard error occurred, return it now
9918 */
9919 if (kr != KERN_SUCCESS) {
9920 return kr;
9921 }
9922
9923 if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
9924 || amount_left == 0) {
9925 /*
9926 * all done with this copy entry, dispose.
9927 */
9928 copy_entry_next = copy_entry->vme_next;
9929
9930 if (discard_on_success) {
9931 vm_map_copy_entry_unlink(copy, copy_entry);
9932 assert(!copy_entry->is_sub_map);
9933 vm_object_deallocate(VME_OBJECT(copy_entry));
9934 vm_map_copy_entry_dispose(copy_entry);
9935 }
9936
9937 if (copy_entry_next == vm_map_copy_to_entry(copy) &&
9938 amount_left) {
9939 /*
9940 * not finished copying but ran out of source
9941 */
9942 return KERN_INVALID_ADDRESS;
9943 }
9944
9945 copy_entry = copy_entry_next;
9946
9947 src_offset = 0;
9948 }
9949
9950 if (amount_left == 0) {
9951 return KERN_SUCCESS;
9952 }
9953
9954 vm_map_lock_read(dst_map);
9955 if (version.main_timestamp == dst_map->timestamp) {
9956 if (start == entry_end) {
9957 /*
9958 * destination region is split. Use the version
9959 * information to avoid a lookup in the normal
9960 * case.
9961 */
9962 entry = entry->vme_next;
9963 /*
9964 * should be contiguous. Fail if we encounter
9965 * a hole in the destination.
9966 */
9967 if (start != entry->vme_start) {
9968 vm_map_unlock_read(dst_map);
9969 return KERN_INVALID_ADDRESS;
9970 }
9971 }
9972 } else {
9973 /*
9974 * Map version check failed.
9975 * we must lookup the entry because somebody
9976 * might have changed the map behind our backs.
9977 */
9978 RetryLookup:
9979 if (!vm_map_lookup_entry(dst_map, start, &entry)) {
9980 vm_map_unlock_read(dst_map);
9981 return KERN_INVALID_ADDRESS;
9982 }
9983 }
9984 }/* while */
9985
9986 return KERN_SUCCESS;
9987 }/* vm_map_copy_overwrite_unaligned */
9988
9989 /*
9990 * Routine: vm_map_copy_overwrite_aligned [internal use only]
9991 *
9992 * Description:
9993 * Does all the vm_trickery possible for whole pages.
9994 *
9995 * Implementation:
9996 *
9997 * If there are no permanent objects in the destination,
9998 * and the source and destination map entry zones match,
9999 * and the destination map entry is not shared,
10000 * then the map entries can be deleted and replaced
10001 * with those from the copy. The following code is the
10002 * basic idea of what to do, but there are lots of annoying
10003 * little details about getting protection and inheritance
10004 * right. Should add protection, inheritance, and sharing checks
10005 * to the above pass and make sure that no wiring is involved.
10006 *
10007 * Callers of this function must call vm_map_copy_require on
10008 * previously created vm_map_copy_t or pass a newly created
10009 * one to ensure that it hasn't been forged.
10010 */
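/*
 * Sketch of the fast-path test used below (it mirrors the check in the
 * loop body; shown here only to summarize the description above):
 *
 *	object = VME_OBJECT(entry);
 *	if ((!entry->is_shared &&
 *	     ((object == VM_OBJECT_NULL) ||
 *	      (object->internal && !object->true_share))) ||
 *	    entry->needs_copy) {
 *		substitute the copy entry's object for the destination's
 *	} else {
 *		fall back to a physical copy via vm_fault_copy() ("slow_copy")
 *	}
 */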
10011
10012 int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10013 int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10014 int vm_map_copy_overwrite_aligned_src_large = 0;
10015
10016 static kern_return_t
10017 vm_map_copy_overwrite_aligned(
10018 vm_map_t dst_map,
10019 vm_map_entry_t tmp_entry,
10020 vm_map_copy_t copy,
10021 vm_map_offset_t start,
10022 __unused pmap_t pmap)
10023 {
10024 vm_object_t object;
10025 vm_map_entry_t copy_entry;
10026 vm_map_size_t copy_size;
10027 vm_map_size_t size;
10028 vm_map_entry_t entry;
10029
10030 while ((copy_entry = vm_map_copy_first_entry(copy))
10031 != vm_map_copy_to_entry(copy)) {
10032 copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10033
10034 entry = tmp_entry;
10035 if (entry->is_sub_map) {
10036 /* unnested when clipped earlier */
10037 assert(!entry->use_pmap);
10038 }
10039 if (entry == vm_map_to_entry(dst_map)) {
10040 vm_map_unlock(dst_map);
10041 return KERN_INVALID_ADDRESS;
10042 }
10043 size = (entry->vme_end - entry->vme_start);
10044 /*
10045 * Make sure that no holes popped up in the
10046 * address map, and that the protection is
10047 * still valid, in case the map was unlocked
10048 * earlier.
10049 */
10050
10051 if ((entry->vme_start != start) || ((entry->is_sub_map)
10052 && !entry->needs_copy)) {
10053 vm_map_unlock(dst_map);
10054 return KERN_INVALID_ADDRESS;
10055 }
10056 assert(entry != vm_map_to_entry(dst_map));
10057
10058 /*
10059 * Check protection again
10060 */
10061
10062 if (!(entry->protection & VM_PROT_WRITE)) {
10063 vm_map_unlock(dst_map);
10064 return KERN_PROTECTION_FAILURE;
10065 }
10066
10067 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10068 vm_map_unlock(dst_map);
10069 return KERN_PROTECTION_FAILURE;
10070 }
10071
10072 /*
10073 * Adjust to source size first
10074 */
10075
10076 if (copy_size < size) {
10077 if (entry->map_aligned &&
10078 !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10079 VM_MAP_PAGE_MASK(dst_map))) {
10080 /* no longer map-aligned */
10081 entry->map_aligned = FALSE;
10082 }
10083 vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10084 size = copy_size;
10085 }
10086
10087 /*
10088 * Adjust to destination size
10089 */
10090
10091 if (size < copy_size) {
10092 vm_map_copy_clip_end(copy, copy_entry,
10093 copy_entry->vme_start + size);
10094 copy_size = size;
10095 }
10096
10097 assert((entry->vme_end - entry->vme_start) == size);
10098 assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10099 assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10100
10101 /*
10102 * If the destination contains temporary unshared memory,
10103 * we can perform the copy by throwing it away and
10104 * installing the source data.
10105 */
10106
10107 object = VME_OBJECT(entry);
10108 if ((!entry->is_shared &&
10109 ((object == VM_OBJECT_NULL) ||
10110 (object->internal && !object->true_share))) ||
10111 entry->needs_copy) {
10112 vm_object_t old_object = VME_OBJECT(entry);
10113 vm_object_offset_t old_offset = VME_OFFSET(entry);
10114 vm_object_offset_t offset;
10115
10116 /*
10117 * Ensure that the source and destination aren't
10118 * identical
10119 */
10120 if (old_object == VME_OBJECT(copy_entry) &&
10121 old_offset == VME_OFFSET(copy_entry)) {
10122 vm_map_copy_entry_unlink(copy, copy_entry);
10123 vm_map_copy_entry_dispose(copy_entry);
10124
10125 if (old_object != VM_OBJECT_NULL) {
10126 vm_object_deallocate(old_object);
10127 }
10128
10129 start = tmp_entry->vme_end;
10130 tmp_entry = tmp_entry->vme_next;
10131 continue;
10132 }
10133
10134 #if XNU_TARGET_OS_OSX
10135 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10136 #define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */
10137 if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10138 VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10139 copy_size <= __TRADEOFF1_COPY_SIZE) {
10140 /*
10141 * Virtual vs. Physical copy tradeoff #1.
10142 *
10143 * Copying only a few pages out of a large
10144 * object: do a physical copy instead of
10145 * a virtual copy, to avoid possibly keeping
10146 * the entire large object alive because of
10147 * those few copy-on-write pages.
10148 */
10149 vm_map_copy_overwrite_aligned_src_large++;
10150 goto slow_copy;
10151 }
10152 #endif /* XNU_TARGET_OS_OSX */
10153
10154 if ((dst_map->pmap != kernel_pmap) &&
10155 (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10156 (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10157 vm_object_t new_object, new_shadow;
10158
10159 /*
10160 * We're about to map something over a mapping
10161 * established by malloc()...
10162 */
10163 new_object = VME_OBJECT(copy_entry);
10164 if (new_object != VM_OBJECT_NULL) {
10165 vm_object_lock_shared(new_object);
10166 }
10167 while (new_object != VM_OBJECT_NULL &&
10168 #if XNU_TARGET_OS_OSX
10169 !new_object->true_share &&
10170 new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10171 #endif /* XNU_TARGET_OS_OSX */
10172 new_object->internal) {
10173 new_shadow = new_object->shadow;
10174 if (new_shadow == VM_OBJECT_NULL) {
10175 break;
10176 }
10177 vm_object_lock_shared(new_shadow);
10178 vm_object_unlock(new_object);
10179 new_object = new_shadow;
10180 }
10181 if (new_object != VM_OBJECT_NULL) {
10182 if (!new_object->internal) {
10183 /*
10184 * The new mapping is backed
10185 * by an external object. We
10186 * don't want malloc'ed memory
10187 * to be replaced with such a
10188 * non-anonymous mapping, so
10189 * let's go off the optimized
10190 * path...
10191 */
10192 vm_map_copy_overwrite_aligned_src_not_internal++;
10193 vm_object_unlock(new_object);
10194 goto slow_copy;
10195 }
10196 #if XNU_TARGET_OS_OSX
10197 if (new_object->true_share ||
10198 new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10199 /*
10200 * Same if there's a "true_share"
10201 * object in the shadow chain, or
10202 * an object with a non-default
10203 * (SYMMETRIC) copy strategy.
10204 */
10205 vm_map_copy_overwrite_aligned_src_not_symmetric++;
10206 vm_object_unlock(new_object);
10207 goto slow_copy;
10208 }
10209 #endif /* XNU_TARGET_OS_OSX */
10210 vm_object_unlock(new_object);
10211 }
10212 /*
10213 * The new mapping is still backed by
10214 * anonymous (internal) memory, so it's
10215 * OK to substitute it for the original
10216 * malloc() mapping.
10217 */
10218 }
10219
10220 if (old_object != VM_OBJECT_NULL) {
10221 if (entry->is_sub_map) {
10222 if (entry->use_pmap) {
10223 #ifndef NO_NESTED_PMAP
10224 pmap_unnest(dst_map->pmap,
10225 (addr64_t)entry->vme_start,
10226 entry->vme_end - entry->vme_start);
10227 #endif /* NO_NESTED_PMAP */
10228 if (dst_map->mapped_in_other_pmaps) {
10229 /* clean up parent */
10230 /* map/maps */
10231 vm_map_submap_pmap_clean(
10232 dst_map, entry->vme_start,
10233 entry->vme_end,
10234 VME_SUBMAP(entry),
10235 VME_OFFSET(entry));
10236 }
10237 } else {
10238 vm_map_submap_pmap_clean(
10239 dst_map, entry->vme_start,
10240 entry->vme_end,
10241 VME_SUBMAP(entry),
10242 VME_OFFSET(entry));
10243 }
10244 vm_map_deallocate(VME_SUBMAP(entry));
10245 } else {
10246 if (dst_map->mapped_in_other_pmaps) {
10247 vm_object_pmap_protect_options(
10248 VME_OBJECT(entry),
10249 VME_OFFSET(entry),
10250 entry->vme_end
10251 - entry->vme_start,
10252 PMAP_NULL,
10253 PAGE_SIZE,
10254 entry->vme_start,
10255 VM_PROT_NONE,
10256 PMAP_OPTIONS_REMOVE);
10257 } else {
10258 pmap_remove_options(
10259 dst_map->pmap,
10260 (addr64_t)(entry->vme_start),
10261 (addr64_t)(entry->vme_end),
10262 PMAP_OPTIONS_REMOVE);
10263 }
10264 vm_object_deallocate(old_object);
10265 }
10266 }
10267
10268 if (entry->iokit_acct) {
10269 /* keep using iokit accounting */
10270 entry->use_pmap = FALSE;
10271 } else {
10272 /* use pmap accounting */
10273 entry->use_pmap = TRUE;
10274 }
10275 VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, 0);
10276 object = VME_OBJECT(entry);
10277 entry->needs_copy = copy_entry->needs_copy;
10278 entry->wired_count = 0;
10279 entry->user_wired_count = 0;
10280 offset = VME_OFFSET(copy_entry);
10281 VME_OFFSET_SET(entry, offset);
10282
10283 vm_map_copy_entry_unlink(copy, copy_entry);
10284 vm_map_copy_entry_dispose(copy_entry);
10285
10286 /*
10287 * We could try to push pages into the pmap at this point, BUT
10288 * this optimization only saved on average 2 us per page when ALL
10289 * the pages in the source were currently mapped and ALL the pages
10290 * in the dest were touched. If fewer than 2/3 of the pages were
10291 * touched, this optimization actually cost more cycles. It also
10292 * puts a lot of pressure on the pmap layer w.r.t. mapping structures.
10293 */
10294
10295 /*
10296 * Set up for the next iteration. The map
10297 * has not been unlocked, so the next
10298 * address should be at the end of this
10299 * entry, and the next map entry should be
10300 * the one following it.
10301 */
10302
10303 start = tmp_entry->vme_end;
10304 tmp_entry = tmp_entry->vme_next;
10305 } else {
10306 vm_map_version_t version;
10307 vm_object_t dst_object;
10308 vm_object_offset_t dst_offset;
10309 kern_return_t r;
10310
10311 slow_copy:
10312 if (entry->needs_copy) {
10313 VME_OBJECT_SHADOW(entry,
10314 (entry->vme_end -
10315 entry->vme_start));
10316 entry->needs_copy = FALSE;
10317 }
10318
10319 dst_object = VME_OBJECT(entry);
10320 dst_offset = VME_OFFSET(entry);
10321
10322 /*
10323 * Take an object reference, and record
10324 * the map version information so that the
10325 * map can be safely unlocked.
10326 */
10327
10328 if (dst_object == VM_OBJECT_NULL) {
10329 /*
10330 * We would usually have just taken the
10331 * optimized path above if the destination
10332 * object has not been allocated yet. But we
10333 * now disable that optimization if the copy
10334 * entry's object is not backed by anonymous
10335 * memory to avoid replacing malloc'ed
10336 * (i.e. re-usable) anonymous memory with a
10337 * not-so-anonymous mapping.
10338 * So we have to handle this case here and
10339 * allocate a new VM object for this map entry.
10340 */
10341 dst_object = vm_object_allocate(
10342 entry->vme_end - entry->vme_start);
10343 dst_offset = 0;
10344 VME_OBJECT_SET(entry, dst_object, false, 0);
10345 VME_OFFSET_SET(entry, dst_offset);
10346 assert(entry->use_pmap);
10347 }
10348
10349 vm_object_reference(dst_object);
10350
10351 /* account for unlock bumping up timestamp */
10352 version.main_timestamp = dst_map->timestamp + 1;
10353
10354 vm_map_unlock(dst_map);
10355
10356 /*
10357 * Copy as much as possible in one pass
10358 */
10359
10360 copy_size = size;
10361 r = vm_fault_copy(
10362 VME_OBJECT(copy_entry),
10363 VME_OFFSET(copy_entry),
10364 &copy_size,
10365 dst_object,
10366 dst_offset,
10367 dst_map,
10368 &version,
10369 THREAD_UNINT );
10370
10371 /*
10372 * Release the object reference
10373 */
10374
10375 vm_object_deallocate(dst_object);
10376
10377 /*
10378 * If a hard error occurred, return it now
10379 */
10380
10381 if (r != KERN_SUCCESS) {
10382 return r;
10383 }
10384
10385 if (copy_size != 0) {
10386 /*
10387 * Dispose of the copied region
10388 */
10389
10390 vm_map_copy_clip_end(copy, copy_entry,
10391 copy_entry->vme_start + copy_size);
10392 vm_map_copy_entry_unlink(copy, copy_entry);
10393 vm_object_deallocate(VME_OBJECT(copy_entry));
10394 vm_map_copy_entry_dispose(copy_entry);
10395 }
10396
10397 /*
10398 * Pick up in the destination map where we left off.
10399 *
10400 * Use the version information to avoid a lookup
10401 * in the normal case.
10402 */
10403
10404 start += copy_size;
10405 vm_map_lock(dst_map);
10406 if (version.main_timestamp == dst_map->timestamp &&
10407 copy_size != 0) {
10408 /* We can safely use saved tmp_entry value */
10409
10410 if (tmp_entry->map_aligned &&
10411 !VM_MAP_PAGE_ALIGNED(
10412 start,
10413 VM_MAP_PAGE_MASK(dst_map))) {
10414 /* no longer map-aligned */
10415 tmp_entry->map_aligned = FALSE;
10416 }
10417 vm_map_clip_end(dst_map, tmp_entry, start);
10418 tmp_entry = tmp_entry->vme_next;
10419 } else {
10420 /* Must do lookup of tmp_entry */
10421
10422 if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
10423 vm_map_unlock(dst_map);
10424 return KERN_INVALID_ADDRESS;
10425 }
10426 if (tmp_entry->map_aligned &&
10427 !VM_MAP_PAGE_ALIGNED(
10428 start,
10429 VM_MAP_PAGE_MASK(dst_map))) {
10430 /* no longer map-aligned */
10431 tmp_entry->map_aligned = FALSE;
10432 }
10433 vm_map_clip_start(dst_map, tmp_entry, start);
10434 }
10435 }
10436 }/* while */
10437
10438 return KERN_SUCCESS;
10439 }/* vm_map_copy_overwrite_aligned */
10440
10441 /*
10442 * Routine: vm_map_copyin_kernel_buffer [internal use only]
10443 *
10444 * Description:
10445 * Copy in data to a kernel buffer from space in the
10446 * source map. The original space may be optionally
10447 * deallocated.
10448 *
10449 * If successful, returns a new copy object.
10450 */
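/*
 * Minimal usage sketch (illustrative only; "user_addr" and "len" are
 * hypothetical). In practice this path is selected by
 * vm_map_copyin_internal() when "len" fits in msg_ool_size_small:
 *
 *	vm_map_copy_t copy;
 *	kern_return_t kr;
 *
 *	kr = vm_map_copyin_kernel_buffer(current_map(), user_addr, len,
 *	    FALSE, &copy);
 *	if (kr == KERN_SUCCESS) {
 *		use "copy", then consume it via vm_map_copyout() or
 *		release it with vm_map_copy_discard();
 *	}
 */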
10451 static kern_return_t
10452 vm_map_copyin_kernel_buffer(
10453 vm_map_t src_map,
10454 vm_map_offset_t src_addr,
10455 vm_map_size_t len,
10456 boolean_t src_destroy,
10457 vm_map_copy_t *copy_result)
10458 {
10459 kern_return_t kr;
10460 vm_map_copy_t copy;
10461
10462 if (len > msg_ool_size_small) {
10463 return KERN_INVALID_ARGUMENT;
10464 }
10465
10466 copy = zalloc_flags(vm_map_copy_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
10467 copy->cpy_kdata = kalloc_data(len, Z_WAITOK);
10468 if (copy->cpy_kdata == NULL) {
10469 zfree(vm_map_copy_zone, copy);
10470 return KERN_RESOURCE_SHORTAGE;
10471 }
10472
10473 copy->type = VM_MAP_COPY_KERNEL_BUFFER;
10474 copy->size = len;
10475 copy->offset = 0;
10476
10477 kr = copyinmap(src_map, src_addr, copy->cpy_kdata, (vm_size_t)len);
10478 if (kr != KERN_SUCCESS) {
10479 kfree_data(copy->cpy_kdata, len);
10480 zfree(vm_map_copy_zone, copy);
10481 return kr;
10482 }
10483
10484 if (src_destroy) {
10485 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
10486
10487 if (src_map == kernel_map) {
10488 flags |= VM_MAP_REMOVE_KUNWIRE;
10489 }
10490
10491 (void)vm_map_remove_guard(src_map,
10492 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
10493 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
10494 flags, KMEM_GUARD_NONE);
10495 }
10496
10497 *copy_result = copy;
10498 return KERN_SUCCESS;
10499 }
10500
10501 /*
10502 * Routine: vm_map_copyout_kernel_buffer [internal use only]
10503 *
10504 * Description:
10505 * Copy out data from a kernel buffer into space in the
10506 * destination map. The space may be optionally dynamically
10507 * allocated.
10508 *
10509 * If successful, consumes the copy object.
10510 * Otherwise, the caller is responsible for it.
10511 *
10512 * Callers of this function must call vm_map_copy_require on
10513 * previously created vm_map_copy_t or pass a newly created
10514 * one to ensure that it hasn't been forged.
10515 */
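/*
 * Call shapes (illustrative): with "overwrite" FALSE the routine
 * allocates fresh space in "map" and returns its address through
 * "addr"; with "overwrite" TRUE the caller supplies *addr and the
 * kernel buffer is copied over the existing mapping. A hypothetical
 * non-overwriting call:
 *
 *	vm_map_address_t where = 0;
 *
 *	kr = vm_map_copyout_kernel_buffer(dst_map, &where, copy,
 *	    copy->size, FALSE, TRUE);
 */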
10516 static int vm_map_copyout_kernel_buffer_failures = 0;
10517 static kern_return_t
10518 vm_map_copyout_kernel_buffer(
10519 vm_map_t map,
10520 vm_map_address_t *addr, /* IN/OUT */
10521 vm_map_copy_t copy,
10522 vm_map_size_t copy_size,
10523 boolean_t overwrite,
10524 boolean_t consume_on_success)
10525 {
10526 kern_return_t kr = KERN_SUCCESS;
10527 thread_t thread = current_thread();
10528
10529 assert(copy->size == copy_size);
10530
10531 /*
10532 * check for corrupted vm_map_copy structure
10533 */
10534 if (copy_size > msg_ool_size_small || copy->offset) {
10535 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
10536 (long long)copy->size, (long long)copy->offset);
10537 }
10538
10539 if (!overwrite) {
10540 /*
10541 * Allocate space in the target map for the data
10542 */
10543 *addr = 0;
10544 kr = vm_map_enter(map,
10545 addr,
10546 vm_map_round_page(copy_size,
10547 VM_MAP_PAGE_MASK(map)),
10548 (vm_map_offset_t) 0,
10549 VM_FLAGS_ANYWHERE,
10550 VM_MAP_KERNEL_FLAGS_NONE,
10551 VM_KERN_MEMORY_NONE,
10552 VM_OBJECT_NULL,
10553 (vm_object_offset_t) 0,
10554 FALSE,
10555 VM_PROT_DEFAULT,
10556 VM_PROT_ALL,
10557 VM_INHERIT_DEFAULT);
10558 if (kr != KERN_SUCCESS) {
10559 return kr;
10560 }
10561 #if KASAN
10562 if (map->pmap == kernel_pmap) {
10563 kasan_notify_address(*addr, copy->size);
10564 }
10565 #endif
10566 }
10567
10568 /*
10569 * Copyout the data from the kernel buffer to the target map.
10570 */
10571 if (thread->map == map) {
10572 /*
10573 * If the target map is the current map, just do
10574 * the copy.
10575 */
10576 assert((vm_size_t)copy_size == copy_size);
10577 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
10578 kr = KERN_INVALID_ADDRESS;
10579 }
10580 } else {
10581 vm_map_t oldmap;
10582
10583 /*
10584 * If the target map is another map, assume the
10585 * target's address space identity for the duration
10586 * of the copy.
10587 */
10588 vm_map_reference(map);
10589 oldmap = vm_map_switch(map);
10590
10591 assert((vm_size_t)copy_size == copy_size);
10592 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
10593 vm_map_copyout_kernel_buffer_failures++;
10594 kr = KERN_INVALID_ADDRESS;
10595 }
10596
10597 (void) vm_map_switch(oldmap);
10598 vm_map_deallocate(map);
10599 }
10600
10601 if (kr != KERN_SUCCESS) {
10602 /* the copy failed, clean up */
10603 if (!overwrite) {
10604 /*
10605 * Deallocate the space we allocated in the target map.
10606 */
10607 vm_map_remove(map,
10608 vm_map_trunc_page(*addr,
10609 VM_MAP_PAGE_MASK(map)),
10610 vm_map_round_page((*addr +
10611 vm_map_round_page(copy_size,
10612 VM_MAP_PAGE_MASK(map))),
10613 VM_MAP_PAGE_MASK(map)));
10614 *addr = 0;
10615 }
10616 } else {
10617 /* copy was successful, discard the copy structure */
10618 if (consume_on_success) {
10619 kfree_data(copy->cpy_kdata, copy_size);
10620 zfree(vm_map_copy_zone, copy);
10621 }
10622 }
10623
10624 return kr;
10625 }
10626
10627 /*
10628 * Routine: vm_map_copy_insert [internal use only]
10629 *
10630 * Description:
10631 * Link a copy chain ("copy") into a map at the
10632 * specified location (after "where").
10633 *
10634 * Callers of this function must call vm_map_copy_require on
10635 * previously created vm_map_copy_t or pass a newly created
10636 * one to ensure that it hasn't been forged.
10637 * Side effects:
10638 * The copy chain is destroyed.
10639 */
10640 static void
10641 vm_map_copy_insert(
10642 vm_map_t map,
10643 vm_map_entry_t after_where,
10644 vm_map_copy_t copy)
10645 {
10646 vm_map_entry_t entry;
10647
10648 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
10649 entry = vm_map_copy_first_entry(copy);
10650 vm_map_copy_entry_unlink(copy, entry);
10651 vm_map_store_entry_link(map, after_where, entry,
10652 VM_MAP_KERNEL_FLAGS_NONE);
10653 after_where = entry;
10654 }
10655 zfree(vm_map_copy_zone, copy);
10656 }
10657
10658 /*
10659 * Callers of this function must call vm_map_copy_require on
10660 * previously created vm_map_copy_t or pass a newly created
10661 * one to ensure that it hasn't been forged.
10662 */
10663 void
10664 vm_map_copy_remap(
10665 vm_map_t map,
10666 vm_map_entry_t where,
10667 vm_map_copy_t copy,
10668 vm_map_offset_t adjustment,
10669 vm_prot_t cur_prot,
10670 vm_prot_t max_prot,
10671 vm_inherit_t inheritance)
10672 {
10673 vm_map_entry_t copy_entry, new_entry;
10674
10675 for (copy_entry = vm_map_copy_first_entry(copy);
10676 copy_entry != vm_map_copy_to_entry(copy);
10677 copy_entry = copy_entry->vme_next) {
10678 /* get a new VM map entry for the map */
10679 new_entry = vm_map_entry_create(map);
10680 /* copy the "copy entry" to the new entry */
10681 vm_map_entry_copy(map, new_entry, copy_entry);
10682 /* adjust "start" and "end" */
10683 new_entry->vme_start += adjustment;
10684 new_entry->vme_end += adjustment;
10685 /* clear some attributes */
10686 new_entry->inheritance = inheritance;
10687 new_entry->protection = cur_prot;
10688 new_entry->max_protection = max_prot;
10689 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
10690 /* take an extra reference on the entry's "object" */
10691 if (new_entry->is_sub_map) {
10692 assert(!new_entry->use_pmap); /* not nested */
10693 vm_map_reference(VME_SUBMAP(new_entry));
10694 } else {
10695 vm_object_reference(VME_OBJECT(new_entry));
10696 }
10697 /* insert the new entry in the map */
10698 vm_map_store_entry_link(map, where, new_entry,
10699 VM_MAP_KERNEL_FLAGS_NONE);
10700 /* continue inserting the "copy entries" after the new entry */
10701 where = new_entry;
10702 }
10703 }
10704
10705
10706 /*
10707 * Returns true if *size matches (or is in the range of) copy->size.
10708 * Upon returning true, the *size field is updated with the actual size of the
10709 * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
10710 */
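/*
 * Typical use (sketch; "requested_size" and "dst_addr" are
 * hypothetical): validate a caller-supplied size before overwriting
 * with the copy object.
 *
 *	vm_map_size_t size = requested_size;
 *
 *	if (!vm_map_copy_validate_size(dst_map, copy, &size)) {
 *		return KERN_FAILURE;
 *	}
 *	kr = vm_map_copy_overwrite(dst_map, dst_addr, copy, size, FALSE);
 *
 * On success, "size" has been updated to the copy object's actual size
 * before being passed along.
 */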
10711 boolean_t
10712 vm_map_copy_validate_size(
10713 vm_map_t dst_map,
10714 vm_map_copy_t copy,
10715 vm_map_size_t *size)
10716 {
10717 if (copy == VM_MAP_COPY_NULL) {
10718 return FALSE;
10719 }
10720
10721 /*
10722 * Assert that the vm_map_copy is coming from the right
10723 * zone and hasn't been forged
10724 */
10725 vm_map_copy_require(copy);
10726
10727 vm_map_size_t copy_sz = copy->size;
10728 vm_map_size_t sz = *size;
10729 switch (copy->type) {
10730 case VM_MAP_COPY_OBJECT:
10731 case VM_MAP_COPY_KERNEL_BUFFER:
10732 if (sz == copy_sz) {
10733 return TRUE;
10734 }
10735 break;
10736 case VM_MAP_COPY_ENTRY_LIST:
10737 /*
10738 * potential page-size rounding prevents us from exactly
10739 * validating this flavor of vm_map_copy, but we can at least
10740 * assert that it's within a range.
10741 */
10742 if (copy_sz >= sz &&
10743 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
10744 *size = copy_sz;
10745 return TRUE;
10746 }
10747 break;
10748 default:
10749 break;
10750 }
10751 return FALSE;
10752 }
10753
10754 /*
10755 * Routine: vm_map_copyout_size
10756 *
10757 * Description:
10758 * Copy out a copy chain ("copy") into newly-allocated
10759 * space in the destination map. Uses a prevalidated
10760 * size for the copy object (vm_map_copy_validate_size).
10761 *
10762 * If successful, consumes the copy object.
10763 * Otherwise, the caller is responsible for it.
10764 */
10765 kern_return_t
10766 vm_map_copyout_size(
10767 vm_map_t dst_map,
10768 vm_map_address_t *dst_addr, /* OUT */
10769 vm_map_copy_t copy,
10770 vm_map_size_t copy_size)
10771 {
10772 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
10773 TRUE, /* consume_on_success */
10774 VM_PROT_DEFAULT,
10775 VM_PROT_ALL,
10776 VM_INHERIT_DEFAULT);
10777 }
10778
10779 /*
10780 * Routine: vm_map_copyout
10781 *
10782 * Description:
10783 * Copy out a copy chain ("copy") into newly-allocated
10784 * space in the destination map.
10785 *
10786 * If successful, consumes the copy object.
10787 * Otherwise, the caller is responsible for it.
10788 */
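/*
 * Typical pairing (illustrative sketch; maps, addresses and sizes are
 * hypothetical): transfer a region from one map into newly allocated
 * space in another.
 *
 *	vm_map_copy_t copy;
 *	vm_map_address_t dst = 0;
 *	kern_return_t kr;
 *
 *	kr = vm_map_copyin(src_map, src_addr, size, FALSE, &copy);
 *	if (kr != KERN_SUCCESS) {
 *		return kr;
 *	}
 *	kr = vm_map_copyout(dst_map, &dst, copy);
 *	if (kr != KERN_SUCCESS) {
 *		vm_map_copy_discard(copy);
 *	}
 *
 * On copyout failure the copy object is not consumed, so the caller
 * must discard it, as noted above.
 */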
10789 kern_return_t
10790 vm_map_copyout(
10791 vm_map_t dst_map,
10792 vm_map_address_t *dst_addr, /* OUT */
10793 vm_map_copy_t copy)
10794 {
10795 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
10796 TRUE, /* consume_on_success */
10797 VM_PROT_DEFAULT,
10798 VM_PROT_ALL,
10799 VM_INHERIT_DEFAULT);
10800 }
10801
10802 kern_return_t
10803 vm_map_copyout_internal(
10804 vm_map_t dst_map,
10805 vm_map_address_t *dst_addr, /* OUT */
10806 vm_map_copy_t copy,
10807 vm_map_size_t copy_size,
10808 boolean_t consume_on_success,
10809 vm_prot_t cur_protection,
10810 vm_prot_t max_protection,
10811 vm_inherit_t inheritance)
10812 {
10813 vm_map_size_t size;
10814 vm_map_size_t adjustment;
10815 vm_map_offset_t start;
10816 vm_object_offset_t vm_copy_start;
10817 vm_map_entry_t last;
10818 vm_map_entry_t entry;
10819 vm_map_copy_t original_copy;
10820 kern_return_t kr;
10821 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
10822
10823 /*
10824 * Check for null copy object.
10825 */
10826
10827 if (copy == VM_MAP_COPY_NULL) {
10828 *dst_addr = 0;
10829 return KERN_SUCCESS;
10830 }
10831
10832 /*
10833 * Assert that the vm_map_copy is coming from the right
10834 * zone and hasn't been forged
10835 */
10836 vm_map_copy_require(copy);
10837
10838 if (copy->size != copy_size) {
10839 *dst_addr = 0;
10840 return KERN_FAILURE;
10841 }
10842
10843 /*
10844 * Check for special copy object, created
10845 * by vm_map_copyin_object.
10846 */
10847
10848 if (copy->type == VM_MAP_COPY_OBJECT) {
10849 vm_object_t object = copy->cpy_object;
10850 vm_object_offset_t offset;
10851
10852 offset = vm_object_trunc_page(copy->offset);
10853 size = vm_map_round_page((copy_size +
10854 (vm_map_size_t)(copy->offset -
10855 offset)),
10856 VM_MAP_PAGE_MASK(dst_map));
10857 *dst_addr = 0;
10858 kr = vm_map_enter(dst_map, dst_addr, size,
10859 (vm_map_offset_t) 0, VM_FLAGS_ANYWHERE,
10860 VM_MAP_KERNEL_FLAGS_NONE,
10861 VM_KERN_MEMORY_NONE,
10862 object, offset, FALSE,
10863 VM_PROT_DEFAULT, VM_PROT_ALL,
10864 VM_INHERIT_DEFAULT);
10865 if (kr != KERN_SUCCESS) {
10866 return kr;
10867 }
10868 /* Account for non-pagealigned copy object */
10869 *dst_addr += (vm_map_offset_t)(copy->offset - offset);
10870 if (consume_on_success) {
10871 zfree(vm_map_copy_zone, copy);
10872 }
10873 return KERN_SUCCESS;
10874 }
10875
10876 /*
10877 * Check for special kernel buffer allocated
10878 * by new_ipc_kmsg_copyin.
10879 */
10880
10881 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
10882 return vm_map_copyout_kernel_buffer(dst_map, dst_addr,
10883 copy, copy_size, FALSE,
10884 consume_on_success);
10885 }
10886
10887 original_copy = copy;
10888 if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
10889 vm_map_copy_t target_copy;
10890 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
10891
10892 target_copy = VM_MAP_COPY_NULL;
10893 DEBUG4K_ADJUST("adjusting...\n");
10894 kr = vm_map_copy_adjust_to_target(
10895 copy,
10896 0, /* offset */
10897 copy->size, /* size */
10898 dst_map,
10899 TRUE, /* copy */
10900 &target_copy,
10901 &overmap_start,
10902 &overmap_end,
10903 &trimmed_start);
10904 if (kr != KERN_SUCCESS) {
10905 DEBUG4K_COPY("adjust failed 0x%x\n", kr);
10906 return kr;
10907 }
10908 DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
10909 if (target_copy != copy) {
10910 copy = target_copy;
10911 }
10912 copy_size = copy->size;
10913 }
10914
10915 /*
10916 * Find space for the data
10917 */
10918
10919 vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
10920 VM_MAP_COPY_PAGE_MASK(copy));
10921 size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
10922 VM_MAP_COPY_PAGE_MASK(copy))
10923 - vm_copy_start;
10924
10925
10926 if (dst_map == kernel_map) {
10927 vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
10928 }
10929
10930 vm_map_lock(dst_map);
10931 kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
10932 &start, &last);
10933 if (kr != KERN_SUCCESS) {
10934 vm_map_unlock(dst_map);
10935 return kr;
10936 }
10937
10938 adjustment = start - vm_copy_start;
10939 if (!consume_on_success) {
10940 /*
10941 * We're not allowed to consume "copy", so we'll have to
10942 * copy its map entries into the destination map below.
10943 * No need to re-allocate map entries from the correct
10944 * (pageable or not) zone, since we'll get new map entries
10945 * during the transfer.
10946 * We'll also adjust the map entries' "start" and "end"
10947 * during the transfer, to keep "copy"'s entries consistent
10948 * with its "offset".
10949 */
10950 goto after_adjustments;
10951 }
10952
10953 /*
10954 * Since we're going to just drop the map
10955 * entries from the copy into the destination
10956 * map, they must come from the same pool.
10957 */
10958
10959 if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
10960 /*
10961 * Mismatches occur when dealing with the default
10962 * pager.
10963 */
10964 vm_map_entry_t next, new;
10965
10966 /*
10967 * Find the zone that the copies were allocated from
10968 */
10969
10970 entry = vm_map_copy_first_entry(copy);
10971
10972 /*
10973 * Reinitialize the copy so that vm_map_copy_entry_link
10974 * will work.
10975 */
10976 vm_map_store_copy_reset(copy, entry);
10977 copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
10978
10979 /*
10980 * Copy each entry.
10981 */
10982 while (entry != vm_map_copy_to_entry(copy)) {
10983 new = vm_map_copy_entry_create(copy);
10984 vm_map_entry_copy_full(new, entry);
10985 new->vme_no_copy_on_read = FALSE;
10986 assert(!new->iokit_acct);
10987 if (new->is_sub_map) {
10988 /* clr address space specifics */
10989 new->use_pmap = FALSE;
10990 }
10991 vm_map_copy_entry_link(copy,
10992 vm_map_copy_last_entry(copy),
10993 new);
10994 next = entry->vme_next;
10995 vm_map_entry_dispose(entry);
10996 entry = next;
10997 }
10998 }
10999
11000 /*
11001 * Adjust the addresses in the copy chain, and
11002 * reset the region attributes.
11003 */
11004
11005 for (entry = vm_map_copy_first_entry(copy);
11006 entry != vm_map_copy_to_entry(copy);
11007 entry = entry->vme_next) {
11008 if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11009 /*
11010 * We're injecting this copy entry into a map that
11011 * has the standard page alignment, so clear
11012 * "map_aligned" (which might have been inherited
11013 * from the original map entry).
11014 */
11015 entry->map_aligned = FALSE;
11016 }
11017
11018 entry->vme_start += adjustment;
11019 entry->vme_end += adjustment;
11020
11021 if (entry->map_aligned) {
11022 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11023 VM_MAP_PAGE_MASK(dst_map)));
11024 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11025 VM_MAP_PAGE_MASK(dst_map)));
11026 }
11027
11028 entry->inheritance = VM_INHERIT_DEFAULT;
11029 entry->protection = VM_PROT_DEFAULT;
11030 entry->max_protection = VM_PROT_ALL;
11031 entry->behavior = VM_BEHAVIOR_DEFAULT;
11032
11033 /*
11034 * If the entry is now wired,
11035 * map the pages into the destination map.
11036 */
11037 if (entry->wired_count != 0) {
11038 vm_map_offset_t va;
11039 vm_object_offset_t offset;
11040 vm_object_t object;
11041 vm_prot_t prot;
11042 int type_of_fault;
11043
11044 /* TODO4K would need to use actual page size */
11045 assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11046
11047 object = VME_OBJECT(entry);
11048 offset = VME_OFFSET(entry);
11049 va = entry->vme_start;
11050
11051 pmap_pageable(dst_map->pmap,
11052 entry->vme_start,
11053 entry->vme_end,
11054 TRUE);
11055
11056 while (va < entry->vme_end) {
11057 vm_page_t m;
11058 struct vm_object_fault_info fault_info = {};
11059
11060 /*
11061 * Look up the page in the object.
11062 * Assert that the page will be found in the
11063 * top object:
11064 * either
11065 * the object was newly created by
11066 * vm_object_copy_slowly, and has
11067 * copies of all of the pages from
11068 * the source object
11069 * or
11070 * the object was moved from the old
11071 * map entry; because the old map
11072 * entry was wired, all of the pages
11073 * were in the top-level object.
11074 * (XXX not true if we wire pages for
11075 * reading)
11076 */
11077 vm_object_lock(object);
11078
11079 m = vm_page_lookup(object, offset);
11080 if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11081 m->vmp_absent) {
11082 panic("vm_map_copyout: wiring %p", m);
11083 }
11084
11085 prot = entry->protection;
11086
11087 if (override_nx(dst_map, VME_ALIAS(entry)) &&
11088 prot) {
11089 prot |= VM_PROT_EXECUTE;
11090 }
11091
11092 type_of_fault = DBG_CACHE_HIT_FAULT;
11093
11094 fault_info.user_tag = VME_ALIAS(entry);
11095 fault_info.pmap_options = 0;
11096 if (entry->iokit_acct ||
11097 (!entry->is_sub_map && !entry->use_pmap)) {
11098 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11099 }
11100
11101 vm_fault_enter(m,
11102 dst_map->pmap,
11103 va,
11104 PAGE_SIZE, 0,
11105 prot,
11106 prot,
11107 VM_PAGE_WIRED(m),
11108 FALSE, /* change_wiring */
11109 VM_KERN_MEMORY_NONE, /* tag - not wiring */
11110 &fault_info,
11111 NULL, /* need_retry */
11112 &type_of_fault);
11113
11114 vm_object_unlock(object);
11115
11116 offset += PAGE_SIZE_64;
11117 va += PAGE_SIZE;
11118 }
11119 }
11120 }
11121
11122 after_adjustments:
11123
11124 /*
11125 * Correct the page alignment for the result
11126 */
11127
11128 *dst_addr = start + (copy->offset - vm_copy_start);
11129
11130 #if KASAN
11131 kasan_notify_address(*dst_addr, size);
11132 #endif
11133
11134 /*
11135 * Update the hints and the map size
11136 */
11137
11138 if (consume_on_success) {
11139 SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11140 } else {
11141 SAVE_HINT_MAP_WRITE(dst_map, last);
11142 }
11143
11144 dst_map->size += size;
11145
11146 /*
11147 * Link in the copy
11148 */
11149
11150 if (consume_on_success) {
11151 vm_map_copy_insert(dst_map, last, copy);
11152 if (copy != original_copy) {
11153 vm_map_copy_discard(original_copy);
11154 original_copy = VM_MAP_COPY_NULL;
11155 }
11156 } else {
11157 vm_map_copy_remap(dst_map, last, copy, adjustment,
11158 cur_protection, max_protection,
11159 inheritance);
11160 if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11161 vm_map_copy_discard(copy);
11162 copy = original_copy;
11163 }
11164 }
11165
11166
11167 vm_map_unlock(dst_map);
11168
11169 /*
11170 * XXX If wiring_required, call vm_map_pageable
11171 */
11172
11173 return KERN_SUCCESS;
11174 }
11175
11176 /*
11177 * Routine: vm_map_copyin
11178 *
11179 * Description:
11180 * see vm_map_copyin_common. Exported via Unsupported.exports.
11181 *
11182 */
11183
11184 #undef vm_map_copyin
11185
11186 kern_return_t
11187 vm_map_copyin(
11188 vm_map_t src_map,
11189 vm_map_address_t src_addr,
11190 vm_map_size_t len,
11191 boolean_t src_destroy,
11192 vm_map_copy_t *copy_result) /* OUT */
11193 {
11194 return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11195 FALSE, copy_result, FALSE);
11196 }
11197
11198 /*
11199 * Routine: vm_map_copyin_common
11200 *
11201 * Description:
11202 * Copy the specified region (src_addr, len) from the
11203 * source address space (src_map), possibly removing
11204 * the region from the source address space (src_destroy).
11205 *
11206 * Returns:
11207 * A vm_map_copy_t object (copy_result), suitable for
11208 * insertion into another address space (using vm_map_copyout),
11209 * copying over another address space region (using
11210 * vm_map_copy_overwrite). If the copy is unused, it
11211 * should be destroyed (using vm_map_copy_discard).
11212 *
11213 * In/out conditions:
11214 * The source map should not be locked on entry.
11215 */
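/*
 * Example (illustrative): a destructive copyin, i.e. a "move" of the
 * source region into the resulting copy object, can be requested via
 * "src_destroy":
 *
 *	kr = vm_map_copyin_common(src_map, src_addr, len,
 *	    TRUE,		(src_destroy: remove the source region)
 *	    FALSE,		(src_volatile: unused)
 *	    &copy,
 *	    FALSE);		(use_maxprot)
 */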
11216
11217 typedef struct submap_map {
11218 vm_map_t parent_map;
11219 vm_map_offset_t base_start;
11220 vm_map_offset_t base_end;
11221 vm_map_size_t base_len;
11222 struct submap_map *next;
11223 } submap_map_t;
11224
11225 kern_return_t
11226 vm_map_copyin_common(
11227 vm_map_t src_map,
11228 vm_map_address_t src_addr,
11229 vm_map_size_t len,
11230 boolean_t src_destroy,
11231 __unused boolean_t src_volatile,
11232 vm_map_copy_t *copy_result, /* OUT */
11233 boolean_t use_maxprot)
11234 {
11235 int flags;
11236
11237 flags = 0;
11238 if (src_destroy) {
11239 flags |= VM_MAP_COPYIN_SRC_DESTROY;
11240 }
11241 if (use_maxprot) {
11242 flags |= VM_MAP_COPYIN_USE_MAXPROT;
11243 }
11244 return vm_map_copyin_internal(src_map,
11245 src_addr,
11246 len,
11247 flags,
11248 copy_result);
11249 }
11250 kern_return_t
11251 vm_map_copyin_internal(
11252 vm_map_t src_map,
11253 vm_map_address_t src_addr,
11254 vm_map_size_t len,
11255 int flags,
11256 vm_map_copy_t *copy_result) /* OUT */
11257 {
11258 vm_map_entry_t tmp_entry; /* Result of last map lookup --
11259 * in multi-level lookup, this
11260 * entry contains the actual
11261 * vm_object/offset.
11262 */
11263 vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */
11264
11265 vm_map_offset_t src_start; /* Start of current entry --
11266 * where copy is taking place now
11267 */
11268 vm_map_offset_t src_end; /* End of entire region to be
11269 * copied */
11270 vm_map_offset_t src_base;
11271 vm_map_t base_map = src_map;
11272 boolean_t map_share = FALSE;
11273 submap_map_t *parent_maps = NULL;
11274
11275 vm_map_copy_t copy; /* Resulting copy */
11276 vm_map_address_t copy_addr;
11277 vm_map_size_t copy_size;
11278 boolean_t src_destroy;
11279 boolean_t use_maxprot;
11280 boolean_t preserve_purgeable;
11281 boolean_t entry_was_shared;
11282 vm_map_entry_t saved_src_entry;
11283
11284 if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11285 return KERN_INVALID_ARGUMENT;
11286 }
11287
11288 #if CONFIG_KERNEL_TBI
11289 if (src_map->pmap == kernel_pmap) {
11290 src_addr = VM_KERNEL_TBI_FILL(src_addr);
11291 }
11292 #endif /* CONFIG_KERNEL_TBI */
11293
11294 src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11295 use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11296 preserve_purgeable =
11297 (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11298
11299 /*
11300 * Check for copies of zero bytes.
11301 */
11302
11303 if (len == 0) {
11304 *copy_result = VM_MAP_COPY_NULL;
11305 return KERN_SUCCESS;
11306 }
11307
11308 /*
11309 * Check that the end address doesn't overflow
11310 */
11311 src_end = src_addr + len;
11312 if (src_end < src_addr) {
11313 return KERN_INVALID_ADDRESS;
11314 }
11315
11316 /*
11317 * Compute (page aligned) start and end of region
11318 */
11319 src_start = vm_map_trunc_page(src_addr,
11320 VM_MAP_PAGE_MASK(src_map));
11321 src_end = vm_map_round_page(src_end,
11322 VM_MAP_PAGE_MASK(src_map));
11323
11324 /*
11325 * If the copy is sufficiently small, use a kernel buffer instead
11326 * of making a virtual copy. The theory being that the cost of
11327 * setting up VM (and taking C-O-W faults) dominates the copy costs
11328 * for small regions.
11329 */
11330 if ((len <= msg_ool_size_small) &&
11331 !use_maxprot &&
11332 !preserve_purgeable &&
11333 !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11334 /*
11335 * Since the "msg_ool_size_small" threshold was increased and
11336 * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11337 * address space limits, we revert to doing a virtual copy if the
11338 * copied range goes beyond those limits. Otherwise, mach_vm_read()
11339 * of the commpage would now fail when it used to work.
11340 */
11341 (src_start >= vm_map_min(src_map) &&
11342 src_start < vm_map_max(src_map) &&
11343 src_end >= vm_map_min(src_map) &&
11344 src_end < vm_map_max(src_map))) {
11345 return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
11346 src_destroy, copy_result);
11347 }
11348
11349 /*
11350 * Allocate a header element for the list.
11351 *
11352 * Use the start and end in the header to
11353 * remember the endpoints prior to rounding.
11354 */
11355
11356 copy = vm_map_copy_allocate();
11357 copy->type = VM_MAP_COPY_ENTRY_LIST;
11358 copy->cpy_hdr.entries_pageable = TRUE;
11359 copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
11360
11361 vm_map_store_init( &(copy->cpy_hdr));
11362
11363 copy->offset = src_addr;
11364 copy->size = len;
11365
11366 new_entry = vm_map_copy_entry_create(copy);
11367
11368 #define RETURN(x) \
11369 MACRO_BEGIN \
11370 vm_map_unlock(src_map); \
11371 if(src_map != base_map) \
11372 vm_map_deallocate(src_map); \
11373 if (new_entry != VM_MAP_ENTRY_NULL) \
11374 vm_map_copy_entry_dispose(new_entry); \
11375 vm_map_copy_discard(copy); \
11376 { \
11377 submap_map_t *_ptr; \
11378 \
11379 for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
11380 parent_maps=parent_maps->next; \
11381 if (_ptr->parent_map != base_map) \
11382 vm_map_deallocate(_ptr->parent_map); \
11383 kfree_type(submap_map_t, _ptr); \
11384 } \
11385 } \
11386 MACRO_RETURN(x); \
11387 MACRO_END
11388
11389 /*
11390 * Find the beginning of the region.
11391 */
11392
11393 vm_map_lock(src_map);
11394
11395 /*
11396 * Lookup the original "src_addr" rather than the truncated
11397 * "src_start", in case "src_start" falls in a non-map-aligned
11398 * map entry *before* the map entry that contains "src_addr"...
11399 */
11400 if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
11401 RETURN(KERN_INVALID_ADDRESS);
11402 }
11403 if (!tmp_entry->is_sub_map) {
11404 /*
11405 * ... but clip to the map-rounded "src_start" rather than
11406 * "src_addr" to preserve map-alignment. We'll adjust the
11407 * first copy entry at the end, if needed.
11408 */
11409 vm_map_clip_start(src_map, tmp_entry, src_start);
11410 }
11411 if (src_start < tmp_entry->vme_start) {
11412 /*
11413 * Move "src_start" up to the start of the
11414 * first map entry to copy.
11415 */
11416 src_start = tmp_entry->vme_start;
11417 }
11418 /* set for later submap fix-up */
11419 copy_addr = src_start;
11420
11421 /*
11422 * Go through entries until we get to the end.
11423 */
11424
11425 while (TRUE) {
11426 vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */
11427 vm_map_size_t src_size; /* Size of source
11428 * map entry (in both
11429 * maps)
11430 */
11431
11432 vm_object_t src_object; /* Object to copy */
11433 vm_object_offset_t src_offset;
11434
11435 vm_object_t new_copy_object;/* vm_object_copy_* result */
11436
11437 boolean_t src_needs_copy; /* Should source map
11438 * be made read-only
11439 * for copy-on-write?
11440 */
11441
11442 boolean_t new_entry_needs_copy; /* Will new entry be COW? */
11443
11444 boolean_t was_wired; /* Was source wired? */
11445 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
11446 vm_map_version_t version; /* Version before locks
11447 * dropped to make copy
11448 */
11449 kern_return_t result; /* Return value from
11450 * copy_strategically.
11451 */
11452 while (tmp_entry->is_sub_map) {
11453 vm_map_size_t submap_len;
11454 submap_map_t *ptr;
11455
11456 ptr = kalloc_type(submap_map_t, Z_WAITOK);
11457 ptr->next = parent_maps;
11458 parent_maps = ptr;
11459 ptr->parent_map = src_map;
11460 ptr->base_start = src_start;
11461 ptr->base_end = src_end;
11462 submap_len = tmp_entry->vme_end - src_start;
11463 if (submap_len > (src_end - src_start)) {
11464 submap_len = src_end - src_start;
11465 }
11466 ptr->base_len = submap_len;
11467
11468 src_start -= tmp_entry->vme_start;
11469 src_start += VME_OFFSET(tmp_entry);
11470 src_end = src_start + submap_len;
11471 src_map = VME_SUBMAP(tmp_entry);
11472 vm_map_lock(src_map);
11473 /* keep an outstanding reference for all maps in */
11474 /* the chain of parent maps, except the base map */
11475 vm_map_reference(src_map);
11476 vm_map_unlock(ptr->parent_map);
11477 if (!vm_map_lookup_entry(
11478 src_map, src_start, &tmp_entry)) {
11479 RETURN(KERN_INVALID_ADDRESS);
11480 }
11481 map_share = TRUE;
11482 if (!tmp_entry->is_sub_map) {
11483 vm_map_clip_start(src_map, tmp_entry, src_start);
11484 }
11485 src_entry = tmp_entry;
11486 }
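/*
 * Sketch of the descent (illustrative): "parent_maps" acts as an
 * explicit stack recording the path through nested submaps, e.g.
 *
 *	base_map --entry--> submap A --entry--> submap B (lowest level)
 *
 * Each push records the parent map and the (base_start, base_end,
 * base_len) portion of the range owed to that parent, so that once the
 * submap range has been copied, the pop loop further below can resume
 * the traversal at "base_start + base_len" in the parent map.
 */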
11487 /* we are now in the lowest level submap... */
11488
11489 if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
11490 (VME_OBJECT(tmp_entry)->phys_contiguous)) {
11491 /* This is not supported for now. In the future */
11492 /* we will need to detect the phys_contig */
11493 /* condition and then upgrade copy_slowly */
11494 /* to do a physical copy from the device-memory- */
11495 /* based object. We can piggy-back off of */
11496 /* the was_wired boolean to set up the */
11497 /* proper handling. */
11498 RETURN(KERN_PROTECTION_FAILURE);
11499 }
11500 /*
11501 * Create a new address map entry to hold the result.
11502 * Fill in the fields from the appropriate source entries.
11503 * We must unlock the source map to do this if we need
11504 * to allocate a map entry.
11505 */
11506 if (new_entry == VM_MAP_ENTRY_NULL) {
11507 version.main_timestamp = src_map->timestamp;
11508 vm_map_unlock(src_map);
11509
11510 new_entry = vm_map_copy_entry_create(copy);
11511
11512 vm_map_lock(src_map);
11513 if ((version.main_timestamp + 1) != src_map->timestamp) {
11514 if (!vm_map_lookup_entry(src_map, src_start,
11515 &tmp_entry)) {
11516 RETURN(KERN_INVALID_ADDRESS);
11517 }
11518 if (!tmp_entry->is_sub_map) {
11519 vm_map_clip_start(src_map, tmp_entry, src_start);
11520 }
11521 continue; /* restart w/ new tmp_entry */
11522 }
11523 }
11524
11525 /*
11526 * Verify that the region can be read.
11527 */
11528 if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
11529 !use_maxprot) ||
11530 (src_entry->max_protection & VM_PROT_READ) == 0) {
11531 RETURN(KERN_PROTECTION_FAILURE);
11532 }
11533
11534 /*
11535 * Clip against the endpoints of the entire region.
11536 */
11537
11538 vm_map_clip_end(src_map, src_entry, src_end);
11539
11540 src_size = src_entry->vme_end - src_start;
11541 src_object = VME_OBJECT(src_entry);
11542 src_offset = VME_OFFSET(src_entry);
11543 was_wired = (src_entry->wired_count != 0);
11544
11545 vm_map_entry_copy(src_map, new_entry, src_entry);
11546 if (new_entry->is_sub_map) {
11547 /* clear address space specifics */
11548 new_entry->use_pmap = FALSE;
11549 } else {
11550 /*
11551 * We're dealing with a copy-on-write operation,
11552 * so the resulting mapping should not inherit the
11553 * original mapping's accounting settings.
11554 * "iokit_acct" should have been cleared in
11555 * vm_map_entry_copy().
11556 * "use_pmap" should be reset to its default (TRUE)
11557 * so that the new mapping gets accounted for in
11558 * the task's memory footprint.
11559 */
11560 assert(!new_entry->iokit_acct);
11561 new_entry->use_pmap = TRUE;
11562 }
11563
11564 /*
11565 * Attempt non-blocking copy-on-write optimizations.
11566 */
11567
11568 /*
11569 * If we are destroying the source, and the object
11570 * is internal, we could move the object reference
11571 * from the source to the copy. The copy is
11572 * copy-on-write only if the source is.
11573 * We make another reference to the object, because
11574 * destroying the source entry will deallocate it.
11575 *
11576 * This memory transfer has to be atomic (to prevent
11577 * the VM object from being shared or copied while
11578 * it's being moved here), so we could only do this
11579 * if we won't have to unlock the VM map until the
11580 * original mapping has been fully removed.
11581 */
11582
11583 RestartCopy:
11584 if ((src_object == VM_OBJECT_NULL ||
11585 (!was_wired && !map_share && !tmp_entry->is_shared
11586 && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
11587 vm_object_copy_quickly(
11588 VME_OBJECT(new_entry),
11589 src_offset,
11590 src_size,
11591 &src_needs_copy,
11592 &new_entry_needs_copy)) {
11593 new_entry->needs_copy = new_entry_needs_copy;
11594
11595 /*
11596 * Handle copy-on-write obligations
11597 */
11598
11599 if (src_needs_copy && !tmp_entry->needs_copy) {
11600 vm_prot_t prot;
11601
11602 prot = src_entry->protection & ~VM_PROT_WRITE;
11603
11604 if (override_nx(src_map, VME_ALIAS(src_entry))
11605 && prot) {
11606 prot |= VM_PROT_EXECUTE;
11607 }
11608
11609 vm_object_pmap_protect(
11610 src_object,
11611 src_offset,
11612 src_size,
11613 (src_entry->is_shared ?
11614 PMAP_NULL
11615 : src_map->pmap),
11616 VM_MAP_PAGE_SIZE(src_map),
11617 src_entry->vme_start,
11618 prot);
11619
11620 assert(tmp_entry->wired_count == 0);
11621 tmp_entry->needs_copy = TRUE;
11622 }
11623
11624 /*
11625 * The map has never been unlocked, so it's safe
11626 * to move to the next entry rather than doing
11627 * another lookup.
11628 */
11629
11630 goto CopySuccessful;
11631 }
11632
11633 entry_was_shared = tmp_entry->is_shared;
11634
11635 /*
11636 * Take an object reference, so that we may
11637 * release the map lock(s).
11638 */
11639
11640 assert(src_object != VM_OBJECT_NULL);
11641 vm_object_reference(src_object);
11642
11643 /*
11644 * Record the timestamp for later verification.
11645 * Unlock the map.
11646 */
11647
11648 version.main_timestamp = src_map->timestamp;
11649 vm_map_unlock(src_map); /* Increments timestamp once! */
11650 saved_src_entry = src_entry;
11651 tmp_entry = VM_MAP_ENTRY_NULL;
11652 src_entry = VM_MAP_ENTRY_NULL;
11653
11654 /*
11655 * Perform the copy
11656 */
11657
11658 if (was_wired ||
11659 (debug4k_no_cow_copyin &&
11660 VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
11661 CopySlowly:
11662 vm_object_lock(src_object);
11663 result = vm_object_copy_slowly(
11664 src_object,
11665 src_offset,
11666 src_size,
11667 THREAD_UNINT,
11668 &new_copy_object);
11669 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
11670 saved_used_for_jit = new_entry->used_for_jit;
11671 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
11672 new_entry->used_for_jit = saved_used_for_jit;
11673 VME_OFFSET_SET(new_entry,
11674 src_offset - vm_object_trunc_page(src_offset));
11675 new_entry->needs_copy = FALSE;
11676 } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
11677 (entry_was_shared || map_share)) {
11678 vm_object_t new_object;
11679
11680 vm_object_lock_shared(src_object);
11681 new_object = vm_object_copy_delayed(
11682 src_object,
11683 src_offset,
11684 src_size,
11685 TRUE);
11686 if (new_object == VM_OBJECT_NULL) {
11687 goto CopySlowly;
11688 }
11689
11690 VME_OBJECT_SET(new_entry, new_object, false, 0);
11691 assert(new_entry->wired_count == 0);
11692 new_entry->needs_copy = TRUE;
11693 assert(!new_entry->iokit_acct);
11694 assert(new_object->purgable == VM_PURGABLE_DENY);
11695 assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
11696 result = KERN_SUCCESS;
11697 } else {
11698 vm_object_offset_t new_offset;
11699 new_offset = VME_OFFSET(new_entry);
11700 result = vm_object_copy_strategically(src_object,
11701 src_offset,
11702 src_size,
11703 &new_copy_object,
11704 &new_offset,
11705 &new_entry_needs_copy);
11706 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
11707 saved_used_for_jit = new_entry->used_for_jit;
11708 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
11709 new_entry->used_for_jit = saved_used_for_jit;
11710 if (new_offset != VME_OFFSET(new_entry)) {
11711 VME_OFFSET_SET(new_entry, new_offset);
11712 }
11713
11714 new_entry->needs_copy = new_entry_needs_copy;
11715 }
11716
11717 if (result == KERN_SUCCESS &&
11718 ((preserve_purgeable &&
11719 src_object->purgable != VM_PURGABLE_DENY) ||
11720 new_entry->used_for_jit)) {
11721 /*
11722 * Purgeable objects should be COPY_NONE, true share;
11723 * this should be propagated to the copy.
11724 *
11725 * Also force mappings the pmap specially protects to
11726 * be COPY_NONE; trying to COW these mappings would
11727 * change the effective protections, which could have
11728 * side effects if the pmap layer relies on the
11729 * specified protections.
11730 */
11731
11732 vm_object_t new_object;
11733
11734 new_object = VME_OBJECT(new_entry);
11735 assert(new_object != src_object);
11736 vm_object_lock(new_object);
11737 assert(new_object->ref_count == 1);
11738 assert(new_object->shadow == VM_OBJECT_NULL);
11739 assert(new_object->copy == VM_OBJECT_NULL);
11740 assert(new_object->vo_owner == NULL);
11741
11742 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
11743
11744 if (preserve_purgeable &&
11745 src_object->purgable != VM_PURGABLE_DENY) {
11746 new_object->true_share = TRUE;
11747
11748 /* start as non-volatile with no owner... */
11749 new_object->purgable = VM_PURGABLE_NONVOLATILE;
11750 vm_purgeable_nonvolatile_enqueue(new_object, NULL);
11751 /* ... and move to src_object's purgeable state */
11752 if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
11753 int state;
11754 state = src_object->purgable;
11755 vm_object_purgable_control(
11756 new_object,
11757 VM_PURGABLE_SET_STATE_FROM_KERNEL,
11758 &state);
11759 }
11760 /* no pmap accounting for purgeable objects */
11761 new_entry->use_pmap = FALSE;
11762 }
11763
11764 vm_object_unlock(new_object);
11765 new_object = VM_OBJECT_NULL;
11766 }
11767
11768 if (result != KERN_SUCCESS &&
11769 result != KERN_MEMORY_RESTART_COPY) {
11770 vm_map_lock(src_map);
11771 RETURN(result);
11772 }
11773
11774 /*
11775 * Throw away the extra reference
11776 */
11777
11778 vm_object_deallocate(src_object);
11779
11780 /*
11781 * Verify that the map has not substantially
11782 * changed while the copy was being made.
11783 */
11784
11785 vm_map_lock(src_map);
11786
11787 if ((version.main_timestamp + 1) == src_map->timestamp) {
11788 /* src_map hasn't changed: src_entry is still valid */
11789 src_entry = saved_src_entry;
11790 goto VerificationSuccessful;
11791 }
11792
11793 /*
11794 * Simple version comparison failed.
11795 *
11796 * Retry the lookup and verify that the
11797 * same object/offset are still present.
11798 *
11799 * [Note: a memory manager that colludes with
11800 * the calling task can detect that we have
11801 * cheated. While the map was unlocked, the
11802 * mapping could have been changed and restored.]
11803 */
11804
11805 if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
11806 if (result != KERN_MEMORY_RESTART_COPY) {
11807 vm_object_deallocate(VME_OBJECT(new_entry));
11808 VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
11809 /* reset accounting state */
11810 new_entry->iokit_acct = FALSE;
11811 new_entry->use_pmap = TRUE;
11812 }
11813 RETURN(KERN_INVALID_ADDRESS);
11814 }
11815
11816 src_entry = tmp_entry;
11817 vm_map_clip_start(src_map, src_entry, src_start);
11818
11819 if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
11820 !use_maxprot) ||
11821 ((src_entry->max_protection & VM_PROT_READ) == 0)) {
11822 goto VerificationFailed;
11823 }
11824
11825 if (src_entry->vme_end < new_entry->vme_end) {
11826 /*
11827 * This entry might have been shortened
11828 * (vm_map_clip_end) or been replaced with
11829 * an entry that ends closer to "src_start"
11830 * than before.
11831 * Adjust "new_entry" accordingly; copying
11832 * less memory would be correct but we also
11833 * redo the copy (see below) if the new entry
11834 * no longer points at the same object/offset.
11835 */
11836 assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
11837 VM_MAP_COPY_PAGE_MASK(copy)));
11838 new_entry->vme_end = src_entry->vme_end;
11839 src_size = new_entry->vme_end - src_start;
11840 } else if (src_entry->vme_end > new_entry->vme_end) {
11841 /*
11842 * This entry might have been extended
11843 * (vm_map_entry_simplify() or coalesce)
11844 * or been replaced with an entry that ends farther
11845 * from "src_start" than before.
11846 *
11847 * We've called vm_object_copy_*() only on
11848 * the previous <start:end> range, so we can't
11849 * just extend new_entry. We have to re-do
11850 * the copy based on the new entry as if it was
11851 * pointing at a different object/offset (see
11852 * "Verification failed" below).
11853 */
11854 }
11855
11856 if ((VME_OBJECT(src_entry) != src_object) ||
11857 (VME_OFFSET(src_entry) != src_offset) ||
11858 (src_entry->vme_end > new_entry->vme_end)) {
11859 /*
11860 * Verification failed.
11861 *
11862 * Start over with this top-level entry.
11863 */
11864
11865 VerificationFailed: ;
11866
11867 vm_object_deallocate(VME_OBJECT(new_entry));
11868 tmp_entry = src_entry;
11869 continue;
11870 }
11871
11872 /*
11873 * Verification succeeded.
11874 */
11875
11876 VerificationSuccessful:;
11877
11878 if (result == KERN_MEMORY_RESTART_COPY) {
11879 goto RestartCopy;
11880 }
11881
11882 /*
11883 * Copy succeeded.
11884 */
11885
11886 CopySuccessful: ;
11887
11888 /*
11889 * Link in the new copy entry.
11890 */
11891
11892 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
11893 new_entry);
11894
11895 /*
11896 * Determine whether the entire region
11897 * has been copied.
11898 */
11899 src_base = src_start;
11900 src_start = new_entry->vme_end;
11901 new_entry = VM_MAP_ENTRY_NULL;
11902 while ((src_start >= src_end) && (src_end != 0)) {
11903 submap_map_t *ptr;
11904
11905 if (src_map == base_map) {
11906 /* back to the top */
11907 break;
11908 }
11909
11910 ptr = parent_maps;
11911 assert(ptr != NULL);
11912 parent_maps = parent_maps->next;
11913
11914 /* fix up the damage we did in that submap */
11915 vm_map_simplify_range(src_map,
11916 src_base,
11917 src_end);
11918
11919 vm_map_unlock(src_map);
11920 vm_map_deallocate(src_map);
11921 vm_map_lock(ptr->parent_map);
11922 src_map = ptr->parent_map;
11923 src_base = ptr->base_start;
11924 src_start = ptr->base_start + ptr->base_len;
11925 src_end = ptr->base_end;
11926 if (!vm_map_lookup_entry(src_map,
11927 src_start,
11928 &tmp_entry) &&
11929 (src_end > src_start)) {
11930 RETURN(KERN_INVALID_ADDRESS);
11931 }
11932 kfree_type(submap_map_t, ptr);
11933 if (parent_maps == NULL) {
11934 map_share = FALSE;
11935 }
11936 src_entry = tmp_entry->vme_prev;
11937 }
11938
11939 if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
11940 (src_start >= src_addr + len) &&
11941 (src_addr + len != 0)) {
11942 /*
11943 * Stop copying now, even though we haven't reached
11944 * "src_end". We'll adjust the end of the last copy
11945 * entry at the end, if needed.
11946 *
11947 * If src_map's alignment is different from the
11948 * system's page-alignment, there could be
11949 * extra non-map-aligned map entries between
11950 * the original (non-rounded) "src_addr + len"
11951 * and the rounded "src_end".
11952 * We do not want to copy those map entries since
11953 * they're not part of the copied range.
11954 */
11955 break;
11956 }
11957
11958 if ((src_start >= src_end) && (src_end != 0)) {
11959 break;
11960 }
11961
11962 /*
11963 * Verify that there are no gaps in the region
11964 */
11965
11966 tmp_entry = src_entry->vme_next;
11967 if ((tmp_entry->vme_start != src_start) ||
11968 (tmp_entry == vm_map_to_entry(src_map))) {
11969 RETURN(KERN_INVALID_ADDRESS);
11970 }
11971 }
11972
11973 /*
11974 * If the source should be destroyed, do it now, since the
11975 * copy was successful.
11976 */
11977 if (src_destroy) {
11978 (void)vm_map_remove_and_unlock(src_map,
11979 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11980 src_end,
11981 ((src_map == kernel_map) ?
11982 VM_MAP_REMOVE_KUNWIRE :
11983 VM_MAP_REMOVE_NO_FLAGS),
11984 KMEM_GUARD_NONE);
11985 } else {
11986 /* fix up the damage we did in the base map */
11987 vm_map_simplify_range(
11988 src_map,
11989 vm_map_trunc_page(src_addr,
11990 VM_MAP_PAGE_MASK(src_map)),
11991 vm_map_round_page(src_end,
11992 VM_MAP_PAGE_MASK(src_map)));
11993 vm_map_unlock(src_map);
11994 }
11995
11996 tmp_entry = VM_MAP_ENTRY_NULL;
11997
11998 if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
11999 VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12000 vm_map_offset_t original_start, original_offset, original_end;
12001
12002 assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12003
12004 /* adjust alignment of first copy_entry's "vme_start" */
12005 tmp_entry = vm_map_copy_first_entry(copy);
12006 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12007 vm_map_offset_t adjustment;
12008
12009 original_start = tmp_entry->vme_start;
12010 original_offset = VME_OFFSET(tmp_entry);
12011
12012 /* map-align the start of the first copy entry... */
12013 adjustment = (tmp_entry->vme_start -
12014 vm_map_trunc_page(
12015 tmp_entry->vme_start,
12016 VM_MAP_PAGE_MASK(src_map)));
12017 tmp_entry->vme_start -= adjustment;
12018 VME_OFFSET_SET(tmp_entry,
12019 VME_OFFSET(tmp_entry) - adjustment);
12020 copy_addr -= adjustment;
12021 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12022 /* ... adjust for mis-aligned start of copy range */
12023 adjustment =
12024 (vm_map_trunc_page(copy->offset,
12025 PAGE_MASK) -
12026 vm_map_trunc_page(copy->offset,
12027 VM_MAP_PAGE_MASK(src_map)));
12028 if (adjustment) {
12029 assert(page_aligned(adjustment));
12030 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12031 tmp_entry->vme_start += adjustment;
12032 VME_OFFSET_SET(tmp_entry,
12033 (VME_OFFSET(tmp_entry) +
12034 adjustment));
12035 copy_addr += adjustment;
12036 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12037 }
12038
12039 /*
12040 * Assert that the adjustments haven't exposed
12041 * more than was originally copied...
12042 */
12043 assert(tmp_entry->vme_start >= original_start);
12044 assert(VME_OFFSET(tmp_entry) >= original_offset);
12045 /*
12046 * ... and that it did not adjust outside of
12047 * a single 16K page.
12048 */
12049 assert(vm_map_trunc_page(tmp_entry->vme_start,
12050 VM_MAP_PAGE_MASK(src_map)) ==
12051 vm_map_trunc_page(original_start,
12052 VM_MAP_PAGE_MASK(src_map)));
12053 }
12054
12055 /* adjust alignment of last copy_entry's "vme_end" */
12056 tmp_entry = vm_map_copy_last_entry(copy);
12057 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12058 vm_map_offset_t adjustment;
12059
12060 original_end = tmp_entry->vme_end;
12061
12062 /* map-align the end of the last copy entry... */
12063 tmp_entry->vme_end =
12064 vm_map_round_page(tmp_entry->vme_end,
12065 VM_MAP_PAGE_MASK(src_map));
12066 /* ... adjust for mis-aligned end of copy range */
12067 adjustment =
12068 (vm_map_round_page((copy->offset +
12069 copy->size),
12070 VM_MAP_PAGE_MASK(src_map)) -
12071 vm_map_round_page((copy->offset +
12072 copy->size),
12073 PAGE_MASK));
12074 if (adjustment) {
12075 assert(page_aligned(adjustment));
12076 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12077 tmp_entry->vme_end -= adjustment;
12078 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12079 }
12080
12081 /*
12082 * Assert that the adjustments haven't exposed
12083 * more than was originally copied...
12084 */
12085 assert(tmp_entry->vme_end <= original_end);
12086 /*
12087 * ... and that it did not adjust outside of
12088 * a single 16K page.
12089 */
12090 assert(vm_map_round_page(tmp_entry->vme_end,
12091 VM_MAP_PAGE_MASK(src_map)) ==
12092 vm_map_round_page(original_end,
12093 VM_MAP_PAGE_MASK(src_map)));
12094 }
12095 }
12096
12097 /* Fix-up start and end points in copy. This is necessary */
12098 /* when the various entries in the copy object were picked */
12099 /* up from different sub-maps */
12100
12101 tmp_entry = vm_map_copy_first_entry(copy);
12102 copy_size = 0; /* compute actual size */
12103 while (tmp_entry != vm_map_copy_to_entry(copy)) {
12104 assert(VM_MAP_PAGE_ALIGNED(
12105 copy_addr + (tmp_entry->vme_end -
12106 tmp_entry->vme_start),
12107 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12108 assert(VM_MAP_PAGE_ALIGNED(
12109 copy_addr,
12110 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12111
12112 /*
12113 * The copy_entries will be injected directly into the
12114 * destination map and might not be "map aligned" there...
12115 */
12116 tmp_entry->map_aligned = FALSE;
12117
12118 tmp_entry->vme_end = copy_addr +
12119 (tmp_entry->vme_end - tmp_entry->vme_start);
12120 tmp_entry->vme_start = copy_addr;
12121 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12122 copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12123 copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12124 tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12125 }
12126
12127 if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12128 copy_size < copy->size) {
12129 /*
12130 * The actual size of the VM map copy is smaller than what
12131 * was requested by the caller. This must be because some
12132 * PAGE_SIZE-sized pages are missing at the end of the last
12133 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12134 * The caller might not have been aware of those missing
12135 * pages and might not want to be aware of it, which is
12136 * fine as long as they don't try to access (and crash on)
12137 * those missing pages.
12138 * Let's adjust the size of the "copy", to avoid failing
12139 * in vm_map_copyout() or vm_map_copy_overwrite().
12140 */
12141 assert(vm_map_round_page(copy_size,
12142 VM_MAP_PAGE_MASK(src_map)) ==
12143 vm_map_round_page(copy->size,
12144 VM_MAP_PAGE_MASK(src_map)));
12145 copy->size = copy_size;
12146 }
12147
12148 *copy_result = copy;
12149 return KERN_SUCCESS;
12150
12151 #undef RETURN
12152 }
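/*
 * Usage sketch (illustrative; the wrapper names and argument order are
 * assumptions based on the surrounding code): a typical caller pairs a
 * copyin with a copyout into the destination map, discarding the copy
 * object if it cannot be consumed:
 *
 *	vm_map_copy_t    copy;
 *	vm_map_address_t dst_addr;
 *	kern_return_t    kr;
 *
 *	kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
 *	if (kr != KERN_SUCCESS)
 *		return kr;
 *	kr = vm_map_copyout(dst_map, &dst_addr, copy);
 *	if (kr != KERN_SUCCESS)
 *		vm_map_copy_discard(copy);
 *	return kr;
 */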
12153
12154 kern_return_t
12155 vm_map_copy_extract(
12156 vm_map_t src_map,
12157 vm_map_address_t src_addr,
12158 vm_map_size_t len,
12159 boolean_t do_copy,
12160 vm_map_copy_t *copy_result, /* OUT */
12161 vm_prot_t *cur_prot, /* IN/OUT */
12162 vm_prot_t *max_prot, /* IN/OUT */
12163 vm_inherit_t inheritance,
12164 vm_map_kernel_flags_t vmk_flags)
12165 {
12166 vm_map_copy_t copy;
12167 kern_return_t kr;
12168 vm_prot_t required_cur_prot, required_max_prot;
12169
12170 /*
12171 * Check for copies of zero bytes.
12172 */
12173
12174 if (len == 0) {
12175 *copy_result = VM_MAP_COPY_NULL;
12176 return KERN_SUCCESS;
12177 }
12178
12179 /*
12180 * Check that the end address doesn't overflow
12181 */
12182 if (src_addr + len < src_addr) {
12183 return KERN_INVALID_ADDRESS;
12184 }
12185
12186 if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12187 DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12188 }
12189
12190 required_cur_prot = *cur_prot;
12191 required_max_prot = *max_prot;
12192
12193 /*
12194 * Allocate a header element for the list.
12195 *
12196 * Use the start and end in the header to
12197 * remember the endpoints prior to rounding.
12198 */
12199
12200 copy = vm_map_copy_allocate();
12201 copy->type = VM_MAP_COPY_ENTRY_LIST;
12202 copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12203
12204 vm_map_store_init(&copy->cpy_hdr);
12205
12206 copy->offset = 0;
12207 copy->size = len;
12208
12209 kr = vm_map_remap_extract(src_map,
12210 src_addr,
12211 len,
12212 do_copy, /* copy */
12213 &copy->cpy_hdr,
12214 cur_prot, /* IN/OUT */
12215 max_prot, /* IN/OUT */
12216 inheritance,
12217 vmk_flags);
12218 if (kr != KERN_SUCCESS) {
12219 vm_map_copy_discard(copy);
12220 return kr;
12221 }
12222 if (required_cur_prot != VM_PROT_NONE) {
12223 assert((*cur_prot & required_cur_prot) == required_cur_prot);
12224 assert((*max_prot & required_max_prot) == required_max_prot);
12225 }
12226
12227 *copy_result = copy;
12228 return KERN_SUCCESS;
12229 }
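/*
 * Caller sketch (illustrative assumption): the IN/OUT protection
 * arguments carry the protections the caller requires on input and
 * report the protections actually found on output:
 *
 *	vm_map_copy_t copy;
 *	vm_prot_t     cur = VM_PROT_READ;
 *	vm_prot_t     max = VM_PROT_READ;
 *
 *	kr = vm_map_copy_extract(src_map, addr, size, TRUE, &copy,
 *	    &cur, &max, VM_INHERIT_DEFAULT, vmk_flags);
 *
 * On success, "cur" and "max" hold the extracted range's protections,
 * which are asserted above to include the required ones.
 */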
12230
12231 /*
12232 * vm_map_copyin_object:
12233 *
12234 * Create a copy object from an object.
12235 * Our caller donates an object reference.
12236 */
12237
12238 kern_return_t
12239 vm_map_copyin_object(
12240 vm_object_t object,
12241 vm_object_offset_t offset, /* offset of region in object */
12242 vm_object_size_t size, /* size of region in object */
12243 vm_map_copy_t *copy_result) /* OUT */
12244 {
12245 vm_map_copy_t copy; /* Resulting copy */
12246
12247 /*
12248 * We drop the object into a special copy object
12249 * that contains the object directly.
12250 */
12251
12252 copy = vm_map_copy_allocate();
12253 copy->type = VM_MAP_COPY_OBJECT;
12254 copy->cpy_object = object;
12255 copy->offset = offset;
12256 copy->size = size;
12257
12258 *copy_result = copy;
12259 return KERN_SUCCESS;
12260 }
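/*
 * Example use (illustrative sketch): a caller holding an object
 * reference can wrap it directly, donating that reference to the copy:
 *
 *	object = vm_object_allocate(size);
 *	... populate or configure the object ...
 *	kr = vm_map_copyin_object(object, 0, size, &copy);
 *
 * On success, "copy" owns the donated reference and can later be
 * handed to vm_map_copyout() or released with vm_map_copy_discard().
 */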
12261
12262 static void
12263 vm_map_fork_share(
12264 vm_map_t old_map,
12265 vm_map_entry_t old_entry,
12266 vm_map_t new_map)
12267 {
12268 vm_object_t object;
12269 vm_map_entry_t new_entry;
12270
12271 /*
12272 * New sharing code. New map entry
12273 * references original object. Internal
12274 * objects use asynchronous copy algorithm for
12275 * future copies. First make sure we have
12276 * the right object. If we need a shadow,
12277 * or someone else already has one, then
12278 * make a new shadow and share it.
12279 */
12280
12281 if (!old_entry->is_sub_map) {
12282 object = VME_OBJECT(old_entry);
12283 }
12284
12285 if (old_entry->is_sub_map) {
12286 assert(old_entry->wired_count == 0);
12287 #ifndef NO_NESTED_PMAP
12288 if (old_entry->use_pmap) {
12289 kern_return_t result;
12290
12291 result = pmap_nest(new_map->pmap,
12292 (VME_SUBMAP(old_entry))->pmap,
12293 (addr64_t)old_entry->vme_start,
12294 (uint64_t)(old_entry->vme_end - old_entry->vme_start));
12295 if (result) {
12296 panic("vm_map_fork_share: pmap_nest failed!");
12297 }
12298 }
12299 #endif /* NO_NESTED_PMAP */
12300 } else if (object == VM_OBJECT_NULL) {
12301 object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
12302 old_entry->vme_start));
12303 VME_OFFSET_SET(old_entry, 0);
12304 VME_OBJECT_SET(old_entry, object, false, 0);
12305 old_entry->use_pmap = TRUE;
12306 // assert(!old_entry->needs_copy);
12307 } else if (object->copy_strategy !=
12308 MEMORY_OBJECT_COPY_SYMMETRIC) {
12309 /*
12310 * We are already using an asymmetric
12311 * copy, and therefore we already have
12312 * the right object.
12313 */
12314
12315 assert(!old_entry->needs_copy);
12316 } else if (old_entry->needs_copy || /* case 1 */
12317 object->shadowed || /* case 2 */
12318 (!object->true_share && /* case 3 */
12319 !old_entry->is_shared &&
12320 (object->vo_size >
12321 (vm_map_size_t)(old_entry->vme_end -
12322 old_entry->vme_start)))) {
12323 /*
12324 * We need to create a shadow.
12325 * There are three cases here.
12326 * In the first case, we need to
12327 * complete a deferred symmetrical
12328 * copy that we participated in.
12329 * In the second and third cases,
12330 * we need to create the shadow so
12331 * that changes that we make to the
12332 * object do not interfere with
12333 * any symmetrical copies which
12334 * have occurred (case 2) or which
12335 * might occur (case 3).
12336 *
12337 * The first case is when we had
12338 * deferred shadow object creation
12339 * via the entry->needs_copy mechanism.
12340 * This mechanism only works when
12341 * only one entry points to the source
12342 * object, and we are about to create
12343 * a second entry pointing to the
12344 * same object. The problem is that
12345 * there is no way of mapping from
12346 * an object to the entries pointing
12347 * to it. (Deferred shadow creation
12348 * works with one entry because it occurs
12349 * at fault time, and we walk from the
12350 * entry to the object when handling
12351 * the fault.)
12352 *
12353 * The second case is when the object
12354 * to be shared has already been copied
12355 * with a symmetric copy, but we point
12356 * directly to the object without
12357 * needs_copy set in our entry. (This
12358 * can happen because different ranges
12359 * of an object can be pointed to by
12360 * different entries. In particular,
12361 * a single entry pointing to an object
12362 * can be split by a call to vm_inherit,
12363 * which, combined with task_create, can
12364 * result in the different entries
12365 * having different needs_copy values.)
12366 * The shadowed flag in the object allows
12367 * us to detect this case. The problem
12368 * with this case is that if this object
12369 * has or will have shadows, then we
12370 * must not perform an asymmetric copy
12371 * of this object, since such a copy
12372 * allows the object to be changed, which
12373 * will break the previous symmetrical
12374 * copies (which rely upon the object
12375 * not changing). In a sense, the shadowed
12376 * flag says "don't change this object".
12377 * We fix this by creating a shadow
12378 * object for this object, and sharing
12379 * that. This works because we are free
12380 * to change the shadow object (and thus
12381 * to use an asymmetric copy strategy);
12382 * this is also semantically correct,
12383 * since this object is temporary, and
12384 * therefore a copy of the object is
12385 * as good as the object itself. (This
12386 * is not true for permanent objects,
12387 * since the pager needs to see changes,
12388 * which won't happen if the changes
12389 * are made to a copy.)
12390 *
12391 * The third case is when the object
12392 * to be shared has parts sticking
12393 * outside of the entry we're working
12394 * with, and thus may in the future
12395 * be subject to a symmetrical copy.
12396 * (This is a preemptive version of
12397 * case 2.)
12398 */
12399 VME_OBJECT_SHADOW(old_entry,
12400 (vm_map_size_t) (old_entry->vme_end -
12401 old_entry->vme_start));
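/*
 * Resulting setup, sketched (illustrative):
 *
 *	before:  old_entry ----------------------> object O
 *	after:   old_entry --> shadow S --(shadow)--> O
 *
 * New writes through old_entry now land in the shadow S, so O remains
 * stable for any symmetrical copies which have occurred or which
 * might occur.
 */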
12402
12403 /*
12404 * If we're making a shadow for other than
12405 * copy on write reasons, then we have
12406 * to remove write permission.
12407 */
12408
12409 if (!old_entry->needs_copy &&
12410 (old_entry->protection & VM_PROT_WRITE)) {
12411 vm_prot_t prot;
12412
12413 assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));
12414
12415 prot = old_entry->protection & ~VM_PROT_WRITE;
12416
12417 assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));
12418
12419 if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
12420 prot |= VM_PROT_EXECUTE;
12421 }
12422
12423
12424 if (old_map->mapped_in_other_pmaps) {
12425 vm_object_pmap_protect(
12426 VME_OBJECT(old_entry),
12427 VME_OFFSET(old_entry),
12428 (old_entry->vme_end -
12429 old_entry->vme_start),
12430 PMAP_NULL,
12431 PAGE_SIZE,
12432 old_entry->vme_start,
12433 prot);
12434 } else {
12435 pmap_protect(old_map->pmap,
12436 old_entry->vme_start,
12437 old_entry->vme_end,
12438 prot);
12439 }
12440 }
12441
12442 old_entry->needs_copy = FALSE;
12443 object = VME_OBJECT(old_entry);
12444 }
12445
12446
12447 /*
12448 * If object was using a symmetric copy strategy,
12449 * change its copy strategy to the default
12450 * asymmetric copy strategy, which is copy_delay
12451 * in the non-norma case and copy_call in the
12452 * norma case. Bump the reference count for the
12453 * new entry.
12454 */
12455
12456 if (old_entry->is_sub_map) {
12457 vm_map_reference(VME_SUBMAP(old_entry));
12458 } else {
12459 vm_object_lock(object);
12460 vm_object_reference_locked(object);
12461 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
12462 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
12463 }
12464 vm_object_unlock(object);
12465 }
12466
12467 /*
12468 * Clone the entry, using object ref from above.
12469 * Mark both entries as shared.
12470 */
12471
12472 new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
12473 vm_map_entry_copy(old_map, new_entry, old_entry);
12474 old_entry->is_shared = TRUE;
12475 new_entry->is_shared = TRUE;
12476
12477 /*
12478 * We're dealing with a shared mapping, so the resulting mapping
12479 * should inherit some of the original mapping's accounting settings.
12480 * "iokit_acct" should have been cleared in vm_map_entry_copy().
12481 * "use_pmap" should stay the same as before (if it hasn't been reset
12482 * to TRUE when we cleared "iokit_acct").
12483 */
12484 assert(!new_entry->iokit_acct);
12485
12486 /*
12487 * If the old entry's inheritance is VM_INHERIT_NONE,
12488 * the new entry is for a corpse fork; remove the
12489 * write permission from the new entry.
12490 */
12491 if (old_entry->inheritance == VM_INHERIT_NONE) {
12492 new_entry->protection &= ~VM_PROT_WRITE;
12493 new_entry->max_protection &= ~VM_PROT_WRITE;
12494 }
12495
12496 /*
12497 * Insert the entry into the new map -- we
12498 * know we're inserting at the end of the new
12499 * map.
12500 */
12501
12502 vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
12503 VM_MAP_KERNEL_FLAGS_NONE);
12504
12505 /*
12506 * Update the physical map
12507 */
12508
12509 if (old_entry->is_sub_map) {
12510 /* Bill Angell pmap support goes here */
12511 } else {
12512 pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
12513 old_entry->vme_end - old_entry->vme_start,
12514 old_entry->vme_start);
12515 }
12516 }
12517
12518 static boolean_t
12519 vm_map_fork_copy(
12520 vm_map_t old_map,
12521 vm_map_entry_t *old_entry_p,
12522 vm_map_t new_map,
12523 int vm_map_copyin_flags)
12524 {
12525 vm_map_entry_t old_entry = *old_entry_p;
12526 vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
12527 vm_map_offset_t start = old_entry->vme_start;
12528 vm_map_copy_t copy;
12529 vm_map_entry_t last = vm_map_last_entry(new_map);
12530
12531 vm_map_unlock(old_map);
12532 /*
12533 * Use maxprot version of copyin because we
12534 * care about whether this memory can ever
12535 * be accessed, not just whether it's accessible
12536 * right now.
12537 */
12538 vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
12539 if (vm_map_copyin_internal(old_map, start, entry_size,
12540 vm_map_copyin_flags, &copy)
12541 != KERN_SUCCESS) {
12542 /*
12543 * The map might have changed while it
12544 * was unlocked, check it again. Skip
12545 * any blank space or permanently
12546 * unreadable region.
12547 */
12548 vm_map_lock(old_map);
12549 if (!vm_map_lookup_entry(old_map, start, &last) ||
12550 (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
12551 last = last->vme_next;
12552 }
12553 *old_entry_p = last;
12554
12555 /*
12556 * XXX For some error returns, want to
12557 * XXX skip to the next element. Note
12558 * that INVALID_ADDRESS and
12559 * PROTECTION_FAILURE are handled above.
12560 */
12561
12562 return FALSE;
12563 }
12564
12565 /*
12566 * Assert that the vm_map_copy is coming from the right
12567 * zone and hasn't been forged
12568 */
12569 vm_map_copy_require(copy);
12570
12571 /*
12572 * Insert the copy into the new map
12573 */
12574 vm_map_copy_insert(new_map, last, copy);
12575
12576 /*
12577 * Pick up the traversal at the end of
12578 * the copied region.
12579 */
12580
12581 vm_map_lock(old_map);
12582 start += entry_size;
12583 if (!vm_map_lookup_entry(old_map, start, &last)) {
12584 last = last->vme_next;
12585 } else {
12586 if (last->vme_start == start) {
12587 /*
12588 * No need to clip here and we don't
12589 * want to cause any unnecessary
12590 * unnesting...
12591 */
12592 } else {
12593 vm_map_clip_start(old_map, last, start);
12594 }
12595 }
12596 *old_entry_p = last;
12597
12598 return TRUE;
12599 }
12600
12601 /*
12602 * vm_map_fork:
12603 *
12604 * Create and return a new map based on the old
12605 * map, according to the inheritance values on the
12606 * regions in that map and the options.
12607 *
12608 * The source map must not be locked.
12609 */
12610 vm_map_t
12611 vm_map_fork(
12612 ledger_t ledger,
12613 vm_map_t old_map,
12614 int options)
12615 {
12616 pmap_t new_pmap;
12617 vm_map_t new_map;
12618 vm_map_entry_t old_entry;
12619 vm_map_size_t new_size = 0, entry_size;
12620 vm_map_entry_t new_entry;
12621 boolean_t src_needs_copy;
12622 boolean_t new_entry_needs_copy;
12623 boolean_t pmap_is64bit;
12624 int vm_map_copyin_flags;
12625 vm_inherit_t old_entry_inheritance;
12626 int map_create_options;
12627 kern_return_t footprint_collect_kr;
12628
12629 if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
12630 VM_MAP_FORK_PRESERVE_PURGEABLE |
12631 VM_MAP_FORK_CORPSE_FOOTPRINT)) {
12632 /* unsupported option */
12633 return VM_MAP_NULL;
12634 }
12635
12636 pmap_is64bit =
12637 #if defined(__i386__) || defined(__x86_64__)
12638 old_map->pmap->pm_task_map != TASK_MAP_32BIT;
12639 #elif defined(__arm64__)
12640 old_map->pmap->is_64bit;
12641 #elif defined(__arm__)
12642 FALSE;
12643 #else
12644 #error Unknown architecture.
12645 #endif
12646
12647 unsigned int pmap_flags = 0;
12648 pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
12649 #if defined(HAS_APPLE_PAC)
12650 pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
12651 #endif
12652 #if PMAP_CREATE_FORCE_4K_PAGES
12653 if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
12654 PAGE_SIZE != FOURK_PAGE_SIZE) {
12655 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
12656 }
12657 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
12658 new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
12659 if (new_pmap == NULL) {
12660 return VM_MAP_NULL;
12661 }
12662
12663 vm_map_reference(old_map);
12664 vm_map_lock(old_map);
12665
12666 map_create_options = 0;
12667 if (old_map->hdr.entries_pageable) {
12668 map_create_options |= VM_MAP_CREATE_PAGEABLE;
12669 }
12670 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
12671 map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
12672 footprint_collect_kr = KERN_SUCCESS;
12673 }
12674 new_map = vm_map_create_options(new_pmap,
12675 old_map->min_offset,
12676 old_map->max_offset,
12677 map_create_options);
12678 /* inherit cs_enforcement */
12679 vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
12680 vm_map_lock(new_map);
12681 vm_commit_pagezero_status(new_map);
12682 /* inherit the parent map's page size */
12683 vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
12684
12685 /* ensure PMAP_CS structures are prepared for the fork */
12686 pmap_cs_fork_prepare(old_map->pmap, new_pmap);
12687
12688 for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
12689 /*
12690 * Abort any corpse collection if the system is shutting down.
12691 */
12692 if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
12693 get_system_inshutdown()) {
12694 vm_map_corpse_footprint_collect_done(new_map);
12695 vm_map_unlock(new_map);
12696 vm_map_unlock(old_map);
12697 vm_map_deallocate(new_map);
12698 vm_map_deallocate(old_map);
12699 printf("Aborting corpse map due to system shutdown\n");
12700 return VM_MAP_NULL;
12701 }
12702
12703 entry_size = old_entry->vme_end - old_entry->vme_start;
12704
12705 old_entry_inheritance = old_entry->inheritance;
12706 /*
12707 * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
12708 * share VM_INHERIT_NONE entries that are not backed by a
12709 * device pager.
12710 */
12711 if (old_entry_inheritance == VM_INHERIT_NONE &&
12712 (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
12713 (old_entry->protection & VM_PROT_READ) &&
12714 !(!old_entry->is_sub_map &&
12715 VME_OBJECT(old_entry) != NULL &&
12716 VME_OBJECT(old_entry)->pager != NULL &&
12717 is_device_pager_ops(
12718 VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
12719 old_entry_inheritance = VM_INHERIT_SHARE;
12720 }
12721
12722 if (old_entry_inheritance != VM_INHERIT_NONE &&
12723 (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
12724 footprint_collect_kr == KERN_SUCCESS) {
12725 /*
12726 * The corpse won't have old_map->pmap to query
12727 * footprint information, so collect that data now
12728 * and store it in new_map->vmmap_corpse_footprint
12729 * for later autopsy.
12730 */
12731 footprint_collect_kr =
12732 vm_map_corpse_footprint_collect(old_map,
12733 old_entry,
12734 new_map);
12735 }
12736
12737 switch (old_entry_inheritance) {
12738 case VM_INHERIT_NONE:
12739 break;
12740
12741 case VM_INHERIT_SHARE:
12742 vm_map_fork_share(old_map, old_entry, new_map);
12743 new_size += entry_size;
12744 break;
12745
12746 case VM_INHERIT_COPY:
12747
12748 /*
12749 * Inline the copy_quickly case;
12750 * upon failure, fall back on call
12751 * to vm_map_fork_copy.
12752 */
12753
12754 if (old_entry->is_sub_map) {
12755 break;
12756 }
12757 if ((old_entry->wired_count != 0) ||
12758 ((VME_OBJECT(old_entry) != NULL) &&
12759 (VME_OBJECT(old_entry)->true_share))) {
12760 goto slow_vm_map_fork_copy;
12761 }
12762
12763 new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
12764 vm_map_entry_copy(old_map, new_entry, old_entry);
12765 if (old_entry->permanent) {
12766 /* inherit "permanent" on fork() */
12767 new_entry->permanent = TRUE;
12768 }
12769
12770 if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
12771 new_map->jit_entry_exists = TRUE;
12772 }
12773
12774 if (new_entry->is_sub_map) {
12775 /* clear address space specifics */
12776 new_entry->use_pmap = FALSE;
12777 } else {
12778 /*
12779 * We're dealing with a copy-on-write operation,
12780 * so the resulting mapping should not inherit
12781 * the original mapping's accounting settings.
12782 * "iokit_acct" should have been cleared in
12783 * vm_map_entry_copy().
12784 * "use_pmap" should be reset to its default
12785 * (TRUE) so that the new mapping gets
12786 * accounted for in the task's memory footprint.
12787 */
12788 assert(!new_entry->iokit_acct);
12789 new_entry->use_pmap = TRUE;
12790 }
12791
12792 if (!vm_object_copy_quickly(
12793 VME_OBJECT(new_entry),
12794 VME_OFFSET(old_entry),
12795 (old_entry->vme_end -
12796 old_entry->vme_start),
12797 &src_needs_copy,
12798 &new_entry_needs_copy)) {
12799 vm_map_entry_dispose(new_entry);
12800 goto slow_vm_map_fork_copy;
12801 }
12802
12803 /*
12804 * Handle copy-on-write obligations
12805 */
12806
12807 if (src_needs_copy && !old_entry->needs_copy) {
12808 vm_prot_t prot;
12809
12810 assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));
12811
12812 prot = old_entry->protection & ~VM_PROT_WRITE;
12813
12814 if (override_nx(old_map, VME_ALIAS(old_entry))
12815 && prot) {
12816 prot |= VM_PROT_EXECUTE;
12817 }
12818
12819 assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));
12820
12821 vm_object_pmap_protect(
12822 VME_OBJECT(old_entry),
12823 VME_OFFSET(old_entry),
12824 (old_entry->vme_end -
12825 old_entry->vme_start),
12826 ((old_entry->is_shared
12827 || old_map->mapped_in_other_pmaps)
12828 ? PMAP_NULL :
12829 old_map->pmap),
12830 VM_MAP_PAGE_SIZE(old_map),
12831 old_entry->vme_start,
12832 prot);
12833
12834 assert(old_entry->wired_count == 0);
12835 old_entry->needs_copy = TRUE;
12836 }
12837 new_entry->needs_copy = new_entry_needs_copy;
12838
12839 /*
12840 * Insert the entry at the end
12841 * of the map.
12842 */
12843
12844 vm_map_store_entry_link(new_map,
12845 vm_map_last_entry(new_map),
12846 new_entry,
12847 VM_MAP_KERNEL_FLAGS_NONE);
12848 new_size += entry_size;
12849 break;
12850
12851 slow_vm_map_fork_copy:
12852 vm_map_copyin_flags = 0;
12853 if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
12854 vm_map_copyin_flags |=
12855 VM_MAP_COPYIN_PRESERVE_PURGEABLE;
12856 }
12857 if (vm_map_fork_copy(old_map,
12858 &old_entry,
12859 new_map,
12860 vm_map_copyin_flags)) {
12861 new_size += entry_size;
12862 }
12863 continue;
12864 }
12865 old_entry = old_entry->vme_next;
12866 }
12867
12868 #if defined(__arm64__)
12869 pmap_insert_sharedpage(new_map->pmap);
12870 #endif /* __arm64__ */
12871
12872 new_map->size = new_size;
12873
12874 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
12875 vm_map_corpse_footprint_collect_done(new_map);
12876 }
12877
12878 /* Propagate JIT entitlement for the pmap layer. */
12879 if (pmap_get_jit_entitled(old_map->pmap)) {
12880 /* Tell the pmap that it supports JIT. */
12881 pmap_set_jit_entitled(new_map->pmap);
12882 }
12883
12884 vm_map_unlock(new_map);
12885 vm_map_unlock(old_map);
12886 vm_map_deallocate(old_map);
12887
12888 return new_map;
12889 }
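/*
 * Caller sketch (illustrative; the exact call site and variable names
 * are assumptions): the task-fork path typically builds the child's
 * map from the parent's, optionally collecting corpse footprint data:
 *
 *	new_map = vm_map_fork(child_ledger, parent_map,
 *	    is_corpse ? VM_MAP_FORK_CORPSE_FOOTPRINT : 0);
 *	if (new_map == VM_MAP_NULL)
 *		... fail the fork ...
 */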
12890
12891 /*
12892 * vm_map_exec:
12893 *
12894 * Setup the "new_map" with the proper execution environment according
12895 * to the type of executable (platform, 64bit, chroot environment).
12896 * Map the comm page and shared region, etc...
12897 */
12898 kern_return_t
12899 vm_map_exec(
12900 vm_map_t new_map,
12901 task_t task,
12902 boolean_t is64bit,
12903 void *fsroot,
12904 cpu_type_t cpu,
12905 cpu_subtype_t cpu_subtype,
12906 boolean_t reslide,
12907 boolean_t is_driverkit)
12908 {
12909 SHARED_REGION_TRACE_DEBUG(
12910 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
12911 (void *)VM_KERNEL_ADDRPERM(current_task()),
12912 (void *)VM_KERNEL_ADDRPERM(new_map),
12913 (void *)VM_KERNEL_ADDRPERM(task),
12914 (void *)VM_KERNEL_ADDRPERM(fsroot),
12915 cpu,
12916 cpu_subtype));
12917 (void) vm_commpage_enter(new_map, task, is64bit);
12918
12919 (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit);
12920
12921 SHARED_REGION_TRACE_DEBUG(
12922 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
12923 (void *)VM_KERNEL_ADDRPERM(current_task()),
12924 (void *)VM_KERNEL_ADDRPERM(new_map),
12925 (void *)VM_KERNEL_ADDRPERM(task),
12926 (void *)VM_KERNEL_ADDRPERM(fsroot),
12927 cpu,
12928 cpu_subtype));
12929
12930 /*
12931 * Some devices have region(s) of memory that shouldn't get allocated by
12932 * user processes. The following code creates dummy vm_map_entry_t's for each
12933 * of the regions that need to be reserved to prevent any allocations in
12934 * those regions.
12935 */
12936 kern_return_t kr = KERN_FAILURE;
12937 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
12938 vmk_flags.vmkf_permanent = TRUE;
12939 vmk_flags.vmkf_beyond_max = TRUE;
12940
12941 struct vm_reserved_region *regions = NULL;
12942 size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
12943 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
12944
12945 for (size_t i = 0; i < num_regions; ++i) {
12946 kr = vm_map_enter(
12947 new_map,
12948 &regions[i].vmrr_addr,
12949 regions[i].vmrr_size,
12950 (vm_map_offset_t)0,
12951 VM_FLAGS_FIXED,
12952 vmk_flags,
12953 VM_KERN_MEMORY_NONE,
12954 VM_OBJECT_NULL,
12955 (vm_object_offset_t)0,
12956 FALSE,
12957 VM_PROT_NONE,
12958 VM_PROT_NONE,
12959 VM_INHERIT_COPY);
12960
12961 if (kr != KERN_SUCCESS) {
12962 panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
12963 }
12964 }
12965
12966 new_map->reserved_regions = (num_regions ? TRUE : FALSE);
12967
12968 return KERN_SUCCESS;
12969 }
12970
12971 uint64_t vm_map_lookup_locked_copy_slowly_count = 0;
12972 uint64_t vm_map_lookup_locked_copy_slowly_size = 0;
12973 uint64_t vm_map_lookup_locked_copy_slowly_max = 0;
12974 uint64_t vm_map_lookup_locked_copy_slowly_restart = 0;
12975 uint64_t vm_map_lookup_locked_copy_slowly_error = 0;
12976 uint64_t vm_map_lookup_locked_copy_strategically_count = 0;
12977 uint64_t vm_map_lookup_locked_copy_strategically_size = 0;
12978 uint64_t vm_map_lookup_locked_copy_strategically_max = 0;
12979 uint64_t vm_map_lookup_locked_copy_strategically_restart = 0;
12980 uint64_t vm_map_lookup_locked_copy_strategically_error = 0;
12981 uint64_t vm_map_lookup_locked_copy_shadow_count = 0;
12982 uint64_t vm_map_lookup_locked_copy_shadow_size = 0;
12983 uint64_t vm_map_lookup_locked_copy_shadow_max = 0;
12984 /*
12985 * vm_map_lookup_locked:
12986 *
12987 * Finds the VM object, offset, and
12988 * protection for a given virtual address in the
12989 * specified map, assuming a page fault of the
12990 * type specified.
12991 *
12992 * Returns the (object, offset, protection) for
12993 * this address, whether it is wired down, and whether
12994 * this map has the only reference to the data in question.
12995 * In order to later verify this lookup, a "version"
12996 * is returned.
12997 * If contended != NULL, *contended will be set to
12998 * true iff the thread had to spin or block to acquire
12999 * an exclusive lock.
13000 *
13001 * The map MUST be locked by the caller and WILL be
13002 * locked on exit. In order to guarantee the
13003 * existence of the returned object, it is returned
13004 * locked.
13005 *
13006 * If a lookup is requested with "write protection"
13007 * specified, the map may be changed to perform virtual
13008 * copying operations, although the data referenced will
13009 * remain the same.
13010 */
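/*
 * Typical use from the fault path, sketched (illustrative; the helper
 * names are assumptions):
 *
 *	vm_map_lock_read(map);
 *	kr = vm_map_lookup_locked(&map, vaddr, VM_PROT_READ,
 *	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset,
 *	    &prot, &wired, &fault_info, &real_map, &contended);
 *	... resolve the fault against (object, offset), which may
 *	    require dropping the map lock ...
 *	if (!vm_map_verify(map, &version))
 *		... the map changed while unlocked: redo the lookup ...
 *
 * The returned "version" is what lets the caller detect concurrent
 * changes to the map after the lock has been dropped.
 */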
13011 kern_return_t
13012 vm_map_lookup_locked(
13013 vm_map_t *var_map, /* IN/OUT */
13014 vm_map_offset_t vaddr,
13015 vm_prot_t fault_type,
13016 int object_lock_type,
13017 vm_map_version_t *out_version, /* OUT */
13018 vm_object_t *object, /* OUT */
13019 vm_object_offset_t *offset, /* OUT */
13020 vm_prot_t *out_prot, /* OUT */
13021 boolean_t *wired, /* OUT */
13022 vm_object_fault_info_t fault_info, /* OUT */
13023 vm_map_t *real_map, /* OUT */
13024 bool *contended) /* OUT */
13025 {
13026 vm_map_entry_t entry;
13027 vm_map_t map = *var_map;
13028 vm_map_t old_map = *var_map;
13029 vm_map_t cow_sub_map_parent = VM_MAP_NULL;
13030 vm_map_offset_t cow_parent_vaddr = 0;
13031 vm_map_offset_t old_start = 0;
13032 vm_map_offset_t old_end = 0;
13033 vm_prot_t prot;
13034 boolean_t mask_protections;
13035 boolean_t force_copy;
13036 boolean_t no_force_copy_if_executable;
13037 boolean_t submap_needed_copy;
13038 vm_prot_t original_fault_type;
13039 vm_map_size_t fault_page_mask;
13040
13041 /*
13042 * VM_PROT_IS_MASK means that the caller wants us to use "fault_type"
13043 * as a mask against the mapping's actual protections, not as an
13044 * absolute value.
13045 */
13046 mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13047 force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13048 no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13049 fault_type &= VM_PROT_ALL;
13050 original_fault_type = fault_type;
13051 if (contended) {
13052 *contended = false;
13053 }
13054
13055 *real_map = map;
13056
13057 fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13058 vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13059
13060 RetryLookup:
13061 fault_type = original_fault_type;
13062
13063 /*
13064 * If the map has an interesting hint, try it before calling
13065 * full blown lookup routine.
13066 */
13067 entry = map->hint;
13068
13069 if ((entry == vm_map_to_entry(map)) ||
13070 (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13071 vm_map_entry_t tmp_entry;
13072
13073 /*
13074 * Entry was either not a valid hint, or the vaddr
13075 * was not contained in the entry, so do a full lookup.
13076 */
13077 if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13078 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13079 vm_map_unlock(cow_sub_map_parent);
13080 }
13081 if ((*real_map != map)
13082 && (*real_map != cow_sub_map_parent)) {
13083 vm_map_unlock(*real_map);
13084 }
13085 return KERN_INVALID_ADDRESS;
13086 }
13087
13088 entry = tmp_entry;
13089 }
13090 if (map == old_map) {
13091 old_start = entry->vme_start;
13092 old_end = entry->vme_end;
13093 }
13094
13095 /*
13096 * Handle submaps. Drop lock on upper map, submap is
13097 * returned locked.
13098 */
13099
13100 submap_needed_copy = FALSE;
13101 submap_recurse:
13102 if (entry->is_sub_map) {
13103 vm_map_offset_t local_vaddr;
13104 vm_map_offset_t end_delta;
13105 vm_map_offset_t start_delta;
13106 vm_map_entry_t submap_entry, saved_submap_entry;
13107 vm_object_offset_t submap_entry_offset;
13108 vm_object_size_t submap_entry_size;
13109 vm_prot_t subentry_protection;
13110 vm_prot_t subentry_max_protection;
13111 boolean_t subentry_no_copy_on_read;
13112 boolean_t mapped_needs_copy = FALSE;
13113 vm_map_version_t version;
13114
13115 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13116 "map %p (%d) entry %p submap %p (%d)\n",
13117 map, VM_MAP_PAGE_SHIFT(map), entry,
13118 VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
13119
13120 local_vaddr = vaddr;
13121
13122 if ((entry->use_pmap &&
13123 !((fault_type & VM_PROT_WRITE) ||
13124 force_copy))) {
13125 /* if real_map equals map we unlock below */
13126 if ((*real_map != map) &&
13127 (*real_map != cow_sub_map_parent)) {
13128 vm_map_unlock(*real_map);
13129 }
13130 *real_map = VME_SUBMAP(entry);
13131 }
13132
13133 if (entry->needs_copy &&
13134 ((fault_type & VM_PROT_WRITE) ||
13135 force_copy)) {
13136 if (!mapped_needs_copy) {
13137 if (vm_map_lock_read_to_write(map)) {
13138 vm_map_lock_read(map);
13139 *real_map = map;
13140 goto RetryLookup;
13141 }
13142 vm_map_lock_read(VME_SUBMAP(entry));
13143 *var_map = VME_SUBMAP(entry);
13144 cow_sub_map_parent = map;
13145 /* reset base to map before cow object */
13146 /* this is the map which will accept */
13147 /* the new cow object */
13148 old_start = entry->vme_start;
13149 old_end = entry->vme_end;
13150 cow_parent_vaddr = vaddr;
13151 mapped_needs_copy = TRUE;
13152 } else {
13153 vm_map_lock_read(VME_SUBMAP(entry));
13154 *var_map = VME_SUBMAP(entry);
13155 if ((cow_sub_map_parent != map) &&
13156 (*real_map != map)) {
13157 vm_map_unlock(map);
13158 }
13159 }
13160 } else {
13161 if (entry->needs_copy) {
13162 submap_needed_copy = TRUE;
13163 }
13164 vm_map_lock_read(VME_SUBMAP(entry));
13165 *var_map = VME_SUBMAP(entry);
13166 /* leave the map locked if it is the target */
13167 /* cow sub_map from above; otherwise, just */
13168 /* follow the maps down to the object. */
13169 /* Here we unlock knowing we are not */
13170 /* revisiting the map. */
13171 if ((*real_map != map) && (map != cow_sub_map_parent)) {
13172 vm_map_unlock_read(map);
13173 }
13174 }
13175
13176 map = *var_map;
13177
13178 /* calculate the offset in the submap for vaddr */
13179 local_vaddr = (local_vaddr - entry->vme_start) + VME_OFFSET(entry);
13180 assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
13181 "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
13182 (uint64_t)local_vaddr, (uint64_t)entry->vme_start, (uint64_t)fault_page_mask);
13183
13184 RetrySubMap:
13185 if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
13186 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13187 vm_map_unlock(cow_sub_map_parent);
13188 }
13189 if ((*real_map != map)
13190 && (*real_map != cow_sub_map_parent)) {
13191 vm_map_unlock(*real_map);
13192 }
13193 *real_map = map;
13194 return KERN_INVALID_ADDRESS;
13195 }
13196
13197 /* find the attenuated shadow of the underlying object */
13198 /* on our target map */
13199
13200 /* In English: the submap object may extend beyond the */
13201 /* region mapped by the entry, or may only fill a portion */
13202 /* of it. For our purposes, we only care if the object */
13203 /* doesn't fill it. In that case the area which will */
13204 /* ultimately be clipped in the top map only needs */
13205 /* to be as big as the portion of the underlying entry */
13206 /* which is mapped. */
13207 start_delta = submap_entry->vme_start > VME_OFFSET(entry) ?
13208 submap_entry->vme_start - VME_OFFSET(entry) : 0;
13209
13210 end_delta =
13211 (VME_OFFSET(entry) + start_delta + (old_end - old_start)) <=
13212 submap_entry->vme_end ?
13213 0 : (VME_OFFSET(entry) +
13214 (old_end - old_start))
13215 - submap_entry->vme_end;
13216
13217 old_start += start_delta;
13218 old_end -= end_delta;
13219
13220 if (submap_entry->is_sub_map) {
13221 entry = submap_entry;
13222 vaddr = local_vaddr;
13223 goto submap_recurse;
13224 }
13225
13226 if (((fault_type & VM_PROT_WRITE) ||
13227 force_copy)
13228 && cow_sub_map_parent) {
13229 vm_object_t sub_object, copy_object;
13230 vm_object_offset_t copy_offset;
13231 vm_map_offset_t local_start;
13232 vm_map_offset_t local_end;
13233 boolean_t object_copied = FALSE;
13234 vm_object_offset_t object_copied_offset = 0;
13235 boolean_t object_copied_needs_copy = FALSE;
13236 kern_return_t kr = KERN_SUCCESS;
13237
13238 if (vm_map_lock_read_to_write(map)) {
13239 vm_map_lock_read(map);
13240 old_start -= start_delta;
13241 old_end += end_delta;
13242 goto RetrySubMap;
13243 }
13244
13245
13246 sub_object = VME_OBJECT(submap_entry);
13247 if (sub_object == VM_OBJECT_NULL) {
13248 sub_object =
13249 vm_object_allocate(
13250 (vm_map_size_t)
13251 (submap_entry->vme_end -
13252 submap_entry->vme_start));
13253 VME_OBJECT_SET(submap_entry, sub_object, false, 0);
13254 VME_OFFSET_SET(submap_entry, 0);
13255 assert(!submap_entry->is_sub_map);
13256 assert(submap_entry->use_pmap);
13257 }
13258 local_start = local_vaddr -
13259 (cow_parent_vaddr - old_start);
13260 local_end = local_vaddr +
13261 (old_end - cow_parent_vaddr);
13262 vm_map_clip_start(map, submap_entry, local_start);
13263 vm_map_clip_end(map, submap_entry, local_end);
13264 if (submap_entry->is_sub_map) {
13265 /* unnesting was done when clipping */
13266 assert(!submap_entry->use_pmap);
13267 }
13268
13269 /* This is the COW case, lets connect */
13270 /* an entry in our space to the underlying */
13271 /* object in the submap, bypassing the */
13272 /* submap. */
13273 submap_entry_offset = VME_OFFSET(submap_entry);
13274 submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
13275
13276 if ((submap_entry->wired_count != 0 ||
13277 sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
13278 (submap_entry->protection & VM_PROT_EXECUTE) &&
13279 no_force_copy_if_executable) {
13280 // printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
13281 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13282 vm_map_unlock(cow_sub_map_parent);
13283 }
13284 if ((*real_map != map)
13285 && (*real_map != cow_sub_map_parent)) {
13286 vm_map_unlock(*real_map);
13287 }
13288 *real_map = map;
13289 kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
13290 vm_map_lock_write_to_read(map);
13291 kr = KERN_PROTECTION_FAILURE;
13292 DTRACE_VM4(submap_no_copy_executable,
13293 vm_map_t, map,
13294 vm_object_offset_t, submap_entry_offset,
13295 vm_object_size_t, submap_entry_size,
13296 int, kr);
13297 return kr;
13298 }
13299
13300 if (submap_entry->wired_count != 0) {
13301 vm_object_reference(sub_object);
13302
13303 assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
13304 "submap_entry %p offset 0x%llx\n",
13305 submap_entry, VME_OFFSET(submap_entry));
13306
13307 DTRACE_VM6(submap_copy_slowly,
13308 vm_map_t, cow_sub_map_parent,
13309 vm_map_offset_t, vaddr,
13310 vm_map_t, map,
13311 vm_object_size_t, submap_entry_size,
13312 int, submap_entry->wired_count,
13313 int, sub_object->copy_strategy);
13314
13315 saved_submap_entry = submap_entry;
13316 version.main_timestamp = map->timestamp;
13317 vm_map_unlock(map); /* Increments timestamp by 1 */
13318 submap_entry = VM_MAP_ENTRY_NULL;
13319
13320 vm_object_lock(sub_object);
13321 kr = vm_object_copy_slowly(sub_object,
13322 submap_entry_offset,
13323 submap_entry_size,
13324 FALSE,
13325 				    &copy_object);
13326 object_copied = TRUE;
13327 object_copied_offset = 0;
13328 /* 4k: account for extra offset in physical page */
13329 object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
13330 object_copied_needs_copy = FALSE;
13331 vm_object_deallocate(sub_object);
13332
13333 vm_map_lock(map);
13334
13335 if (kr != KERN_SUCCESS &&
13336 kr != KERN_MEMORY_RESTART_COPY) {
13337 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13338 vm_map_unlock(cow_sub_map_parent);
13339 }
13340 if ((*real_map != map)
13341 && (*real_map != cow_sub_map_parent)) {
13342 vm_map_unlock(*real_map);
13343 }
13344 *real_map = map;
13345 vm_object_deallocate(copy_object);
13346 copy_object = VM_OBJECT_NULL;
13347 kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
13348 vm_map_lock_write_to_read(map);
13349 DTRACE_VM4(submap_copy_error_slowly,
13350 vm_object_t, sub_object,
13351 vm_object_offset_t, submap_entry_offset,
13352 vm_object_size_t, submap_entry_size,
13353 int, kr);
13354 vm_map_lookup_locked_copy_slowly_error++;
13355 return kr;
13356 }
13357
13358 if ((kr == KERN_SUCCESS) &&
13359 (version.main_timestamp + 1) == map->timestamp) {
13360 submap_entry = saved_submap_entry;
13361 } else {
13362 saved_submap_entry = NULL;
13363 old_start -= start_delta;
13364 old_end += end_delta;
13365 vm_object_deallocate(copy_object);
13366 copy_object = VM_OBJECT_NULL;
13367 vm_map_lock_write_to_read(map);
13368 vm_map_lookup_locked_copy_slowly_restart++;
13369 goto RetrySubMap;
13370 }
13371 vm_map_lookup_locked_copy_slowly_count++;
13372 vm_map_lookup_locked_copy_slowly_size += submap_entry_size;
13373 if (submap_entry_size > vm_map_lookup_locked_copy_slowly_max) {
13374 vm_map_lookup_locked_copy_slowly_max = submap_entry_size;
13375 }
13376 } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
13377 submap_entry_offset = VME_OFFSET(submap_entry);
13378 copy_object = VM_OBJECT_NULL;
13379 object_copied_offset = submap_entry_offset;
13380 object_copied_needs_copy = FALSE;
13381 DTRACE_VM6(submap_copy_strategically,
13382 vm_map_t, cow_sub_map_parent,
13383 vm_map_offset_t, vaddr,
13384 vm_map_t, map,
13385 vm_object_size_t, submap_entry_size,
13386 int, submap_entry->wired_count,
13387 int, sub_object->copy_strategy);
13388 kr = vm_object_copy_strategically(
13389 sub_object,
13390 submap_entry_offset,
13391 submap_entry->vme_end - submap_entry->vme_start,
13392 					&copy_object,
13393 &object_copied_offset,
13394 &object_copied_needs_copy);
13395 if (kr == KERN_MEMORY_RESTART_COPY) {
13396 old_start -= start_delta;
13397 old_end += end_delta;
13398 vm_object_deallocate(copy_object);
13399 copy_object = VM_OBJECT_NULL;
13400 vm_map_lock_write_to_read(map);
13401 vm_map_lookup_locked_copy_strategically_restart++;
13402 goto RetrySubMap;
13403 }
13404 if (kr != KERN_SUCCESS) {
13405 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13406 vm_map_unlock(cow_sub_map_parent);
13407 }
13408 if ((*real_map != map)
13409 && (*real_map != cow_sub_map_parent)) {
13410 vm_map_unlock(*real_map);
13411 }
13412 *real_map = map;
13413 vm_object_deallocate(copy_object);
13414 copy_object = VM_OBJECT_NULL;
13415 kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
13416 vm_map_lock_write_to_read(map);
13417 DTRACE_VM4(submap_copy_error_strategically,
13418 vm_object_t, sub_object,
13419 vm_object_offset_t, submap_entry_offset,
13420 vm_object_size_t, submap_entry_size,
13421 int, kr);
13422 vm_map_lookup_locked_copy_strategically_error++;
13423 return kr;
13424 }
13425 assert(copy_object != VM_OBJECT_NULL);
13426 assert(copy_object != sub_object);
13427 object_copied = TRUE;
13428 vm_map_lookup_locked_copy_strategically_count++;
13429 vm_map_lookup_locked_copy_strategically_size += submap_entry_size;
13430 if (submap_entry_size > vm_map_lookup_locked_copy_strategically_max) {
13431 vm_map_lookup_locked_copy_strategically_max = submap_entry_size;
13432 }
13433 } else {
13434 /* set up shadow object */
13435 object_copied = FALSE;
13436 copy_object = sub_object;
13437 vm_object_lock(sub_object);
13438 vm_object_reference_locked(sub_object);
13439 sub_object->shadowed = TRUE;
13440 vm_object_unlock(sub_object);
13441
13442 assert(submap_entry->wired_count == 0);
13443 submap_entry->needs_copy = TRUE;
13444
13445 prot = submap_entry->protection;
13446 assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13447 prot = prot & ~VM_PROT_WRITE;
13448 assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
13449
13450 if (override_nx(old_map,
13451 VME_ALIAS(submap_entry))
13452 && prot) {
13453 prot |= VM_PROT_EXECUTE;
13454 }
13455
13456 vm_object_pmap_protect(
13457 sub_object,
13458 VME_OFFSET(submap_entry),
13459 submap_entry->vme_end -
13460 submap_entry->vme_start,
13461 (submap_entry->is_shared
13462 || map->mapped_in_other_pmaps) ?
13463 PMAP_NULL : map->pmap,
13464 VM_MAP_PAGE_SIZE(map),
13465 submap_entry->vme_start,
13466 prot);
13467 vm_map_lookup_locked_copy_shadow_count++;
13468 vm_map_lookup_locked_copy_shadow_size += submap_entry_size;
13469 if (submap_entry_size > vm_map_lookup_locked_copy_shadow_max) {
13470 vm_map_lookup_locked_copy_shadow_max = submap_entry_size;
13471 }
13472 }
13473
13474 /*
13475 * Adjust the fault offset to the submap entry.
13476 */
13477 copy_offset = (local_vaddr -
13478 submap_entry->vme_start +
13479 VME_OFFSET(submap_entry));
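		/*
		 * E.g. (hypothetical numbers): with submap_entry covering
		 * [0x3000, 0x5000) at object offset 0 and local_vaddr =
		 * 0x3800, copy_offset becomes 0x3800 - 0x3000 + 0 = 0x800
		 * into the (possibly new) copy object.
		 */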
13480
13481 		/* This works differently from the */
13482 		/* normal submap case. We go back  */
13483 		/* to the parent of the cow map and */
13484 		/* clip out the target portion of   */
13485 		/* the sub_map, substituting the    */
13486 		/* new copy object.                 */
13487
13488 subentry_protection = submap_entry->protection;
13489 subentry_max_protection = submap_entry->max_protection;
13490 subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
13491 vm_map_unlock(map);
13492 submap_entry = NULL; /* not valid after map unlock */
13493
13494 local_start = old_start;
13495 local_end = old_end;
13496 map = cow_sub_map_parent;
13497 *var_map = cow_sub_map_parent;
13498 vaddr = cow_parent_vaddr;
13499 cow_sub_map_parent = NULL;
13500
13501 if (!vm_map_lookup_entry(map,
13502 vaddr, &entry)) {
13503 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13504 vm_map_unlock(cow_sub_map_parent);
13505 }
13506 if ((*real_map != map)
13507 && (*real_map != cow_sub_map_parent)) {
13508 vm_map_unlock(*real_map);
13509 }
13510 *real_map = map;
13511 vm_object_deallocate(
13512 copy_object);
13513 copy_object = VM_OBJECT_NULL;
13514 vm_map_lock_write_to_read(map);
13515 DTRACE_VM4(submap_lookup_post_unlock,
13516 uint64_t, (uint64_t)entry->vme_start,
13517 uint64_t, (uint64_t)entry->vme_end,
13518 vm_map_offset_t, vaddr,
13519 int, object_copied);
13520 return KERN_INVALID_ADDRESS;
13521 }
13522
13523 /* clip out the portion of space */
13524 /* mapped by the sub map which */
13525 /* corresponds to the underlying */
13526 /* object */
13527
13528 /*
13529 * Clip (and unnest) the smallest nested chunk
13530 * possible around the faulting address...
13531 */
13532 local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
13533 local_end = local_start + pmap_shared_region_size_min(map->pmap);
13534 /*
13535 * ... but don't go beyond the "old_start" to "old_end"
13536 * range, to avoid spanning over another VM region
13537 * with a possibly different VM object and/or offset.
13538 */
13539 if (local_start < old_start) {
13540 local_start = old_start;
13541 }
13542 if (local_end > old_end) {
13543 local_end = old_end;
13544 }
13545 /*
13546 * Adjust copy_offset to the start of the range.
13547 */
13548 copy_offset -= (vaddr - local_start);
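		/*
		 * Example (hypothetical numbers): with a 32MB nesting
		 * granularity, vaddr = 0x101234000 rounds down to
		 * local_start = 0x100000000 and local_end = 0x102000000,
		 * both then clamped to [old_start, old_end); copy_offset is
		 * moved back by (vaddr - local_start) so that it describes
		 * local_start rather than the faulting address.
		 */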
13549
13550 vm_map_clip_start(map, entry, local_start);
13551 vm_map_clip_end(map, entry, local_end);
13552 if (entry->is_sub_map) {
13553 /* unnesting was done when clipping */
13554 assert(!entry->use_pmap);
13555 }
13556
13557 /* substitute copy object for */
13558 /* shared map entry */
13559 vm_map_deallocate(VME_SUBMAP(entry));
13560 assert(!entry->iokit_acct);
13561 entry->use_pmap = TRUE;
13562 VME_OBJECT_SET(entry, copy_object, false, 0);
13563
13564 /* propagate the submap entry's protections */
13565 if (entry->protection != VM_PROT_READ) {
13566 /*
13567 * Someone has already altered the top entry's
13568 * protections via vm_protect(VM_PROT_COPY).
13569 * Respect these new values and ignore the
13570 * submap entry's protections.
13571 */
13572 } else {
13573 /*
13574 * Regular copy-on-write: propagate the submap
13575 * entry's protections to the top map entry.
13576 */
13577 entry->protection |= subentry_protection;
13578 }
13579 entry->max_protection |= subentry_max_protection;
13580 /* propagate no_copy_on_read */
13581 entry->vme_no_copy_on_read = subentry_no_copy_on_read;
13582
13583 if ((entry->protection & VM_PROT_WRITE) &&
13584 (entry->protection & VM_PROT_EXECUTE) &&
13585 #if XNU_TARGET_OS_OSX
13586 map->pmap != kernel_pmap &&
13587 (vm_map_cs_enforcement(map)
13588 #if __arm64__
13589 || !VM_MAP_IS_EXOTIC(map)
13590 #endif /* __arm64__ */
13591 ) &&
13592 #endif /* XNU_TARGET_OS_OSX */
13593 !(entry->used_for_jit) &&
13594 VM_MAP_POLICY_WX_STRIP_X(map)) {
13595 DTRACE_VM3(cs_wx,
13596 uint64_t, (uint64_t)entry->vme_start,
13597 uint64_t, (uint64_t)entry->vme_end,
13598 vm_prot_t, entry->protection);
13599 printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
13600 proc_selfpid(),
13601 (current_task()->bsd_info
13602 ? proc_name_address(current_task()->bsd_info)
13603 : "?"),
13604 __FUNCTION__);
13605 entry->protection &= ~VM_PROT_EXECUTE;
13606 }
13607
13608 if (object_copied) {
13609 VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
13610 entry->needs_copy = object_copied_needs_copy;
13611 entry->is_shared = FALSE;
13612 } else {
13613 assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
13614 assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
13615 assert(entry->wired_count == 0);
13616 VME_OFFSET_SET(entry, copy_offset);
13617 entry->needs_copy = TRUE;
13618 if (map != old_map) {
13619 entry->is_shared = TRUE;
13620 }
13621 }
13622 if (entry->inheritance == VM_INHERIT_SHARE) {
13623 entry->inheritance = VM_INHERIT_COPY;
13624 }
13625
13626 vm_map_lock_write_to_read(map);
13627 } else {
13628 if ((cow_sub_map_parent)
13629 && (cow_sub_map_parent != *real_map)
13630 && (cow_sub_map_parent != map)) {
13631 vm_map_unlock(cow_sub_map_parent);
13632 }
13633 entry = submap_entry;
13634 vaddr = local_vaddr;
13635 }
13636 }
13637
13638 /*
13639 * Check whether this task is allowed to have
13640 * this page.
13641 */
13642
13643 prot = entry->protection;
13644
13645 if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
13646 /*
13647 * HACK -- if not a stack, then allow execution
13648 */
13649 prot |= VM_PROT_EXECUTE;
13650 }
13651
13652 if (mask_protections) {
13653 fault_type &= prot;
13654 if (fault_type == VM_PROT_NONE) {
13655 goto protection_failure;
13656 }
13657 }
13658 if (((fault_type & prot) != fault_type)
13659 #if __arm64__
13660 /* prefetch abort in execute-only page */
13661 && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
13662 #elif defined(__x86_64__)
13663 /* Consider the UEXEC bit when handling an EXECUTE fault */
13664 && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
13665 #endif
13666 ) {
13667 protection_failure:
13668 if (*real_map != map) {
13669 vm_map_unlock(*real_map);
13670 }
13671 *real_map = map;
13672
13673 if ((fault_type & VM_PROT_EXECUTE) && prot) {
13674 log_stack_execution_failure((addr64_t)vaddr, prot);
13675 }
13676
13677 DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
13678 DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
13679 /*
13680 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
13681 *
13682 * kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
13683 */
13684 return KERN_PROTECTION_FAILURE;
13685 }
13686
13687 /*
13688 * If this page is not pageable, we have to get
13689 * it for all possible accesses.
13690 */
13691
13692 *wired = (entry->wired_count != 0);
13693 if (*wired) {
13694 fault_type = prot;
13695 }
13696
13697 /*
13698 * If the entry was copy-on-write, we either ...
13699 */
13700
13701 if (entry->needs_copy) {
13702 /*
13703 * If we want to write the page, we may as well
13704 * handle that now since we've got the map locked.
13705 *
13706 * If we don't need to write the page, we just
13707 * demote the permissions allowed.
13708 */
13709
13710 if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
13711 /*
13712 * Make a new object, and place it in the
13713 * object chain. Note that no new references
13714 * have appeared -- one just moved from the
13715 * map to the new object.
13716 */
13717
13718 if (vm_map_lock_read_to_write(map)) {
13719 vm_map_lock_read(map);
13720 goto RetryLookup;
13721 }
13722
13723 if (VME_OBJECT(entry)->shadowed == FALSE) {
13724 vm_object_lock(VME_OBJECT(entry));
13725 VME_OBJECT(entry)->shadowed = TRUE;
13726 vm_object_unlock(VME_OBJECT(entry));
13727 }
13728 VME_OBJECT_SHADOW(entry,
13729 (vm_map_size_t) (entry->vme_end -
13730 entry->vme_start));
13731 entry->needs_copy = FALSE;
13732
13733 vm_map_lock_write_to_read(map);
13734 }
13735 if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
13736 /*
13737 * We're attempting to read a copy-on-write
13738 * page -- don't allow writes.
13739 */
13740
13741 prot &= (~VM_PROT_WRITE);
13742 }
13743 }
13744
13745 if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
13746 /*
13747 * We went through a "needs_copy" submap without triggering
13748 * a copy, so granting write access to the page would bypass
13749 * that submap's "needs_copy".
13750 */
13751 assert(!(fault_type & VM_PROT_WRITE));
13752 assert(!*wired);
13753 assert(!force_copy);
13754 // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
13755 prot &= ~VM_PROT_WRITE;
13756 }
13757
13758 /*
13759 * Create an object if necessary.
13760 */
13761 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
13762 if (vm_map_lock_read_to_write(map)) {
13763 vm_map_lock_read(map);
13764 goto RetryLookup;
13765 }
13766
13767 VME_OBJECT_SET(entry,
13768 vm_object_allocate(
13769 (vm_map_size_t)(entry->vme_end -
13770 entry->vme_start)), false, 0);
13771 VME_OFFSET_SET(entry, 0);
13772 assert(entry->use_pmap);
13773 vm_map_lock_write_to_read(map);
13774 }
13775
13776 /*
13777 * Return the object/offset from this entry. If the entry
13778 * was copy-on-write or empty, it has been fixed up. Also
13779 * return the protection.
13780 */
13781
13782 *offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
13783 *object = VME_OBJECT(entry);
13784 *out_prot = prot;
13785 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
13786
13787 if (fault_info) {
13788 fault_info->interruptible = THREAD_UNINT; /* for now... */
13789 /* ... the caller will change "interruptible" if needed */
13790 fault_info->cluster_size = 0;
13791 fault_info->user_tag = VME_ALIAS(entry);
13792 fault_info->pmap_options = 0;
13793 if (entry->iokit_acct ||
13794 (!entry->is_sub_map && !entry->use_pmap)) {
13795 fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
13796 }
13797 fault_info->behavior = entry->behavior;
13798 fault_info->lo_offset = VME_OFFSET(entry);
13799 fault_info->hi_offset =
13800 (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
13801 fault_info->no_cache = entry->no_cache;
13802 fault_info->stealth = FALSE;
13803 fault_info->io_sync = FALSE;
13804 if (entry->used_for_jit ||
13805 entry->vme_resilient_codesign) {
13806 fault_info->cs_bypass = TRUE;
13807 } else {
13808 fault_info->cs_bypass = FALSE;
13809 }
13810 fault_info->pmap_cs_associated = FALSE;
13811 #if CONFIG_PMAP_CS
13812 if (entry->pmap_cs_associated) {
13813 /*
13814 * The pmap layer will validate this page
13815 * before allowing it to be executed from.
13816 */
13817 fault_info->pmap_cs_associated = TRUE;
13818 }
13819 #endif /* CONFIG_PMAP_CS */
13820 fault_info->mark_zf_absent = FALSE;
13821 fault_info->batch_pmap_op = FALSE;
13822 fault_info->resilient_media = entry->vme_resilient_media;
13823 fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
13824 if (entry->translated_allow_execute) {
13825 fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
13826 }
13827 }
13828
13829 /*
13830 * Lock the object to prevent it from disappearing
13831 */
13832 if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
13833 if (contended == NULL) {
13834 vm_object_lock(*object);
13835 } else {
13836 *contended = vm_object_lock_check_contended(*object);
13837 }
13838 } else {
13839 vm_object_lock_shared(*object);
13840 }
13841
13842 /*
13843 * Save the version number
13844 */
13845
13846 out_version->main_timestamp = map->timestamp;
13847
13848 return KERN_SUCCESS;
13849 }
13850
13851
13852 /*
13853 * vm_map_verify:
13854 *
13855 * Verifies that the map in question has not changed
13856 * since the given version. The map has to be locked
13857 * ("shared" mode is fine) before calling this function
13858 * and it will be returned locked too.
13859 */
13860 boolean_t
13861 vm_map_verify(
13862 vm_map_t map,
13863 vm_map_version_t *version) /* REF */
13864 {
13865 boolean_t result;
13866
13867 vm_map_lock_assert_held(map);
13868 result = (map->timestamp == version->main_timestamp);
13869
13870 return result;
13871 }
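/*
 * Typical usage sketch (illustrative only; real callers pass the full
 * vm_map_lookup_locked() argument list, elided here as "..."):
 *
 *	vm_map_lock_read(map);
 *	kr = vm_map_lookup_locked(&map, vaddr, ..., &version, &object, ...);
 *	vm_map_unlock_read(map);
 *	...	resolve the fault against "object" ...
 *	vm_map_lock_read(map);
 *	if (!vm_map_verify(map, &version)) {
 *		...	the map changed while unlocked: redo the lookup ...
 *	}
 */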
13872
13873 /*
13874 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
13875 * Goes away after regular vm_region_recurse function migrates to
13876 * 64 bits
13877 * vm_region_recurse: A form of vm_region which follows the
13878 * submaps in a target map
13879 *
13880 */
13881
13882 kern_return_t
13883 vm_map_region_recurse_64(
13884 vm_map_t map,
13885 vm_map_offset_t *address, /* IN/OUT */
13886 vm_map_size_t *size, /* OUT */
13887 natural_t *nesting_depth, /* IN/OUT */
13888 vm_region_submap_info_64_t submap_info, /* IN/OUT */
13889 mach_msg_type_number_t *count) /* IN/OUT */
13890 {
13891 mach_msg_type_number_t original_count;
13892 vm_region_extended_info_data_t extended;
13893 vm_map_entry_t tmp_entry;
13894 vm_map_offset_t user_address;
13895 unsigned int user_max_depth;
13896
13897 /*
13898 * "curr_entry" is the VM map entry preceding or including the
13899 * address we're looking for.
13900 * "curr_map" is the map or sub-map containing "curr_entry".
13901 * "curr_address" is the equivalent of the top map's "user_address"
13902 * in the current map.
13903 * "curr_offset" is the cumulated offset of "curr_map" in the
13904 * target task's address space.
13905 * "curr_depth" is the depth of "curr_map" in the chain of
13906 * sub-maps.
13907 *
13908 * "curr_max_below" and "curr_max_above" limit the range (around
13909 * "curr_address") we should take into account in the current (sub)map.
13910 * They limit the range to what's visible through the map entries
13911 * we've traversed from the top map to the current map.
13912 *
13913 */
13914 vm_map_entry_t curr_entry;
13915 vm_map_address_t curr_address;
13916 vm_map_offset_t curr_offset;
13917 vm_map_t curr_map;
13918 unsigned int curr_depth;
13919 vm_map_offset_t curr_max_below, curr_max_above;
13920 vm_map_offset_t curr_skip;
13921
13922 /*
13923 * "next_" is the same as "curr_" but for the VM region immediately
13924 * after the address we're looking for. We need to keep track of this
13925 * too because we want to return info about that region if the
13926 * address we're looking for is not mapped.
13927 */
13928 vm_map_entry_t next_entry;
13929 vm_map_offset_t next_offset;
13930 vm_map_offset_t next_address;
13931 vm_map_t next_map;
13932 unsigned int next_depth;
13933 vm_map_offset_t next_max_below, next_max_above;
13934 vm_map_offset_t next_skip;
13935
13936 boolean_t look_for_pages;
13937 vm_region_submap_short_info_64_t short_info;
13938 boolean_t do_region_footprint;
13939 int effective_page_size, effective_page_shift;
13940 boolean_t submap_needed_copy;
13941
13942 if (map == VM_MAP_NULL) {
13943 /* no address space to work on */
13944 return KERN_INVALID_ARGUMENT;
13945 }
13946
13947 effective_page_shift = vm_self_region_page_shift(map);
13948 effective_page_size = (1 << effective_page_shift);
13949
13950 if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
13951 /*
13952 * "info" structure is not big enough and
13953 * would overflow
13954 */
13955 return KERN_INVALID_ARGUMENT;
13956 }
13957
13958 do_region_footprint = task_self_region_footprint();
13959 original_count = *count;
13960
13961 if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
13962 *count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
13963 look_for_pages = FALSE;
13964 short_info = (vm_region_submap_short_info_64_t) submap_info;
13965 submap_info = NULL;
13966 } else {
13967 look_for_pages = TRUE;
13968 *count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
13969 short_info = NULL;
13970
13971 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
13972 *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
13973 }
13974 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
13975 *count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
13976 }
13977 }
13978
13979 user_address = *address;
13980 user_max_depth = *nesting_depth;
13981 submap_needed_copy = FALSE;
13982
13983 if (not_in_kdp) {
13984 vm_map_lock_read(map);
13985 }
13986
13987 recurse_again:
13988 curr_entry = NULL;
13989 curr_map = map;
13990 curr_address = user_address;
13991 curr_offset = 0;
13992 curr_skip = 0;
13993 curr_depth = 0;
13994 curr_max_above = ((vm_map_offset_t) -1) - curr_address;
13995 curr_max_below = curr_address;
13996
13997 next_entry = NULL;
13998 next_map = NULL;
13999 next_address = 0;
14000 next_offset = 0;
14001 next_skip = 0;
14002 next_depth = 0;
14003 next_max_above = (vm_map_offset_t) -1;
14004 next_max_below = (vm_map_offset_t) -1;
14005
14006 for (;;) {
14007 if (vm_map_lookup_entry(curr_map,
14008 curr_address,
14009 &tmp_entry)) {
14010 /* tmp_entry contains the address we're looking for */
14011 curr_entry = tmp_entry;
14012 } else {
14013 vm_map_offset_t skip;
14014 /*
14015 * The address is not mapped. "tmp_entry" is the
14016 * map entry preceding the address. We want the next
14017 * one, if it exists.
14018 */
14019 curr_entry = tmp_entry->vme_next;
14020
14021 if (curr_entry == vm_map_to_entry(curr_map) ||
14022 (curr_entry->vme_start >=
14023 curr_address + curr_max_above)) {
14024 /* no next entry at this level: stop looking */
14025 if (not_in_kdp) {
14026 vm_map_unlock_read(curr_map);
14027 }
14028 curr_entry = NULL;
14029 curr_map = NULL;
14030 curr_skip = 0;
14031 curr_offset = 0;
14032 curr_depth = 0;
14033 curr_max_above = 0;
14034 curr_max_below = 0;
14035 break;
14036 }
14037
14038 /* adjust current address and offset */
14039 skip = curr_entry->vme_start - curr_address;
14040 curr_address = curr_entry->vme_start;
14041 curr_skip += skip;
14042 curr_offset += skip;
14043 curr_max_above -= skip;
14044 curr_max_below = 0;
14045 }
14046
14047 /*
14048 * Is the next entry at this level closer to the address (or
14049 * deeper in the submap chain) than the one we had
14050 * so far ?
14051 */
14052 tmp_entry = curr_entry->vme_next;
14053 if (tmp_entry == vm_map_to_entry(curr_map)) {
14054 /* no next entry at this level */
14055 } else if (tmp_entry->vme_start >=
14056 curr_address + curr_max_above) {
14057 /*
14058 * tmp_entry is beyond the scope of what we mapped of
14059 * this submap in the upper level: ignore it.
14060 */
14061 } else if ((next_entry == NULL) ||
14062 (tmp_entry->vme_start + curr_offset <=
14063 next_entry->vme_start + next_offset)) {
14064 /*
14065 * We didn't have a "next_entry" or this one is
14066 * closer to the address we're looking for:
14067 * use this "tmp_entry" as the new "next_entry".
14068 */
14069 if (next_entry != NULL) {
14070 /* unlock the last "next_map" */
14071 if (next_map != curr_map && not_in_kdp) {
14072 vm_map_unlock_read(next_map);
14073 }
14074 }
14075 next_entry = tmp_entry;
14076 next_map = curr_map;
14077 next_depth = curr_depth;
14078 next_address = next_entry->vme_start;
14079 next_skip = curr_skip;
14080 next_skip += (next_address - curr_address);
14081 next_offset = curr_offset;
14082 next_offset += (next_address - curr_address);
14083 next_max_above = MIN(next_max_above, curr_max_above);
14084 next_max_above = MIN(next_max_above,
14085 next_entry->vme_end - next_address);
14086 next_max_below = MIN(next_max_below, curr_max_below);
14087 next_max_below = MIN(next_max_below,
14088 next_address - next_entry->vme_start);
14089 }
14090
14091 /*
14092 * "curr_max_{above,below}" allow us to keep track of the
14093 * portion of the submap that is actually mapped at this level:
14094 * the rest of that submap is irrelevant to us, since it's not
14095 * mapped here.
14096 * The relevant portion of the map starts at
14097 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
14098 */
14099 curr_max_above = MIN(curr_max_above,
14100 curr_entry->vme_end - curr_address);
14101 curr_max_below = MIN(curr_max_below,
14102 curr_address - curr_entry->vme_start);
14103
14104 if (!curr_entry->is_sub_map ||
14105 curr_depth >= user_max_depth) {
14106 /*
14107 * We hit a leaf map or we reached the maximum depth
14108 * we could, so stop looking. Keep the current map
14109 * locked.
14110 */
14111 break;
14112 }
14113
14114 /*
14115 * Get down to the next submap level.
14116 */
14117
14118 if (curr_entry->needs_copy) {
14119 /* everything below this is effectively copy-on-write */
14120 submap_needed_copy = TRUE;
14121 }
14122
14123 /*
14124 * Lock the next level and unlock the current level,
14125 * unless we need to keep it locked to access the "next_entry"
14126 * later.
14127 */
14128 if (not_in_kdp) {
14129 vm_map_lock_read(VME_SUBMAP(curr_entry));
14130 }
14131 if (curr_map == next_map) {
14132 /* keep "next_map" locked in case we need it */
14133 } else {
14134 /* release this map */
14135 if (not_in_kdp) {
14136 vm_map_unlock_read(curr_map);
14137 }
14138 }
14139
14140 /*
14141 * Adjust the offset. "curr_entry" maps the submap
14142 * at relative address "curr_entry->vme_start" in the
14143 * curr_map but skips the first "VME_OFFSET(curr_entry)"
14144 * bytes of the submap.
14145 * "curr_offset" always represents the offset of a virtual
14146 * address in the curr_map relative to the absolute address
14147 * space (i.e. the top-level VM map).
14148 */
14149 curr_offset +=
14150 (VME_OFFSET(curr_entry) - curr_entry->vme_start);
14151 curr_address = user_address + curr_offset;
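		/*
		 * Example (hypothetical numbers): starting from curr_offset
		 * 0, a submap entry at vme_start 0x200000 with VME_OFFSET 0
		 * adds (0 - 0x200000) to curr_offset, so a top-level
		 * user_address of 0x240000 yields curr_address 0x40000
		 * inside the submap.
		 */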
14152 /* switch to the submap */
14153 curr_map = VME_SUBMAP(curr_entry);
14154 curr_depth++;
14155 curr_entry = NULL;
14156 }
14157
14158 // LP64todo: all the current tools are 32bit, obviously never worked for 64b
14159 // so probably should be a real 32b ID vs. ptr.
14160 // Current users just check for equality
14161
14162 if (curr_entry == NULL) {
14163 /* no VM region contains the address... */
14164
14165 if (do_region_footprint && /* we want footprint numbers */
14166 next_entry == NULL && /* & there are no more regions */
14167 /* & we haven't already provided our fake region: */
14168 user_address <= vm_map_last_entry(map)->vme_end) {
14169 ledger_amount_t ledger_resident, ledger_compressed;
14170
14171 /*
14172 * Add a fake memory region to account for
14173 * purgeable and/or ledger-tagged memory that
14174 * counts towards this task's memory footprint,
14175 * i.e. the resident/compressed pages of non-volatile
14176 * objects owned by that task.
14177 */
14178 task_ledgers_footprint(map->pmap->ledger,
14179 &ledger_resident,
14180 &ledger_compressed);
14181 if (ledger_resident + ledger_compressed == 0) {
14182 /* no purgeable memory usage to report */
14183 return KERN_INVALID_ADDRESS;
14184 }
14185 /* fake region to show nonvolatile footprint */
14186 if (look_for_pages) {
14187 submap_info->protection = VM_PROT_DEFAULT;
14188 submap_info->max_protection = VM_PROT_DEFAULT;
14189 submap_info->inheritance = VM_INHERIT_DEFAULT;
14190 submap_info->offset = 0;
14191 submap_info->user_tag = -1;
14192 submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
14193 submap_info->pages_shared_now_private = 0;
14194 submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
14195 submap_info->pages_dirtied = submap_info->pages_resident;
14196 submap_info->ref_count = 1;
14197 submap_info->shadow_depth = 0;
14198 submap_info->external_pager = 0;
14199 submap_info->share_mode = SM_PRIVATE;
14200 if (submap_needed_copy) {
14201 submap_info->share_mode = SM_COW;
14202 }
14203 submap_info->is_submap = 0;
14204 submap_info->behavior = VM_BEHAVIOR_DEFAULT;
14205 submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
14206 submap_info->user_wired_count = 0;
14207 submap_info->pages_reusable = 0;
14208 } else {
14209 short_info->user_tag = -1;
14210 short_info->offset = 0;
14211 short_info->protection = VM_PROT_DEFAULT;
14212 short_info->inheritance = VM_INHERIT_DEFAULT;
14213 short_info->max_protection = VM_PROT_DEFAULT;
14214 short_info->behavior = VM_BEHAVIOR_DEFAULT;
14215 short_info->user_wired_count = 0;
14216 short_info->is_submap = 0;
14217 short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
14218 short_info->external_pager = 0;
14219 short_info->shadow_depth = 0;
14220 short_info->share_mode = SM_PRIVATE;
14221 if (submap_needed_copy) {
14222 short_info->share_mode = SM_COW;
14223 }
14224 short_info->ref_count = 1;
14225 }
14226 *nesting_depth = 0;
14227 *size = (vm_map_size_t) (ledger_resident + ledger_compressed);
14228 // *address = user_address;
14229 *address = vm_map_last_entry(map)->vme_end;
14230 return KERN_SUCCESS;
14231 }
14232
14233 if (next_entry == NULL) {
14234 /* ... and no VM region follows it either */
14235 return KERN_INVALID_ADDRESS;
14236 }
14237 /* ... gather info about the next VM region */
14238 curr_entry = next_entry;
14239 curr_map = next_map; /* still locked ... */
14240 curr_address = next_address;
14241 curr_skip = next_skip;
14242 curr_offset = next_offset;
14243 curr_depth = next_depth;
14244 curr_max_above = next_max_above;
14245 curr_max_below = next_max_below;
14246 } else {
14247 /* we won't need "next_entry" after all */
14248 if (next_entry != NULL) {
14249 /* release "next_map" */
14250 if (next_map != curr_map && not_in_kdp) {
14251 vm_map_unlock_read(next_map);
14252 }
14253 }
14254 }
14255 next_entry = NULL;
14256 next_map = NULL;
14257 next_offset = 0;
14258 next_skip = 0;
14259 next_depth = 0;
14260 next_max_below = -1;
14261 next_max_above = -1;
14262
14263 if (curr_entry->is_sub_map &&
14264 curr_depth < user_max_depth) {
14265 /*
14266 * We're not as deep as we could be: we must have
14267 * gone back up after not finding anything mapped
14268 		 * below the original top-level map entry.
14269 * Let's move "curr_address" forward and recurse again.
14270 */
14271 user_address = curr_address;
14272 goto recurse_again;
14273 }
14274
14275 *nesting_depth = curr_depth;
14276 *size = curr_max_above + curr_max_below;
14277 *address = user_address + curr_skip - curr_max_below;
14278
14279 if (look_for_pages) {
14280 submap_info->user_tag = VME_ALIAS(curr_entry);
14281 submap_info->offset = VME_OFFSET(curr_entry);
14282 submap_info->protection = curr_entry->protection;
14283 submap_info->inheritance = curr_entry->inheritance;
14284 submap_info->max_protection = curr_entry->max_protection;
14285 submap_info->behavior = curr_entry->behavior;
14286 submap_info->user_wired_count = curr_entry->user_wired_count;
14287 submap_info->is_submap = curr_entry->is_sub_map;
14288 if (curr_entry->is_sub_map) {
14289 submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
14290 } else {
14291 submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
14292 }
14293 } else {
14294 short_info->user_tag = VME_ALIAS(curr_entry);
14295 short_info->offset = VME_OFFSET(curr_entry);
14296 short_info->protection = curr_entry->protection;
14297 short_info->inheritance = curr_entry->inheritance;
14298 short_info->max_protection = curr_entry->max_protection;
14299 short_info->behavior = curr_entry->behavior;
14300 short_info->user_wired_count = curr_entry->user_wired_count;
14301 short_info->is_submap = curr_entry->is_sub_map;
14302 if (curr_entry->is_sub_map) {
14303 short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
14304 } else {
14305 short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
14306 }
14307 }
14308
14309 extended.pages_resident = 0;
14310 extended.pages_swapped_out = 0;
14311 extended.pages_shared_now_private = 0;
14312 extended.pages_dirtied = 0;
14313 extended.pages_reusable = 0;
14314 extended.external_pager = 0;
14315 extended.shadow_depth = 0;
14316 extended.share_mode = SM_EMPTY;
14317 extended.ref_count = 0;
14318
14319 if (not_in_kdp) {
14320 if (!curr_entry->is_sub_map) {
14321 vm_map_offset_t range_start, range_end;
14322 range_start = MAX((curr_address - curr_max_below),
14323 curr_entry->vme_start);
14324 range_end = MIN((curr_address + curr_max_above),
14325 curr_entry->vme_end);
14326 vm_map_region_walk(curr_map,
14327 range_start,
14328 curr_entry,
14329 (VME_OFFSET(curr_entry) +
14330 (range_start -
14331 curr_entry->vme_start)),
14332 range_end - range_start,
14333 &extended,
14334 look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
14335 if (extended.external_pager &&
14336 extended.ref_count == 2 &&
14337 extended.share_mode == SM_SHARED) {
14338 extended.share_mode = SM_PRIVATE;
14339 }
14340 if (submap_needed_copy) {
14341 extended.share_mode = SM_COW;
14342 }
14343 } else {
14344 if (curr_entry->use_pmap) {
14345 extended.share_mode = SM_TRUESHARED;
14346 } else {
14347 extended.share_mode = SM_PRIVATE;
14348 }
14349 extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
14350 }
14351 }
14352
14353 if (look_for_pages) {
14354 submap_info->pages_resident = extended.pages_resident;
14355 submap_info->pages_swapped_out = extended.pages_swapped_out;
14356 submap_info->pages_shared_now_private =
14357 extended.pages_shared_now_private;
14358 submap_info->pages_dirtied = extended.pages_dirtied;
14359 submap_info->external_pager = extended.external_pager;
14360 submap_info->shadow_depth = extended.shadow_depth;
14361 submap_info->share_mode = extended.share_mode;
14362 submap_info->ref_count = extended.ref_count;
14363
14364 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14365 submap_info->pages_reusable = extended.pages_reusable;
14366 }
14367 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14368 if (curr_entry->is_sub_map) {
14369 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_SUBMAP(curr_entry));
14370 } else if (VME_OBJECT(curr_entry)) {
14371 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry));
14372 } else {
14373 submap_info->object_id_full = 0ull;
14374 }
14375 }
14376 } else {
14377 short_info->external_pager = extended.external_pager;
14378 short_info->shadow_depth = extended.shadow_depth;
14379 short_info->share_mode = extended.share_mode;
14380 short_info->ref_count = extended.ref_count;
14381 }
14382
14383 if (not_in_kdp) {
14384 vm_map_unlock_read(curr_map);
14385 }
14386
14387 return KERN_SUCCESS;
14388 }
14389
14390 /*
14391 * vm_region:
14392 *
14393 * User call to obtain information about a region in
14394 * a task's address map. Currently, only one flavor is
14395 * supported.
14396 *
14397 * XXX The reserved and behavior fields cannot be filled
14398 * in until the vm merge from the IK is completed, and
14399 * vm_reserve is implemented.
14400 */
14401
14402 kern_return_t
14403 vm_map_region(
14404 vm_map_t map,
14405 vm_map_offset_t *address, /* IN/OUT */
14406 vm_map_size_t *size, /* OUT */
14407 vm_region_flavor_t flavor, /* IN */
14408 vm_region_info_t info, /* OUT */
14409 mach_msg_type_number_t *count, /* IN/OUT */
14410 mach_port_t *object_name) /* OUT */
14411 {
14412 vm_map_entry_t tmp_entry;
14413 vm_map_entry_t entry;
14414 vm_map_offset_t start;
14415
14416 if (map == VM_MAP_NULL) {
14417 return KERN_INVALID_ARGUMENT;
14418 }
14419
14420 switch (flavor) {
14421 case VM_REGION_BASIC_INFO:
14422 /* legacy for old 32-bit objects info */
14423 {
14424 vm_region_basic_info_t basic;
14425
14426 if (*count < VM_REGION_BASIC_INFO_COUNT) {
14427 return KERN_INVALID_ARGUMENT;
14428 }
14429
14430 basic = (vm_region_basic_info_t) info;
14431 *count = VM_REGION_BASIC_INFO_COUNT;
14432
14433 vm_map_lock_read(map);
14434
14435 start = *address;
14436 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14437 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14438 vm_map_unlock_read(map);
14439 return KERN_INVALID_ADDRESS;
14440 }
14441 } else {
14442 entry = tmp_entry;
14443 }
14444
14445 start = entry->vme_start;
14446
14447 basic->offset = (uint32_t)VME_OFFSET(entry);
14448 basic->protection = entry->protection;
14449 basic->inheritance = entry->inheritance;
14450 basic->max_protection = entry->max_protection;
14451 basic->behavior = entry->behavior;
14452 basic->user_wired_count = entry->user_wired_count;
14453 basic->reserved = entry->is_sub_map;
14454 *address = start;
14455 *size = (entry->vme_end - start);
14456
14457 if (object_name) {
14458 *object_name = IP_NULL;
14459 }
14460 if (entry->is_sub_map) {
14461 basic->shared = FALSE;
14462 } else {
14463 basic->shared = entry->is_shared;
14464 }
14465
14466 vm_map_unlock_read(map);
14467 return KERN_SUCCESS;
14468 }
14469
14470 case VM_REGION_BASIC_INFO_64:
14471 {
14472 vm_region_basic_info_64_t basic;
14473
14474 if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
14475 return KERN_INVALID_ARGUMENT;
14476 }
14477
14478 basic = (vm_region_basic_info_64_t) info;
14479 *count = VM_REGION_BASIC_INFO_COUNT_64;
14480
14481 vm_map_lock_read(map);
14482
14483 start = *address;
14484 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14485 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14486 vm_map_unlock_read(map);
14487 return KERN_INVALID_ADDRESS;
14488 }
14489 } else {
14490 entry = tmp_entry;
14491 }
14492
14493 start = entry->vme_start;
14494
14495 basic->offset = VME_OFFSET(entry);
14496 basic->protection = entry->protection;
14497 basic->inheritance = entry->inheritance;
14498 basic->max_protection = entry->max_protection;
14499 basic->behavior = entry->behavior;
14500 basic->user_wired_count = entry->user_wired_count;
14501 basic->reserved = entry->is_sub_map;
14502 *address = start;
14503 *size = (entry->vme_end - start);
14504
14505 if (object_name) {
14506 *object_name = IP_NULL;
14507 }
14508 if (entry->is_sub_map) {
14509 basic->shared = FALSE;
14510 } else {
14511 basic->shared = entry->is_shared;
14512 }
14513
14514 vm_map_unlock_read(map);
14515 return KERN_SUCCESS;
14516 }
14517 case VM_REGION_EXTENDED_INFO:
14518 if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
14519 return KERN_INVALID_ARGUMENT;
14520 }
14521 OS_FALLTHROUGH;
14522 case VM_REGION_EXTENDED_INFO__legacy:
14523 if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
14524 return KERN_INVALID_ARGUMENT;
14525 }
14526
14527 {
14528 vm_region_extended_info_t extended;
14529 mach_msg_type_number_t original_count;
14530 int effective_page_size, effective_page_shift;
14531
14532 extended = (vm_region_extended_info_t) info;
14533
14534 effective_page_shift = vm_self_region_page_shift(map);
14535 effective_page_size = (1 << effective_page_shift);
14536
14537 vm_map_lock_read(map);
14538
14539 start = *address;
14540 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14541 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14542 vm_map_unlock_read(map);
14543 return KERN_INVALID_ADDRESS;
14544 }
14545 } else {
14546 entry = tmp_entry;
14547 }
14548 start = entry->vme_start;
14549
14550 extended->protection = entry->protection;
14551 extended->user_tag = VME_ALIAS(entry);
14552 extended->pages_resident = 0;
14553 extended->pages_swapped_out = 0;
14554 extended->pages_shared_now_private = 0;
14555 extended->pages_dirtied = 0;
14556 extended->external_pager = 0;
14557 extended->shadow_depth = 0;
14558
14559 original_count = *count;
14560 if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
14561 *count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
14562 } else {
14563 extended->pages_reusable = 0;
14564 *count = VM_REGION_EXTENDED_INFO_COUNT;
14565 }
14566
14567 vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
14568
14569 if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
14570 extended->share_mode = SM_PRIVATE;
14571 }
14572
14573 if (object_name) {
14574 *object_name = IP_NULL;
14575 }
14576 *address = start;
14577 *size = (entry->vme_end - start);
14578
14579 vm_map_unlock_read(map);
14580 return KERN_SUCCESS;
14581 }
14582 case VM_REGION_TOP_INFO:
14583 {
14584 vm_region_top_info_t top;
14585
14586 if (*count < VM_REGION_TOP_INFO_COUNT) {
14587 return KERN_INVALID_ARGUMENT;
14588 }
14589
14590 top = (vm_region_top_info_t) info;
14591 *count = VM_REGION_TOP_INFO_COUNT;
14592
14593 vm_map_lock_read(map);
14594
14595 start = *address;
14596 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
14597 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
14598 vm_map_unlock_read(map);
14599 return KERN_INVALID_ADDRESS;
14600 }
14601 } else {
14602 entry = tmp_entry;
14603 }
14604 start = entry->vme_start;
14605
14606 top->private_pages_resident = 0;
14607 top->shared_pages_resident = 0;
14608
14609 vm_map_region_top_walk(entry, top);
14610
14611 if (object_name) {
14612 *object_name = IP_NULL;
14613 }
14614 *address = start;
14615 *size = (entry->vme_end - start);
14616
14617 vm_map_unlock_read(map);
14618 return KERN_SUCCESS;
14619 }
14620 default:
14621 return KERN_INVALID_ARGUMENT;
14622 }
14623 }
14624
14625 #define OBJ_RESIDENT_COUNT(obj, entry_size) \
14626 MIN((entry_size), \
14627 ((obj)->all_reusable ? \
14628 (obj)->wired_page_count : \
14629 (obj)->resident_page_count - (obj)->reusable_page_count))
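/*
 * Note: when the object is marked "all_reusable", reusable pages are not
 * tracked individually in reusable_page_count, so only the wired pages are
 * (presumably) counted as non-reusable residents; otherwise the explicitly
 * reusable pages are subtracted from the resident count.  Either way the
 * result is clamped to the size of the mapping, in pages.
 */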
14630
14631 void
14632 vm_map_region_top_walk(
14633 vm_map_entry_t entry,
14634 vm_region_top_info_t top)
14635 {
14636 if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
14637 top->share_mode = SM_EMPTY;
14638 top->ref_count = 0;
14639 top->obj_id = 0;
14640 return;
14641 }
14642
14643 {
14644 struct vm_object *obj, *tmp_obj;
14645 int ref_count;
14646 uint32_t entry_size;
14647
14648 entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
14649
14650 obj = VME_OBJECT(entry);
14651
14652 vm_object_lock(obj);
14653
14654 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
14655 ref_count--;
14656 }
14657
14658 assert(obj->reusable_page_count <= obj->resident_page_count);
14659 if (obj->shadow) {
14660 if (ref_count == 1) {
14661 top->private_pages_resident =
14662 OBJ_RESIDENT_COUNT(obj, entry_size);
14663 } else {
14664 top->shared_pages_resident =
14665 OBJ_RESIDENT_COUNT(obj, entry_size);
14666 }
14667 top->ref_count = ref_count;
14668 top->share_mode = SM_COW;
14669
14670 while ((tmp_obj = obj->shadow)) {
14671 vm_object_lock(tmp_obj);
14672 vm_object_unlock(obj);
14673 obj = tmp_obj;
14674
14675 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
14676 ref_count--;
14677 }
14678
14679 assert(obj->reusable_page_count <= obj->resident_page_count);
14680 top->shared_pages_resident +=
14681 OBJ_RESIDENT_COUNT(obj, entry_size);
14682 top->ref_count += ref_count - 1;
14683 }
14684 } else {
14685 if (entry->superpage_size) {
14686 top->share_mode = SM_LARGE_PAGE;
14687 top->shared_pages_resident = 0;
14688 top->private_pages_resident = entry_size;
14689 } else if (entry->needs_copy) {
14690 top->share_mode = SM_COW;
14691 top->shared_pages_resident =
14692 OBJ_RESIDENT_COUNT(obj, entry_size);
14693 } else {
14694 if (ref_count == 1 ||
14695 (ref_count == 2 && obj->named)) {
14696 top->share_mode = SM_PRIVATE;
14697 top->private_pages_resident =
14698 OBJ_RESIDENT_COUNT(obj,
14699 entry_size);
14700 } else {
14701 top->share_mode = SM_SHARED;
14702 top->shared_pages_resident =
14703 OBJ_RESIDENT_COUNT(obj,
14704 entry_size);
14705 }
14706 }
14707 top->ref_count = ref_count;
14708 }
14709 /* XXX K64: obj_id will be truncated */
14710 top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj);
14711
14712 vm_object_unlock(obj);
14713 }
14714 }
14715
14716 void
14717 vm_map_region_walk(
14718 vm_map_t map,
14719 vm_map_offset_t va,
14720 vm_map_entry_t entry,
14721 vm_object_offset_t offset,
14722 vm_object_size_t range,
14723 vm_region_extended_info_t extended,
14724 boolean_t look_for_pages,
14725 mach_msg_type_number_t count)
14726 {
14727 struct vm_object *obj, *tmp_obj;
14728 vm_map_offset_t last_offset;
14729 int i;
14730 int ref_count;
14731 struct vm_object *shadow_object;
14732 unsigned short shadow_depth;
14733 boolean_t do_region_footprint;
14734 int effective_page_size, effective_page_shift;
14735 vm_map_offset_t effective_page_mask;
14736
14737 do_region_footprint = task_self_region_footprint();
14738
14739 if ((entry->is_sub_map) ||
14740 (VME_OBJECT(entry) == 0) ||
14741 (VME_OBJECT(entry)->phys_contiguous &&
14742 !entry->superpage_size)) {
14743 extended->share_mode = SM_EMPTY;
14744 extended->ref_count = 0;
14745 return;
14746 }
14747
14748 if (entry->superpage_size) {
14749 extended->shadow_depth = 0;
14750 extended->share_mode = SM_LARGE_PAGE;
14751 extended->ref_count = 1;
14752 extended->external_pager = 0;
14753
14754 /* TODO4K: Superpage in 4k mode? */
14755 extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
14756 extended->shadow_depth = 0;
14757 return;
14758 }
14759
14760 effective_page_shift = vm_self_region_page_shift(map);
14761 effective_page_size = (1 << effective_page_shift);
14762 effective_page_mask = effective_page_size - 1;
14763
14764 offset = vm_map_trunc_page(offset, effective_page_mask);
14765
14766 obj = VME_OBJECT(entry);
14767
14768 vm_object_lock(obj);
14769
14770 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
14771 ref_count--;
14772 }
14773
14774 if (look_for_pages) {
14775 for (last_offset = offset + range;
14776 offset < last_offset;
14777 offset += effective_page_size, va += effective_page_size) {
14778 if (do_region_footprint) {
14779 int disp;
14780
14781 disp = 0;
14782 if (map->has_corpse_footprint) {
14783 /*
14784 * Query the page info data we saved
14785 * while forking the corpse.
14786 */
14787 vm_map_corpse_footprint_query_page_info(
14788 map,
14789 va,
14790 &disp);
14791 } else {
14792 /*
14793 * Query the pmap.
14794 */
14795 vm_map_footprint_query_page_info(
14796 map,
14797 entry,
14798 va,
14799 &disp);
14800 }
14801 if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
14802 extended->pages_resident++;
14803 }
14804 if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
14805 extended->pages_reusable++;
14806 }
14807 if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
14808 extended->pages_dirtied++;
14809 }
14810 if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
14811 extended->pages_swapped_out++;
14812 }
14813 continue;
14814 }
14815
14816 vm_map_region_look_for_page(map, va, obj,
14817 vm_object_trunc_page(offset), ref_count,
14818 0, extended, count);
14819 }
14820
14821 if (do_region_footprint) {
14822 goto collect_object_info;
14823 }
14824 } else {
14825 collect_object_info:
14826 shadow_object = obj->shadow;
14827 shadow_depth = 0;
14828
14829 if (!(obj->internal)) {
14830 extended->external_pager = 1;
14831 }
14832
14833 if (shadow_object != VM_OBJECT_NULL) {
14834 vm_object_lock(shadow_object);
14835 for (;
14836 shadow_object != VM_OBJECT_NULL;
14837 shadow_depth++) {
14838 vm_object_t next_shadow;
14839
14840 if (!(shadow_object->internal)) {
14841 extended->external_pager = 1;
14842 }
14843
14844 next_shadow = shadow_object->shadow;
14845 if (next_shadow) {
14846 vm_object_lock(next_shadow);
14847 }
14848 vm_object_unlock(shadow_object);
14849 shadow_object = next_shadow;
14850 }
14851 }
14852 extended->shadow_depth = shadow_depth;
14853 }
14854
14855 if (extended->shadow_depth || entry->needs_copy) {
14856 extended->share_mode = SM_COW;
14857 } else {
14858 if (ref_count == 1) {
14859 extended->share_mode = SM_PRIVATE;
14860 } else {
14861 if (obj->true_share) {
14862 extended->share_mode = SM_TRUESHARED;
14863 } else {
14864 extended->share_mode = SM_SHARED;
14865 }
14866 }
14867 }
14868 extended->ref_count = ref_count - extended->shadow_depth;
14869
14870 for (i = 0; i < extended->shadow_depth; i++) {
14871 if ((tmp_obj = obj->shadow) == 0) {
14872 break;
14873 }
14874 vm_object_lock(tmp_obj);
14875 vm_object_unlock(obj);
14876
14877 if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
14878 ref_count--;
14879 }
14880
14881 extended->ref_count += ref_count;
14882 obj = tmp_obj;
14883 }
14884 vm_object_unlock(obj);
14885
14886 if (extended->share_mode == SM_SHARED) {
14887 vm_map_entry_t cur;
14888 vm_map_entry_t last;
14889 int my_refs;
14890
14891 obj = VME_OBJECT(entry);
14892 last = vm_map_to_entry(map);
14893 my_refs = 0;
14894
14895 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
14896 ref_count--;
14897 }
14898 for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
14899 my_refs += vm_map_region_count_obj_refs(cur, obj);
14900 }
14901
14902 if (my_refs == ref_count) {
14903 extended->share_mode = SM_PRIVATE_ALIASED;
14904 } else if (my_refs > 1) {
14905 extended->share_mode = SM_SHARED_ALIASED;
14906 }
14907 }
14908 }
14909
14910
14911 /* object is locked on entry and locked on return */
14912
14913
14914 static void
14915 vm_map_region_look_for_page(
14916 __unused vm_map_t map,
14917 __unused vm_map_offset_t va,
14918 vm_object_t object,
14919 vm_object_offset_t offset,
14920 int max_refcnt,
14921 unsigned short depth,
14922 vm_region_extended_info_t extended,
14923 mach_msg_type_number_t count)
14924 {
14925 vm_page_t p;
14926 vm_object_t shadow;
14927 int ref_count;
14928 vm_object_t caller_object;
14929
14930 shadow = object->shadow;
14931 caller_object = object;
14932
14933
14934 while (TRUE) {
14935 if (!(object->internal)) {
14936 extended->external_pager = 1;
14937 }
14938
14939 if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
14940 if (shadow && (max_refcnt == 1)) {
14941 extended->pages_shared_now_private++;
14942 }
14943
14944 if (!p->vmp_fictitious &&
14945 (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
14946 extended->pages_dirtied++;
14947 } else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
14948 if (p->vmp_reusable || object->all_reusable) {
14949 extended->pages_reusable++;
14950 }
14951 }
14952
14953 extended->pages_resident++;
14954
14955 if (object != caller_object) {
14956 vm_object_unlock(object);
14957 }
14958
14959 return;
14960 }
14961 if (object->internal &&
14962 object->alive &&
14963 !object->terminating &&
14964 object->pager_ready) {
14965 if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
14966 == VM_EXTERNAL_STATE_EXISTS) {
14967 /* the pager has that page */
14968 extended->pages_swapped_out++;
14969 if (object != caller_object) {
14970 vm_object_unlock(object);
14971 }
14972 return;
14973 }
14974 }
14975
14976 if (shadow) {
14977 vm_object_lock(shadow);
14978
14979 if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
14980 ref_count--;
14981 }
14982
14983 if (++depth > extended->shadow_depth) {
14984 extended->shadow_depth = depth;
14985 }
14986
14987 if (ref_count > max_refcnt) {
14988 max_refcnt = ref_count;
14989 }
14990
14991 if (object != caller_object) {
14992 vm_object_unlock(object);
14993 }
14994
14995 offset = offset + object->vo_shadow_offset;
14996 object = shadow;
14997 shadow = object->shadow;
14998 continue;
14999 }
15000 if (object != caller_object) {
15001 vm_object_unlock(object);
15002 }
15003 break;
15004 }
15005 }
15006
15007 static int
15008 vm_map_region_count_obj_refs(
15009 vm_map_entry_t entry,
15010 vm_object_t object)
15011 {
15012 int ref_count;
15013 vm_object_t chk_obj;
15014 vm_object_t tmp_obj;
15015
15016 if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15017 return 0;
15018 }
15019
15020 ref_count = 0;
15021 chk_obj = VME_OBJECT(entry);
15022 vm_object_lock(chk_obj);
15023
15024 while (chk_obj) {
15025 if (chk_obj == object) {
15026 ref_count++;
15027 }
15028 tmp_obj = chk_obj->shadow;
15029 if (tmp_obj) {
15030 vm_object_lock(tmp_obj);
15031 }
15032 vm_object_unlock(chk_obj);
15033
15034 chk_obj = tmp_obj;
15035 }
15036
15037 return ref_count;
15038 }
15039
15040
15041 /*
15042 * Routine: vm_map_simplify
15043 *
15044 * Description:
15045 * Attempt to simplify the map representation in
15046 * the vicinity of the given starting address.
15047 * Note:
15048 * This routine is intended primarily to keep the
15049 * kernel maps more compact -- they generally don't
15050 * benefit from the "expand a map entry" technology
15051 * at allocation time because the adjacent entry
15052 * is often wired down.
15053 */
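/*
 * Minimal sketch of the coalescing test performed below, using a
 * hypothetical stripped-down entry type (only the fields needed to show
 * the adjacency and backing-object-offset arithmetic); the real test in
 * vm_map_simplify_entry() compares many more attributes before merging.
 */
#if 0   /* example only -- never compiled into vm_map.c */
#include <stdbool.h>
#include <stdint.h>

struct demo_entry {                  /* hypothetical, illustration only */
	uint64_t start, end;         /* [start, end) address range */
	uint64_t offset;             /* offset into the backing object */
	int      protection;
};

static bool
demo_can_coalesce(const struct demo_entry *prev, const struct demo_entry *cur)
{
	return prev->end == cur->start &&                      /* adjacent */
	       prev->offset + (prev->end - prev->start)
	       == cur->offset &&                 /* contiguous in the object */
	       prev->protection == cur->protection;    /* same attributes */
}
#endif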
15054 void
15055 vm_map_simplify_entry(
15056 vm_map_t map,
15057 vm_map_entry_t this_entry)
15058 {
15059 vm_map_entry_t prev_entry;
15060
15061 prev_entry = this_entry->vme_prev;
15062
15063 if ((this_entry != vm_map_to_entry(map)) &&
15064 (prev_entry != vm_map_to_entry(map)) &&
15065
15066 (prev_entry->vme_end == this_entry->vme_start) &&
15067
15068 (prev_entry->is_sub_map == this_entry->is_sub_map) &&
15069 (prev_entry->vme_object_value == this_entry->vme_object_value) &&
15070 (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
15071 ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
15072 prev_entry->vme_start))
15073 == VME_OFFSET(this_entry)) &&
15074
15075 (prev_entry->behavior == this_entry->behavior) &&
15076 (prev_entry->needs_copy == this_entry->needs_copy) &&
15077 (prev_entry->protection == this_entry->protection) &&
15078 (prev_entry->max_protection == this_entry->max_protection) &&
15079 (prev_entry->inheritance == this_entry->inheritance) &&
15080 (prev_entry->use_pmap == this_entry->use_pmap) &&
15081 (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
15082 (prev_entry->no_cache == this_entry->no_cache) &&
15083 (prev_entry->permanent == this_entry->permanent) &&
15084 (prev_entry->map_aligned == this_entry->map_aligned) &&
15085 (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
15086 (prev_entry->used_for_jit == this_entry->used_for_jit) &&
15087 (prev_entry->pmap_cs_associated == this_entry->pmap_cs_associated) &&
15088 (prev_entry->iokit_acct == this_entry->iokit_acct) &&
15089 (prev_entry->vme_resilient_codesign ==
15090 this_entry->vme_resilient_codesign) &&
15091 (prev_entry->vme_resilient_media ==
15092 this_entry->vme_resilient_media) &&
15093 (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
15094
15095 (prev_entry->wired_count == this_entry->wired_count) &&
15096 (prev_entry->user_wired_count == this_entry->user_wired_count) &&
15097
15098 ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
15099 (prev_entry->in_transition == FALSE) &&
15100 (this_entry->in_transition == FALSE) &&
15101 (prev_entry->needs_wakeup == FALSE) &&
15102 (this_entry->needs_wakeup == FALSE) &&
15103 (prev_entry->is_shared == this_entry->is_shared) &&
15104 (prev_entry->superpage_size == FALSE) &&
15105 (this_entry->superpage_size == FALSE)
15106 ) {
15107 vm_map_store_entry_unlink(map, prev_entry);
15108 assert(prev_entry->vme_start < this_entry->vme_end);
15109 if (prev_entry->map_aligned) {
15110 assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
15111 VM_MAP_PAGE_MASK(map)));
15112 }
15113 this_entry->vme_start = prev_entry->vme_start;
15114 VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
15115
15116 if (map->holelistenabled) {
15117 vm_map_store_update_first_free(map, this_entry, TRUE);
15118 }
15119
15120 if (prev_entry->is_sub_map) {
15121 vm_map_deallocate(VME_SUBMAP(prev_entry));
15122 } else {
15123 vm_object_deallocate(VME_OBJECT(prev_entry));
15124 }
15125 vm_map_entry_dispose(prev_entry);
15126 SAVE_HINT_MAP_WRITE(map, this_entry);
15127 }
15128 }
15129
15130 void
15131 vm_map_simplify(
15132 vm_map_t map,
15133 vm_map_offset_t start)
15134 {
15135 vm_map_entry_t this_entry;
15136
15137 vm_map_lock(map);
15138 if (vm_map_lookup_entry(map, start, &this_entry)) {
15139 vm_map_simplify_entry(map, this_entry);
15140 vm_map_simplify_entry(map, this_entry->vme_next);
15141 }
15142 vm_map_unlock(map);
15143 }
15144
15145 static void
15146 vm_map_simplify_range(
15147 vm_map_t map,
15148 vm_map_offset_t start,
15149 vm_map_offset_t end)
15150 {
15151 vm_map_entry_t entry;
15152
15153 /*
15154 * The map should be locked (for "write") by the caller.
15155 */
15156
15157 if (start >= end) {
15158 /* invalid address range */
15159 return;
15160 }
15161
15162 start = vm_map_trunc_page(start,
15163 VM_MAP_PAGE_MASK(map));
15164 end = vm_map_round_page(end,
15165 VM_MAP_PAGE_MASK(map));
15166
15167 if (!vm_map_lookup_entry(map, start, &entry)) {
15168 /* "start" is not mapped and "entry" ends before "start" */
15169 if (entry == vm_map_to_entry(map)) {
15170 /* start with first entry in the map */
15171 entry = vm_map_first_entry(map);
15172 } else {
15173 /* start with next entry */
15174 entry = entry->vme_next;
15175 }
15176 }
15177
15178 while (entry != vm_map_to_entry(map) &&
15179 entry->vme_start <= end) {
15180 /* try and coalesce "entry" with its previous entry */
15181 vm_map_simplify_entry(map, entry);
15182 entry = entry->vme_next;
15183 }
15184 }
15185
15186
15187 /*
15188 * Routine: vm_map_machine_attribute
15189 * Purpose:
15190 * Provide machine-specific attributes to mappings,
15191 * such as cacheability etc. for machines that provide
15192 * them. NUMA architectures and machines with big/strange
15193 * caches will use this.
15194 * Note:
15195 * Responsibilities for locking and checking are handled here;
15196 * everything else is handled in the pmap module. If any non-volatile
15197 * information must be kept, the pmap module should handle
15198 * it itself. [This assumes that attributes do not
15199 * need to be inherited, which seems ok to me]
15200 */
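/*
 * Illustrative (non-compiled) user-space sketch of the entry point that
 * lands here: flushing the cache for a range via MATTR_CACHE, which is
 * the attribute that takes the explicit per-page traversal below.  The
 * prototype and constants are assumed to match the user-level
 * <mach/vm_map.h> and <mach/vm_attributes.h> headers; a sketch only.
 */
#if 0   /* example only -- never compiled into vm_map.c */
#include <mach/mach.h>

static kern_return_t
demo_cache_flush(vm_address_t addr, vm_size_t size)
{
	vm_machine_attribute_val_t value = MATTR_VAL_CACHE_FLUSH;

	/* MATTR_CACHE requires walking the physical pages in the kernel */
	return vm_machine_attribute(mach_task_self(), addr, size,
	    MATTR_CACHE, &value);
}
#endif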
15201 kern_return_t
15202 vm_map_machine_attribute(
15203 vm_map_t map,
15204 vm_map_offset_t start,
15205 vm_map_offset_t end,
15206 vm_machine_attribute_t attribute,
15207 vm_machine_attribute_val_t* value) /* IN/OUT */
15208 {
15209 kern_return_t ret;
15210 vm_map_size_t sync_size;
15211 vm_map_entry_t entry;
15212
15213 if (start < vm_map_min(map) || end > vm_map_max(map)) {
15214 return KERN_INVALID_ADDRESS;
15215 }
15216
15217 /* Figure how much memory we need to flush (in page increments) */
15218 sync_size = end - start;
15219
15220 vm_map_lock(map);
15221
15222 if (attribute != MATTR_CACHE) {
15223 /* If we don't have to find physical addresses, we */
15224 /* don't have to do an explicit traversal here. */
15225 ret = pmap_attribute(map->pmap, start, end - start,
15226 attribute, value);
15227 vm_map_unlock(map);
15228 return ret;
15229 }
15230
15231 ret = KERN_SUCCESS; /* Assume it all worked */
15232
15233 while (sync_size) {
15234 if (vm_map_lookup_entry(map, start, &entry)) {
15235 vm_map_size_t sub_size;
15236 if ((entry->vme_end - start) > sync_size) {
15237 sub_size = sync_size;
15238 sync_size = 0;
15239 } else {
15240 sub_size = entry->vme_end - start;
15241 sync_size -= sub_size;
15242 }
15243 if (entry->is_sub_map) {
15244 vm_map_offset_t sub_start;
15245 vm_map_offset_t sub_end;
15246
15247 sub_start = (start - entry->vme_start)
15248 + VME_OFFSET(entry);
15249 sub_end = sub_start + sub_size;
15250 vm_map_machine_attribute(
15251 VME_SUBMAP(entry),
15252 sub_start,
15253 sub_end,
15254 attribute, value);
15255 } else if (VME_OBJECT(entry)) {
15256 vm_page_t m;
15257 vm_object_t object;
15258 vm_object_t base_object;
15259 vm_object_t last_object;
15260 vm_object_offset_t offset;
15261 vm_object_offset_t base_offset;
15262 vm_map_size_t range;
15263 range = sub_size;
15264 offset = (start - entry->vme_start)
15265 + VME_OFFSET(entry);
15266 offset = vm_object_trunc_page(offset);
15267 base_offset = offset;
15268 object = VME_OBJECT(entry);
15269 base_object = object;
15270 last_object = NULL;
15271
15272 vm_object_lock(object);
15273
15274 while (range) {
15275 m = vm_page_lookup(
15276 object, offset);
15277
15278 if (m && !m->vmp_fictitious) {
15279 ret =
15280 pmap_attribute_cache_sync(
15281 VM_PAGE_GET_PHYS_PAGE(m),
15282 PAGE_SIZE,
15283 attribute, value);
15284 } else if (object->shadow) {
15285 offset = offset + object->vo_shadow_offset;
15286 last_object = object;
15287 object = object->shadow;
15288 vm_object_lock(last_object->shadow);
15289 vm_object_unlock(last_object);
15290 continue;
15291 }
15292 if (range < PAGE_SIZE) {
15293 range = 0;
15294 } else {
15295 range -= PAGE_SIZE;
15296 }
15297
15298 if (base_object != object) {
15299 vm_object_unlock(object);
15300 vm_object_lock(base_object);
15301 object = base_object;
15302 }
15303 /* Bump to the next page */
15304 base_offset += PAGE_SIZE;
15305 offset = base_offset;
15306 }
15307 vm_object_unlock(object);
15308 }
15309 start += sub_size;
15310 } else {
15311 vm_map_unlock(map);
15312 return KERN_FAILURE;
15313 }
15314 }
15315
15316 vm_map_unlock(map);
15317
15318 return ret;
15319 }
15320
15321 /*
15322 * vm_map_behavior_set:
15323 *
15324 * Sets the paging reference behavior of the specified address
15325 * range in the target map. Paging reference behavior affects
15326 * how pagein operations resulting from faults on the map will be
15327 * clustered.
15328 */
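/*
 * Illustrative (non-compiled) user-space sketch: the persistent behaviors
 * handled by the first switch block below are typically reached via
 * madvise(2); the BSD layer is assumed to translate MADV_NORMAL,
 * MADV_RANDOM and MADV_SEQUENTIAL into the corresponding VM_BEHAVIOR_*
 * values before calling this routine.
 */
#if 0   /* example only -- never compiled into vm_map.c */
#include <sys/mman.h>
#include <stddef.h>

static void
demo_hint_sequential(void *buf, size_t len)
{
	/* ask for aggressive read-ahead clustering on this range */
	if (madvise(buf, len, MADV_SEQUENTIAL) != 0) {
		/* advice is best-effort; a failure is not fatal */
	}
}
#endif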
15329 kern_return_t
15330 vm_map_behavior_set(
15331 vm_map_t map,
15332 vm_map_offset_t start,
15333 vm_map_offset_t end,
15334 vm_behavior_t new_behavior)
15335 {
15336 vm_map_entry_t entry;
15337 vm_map_entry_t temp_entry;
15338
15339 if (start > end ||
15340 start < vm_map_min(map) ||
15341 end > vm_map_max(map)) {
15342 return KERN_NO_SPACE;
15343 }
15344
15345 switch (new_behavior) {
15346 /*
15347 * This first block of behaviors all set a persistent state on the specified
15348 * memory range. All we have to do here is to record the desired behavior
15349 * in the vm_map_entry_t's.
15350 */
15351
15352 case VM_BEHAVIOR_DEFAULT:
15353 case VM_BEHAVIOR_RANDOM:
15354 case VM_BEHAVIOR_SEQUENTIAL:
15355 case VM_BEHAVIOR_RSEQNTL:
15356 case VM_BEHAVIOR_ZERO_WIRED_PAGES:
15357 vm_map_lock(map);
15358
15359 /*
15360 * The entire address range must be valid for the map.
15361 * Note that vm_map_range_check() does a
15362 * vm_map_lookup_entry() internally and returns the
15363 * entry containing the start of the address range if
15364 * the entire range is valid.
15365 */
15366 if (vm_map_range_check(map, start, end, &temp_entry)) {
15367 entry = temp_entry;
15368 vm_map_clip_start(map, entry, start);
15369 } else {
15370 vm_map_unlock(map);
15371 return KERN_INVALID_ADDRESS;
15372 }
15373
15374 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
15375 vm_map_clip_end(map, entry, end);
15376 if (entry->is_sub_map) {
15377 assert(!entry->use_pmap);
15378 }
15379
15380 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
15381 entry->zero_wired_pages = TRUE;
15382 } else {
15383 entry->behavior = new_behavior;
15384 }
15385 entry = entry->vme_next;
15386 }
15387
15388 vm_map_unlock(map);
15389 break;
15390
15391 /*
15392 * The rest of these are different from the above in that they cause
15393 * an immediate action to take place as opposed to setting a behavior that
15394 * affects future actions.
15395 */
15396
15397 case VM_BEHAVIOR_WILLNEED:
15398 return vm_map_willneed(map, start, end);
15399
15400 case VM_BEHAVIOR_DONTNEED:
15401 return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
15402
15403 case VM_BEHAVIOR_FREE:
15404 return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
15405
15406 case VM_BEHAVIOR_REUSABLE:
15407 return vm_map_reusable_pages(map, start, end);
15408
15409 case VM_BEHAVIOR_REUSE:
15410 return vm_map_reuse_pages(map, start, end);
15411
15412 case VM_BEHAVIOR_CAN_REUSE:
15413 return vm_map_can_reuse(map, start, end);
15414
15415 #if MACH_ASSERT
15416 case VM_BEHAVIOR_PAGEOUT:
15417 return vm_map_pageout(map, start, end);
15418 #endif /* MACH_ASSERT */
15419
15420 default:
15421 return KERN_INVALID_ARGUMENT;
15422 }
15423
15424 return KERN_SUCCESS;
15425 }
15426
15427
15428 /*
15429 * Internals for madvise(MADV_WILLNEED) system call.
15430 *
15431 * The implementation is to:
15432 * a) issue read-ahead if the mapping corresponds to a mapped regular file,
15433 * b) or fault in the pages (zero-fill, decompress, etc.) if it's an anonymous mapping.
15434 */
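/*
 * Illustrative (non-compiled) user-space sketch of the two cases described
 * above: MADV_WILLNEED on a file-backed mapping triggers asynchronous
 * read-ahead through the pager, while on an anonymous mapping the kernel
 * pre-faults (zero-fills or decompresses) the pages.
 */
#if 0   /* example only -- never compiled into vm_map.c */
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

static void
demo_willneed(const char *path, size_t len)
{
	int fd = open(path, O_RDONLY);
	void *file_map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
	void *anon_map = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANON, -1, 0);

	if (file_map != MAP_FAILED) {
		/* case (a): asynchronous read-ahead from the file */
		(void)madvise(file_map, len, MADV_WILLNEED);
	}
	if (anon_map != MAP_FAILED) {
		/* case (b): pre-fault the anonymous pages */
		(void)madvise(anon_map, len, MADV_WILLNEED);
	}
	/* unmap/cleanup omitted for brevity */
	close(fd);
}
#endif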
15435
15436
15437 static kern_return_t
15438 vm_map_willneed(
15439 vm_map_t map,
15440 vm_map_offset_t start,
15441 vm_map_offset_t end
15442 )
15443 {
15444 vm_map_entry_t entry;
15445 vm_object_t object;
15446 memory_object_t pager;
15447 struct vm_object_fault_info fault_info = {};
15448 kern_return_t kr;
15449 vm_object_size_t len;
15450 vm_object_offset_t offset;
15451
15452 fault_info.interruptible = THREAD_UNINT; /* ignored value */
15453 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
15454 fault_info.stealth = TRUE;
15455
15456 /*
15457 * The MADV_WILLNEED operation doesn't require any changes to the
15458 * vm_map_entry_t's, so the read lock is sufficient.
15459 */
15460
15461 vm_map_lock_read(map);
15462
15463 /*
15464 * The madvise semantics require that the address range be fully
15465 * allocated with no holes. Otherwise, we're required to return
15466 * an error.
15467 */
15468
15469 if (!vm_map_range_check(map, start, end, &entry)) {
15470 vm_map_unlock_read(map);
15471 return KERN_INVALID_ADDRESS;
15472 }
15473
15474 /*
15475 * Examine each vm_map_entry_t in the range.
15476 */
15477 for (; entry != vm_map_to_entry(map) && start < end;) {
15478 /*
15479 * The first time through, the start address could be anywhere
15480 * within the vm_map_entry we found. So adjust the offset to
15481 * correspond. After that, the offset will always be zero to
15482 * correspond to the beginning of the current vm_map_entry.
15483 */
15484 offset = (start - entry->vme_start) + VME_OFFSET(entry);
15485
15486 /*
15487 * Set the length so we don't go beyond the end of the
15488 * map_entry or beyond the end of the range we were given.
15489 * This range could also span multiple map entries, all of which
15490 * map different files, so make sure we only do the right amount
15491 * of I/O for each object. Note that it's possible for there
15492 * to be multiple map entries all referring to the same object
15493 * but with different page permissions, but it's not worth
15494 * trying to optimize that case.
15495 */
15496 len = MIN(entry->vme_end - start, end - start);
15497
15498 if ((vm_size_t) len != len) {
15499 /* 32-bit overflow */
15500 len = (vm_size_t) (0 - PAGE_SIZE);
15501 }
15502 fault_info.cluster_size = (vm_size_t) len;
15503 fault_info.lo_offset = offset;
15504 fault_info.hi_offset = offset + len;
15505 fault_info.user_tag = VME_ALIAS(entry);
15506 fault_info.pmap_options = 0;
15507 if (entry->iokit_acct ||
15508 (!entry->is_sub_map && !entry->use_pmap)) {
15509 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
15510 }
15511
15512 /*
15513 * If the entry is a submap OR there's no read permission
15514 * to this mapping, then just skip it.
15515 */
15516 if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
15517 entry = entry->vme_next;
15518 start = entry->vme_start;
15519 continue;
15520 }
15521
15522 object = VME_OBJECT(entry);
15523
15524 if (object == NULL ||
15525 (object && object->internal)) {
15526 /*
15527 * Memory range backed by anonymous memory.
15528 */
15529 vm_size_t region_size = 0, effective_page_size = 0;
15530 vm_map_offset_t addr = 0, effective_page_mask = 0;
15531
15532 region_size = len;
15533 addr = start;
15534
15535 effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
15536 effective_page_size = effective_page_mask + 1;
15537
15538 vm_map_unlock_read(map);
15539
15540 while (region_size) {
15541 vm_pre_fault(
15542 vm_map_trunc_page(addr, effective_page_mask),
15543 VM_PROT_READ | VM_PROT_WRITE);
15544
15545 region_size -= effective_page_size;
15546 addr += effective_page_size;
15547 }
15548 } else {
15549 /*
15550 * Find the file object backing this map entry. If there is
15551 * none, then we simply ignore the "will need" advice for this
15552 * entry and go on to the next one.
15553 */
15554 if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
15555 entry = entry->vme_next;
15556 start = entry->vme_start;
15557 continue;
15558 }
15559
15560 vm_object_paging_begin(object);
15561 pager = object->pager;
15562 vm_object_unlock(object);
15563
15564 /*
15565 * The data_request() could take a long time, so let's
15566 * release the map lock to avoid blocking other threads.
15567 */
15568 vm_map_unlock_read(map);
15569
15570 /*
15571 * Get the data from the object asynchronously.
15572 *
15573 * Note that memory_object_data_request() places limits on the
15574 * amount of I/O it will do. Regardless of the len we
15575 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
15576 * silently truncates the len to that size. This isn't
15577 * necessarily bad since madvise shouldn't really be used to
15578 * page in unlimited amounts of data. Other Unix variants
15579 * limit the willneed case as well. If this turns out to be an
15580 * issue for developers, then we can always adjust the policy
15581 * here and still be backwards compatible since this is all
15582 * just "advice".
15583 */
15584 kr = memory_object_data_request(
15585 pager,
15586 vm_object_trunc_page(offset) + object->paging_offset,
15587 0, /* ignored */
15588 VM_PROT_READ,
15589 (memory_object_fault_info_t)&fault_info);
15590
15591 vm_object_lock(object);
15592 vm_object_paging_end(object);
15593 vm_object_unlock(object);
15594
15595 /*
15596 * If we couldn't do the I/O for some reason, just give up on
15597 * the madvise. We still return success to the user since
15598 * madvise isn't supposed to fail when the advice can't be
15599 * taken.
15600 */
15601
15602 if (kr != KERN_SUCCESS) {
15603 return KERN_SUCCESS;
15604 }
15605 }
15606
15607 start += len;
15608 if (start >= end) {
15609 /* done */
15610 return KERN_SUCCESS;
15611 }
15612
15613 /* look up next entry */
15614 vm_map_lock_read(map);
15615 if (!vm_map_lookup_entry(map, start, &entry)) {
15616 /*
15617 * There's a new hole in the address range.
15618 */
15619 vm_map_unlock_read(map);
15620 return KERN_INVALID_ADDRESS;
15621 }
15622 }
15623
15624 vm_map_unlock_read(map);
15625 return KERN_SUCCESS;
15626 }
15627
15628 static boolean_t
15629 vm_map_entry_is_reusable(
15630 vm_map_entry_t entry)
15631 {
15632 /* Only user map entries */
15633
15634 vm_object_t object;
15635
15636 if (entry->is_sub_map) {
15637 return FALSE;
15638 }
15639
15640 switch (VME_ALIAS(entry)) {
15641 case VM_MEMORY_MALLOC:
15642 case VM_MEMORY_MALLOC_SMALL:
15643 case VM_MEMORY_MALLOC_LARGE:
15644 case VM_MEMORY_REALLOC:
15645 case VM_MEMORY_MALLOC_TINY:
15646 case VM_MEMORY_MALLOC_LARGE_REUSABLE:
15647 case VM_MEMORY_MALLOC_LARGE_REUSED:
15648 /*
15649 * This is a malloc() memory region: check if it's still
15650 * in its original state and can be re-used for more
15651 * malloc() allocations.
15652 */
15653 break;
15654 default:
15655 /*
15656 * Not a malloc() memory region: let the caller decide if
15657 * it's re-usable.
15658 */
15659 return TRUE;
15660 }
15661
15662 if (/*entry->is_shared ||*/
15663 entry->is_sub_map ||
15664 entry->in_transition ||
15665 entry->protection != VM_PROT_DEFAULT ||
15666 entry->max_protection != VM_PROT_ALL ||
15667 entry->inheritance != VM_INHERIT_DEFAULT ||
15668 entry->no_cache ||
15669 entry->permanent ||
15670 entry->superpage_size != FALSE ||
15671 entry->zero_wired_pages ||
15672 entry->wired_count != 0 ||
15673 entry->user_wired_count != 0) {
15674 return FALSE;
15675 }
15676
15677 object = VME_OBJECT(entry);
15678 if (object == VM_OBJECT_NULL) {
15679 return TRUE;
15680 }
15681 if (
15682 #if 0
15683 /*
15684 * Let's proceed even if the VM object is potentially
15685 * shared.
15686 * We check for this later when processing the actual
15687 * VM pages, so the contents will be safe if shared.
15688 *
15689 * But we can still mark this memory region as "reusable" to
15690 * acknowledge that the caller did let us know that the memory
15691 * could be re-used and should not be penalized for holding
15692 * on to it. This allows its "resident size" to not include
15693 * the reusable range.
15694 */
15695 object->ref_count == 1 &&
15696 #endif
15697 object->wired_page_count == 0 &&
15698 object->copy == VM_OBJECT_NULL &&
15699 object->shadow == VM_OBJECT_NULL &&
15700 object->internal &&
15701 object->purgable == VM_PURGABLE_DENY &&
15702 object->wimg_bits == VM_WIMG_USE_DEFAULT &&
15703 !object->code_signed) {
15704 return TRUE;
15705 }
15706 return FALSE;
15707 }
15708
15709 static kern_return_t
15710 vm_map_reuse_pages(
15711 vm_map_t map,
15712 vm_map_offset_t start,
15713 vm_map_offset_t end)
15714 {
15715 vm_map_entry_t entry;
15716 vm_object_t object;
15717 vm_object_offset_t start_offset, end_offset;
15718
15719 /*
15720 * The MADV_REUSE operation doesn't require any changes to the
15721 * vm_map_entry_t's, so the read lock is sufficient.
15722 */
15723
15724 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
15725 /*
15726 * XXX TODO4K
15727 * need to figure out what reusable means for a
15728 * portion of a native page.
15729 */
15730 return KERN_SUCCESS;
15731 }
15732
15733 vm_map_lock_read(map);
15734 assert(map->pmap != kernel_pmap); /* protect alias access */
15735
15736 /*
15737 * The madvise semantics require that the address range be fully
15738 * allocated with no holes. Otherwise, we're required to return
15739 * an error.
15740 */
15741
15742 if (!vm_map_range_check(map, start, end, &entry)) {
15743 vm_map_unlock_read(map);
15744 vm_page_stats_reusable.reuse_pages_failure++;
15745 return KERN_INVALID_ADDRESS;
15746 }
15747
15748 /*
15749 * Examine each vm_map_entry_t in the range.
15750 */
15751 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
15752 entry = entry->vme_next) {
15753 /*
15754 * Sanity check on the VM map entry.
15755 */
15756 if (!vm_map_entry_is_reusable(entry)) {
15757 vm_map_unlock_read(map);
15758 vm_page_stats_reusable.reuse_pages_failure++;
15759 return KERN_INVALID_ADDRESS;
15760 }
15761
15762 /*
15763 * The first time through, the start address could be anywhere
15764 * within the vm_map_entry we found. So adjust the offset to
15765 * correspond.
15766 */
15767 if (entry->vme_start < start) {
15768 start_offset = start - entry->vme_start;
15769 } else {
15770 start_offset = 0;
15771 }
15772 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
15773 start_offset += VME_OFFSET(entry);
15774 end_offset += VME_OFFSET(entry);
15775
15776 object = VME_OBJECT(entry);
15777 if (object != VM_OBJECT_NULL) {
15778 vm_object_lock(object);
15779 vm_object_reuse_pages(object, start_offset, end_offset,
15780 TRUE);
15781 vm_object_unlock(object);
15782 }
15783
15784 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
15785 /*
15786 * XXX
15787 * We do not hold the VM map exclusively here.
15788 * The "alias" field is not that critical, so it's
15789 * safe to update it here, as long as it is the only
15790 * one that can be modified while holding the VM map
15791 * "shared".
15792 */
15793 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
15794 }
15795 }
15796
15797 vm_map_unlock_read(map);
15798 vm_page_stats_reusable.reuse_pages_success++;
15799 return KERN_SUCCESS;
15800 }
15801
15802
15803 static kern_return_t
15804 vm_map_reusable_pages(
15805 vm_map_t map,
15806 vm_map_offset_t start,
15807 vm_map_offset_t end)
15808 {
15809 vm_map_entry_t entry;
15810 vm_object_t object;
15811 vm_object_offset_t start_offset, end_offset;
15812 vm_map_offset_t pmap_offset;
15813
15814 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
15815 /*
15816 * XXX TODO4K
15817 * need to figure out what reusable means for a portion
15818 * of a native page.
15819 */
15820 return KERN_SUCCESS;
15821 }
15822
15823 /*
15824 * The MADV_REUSABLE operation doesn't require any changes to the
15825 * vm_map_entry_t's, so the read lock is sufficient.
15826 */
15827
15828 vm_map_lock_read(map);
15829 assert(map->pmap != kernel_pmap); /* protect alias access */
15830
15831 /*
15832 * The madvise semantics require that the address range be fully
15833 * allocated with no holes. Otherwise, we're required to return
15834 * an error.
15835 */
15836
15837 if (!vm_map_range_check(map, start, end, &entry)) {
15838 vm_map_unlock_read(map);
15839 vm_page_stats_reusable.reusable_pages_failure++;
15840 return KERN_INVALID_ADDRESS;
15841 }
15842
15843 /*
15844 * Examine each vm_map_entry_t in the range.
15845 */
15846 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
15847 entry = entry->vme_next) {
15848 int kill_pages = 0;
15849
15850 /*
15851 * Sanity check on the VM map entry.
15852 */
15853 if (!vm_map_entry_is_reusable(entry)) {
15854 vm_map_unlock_read(map);
15855 vm_page_stats_reusable.reusable_pages_failure++;
15856 return KERN_INVALID_ADDRESS;
15857 }
15858
15859 if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit) {
15860 /* not writable: can't discard contents */
15861 vm_map_unlock_read(map);
15862 vm_page_stats_reusable.reusable_nonwritable++;
15863 vm_page_stats_reusable.reusable_pages_failure++;
15864 return KERN_PROTECTION_FAILURE;
15865 }
15866
15867 /*
15868 * The first time through, the start address could be anywhere
15869 * within the vm_map_entry we found. So adjust the offset to
15870 * correspond.
15871 */
15872 if (entry->vme_start < start) {
15873 start_offset = start - entry->vme_start;
15874 pmap_offset = start;
15875 } else {
15876 start_offset = 0;
15877 pmap_offset = entry->vme_start;
15878 }
15879 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
15880 start_offset += VME_OFFSET(entry);
15881 end_offset += VME_OFFSET(entry);
15882
15883 object = VME_OBJECT(entry);
15884 if (object == VM_OBJECT_NULL) {
15885 continue;
15886 }
15887
15888
15889 vm_object_lock(object);
15890 if (((object->ref_count == 1) ||
15891 (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
15892 object->copy == VM_OBJECT_NULL)) &&
15893 object->shadow == VM_OBJECT_NULL &&
15894 /*
15895 * "iokit_acct" entries are billed for their virtual size
15896 * (rather than for their resident pages only), so they
15897 * wouldn't benefit from making pages reusable, and it
15898 * would be hard to keep track of pages that are both
15899 * "iokit_acct" and "reusable" in the pmap stats and
15900 * ledgers.
15901 */
15902 !(entry->iokit_acct ||
15903 (!entry->is_sub_map && !entry->use_pmap))) {
15904 if (object->ref_count != 1) {
15905 vm_page_stats_reusable.reusable_shared++;
15906 }
15907 kill_pages = 1;
15908 } else {
15909 kill_pages = -1;
15910 }
15911 if (kill_pages != -1) {
15912 vm_object_deactivate_pages(object,
15913 start_offset,
15914 end_offset - start_offset,
15915 kill_pages,
15916 TRUE /*reusable_pages*/,
15917 map->pmap,
15918 pmap_offset);
15919 } else {
15920 vm_page_stats_reusable.reusable_pages_shared++;
15921 DTRACE_VM4(vm_map_reusable_pages_shared,
15922 unsigned int, VME_ALIAS(entry),
15923 vm_map_t, map,
15924 vm_map_entry_t, entry,
15925 vm_object_t, object);
15926 }
15927 vm_object_unlock(object);
15928
15929 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
15930 VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
15931 /*
15932 * XXX
15933 * We do not hold the VM map exclusively here.
15934 * The "alias" field is not that critical, so it's
15935 * safe to update it here, as long as it is the only
15936 * one that can be modified while holding the VM map
15937 * "shared".
15938 */
15939 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
15940 }
15941 }
15942
15943 vm_map_unlock_read(map);
15944 vm_page_stats_reusable.reusable_pages_success++;
15945 return KERN_SUCCESS;
15946 }
15947
15948
15949 static kern_return_t
15950 vm_map_can_reuse(
15951 vm_map_t map,
15952 vm_map_offset_t start,
15953 vm_map_offset_t end)
15954 {
15955 vm_map_entry_t entry;
15956
15957 /*
15958 * The MADV_REUSABLE operation doesn't require any changes to the
15959 * vm_map_entry_t's, so the read lock is sufficient.
15960 */
15961
15962 vm_map_lock_read(map);
15963 assert(map->pmap != kernel_pmap); /* protect alias access */
15964
15965 /*
15966 * The madvise semantics require that the address range be fully
15967 * allocated with no holes. Otherwise, we're required to return
15968 * an error.
15969 */
15970
15971 if (!vm_map_range_check(map, start, end, &entry)) {
15972 vm_map_unlock_read(map);
15973 vm_page_stats_reusable.can_reuse_failure++;
15974 return KERN_INVALID_ADDRESS;
15975 }
15976
15977 /*
15978 * Examine each vm_map_entry_t in the range.
15979 */
15980 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
15981 entry = entry->vme_next) {
15982 /*
15983 * Sanity check on the VM map entry.
15984 */
15985 if (!vm_map_entry_is_reusable(entry)) {
15986 vm_map_unlock_read(map);
15987 vm_page_stats_reusable.can_reuse_failure++;
15988 return KERN_INVALID_ADDRESS;
15989 }
15990 }
15991
15992 vm_map_unlock_read(map);
15993 vm_page_stats_reusable.can_reuse_success++;
15994 return KERN_SUCCESS;
15995 }
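/*
 * Illustrative (non-compiled) user-space sketch of the malloc-style
 * protocol implemented by vm_map_reusable_pages(), vm_map_reuse_pages()
 * and vm_map_can_reuse() above.  On Darwin, MADV_FREE_REUSABLE,
 * MADV_FREE_REUSE and MADV_CAN_REUSE are assumed to map to
 * VM_BEHAVIOR_REUSABLE, VM_BEHAVIOR_REUSE and VM_BEHAVIOR_CAN_REUSE.
 */
#if 0   /* example only -- never compiled into vm_map.c */
#include <sys/mman.h>
#include <stddef.h>

static void
demo_reusable_cycle(void *chunk, size_t len)
{
	/*
	 * The allocator no longer needs the contents: let the VM reclaim
	 * the pages and stop charging them to the resident footprint.
	 */
	(void)madvise(chunk, len, MADV_FREE_REUSABLE);

	/* ... later, just before handing the chunk out again ... */
	(void)madvise(chunk, len, MADV_FREE_REUSE);
}
#endif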
15996
15997
15998 #if MACH_ASSERT
15999 static kern_return_t
16000 vm_map_pageout(
16001 vm_map_t map,
16002 vm_map_offset_t start,
16003 vm_map_offset_t end)
16004 {
16005 vm_map_entry_t entry;
16006
16007 /*
16008 * The MADV_PAGEOUT operation doesn't require any changes to the
16009 * vm_map_entry_t's, so the read lock is sufficient.
16010 */
16011
16012 vm_map_lock_read(map);
16013
16014 /*
16015 * The madvise semantics require that the address range be fully
16016 * allocated with no holes. Otherwise, we're required to return
16017 * an error.
16018 */
16019
16020 if (!vm_map_range_check(map, start, end, &entry)) {
16021 vm_map_unlock_read(map);
16022 return KERN_INVALID_ADDRESS;
16023 }
16024
16025 /*
16026 * Examine each vm_map_entry_t in the range.
16027 */
16028 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16029 entry = entry->vme_next) {
16030 vm_object_t object;
16031
16032 /*
16033 * Sanity check on the VM map entry.
16034 */
16035 if (entry->is_sub_map) {
16036 vm_map_t submap;
16037 vm_map_offset_t submap_start;
16038 vm_map_offset_t submap_end;
16039 vm_map_entry_t submap_entry;
16040
16041 submap = VME_SUBMAP(entry);
16042 submap_start = VME_OFFSET(entry);
16043 submap_end = submap_start + (entry->vme_end -
16044 entry->vme_start);
16045
16046 vm_map_lock_read(submap);
16047
16048 if (!vm_map_range_check(submap,
16049 submap_start,
16050 submap_end,
16051 &submap_entry)) {
16052 vm_map_unlock_read(submap);
16053 vm_map_unlock_read(map);
16054 return KERN_INVALID_ADDRESS;
16055 }
16056
16057 if (submap_entry->is_sub_map) {
16058 vm_map_unlock_read(submap);
16059 continue;
16060 }
16061
16062 object = VME_OBJECT(submap_entry);
16063 if (object == VM_OBJECT_NULL || !object->internal) {
16064 vm_map_unlock_read(submap);
16065 continue;
16066 }
16067
16068 vm_object_pageout(object);
16069
16070 vm_map_unlock_read(submap);
16071 submap = VM_MAP_NULL;
16072 submap_entry = VM_MAP_ENTRY_NULL;
16073 continue;
16074 }
16075
16076 object = VME_OBJECT(entry);
16077 if (object == VM_OBJECT_NULL || !object->internal) {
16078 continue;
16079 }
16080
16081 vm_object_pageout(object);
16082 }
16083
16084 vm_map_unlock_read(map);
16085 return KERN_SUCCESS;
16086 }
16087 #endif /* MACH_ASSERT */
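/*
 * Illustrative (non-compiled) user-space sketch: on kernels built with
 * MACH_ASSERT, the VM_BEHAVIOR_PAGEOUT case is assumed to be reachable
 * through madvise(2) with MADV_PAGEOUT, forcing the anonymous pages of a
 * range out of memory.  On kernels without that case, vm_map_behavior_set()
 * falls through to its default and returns KERN_INVALID_ARGUMENT.
 */
#if 0   /* example only -- never compiled into vm_map.c */
#include <sys/mman.h>
#include <stddef.h>

static void
demo_pageout(void *buf, size_t len)
{
	/* best-effort; rejected on kernels that do not support it */
	(void)madvise(buf, len, MADV_PAGEOUT);
}
#endif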
16088
16089
16090 /*
16091 * Routine: vm_map_entry_insert
16092 *
16093 * Description: This routine inserts a new vm_entry in a locked map.
16094 */
16095 static vm_map_entry_t
16096 vm_map_entry_insert(
16097 vm_map_t map,
16098 vm_map_entry_t insp_entry,
16099 vm_map_offset_t start,
16100 vm_map_offset_t end,
16101 vm_object_t object,
16102 vm_object_offset_t offset,
16103 vm_map_kernel_flags_t vmk_flags,
16104 boolean_t needs_copy,
16105 vm_prot_t cur_protection,
16106 vm_prot_t max_protection,
16107 vm_inherit_t inheritance,
16108 boolean_t no_cache,
16109 boolean_t permanent,
16110 unsigned int superpage_size,
16111 boolean_t clear_map_aligned,
16112 int alias)
16113 {
16114 vm_map_entry_t new_entry;
16115 boolean_t map_aligned = FALSE;
16116
16117 assert(insp_entry != (vm_map_entry_t)0);
16118 vm_map_lock_assert_exclusive(map);
16119
16120 #if DEVELOPMENT || DEBUG
16121 vm_object_offset_t end_offset = 0;
16122 assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
16123 #endif /* DEVELOPMENT || DEBUG */
16124
16125 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
16126 map_aligned = TRUE;
16127 }
16128 if (clear_map_aligned &&
16129 (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
16130 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
16131 map_aligned = FALSE;
16132 }
16133 if (map_aligned) {
16134 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
16135 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
16136 } else {
16137 assert(page_aligned(start));
16138 assert(page_aligned(end));
16139 }
16140 assert(start < end);
16141
16142 new_entry = vm_map_entry_create(map);
16143
16144 new_entry->vme_start = start;
16145 new_entry->vme_end = end;
16146
16147 if (vmk_flags.vmkf_submap) {
16148 new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
16149 VME_SUBMAP_SET(new_entry, (vm_map_t)object);
16150 } else {
16151 VME_OBJECT_SET(new_entry, object, false, 0);
16152 }
16153 VME_OFFSET_SET(new_entry, offset);
16154 VME_ALIAS_SET(new_entry, alias);
16155
16156 new_entry->map_aligned = map_aligned;
16157 new_entry->needs_copy = needs_copy;
16158 new_entry->inheritance = inheritance;
16159 new_entry->protection = cur_protection;
16160 new_entry->max_protection = max_protection;
16161 /*
16162 * submap: "use_pmap" means "nested".
16163 * default: false.
16164 *
16165 * object: "use_pmap" means "use pmap accounting" for footprint.
16166 * default: true.
16167 */
16168 new_entry->use_pmap = !vmk_flags.vmkf_submap;
16169 new_entry->no_cache = no_cache;
16170 new_entry->permanent = permanent;
16171 new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
16172 new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
16173 new_entry->superpage_size = (superpage_size != 0);
16174
16175 if (vmk_flags.vmkf_map_jit) {
16176 if (!(map->jit_entry_exists) ||
16177 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
16178 new_entry->used_for_jit = TRUE;
16179 map->jit_entry_exists = TRUE;
16180 }
16181 }
16182
16183 /*
16184 * Insert the new entry into the list.
16185 */
16186
16187 vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
16188 map->size += end - start;
16189
16190 /*
16191 * Update the free space hint and the lookup hint.
16192 */
16193
16194 SAVE_HINT_MAP_WRITE(map, new_entry);
16195 return new_entry;
16196 }
16197
16198 /*
16199 * Routine: vm_map_remap_extract
16200 *
16201 * Description: This routine returns a vm_entry list from a map.
16202 */
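/*
 * Illustrative (non-compiled) user-space sketch of the main client of this
 * extraction logic: mach_vm_remap().  In the "legacy" mode handled below,
 * both protections come in as VM_PROT_NONE and come back holding the
 * strictest protections found across the source range.  The prototype is
 * assumed to match the user-level <mach/mach_vm.h> declaration.
 */
#if 0   /* example only -- never compiled into vm_map.c */
#include <mach/mach.h>
#include <mach/mach_vm.h>

static kern_return_t
demo_remap_shared(mach_vm_address_t src, mach_vm_size_t size,
    mach_vm_address_t *dst)
{
	vm_prot_t cur = VM_PROT_NONE;   /* legacy mode: both start as NONE */
	vm_prot_t max = VM_PROT_NONE;

	*dst = 0;
	return mach_vm_remap(mach_task_self(), dst, size, 0,
	    VM_FLAGS_ANYWHERE, mach_task_self(), src,
	    FALSE,                      /* copy == FALSE: share, not COW */
	    &cur, &max, VM_INHERIT_DEFAULT);
}
#endif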
16203 static kern_return_t
16204 vm_map_remap_extract(
16205 vm_map_t map,
16206 vm_map_offset_t addr,
16207 vm_map_size_t size,
16208 boolean_t copy,
16209 struct vm_map_header *map_header,
16210 vm_prot_t *cur_protection, /* IN/OUT */
16211 vm_prot_t *max_protection, /* IN/OUT */
16212 /* What, no behavior? */
16213 vm_inherit_t inheritance,
16214 vm_map_kernel_flags_t vmk_flags)
16215 {
16216 kern_return_t result;
16217 vm_map_size_t mapped_size;
16218 vm_map_size_t tmp_size;
16219 vm_map_entry_t src_entry; /* result of last map lookup */
16220 vm_map_entry_t new_entry;
16221 vm_object_offset_t offset;
16222 vm_map_offset_t map_address;
16223 vm_map_offset_t src_start; /* start of entry to map */
16224 vm_map_offset_t src_end; /* end of region to be mapped */
16225 vm_object_t object;
16226 vm_map_version_t version;
16227 boolean_t src_needs_copy;
16228 boolean_t new_entry_needs_copy;
16229 vm_map_entry_t saved_src_entry;
16230 boolean_t src_entry_was_wired;
16231 vm_prot_t max_prot_for_prot_copy;
16232 vm_map_offset_t effective_page_mask;
16233 boolean_t pageable, same_map;
16234 boolean_t vm_remap_legacy;
16235 vm_prot_t required_cur_prot, required_max_prot;
16236 vm_object_t new_copy_object; /* vm_object_copy_* result */
16237 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
16238
16239 pageable = vmk_flags.vmkf_copy_pageable;
16240 same_map = vmk_flags.vmkf_copy_same_map;
16241
16242 effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
16243
16244 assert(map != VM_MAP_NULL);
16245 assert(size != 0);
16246 assert(size == vm_map_round_page(size, effective_page_mask));
16247 assert(inheritance == VM_INHERIT_NONE ||
16248 inheritance == VM_INHERIT_COPY ||
16249 inheritance == VM_INHERIT_SHARE);
16250 assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16251 assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16252 assert((*cur_protection & *max_protection) == *cur_protection);
16253
16254 /*
16255 * Compute start and end of region.
16256 */
16257 src_start = vm_map_trunc_page(addr, effective_page_mask);
16258 src_end = vm_map_round_page(src_start + size, effective_page_mask);
16259
16260 /*
16261 * Initialize map_header.
16262 */
16263 map_header->links.next = CAST_TO_VM_MAP_ENTRY(&map_header->links);
16264 map_header->links.prev = CAST_TO_VM_MAP_ENTRY(&map_header->links);
16265 map_header->nentries = 0;
16266 map_header->entries_pageable = pageable;
16267 // map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
16268 map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
16269 map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
16270
16271 vm_map_store_init( map_header );
16272
16273 if (copy && vmk_flags.vmkf_remap_prot_copy) {
16274 /*
16275 * Special case for vm_map_protect(VM_PROT_COPY):
16276 * we want to set the new mappings' max protection to the
16277 * specified *max_protection...
16278 */
16279 max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
16280 /* ... but we want to use the vm_remap() legacy mode */
16281 *max_protection = VM_PROT_NONE;
16282 *cur_protection = VM_PROT_NONE;
16283 } else {
16284 max_prot_for_prot_copy = VM_PROT_NONE;
16285 }
16286
16287 if (*cur_protection == VM_PROT_NONE &&
16288 *max_protection == VM_PROT_NONE) {
16289 /*
16290 * vm_remap() legacy mode:
16291 * Extract all memory regions in the specified range and
16292 * collect the strictest set of protections allowed on the
16293 * entire range, so the caller knows what they can do with
16294 * the remapped range.
16295 * We start with VM_PROT_ALL and we'll remove the protections
16296 * missing from each memory region.
16297 */
16298 vm_remap_legacy = TRUE;
16299 *cur_protection = VM_PROT_ALL;
16300 *max_protection = VM_PROT_ALL;
16301 required_cur_prot = VM_PROT_NONE;
16302 required_max_prot = VM_PROT_NONE;
16303 } else {
16304 /*
16305 * vm_remap_new() mode:
16306 * Extract all memory regions in the specified range and
16307 * ensure that they have at least the protections specified
16308 * by the caller via *cur_protection and *max_protection.
16309 * The resulting mapping should have these protections.
16310 */
16311 vm_remap_legacy = FALSE;
16312 if (copy) {
16313 required_cur_prot = VM_PROT_NONE;
16314 required_max_prot = VM_PROT_READ;
16315 } else {
16316 required_cur_prot = *cur_protection;
16317 required_max_prot = *max_protection;
16318 }
16319 }
16320
16321 map_address = 0;
16322 mapped_size = 0;
16323 result = KERN_SUCCESS;
16324
16325 /*
16326 * The specified source virtual space might correspond to
16327 * multiple map entries, need to loop on them.
16328 */
16329 vm_map_lock(map);
16330 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16331 /*
16332 * This address space uses sub-pages so the range might
16333 * not be re-mappable in an address space with larger
16334 * pages. Re-assemble any broken-up VM map entries to
16335 * improve our chances of making it work.
16336 */
16337 vm_map_simplify_range(map, src_start, src_end);
16338 }
16339 while (mapped_size != size) {
16340 vm_map_size_t entry_size;
16341
16342 /*
16343 * Find the beginning of the region.
16344 */
16345 if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
16346 result = KERN_INVALID_ADDRESS;
16347 break;
16348 }
16349
16350 if (src_start < src_entry->vme_start ||
16351 (mapped_size && src_start != src_entry->vme_start)) {
16352 result = KERN_INVALID_ADDRESS;
16353 break;
16354 }
16355
16356 tmp_size = size - mapped_size;
16357 if (src_end > src_entry->vme_end) {
16358 tmp_size -= (src_end - src_entry->vme_end);
16359 }
16360
16361 entry_size = (vm_map_size_t)(src_entry->vme_end -
16362 src_entry->vme_start);
16363
16364 if (src_entry->is_sub_map &&
16365 vmk_flags.vmkf_copy_single_object) {
16366 vm_map_t submap;
16367 vm_map_offset_t submap_start;
16368 vm_map_size_t submap_size;
16369 boolean_t submap_needs_copy;
16370
16371 /*
16372 * No check for "required protection" on "src_entry"
16373 * because the protections that matter are the ones
16374 * on the submap's VM map entry, which will be checked
16375 * during the call to vm_map_remap_extract() below.
16376 */
16377 submap_size = src_entry->vme_end - src_start;
16378 if (submap_size > size) {
16379 submap_size = size;
16380 }
16381 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16382 submap = VME_SUBMAP(src_entry);
16383 if (copy) {
16384 /*
16385 * The caller wants a copy-on-write re-mapping,
16386 * so let's extract from the submap accordingly.
16387 */
16388 submap_needs_copy = TRUE;
16389 } else if (src_entry->needs_copy) {
16390 /*
16391 * The caller wants a shared re-mapping but the
16392 * submap is mapped with "needs_copy", so its
16393 * contents can't be shared as is. Extract the
16394 * contents of the submap as "copy-on-write".
16395 * The re-mapping won't be shared with the
16396 * original mapping but this is equivalent to
16397 * what happened with the original "remap from
16398 * submap" code.
16399 * The shared region is mapped "needs_copy", for
16400 * example.
16401 */
16402 submap_needs_copy = TRUE;
16403 } else {
16404 /*
16405 * The caller wants a shared re-mapping and
16406 * this mapping can be shared (no "needs_copy"),
16407 * so let's extract from the submap accordingly.
16408 * Kernel submaps are mapped without
16409 * "needs_copy", for example.
16410 */
16411 submap_needs_copy = FALSE;
16412 }
16413 vm_map_reference(submap);
16414 vm_map_unlock(map);
16415 src_entry = NULL;
16416 if (vm_remap_legacy) {
16417 *cur_protection = VM_PROT_NONE;
16418 *max_protection = VM_PROT_NONE;
16419 }
16420
16421 DTRACE_VM7(remap_submap_recurse,
16422 vm_map_t, map,
16423 vm_map_offset_t, addr,
16424 vm_map_size_t, size,
16425 boolean_t, copy,
16426 vm_map_offset_t, submap_start,
16427 vm_map_size_t, submap_size,
16428 boolean_t, submap_needs_copy);
16429
16430 result = vm_map_remap_extract(submap,
16431 submap_start,
16432 submap_size,
16433 submap_needs_copy,
16434 map_header,
16435 cur_protection,
16436 max_protection,
16437 inheritance,
16438 vmk_flags);
16439 vm_map_deallocate(submap);
16440 return result;
16441 }
16442
16443 if (src_entry->is_sub_map) {
16444 /* protections for submap mapping are irrelevant here */
16445 } else if (((src_entry->protection & required_cur_prot) !=
16446 required_cur_prot) ||
16447 ((src_entry->max_protection & required_max_prot) !=
16448 required_max_prot)) {
16449 if (vmk_flags.vmkf_copy_single_object &&
16450 mapped_size != 0) {
16451 /*
16452 * Single object extraction.
16453 * We can't extract more with the required
16454 * protection but we've extracted some, so
16455 * stop there and declare success.
16456 * The caller should check the size of
16457 * the copy entry we've extracted.
16458 */
16459 result = KERN_SUCCESS;
16460 } else {
16461 /*
16462 * VM range extraction.
16463 * Required protection is not available
16464 * for this part of the range: fail.
16465 */
16466 result = KERN_PROTECTION_FAILURE;
16467 }
16468 break;
16469 }
16470
16471 if (src_entry->is_sub_map) {
16472 vm_map_t submap;
16473 vm_map_offset_t submap_start;
16474 vm_map_size_t submap_size;
16475 vm_map_copy_t submap_copy;
16476 vm_prot_t submap_curprot, submap_maxprot;
16477 boolean_t submap_needs_copy;
16478
16479 /*
16480 * No check for "required protection" on "src_entry"
16481 * because the protections that matter are the ones
16482 * on the submap's VM map entry, which will be checked
16483 * during the call to vm_map_copy_extract() below.
16484 */
16485 object = VM_OBJECT_NULL;
16486 submap_copy = VM_MAP_COPY_NULL;
16487
16488 /* find equivalent range in the submap */
16489 submap = VME_SUBMAP(src_entry);
16490 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16491 submap_size = tmp_size;
16492 if (copy) {
16493 /*
16494 * The caller wants a copy-on-write re-mapping,
16495 * so let's extract from the submap accordingly.
16496 */
16497 submap_needs_copy = TRUE;
16498 } else if (src_entry->needs_copy) {
16499 /*
16500 * The caller wants a shared re-mapping but the
16501 * submap is mapped with "needs_copy", so its
16502 * contents can't be shared as is. Extract the
16503 * contents of the submap as "copy-on-write".
16504 * The re-mapping won't be shared with the
16505 * original mapping but this is equivalent to
16506 * what happened with the original "remap from
16507 * submap" code.
16508 * The shared region is mapped "needs_copy", for
16509 * example.
16510 */
16511 submap_needs_copy = TRUE;
16512 } else {
16513 /*
16514 * The caller wants a shared re-mapping and
16515 * this mapping can be shared (no "needs_copy"),
16516 * so let's extract from the submap accordingly.
16517 * Kernel submaps are mapped without
16518 * "needs_copy", for example.
16519 */
16520 submap_needs_copy = FALSE;
16521 }
16522 /* extra ref to keep submap alive */
16523 vm_map_reference(submap);
16524
16525 DTRACE_VM7(remap_submap_recurse,
16526 vm_map_t, map,
16527 vm_map_offset_t, addr,
16528 vm_map_size_t, size,
16529 boolean_t, copy,
16530 vm_map_offset_t, submap_start,
16531 vm_map_size_t, submap_size,
16532 boolean_t, submap_needs_copy);
16533
16534 /*
16535 * The map can be safely unlocked since we
16536 * already hold a reference on the submap.
16537 *
16538 * No timestamp since we don't care if the map
16539 * gets modified while we're down in the submap.
16540 * We'll resume the extraction at src_start + tmp_size
16541 * anyway.
16542 */
16543 vm_map_unlock(map);
16544 src_entry = NULL; /* not valid once map is unlocked */
16545
16546 if (vm_remap_legacy) {
16547 submap_curprot = VM_PROT_NONE;
16548 submap_maxprot = VM_PROT_NONE;
16549 if (max_prot_for_prot_copy) {
16550 submap_maxprot = max_prot_for_prot_copy;
16551 }
16552 } else {
16553 assert(!max_prot_for_prot_copy);
16554 submap_curprot = *cur_protection;
16555 submap_maxprot = *max_protection;
16556 }
16557 result = vm_map_copy_extract(submap,
16558 submap_start,
16559 submap_size,
16560 submap_needs_copy,
16561 &submap_copy,
16562 &submap_curprot,
16563 &submap_maxprot,
16564 inheritance,
16565 vmk_flags);
16566
16567 /* release extra ref on submap */
16568 vm_map_deallocate(submap);
16569 submap = VM_MAP_NULL;
16570
16571 if (result != KERN_SUCCESS) {
16572 vm_map_lock(map);
16573 break;
16574 }
16575
16576 /* transfer submap_copy entries to map_header */
16577 while (vm_map_copy_first_entry(submap_copy) !=
16578 vm_map_copy_to_entry(submap_copy)) {
16579 vm_map_entry_t copy_entry;
16580 vm_map_size_t copy_entry_size;
16581
16582 copy_entry = vm_map_copy_first_entry(submap_copy);
16583
16584 /*
16585 * Prevent kernel_object from being exposed to
16586 * user space.
16587 */
16588 if (__improbable(copy_entry->vme_kernel_object)) {
16589 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
16590 proc_selfpid(),
16591 (current_task()->bsd_info
16592 ? proc_name_address(current_task()->bsd_info)
16593 : "?"));
16594 DTRACE_VM(extract_kernel_only);
16595 result = KERN_INVALID_RIGHT;
16596 vm_map_copy_discard(submap_copy);
16597 submap_copy = VM_MAP_COPY_NULL;
16598 vm_map_lock(map);
16599 break;
16600 }
16601
16602 vm_map_copy_entry_unlink(submap_copy, copy_entry);
16603 copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
16604 copy_entry->vme_start = map_address;
16605 copy_entry->vme_end = map_address + copy_entry_size;
16606 map_address += copy_entry_size;
16607 mapped_size += copy_entry_size;
16608 src_start += copy_entry_size;
16609 assert(src_start <= src_end);
16610 _vm_map_store_entry_link(map_header,
16611 map_header->links.prev,
16612 copy_entry);
16613 }
16614 /* done with submap_copy */
16615 vm_map_copy_discard(submap_copy);
16616
16617 if (vm_remap_legacy) {
16618 *cur_protection &= submap_curprot;
16619 *max_protection &= submap_maxprot;
16620 }
16621
16622 /* re-acquire the map lock and continue to next entry */
16623 vm_map_lock(map);
16624 continue;
16625 } else {
16626 object = VME_OBJECT(src_entry);
16627
16628 /*
16629 * Prevent kernel_object from being exposed to
16630 * user space.
16631 */
16632 if (__improbable(object == kernel_object)) {
16633 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
16634 proc_selfpid(),
16635 (current_task()->bsd_info
16636 ? proc_name_address(current_task()->bsd_info)
16637 : "?"));
16638 DTRACE_VM(extract_kernel_only);
16639 result = KERN_INVALID_RIGHT;
16640 break;
16641 }
16642
16643 if (src_entry->iokit_acct) {
16644 /*
16645 * This entry uses "IOKit accounting".
16646 */
16647 } else if (object != VM_OBJECT_NULL &&
16648 (object->purgable != VM_PURGABLE_DENY ||
16649 object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
16650 /*
16651 * Purgeable objects have their own accounting:
16652 * no pmap accounting for them.
16653 */
16654 assertf(!src_entry->use_pmap,
16655 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
16656 map,
16657 src_entry,
16658 (uint64_t)src_entry->vme_start,
16659 (uint64_t)src_entry->vme_end,
16660 src_entry->protection,
16661 src_entry->max_protection,
16662 VME_ALIAS(src_entry));
16663 } else {
16664 /*
16665 * Not IOKit or purgeable:
16666 * must be accounted by pmap stats.
16667 */
16668 assertf(src_entry->use_pmap,
16669 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
16670 map,
16671 src_entry,
16672 (uint64_t)src_entry->vme_start,
16673 (uint64_t)src_entry->vme_end,
16674 src_entry->protection,
16675 src_entry->max_protection,
16676 VME_ALIAS(src_entry));
16677 }
16678
16679 if (object == VM_OBJECT_NULL) {
16680 assert(!src_entry->needs_copy);
16681 object = vm_object_allocate(entry_size);
16682 VME_OFFSET_SET(src_entry, 0);
16683 VME_OBJECT_SET(src_entry, object, false, 0);
16684 assert(src_entry->use_pmap);
16685 assert(!map->mapped_in_other_pmaps);
16686 } else if (src_entry->wired_count ||
16687 object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
16688 /*
16689 * A wired memory region should not have
16690 * any pending copy-on-write and needs to
16691 * keep pointing at the VM object that
16692 * contains the wired pages.
16693 * If we're sharing this memory (copy=false),
16694 * we'll share this VM object.
16695 * If we're copying this memory (copy=true),
16696 * we'll call vm_object_copy_slowly() below
16697 * and use the new VM object for the remapping.
16698 *
16699 * Or, we are already using an asymmetric
16700 * copy, and therefore we already have
16701 * the right object.
16702 */
16703 assert(!src_entry->needs_copy);
16704 } else if (src_entry->needs_copy || object->shadowed ||
16705 (object->internal && !object->true_share &&
16706 !src_entry->is_shared &&
16707 object->vo_size > entry_size)) {
16708 VME_OBJECT_SHADOW(src_entry, entry_size);
16709 assert(src_entry->use_pmap);
16710
16711 if (!src_entry->needs_copy &&
16712 (src_entry->protection & VM_PROT_WRITE)) {
16713 vm_prot_t prot;
16714
16715 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
16716
16717 prot = src_entry->protection & ~VM_PROT_WRITE;
16718
16719 if (override_nx(map,
16720 VME_ALIAS(src_entry))
16721 && prot) {
16722 prot |= VM_PROT_EXECUTE;
16723 }
16724
16725 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
16726
16727 if (map->mapped_in_other_pmaps) {
16728 vm_object_pmap_protect(
16729 VME_OBJECT(src_entry),
16730 VME_OFFSET(src_entry),
16731 entry_size,
16732 PMAP_NULL,
16733 PAGE_SIZE,
16734 src_entry->vme_start,
16735 prot);
16736 #if MACH_ASSERT
16737 } else if (__improbable(map->pmap == PMAP_NULL)) {
16738 extern boolean_t vm_tests_in_progress;
16739 assert(vm_tests_in_progress);
16740 /*
16741 * Some VM tests (in vm_tests.c)
16742 * sometimes want to use a VM
16743 * map without a pmap.
16744 * Otherwise, this should never
16745 * happen.
16746 */
16747 #endif /* MACH_ASSERT */
16748 } else {
16749 pmap_protect(vm_map_pmap(map),
16750 src_entry->vme_start,
16751 src_entry->vme_end,
16752 prot);
16753 }
16754 }
16755
16756 object = VME_OBJECT(src_entry);
16757 src_entry->needs_copy = FALSE;
16758 }
16759
16760
16761 vm_object_lock(object);
16762 vm_object_reference_locked(object); /* object ref. for new entry */
16763 assert(!src_entry->needs_copy);
16764 if (object->copy_strategy ==
16765 MEMORY_OBJECT_COPY_SYMMETRIC) {
16766 /*
16767 * If we want to share this object (copy==0),
16768 * it needs to be COPY_DELAY.
16769 * If we want to copy this object (copy==1),
16770 * we can't just set "needs_copy" on our side
16771 * and expect the other side to do the same
16772 * (symmetrically), so we can't let the object
16773 * stay COPY_SYMMETRIC.
16774 * So we always switch from COPY_SYMMETRIC to
16775 * COPY_DELAY.
16776 */
16777 object->copy_strategy =
16778 MEMORY_OBJECT_COPY_DELAY;
16779 object->true_share = TRUE;
16780 }
16781 vm_object_unlock(object);
16782 }
16783
16784 offset = (VME_OFFSET(src_entry) +
16785 (src_start - src_entry->vme_start));
16786
16787 new_entry = _vm_map_entry_create(map_header);
16788 vm_map_entry_copy(map, new_entry, src_entry);
16789 if (new_entry->is_sub_map) {
16790 /* clr address space specifics */
16791 new_entry->use_pmap = FALSE;
16792 } else if (copy) {
16793 /*
16794 * We're dealing with a copy-on-write operation,
16795 * so the resulting mapping should not inherit the
16796 * original mapping's accounting settings.
16797 * "use_pmap" should be reset to its default (TRUE)
16798 * so that the new mapping gets accounted for in
16799 * the task's memory footprint.
16800 */
16801 new_entry->use_pmap = TRUE;
16802 }
16803 /* "iokit_acct" was cleared in vm_map_entry_copy() */
16804 assert(!new_entry->iokit_acct);
16805
16806 new_entry->map_aligned = FALSE;
16807
16808 new_entry->vme_start = map_address;
16809 new_entry->vme_end = map_address + tmp_size;
16810 assert(new_entry->vme_start < new_entry->vme_end);
16811 if (copy && vmk_flags.vmkf_remap_prot_copy) {
16812 /*
16813 * Remapping for vm_map_protect(VM_PROT_COPY)
16814 * to convert a read-only mapping into a
16815 * copy-on-write version of itself but
16816 * with write access:
16817 * keep the original inheritance and add
16818 * VM_PROT_WRITE to the max protection.
16819 */
16820 new_entry->inheritance = src_entry->inheritance;
16821 new_entry->protection &= max_prot_for_prot_copy;
16822 new_entry->max_protection |= VM_PROT_WRITE;
16823 } else {
16824 new_entry->inheritance = inheritance;
16825 if (!vm_remap_legacy) {
16826 new_entry->protection = *cur_protection;
16827 new_entry->max_protection = *max_protection;
16828 }
16829 }
16830 VME_OFFSET_SET(new_entry, offset);
16831
16832 /*
16833 * The new region has to be copied now if required.
16834 */
16835 RestartCopy:
16836 if (!copy) {
16837 if (src_entry->used_for_jit == TRUE) {
16838 if (same_map) {
16839 } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
16840 /*
16841 * Cannot allow an entry describing a JIT
16842 * region to be shared across address spaces.
16843 */
16844 result = KERN_INVALID_ARGUMENT;
16845 vm_object_deallocate(object);
16846 vm_map_entry_dispose(new_entry);
16847 new_entry = VM_MAP_ENTRY_NULL;
16848 break;
16849 }
16850 }
16851
16852 src_entry->is_shared = TRUE;
16853 new_entry->is_shared = TRUE;
16854 if (!(new_entry->is_sub_map)) {
16855 new_entry->needs_copy = FALSE;
16856 }
16857 } else if (src_entry->is_sub_map) {
16858 /* make this a COW sub_map if not already */
16859 assert(new_entry->wired_count == 0);
16860 new_entry->needs_copy = TRUE;
16861 object = VM_OBJECT_NULL;
16862 } else if (src_entry->wired_count == 0 &&
16863 !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
16864 vm_object_copy_quickly(VME_OBJECT(new_entry),
16865 VME_OFFSET(new_entry),
16866 (new_entry->vme_end -
16867 new_entry->vme_start),
16868 &src_needs_copy,
16869 &new_entry_needs_copy)) {
16870 new_entry->needs_copy = new_entry_needs_copy;
16871 new_entry->is_shared = FALSE;
16872 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
16873
16874 /*
16875 * Handle copy_on_write semantics.
16876 */
16877 if (src_needs_copy && !src_entry->needs_copy) {
16878 vm_prot_t prot;
16879
16880 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
16881
16882 prot = src_entry->protection & ~VM_PROT_WRITE;
16883
16884 if (override_nx(map,
16885 VME_ALIAS(src_entry))
16886 && prot) {
16887 prot |= VM_PROT_EXECUTE;
16888 }
16889
16890 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
16891
16892 vm_object_pmap_protect(object,
16893 offset,
16894 entry_size,
16895 ((src_entry->is_shared
16896 || map->mapped_in_other_pmaps) ?
16897 PMAP_NULL : map->pmap),
16898 VM_MAP_PAGE_SIZE(map),
16899 src_entry->vme_start,
16900 prot);
16901
16902 assert(src_entry->wired_count == 0);
16903 src_entry->needs_copy = TRUE;
16904 }
16905 /*
16906 * Throw away the old object reference of the new entry.
16907 */
16908 vm_object_deallocate(object);
16909 } else {
16910 new_entry->is_shared = FALSE;
16911 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
16912
16913 src_entry_was_wired = (src_entry->wired_count > 0);
16914 saved_src_entry = src_entry;
16915 src_entry = VM_MAP_ENTRY_NULL;
16916
16917 /*
16918 * The map can be safely unlocked since we
16919 * already hold a reference on the object.
16920 *
16921 * Record the timestamp of the map for later
16922 * verification, and unlock the map.
16923 */
16924 version.main_timestamp = map->timestamp;
16925 vm_map_unlock(map); /* Increments timestamp once! */
16926
16927 /*
16928 * Perform the copy.
16929 */
16930 if (src_entry_was_wired > 0 ||
16931 (debug4k_no_cow_copyin &&
16932 VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
16933 vm_object_lock(object);
16934 result = vm_object_copy_slowly(
16935 object,
16936 offset,
16937 (new_entry->vme_end -
16938 new_entry->vme_start),
16939 THREAD_UNINT,
16940 &new_copy_object);
16941 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
16942 saved_used_for_jit = new_entry->used_for_jit;
16943 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
16944 new_entry->used_for_jit = saved_used_for_jit;
16945 VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
16946 new_entry->needs_copy = FALSE;
16947 } else {
16948 vm_object_offset_t new_offset;
16949
16950 new_offset = VME_OFFSET(new_entry);
16951 result = vm_object_copy_strategically(
16952 object,
16953 offset,
16954 (new_entry->vme_end -
16955 new_entry->vme_start),
16956 &new_copy_object,
16957 &new_offset,
16958 &new_entry_needs_copy);
16959 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
16960 saved_used_for_jit = new_entry->used_for_jit;
16961 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
16962 new_entry->used_for_jit = saved_used_for_jit;
16963 if (new_offset != VME_OFFSET(new_entry)) {
16964 VME_OFFSET_SET(new_entry, new_offset);
16965 }
16966
16967 new_entry->needs_copy = new_entry_needs_copy;
16968 }
16969
16970 /*
16971 * Throw away the old object reference of the new entry.
16972 */
16973 vm_object_deallocate(object);
16974
16975 if (result != KERN_SUCCESS &&
16976 result != KERN_MEMORY_RESTART_COPY) {
16977 vm_map_entry_dispose(new_entry);
16978 vm_map_lock(map);
16979 break;
16980 }
16981
16982 /*
16983 * Verify that the map has not substantially
16984 * changed while the copy was being made.
16985 */
16986
16987 vm_map_lock(map);
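/*
 * Note: vm_map_unlock() above bumped the timestamp exactly once, so a
 * mismatch with "main_timestamp + 1" means some other thread modified
 * the map while it was unlocked for the copy, and the lookup must be
 * redone.
 */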
16988 if (version.main_timestamp + 1 != map->timestamp) {
16989 /*
16990 * Simple version comparison failed.
16991 *
16992 * Retry the lookup and verify that the
16993 * same object/offset are still present.
16994 */
16995 saved_src_entry = VM_MAP_ENTRY_NULL;
16996 vm_object_deallocate(VME_OBJECT(new_entry));
16997 vm_map_entry_dispose(new_entry);
16998 if (result == KERN_MEMORY_RESTART_COPY) {
16999 result = KERN_SUCCESS;
17000 }
17001 continue;
17002 }
17003 /* map hasn't changed: src_entry is still valid */
17004 src_entry = saved_src_entry;
17005 saved_src_entry = VM_MAP_ENTRY_NULL;
17006
17007 if (result == KERN_MEMORY_RESTART_COPY) {
17008 vm_object_reference(object);
17009 goto RestartCopy;
17010 }
17011 }
17012
17013 _vm_map_store_entry_link(map_header,
17014 map_header->links.prev, new_entry);
17015
17016 /* protections for submap mapping are irrelevant here */
17017 if (vm_remap_legacy && !src_entry->is_sub_map) {
17018 *cur_protection &= src_entry->protection;
17019 *max_protection &= src_entry->max_protection;
17020 }
17021
17022 map_address += tmp_size;
17023 mapped_size += tmp_size;
17024 src_start += tmp_size;
17025
17026 if (vmk_flags.vmkf_copy_single_object) {
17027 if (mapped_size != size) {
17028 DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
17029 map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
17030 if (src_entry->vme_next != vm_map_to_entry(map) &&
17031 src_entry->vme_next->vme_object_value ==
17032 src_entry->vme_object_value) {
17033 /* XXX TODO4K */
17034 DEBUG4K_ERROR("could have extended copy to next entry...\n");
17035 }
17036 }
17037 break;
17038 }
17039 } /* end while */
17040
17041 vm_map_unlock(map);
17042 if (result != KERN_SUCCESS) {
17043 /*
17044 * Free all allocated elements.
17045 */
17046 for (src_entry = map_header->links.next;
17047 src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
17048 src_entry = new_entry) {
17049 new_entry = src_entry->vme_next;
17050 _vm_map_store_entry_unlink(map_header, src_entry);
17051 if (src_entry->is_sub_map) {
17052 vm_map_deallocate(VME_SUBMAP(src_entry));
17053 } else {
17054 vm_object_deallocate(VME_OBJECT(src_entry));
17055 }
17056 vm_map_entry_dispose(src_entry);
17057 }
17058 }
17059 return result;
17060 }
17061
17062 bool
17063 vm_map_is_exotic(
17064 vm_map_t map)
17065 {
17066 return VM_MAP_IS_EXOTIC(map);
17067 }
17068
17069 bool
17070 vm_map_is_alien(
17071 vm_map_t map)
17072 {
17073 return VM_MAP_IS_ALIEN(map);
17074 }
17075
17076 #if XNU_TARGET_OS_OSX
17077 void
17078 vm_map_mark_alien(
17079 vm_map_t map)
17080 {
17081 vm_map_lock(map);
17082 map->is_alien = true;
17083 vm_map_unlock(map);
17084 }
17085
17086 void
17087 vm_map_single_jit(
17088 vm_map_t map)
17089 {
17090 vm_map_lock(map);
17091 map->single_jit = true;
17092 vm_map_unlock(map);
17093 }
17094 #endif /* XNU_TARGET_OS_OSX */
17095
17096 /*
17097 * Callers of this function must call vm_map_copy_require on
17098 * previously created vm_map_copy_t or pass a newly created
17099 * one to ensure that it hasn't been forged.
17100 */
17101 static kern_return_t
17102 vm_map_copy_to_physcopy(
17103 vm_map_copy_t copy_map,
17104 vm_map_t target_map)
17105 {
17106 vm_map_size_t size;
17107 vm_map_entry_t entry;
17108 vm_map_entry_t new_entry;
17109 vm_object_t new_object;
17110 unsigned int pmap_flags;
17111 pmap_t new_pmap;
17112 vm_map_t new_map;
17113 vm_map_address_t src_start, src_end, src_cur;
17114 vm_map_address_t dst_start, dst_end, dst_cur;
17115 kern_return_t kr;
17116 void *kbuf;
17117
17118 /*
17119 * Perform the equivalent of vm_allocate() and memcpy().
17120 * Replace the mappings in "copy_map" with the newly allocated mapping.
17121 */
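/*
 * Rough sequence (see the step comments below): create a temporary
 * pageable map with its own pmap, copy "copy_map" out into it, map a
 * freshly allocated VM object alongside it, copy the contents page by
 * page through a kernel buffer, destroy the temporary map, and finally
 * replace all of "copy_map"'s entries with the single new entry backed
 * by that object.
 */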
17122 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
17123
17124 assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
17125
17126 /* create a new pmap to map "copy_map" */
17127 pmap_flags = 0;
17128 assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
17129 #if PMAP_CREATE_FORCE_4K_PAGES
17130 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
17131 #endif /* PMAP_CREATE_FORCE_4K_PAGES */
17132 pmap_flags |= PMAP_CREATE_64BIT;
17133 new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
17134 if (new_pmap == NULL) {
17135 return KERN_RESOURCE_SHORTAGE;
17136 }
17137
17138 /* allocate new VM object */
17139 size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
17140 new_object = vm_object_allocate(size);
17141 assert(new_object);
17142
17143 /* allocate new VM map entry */
17144 new_entry = vm_map_copy_entry_create(copy_map);
17145 assert(new_entry);
17146
17147 /* finish initializing new VM map entry */
17148 new_entry->protection = VM_PROT_DEFAULT;
17149 new_entry->max_protection = VM_PROT_DEFAULT;
17150 new_entry->use_pmap = TRUE;
17151
17152 /* make new VM map entry point to new VM object */
17153 new_entry->vme_start = 0;
17154 new_entry->vme_end = size;
17155 VME_OBJECT_SET(new_entry, new_object, false, 0);
17156 VME_OFFSET_SET(new_entry, 0);
17157
17158 /* create a new pageable VM map to map "copy_map" */
17159 new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
17160 VM_MAP_CREATE_PAGEABLE);
17161 assert(new_map);
17162 vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
17163
17164 /* map "copy_map" in the new VM map */
17165 src_start = 0;
17166 kr = vm_map_copyout_internal(
17167 new_map,
17168 &src_start,
17169 copy_map,
17170 copy_map->size,
17171 FALSE, /* consume_on_success */
17172 VM_PROT_DEFAULT,
17173 VM_PROT_DEFAULT,
17174 VM_INHERIT_DEFAULT);
17175 assert(kr == KERN_SUCCESS);
17176 src_end = src_start + copy_map->size;
17177
17178 /* map "new_object" in the new VM map */
17179 vm_object_reference(new_object);
17180 dst_start = 0;
17181 kr = vm_map_enter(new_map,
17182 &dst_start,
17183 size,
17184 0, /* mask */
17185 VM_FLAGS_ANYWHERE,
17186 VM_MAP_KERNEL_FLAGS_NONE,
17187 VM_KERN_MEMORY_OSFMK,
17188 new_object,
17189 0, /* offset */
17190 FALSE, /* needs copy */
17191 VM_PROT_DEFAULT,
17192 VM_PROT_DEFAULT,
17193 VM_INHERIT_DEFAULT);
17194 assert(kr == KERN_SUCCESS);
17195 dst_end = dst_start + size;
17196
17197 /* get a kernel buffer */
17198 kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
17199
17200 /* physically copy "copy_map" mappings to new VM object */
17201 for (src_cur = src_start, dst_cur = dst_start;
17202 src_cur < src_end;
17203 src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
17204 vm_size_t bytes;
17205
17206 bytes = PAGE_SIZE;
17207 if (src_cur + PAGE_SIZE > src_end) {
17208 /* partial copy for last page */
17209 bytes = src_end - src_cur;
17210 assert(bytes > 0 && bytes < PAGE_SIZE);
17211 /* rest of dst page should be zero-filled */
17212 }
17213 /* get bytes from src mapping */
17214 kr = copyinmap(new_map, src_cur, kbuf, bytes);
17215 if (kr != KERN_SUCCESS) {
17216 DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
17217 }
17218 /* put bytes in dst mapping */
17219 assert(dst_cur < dst_end);
17220 assert(dst_cur + bytes <= dst_end);
17221 kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
17222 if (kr != KERN_SUCCESS) {
17223 DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
17224 }
17225 }
17226
17227 /* free kernel buffer */
17228 kfree_data(kbuf, PAGE_SIZE);
17229
17230 /* destroy new map */
17231 vm_map_destroy(new_map);
17232 new_map = VM_MAP_NULL;
17233
17234 /* dispose of the old map entries in "copy_map" */
17235 while (vm_map_copy_first_entry(copy_map) !=
17236 vm_map_copy_to_entry(copy_map)) {
17237 entry = vm_map_copy_first_entry(copy_map);
17238 vm_map_copy_entry_unlink(copy_map, entry);
17239 if (entry->is_sub_map) {
17240 vm_map_deallocate(VME_SUBMAP(entry));
17241 } else {
17242 vm_object_deallocate(VME_OBJECT(entry));
17243 }
17244 vm_map_copy_entry_dispose(entry);
17245 }
17246
17247 /* change "copy_map"'s page_size to match "target_map" */
17248 copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
17249 copy_map->offset = 0;
17250 copy_map->size = size;
17251
17252 /* insert new map entry in "copy_map" */
17253 assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
17254 vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
17255
17256 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
17257 return KERN_SUCCESS;
17258 }
17259
17260 void
17261 vm_map_copy_adjust_get_target_copy_map(
17262 vm_map_copy_t copy_map,
17263 vm_map_copy_t *target_copy_map_p);
17264 void
17265 vm_map_copy_adjust_get_target_copy_map(
17266 vm_map_copy_t copy_map,
17267 vm_map_copy_t *target_copy_map_p)
17268 {
17269 vm_map_copy_t target_copy_map;
17270 vm_map_entry_t entry, target_entry;
17271
17272 if (*target_copy_map_p != VM_MAP_COPY_NULL) {
17273 /* the caller already has a "target_copy_map": use it */
17274 return;
17275 }
17276
17277 /* the caller wants us to create a new copy of "copy_map" */
17278 target_copy_map = vm_map_copy_allocate();
17279 target_copy_map->type = copy_map->type;
17280 assert(target_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17281 target_copy_map->offset = copy_map->offset;
17282 target_copy_map->size = copy_map->size;
17283 target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
17284 vm_map_store_init(&target_copy_map->cpy_hdr);
17285 for (entry = vm_map_copy_first_entry(copy_map);
17286 entry != vm_map_copy_to_entry(copy_map);
17287 entry = entry->vme_next) {
17288 target_entry = vm_map_copy_entry_create(target_copy_map);
17289 vm_map_entry_copy_full(target_entry, entry);
17290 if (target_entry->is_sub_map) {
17291 vm_map_reference(VME_SUBMAP(target_entry));
17292 } else {
17293 vm_object_reference(VME_OBJECT(target_entry));
17294 }
17295 vm_map_copy_entry_link(
17296 target_copy_map,
17297 vm_map_copy_last_entry(target_copy_map),
17298 target_entry);
17299 }
17300 entry = VM_MAP_ENTRY_NULL;
17301 *target_copy_map_p = target_copy_map;
17302 }
17303
17304 /*
17305 * Callers of this function must call vm_map_copy_require on
17306 * previously created vm_map_copy_t or pass a newly created
17307 * one to ensure that it hasn't been forged.
17308 */
17309 static void
17310 vm_map_copy_trim(
17311 vm_map_copy_t copy_map,
17312 uint16_t new_page_shift,
17313 vm_map_offset_t trim_start,
17314 vm_map_offset_t trim_end)
17315 {
17316 uint16_t copy_page_shift;
17317 vm_map_entry_t entry, next_entry;
17318
17319 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17320 assert(copy_map->cpy_hdr.nentries > 0);
17321
17322 trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
17323 trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
17324
17325 /* use the new page_shift to do the clipping */
17326 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
17327 copy_map->cpy_hdr.page_shift = new_page_shift;
17328
17329 for (entry = vm_map_copy_first_entry(copy_map);
17330 entry != vm_map_copy_to_entry(copy_map);
17331 entry = next_entry) {
17332 next_entry = entry->vme_next;
17333 if (entry->vme_end <= trim_start) {
17334 /* entry fully before trim range: skip */
17335 continue;
17336 }
17337 if (entry->vme_start >= trim_end) {
17338 /* entry fully after trim range: done */
17339 break;
17340 }
17341 /* clip entry if needed */
17342 vm_map_copy_clip_start(copy_map, entry, trim_start);
17343 vm_map_copy_clip_end(copy_map, entry, trim_end);
17344 /* dispose of entry */
17345 copy_map->size -= entry->vme_end - entry->vme_start;
17346 vm_map_copy_entry_unlink(copy_map, entry);
17347 if (entry->is_sub_map) {
17348 vm_map_deallocate(VME_SUBMAP(entry));
17349 } else {
17350 vm_object_deallocate(VME_OBJECT(entry));
17351 }
17352 vm_map_copy_entry_dispose(entry);
17353 entry = VM_MAP_ENTRY_NULL;
17354 }
17355
17356 /* restore copy_map's original page_shift */
17357 copy_map->cpy_hdr.page_shift = copy_page_shift;
17358 }
17359
17360 /*
17361 * Make any necessary adjustments to "copy_map" to allow it to be
17362 * mapped into "target_map".
17363 * If no changes were necessary, "target_copy_map" points to the
17364 * untouched "copy_map".
17365 * If changes are necessary, changes will be made to "target_copy_map".
17366 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
17367 * copy the original "copy_map" to it before applying the changes.
17368 * The caller should discard "target_copy_map" if it's not the same as
17369 * the original "copy_map".
17370 */
17371 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
17372 kern_return_t
17373 vm_map_copy_adjust_to_target(
17374 vm_map_copy_t src_copy_map,
17375 vm_map_offset_t offset,
17376 vm_map_size_t size,
17377 vm_map_t target_map,
17378 boolean_t copy,
17379 vm_map_copy_t *target_copy_map_p,
17380 vm_map_offset_t *overmap_start_p,
17381 vm_map_offset_t *overmap_end_p,
17382 vm_map_offset_t *trimmed_start_p)
17383 {
17384 vm_map_copy_t copy_map, target_copy_map;
17385 vm_map_size_t target_size;
17386 vm_map_size_t src_copy_map_size;
17387 vm_map_size_t overmap_start, overmap_end;
17388 int misalignments;
17389 vm_map_entry_t entry, target_entry;
17390 vm_map_offset_t addr_adjustment;
17391 vm_map_offset_t new_start, new_end;
17392 int copy_page_mask, target_page_mask;
17393 uint16_t copy_page_shift, target_page_shift;
17394 vm_map_offset_t trimmed_end;
17395
17396 /*
17397 * Assert that the vm_map_copy is coming from the right
17398 * zone and hasn't been forged
17399 */
17400 vm_map_copy_require(src_copy_map);
17401 assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17402
17403 /*
17404 * Start working with "src_copy_map" but we'll switch
17405 * to "target_copy_map" as soon as we start making adjustments.
17406 */
17407 copy_map = src_copy_map;
17408 src_copy_map_size = src_copy_map->size;
17409
17410 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
17411 copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
17412 target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
17413 target_page_mask = VM_MAP_PAGE_MASK(target_map);
17414
17415 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);
17416
17417 target_copy_map = *target_copy_map_p;
17418 if (target_copy_map != VM_MAP_COPY_NULL) {
17419 vm_map_copy_require(target_copy_map);
17420 }
17421
17422 if (offset + size > copy_map->size) {
17423 DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
17424 return KERN_INVALID_ARGUMENT;
17425 }
17426
17427 /* trim the end */
17428 trimmed_end = 0;
17429 new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
17430 if (new_end < copy_map->size) {
17431 trimmed_end = src_copy_map_size - new_end;
17432 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
17433 /* get "target_copy_map" if needed and adjust it */
17434 vm_map_copy_adjust_get_target_copy_map(copy_map,
17435 &target_copy_map);
17436 copy_map = target_copy_map;
17437 vm_map_copy_trim(target_copy_map, target_page_shift,
17438 new_end, copy_map->size);
17439 }
17440
17441 /* trim the start */
17442 new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
17443 if (new_start != 0) {
17444 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
17445 /* get "target_copy_map" if needed and adjust it */
17446 vm_map_copy_adjust_get_target_copy_map(copy_map,
17447 &target_copy_map);
17448 copy_map = target_copy_map;
17449 vm_map_copy_trim(target_copy_map, target_page_shift,
17450 0, new_start);
17451 }
17452 *trimmed_start_p = new_start;
17453
17454 /* target_size starts with what's left after trimming */
17455 target_size = copy_map->size;
17456 assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
17457 "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
17458 (uint64_t)target_size, (uint64_t)src_copy_map_size,
17459 (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
17460
17461 /* check for misalignments but don't adjust yet */
17462 misalignments = 0;
17463 overmap_start = 0;
17464 overmap_end = 0;
17465 if (copy_page_shift < target_page_shift) {
17466 /*
17467 * Remapping from 4K to 16K: check the VM object alignments
17468 * throughout the range.
17469 * If the start and end of the range are mis-aligned, we can
17470 * over-map to re-align, and adjust the "overmap" start/end
17471 * and "target_size" of the range accordingly.
17472 * If there is any mis-alignment within the range:
17473 * if "copy":
17474 * we can do immediate-copy instead of copy-on-write,
17475 * else:
17476 * no way to remap and share; fail.
17477 */
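/*
 * Hypothetical example with 16K target pages (target_page_mask 0x3fff):
 * an entry whose VME_OFFSET() is 0x1000 starts 0x1000 bytes into a 16K
 * page.  If it is the first entry and we are sharing (!copy), those
 * leading 0x1000 bytes can simply be over-mapped to realign the start;
 * likewise for a trailing misalignment on the last entry.  A misaligned
 * boundary anywhere else counts as a "misalignment", which forces a
 * physical copy below (or a failure if sharing was requested).
 */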
17478 for (entry = vm_map_copy_first_entry(copy_map);
17479 entry != vm_map_copy_to_entry(copy_map);
17480 entry = entry->vme_next) {
17481 vm_object_offset_t object_offset_start, object_offset_end;
17482
17483 object_offset_start = VME_OFFSET(entry);
17484 object_offset_end = object_offset_start;
17485 object_offset_end += entry->vme_end - entry->vme_start;
17486 if (object_offset_start & target_page_mask) {
17487 if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
17488 overmap_start++;
17489 } else {
17490 misalignments++;
17491 }
17492 }
17493 if (object_offset_end & target_page_mask) {
17494 if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
17495 overmap_end++;
17496 } else {
17497 misalignments++;
17498 }
17499 }
17500 }
17501 }
17502 entry = VM_MAP_ENTRY_NULL;
17503
17504 /* decide how to deal with misalignments */
17505 assert(overmap_start <= 1);
17506 assert(overmap_end <= 1);
17507 if (!overmap_start && !overmap_end && !misalignments) {
17508 /* copy_map is properly aligned for target_map ... */
17509 if (*trimmed_start_p) {
17510 /* ... but we trimmed it, so still need to adjust */
17511 } else {
17512 /* ... and we didn't trim anything: we're done */
17513 if (target_copy_map == VM_MAP_COPY_NULL) {
17514 target_copy_map = copy_map;
17515 }
17516 *target_copy_map_p = target_copy_map;
17517 *overmap_start_p = 0;
17518 *overmap_end_p = 0;
17519 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
17520 return KERN_SUCCESS;
17521 }
17522 } else if (misalignments && !copy) {
17523 /* can't "share" if misaligned */
17524 DEBUG4K_ADJUST("unsupported sharing\n");
17525 #if MACH_ASSERT
17526 if (debug4k_panic_on_misaligned_sharing) {
17527 panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
17528 }
17529 #endif /* MACH_ASSERT */
17530 DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
17531 return KERN_NOT_SUPPORTED;
17532 } else {
17533 /* can't virtual-copy if misaligned (but can physical-copy) */
17534 DEBUG4K_ADJUST("mis-aligned copying\n");
17535 }
17536
17537 /* get a "target_copy_map" if needed and switch to it */
17538 vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
17539 copy_map = target_copy_map;
17540
17541 if (misalignments && copy) {
17542 vm_map_size_t target_copy_map_size;
17543
17544 /*
17545 * Can't do copy-on-write with misaligned mappings.
17546 * Replace the mappings with a physical copy of the original
17547 * mappings' contents.
17548 */
17549 target_copy_map_size = target_copy_map->size;
17550 kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
17551 if (kr != KERN_SUCCESS) {
17552 return kr;
17553 }
17554 *target_copy_map_p = target_copy_map;
17555 *overmap_start_p = 0;
17556 *overmap_end_p = target_copy_map->size - target_copy_map_size;
17557 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
17558 return KERN_SUCCESS;
17559 }
17560
17561 /* apply the adjustments */
17562 misalignments = 0;
17563 overmap_start = 0;
17564 overmap_end = 0;
17565 /* remove copy_map->offset, so that everything starts at offset 0 */
17566 addr_adjustment = copy_map->offset;
17567 /* also remove whatever we trimmed from the start */
17568 addr_adjustment += *trimmed_start_p;
17569 for (target_entry = vm_map_copy_first_entry(target_copy_map);
17570 target_entry != vm_map_copy_to_entry(target_copy_map);
17571 target_entry = target_entry->vme_next) {
17572 vm_object_offset_t object_offset_start, object_offset_end;
17573
17574 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
17575 object_offset_start = VME_OFFSET(target_entry);
17576 if (object_offset_start & target_page_mask) {
17577 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
17578 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
17579 /*
17580 * start of 1st entry is mis-aligned:
17581 * re-adjust by over-mapping.
17582 */
17583 overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
17584 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
17585 VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
17586 } else {
17587 misalignments++;
17588 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
17589 assert(copy);
17590 }
17591 }
17592
17593 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
17594 target_size += overmap_start;
17595 } else {
17596 target_entry->vme_start += overmap_start;
17597 }
17598 target_entry->vme_end += overmap_start;
17599
17600 object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
17601 if (object_offset_end & target_page_mask) {
17602 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
17603 if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
17604 /*
17605 * end of last entry is mis-aligned: re-adjust by over-mapping.
17606 */
17607 overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
17608 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
17609 target_entry->vme_end += overmap_end;
17610 target_size += overmap_end;
17611 } else {
17612 misalignments++;
17613 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
17614 assert(copy);
17615 }
17616 }
17617 target_entry->vme_start -= addr_adjustment;
17618 target_entry->vme_end -= addr_adjustment;
17619 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
17620 }
17621
17622 target_copy_map->size = target_size;
17623 target_copy_map->offset += overmap_start;
17624 target_copy_map->offset -= addr_adjustment;
17625 target_copy_map->cpy_hdr.page_shift = target_page_shift;
17626
17627 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
17628 // assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
17629 assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
17630 assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
17631
17632 *target_copy_map_p = target_copy_map;
17633 *overmap_start_p = overmap_start;
17634 *overmap_end_p = overmap_end;
17635
17636 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
17637 return KERN_SUCCESS;
17638 }
17639
17640 kern_return_t
17641 vm_map_range_physical_size(
17642 vm_map_t map,
17643 vm_map_address_t start,
17644 mach_vm_size_t size,
17645 mach_vm_size_t * phys_size)
17646 {
17647 kern_return_t kr;
17648 vm_map_copy_t copy_map, target_copy_map;
17649 vm_map_offset_t adjusted_start, adjusted_end;
17650 vm_map_size_t adjusted_size;
17651 vm_prot_t cur_prot, max_prot;
17652 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
17653 vm_map_kernel_flags_t vmk_flags;
17654
17655 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
17656 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
17657 adjusted_size = adjusted_end - adjusted_start;
17658 *phys_size = adjusted_size;
17659 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
17660 return KERN_SUCCESS;
17661 }
17662 if (start == 0) {
17663 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
17664 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
17665 adjusted_size = adjusted_end - adjusted_start;
17666 *phys_size = adjusted_size;
17667 return KERN_SUCCESS;
17668 }
17669 if (adjusted_size == 0) {
17670 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx adjusted 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_size);
17671 *phys_size = 0;
17672 return KERN_SUCCESS;
17673 }
17674
17675 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
17676 vmk_flags.vmkf_copy_pageable = TRUE;
17677 vmk_flags.vmkf_copy_same_map = TRUE;
17678 assert(adjusted_size != 0);
17679 cur_prot = VM_PROT_NONE; /* legacy mode */
17680 max_prot = VM_PROT_NONE; /* legacy mode */
17681 kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
17682 FALSE /* copy */,
17683 &copy_map,
17684 &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
17685 vmk_flags);
17686 if (kr != KERN_SUCCESS) {
17687 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
17688 //assert(0);
17689 *phys_size = 0;
17690 return kr;
17691 }
17692 assert(copy_map != VM_MAP_COPY_NULL);
17693 target_copy_map = copy_map;
17694 DEBUG4K_ADJUST("adjusting...\n");
17695 kr = vm_map_copy_adjust_to_target(
17696 copy_map,
17697 start - adjusted_start, /* offset */
17698 size, /* size */
17699 kernel_map,
17700 FALSE, /* copy */
17701 &target_copy_map,
17702 &overmap_start,
17703 &overmap_end,
17704 &trimmed_start);
17705 if (kr == KERN_SUCCESS) {
17706 if (target_copy_map->size != *phys_size) {
17707 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
17708 }
17709 *phys_size = target_copy_map->size;
17710 } else {
17711 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
17712 //assert(0);
17713 *phys_size = 0;
17714 }
17715 vm_map_copy_discard(copy_map);
17716 copy_map = VM_MAP_COPY_NULL;
17717
17718 return kr;
17719 }
17720
17721
17722 kern_return_t
17723 memory_entry_check_for_adjustment(
17724 vm_map_t src_map,
17725 ipc_port_t port,
17726 vm_map_offset_t *overmap_start,
17727 vm_map_offset_t *overmap_end)
17728 {
17729 kern_return_t kr = KERN_SUCCESS;
17730 vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;
17731
17732 assert(port);
17733 assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));
17734
17735 vm_named_entry_t named_entry;
17736
17737 named_entry = mach_memory_entry_from_port(port);
17738 named_entry_lock(named_entry);
17739 copy_map = named_entry->backing.copy;
17740 target_copy_map = copy_map;
17741
17742 if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
17743 vm_map_offset_t trimmed_start;
17744
17745 trimmed_start = 0;
17746 DEBUG4K_ADJUST("adjusting...\n");
17747 kr = vm_map_copy_adjust_to_target(
17748 copy_map,
17749 0, /* offset */
17750 copy_map->size, /* size */
17751 src_map,
17752 FALSE, /* copy */
17753 &target_copy_map,
17754 overmap_start,
17755 overmap_end,
17756 &trimmed_start);
17757 assert(trimmed_start == 0);
17758 }
17759 named_entry_unlock(named_entry);
17760
17761 return kr;
17762 }
17763
17764
17765 /*
17766 * Routine: vm_remap
17767 *
17768  * Map a portion of a task's address space.
17769  * The mapped region must not overlap more than
17770  * one vm memory object. Protections and
17771  * inheritance attributes remain the same
17772  * as in the original task and are returned as out parameters.
17773  * The source and target tasks can be identical.
17774  * Other attributes are the same as for vm_map().
17775 */
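/*
 * Hypothetical usage sketch (not part of the original code, kept out of
 * the build): share an existing range of "src_map" somewhere in
 * "target_map", letting the kernel pick the address.  It assumes the
 * legacy convention where VM_PROT_NONE in/out protection values ask for
 * the source mapping's protections to be returned, as with the
 * vm_map_copy_extract() call in vm_map_range_physical_size() above.
 */
#if 0
static kern_return_t
vm_remap_share_example(
	vm_map_t        target_map,
	vm_map_t        src_map,
	vm_map_offset_t src_addr,
	vm_map_size_t   size)
{
	vm_map_address_t dst_addr = 0;
	vm_prot_t cur_prot = VM_PROT_NONE;      /* legacy: returned on success */
	vm_prot_t max_prot = VM_PROT_NONE;      /* legacy: returned on success */

	return vm_map_remap(target_map,
	           &dst_addr,
	           size,
	           0,                           /* mask */
	           VM_FLAGS_ANYWHERE,
	           VM_MAP_KERNEL_FLAGS_NONE,
	           VM_KERN_MEMORY_OSFMK,
	           src_map,
	           src_addr,
	           FALSE,                       /* copy: share, don't copy */
	           &cur_prot,                   /* IN/OUT */
	           &max_prot,                   /* IN/OUT */
	           VM_INHERIT_DEFAULT);
}
#endif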
17776 kern_return_t
17777 vm_map_remap(
17778 vm_map_t target_map,
17779 vm_map_address_t *address,
17780 vm_map_size_t size,
17781 vm_map_offset_t mask,
17782 int flags,
17783 vm_map_kernel_flags_t vmk_flags,
17784 vm_tag_t tag,
17785 vm_map_t src_map,
17786 vm_map_offset_t memory_address,
17787 boolean_t copy,
17788 vm_prot_t *cur_protection, /* IN/OUT */
17789 vm_prot_t *max_protection, /* IN/OUT */
17790 vm_inherit_t inheritance)
17791 {
17792 kern_return_t result;
17793 vm_map_entry_t entry;
17794 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
17795 vm_map_entry_t new_entry;
17796 vm_map_copy_t copy_map;
17797 vm_map_offset_t offset_in_mapping;
17798 vm_map_size_t target_size = 0;
17799 vm_map_size_t src_page_mask, target_page_mask;
17800 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
17801 vm_map_offset_t initial_memory_address;
17802 vm_map_size_t initial_size;
17803 VM_MAP_ZAP_DECLARE(zap_list);
17804
17805 if (target_map == VM_MAP_NULL) {
17806 return KERN_INVALID_ARGUMENT;
17807 }
17808
17809 initial_memory_address = memory_address;
17810 initial_size = size;
17811 src_page_mask = VM_MAP_PAGE_MASK(src_map);
17812 target_page_mask = VM_MAP_PAGE_MASK(target_map);
17813
17814 switch (inheritance) {
17815 case VM_INHERIT_NONE:
17816 case VM_INHERIT_COPY:
17817 case VM_INHERIT_SHARE:
17818 if (size != 0 && src_map != VM_MAP_NULL) {
17819 break;
17820 }
17821 OS_FALLTHROUGH;
17822 default:
17823 return KERN_INVALID_ARGUMENT;
17824 }
17825
17826 if (src_page_mask != target_page_mask) {
17827 if (copy) {
17828 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
17829 } else {
17830 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
17831 }
17832 }
17833
17834 /*
17835 * If the user is requesting that we return the address of the
17836 * first byte of the data (rather than the base of the page),
17837 * then we use different rounding semantics: specifically,
17838 * we assume that (memory_address, size) describes a region
17839 * all of whose pages we must cover, rather than a base to be truncated
17840 * down and a size to be added to that base. So we figure out
17841 * the highest page that the requested region includes and make
17842 * sure that the size will cover it.
17843 *
17844  * The key example we're worried about is of the form:
17845 *
17846 * memory_address = 0x1ff0, size = 0x20
17847 *
17848 * With the old semantics, we round down the memory_address to 0x1000
17849 * and round up the size to 0x1000, resulting in our covering *only*
17850 * page 0x1000. With the new semantics, we'd realize that the region covers
17851 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
17852 * 0x1000 and page 0x2000 in the region we remap.
17853 */
17854 if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) {
17855 vm_map_offset_t range_start, range_end;
17856
17857 range_start = vm_map_trunc_page(memory_address, src_page_mask);
17858 range_end = vm_map_round_page(memory_address + size, src_page_mask);
17859 memory_address = range_start;
17860 size = range_end - range_start;
17861 offset_in_mapping = initial_memory_address - memory_address;
17862 } else {
17863 /*
17864 * IMPORTANT:
17865 * This legacy code path is broken: for the range mentioned
17866 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
17867 * two 4k pages, it yields [ memory_address = 0x1000,
17868 * size = 0x1000 ], which covers only the first 4k page.
17869 * BUT some code unfortunately depends on this bug, so we
17870 * can't fix it without breaking something.
17871 * New code should get automatically opted in the new
17872 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
17873 */
17874 offset_in_mapping = 0;
17875 memory_address = vm_map_trunc_page(memory_address, src_page_mask);
17876 size = vm_map_round_page(size, src_page_mask);
17877 initial_memory_address = memory_address;
17878 initial_size = size;
17879 }
17880
17881
17882 if (size == 0) {
17883 return KERN_INVALID_ARGUMENT;
17884 }
17885
17886 if (flags & VM_FLAGS_RESILIENT_MEDIA) {
17887 /* must be copy-on-write to be "media resilient" */
17888 if (!copy) {
17889 return KERN_INVALID_ARGUMENT;
17890 }
17891 }
17892
17893 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
17894 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
17895
17896 assert(size != 0);
17897 result = vm_map_copy_extract(src_map,
17898 memory_address,
17899 size,
17900 copy, &copy_map,
17901 cur_protection, /* IN/OUT */
17902 max_protection, /* IN/OUT */
17903 inheritance,
17904 vmk_flags);
17905 if (result != KERN_SUCCESS) {
17906 return result;
17907 }
17908 assert(copy_map != VM_MAP_COPY_NULL);
17909
17910 overmap_start = 0;
17911 overmap_end = 0;
17912 trimmed_start = 0;
17913 target_size = size;
17914 if (src_page_mask != target_page_mask) {
17915 vm_map_copy_t target_copy_map;
17916
17917 target_copy_map = copy_map; /* can modify "copy_map" itself */
17918 DEBUG4K_ADJUST("adjusting...\n");
17919 result = vm_map_copy_adjust_to_target(
17920 copy_map,
17921 offset_in_mapping, /* offset */
17922 initial_size,
17923 target_map,
17924 copy,
17925 &target_copy_map,
17926 &overmap_start,
17927 &overmap_end,
17928 &trimmed_start);
17929 if (result != KERN_SUCCESS) {
17930 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
17931 vm_map_copy_discard(copy_map);
17932 return result;
17933 }
17934 if (trimmed_start == 0) {
17935 /* nothing trimmed: no adjustment needed */
17936 } else if (trimmed_start >= offset_in_mapping) {
17937 /* trimmed more than offset_in_mapping: nothing left */
17938 assert(overmap_start == 0);
17939 assert(overmap_end == 0);
17940 offset_in_mapping = 0;
17941 } else {
17942 /* trimmed some of offset_in_mapping: adjust */
17943 assert(overmap_start == 0);
17944 assert(overmap_end == 0);
17945 offset_in_mapping -= trimmed_start;
17946 }
17947 offset_in_mapping += overmap_start;
17948 target_size = target_copy_map->size;
17949 }
17950
17951 /*
17952 * Allocate/check a range of free virtual address
17953 * space for the target
17954 */
17955 *address = vm_map_trunc_page(*address, target_page_mask);
17956 vm_map_lock(target_map);
17957 target_size = vm_map_round_page(target_size, target_page_mask);
17958 result = vm_map_remap_range_allocate(target_map, address,
17959 target_size, mask, flags, vmk_flags, tag,
17960 &insp_entry, &zap_list);
17961
17962 for (entry = vm_map_copy_first_entry(copy_map);
17963 entry != vm_map_copy_to_entry(copy_map);
17964 entry = new_entry) {
17965 new_entry = entry->vme_next;
17966 vm_map_copy_entry_unlink(copy_map, entry);
17967 if (result == KERN_SUCCESS) {
17968 if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
17969 /* no codesigning -> read-only access */
17970 entry->max_protection = VM_PROT_READ;
17971 entry->protection = VM_PROT_READ;
17972 entry->vme_resilient_codesign = TRUE;
17973 }
17974 entry->vme_start += *address;
17975 entry->vme_end += *address;
17976 assert(!entry->map_aligned);
17977 if ((flags & VM_FLAGS_RESILIENT_MEDIA) &&
17978 !entry->is_sub_map &&
17979 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
17980 VME_OBJECT(entry)->internal)) {
17981 entry->vme_resilient_media = TRUE;
17982 }
17983 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
17984 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
17985 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
17986 vm_map_store_entry_link(target_map, insp_entry, entry,
17987 vmk_flags);
17988 insp_entry = entry;
17989 } else {
17990 if (!entry->is_sub_map) {
17991 vm_object_deallocate(VME_OBJECT(entry));
17992 } else {
17993 vm_map_deallocate(VME_SUBMAP(entry));
17994 }
17995 vm_map_copy_entry_dispose(entry);
17996 }
17997 }
17998
17999 if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
18000 *cur_protection = VM_PROT_READ;
18001 *max_protection = VM_PROT_READ;
18002 }
18003
18004 if (result == KERN_SUCCESS) {
18005 target_map->size += target_size;
18006 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
18007
18008 }
18009 vm_map_unlock(target_map);
18010
18011 vm_map_zap_dispose(&zap_list);
18012
18013 if (result == KERN_SUCCESS && target_map->wiring_required) {
18014 result = vm_map_wire_kernel(target_map, *address,
18015 *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
18016 TRUE);
18017 }
18018
18019 /*
18020 * If requested, return the address of the data pointed to by the
18021 * request, rather than the base of the resulting page.
18022 */
18023 if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) {
18024 *address += offset_in_mapping;
18025 }
18026
18027 if (src_page_mask != target_page_mask) {
18028 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
18029 }
18030 vm_map_copy_discard(copy_map);
18031 copy_map = VM_MAP_COPY_NULL;
18032
18033 return result;
18034 }
18035
18036 /*
18037 * Routine: vm_map_remap_range_allocate
18038 *
18039 * Description:
18040 * Allocate a range in the specified virtual address map.
18041  * Returns the address and the map entry just before the allocated
18042  * range.
18043 *
18044 * Map must be locked.
18045 */
18046
18047 static kern_return_t
18048 vm_map_remap_range_allocate(
18049 vm_map_t map,
18050 vm_map_address_t *address, /* IN/OUT */
18051 vm_map_size_t size,
18052 vm_map_offset_t mask,
18053 int flags,
18054 vm_map_kernel_flags_t vmk_flags,
18055 __unused vm_tag_t tag,
18056 vm_map_entry_t *map_entry, /* OUT */
18057 vm_map_zap_t zap_list)
18058 {
18059 vm_map_entry_t entry;
18060 vm_map_offset_t start;
18061 kern_return_t kr;
18062
18063 start = *address;
18064
18065 if (flags & VM_FLAGS_ANYWHERE) {
18066 if (flags & VM_FLAGS_RANDOM_ADDR) {
18067 vmk_flags.vmkf_random_address = true;
18068 }
18069 if (start) {
18070 vmk_flags.vmkf_range_id = kmem_addr_get_range(start, size);
18071 }
18072
18073 kr = vm_map_locate_space(map, size, mask, vmk_flags,
18074 &start, &entry);
18075 if (kr != KERN_SUCCESS) {
18076 return kr;
18077 }
18078 *address = start;
18079 } else {
18080 vm_map_entry_t temp_entry;
18081 vm_map_offset_t end;
18082
18083 /*
18084 * Verify that:
18085 * the address doesn't itself violate
18086 * the mask requirement.
18087 */
18088
18089 if ((start & mask) != 0) {
18090 return KERN_NO_SPACE;
18091 }
18092
18093
18094 /*
18095 * ... the address is within bounds
18096 */
18097
18098 end = start + size;
18099
18100 if ((start < map->min_offset) ||
18101 (end > map->max_offset) ||
18102 (start >= end)) {
18103 return KERN_INVALID_ADDRESS;
18104 }
18105
18106 /*
18107 * If we're asked to overwrite whatever was mapped in that
18108 * range, first deallocate that range.
18109 */
18110 if (flags & VM_FLAGS_OVERWRITE) {
18111 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
18112
18113 /*
18114 * We use a "zap_list" to avoid having to unlock
18115 * the "map" in vm_map_delete(), which would compromise
18116 * the atomicity of the "deallocate" and then "remap"
18117 * combination.
18118 */
18119 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
18120
18121 if (vmk_flags.vmkf_overwrite_immutable) {
18122 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
18123 }
18124 (void)vm_map_delete(map, start, end, remove_flags,
18125 KMEM_GUARD_NONE, zap_list);
18126 }
18127
18128 /*
18129 * ... the starting address isn't allocated
18130 */
18131
18132 if (vm_map_lookup_entry(map, start, &temp_entry)) {
18133 return KERN_NO_SPACE;
18134 }
18135
18136 entry = temp_entry;
18137
18138 /*
18139 * ... the next region doesn't overlap the
18140 * end point.
18141 */
18142
18143 if ((entry->vme_next != vm_map_to_entry(map)) &&
18144 (entry->vme_next->vme_start < end)) {
18145 return KERN_NO_SPACE;
18146 }
18147 }
18148 *map_entry = entry;
18149 return KERN_SUCCESS;
18150 }
18151
18152 /*
18153 * vm_map_switch:
18154 *
18155 * Set the address map for the current thread to the specified map
18156 */
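/*
 * The previous map is returned so that the caller can restore it with a
 * second vm_map_switch() call once it is done impersonating "map", as
 * vm_map_write_user() and vm_map_read_user() below do.
 */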
18157
18158 vm_map_t
18159 vm_map_switch(
18160 vm_map_t map)
18161 {
18162 int mycpu;
18163 thread_t thread = current_thread();
18164 vm_map_t oldmap = thread->map;
18165
18166 mp_disable_preemption();
18167 mycpu = cpu_number();
18168
18169 /*
18170 * Deactivate the current map and activate the requested map
18171 */
18172 PMAP_SWITCH_USER(thread, map, mycpu);
18173
18174 mp_enable_preemption();
18175 return oldmap;
18176 }
18177
18178
18179 /*
18180 * Routine: vm_map_write_user
18181 *
18182 * Description:
18183 * Copy out data from a kernel space into space in the
18184 * destination map. The space must already exist in the
18185 * destination map.
18186 * NOTE: This routine should only be called by threads
18187  * which can block on a page fault, i.e. kernel mode user
18188 * threads.
18189 *
18190 */
18191 kern_return_t
18192 vm_map_write_user(
18193 vm_map_t map,
18194 void *src_p,
18195 vm_map_address_t dst_addr,
18196 vm_size_t size)
18197 {
18198 kern_return_t kr = KERN_SUCCESS;
18199
18200 if (current_map() == map) {
18201 if (copyout(src_p, dst_addr, size)) {
18202 kr = KERN_INVALID_ADDRESS;
18203 }
18204 } else {
18205 vm_map_t oldmap;
18206
18207 /* take on the identity of the target map while doing */
18208 /* the transfer */
18209
18210 vm_map_reference(map);
18211 oldmap = vm_map_switch(map);
18212 if (copyout(src_p, dst_addr, size)) {
18213 kr = KERN_INVALID_ADDRESS;
18214 }
18215 vm_map_switch(oldmap);
18216 vm_map_deallocate(map);
18217 }
18218 return kr;
18219 }
18220
18221 /*
18222 * Routine: vm_map_read_user
18223 *
18224 * Description:
18225 * Copy in data from a user space source map into the
18226 * kernel map. The space must already exist in the
18227 * kernel map.
18228 * NOTE: This routine should only be called by threads
18229  * which can block on a page fault, i.e. kernel mode user
18230 * threads.
18231 *
18232 */
18233 kern_return_t
18234 vm_map_read_user(
18235 vm_map_t map,
18236 vm_map_address_t src_addr,
18237 void *dst_p,
18238 vm_size_t size)
18239 {
18240 kern_return_t kr = KERN_SUCCESS;
18241
18242 if (current_map() == map) {
18243 if (copyin(src_addr, dst_p, size)) {
18244 kr = KERN_INVALID_ADDRESS;
18245 }
18246 } else {
18247 vm_map_t oldmap;
18248
18249 /* take on the identity of the target map while doing */
18250 /* the transfer */
18251
18252 vm_map_reference(map);
18253 oldmap = vm_map_switch(map);
18254 if (copyin(src_addr, dst_p, size)) {
18255 kr = KERN_INVALID_ADDRESS;
18256 }
18257 vm_map_switch(oldmap);
18258 vm_map_deallocate(map);
18259 }
18260 return kr;
18261 }
18262
18263
18264 /*
18265 * vm_map_check_protection:
18266 *
18267 * Assert that the target map allows the specified
18268 * privilege on the entire address region given.
18269 * The entire region must be allocated.
18270 */
18271 boolean_t
18272 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
18273 vm_map_offset_t end, vm_prot_t protection)
18274 {
18275 vm_map_entry_t entry;
18276 vm_map_entry_t tmp_entry;
18277
18278 vm_map_lock(map);
18279
18280 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
18281 vm_map_unlock(map);
18282 return FALSE;
18283 }
18284
18285 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
18286 vm_map_unlock(map);
18287 return FALSE;
18288 }
18289
18290 entry = tmp_entry;
18291
18292 while (start < end) {
18293 if (entry == vm_map_to_entry(map)) {
18294 vm_map_unlock(map);
18295 return FALSE;
18296 }
18297
18298 /*
18299 * No holes allowed!
18300 */
18301
18302 if (start < entry->vme_start) {
18303 vm_map_unlock(map);
18304 return FALSE;
18305 }
18306
18307 /*
18308 * Check protection associated with entry.
18309 */
18310
18311 if ((entry->protection & protection) != protection) {
18312 vm_map_unlock(map);
18313 return FALSE;
18314 }
18315
18316 /* go to next entry */
18317
18318 start = entry->vme_end;
18319 entry = entry->vme_next;
18320 }
18321 vm_map_unlock(map);
18322 return TRUE;
18323 }
18324
18325 kern_return_t
18326 vm_map_purgable_control(
18327 vm_map_t map,
18328 vm_map_offset_t address,
18329 vm_purgable_t control,
18330 int *state)
18331 {
18332 vm_map_entry_t entry;
18333 vm_object_t object;
18334 kern_return_t kr;
18335 boolean_t was_nonvolatile;
18336
18337 /*
18338 * Vet all the input parameters and current type and state of the
18339  * underlying object. Return with an error if anything is amiss.
18340 */
18341 if (map == VM_MAP_NULL) {
18342 return KERN_INVALID_ARGUMENT;
18343 }
18344
18345 if (control != VM_PURGABLE_SET_STATE &&
18346 control != VM_PURGABLE_GET_STATE &&
18347 control != VM_PURGABLE_PURGE_ALL &&
18348 control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
18349 return KERN_INVALID_ARGUMENT;
18350 }
18351
18352 if (control == VM_PURGABLE_PURGE_ALL) {
18353 vm_purgeable_object_purge_all();
18354 return KERN_SUCCESS;
18355 }
18356
18357 if ((control == VM_PURGABLE_SET_STATE ||
18358 control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
18359 (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
18360 ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
18361 return KERN_INVALID_ARGUMENT;
18362 }
18363
18364 vm_map_lock_read(map);
18365
18366 if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
18367 /*
18368 * Must pass a valid non-submap address.
18369 */
18370 vm_map_unlock_read(map);
18371 return KERN_INVALID_ADDRESS;
18372 }
18373
18374 if ((entry->protection & VM_PROT_WRITE) == 0 &&
18375 control != VM_PURGABLE_GET_STATE) {
18376 /*
18377 * Can't apply purgable controls to something you can't write.
18378 */
18379 vm_map_unlock_read(map);
18380 return KERN_PROTECTION_FAILURE;
18381 }
18382
18383 object = VME_OBJECT(entry);
18384 if (object == VM_OBJECT_NULL ||
18385 object->purgable == VM_PURGABLE_DENY) {
18386 /*
18387 * Object must already be present and be purgeable.
18388 */
18389 vm_map_unlock_read(map);
18390 return KERN_INVALID_ARGUMENT;
18391 }
18392
18393 vm_object_lock(object);
18394
18395 #if 00
18396 if (VME_OFFSET(entry) != 0 ||
18397 entry->vme_end - entry->vme_start != object->vo_size) {
18398 /*
18399 * Can only apply purgable controls to the whole (existing)
18400 * object at once.
18401 */
18402 vm_map_unlock_read(map);
18403 vm_object_unlock(object);
18404 return KERN_INVALID_ARGUMENT;
18405 }
18406 #endif
18407
18408 assert(!entry->is_sub_map);
18409 assert(!entry->use_pmap); /* purgeable has its own accounting */
18410
18411 vm_map_unlock_read(map);
18412
18413 was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
18414
18415 kr = vm_object_purgable_control(object, control, state);
18416
18417 if (was_nonvolatile &&
18418 object->purgable != VM_PURGABLE_NONVOLATILE &&
18419 map->pmap == kernel_pmap) {
18420 #if DEBUG
18421 object->vo_purgeable_volatilizer = kernel_task;
18422 #endif /* DEBUG */
18423 }
18424
18425 vm_object_unlock(object);
18426
18427 return kr;
18428 }
18429
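/*
 * vm_map_footprint_query_page_info:
 *
 * Compute the VM_PAGE_QUERY_* disposition of the page at "curr_s_offset"
 * in a live (non-corpse) map, for footprint reporting.  The pmap is queried
 * first, then the result is adjusted for "alternate accounting" cases
 * (owned purgeable objects, ledger-tagged objects, non-purgeable IOKit
 * mappings) so that pages are neither dropped from nor double-counted in
 * the footprint.  Called with the map lock held.
 */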
18430 void
18431 vm_map_footprint_query_page_info(
18432 vm_map_t map,
18433 vm_map_entry_t map_entry,
18434 vm_map_offset_t curr_s_offset,
18435 int *disposition_p)
18436 {
18437 int pmap_disp;
18438 vm_object_t object = VM_OBJECT_NULL;
18439 int disposition;
18440 int effective_page_size;
18441
18442 vm_map_lock_assert_held(map);
18443 assert(!map->has_corpse_footprint);
18444 assert(curr_s_offset >= map_entry->vme_start);
18445 assert(curr_s_offset < map_entry->vme_end);
18446
18447 if (map_entry->is_sub_map) {
18448 if (!map_entry->use_pmap) {
18449 /* nested pmap: no footprint */
18450 *disposition_p = 0;
18451 return;
18452 }
18453 } else {
18454 object = VME_OBJECT(map_entry);
18455 if (object == VM_OBJECT_NULL) {
18456 /* nothing mapped here: no need to ask */
18457 *disposition_p = 0;
18458 return;
18459 }
18460 }
18461
18462 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
18463
18464 pmap_disp = 0;
18465
18466 /*
18467 * Query the pmap.
18468 */
18469 pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
18470
18471 /*
18472 * Compute this page's disposition.
18473 */
18474 disposition = 0;
18475
18476 /* deal with "alternate accounting" first */
18477 if (!map_entry->is_sub_map &&
18478 object->vo_no_footprint) {
18479 /* does not count in footprint */
18480 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18481 } else if (!map_entry->is_sub_map &&
18482 (object->purgable == VM_PURGABLE_NONVOLATILE ||
18483 (object->purgable == VM_PURGABLE_DENY &&
18484 object->vo_ledger_tag)) &&
18485 VM_OBJECT_OWNER(object) != NULL &&
18486 VM_OBJECT_OWNER(object)->map == map) {
18487 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18488 if ((((curr_s_offset
18489 - map_entry->vme_start
18490 + VME_OFFSET(map_entry))
18491 / effective_page_size) <
18492 (object->resident_page_count +
18493 vm_compressor_pager_get_count(object->pager)))) {
18494 /*
18495 * Non-volatile purgeable object owned
18496 * by this task: report the first
18497 * "#resident + #compressed" pages as
18498 * "resident" (to show that they
18499 * contribute to the footprint) but not
18500 * "dirty" (to avoid double-counting
18501 * with the fake "non-volatile" region
18502 * we'll report at the end of the
18503 * address space to account for all
18504 * (mapped or not) non-volatile memory
18505 * owned by this task.
18506 */
18507 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18508 }
18509 } else if (!map_entry->is_sub_map &&
18510 (object->purgable == VM_PURGABLE_VOLATILE ||
18511 object->purgable == VM_PURGABLE_EMPTY) &&
18512 VM_OBJECT_OWNER(object) != NULL &&
18513 VM_OBJECT_OWNER(object)->map == map) {
18514 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18515 if ((((curr_s_offset
18516 - map_entry->vme_start
18517 + VME_OFFSET(map_entry))
18518 / effective_page_size) <
18519 object->wired_page_count)) {
18520 /*
18521 * Volatile|empty purgeable object owned
18522 * by this task: report the first
18523 * "#wired" pages as "resident" (to
18524 * show that they contribute to the
18525 * footprint) but not "dirty" (to avoid
18526 * double-counting with the fake
18527 * "non-volatile" region we'll report
18528 * at the end of the address space to
18529 * account for all (mapped or not)
18530 * non-volatile memory owned by this
18531 * task.
18532 */
18533 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18534 }
18535 } else if (!map_entry->is_sub_map &&
18536 map_entry->iokit_acct &&
18537 object->internal &&
18538 object->purgable == VM_PURGABLE_DENY) {
18539 /*
18540 * Non-purgeable IOKit memory: phys_footprint
18541 * includes the entire virtual mapping.
18542 */
18543 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18544 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18545 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
18546 } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
18547 PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
18548 /* alternate accounting */
18549 #if (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG)
18550 if (map->pmap->footprint_was_suspended) {
18551 /*
18552 * The assertion below can fail if dyld
18553 * suspended footprint accounting
18554 * while doing some adjustments to
18555 * this page; the mapping would say
18556 * "use pmap accounting" but the page
18557 * would be marked "alternate
18558 * accounting".
18559 */
18560 } else
18561 #endif /* (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG) */
18562 {
18563 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18564 }
18565 disposition = 0;
18566 } else {
18567 if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
18568 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18569 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18570 disposition |= VM_PAGE_QUERY_PAGE_REF;
18571 if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
18572 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
18573 } else {
18574 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
18575 }
18576 if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
18577 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
18578 }
18579 } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
18580 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18581 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
18582 }
18583 }
18584
18585 *disposition_p = disposition;
18586 }
18587
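/*
 * vm_map_page_query_internal:
 *
 * Thin wrapper around vm_map_page_info(VM_PAGE_INFO_BASIC) for callers that
 * only want the disposition and reference count of a single page.
 */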
18588 kern_return_t
18589 vm_map_page_query_internal(
18590 vm_map_t target_map,
18591 vm_map_offset_t offset,
18592 int *disposition,
18593 int *ref_count)
18594 {
18595 kern_return_t kr;
18596 vm_page_info_basic_data_t info;
18597 mach_msg_type_number_t count;
18598
18599 count = VM_PAGE_INFO_BASIC_COUNT;
18600 kr = vm_map_page_info(target_map,
18601 offset,
18602 VM_PAGE_INFO_BASIC,
18603 (vm_page_info_t) &info,
18604 &count);
18605 if (kr == KERN_SUCCESS) {
18606 *disposition = info.disposition;
18607 *ref_count = info.ref_count;
18608 } else {
18609 *disposition = 0;
18610 *ref_count = 0;
18611 }
18612
18613 return kr;
18614 }
18615
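/*
 * vm_map_page_info:
 *
 * Single-page front end to vm_map_page_range_info_internal(): the
 * [offset, offset + 1) range is rounded out to one page by the callee,
 * and the effective page shift is left unspecified (-1).
 */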
18616 kern_return_t
18617 vm_map_page_info(
18618 vm_map_t map,
18619 vm_map_offset_t offset,
18620 vm_page_info_flavor_t flavor,
18621 vm_page_info_t info,
18622 mach_msg_type_number_t *count)
18623 {
18624 return vm_map_page_range_info_internal(map,
18625 offset, /* start of range */
18626 (offset + 1), /* this will get rounded in the call to the page boundary */
18627 (int)-1, /* effective_page_shift: unspecified */
18628 flavor,
18629 info,
18630 count);
18631 }
18632
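/*
 * vm_map_page_range_info_internal:
 *
 * Fill one vm_page_info record per page for the range
 * [start_offset, end_offset) of "map".  The walk recurses into submaps,
 * follows VM object shadow chains to find the backing page, and, when the
 * caller asked for "footprint" semantics, synthesizes records for the fake
 * non-volatile region reported past the end of the address space.
 */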
18633 kern_return_t
18634 vm_map_page_range_info_internal(
18635 vm_map_t map,
18636 vm_map_offset_t start_offset,
18637 vm_map_offset_t end_offset,
18638 int effective_page_shift,
18639 vm_page_info_flavor_t flavor,
18640 vm_page_info_t info,
18641 mach_msg_type_number_t *count)
18642 {
18643 vm_map_entry_t map_entry = VM_MAP_ENTRY_NULL;
18644 vm_object_t object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
18645 vm_page_t m = VM_PAGE_NULL;
18646 kern_return_t retval = KERN_SUCCESS;
18647 int disposition = 0;
18648 int ref_count = 0;
18649 int depth = 0, info_idx = 0;
18650 vm_page_info_basic_t basic_info = 0;
18651 vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
18652 vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
18653 boolean_t do_region_footprint;
18654 ledger_amount_t ledger_resident, ledger_compressed;
18655 int effective_page_size;
18656 vm_map_offset_t effective_page_mask;
18657
18658 switch (flavor) {
18659 case VM_PAGE_INFO_BASIC:
18660 if (*count != VM_PAGE_INFO_BASIC_COUNT) {
18661 /*
18662 * The "vm_page_info_basic_data" structure was not
18663 * properly padded, so allow the size to be off by
18664 * one to maintain backwards binary compatibility...
18665 */
18666 if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
18667 return KERN_INVALID_ARGUMENT;
18668 }
18669 }
18670 break;
18671 default:
18672 return KERN_INVALID_ARGUMENT;
18673 }
18674
18675 if (effective_page_shift == -1) {
18676 effective_page_shift = vm_self_region_page_shift_safely(map);
18677 if (effective_page_shift == -1) {
18678 return KERN_INVALID_ARGUMENT;
18679 }
18680 }
18681 effective_page_size = (1 << effective_page_shift);
18682 effective_page_mask = effective_page_size - 1;
18683
18684 do_region_footprint = task_self_region_footprint();
18685 disposition = 0;
18686 ref_count = 0;
18687 depth = 0;
18688 info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
18689 retval = KERN_SUCCESS;
18690
18691 offset_in_page = start_offset & effective_page_mask;
18692 start = vm_map_trunc_page(start_offset, effective_page_mask);
18693 end = vm_map_round_page(end_offset, effective_page_mask);
18694
18695 if (end < start) {
18696 return KERN_INVALID_ARGUMENT;
18697 }
18698
18699 assert((end - start) <= MAX_PAGE_RANGE_QUERY);
18700
18701 vm_map_lock_read(map);
18702
18703 task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
18704
18705 for (curr_s_offset = start; curr_s_offset < end;) {
18706 /*
18707 * New lookup needs reset of these variables.
18708 */
18709 curr_object = object = VM_OBJECT_NULL;
18710 offset_in_object = 0;
18711 ref_count = 0;
18712 depth = 0;
18713
18714 if (do_region_footprint &&
18715 curr_s_offset >= vm_map_last_entry(map)->vme_end) {
18716 /*
18717 * Request for "footprint" info about a page beyond
18718 * the end of address space: this must be for
18719 * the fake region vm_map_region_recurse_64()
18720 * reported to account for non-volatile purgeable
18721 * memory owned by this task.
18722 */
18723 disposition = 0;
18724
18725 if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
18726 (unsigned) ledger_compressed) {
18727 /*
18728 * We haven't reported all the "non-volatile
18729 * compressed" pages yet, so report this fake
18730 * page as "compressed".
18731 */
18732 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
18733 } else {
18734 /*
18735 * We've reported all the non-volatile
18736 * compressed pages but not all the non-volatile
18737 * pages, so report this fake page as
18738 * "resident dirty".
18739 */
18740 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18741 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
18742 disposition |= VM_PAGE_QUERY_PAGE_REF;
18743 }
18744 switch (flavor) {
18745 case VM_PAGE_INFO_BASIC:
18746 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18747 basic_info->disposition = disposition;
18748 basic_info->ref_count = 1;
18749 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
18750 basic_info->offset = 0;
18751 basic_info->depth = 0;
18752
18753 info_idx++;
18754 break;
18755 }
18756 curr_s_offset += effective_page_size;
18757 continue;
18758 }
18759
18760 /*
18761 * First, find the map entry covering "curr_s_offset", going down
18762 * submaps if necessary.
18763 */
18764 if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
18765 /* no entry -> no object -> no page */
18766
18767 if (curr_s_offset < vm_map_min(map)) {
18768 /*
18769 * Illegal address that falls below map min.
18770 */
18771 curr_e_offset = MIN(end, vm_map_min(map));
18772 } else if (curr_s_offset >= vm_map_max(map)) {
18773 /*
18774 * Illegal address that falls on/after map max.
18775 */
18776 curr_e_offset = end;
18777 } else if (map_entry == vm_map_to_entry(map)) {
18778 /*
18779 * Hit a hole.
18780 */
18781 if (map_entry->vme_next == vm_map_to_entry(map)) {
18782 /*
18783 * Empty map.
18784 */
18785 curr_e_offset = MIN(map->max_offset, end);
18786 } else {
18787 /*
18788 * Hole at start of the map.
18789 */
18790 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
18791 }
18792 } else {
18793 if (map_entry->vme_next == vm_map_to_entry(map)) {
18794 /*
18795 * Hole at the end of the map.
18796 */
18797 curr_e_offset = MIN(map->max_offset, end);
18798 } else {
18799 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
18800 }
18801 }
18802
18803 assert(curr_e_offset >= curr_s_offset);
18804
18805 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
18806
18807 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18808
18809 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
18810
18811 curr_s_offset = curr_e_offset;
18812
18813 info_idx += num_pages;
18814
18815 continue;
18816 }
18817
18818 /* compute offset from this map entry's start */
18819 offset_in_object = curr_s_offset - map_entry->vme_start;
18820
18821 /* compute offset into this map entry's object (or submap) */
18822 offset_in_object += VME_OFFSET(map_entry);
18823
18824 if (map_entry->is_sub_map) {
18825 vm_map_t sub_map = VM_MAP_NULL;
18826 vm_page_info_t submap_info = 0;
18827 vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
18828
18829 range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
18830
18831 submap_s_offset = offset_in_object;
18832 submap_e_offset = submap_s_offset + range_len;
18833
18834 sub_map = VME_SUBMAP(map_entry);
18835
18836 vm_map_reference(sub_map);
18837 vm_map_unlock_read(map);
18838
18839 submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18840
18841 assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
18842 "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
18843
18844 retval = vm_map_page_range_info_internal(sub_map,
18845 submap_s_offset,
18846 submap_e_offset,
18847 effective_page_shift,
18848 VM_PAGE_INFO_BASIC,
18849 (vm_page_info_t) submap_info,
18850 count);
18851
18852 assert(retval == KERN_SUCCESS);
18853
18854 vm_map_lock_read(map);
18855 vm_map_deallocate(sub_map);
18856
18857 /* Move the "info" index by the number of pages we inspected.*/
18858 info_idx += range_len >> effective_page_shift;
18859
18860 /* Move our current offset by the size of the range we inspected.*/
18861 curr_s_offset += range_len;
18862
18863 continue;
18864 }
18865
18866 object = VME_OBJECT(map_entry);
18867
18868 if (object == VM_OBJECT_NULL) {
18869 /*
18870 * We don't have an object here and, hence,
18871 * no pages to inspect. We'll fill up the
18872 * info structure appropriately.
18873 */
18874
18875 curr_e_offset = MIN(map_entry->vme_end, end);
18876
18877 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
18878
18879 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18880
18881 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
18882
18883 curr_s_offset = curr_e_offset;
18884
18885 info_idx += num_pages;
18886
18887 continue;
18888 }
18889
18890 if (do_region_footprint) {
18891 disposition = 0;
18892 if (map->has_corpse_footprint) {
18893 /*
18894 * Query the page info data we saved
18895 * while forking the corpse.
18896 */
18897 vm_map_corpse_footprint_query_page_info(
18898 map,
18899 curr_s_offset,
18900 &disposition);
18901 } else {
18902 /*
18903 * Query the live pmap for footprint info
18904 * about this page.
18905 */
18906 vm_map_footprint_query_page_info(
18907 map,
18908 map_entry,
18909 curr_s_offset,
18910 &disposition);
18911 }
18912 switch (flavor) {
18913 case VM_PAGE_INFO_BASIC:
18914 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
18915 basic_info->disposition = disposition;
18916 basic_info->ref_count = 1;
18917 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
18918 basic_info->offset = 0;
18919 basic_info->depth = 0;
18920
18921 info_idx++;
18922 break;
18923 }
18924 curr_s_offset += effective_page_size;
18925 continue;
18926 }
18927
18928 vm_object_reference(object);
18929 /*
18930 * Shared mode -- so we can allow other readers
18931 * to grab the lock too.
18932 */
18933 vm_object_lock_shared(object);
18934
18935 curr_e_offset = MIN(map_entry->vme_end, end);
18936
18937 vm_map_unlock_read(map);
18938
18939 map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
18940
18941 curr_object = object;
18942
18943 for (; curr_s_offset < curr_e_offset;) {
18944 if (object == curr_object) {
18945 ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
18946 } else {
18947 ref_count = curr_object->ref_count;
18948 }
18949
18950 curr_offset_in_object = offset_in_object;
18951
18952 for (;;) {
18953 m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
18954
18955 if (m != VM_PAGE_NULL) {
18956 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18957 break;
18958 } else {
18959 if (curr_object->internal &&
18960 curr_object->alive &&
18961 !curr_object->terminating &&
18962 curr_object->pager_ready) {
18963 if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
18964 == VM_EXTERNAL_STATE_EXISTS) {
18965 /* the pager has that page */
18966 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
18967 break;
18968 }
18969 }
18970
18971 /*
18972 * Go down the VM object shadow chain until we find the page
18973 * we're looking for.
18974 */
18975
18976 if (curr_object->shadow != VM_OBJECT_NULL) {
18977 vm_object_t shadow = VM_OBJECT_NULL;
18978
18979 curr_offset_in_object += curr_object->vo_shadow_offset;
18980 shadow = curr_object->shadow;
18981
18982 vm_object_lock_shared(shadow);
18983 vm_object_unlock(curr_object);
18984
18985 curr_object = shadow;
18986 depth++;
18987 continue;
18988 } else {
18989 break;
18990 }
18991 }
18992 }
18993
18994 /* The ref_count is not strictly accurate, it measures the number */
18995 /* of entities holding a ref on the object, they may not be mapping */
18996 /* the object or may not be mapping the section holding the */
18997 /* target page but it's still a ballpark number and though an over- */
18998 /* count, it picks up the copy-on-write cases */
18999
19000 /* We could also get a picture of page sharing from pmap_attributes */
19001 /* but this would under count as only faulted-in mappings would */
19002 /* show up. */
19003
19004 if ((curr_object == object) && curr_object->shadow) {
19005 disposition |= VM_PAGE_QUERY_PAGE_COPIED;
19006 }
19007
19008 if (!curr_object->internal) {
19009 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19010 }
19011
19012 if (m != VM_PAGE_NULL) {
19013 if (m->vmp_fictitious) {
19014 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
19015 } else {
19016 if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
19017 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19018 }
19019
19020 if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
19021 disposition |= VM_PAGE_QUERY_PAGE_REF;
19022 }
19023
19024 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
19025 disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
19026 }
19027
19028 /*
19029 * XXX TODO4K:
19030 * when this routine deals with 4k
19031 * pages, check the appropriate CS bit
19032 * here.
19033 */
19034 if (m->vmp_cs_validated) {
19035 disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
19036 }
19037 if (m->vmp_cs_tainted) {
19038 disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
19039 }
19040 if (m->vmp_cs_nx) {
19041 disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
19042 }
19043 if (m->vmp_reusable || curr_object->all_reusable) {
19044 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19045 }
19046 }
19047 }
19048
19049 switch (flavor) {
19050 case VM_PAGE_INFO_BASIC:
19051 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
19052 basic_info->disposition = disposition;
19053 basic_info->ref_count = ref_count;
19054 basic_info->object_id = (vm_object_id_t) (uintptr_t)
19055 VM_KERNEL_ADDRPERM(curr_object);
19056 basic_info->offset =
19057 (memory_object_offset_t) curr_offset_in_object + offset_in_page;
19058 basic_info->depth = depth;
19059
19060 info_idx++;
19061 break;
19062 }
19063
19064 disposition = 0;
19065 offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
19066
19067 /*
19068 * Move to next offset in the range and in our object.
19069 */
19070 curr_s_offset += effective_page_size;
19071 offset_in_object += effective_page_size;
19072 curr_offset_in_object = offset_in_object;
19073
19074 if (curr_object != object) {
19075 vm_object_unlock(curr_object);
19076
19077 curr_object = object;
19078
19079 vm_object_lock_shared(curr_object);
19080 } else {
19081 vm_object_lock_yield_shared(curr_object);
19082 }
19083 }
19084
19085 vm_object_unlock(curr_object);
19086 vm_object_deallocate(curr_object);
19087
19088 vm_map_lock_read(map);
19089 }
19090
19091 vm_map_unlock_read(map);
19092 return retval;
19093 }
19094
19095 /*
19096 * vm_map_msync
19097 *
19098 * Synchronises the memory range specified with its backing store
19099 * image by either flushing or cleaning the contents to the appropriate
19100 * memory manager engaging in a memory object synchronize dialog with
19101 * the manager. The client doesn't return until the manager issues
19102 * m_o_s_completed message. MIG Magically converts user task parameter
19103 * to the task's address map.
19104 *
19105 * interpretation of sync_flags
19106 * VM_SYNC_INVALIDATE - discard pages, only return precious
19107 * pages to manager.
19108 *
19109 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
19110 * - discard pages, write dirty or precious
19111 * pages back to memory manager.
19112 *
19113 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
19114 * - write dirty or precious pages back to
19115 * the memory manager.
19116 *
19117 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
19118 * is a hole in the region, and we would
19119 * have returned KERN_SUCCESS, return
19120 * KERN_INVALID_ADDRESS instead.
19121 *
19122 * NOTE
19123 * The memory object attributes have not yet been implemented, this
19124 * function will have to deal with the invalidate attribute
19125 *
19126 * RETURNS
19127 * KERN_INVALID_TASK Bad task parameter
19128 * KERN_INVALID_ARGUMENT both sync and async were specified.
19129 * KERN_SUCCESS The usual.
19130 * KERN_INVALID_ADDRESS There was a hole in the region.
19131 */
19132
19133 kern_return_t
19134 vm_map_msync(
19135 vm_map_t map,
19136 vm_map_address_t address,
19137 vm_map_size_t size,
19138 vm_sync_t sync_flags)
19139 {
19140 vm_map_entry_t entry;
19141 vm_map_size_t amount_left;
19142 vm_object_offset_t offset;
19143 vm_object_offset_t start_offset, end_offset;
19144 boolean_t do_sync_req;
19145 boolean_t had_hole = FALSE;
19146 vm_map_offset_t pmap_offset;
19147
19148 if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
19149 (sync_flags & VM_SYNC_SYNCHRONOUS)) {
19150 return KERN_INVALID_ARGUMENT;
19151 }
19152
19153 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19154 DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
19155 }
19156
19157 /*
19158 * align address and size on page boundaries
19159 */
19160 size = (vm_map_round_page(address + size,
19161 VM_MAP_PAGE_MASK(map)) -
19162 vm_map_trunc_page(address,
19163 VM_MAP_PAGE_MASK(map)));
19164 address = vm_map_trunc_page(address,
19165 VM_MAP_PAGE_MASK(map));
19166
19167 if (map == VM_MAP_NULL) {
19168 return KERN_INVALID_TASK;
19169 }
19170
19171 if (size == 0) {
19172 return KERN_SUCCESS;
19173 }
19174
19175 amount_left = size;
19176
19177 while (amount_left > 0) {
19178 vm_object_size_t flush_size;
19179 vm_object_t object;
19180
19181 vm_map_lock(map);
19182 if (!vm_map_lookup_entry(map,
19183 address,
19184 &entry)) {
19185 vm_map_size_t skip;
19186
19187 /*
19188 * hole in the address map.
19189 */
19190 had_hole = TRUE;
19191
19192 if (sync_flags & VM_SYNC_KILLPAGES) {
19193 /*
19194 * For VM_SYNC_KILLPAGES, there should be
19195 * no holes in the range, since we couldn't
19196 * prevent someone else from allocating in
19197 * that hole and we wouldn't want to "kill"
19198 * their pages.
19199 */
19200 vm_map_unlock(map);
19201 break;
19202 }
19203
19204 /*
19205 * Check for empty map.
19206 */
19207 if (entry == vm_map_to_entry(map) &&
19208 entry->vme_next == entry) {
19209 vm_map_unlock(map);
19210 break;
19211 }
19212 /*
19213 * Check that we don't wrap and that
19214 * we have at least one real map entry.
19215 */
19216 if ((map->hdr.nentries == 0) ||
19217 (entry->vme_next->vme_start < address)) {
19218 vm_map_unlock(map);
19219 break;
19220 }
19221 /*
19222 * Move up to the next entry if needed
19223 */
19224 skip = (entry->vme_next->vme_start - address);
19225 if (skip >= amount_left) {
19226 amount_left = 0;
19227 } else {
19228 amount_left -= skip;
19229 }
19230 address = entry->vme_next->vme_start;
19231 vm_map_unlock(map);
19232 continue;
19233 }
19234
19235 offset = address - entry->vme_start;
19236 pmap_offset = address;
19237
19238 /*
19239 * do we have more to flush than is contained in this
19240 * entry ?
19241 */
19242 if (amount_left + entry->vme_start + offset > entry->vme_end) {
19243 flush_size = entry->vme_end -
19244 (entry->vme_start + offset);
19245 } else {
19246 flush_size = amount_left;
19247 }
19248 amount_left -= flush_size;
19249 address += flush_size;
19250
19251 if (entry->is_sub_map == TRUE) {
19252 vm_map_t local_map;
19253 vm_map_offset_t local_offset;
19254
19255 local_map = VME_SUBMAP(entry);
19256 local_offset = VME_OFFSET(entry);
19257 vm_map_reference(local_map);
19258 vm_map_unlock(map);
19259 if (vm_map_msync(
19260 local_map,
19261 local_offset,
19262 flush_size,
19263 sync_flags) == KERN_INVALID_ADDRESS) {
19264 had_hole = TRUE;
19265 }
19266 vm_map_deallocate(local_map);
19267 continue;
19268 }
19269 object = VME_OBJECT(entry);
19270
19271 /*
19272 * We can't sync this object if the object has not been
19273 * created yet
19274 */
19275 if (object == VM_OBJECT_NULL) {
19276 vm_map_unlock(map);
19277 continue;
19278 }
19279 offset += VME_OFFSET(entry);
19280
19281 vm_object_lock(object);
19282
19283 if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
19284 int kill_pages = 0;
19285 boolean_t reusable_pages = FALSE;
19286
19287 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19288 /*
19289 * This is a destructive operation and so we
19290 * err on the side of limiting the range of
19291 * the operation.
19292 */
19293 start_offset = vm_object_round_page(offset);
19294 end_offset = vm_object_trunc_page(offset + flush_size);
19295
19296 if (end_offset <= start_offset) {
19297 vm_object_unlock(object);
19298 vm_map_unlock(map);
19299 continue;
19300 }
19301
19302 pmap_offset += start_offset - offset;
19303 } else {
19304 start_offset = offset;
19305 end_offset = offset + flush_size;
19306 }
19307
19308 if (sync_flags & VM_SYNC_KILLPAGES) {
19309 if (((object->ref_count == 1) ||
19310 ((object->copy_strategy !=
19311 MEMORY_OBJECT_COPY_SYMMETRIC) &&
19312 (object->copy == VM_OBJECT_NULL))) &&
19313 (object->shadow == VM_OBJECT_NULL)) {
19314 if (object->ref_count != 1) {
19315 vm_page_stats_reusable.free_shared++;
19316 }
19317 kill_pages = 1;
19318 } else {
19319 kill_pages = -1;
19320 }
19321 }
19322 if (kill_pages != -1) {
19323 vm_object_deactivate_pages(
19324 object,
19325 start_offset,
19326 (vm_object_size_t) (end_offset - start_offset),
19327 kill_pages,
19328 reusable_pages,
19329 map->pmap,
19330 pmap_offset);
19331 }
19332 vm_object_unlock(object);
19333 vm_map_unlock(map);
19334 continue;
19335 }
19336 /*
19337 * We can't sync this object if there isn't a pager.
19338 * Don't bother to sync internal objects, since there can't
19339 * be any "permanent" storage for these objects anyway.
19340 */
19341 if ((object->pager == MEMORY_OBJECT_NULL) ||
19342 (object->internal) || (object->private)) {
19343 vm_object_unlock(object);
19344 vm_map_unlock(map);
19345 continue;
19346 }
19347 /*
19348 * keep reference on the object until syncing is done
19349 */
19350 vm_object_reference_locked(object);
19351 vm_object_unlock(object);
19352
19353 vm_map_unlock(map);
19354
19355 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19356 start_offset = vm_object_trunc_page(offset);
19357 end_offset = vm_object_round_page(offset + flush_size);
19358 } else {
19359 start_offset = offset;
19360 end_offset = offset + flush_size;
19361 }
19362
19363 do_sync_req = vm_object_sync(object,
19364 start_offset,
19365 (end_offset - start_offset),
19366 sync_flags & VM_SYNC_INVALIDATE,
19367 ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
19368 (sync_flags & VM_SYNC_ASYNCHRONOUS)),
19369 sync_flags & VM_SYNC_SYNCHRONOUS);
19370
19371 if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
19372 /*
19373 * clear out the clustering and read-ahead hints
19374 */
19375 vm_object_lock(object);
19376
19377 object->pages_created = 0;
19378 object->pages_used = 0;
19379 object->sequential = 0;
19380 object->last_alloc = 0;
19381
19382 vm_object_unlock(object);
19383 }
19384 vm_object_deallocate(object);
19385 } /* while */
19386
19387 /* for proper msync() behaviour */
19388 if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
19389 return KERN_INVALID_ADDRESS;
19390 }
19391
19392 return KERN_SUCCESS;
19393 }/* vm_msync */
19394
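/*
 * vm_named_entry_associate_vm_object:
 *
 * Back a named entry with "object" by wrapping the object in a one-entry
 * vm_map_copy (entry-list flavor) covering [offset, offset + size) with the
 * given protection.  The named entry becomes an "object" entry and inherits
 * the object's "internal" attribute.
 */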
19395 void
19396 vm_named_entry_associate_vm_object(
19397 vm_named_entry_t named_entry,
19398 vm_object_t object,
19399 vm_object_offset_t offset,
19400 vm_object_size_t size,
19401 vm_prot_t prot)
19402 {
19403 vm_map_copy_t copy;
19404 vm_map_entry_t copy_entry;
19405
19406 assert(!named_entry->is_sub_map);
19407 assert(!named_entry->is_copy);
19408 assert(!named_entry->is_object);
19409 assert(!named_entry->internal);
19410 assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
19411
19412 copy = vm_map_copy_allocate();
19413 copy->type = VM_MAP_COPY_ENTRY_LIST;
19414 copy->offset = offset;
19415 copy->size = size;
19416 copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
19417 vm_map_store_init(&copy->cpy_hdr);
19418
19419 copy_entry = vm_map_copy_entry_create(copy);
19420 copy_entry->protection = prot;
19421 copy_entry->max_protection = prot;
19422 copy_entry->use_pmap = TRUE;
19423 copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
19424 copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
19425 VME_OBJECT_SET(copy_entry, object, false, 0);
19426 VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
19427 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
19428
19429 named_entry->backing.copy = copy;
19430 named_entry->is_object = TRUE;
19431 if (object->internal) {
19432 named_entry->internal = TRUE;
19433 }
19434
19435 DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
19436 named_entry, copy, object, offset, size, prot);
19437 }
19438
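/*
 * vm_named_entry_to_vm_object:
 *
 * Return the VM object backing an object-backed named entry, i.e. the
 * object of the single entry in its vm_map_copy.  No reference is taken
 * on the object here.
 */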
19439 vm_object_t
19440 vm_named_entry_to_vm_object(
19441 vm_named_entry_t named_entry)
19442 {
19443 vm_map_copy_t copy;
19444 vm_map_entry_t copy_entry;
19445 vm_object_t object;
19446
19447 assert(!named_entry->is_sub_map);
19448 assert(!named_entry->is_copy);
19449 assert(named_entry->is_object);
19450 copy = named_entry->backing.copy;
19451 assert(copy != VM_MAP_COPY_NULL);
19452 /*
19453 * Assert that the vm_map_copy is coming from the right
19454 * zone and hasn't been forged
19455 */
19456 vm_map_copy_require(copy);
19457 assert(copy->cpy_hdr.nentries == 1);
19458 copy_entry = vm_map_copy_first_entry(copy);
19459 object = VME_OBJECT(copy_entry);
19460
19461 DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
19462
19463 return object;
19464 }
19465
19466 /*
19467 * Routine: convert_port_entry_to_map
19468 * Purpose:
19469 * Convert from a port specifying an entry or a task
19470 * to a map. Doesn't consume the port ref; produces a map ref,
19471 * which may be null. Unlike convert_port_to_map, the
19472 * port may be task or a named entry backed.
19473 * Conditions:
19474 * Nothing locked.
19475 */
19476
19477 vm_map_t
19478 convert_port_entry_to_map(
19479 ipc_port_t port)
19480 {
19481 vm_map_t map = VM_MAP_NULL;
19482 vm_named_entry_t named_entry;
19483
19484 if (!IP_VALID(port)) {
19485 return VM_MAP_NULL;
19486 }
19487
19488 if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
19489 return convert_port_to_map(port);
19490 }
19491
19492 named_entry = mach_memory_entry_from_port(port);
19493
19494 if ((named_entry->is_sub_map) &&
19495 (named_entry->protection & VM_PROT_WRITE)) {
19496 map = named_entry->backing.map;
19497 if (map->pmap != PMAP_NULL) {
19498 if (map->pmap == kernel_pmap) {
19499 panic("userspace has access "
19500 "to a kernel map %p", map);
19501 }
19502 pmap_require(map->pmap);
19503 }
19504 vm_map_reference(map);
19505 }
19506
19507 return map;
19508 }
19509
19510 /*
19511 * Export routines to other components for the things we access locally through
19512 * macros.
19513 */
19514 #undef current_map
19515 vm_map_t
19516 current_map(void)
19517 {
19518 return current_map_fast();
19519 }
19520
19521 /*
19522 * vm_map_reference:
19523 *
19524 * Takes a reference on the specified map.
19525 */
19526 void
19527 vm_map_reference(
19528 vm_map_t map)
19529 {
19530 if (__probable(map != VM_MAP_NULL)) {
19531 vm_map_require(map);
19532 os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
19533 }
19534 }
19535
19536 /*
19537 * vm_map_deallocate:
19538 *
19539 * Removes a reference from the specified map,
19540 * destroying it if no references remain.
19541 * The map should not be locked.
19542 */
19543 void
19544 vm_map_deallocate(
19545 vm_map_t map)
19546 {
19547 if (__probable(map != VM_MAP_NULL)) {
19548 vm_map_require(map);
19549 if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
19550 vm_map_destroy(map);
19551 }
19552 }
19553 }
19554
19555 void
19556 vm_map_inspect_deallocate(
19557 vm_map_inspect_t map)
19558 {
19559 vm_map_deallocate((vm_map_t)map);
19560 }
19561
19562 void
19563 vm_map_read_deallocate(
19564 vm_map_read_t map)
19565 {
19566 vm_map_deallocate((vm_map_t)map);
19567 }
19568
19569
19570 void
19571 vm_map_disable_NX(vm_map_t map)
19572 {
19573 if (map == NULL) {
19574 return;
19575 }
19576 if (map->pmap == NULL) {
19577 return;
19578 }
19579
19580 pmap_disable_NX(map->pmap);
19581 }
19582
19583 void
19584 vm_map_disallow_data_exec(vm_map_t map)
19585 {
19586 if (map == NULL) {
19587 return;
19588 }
19589
19590 map->map_disallow_data_exec = TRUE;
19591 }
19592
19593 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
19594 * more descriptive.
19595 */
19596 void
19597 vm_map_set_32bit(vm_map_t map)
19598 {
19599 #if defined(__arm__) || defined(__arm64__)
19600 map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
19601 #else
19602 map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
19603 #endif
19604 }
19605
19606
19607 void
19608 vm_map_set_64bit(vm_map_t map)
19609 {
19610 #if defined(__arm__) || defined(__arm64__)
19611 map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
19612 #else
19613 map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
19614 #endif
19615 }
19616
19617 /*
19618 * Expand the maximum size of an existing map to the maximum supported.
19619 */
19620 void
19621 vm_map_set_jumbo(vm_map_t map)
19622 {
19623 #if defined (__arm64__) && !defined(CONFIG_ARROW)
19624 vm_map_set_max_addr(map, ~0);
19625 #else /* arm64 */
19626 (void) map;
19627 #endif
19628 }
19629
19630 /*
19631 * This map has a JIT entitlement
19632 */
19633 void
19634 vm_map_set_jit_entitled(vm_map_t map)
19635 {
19636 #if defined (__arm64__)
19637 pmap_set_jit_entitled(map->pmap);
19638 #else /* arm64 */
19639 (void) map;
19640 #endif
19641 }
19642
19643 /*
19644 * Expand the maximum size of an existing map.
19645 */
19646 void
19647 vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
19648 {
19649 #if defined(__arm64__)
19650 vm_map_offset_t max_supported_offset = 0;
19651 vm_map_offset_t old_max_offset = map->max_offset;
19652 max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);
19653
19654 new_max_offset = trunc_page(new_max_offset);
19655
19656 /* The address space cannot be shrunk using this routine. */
19657 if (old_max_offset >= new_max_offset) {
19658 return;
19659 }
19660
19661 if (max_supported_offset < new_max_offset) {
19662 new_max_offset = max_supported_offset;
19663 }
19664
19665 map->max_offset = new_max_offset;
19666
19667 if (map->holes_list->prev->vme_end == old_max_offset) {
19668 /*
19669 * There is already a hole at the end of the map; simply make it bigger.
19670 */
19671 map->holes_list->prev->vme_end = map->max_offset;
19672 } else {
19673 /*
19674 * There is no hole at the end, so we need to create a new hole
19675 * for the new empty space we're creating.
19676 */
19677 struct vm_map_links *new_hole = zalloc(vm_map_holes_zone);
19678 new_hole->start = old_max_offset;
19679 new_hole->end = map->max_offset;
19680 new_hole->prev = map->holes_list->prev;
19681 new_hole->next = (struct vm_map_entry *)map->holes_list;
19682 map->holes_list->prev->links.next = (struct vm_map_entry *)new_hole;
19683 map->holes_list->prev = (struct vm_map_entry *)new_hole;
19684 }
19685 #else
19686 (void)map;
19687 (void)new_max_offset;
19688 #endif
19689 }
19690
19691 vm_map_offset_t
19692 vm_compute_max_offset(boolean_t is64)
19693 {
19694 #if defined(__arm__) || defined(__arm64__)
19695 return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
19696 #else
19697 return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
19698 #endif
19699 }
19700
19701 void
19702 vm_map_get_max_aslr_slide_section(
19703 vm_map_t map __unused,
19704 int64_t *max_sections,
19705 int64_t *section_size)
19706 {
19707 #if defined(__arm64__)
19708 *max_sections = 3;
19709 *section_size = ARM_TT_TWIG_SIZE;
19710 #else
19711 *max_sections = 1;
19712 *section_size = 0;
19713 #endif
19714 }
19715
19716 uint64_t
19717 vm_map_get_max_aslr_slide_pages(vm_map_t map)
19718 {
19719 #if defined(__arm64__)
19720 /* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
19721 * limited embedded address space; this is also meant to minimize pmap
19722 * memory usage on 16KB page systems.
19723 */
19724 return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
19725 #else
19726 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
19727 #endif
19728 }
19729
19730 uint64_t
19731 vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
19732 {
19733 #if defined(__arm64__)
19734 /* We limit the loader slide to 4MB, in order to ensure at least 8 bits
19735 * of independent entropy on 16KB page systems.
19736 */
19737 return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
19738 #else
19739 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
19740 #endif
19741 }
19742
19743 #ifndef __arm__
19744 boolean_t
19745 vm_map_is_64bit(
19746 vm_map_t map)
19747 {
19748 return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
19749 }
19750 #endif
19751
19752 boolean_t
19753 vm_map_has_hard_pagezero(
19754 vm_map_t map,
19755 vm_map_offset_t pagezero_size)
19756 {
19757 /*
19758 * XXX FBDP
19759 * We should lock the VM map (for read) here but we can get away
19760 * with it for now because there can't really be any race condition:
19761 * the VM map's min_offset is changed only when the VM map is created
19762 * and when the zero page is established (when the binary gets loaded),
19763 * and this routine gets called only when the task terminates and the
19764 * VM map is being torn down, and when a new map is created via
19765 * load_machfile()/execve().
19766 */
19767 return map->min_offset >= pagezero_size;
19768 }
19769
19770 /*
19771 * Raise a VM map's maximum offset.
19772 */
19773 kern_return_t
19774 vm_map_raise_max_offset(
19775 vm_map_t map,
19776 vm_map_offset_t new_max_offset)
19777 {
19778 kern_return_t ret;
19779
19780 vm_map_lock(map);
19781 ret = KERN_INVALID_ADDRESS;
19782
19783 if (new_max_offset >= map->max_offset) {
19784 if (!vm_map_is_64bit(map)) {
19785 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
19786 map->max_offset = new_max_offset;
19787 ret = KERN_SUCCESS;
19788 }
19789 } else {
19790 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
19791 map->max_offset = new_max_offset;
19792 ret = KERN_SUCCESS;
19793 }
19794 }
19795 }
19796
19797 vm_map_unlock(map);
19798 return ret;
19799 }
19800
19801
19802 /*
19803 * Raise a VM map's minimum offset.
19804 * To strictly enforce "page zero" reservation.
19805 */
19806 kern_return_t
19807 vm_map_raise_min_offset(
19808 vm_map_t map,
19809 vm_map_offset_t new_min_offset)
19810 {
19811 vm_map_entry_t first_entry;
19812
19813 new_min_offset = vm_map_round_page(new_min_offset,
19814 VM_MAP_PAGE_MASK(map));
19815
19816 vm_map_lock(map);
19817
19818 if (new_min_offset < map->min_offset) {
19819 /*
19820 * Can't move min_offset backwards, as that would expose
19821 * a part of the address space that was previously, and for
19822 * possibly good reasons, inaccessible.
19823 */
19824 vm_map_unlock(map);
19825 return KERN_INVALID_ADDRESS;
19826 }
19827 if (new_min_offset >= map->max_offset) {
19828 /* can't go beyond the end of the address space */
19829 vm_map_unlock(map);
19830 return KERN_INVALID_ADDRESS;
19831 }
19832
19833 first_entry = vm_map_first_entry(map);
19834 if (first_entry != vm_map_to_entry(map) &&
19835 first_entry->vme_start < new_min_offset) {
19836 /*
19837 * Some memory was already allocated below the new
19838 * minimum offset. It's too late to change it now...
19839 */
19840 vm_map_unlock(map);
19841 return KERN_NO_SPACE;
19842 }
19843
19844 map->min_offset = new_min_offset;
19845
19846 assert(map->holes_list);
19847 map->holes_list->start = new_min_offset;
19848 assert(new_min_offset < map->holes_list->end);
19849
19850 vm_map_unlock(map);
19851
19852 return KERN_SUCCESS;
19853 }
19854
19855 /*
19856 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
19857 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
19858 * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
19859 * have to reach over to the BSD data structures.
19860 */
19861
19862 uint64_t vm_map_set_size_limit_count = 0;
19863 kern_return_t
19864 vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
19865 {
19866 kern_return_t kr;
19867
19868 vm_map_lock(map);
19869 if (new_size_limit < map->size) {
19870 /* new limit should not be lower than its current size */
19871 DTRACE_VM2(vm_map_set_size_limit_fail,
19872 vm_map_size_t, map->size,
19873 uint64_t, new_size_limit);
19874 kr = KERN_FAILURE;
19875 } else if (new_size_limit == map->size_limit) {
19876 /* no change */
19877 kr = KERN_SUCCESS;
19878 } else {
19879 /* set new limit */
19880 DTRACE_VM2(vm_map_set_size_limit,
19881 vm_map_size_t, map->size,
19882 uint64_t, new_size_limit);
19883 if (new_size_limit != RLIM_INFINITY) {
19884 vm_map_set_size_limit_count++;
19885 }
19886 map->size_limit = new_size_limit;
19887 kr = KERN_SUCCESS;
19888 }
19889 vm_map_unlock(map);
19890 return kr;
19891 }
19892
19893 uint64_t vm_map_set_data_limit_count = 0;
19894 kern_return_t
19895 vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
19896 {
19897 kern_return_t kr;
19898
19899 vm_map_lock(map);
19900 if (new_data_limit < map->size) {
19901 /* new limit should not be lower than its current size */
19902 DTRACE_VM2(vm_map_set_data_limit_fail,
19903 vm_map_size_t, map->size,
19904 uint64_t, new_data_limit);
19905 kr = KERN_FAILURE;
19906 } else if (new_data_limit == map->data_limit) {
19907 /* no change */
19908 kr = KERN_SUCCESS;
19909 } else {
19910 /* set new limit */
19911 DTRACE_VM2(vm_map_set_data_limit,
19912 vm_map_size_t, map->size,
19913 uint64_t, new_data_limit);
19914 if (new_data_limit != RLIM_INFINITY) {
19915 vm_map_set_data_limit_count++;
19916 }
19917 map->data_limit = new_data_limit;
19918 kr = KERN_SUCCESS;
19919 }
19920 vm_map_unlock(map);
19921 return kr;
19922 }
19923
19924 void
19925 vm_map_set_user_wire_limit(vm_map_t map,
19926 vm_size_t limit)
19927 {
19928 vm_map_lock(map);
19929 map->user_wire_limit = limit;
19930 vm_map_unlock(map);
19931 }
19932
19933
19934 void
19935 vm_map_switch_protect(vm_map_t map,
19936 boolean_t val)
19937 {
19938 vm_map_lock(map);
19939 map->switch_protect = val;
19940 vm_map_unlock(map);
19941 }
19942
19943 extern int cs_process_enforcement_enable;
19944 boolean_t
19945 vm_map_cs_enforcement(
19946 vm_map_t map)
19947 {
19948 if (cs_process_enforcement_enable) {
19949 return TRUE;
19950 }
19951 return map->cs_enforcement;
19952 }
19953
19954 kern_return_t
19955 vm_map_cs_wx_enable(
19956 vm_map_t map)
19957 {
19958 return pmap_cs_allow_invalid(vm_map_pmap(map));
19959 }
19960
19961 void
19962 vm_map_cs_debugged_set(
19963 vm_map_t map,
19964 boolean_t val)
19965 {
19966 vm_map_lock(map);
19967 map->cs_debugged = val;
19968 vm_map_unlock(map);
19969 }
19970
19971 void
19972 vm_map_cs_enforcement_set(
19973 vm_map_t map,
19974 boolean_t val)
19975 {
19976 vm_map_lock(map);
19977 map->cs_enforcement = val;
19978 pmap_set_vm_map_cs_enforced(map->pmap, val);
19979 vm_map_unlock(map);
19980 }
19981
19982 /*
19983 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
19984 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
19985 * bump both counters.
19986 */
19987 void
19988 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
19989 {
19990 pmap_t pmap = vm_map_pmap(map);
19991
19992 ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
19993 ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
19994 }
19995
19996 void
19997 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
19998 {
19999 pmap_t pmap = vm_map_pmap(map);
20000
20001 ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
20002 ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
20003 }
20004
20005 /* Add (generate) code signature for memory range */
20006 #if CONFIG_DYNAMIC_CODE_SIGNING
20007 kern_return_t
20008 vm_map_sign(vm_map_t map,
20009 vm_map_offset_t start,
20010 vm_map_offset_t end)
20011 {
20012 vm_map_entry_t entry;
20013 vm_page_t m;
20014 vm_object_t object;
20015
20016 /*
20017 * Vet all the input parameters and current type and state of the
20018 * underlying object. Return with an error if anything is amiss.
20019 */
20020 if (map == VM_MAP_NULL) {
20021 return KERN_INVALID_ARGUMENT;
20022 }
20023
20024 vm_map_lock_read(map);
20025
20026 if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
20027 /*
20028 * Must pass a valid non-submap address.
20029 */
20030 vm_map_unlock_read(map);
20031 return KERN_INVALID_ADDRESS;
20032 }
20033
20034 if ((entry->vme_start > start) || (entry->vme_end < end)) {
20035 /*
20036 * Map entry doesn't cover the requested range. Not handling
20037 * this situation currently.
20038 */
20039 vm_map_unlock_read(map);
20040 return KERN_INVALID_ARGUMENT;
20041 }
20042
20043 object = VME_OBJECT(entry);
20044 if (object == VM_OBJECT_NULL) {
20045 /*
20046 * Object must already be present or we can't sign.
20047 */
20048 vm_map_unlock_read(map);
20049 return KERN_INVALID_ARGUMENT;
20050 }
20051
20052 vm_object_lock(object);
20053 vm_map_unlock_read(map);
20054
20055 while (start < end) {
20056 uint32_t refmod;
20057
20058 m = vm_page_lookup(object,
20059 start - entry->vme_start + VME_OFFSET(entry));
20060 if (m == VM_PAGE_NULL) {
20061 /* should we try to fault a page here? we can probably
20062 * demand it exists and is locked for this request */
20063 vm_object_unlock(object);
20064 return KERN_FAILURE;
20065 }
20066 /* deal with special page status */
20067 if (m->vmp_busy ||
20068 (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
20069 vm_object_unlock(object);
20070 return KERN_FAILURE;
20071 }
20072
20073 /* Page is OK... now "validate" it */
20074 /* This is the place where we'll call out to create a code
20075 * directory, later */
20076 /* XXX TODO4K: deal with 4k subpages individually? */
20077 m->vmp_cs_validated = VMP_CS_ALL_TRUE;
20078
20079 /* The page is now "clean" for codesigning purposes. That means
20080 * we don't consider it as modified (wpmapped) anymore. But
20081 * we'll disconnect the page so we note any future modification
20082 * attempts. */
20083 m->vmp_wpmapped = FALSE;
20084 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
20085
20086 /* Pull the dirty status from the pmap, since we cleared the
20087 * wpmapped bit */
20088 if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
20089 SET_PAGE_DIRTY(m, FALSE);
20090 }
20091
20092 /* On to the next page */
20093 start += PAGE_SIZE;
20094 }
20095 vm_object_unlock(object);
20096
20097 return KERN_SUCCESS;
20098 }
20099 #endif
20100
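/*
 * vm_map_partial_reap:
 *
 * Best-effort reclaim: delete every map entry whose backing object is
 * internal and referenced only once (so it is not shared elsewhere), and
 * report how many resident and compressed pages were covered by those
 * entries.
 */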
20101 kern_return_t
20102 vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
20103 {
20104 vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
20105 vm_map_entry_t next_entry;
20106 kern_return_t kr = KERN_SUCCESS;
20107 VM_MAP_ZAP_DECLARE(zap_list);
20108
20109 vm_map_lock(map);
20110
20111 for (entry = vm_map_first_entry(map);
20112 entry != vm_map_to_entry(map);
20113 entry = next_entry) {
20114 next_entry = entry->vme_next;
20115
20116 if (!entry->is_sub_map &&
20117 VME_OBJECT(entry) &&
20118 (VME_OBJECT(entry)->internal == TRUE) &&
20119 (VME_OBJECT(entry)->ref_count == 1)) {
20120 *reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
20121 *reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
20122
20123 (void)vm_map_delete(map, entry->vme_start,
20124 entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
20125 KMEM_GUARD_NONE, &zap_list);
20126 }
20127 }
20128
20129 vm_map_unlock(map);
20130
20131 vm_map_zap_dispose(&zap_list);
20132
20133 return kr;
20134 }
20135
20136
20137 #if DEVELOPMENT || DEBUG
20138
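/*
 * vm_map_disconnect_page_mappings:
 *
 * Development/debug helper: optionally un-nest any nested (shared region)
 * submaps, then remove all pmap mappings for the map's entries, skipping
 * physically contiguous objects.  Returns an approximate count of the pages
 * that were mapped, derived from the phys_mem ledger balance.
 */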
20139 int
20140 vm_map_disconnect_page_mappings(
20141 vm_map_t map,
20142 boolean_t do_unnest)
20143 {
20144 vm_map_entry_t entry;
20145 ledger_amount_t byte_count = 0;
20146
20147 if (do_unnest == TRUE) {
20148 #ifndef NO_NESTED_PMAP
20149 vm_map_lock(map);
20150
20151 for (entry = vm_map_first_entry(map);
20152 entry != vm_map_to_entry(map);
20153 entry = entry->vme_next) {
20154 if (entry->is_sub_map && entry->use_pmap) {
20155 /*
20156 * Make sure the range between the start of this entry and
20157 * the end of this entry is no longer nested, so that
20158 * we will only remove mappings from the pmap in use by this
20159 * task.
20160 */
20161 vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
20162 }
20163 }
20164 vm_map_unlock(map);
20165 #endif
20166 }
20167 vm_map_lock_read(map);
20168
20169 ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
20170
20171 for (entry = vm_map_first_entry(map);
20172 entry != vm_map_to_entry(map);
20173 entry = entry->vme_next) {
20174 if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
20175 (VME_OBJECT(entry)->phys_contiguous))) {
20176 continue;
20177 }
20178 if (entry->is_sub_map) {
20179 assert(!entry->use_pmap);
20180 }
20181
20182 pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
20183 }
20184 vm_map_unlock_read(map);
20185
20186 return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
20187 }
20188
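/*
 * vm_map_inject_error:
 *
 * Development/debug helper: look up the object and offset backing "vaddr"
 * and ask the compressor pager to inject an error at that offset, so that
 * error-handling paths can be exercised.  Returns KERN_MEMORY_PRESENT if
 * the page is not in the compressor, KERN_MEMORY_ERROR if nothing is
 * mapped there.
 */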
20189 kern_return_t
20190 vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
20191 {
20192 vm_object_t object = NULL;
20193 vm_object_offset_t offset;
20194 vm_prot_t prot;
20195 boolean_t wired;
20196 vm_map_version_t version;
20197 vm_map_t real_map;
20198 int result = KERN_FAILURE;
20199
20200 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
20201 vm_map_lock(map);
20202
20203 result = vm_map_lookup_locked(&map, vaddr, VM_PROT_READ,
20204 OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
20205 NULL, &real_map, NULL);
20206 if (object == NULL) {
20207 result = KERN_MEMORY_ERROR;
20208 } else if (object->pager) {
20209 result = vm_compressor_pager_inject_error(object->pager,
20210 offset);
20211 } else {
20212 result = KERN_MEMORY_PRESENT;
20213 }
20214
20215 if (object != NULL) {
20216 vm_object_unlock(object);
20217 }
20218
20219 if (real_map != map) {
20220 vm_map_unlock(real_map);
20221 }
20222 vm_map_unlock(map);
20223
20224 return result;
20225 }
20226
20227 #endif
20228
20229
20230 #if CONFIG_FREEZE
20231
20232
20233 extern struct freezer_context freezer_context_global;
20234 AbsoluteTime c_freezer_last_yield_ts = 0;
20235
20236 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
20237 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
20238
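/*
 * vm_map_freeze:
 *
 * Freezer entry point: walk the task's map, considering only internal,
 * non-physically-contiguous objects, and move their eligible pages to the
 * compressor.  When freezer swap is active, a first "evaluation" pass
 * sizes up the private/shared dirty mix (and purgeable memory) before any
 * pages are moved; 'eval_only' returns after that pass.  Counts of
 * purgeable, wired, clean, dirty and shared pages are returned through
 * the out parameters, bounded by 'dirty_budget'.
 */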
20239 kern_return_t
20240 vm_map_freeze(
20241 task_t task,
20242 unsigned int *purgeable_count,
20243 unsigned int *wired_count,
20244 unsigned int *clean_count,
20245 unsigned int *dirty_count,
20246 unsigned int dirty_budget,
20247 unsigned int *shared_count,
20248 int *freezer_error_code,
20249 boolean_t eval_only)
20250 {
20251 vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
20252 kern_return_t kr = KERN_SUCCESS;
20253 boolean_t evaluation_phase = TRUE;
20254 vm_object_t cur_shared_object = NULL;
20255 int cur_shared_obj_ref_cnt = 0;
20256 unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
20257
20258 *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
20259
20260 /*
20261 * We need the exclusive lock here so that we can
20262 * block any page faults or lookups while we are
20263 * in the middle of freezing this vm map.
20264 */
20265 vm_map_t map = task->map;
20266
20267 vm_map_lock(map);
20268
20269 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
20270
20271 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
20272 if (vm_compressor_low_on_space()) {
20273 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
20274 }
20275
20276 if (vm_swap_low_on_space()) {
20277 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
20278 }
20279
20280 kr = KERN_NO_SPACE;
20281 goto done;
20282 }
20283
20284 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
20285 /*
20286 * In-memory compressor backing the freezer. No disk.
20287 * So no need to do the evaluation phase.
20288 */
20289 evaluation_phase = FALSE;
20290
20291 if (eval_only == TRUE) {
20292 /*
20293 * We don't support 'eval_only' mode
20294 * in this non-swap config.
20295 */
20296 *freezer_error_code = FREEZER_ERROR_GENERIC;
20297 kr = KERN_INVALID_ARGUMENT;
20298 goto done;
20299 }
20300
20301 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
20302 clock_get_uptime(&c_freezer_last_yield_ts);
20303 }
20304 again:
20305
20306 for (entry2 = vm_map_first_entry(map);
20307 entry2 != vm_map_to_entry(map);
20308 entry2 = entry2->vme_next) {
20309 vm_object_t src_object;
20310
20311 if (entry2->is_sub_map) {
20312 continue;
20313 }
20314
20315 src_object = VME_OBJECT(entry2);
20316 if (!src_object ||
20317 src_object->phys_contiguous ||
20318 !src_object->internal) {
20319 continue;
20320 }
20321
20322 /* If eligible, scan the entry, moving eligible pages over to our parent object */
20323
20324 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
20325 /*
20326 * We skip purgeable objects during evaluation phase only.
20327 * If we decide to freeze this process, we'll explicitly
20328 * purge these objects before we go around again with
20329 * 'evaluation_phase' set to FALSE.
20330 */
20331
20332 if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
20333 /*
20334 * We want to purge objects that may not belong to this task but are mapped
20335 * in this task alone. Since we already purged this task's purgeable memory
20336 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
20337 * on this task's purgeable objects. Hence the check for only volatile objects.
20338 */
20339 if (evaluation_phase == FALSE &&
20340 (src_object->purgable == VM_PURGABLE_VOLATILE) &&
20341 (src_object->ref_count == 1)) {
20342 vm_object_lock(src_object);
20343 vm_object_purge(src_object, 0);
20344 vm_object_unlock(src_object);
20345 }
20346 continue;
20347 }
20348
20349 /*
20350 * Pages belonging to this object could be swapped to disk.
20351 * Make sure it's not a shared object because we could end
20352 * up just bringing it back in again.
20353 *
20354 * We try to optimize somewhat by checking for objects that are mapped
20355 * more than once within our own map. But we don't do full searches,
20356 * we just look at the entries following our current entry.
20357 */
20358
20359 if (src_object->ref_count > 1) {
20360 if (src_object != cur_shared_object) {
20361 obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
20362 dirty_shared_count += obj_pages_snapshot;
20363
20364 cur_shared_object = src_object;
20365 cur_shared_obj_ref_cnt = 1;
20366 continue;
20367 } else {
20368 cur_shared_obj_ref_cnt++;
20369 if (src_object->ref_count == cur_shared_obj_ref_cnt) {
20370 /*
20371 * Fall through to below and treat this object as private.
20372 * So deduct its pages from our shared total and add it to the
20373 * private total.
20374 */
20375
20376 dirty_shared_count -= obj_pages_snapshot;
20377 dirty_private_count += obj_pages_snapshot;
20378 } else {
20379 continue;
20380 }
20381 }
20382 }
20383
20384
20385 if (src_object->ref_count == 1) {
20386 dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
20387 }
20388
20389 if (evaluation_phase == TRUE) {
20390 continue;
20391 }
20392 }
20393
20394 uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
20395 *wired_count += src_object->wired_page_count;
20396
20397 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
20398 if (vm_compressor_low_on_space()) {
20399 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
20400 }
20401
20402 if (vm_swap_low_on_space()) {
20403 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
20404 }
20405
20406 kr = KERN_NO_SPACE;
20407 break;
20408 }
20409 if (paged_out_count >= dirty_budget) {
20410 break;
20411 }
20412 dirty_budget -= paged_out_count;
20413 }
20414
20415 *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
20416 if (evaluation_phase) {
20417 unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
20418
20419 if (dirty_shared_count > shared_pages_threshold) {
20420 *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
20421 kr = KERN_FAILURE;
20422 goto done;
20423 }
20424
20425 if (dirty_shared_count &&
20426 ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
20427 *freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
20428 kr = KERN_FAILURE;
20429 goto done;
20430 }
20431
20432 evaluation_phase = FALSE;
20433 dirty_shared_count = dirty_private_count = 0;
20434
20435 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
20436 clock_get_uptime(&c_freezer_last_yield_ts);
20437
20438 if (eval_only) {
20439 kr = KERN_SUCCESS;
20440 goto done;
20441 }
20442
20443 vm_purgeable_purge_task_owned(task);
20444
20445 goto again;
20446 } else {
20447 kr = KERN_SUCCESS;
20448 }
20449
20450 done:
20451 vm_map_unlock(map);
20452
20453 if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
20454 vm_object_compressed_freezer_done();
20455 }
20456 return kr;
20457 }
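
/*
 * Illustrative sketch (assumed example values, not tunables taken from this
 * file): how the evaluation-phase thresholds above translate into pages.
 *
 *	shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
 *	e.g. a 16MB cap with 16KB pages allows up to 1024 dirty shared pages.
 *
 *	The ratio check fails the freeze when, using integer division,
 *	dirty_private_count / dirty_shared_count < memorystatus_freeze_private_shared_pages_ratio;
 *	e.g. 100 private vs 60 shared pages fails a required ratio of 2.
 */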
20458
20459 #endif
20460
20461 /*
20462 * vm_map_entry_should_cow_for_true_share:
20463 *
20464 * Determines if the map entry should be clipped and set up for copy-on-write
20465 * to avoid applying "true_share" to a large VM object when only a subset is
20466 * targeted.
20467 *
20468 * For now, we target only the map entries created for the Objective C
20469 * Garbage Collector, which initially have the following properties:
20470 * - alias == VM_MEMORY_MALLOC
20471 * - wired_count == 0
20472 * - !needs_copy
20473 * and a VM object with:
20474 * - internal
20475 * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
20476 * - !true_share
20477 * - vo_size == ANON_CHUNK_SIZE
20478 *
20479 * Only non-kernel map entries.
20480 */
20481 boolean_t
20482 vm_map_entry_should_cow_for_true_share(
20483 vm_map_entry_t entry)
20484 {
20485 vm_object_t object;
20486
20487 if (entry->is_sub_map) {
20488 /* entry does not point at a VM object */
20489 return FALSE;
20490 }
20491
20492 if (entry->needs_copy) {
20493 /* already set for copy_on_write: done! */
20494 return FALSE;
20495 }
20496
20497 if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
20498 VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
20499 /* not a malloc heap or Obj-C Garbage Collector heap */
20500 return FALSE;
20501 }
20502
20503 if (entry->wired_count) {
20504 /* wired: can't change the map entry... */
20505 vm_counters.should_cow_but_wired++;
20506 return FALSE;
20507 }
20508
20509 object = VME_OBJECT(entry);
20510
20511 if (object == VM_OBJECT_NULL) {
20512 /* no object yet... */
20513 return FALSE;
20514 }
20515
20516 if (!object->internal) {
20517 /* not an internal object */
20518 return FALSE;
20519 }
20520
20521 if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
20522 /* not the default copy strategy */
20523 return FALSE;
20524 }
20525
20526 if (object->true_share) {
20527 /* already true_share: too late to avoid it */
20528 return FALSE;
20529 }
20530
20531 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
20532 object->vo_size != ANON_CHUNK_SIZE) {
20533 /* ... not an object created for the ObjC Garbage Collector */
20534 return FALSE;
20535 }
20536
20537 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
20538 object->vo_size != 2048 * 4096) {
20539 /* ... not a "MALLOC_SMALL" heap */
20540 return FALSE;
20541 }
20542
20543 /*
20544 * All the criteria match: we have a large object being targeted for "true_share".
20545 * To limit the adverse side-effects linked with "true_share", tell the caller to
20546 * try and avoid setting up the entire object for "true_share" by clipping the
20547 * targeted range and setting it up for copy-on-write.
20548 */
20549 return TRUE;
20550 }
20551
20552 vm_map_offset_t
20553 vm_map_round_page_mask(
20554 vm_map_offset_t offset,
20555 vm_map_offset_t mask)
20556 {
20557 return VM_MAP_ROUND_PAGE(offset, mask);
20558 }
20559
20560 vm_map_offset_t
20561 vm_map_trunc_page_mask(
20562 vm_map_offset_t offset,
20563 vm_map_offset_t mask)
20564 {
20565 return VM_MAP_TRUNC_PAGE(offset, mask);
20566 }
20567
20568 boolean_t
20569 vm_map_page_aligned(
20570 vm_map_offset_t offset,
20571 vm_map_offset_t mask)
20572 {
20573 return ((offset) & mask) == 0;
20574 }
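
/*
 * Quick worked example for the three helpers above (assuming a 16KB map page
 * size, so the mask is 0x3FFF; these values are illustrative only):
 *
 *	vm_map_round_page_mask(0x4001, 0x3FFF) == 0x8000
 *	vm_map_trunc_page_mask(0x4001, 0x3FFF) == 0x4000
 *	vm_map_page_aligned(0x4000, 0x3FFF)    == TRUE
 */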
20575
20576 int
20577 vm_map_page_shift(
20578 vm_map_t map)
20579 {
20580 return VM_MAP_PAGE_SHIFT(map);
20581 }
20582
20583 int
20584 vm_map_page_size(
20585 vm_map_t map)
20586 {
20587 return VM_MAP_PAGE_SIZE(map);
20588 }
20589
20590 vm_map_offset_t
20591 vm_map_page_mask(
20592 vm_map_t map)
20593 {
20594 return VM_MAP_PAGE_MASK(map);
20595 }
20596
20597 kern_return_t
20598 vm_map_set_page_shift(
20599 vm_map_t map,
20600 int pageshift)
20601 {
20602 if (map->hdr.nentries != 0) {
20603 /* too late to change page size */
20604 return KERN_FAILURE;
20605 }
20606
20607 map->hdr.page_shift = (uint16_t)pageshift;
20608
20609 return KERN_SUCCESS;
20610 }
20611
20612 kern_return_t
20613 vm_map_query_volatile(
20614 vm_map_t map,
20615 mach_vm_size_t *volatile_virtual_size_p,
20616 mach_vm_size_t *volatile_resident_size_p,
20617 mach_vm_size_t *volatile_compressed_size_p,
20618 mach_vm_size_t *volatile_pmap_size_p,
20619 mach_vm_size_t *volatile_compressed_pmap_size_p)
20620 {
20621 mach_vm_size_t volatile_virtual_size;
20622 mach_vm_size_t volatile_resident_count;
20623 mach_vm_size_t volatile_compressed_count;
20624 mach_vm_size_t volatile_pmap_count;
20625 mach_vm_size_t volatile_compressed_pmap_count;
20626 mach_vm_size_t resident_count;
20627 vm_map_entry_t entry;
20628 vm_object_t object;
20629
20630 /* map should be locked by caller */
20631
20632 volatile_virtual_size = 0;
20633 volatile_resident_count = 0;
20634 volatile_compressed_count = 0;
20635 volatile_pmap_count = 0;
20636 volatile_compressed_pmap_count = 0;
20637
20638 for (entry = vm_map_first_entry(map);
20639 entry != vm_map_to_entry(map);
20640 entry = entry->vme_next) {
20641 mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes;
20642
20643 if (entry->is_sub_map) {
20644 continue;
20645 }
20646 if (!(entry->protection & VM_PROT_WRITE)) {
20647 continue;
20648 }
20649 object = VME_OBJECT(entry);
20650 if (object == VM_OBJECT_NULL) {
20651 continue;
20652 }
20653 if (object->purgable != VM_PURGABLE_VOLATILE &&
20654 object->purgable != VM_PURGABLE_EMPTY) {
20655 continue;
20656 }
20657 if (VME_OFFSET(entry)) {
20658 /*
20659 * If the map entry has been split and the object now
20660 * appears several times in the VM map, we don't want
20661 * to count the object's resident_page_count more than
20662 * once. We count it only for the first one, starting
20663 * at offset 0 and ignore the other VM map entries.
20664 */
20665 continue;
20666 }
20667 resident_count = object->resident_page_count;
20668 if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
20669 resident_count = 0;
20670 } else {
20671 resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
20672 }
20673
20674 volatile_virtual_size += entry->vme_end - entry->vme_start;
20675 volatile_resident_count += resident_count;
20676 if (object->pager) {
20677 volatile_compressed_count +=
20678 vm_compressor_pager_get_count(object->pager);
20679 }
20680 pmap_compressed_bytes = 0;
20681 pmap_resident_bytes =
20682 pmap_query_resident(map->pmap,
20683 entry->vme_start,
20684 entry->vme_end,
20685 &pmap_compressed_bytes);
20686 volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
20687 volatile_compressed_pmap_count += (pmap_compressed_bytes
20688 / PAGE_SIZE);
20689 }
20690
20691 /* map is still locked on return */
20692
20693 *volatile_virtual_size_p = volatile_virtual_size;
20694 *volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
20695 *volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
20696 *volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
20697 *volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
20698
20699 return KERN_SUCCESS;
20700 }
20701
20702 void
20703 vm_map_sizes(vm_map_t map,
20704 vm_map_size_t * psize,
20705 vm_map_size_t * pfree,
20706 vm_map_size_t * plargest_free)
20707 {
20708 vm_map_entry_t entry;
20709 vm_map_offset_t prev;
20710 vm_map_size_t free, total_free, largest_free;
20711 boolean_t end;
20712
20713 if (!map) {
20714 *psize = *pfree = *plargest_free = 0;
20715 return;
20716 }
20717 total_free = largest_free = 0;
20718
20719 vm_map_lock_read(map);
20720 if (psize) {
20721 *psize = map->max_offset - map->min_offset;
20722 }
20723
20724 prev = map->min_offset;
20725 for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
20726 end = (entry == vm_map_to_entry(map));
20727
20728 if (end) {
20729 free = entry->vme_end - prev;
20730 } else {
20731 free = entry->vme_start - prev;
20732 }
20733
20734 total_free += free;
20735 if (free > largest_free) {
20736 largest_free = free;
20737 }
20738
20739 if (end) {
20740 break;
20741 }
20742 prev = entry->vme_end;
20743 }
20744 vm_map_unlock_read(map);
20745 if (pfree) {
20746 *pfree = total_free;
20747 }
20748 if (plargest_free) {
20749 *plargest_free = largest_free;
20750 }
20751 }
20752
20753 #if VM_SCAN_FOR_SHADOW_CHAIN
20754 int vm_map_shadow_max(vm_map_t map);
20755 int
20756 vm_map_shadow_max(
20757 vm_map_t map)
20758 {
20759 int shadows, shadows_max;
20760 vm_map_entry_t entry;
20761 vm_object_t object, next_object;
20762
20763 if (map == NULL) {
20764 return 0;
20765 }
20766
20767 shadows_max = 0;
20768
20769 vm_map_lock_read(map);
20770
20771 for (entry = vm_map_first_entry(map);
20772 entry != vm_map_to_entry(map);
20773 entry = entry->vme_next) {
20774 if (entry->is_sub_map) {
20775 continue;
20776 }
20777 object = VME_OBJECT(entry);
20778 if (object == NULL) {
20779 continue;
20780 }
20781 vm_object_lock_shared(object);
20782 for (shadows = 0;
20783 object->shadow != NULL;
20784 shadows++, object = next_object) {
20785 next_object = object->shadow;
20786 vm_object_lock_shared(next_object);
20787 vm_object_unlock(object);
20788 }
20789 vm_object_unlock(object);
20790 if (shadows > shadows_max) {
20791 shadows_max = shadows;
20792 }
20793 }
20794
20795 vm_map_unlock_read(map);
20796
20797 return shadows_max;
20798 }
20799 #endif /* VM_SCAN_FOR_SHADOW_CHAIN */
20800
20801 void
20802 vm_commit_pagezero_status(vm_map_t lmap)
20803 {
20804 pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
20805 }
20806
20807 #if XNU_TARGET_OS_OSX
20808 void
20809 vm_map_set_high_start(
20810 vm_map_t map,
20811 vm_map_offset_t high_start)
20812 {
20813 map->vmmap_high_start = high_start;
20814 }
20815 #endif /* XNU_TARGET_OS_OSX */
20816
20817
20818 /*
20819 * FORKED CORPSE FOOTPRINT
20820 *
20821 * A forked corpse gets a copy of the original VM map but its pmap is mostly
20822 * empty since it never ran and never got to fault in any pages.
20823 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
20824 * a forked corpse would therefore return very little information.
20825 *
20826 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
20827 * to vm_map_fork() to collect footprint information from the original VM map
20828 * and its pmap, and store it in the forked corpse's VM map. That information
20829 * is stored in place of the VM map's "hole list" since we'll never need to
20830 * look up holes in the corpse's map.
20831 *
20832 * The corpse's footprint info looks like this:
20833 *
20834 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
20835 * as follows:
20836 * +---------------------------------------+
20837 * header-> | cf_size |
20838 * +-------------------+-------------------+
20839 * | cf_last_region | cf_last_zeroes |
20840 * +-------------------+-------------------+
20841 * region1-> | cfr_vaddr |
20842 * +-------------------+-------------------+
20843 * | cfr_num_pages | d0 | d1 | d2 | d3 |
20844 * +---------------------------------------+
20845 * | d4 | d5 | ... |
20846 * +---------------------------------------+
20847 * | ... |
20848 * +-------------------+-------------------+
20849 * | dy | dz | na | na | cfr_vaddr... | <-region2
20850 * +-------------------+-------------------+
20851 * | cfr_vaddr (ctd) | cfr_num_pages |
20852 * +---------------------------------------+
20853 * | d0 | d1 ... |
20854 * +---------------------------------------+
20855 * ...
20856 * +---------------------------------------+
20857 * last region-> | cfr_vaddr |
20858 * +---------------------------------------+
20859 * + cfr_num_pages | d0 | d1 | d2 | d3 |
20860 * +---------------------------------------+
20861 * ...
20862 * +---------------------------------------+
20863 * | dx | dy | dz | na | na | na | na | na |
20864 * +---------------------------------------+
20865 *
20866 * where:
20867 * cf_size: total size of the buffer (rounded to page size)
20868 * cf_last_region: offset in the buffer of the last "region" sub-header
20869 * cf_last_zeroes: number of trailing "zero" dispositions at the end
20870 * of last region
20871 * cfr_vaddr: virtual address of the start of the covered "region"
20872 * cfr_num_pages: number of pages in the covered "region"
20873 * d*: disposition of the page at that virtual address
20874 * Regions in the buffer are word-aligned.
20875 *
20876 * We estimate the size of the buffer based on the number of memory regions
20877 * and the virtual size of the address space. While copying each memory region
20878 * during vm_map_fork(), we also collect the footprint info for that region
20879 * and store it in the buffer, packing it as much as possible (coalescing
20880 * contiguous memory regions to avoid having too many region headers and
20881 * avoiding long streaks of "zero" page dispositions by splitting footprint
20882 * "regions", so the number of regions in the footprint buffer might not match
20883 * the number of memory regions in the address space.
20884 *
20885 * We also have to copy the original task's "nonvolatile" ledgers since that's
20886 * part of the footprint and will need to be reported to any tool asking for
20887 * the footprint information of the forked corpse.
20888 */
20889
20890 uint64_t vm_map_corpse_footprint_count = 0;
20891 uint64_t vm_map_corpse_footprint_size_avg = 0;
20892 uint64_t vm_map_corpse_footprint_size_max = 0;
20893 uint64_t vm_map_corpse_footprint_full = 0;
20894 uint64_t vm_map_corpse_footprint_no_buf = 0;
20895
20896 struct vm_map_corpse_footprint_header {
20897 vm_size_t cf_size; /* allocated buffer size */
20898 uint32_t cf_last_region; /* offset of last region in buffer */
20899 union {
20900 uint32_t cfu_last_zeroes; /* during creation:
20901 * number of "zero" dispositions at
20902 * end of last region */
20903 uint32_t cfu_hint_region; /* during lookup:
20904 * offset of last looked up region */
20905 #define cf_last_zeroes cfu.cfu_last_zeroes
20906 #define cf_hint_region cfu.cfu_hint_region
20907 } cfu;
20908 };
20909 typedef uint8_t cf_disp_t;
20910 struct vm_map_corpse_footprint_region {
20911 vm_map_offset_t cfr_vaddr; /* region start virtual address */
20912 uint32_t cfr_num_pages; /* number of pages in this "region" */
20913 cf_disp_t cfr_disposition[0]; /* disposition of each page */
20914 } __attribute__((packed));
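
/*
 * Illustrative sketch of walking the footprint buffer region by region; this
 * mirrors the lookup logic in vm_map_corpse_footprint_query_page_info() below
 * and is not a separate API:
 *
 *	offset = sizeof(struct vm_map_corpse_footprint_header);
 *	while (offset <= header->cf_last_region) {
 *		region = (struct vm_map_corpse_footprint_region *)
 *		    ((char *)header + offset);
 *		// consume region->cfr_num_pages dispositions here
 *		offset += sizeof(*region) +
 *		    region->cfr_num_pages * sizeof(cf_disp_t);
 *		offset = roundup(offset, sizeof(int)); // regions are word-aligned
 *	}
 */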
20915
20916 static cf_disp_t
20917 vm_page_disposition_to_cf_disp(
20918 int disposition)
20919 {
20920 assert(sizeof(cf_disp_t) == 1);
20921 /* relocate bits that don't fit in a "uint8_t" */
20922 if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
20923 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20924 }
20925 /* cast gets rid of extra bits */
20926 return (cf_disp_t) disposition;
20927 }
20928
20929 static int
20930 vm_page_cf_disp_to_disposition(
20931 cf_disp_t cf_disp)
20932 {
20933 int disposition;
20934
20935 assert(sizeof(cf_disp_t) == 1);
20936 disposition = (int) cf_disp;
20937 /* move relocated bits back in place */
20938 if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
20939 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20940 disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
20941 }
20942 return disposition;
20943 }
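
/*
 * Illustrative round trip for the two conversions above: the "reusable" query
 * bit does not fit in the 8-bit cf_disp_t, so it is stored with the
 * "fictitious" bit standing in for it and converted back on lookup (this
 * assumes the fictitious bit is not otherwise set for these pages):
 *
 *	cf_disp_t d = vm_page_disposition_to_cf_disp(
 *	    VM_PAGE_QUERY_PAGE_PRESENT | VM_PAGE_QUERY_PAGE_REUSABLE);
 *	assert(vm_page_cf_disp_to_disposition(d) ==
 *	    (VM_PAGE_QUERY_PAGE_PRESENT | VM_PAGE_QUERY_PAGE_REUSABLE));
 */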
20944
20945 /*
20946 * vm_map_corpse_footprint_new_region:
20947 * closes the current footprint "region" and creates a new one
20948 *
20949 * Returns NULL if there's not enough space in the buffer for a new region.
20950 */
20951 static struct vm_map_corpse_footprint_region *
20952 vm_map_corpse_footprint_new_region(
20953 struct vm_map_corpse_footprint_header *footprint_header)
20954 {
20955 uintptr_t footprint_edge;
20956 uint32_t new_region_offset;
20957 struct vm_map_corpse_footprint_region *footprint_region;
20958 struct vm_map_corpse_footprint_region *new_footprint_region;
20959
20960 footprint_edge = ((uintptr_t)footprint_header +
20961 footprint_header->cf_size);
20962 footprint_region = ((struct vm_map_corpse_footprint_region *)
20963 ((char *)footprint_header +
20964 footprint_header->cf_last_region));
20965 assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
20966 footprint_edge);
20967
20968 /* get rid of trailing zeroes in the last region */
20969 assert(footprint_region->cfr_num_pages >=
20970 footprint_header->cf_last_zeroes);
20971 footprint_region->cfr_num_pages -=
20972 footprint_header->cf_last_zeroes;
20973 footprint_header->cf_last_zeroes = 0;
20974
20975 /* reuse this region if it's now empty */
20976 if (footprint_region->cfr_num_pages == 0) {
20977 return footprint_region;
20978 }
20979
20980 /* compute offset of new region */
20981 new_region_offset = footprint_header->cf_last_region;
20982 new_region_offset += sizeof(*footprint_region);
20983 new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
20984 new_region_offset = roundup(new_region_offset, sizeof(int));
20985
20986 /* check if we're going over the edge */
20987 if (((uintptr_t)footprint_header +
20988 new_region_offset +
20989 sizeof(*footprint_region)) >=
20990 footprint_edge) {
20991 /* over the edge: no new region */
20992 return NULL;
20993 }
20994
20995 /* adjust offset of last region in header */
20996 footprint_header->cf_last_region = new_region_offset;
20997
20998 new_footprint_region = (struct vm_map_corpse_footprint_region *)
20999 ((char *)footprint_header +
21000 footprint_header->cf_last_region);
21001 new_footprint_region->cfr_vaddr = 0;
21002 new_footprint_region->cfr_num_pages = 0;
21003 /* caller needs to initialize new region */
21004
21005 return new_footprint_region;
21006 }
21007
21008 /*
21009 * vm_map_corpse_footprint_collect:
21010 * collects footprint information for "old_entry" in "old_map" and
21011 * stores it in "new_map"'s vmmap_corpse_footprint.
21012 */
21013 kern_return_t
21014 vm_map_corpse_footprint_collect(
21015 vm_map_t old_map,
21016 vm_map_entry_t old_entry,
21017 vm_map_t new_map)
21018 {
21019 vm_map_offset_t va;
21020 kern_return_t kr;
21021 struct vm_map_corpse_footprint_header *footprint_header;
21022 struct vm_map_corpse_footprint_region *footprint_region;
21023 struct vm_map_corpse_footprint_region *new_footprint_region;
21024 cf_disp_t *next_disp_p;
21025 uintptr_t footprint_edge;
21026 uint32_t num_pages_tmp;
21027 int effective_page_size;
21028
21029 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
21030
21031 va = old_entry->vme_start;
21032
21033 vm_map_lock_assert_exclusive(old_map);
21034 vm_map_lock_assert_exclusive(new_map);
21035
21036 assert(new_map->has_corpse_footprint);
21037 assert(!old_map->has_corpse_footprint);
21038 if (!new_map->has_corpse_footprint ||
21039 old_map->has_corpse_footprint) {
21040 /*
21041 * This can only transfer footprint info from a
21042 * map with a live pmap to a map with a corpse footprint.
21043 */
21044 return KERN_NOT_SUPPORTED;
21045 }
21046
21047 if (new_map->vmmap_corpse_footprint == NULL) {
21048 vm_offset_t buf;
21049 vm_size_t buf_size;
21050
21051 buf = 0;
21052 buf_size = (sizeof(*footprint_header) +
21053 (old_map->hdr.nentries
21054 *
21055 (sizeof(*footprint_region) +
21056 +3)) /* potential alignment for each region */
21057 +
21058 ((old_map->size / effective_page_size)
21059 *
21060 sizeof(cf_disp_t))); /* disposition for each page */
21061 // printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
21062 buf_size = round_page(buf_size);
21063
21064 /* limit buffer to 1 page to validate overflow detection */
21065 // buf_size = PAGE_SIZE;
21066
21067 /* limit size to a somewhat sane amount */
21068 #if XNU_TARGET_OS_OSX
21069 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */
21070 #else /* XNU_TARGET_OS_OSX */
21071 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */
21072 #endif /* XNU_TARGET_OS_OSX */
21073 if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
21074 buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
21075 }
21076
21077 /*
21078 * Allocate the pageable buffer (with a trailing guard page).
21079 * It will be zero-filled on demand.
21080 */
21081 kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
21082 KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
21083 VM_KERN_MEMORY_DIAG);
21084 if (kr != KERN_SUCCESS) {
21085 vm_map_corpse_footprint_no_buf++;
21086 return kr;
21087 }
21088
21089 /* initialize header and 1st region */
21090 footprint_header = (struct vm_map_corpse_footprint_header *)buf;
21091 new_map->vmmap_corpse_footprint = footprint_header;
21092
21093 footprint_header->cf_size = buf_size;
21094 footprint_header->cf_last_region =
21095 sizeof(*footprint_header);
21096 footprint_header->cf_last_zeroes = 0;
21097
21098 footprint_region = (struct vm_map_corpse_footprint_region *)
21099 ((char *)footprint_header +
21100 footprint_header->cf_last_region);
21101 footprint_region->cfr_vaddr = 0;
21102 footprint_region->cfr_num_pages = 0;
21103 } else {
21104 /* retrieve header and last region */
21105 footprint_header = (struct vm_map_corpse_footprint_header *)
21106 new_map->vmmap_corpse_footprint;
21107 footprint_region = (struct vm_map_corpse_footprint_region *)
21108 ((char *)footprint_header +
21109 footprint_header->cf_last_region);
21110 }
21111 footprint_edge = ((uintptr_t)footprint_header +
21112 footprint_header->cf_size);
21113
21114 if ((footprint_region->cfr_vaddr +
21115 (((vm_map_offset_t)footprint_region->cfr_num_pages) *
21116 effective_page_size))
21117 != old_entry->vme_start) {
21118 uint64_t num_pages_delta, num_pages_delta_size;
21119 uint32_t region_offset_delta_size;
21120
21121 /*
21122 * Not the next contiguous virtual address:
21123 * start a new region or store "zero" dispositions for
21124 * the missing pages?
21125 */
21126 /* size of gap in actual page dispositions */
21127 num_pages_delta = ((old_entry->vme_start -
21128 footprint_region->cfr_vaddr) / effective_page_size)
21129 - footprint_region->cfr_num_pages;
21130 num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
21131 /* size of gap as a new footprint region header */
21132 region_offset_delta_size =
21133 (sizeof(*footprint_region) +
21134 roundup(((footprint_region->cfr_num_pages -
21135 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
21136 sizeof(int)) -
21137 ((footprint_region->cfr_num_pages -
21138 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
21139 // printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
21140 if (region_offset_delta_size < num_pages_delta_size ||
21141 os_add3_overflow(footprint_region->cfr_num_pages,
21142 (uint32_t) num_pages_delta,
21143 1,
21144 &num_pages_tmp)) {
21145 /*
21146 * Storing data for this gap would take more space
21147 * than inserting a new footprint region header:
21148 * let's start a new region and save space. If it's a
21149 * tie, let's avoid using a new region, since that
21150 * would require more region hops to find the right
21151 * range during lookups.
21152 *
21153 * If the current region's cfr_num_pages would overflow
21154 * if we added "zero" page dispositions for the gap,
21155 * no choice but to start a new region.
21156 */
21157 // printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
21158 new_footprint_region =
21159 vm_map_corpse_footprint_new_region(footprint_header);
21160 /* check that we're not going over the edge */
21161 if (new_footprint_region == NULL) {
21162 goto over_the_edge;
21163 }
21164 footprint_region = new_footprint_region;
21165 /* initialize new region as empty */
21166 footprint_region->cfr_vaddr = old_entry->vme_start;
21167 footprint_region->cfr_num_pages = 0;
21168 } else {
21169 /*
21170 * Store "zero" page dispositions for the missing
21171 * pages.
21172 */
21173 // printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
21174 for (; num_pages_delta > 0; num_pages_delta--) {
21175 next_disp_p = (cf_disp_t *)
21176 ((uintptr_t) footprint_region +
21177 sizeof(*footprint_region));
21178 next_disp_p += footprint_region->cfr_num_pages;
21179 /* check that we're not going over the edge */
21180 if ((uintptr_t)next_disp_p >= footprint_edge) {
21181 goto over_the_edge;
21182 }
21183 /* store "zero" disposition for this gap page */
21184 footprint_region->cfr_num_pages++;
21185 *next_disp_p = (cf_disp_t) 0;
21186 footprint_header->cf_last_zeroes++;
21187 }
21188 }
21189 }
21190
21191 for (va = old_entry->vme_start;
21192 va < old_entry->vme_end;
21193 va += effective_page_size) {
21194 int disposition;
21195 cf_disp_t cf_disp;
21196
21197 vm_map_footprint_query_page_info(old_map,
21198 old_entry,
21199 va,
21200 &disposition);
21201 cf_disp = vm_page_disposition_to_cf_disp(disposition);
21202
21203 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
21204
21205 if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
21206 /*
21207 * Ignore "zero" dispositions at start of
21208 * region: just move start of region.
21209 */
21210 footprint_region->cfr_vaddr += effective_page_size;
21211 continue;
21212 }
21213
21214 /* would region's cfr_num_pages overflow? */
21215 if (os_add_overflow(footprint_region->cfr_num_pages, 1,
21216 &num_pages_tmp)) {
21217 /* overflow: create a new region */
21218 new_footprint_region =
21219 vm_map_corpse_footprint_new_region(
21220 footprint_header);
21221 if (new_footprint_region == NULL) {
21222 goto over_the_edge;
21223 }
21224 footprint_region = new_footprint_region;
21225 footprint_region->cfr_vaddr = va;
21226 footprint_region->cfr_num_pages = 0;
21227 }
21228
21229 next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
21230 sizeof(*footprint_region));
21231 next_disp_p += footprint_region->cfr_num_pages;
21232 /* check that we're not going over the edge */
21233 if ((uintptr_t)next_disp_p >= footprint_edge) {
21234 goto over_the_edge;
21235 }
21236 /* store this disposition */
21237 *next_disp_p = cf_disp;
21238 footprint_region->cfr_num_pages++;
21239
21240 if (cf_disp != 0) {
21241 /* non-zero disp: break the current zero streak */
21242 footprint_header->cf_last_zeroes = 0;
21243 /* done */
21244 continue;
21245 }
21246
21247 /* zero disp: add to the current streak of zeroes */
21248 footprint_header->cf_last_zeroes++;
21249 if ((footprint_header->cf_last_zeroes +
21250 roundup(((footprint_region->cfr_num_pages -
21251 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
21252 (sizeof(int) - 1),
21253 sizeof(int))) <
21254 (sizeof(*footprint_header))) {
21255 /*
21256 * There are not enough trailing "zero" dispositions
21257 * (+ the extra padding we would need for the previous
21258 * region); creating a new region would not save space
21259 * at this point, so let's keep this "zero" disposition
21260 * in this region and reconsider later.
21261 */
21262 continue;
21263 }
21264 /*
21265 * Create a new region to avoid having too many consecutive
21266 * "zero" dispositions.
21267 */
21268 new_footprint_region =
21269 vm_map_corpse_footprint_new_region(footprint_header);
21270 if (new_footprint_region == NULL) {
21271 goto over_the_edge;
21272 }
21273 footprint_region = new_footprint_region;
21274 /* initialize the new region as empty ... */
21275 footprint_region->cfr_num_pages = 0;
21276 /* ... and skip this "zero" disp */
21277 footprint_region->cfr_vaddr = va + effective_page_size;
21278 }
21279
21280 return KERN_SUCCESS;
21281
21282 over_the_edge:
21283 // printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
21284 vm_map_corpse_footprint_full++;
21285 return KERN_RESOURCE_SHORTAGE;
21286 }
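
/*
 * Illustrative numbers for the gap decision above (assumed, not measured): a
 * gap of 2 unmapped pages needs only num_pages_delta_size = 2 bytes of "zero"
 * dispositions, which is cheaper than the size of a new region header plus
 * its alignment padding, so the zeroes are stored inline.  A gap of several
 * thousand pages would cost more bytes than a header, so a new region is
 * started instead.
 */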
21287
21288 /*
21289 * vm_map_corpse_footprint_collect_done:
21290 * completes the footprint collection by getting rid of any remaining
21291 * trailing "zero" dispositions and trimming the unused part of the
21292 * kernel buffer
21293 */
21294 void
21295 vm_map_corpse_footprint_collect_done(
21296 vm_map_t new_map)
21297 {
21298 struct vm_map_corpse_footprint_header *footprint_header;
21299 struct vm_map_corpse_footprint_region *footprint_region;
21300 vm_size_t buf_size, actual_size;
21301 kern_return_t kr;
21302
21303 assert(new_map->has_corpse_footprint);
21304 if (!new_map->has_corpse_footprint ||
21305 new_map->vmmap_corpse_footprint == NULL) {
21306 return;
21307 }
21308
21309 footprint_header = (struct vm_map_corpse_footprint_header *)
21310 new_map->vmmap_corpse_footprint;
21311 buf_size = footprint_header->cf_size;
21312
21313 footprint_region = (struct vm_map_corpse_footprint_region *)
21314 ((char *)footprint_header +
21315 footprint_header->cf_last_region);
21316
21317 /* get rid of trailing zeroes in last region */
21318 assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
21319 footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
21320 footprint_header->cf_last_zeroes = 0;
21321
21322 actual_size = (vm_size_t)(footprint_header->cf_last_region +
21323 sizeof(*footprint_region) +
21324 (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
21325
21326 // printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
21327 vm_map_corpse_footprint_size_avg =
21328 (((vm_map_corpse_footprint_size_avg *
21329 vm_map_corpse_footprint_count) +
21330 actual_size) /
21331 (vm_map_corpse_footprint_count + 1));
21332 vm_map_corpse_footprint_count++;
21333 if (actual_size > vm_map_corpse_footprint_size_max) {
21334 vm_map_corpse_footprint_size_max = actual_size;
21335 }
21336
21337 actual_size = round_page(actual_size);
21338 if (buf_size > actual_size) {
21339 kr = vm_deallocate(kernel_map,
21340 ((vm_address_t)footprint_header +
21341 actual_size +
21342 PAGE_SIZE), /* trailing guard page */
21343 (buf_size - actual_size));
21344 assertf(kr == KERN_SUCCESS,
21345 "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
21346 footprint_header,
21347 (uint64_t) buf_size,
21348 (uint64_t) actual_size,
21349 kr);
21350 kr = vm_protect(kernel_map,
21351 ((vm_address_t)footprint_header +
21352 actual_size),
21353 PAGE_SIZE,
21354 FALSE, /* set_maximum */
21355 VM_PROT_NONE);
21356 assertf(kr == KERN_SUCCESS,
21357 "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
21358 footprint_header,
21359 (uint64_t) buf_size,
21360 (uint64_t) actual_size,
21361 kr);
21362 }
21363
21364 footprint_header->cf_size = actual_size;
21365 }
21366
21367 /*
21368 * vm_map_corpse_footprint_query_page_info:
21369 * retrieves the disposition of the page at virtual address "vaddr"
21370 * in the forked corpse's VM map
21371 *
21372 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
21373 */
21374 kern_return_t
21375 vm_map_corpse_footprint_query_page_info(
21376 vm_map_t map,
21377 vm_map_offset_t va,
21378 int *disposition_p)
21379 {
21380 struct vm_map_corpse_footprint_header *footprint_header;
21381 struct vm_map_corpse_footprint_region *footprint_region;
21382 uint32_t footprint_region_offset;
21383 vm_map_offset_t region_start, region_end;
21384 int disp_idx;
21385 kern_return_t kr;
21386 int effective_page_size;
21387 cf_disp_t cf_disp;
21388
21389 if (!map->has_corpse_footprint) {
21390 *disposition_p = 0;
21391 kr = KERN_INVALID_ARGUMENT;
21392 goto done;
21393 }
21394
21395 footprint_header = map->vmmap_corpse_footprint;
21396 if (footprint_header == NULL) {
21397 *disposition_p = 0;
21398 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
21399 kr = KERN_INVALID_ARGUMENT;
21400 goto done;
21401 }
21402
21403 /* start looking at the hint ("cf_hint_region") */
21404 footprint_region_offset = footprint_header->cf_hint_region;
21405
21406 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
21407
21408 lookup_again:
21409 if (footprint_region_offset < sizeof(*footprint_header)) {
21410 /* hint too low: start from 1st region */
21411 footprint_region_offset = sizeof(*footprint_header);
21412 }
21413 if (footprint_region_offset >= footprint_header->cf_last_region) {
21414 /* hint too high: re-start from 1st region */
21415 footprint_region_offset = sizeof(*footprint_header);
21416 }
21417 footprint_region = (struct vm_map_corpse_footprint_region *)
21418 ((char *)footprint_header + footprint_region_offset);
21419 region_start = footprint_region->cfr_vaddr;
21420 region_end = (region_start +
21421 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
21422 effective_page_size));
21423 if (va < region_start &&
21424 footprint_region_offset != sizeof(*footprint_header)) {
21425 /* our range starts before the hint region */
21426
21427 /* reset the hint (in a racy way...) */
21428 footprint_header->cf_hint_region = sizeof(*footprint_header);
21429 /* lookup "va" again from 1st region */
21430 footprint_region_offset = sizeof(*footprint_header);
21431 goto lookup_again;
21432 }
21433
21434 while (va >= region_end) {
21435 if (footprint_region_offset >= footprint_header->cf_last_region) {
21436 break;
21437 }
21438 /* skip the region's header */
21439 footprint_region_offset += sizeof(*footprint_region);
21440 /* skip the region's page dispositions */
21441 footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
21442 /* align to next word boundary */
21443 footprint_region_offset =
21444 roundup(footprint_region_offset,
21445 sizeof(int));
21446 footprint_region = (struct vm_map_corpse_footprint_region *)
21447 ((char *)footprint_header + footprint_region_offset);
21448 region_start = footprint_region->cfr_vaddr;
21449 region_end = (region_start +
21450 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
21451 effective_page_size));
21452 }
21453 if (va < region_start || va >= region_end) {
21454 /* page not found */
21455 *disposition_p = 0;
21456 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
21457 kr = KERN_SUCCESS;
21458 goto done;
21459 }
21460
21461 /* "va" found: set the lookup hint for next lookup (in a racy way...) */
21462 footprint_header->cf_hint_region = footprint_region_offset;
21463
21464 /* get page disposition for "va" in this region */
21465 disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
21466 cf_disp = footprint_region->cfr_disposition[disp_idx];
21467 *disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
21468 kr = KERN_SUCCESS;
21469 done:
21470 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
21471 /* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
21472 DTRACE_VM4(footprint_query_page_info,
21473 vm_map_t, map,
21474 vm_map_offset_t, va,
21475 int, *disposition_p,
21476 kern_return_t, kr);
21477
21478 return kr;
21479 }
21480
21481 void
21482 vm_map_corpse_footprint_destroy(
21483 vm_map_t map)
21484 {
21485 if (map->has_corpse_footprint &&
21486 map->vmmap_corpse_footprint != 0) {
21487 struct vm_map_corpse_footprint_header *footprint_header;
21488 vm_size_t buf_size;
21489 kern_return_t kr;
21490
21491 footprint_header = map->vmmap_corpse_footprint;
21492 buf_size = footprint_header->cf_size;
21493 kr = vm_deallocate(kernel_map,
21494 (vm_offset_t) map->vmmap_corpse_footprint,
21495 ((vm_size_t) buf_size
21496 + PAGE_SIZE)); /* trailing guard page */
21497 assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
21498 map->vmmap_corpse_footprint = 0;
21499 map->has_corpse_footprint = FALSE;
21500 }
21501 }
21502
21503 /*
21504 * vm_map_copy_footprint_ledgers:
21505 * copies any ledger that's relevant to the memory footprint of "old_task"
21506 * into the forked corpse's task ("new_task")
21507 */
21508 void
21509 vm_map_copy_footprint_ledgers(
21510 task_t old_task,
21511 task_t new_task)
21512 {
21513 vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
21514 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
21515 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
21516 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
21517 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
21518 vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
21519 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
21520 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
21521 vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
21522 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
21523 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
21524 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
21525 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
21526 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
21527 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
21528 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
21529 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
21530 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
21531 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
21532 vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
21533 }
21534
21535 /*
21536 * vm_map_copy_ledger:
21537 * copy a single ledger from "old_task" to "new_task"
21538 */
21539 void
21540 vm_map_copy_ledger(
21541 task_t old_task,
21542 task_t new_task,
21543 int ledger_entry)
21544 {
21545 ledger_amount_t old_balance, new_balance, delta;
21546
21547 assert(new_task->map->has_corpse_footprint);
21548 if (!new_task->map->has_corpse_footprint) {
21549 return;
21550 }
21551
21552 /* turn off sanity checks for the ledger we're about to mess with */
21553 ledger_disable_panic_on_negative(new_task->ledger,
21554 ledger_entry);
21555
21556 /* adjust "new_task" to match "old_task" */
21557 ledger_get_balance(old_task->ledger,
21558 ledger_entry,
21559 &old_balance);
21560 ledger_get_balance(new_task->ledger,
21561 ledger_entry,
21562 &new_balance);
21563 if (new_balance == old_balance) {
21564 /* new == old: done */
21565 } else if (new_balance > old_balance) {
21566 /* new > old ==> new -= new - old */
21567 delta = new_balance - old_balance;
21568 ledger_debit(new_task->ledger,
21569 ledger_entry,
21570 delta);
21571 } else {
21572 /* new < old ==> new += old - new */
21573 delta = old_balance - new_balance;
21574 ledger_credit(new_task->ledger,
21575 ledger_entry,
21576 delta);
21577 }
21578 }
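
/*
 * Illustrative sketch (assumed balances): if the corpse's ledger entry reads
 * 5 and the original task's reads 3, the routine above debits 2 from the
 * corpse; if the corpse reads 1 and the original 3, it credits 2, so the
 * corpse ends up reporting the same footprint balance as the original task.
 */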
21579
21580 /*
21581 * vm_map_get_pmap:
21582 * returns the pmap associated with the vm_map
21583 */
21584 pmap_t
21585 vm_map_get_pmap(vm_map_t map)
21586 {
21587 return vm_map_pmap(map);
21588 }
21589
21590 #if MACH_ASSERT
21591
21592 extern int pmap_ledgers_panic;
21593 extern int pmap_ledgers_panic_leeway;
21594
21595 #define LEDGER_DRIFT(__LEDGER) \
21596 int __LEDGER##_over; \
21597 ledger_amount_t __LEDGER##_over_total; \
21598 ledger_amount_t __LEDGER##_over_max; \
21599 int __LEDGER##_under; \
21600 ledger_amount_t __LEDGER##_under_total; \
21601 ledger_amount_t __LEDGER##_under_max
21602
21603 struct {
21604 uint64_t num_pmaps_checked;
21605
21606 LEDGER_DRIFT(phys_footprint);
21607 LEDGER_DRIFT(internal);
21608 LEDGER_DRIFT(internal_compressed);
21609 LEDGER_DRIFT(external);
21610 LEDGER_DRIFT(reusable);
21611 LEDGER_DRIFT(iokit_mapped);
21612 LEDGER_DRIFT(alternate_accounting);
21613 LEDGER_DRIFT(alternate_accounting_compressed);
21614 LEDGER_DRIFT(page_table);
21615 LEDGER_DRIFT(purgeable_volatile);
21616 LEDGER_DRIFT(purgeable_nonvolatile);
21617 LEDGER_DRIFT(purgeable_volatile_compressed);
21618 LEDGER_DRIFT(purgeable_nonvolatile_compressed);
21619 LEDGER_DRIFT(tagged_nofootprint);
21620 LEDGER_DRIFT(tagged_footprint);
21621 LEDGER_DRIFT(tagged_nofootprint_compressed);
21622 LEDGER_DRIFT(tagged_footprint_compressed);
21623 LEDGER_DRIFT(network_volatile);
21624 LEDGER_DRIFT(network_nonvolatile);
21625 LEDGER_DRIFT(network_volatile_compressed);
21626 LEDGER_DRIFT(network_nonvolatile_compressed);
21627 LEDGER_DRIFT(media_nofootprint);
21628 LEDGER_DRIFT(media_footprint);
21629 LEDGER_DRIFT(media_nofootprint_compressed);
21630 LEDGER_DRIFT(media_footprint_compressed);
21631 LEDGER_DRIFT(graphics_nofootprint);
21632 LEDGER_DRIFT(graphics_footprint);
21633 LEDGER_DRIFT(graphics_nofootprint_compressed);
21634 LEDGER_DRIFT(graphics_footprint_compressed);
21635 LEDGER_DRIFT(neural_nofootprint);
21636 LEDGER_DRIFT(neural_footprint);
21637 LEDGER_DRIFT(neural_nofootprint_compressed);
21638 LEDGER_DRIFT(neural_footprint_compressed);
21639 } pmap_ledgers_drift;
21640
21641 void
21642 vm_map_pmap_check_ledgers(
21643 pmap_t pmap,
21644 ledger_t ledger,
21645 int pid,
21646 char *procname)
21647 {
21648 ledger_amount_t bal;
21649 boolean_t do_panic;
21650
21651 do_panic = FALSE;
21652
21653 pmap_ledgers_drift.num_pmaps_checked++;
21654
21655 #define LEDGER_CHECK_BALANCE(__LEDGER) \
21656 MACRO_BEGIN \
21657 int panic_on_negative = TRUE; \
21658 ledger_get_balance(ledger, \
21659 task_ledgers.__LEDGER, \
21660 &bal); \
21661 ledger_get_panic_on_negative(ledger, \
21662 task_ledgers.__LEDGER, \
21663 &panic_on_negative); \
21664 if (bal != 0) { \
21665 if (panic_on_negative || \
21666 (pmap_ledgers_panic && \
21667 pmap_ledgers_panic_leeway > 0 && \
21668 (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \
21669 bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
21670 do_panic = TRUE; \
21671 } \
21672 printf("LEDGER BALANCE proc %d (%s) " \
21673 "\"%s\" = %lld\n", \
21674 pid, procname, #__LEDGER, bal); \
21675 if (bal > 0) { \
21676 pmap_ledgers_drift.__LEDGER##_over++; \
21677 pmap_ledgers_drift.__LEDGER##_over_total += bal; \
21678 if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
21679 pmap_ledgers_drift.__LEDGER##_over_max = bal; \
21680 } \
21681 } else if (bal < 0) { \
21682 pmap_ledgers_drift.__LEDGER##_under++; \
21683 pmap_ledgers_drift.__LEDGER##_under_total += bal; \
21684 if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
21685 pmap_ledgers_drift.__LEDGER##_under_max = bal; \
21686 } \
21687 } \
21688 } \
21689 MACRO_END
21690
21691 LEDGER_CHECK_BALANCE(phys_footprint);
21692 LEDGER_CHECK_BALANCE(internal);
21693 LEDGER_CHECK_BALANCE(internal_compressed);
21694 LEDGER_CHECK_BALANCE(external);
21695 LEDGER_CHECK_BALANCE(reusable);
21696 LEDGER_CHECK_BALANCE(iokit_mapped);
21697 LEDGER_CHECK_BALANCE(alternate_accounting);
21698 LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
21699 LEDGER_CHECK_BALANCE(page_table);
21700 LEDGER_CHECK_BALANCE(purgeable_volatile);
21701 LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
21702 LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
21703 LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
21704 LEDGER_CHECK_BALANCE(tagged_nofootprint);
21705 LEDGER_CHECK_BALANCE(tagged_footprint);
21706 LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
21707 LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
21708 LEDGER_CHECK_BALANCE(network_volatile);
21709 LEDGER_CHECK_BALANCE(network_nonvolatile);
21710 LEDGER_CHECK_BALANCE(network_volatile_compressed);
21711 LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
21712 LEDGER_CHECK_BALANCE(media_nofootprint);
21713 LEDGER_CHECK_BALANCE(media_footprint);
21714 LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
21715 LEDGER_CHECK_BALANCE(media_footprint_compressed);
21716 LEDGER_CHECK_BALANCE(graphics_nofootprint);
21717 LEDGER_CHECK_BALANCE(graphics_footprint);
21718 LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
21719 LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
21720 LEDGER_CHECK_BALANCE(neural_nofootprint);
21721 LEDGER_CHECK_BALANCE(neural_footprint);
21722 LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
21723 LEDGER_CHECK_BALANCE(neural_footprint_compressed);
21724
21725 if (do_panic) {
21726 if (pmap_ledgers_panic) {
21727 panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
21728 pmap, pid, procname);
21729 } else {
21730 printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
21731 pmap, pid, procname);
21732 }
21733 }
21734 }
21735 #endif /* MACH_ASSERT */
21736