1 /*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_map.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Virtual memory mapping module.
64 */
65
66 #include <mach_assert.h>
67
68 #include <vm/vm_options.h>
69
70 #include <libkern/OSAtomic.h>
71
72 #include <mach/kern_return.h>
73 #include <mach/port.h>
74 #include <mach/vm_attributes.h>
75 #include <mach/vm_param.h>
76 #include <mach/vm_behavior.h>
77 #include <mach/vm_statistics.h>
78 #include <mach/memory_object.h>
79 #include <mach/mach_vm.h>
80 #include <machine/cpu_capabilities.h>
81 #include <mach/sdt.h>
82
83 #include <kern/assert.h>
84 #include <kern/backtrace.h>
85 #include <kern/counter.h>
86 #include <kern/exc_guard.h>
87 #include <kern/kalloc.h>
88 #include <kern/zalloc_internal.h>
89
90 #include <vm/cpm.h>
91 #include <vm/vm_compressor.h>
92 #include <vm/vm_compressor_pager.h>
93 #include <vm/vm_init.h>
94 #include <vm/vm_fault.h>
95 #include <vm/vm_map_internal.h>
96 #include <vm/vm_object.h>
97 #include <vm/vm_page.h>
98 #include <vm/vm_pageout.h>
99 #include <vm/pmap.h>
100 #include <vm/vm_kern.h>
101 #include <ipc/ipc_port.h>
102 #include <kern/sched_prim.h>
103 #include <kern/misc_protos.h>
104
105 #include <mach/vm_map_server.h>
106 #include <mach/mach_host_server.h>
107 #include <vm/vm_protos.h>
108 #include <vm/vm_purgeable_internal.h>
109
110 #include <vm/vm_protos.h>
111 #include <vm/vm_shared_region.h>
112 #include <vm/vm_map_store.h>
113
114 #include <san/kasan.h>
115
116 #include <sys/resource.h>
117 #include <sys/codesign.h>
118 #include <sys/mman.h>
119 #include <sys/reboot.h>
120 #include <sys/kdebug_triage.h>
121
122 #if __LP64__
123 #define HAVE_VM_MAP_RESERVED_ENTRY_ZONE 0
124 #else
125 #define HAVE_VM_MAP_RESERVED_ENTRY_ZONE 1
126 #endif
127
128 #include <libkern/section_keywords.h>
129 #if DEVELOPMENT || DEBUG
130 extern int proc_selfcsflags(void);
131 int panic_on_unsigned_execute = 0;
132 int panic_on_mlock_failure = 0;
133 #endif /* DEVELOPMENT || DEBUG */
134
135 #if MACH_ASSERT
136 int debug4k_filter = 0;
137 char debug4k_proc_name[1024] = "";
138 int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
139 int debug4k_panic_on_misaligned_sharing = 0;
140 const char *debug4k_category_name[] = {
141 "error", /* 0 */
142 "life", /* 1 */
143 "load", /* 2 */
144 "fault", /* 3 */
145 "copy", /* 4 */
146 "share", /* 5 */
147 "adjust", /* 6 */
148 "pmap", /* 7 */
149 "mementry", /* 8 */
150 "iokit", /* 9 */
151 "upl", /* 10 */
152 "exc", /* 11 */
153 "vfs" /* 12 */
154 };
155 #endif /* MACH_ASSERT */
156 int debug4k_no_cow_copyin = 0;
157
158
159 #if __arm64__
160 extern const int fourk_binary_compatibility_unsafe;
161 extern const int fourk_binary_compatibility_allow_wx;
162 #endif /* __arm64__ */
163 extern int proc_selfpid(void);
164 extern char *proc_name_address(void *p);
165
166 #if VM_MAP_DEBUG_APPLE_PROTECT
167 int vm_map_debug_apple_protect = 0;
168 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
169 #if VM_MAP_DEBUG_FOURK
170 int vm_map_debug_fourk = 0;
171 #endif /* VM_MAP_DEBUG_FOURK */
172
173 #if DEBUG || DEVELOPMENT
174 static TUNABLE(bool, vm_map_executable_immutable,
175 "vm_map_executable_immutable", true);
176 #else
177 #define vm_map_executable_immutable true
178 #endif
179
180 os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
181
182 extern u_int32_t random(void); /* from <libkern/libkern.h> */
183 /* Internal prototypes
184 */
185
186 typedef struct vm_map_zap {
187 vm_map_entry_t vmz_head;
188 vm_map_entry_t *vmz_tail;
189 } *vm_map_zap_t;
190
191 #define VM_MAP_ZAP_DECLARE(zap) \
192 struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
193
194 static vm_map_entry_t vm_map_entry_insert(
195 vm_map_t map,
196 vm_map_entry_t insp_entry,
197 vm_map_offset_t start,
198 vm_map_offset_t end,
199 vm_object_t object,
200 vm_object_offset_t offset,
201 vm_map_kernel_flags_t vmk_flags,
202 boolean_t needs_copy,
203 vm_prot_t cur_protection,
204 vm_prot_t max_protection,
205 vm_inherit_t inheritance,
206 boolean_t no_cache,
207 boolean_t permanent,
208 boolean_t no_copy_on_read,
209 unsigned int superpage_size,
210 boolean_t clear_map_aligned,
211 boolean_t is_submap,
212 boolean_t used_for_jit,
213 int alias,
214 boolean_t translated_allow_execute);
215
216 static void vm_map_simplify_range(
217 vm_map_t map,
218 vm_map_offset_t start,
219 vm_map_offset_t end); /* forward */
220
221 static boolean_t vm_map_range_check(
222 vm_map_t map,
223 vm_map_offset_t start,
224 vm_map_offset_t end,
225 vm_map_entry_t *entry);
226
227 static void vm_map_submap_pmap_clean(
228 vm_map_t map,
229 vm_map_offset_t start,
230 vm_map_offset_t end,
231 vm_map_t sub_map,
232 vm_map_offset_t offset);
233
234 static void vm_map_pmap_enter(
235 vm_map_t map,
236 vm_map_offset_t addr,
237 vm_map_offset_t end_addr,
238 vm_object_t object,
239 vm_object_offset_t offset,
240 vm_prot_t protection);
241
242 static void _vm_map_clip_end(
243 struct vm_map_header *map_header,
244 vm_map_entry_t entry,
245 vm_map_offset_t end);
246
247 static void _vm_map_clip_start(
248 struct vm_map_header *map_header,
249 vm_map_entry_t entry,
250 vm_map_offset_t start);
251
252 static kern_return_t vm_map_delete(
253 vm_map_t map,
254 vm_map_offset_t start,
255 vm_map_offset_t end,
256 vmr_flags_t flags,
257 vm_map_zap_t zap);
258
259 static void vm_map_copy_insert(
260 vm_map_t map,
261 vm_map_entry_t after_where,
262 vm_map_copy_t copy);
263
264 static kern_return_t vm_map_copy_overwrite_unaligned(
265 vm_map_t dst_map,
266 vm_map_entry_t entry,
267 vm_map_copy_t copy,
268 vm_map_address_t start,
269 boolean_t discard_on_success);
270
271 static kern_return_t vm_map_copy_overwrite_aligned(
272 vm_map_t dst_map,
273 vm_map_entry_t tmp_entry,
274 vm_map_copy_t copy,
275 vm_map_offset_t start,
276 pmap_t pmap);
277
278 static kern_return_t vm_map_copyin_kernel_buffer(
279 vm_map_t src_map,
280 vm_map_address_t src_addr,
281 vm_map_size_t len,
282 boolean_t src_destroy,
283 vm_map_copy_t *copy_result); /* OUT */
284
285 static kern_return_t vm_map_copyout_kernel_buffer(
286 vm_map_t map,
287 vm_map_address_t *addr, /* IN/OUT */
288 vm_map_copy_t copy,
289 vm_map_size_t copy_size,
290 boolean_t overwrite,
291 boolean_t consume_on_success);
292
293 static void vm_map_fork_share(
294 vm_map_t old_map,
295 vm_map_entry_t old_entry,
296 vm_map_t new_map);
297
298 static boolean_t vm_map_fork_copy(
299 vm_map_t old_map,
300 vm_map_entry_t *old_entry_p,
301 vm_map_t new_map,
302 int vm_map_copyin_flags);
303
304 static kern_return_t vm_map_wire_nested(
305 vm_map_t map,
306 vm_map_offset_t start,
307 vm_map_offset_t end,
308 vm_prot_t caller_prot,
309 vm_tag_t tag,
310 boolean_t user_wire,
311 pmap_t map_pmap,
312 vm_map_offset_t pmap_addr,
313 ppnum_t *physpage_p);
314
315 static kern_return_t vm_map_unwire_nested(
316 vm_map_t map,
317 vm_map_offset_t start,
318 vm_map_offset_t end,
319 boolean_t user_wire,
320 pmap_t map_pmap,
321 vm_map_offset_t pmap_addr);
322
323 static kern_return_t vm_map_overwrite_submap_recurse(
324 vm_map_t dst_map,
325 vm_map_offset_t dst_addr,
326 vm_map_size_t dst_size);
327
328 static kern_return_t vm_map_copy_overwrite_nested(
329 vm_map_t dst_map,
330 vm_map_offset_t dst_addr,
331 vm_map_copy_t copy,
332 boolean_t interruptible,
333 pmap_t pmap,
334 boolean_t discard_on_success);
335
336 static kern_return_t vm_map_remap_extract(
337 vm_map_t map,
338 vm_map_offset_t addr,
339 vm_map_size_t size,
340 boolean_t copy,
341 struct vm_map_header *map_header,
342 vm_prot_t *cur_protection,
343 vm_prot_t *max_protection,
344 vm_inherit_t inheritance,
345 vm_map_kernel_flags_t vmk_flags);
346
347 static kern_return_t vm_map_remap_range_allocate(
348 vm_map_t map,
349 vm_map_address_t *address,
350 vm_map_size_t size,
351 vm_map_offset_t mask,
352 int flags,
353 vm_map_kernel_flags_t vmk_flags,
354 vm_tag_t tag,
355 vm_map_entry_t *map_entry,
356 vm_map_zap_t zap_list);
357
358 static void vm_map_region_look_for_page(
359 vm_map_t map,
360 vm_map_offset_t va,
361 vm_object_t object,
362 vm_object_offset_t offset,
363 int max_refcnt,
364 unsigned short depth,
365 vm_region_extended_info_t extended,
366 mach_msg_type_number_t count);
367
368 static int vm_map_region_count_obj_refs(
369 vm_map_entry_t entry,
370 vm_object_t object);
371
372
373 static kern_return_t vm_map_willneed(
374 vm_map_t map,
375 vm_map_offset_t start,
376 vm_map_offset_t end);
377
378 static kern_return_t vm_map_reuse_pages(
379 vm_map_t map,
380 vm_map_offset_t start,
381 vm_map_offset_t end);
382
383 static kern_return_t vm_map_reusable_pages(
384 vm_map_t map,
385 vm_map_offset_t start,
386 vm_map_offset_t end);
387
388 static kern_return_t vm_map_can_reuse(
389 vm_map_t map,
390 vm_map_offset_t start,
391 vm_map_offset_t end);
392
393 #if MACH_ASSERT
394 static kern_return_t vm_map_pageout(
395 vm_map_t map,
396 vm_map_offset_t start,
397 vm_map_offset_t end);
398 #endif /* MACH_ASSERT */
399
400 kern_return_t vm_map_corpse_footprint_collect(
401 vm_map_t old_map,
402 vm_map_entry_t old_entry,
403 vm_map_t new_map);
404 void vm_map_corpse_footprint_collect_done(
405 vm_map_t new_map);
406 void vm_map_corpse_footprint_destroy(
407 vm_map_t map);
408 kern_return_t vm_map_corpse_footprint_query_page_info(
409 vm_map_t map,
410 vm_map_offset_t va,
411 int *disposition_p);
412 void vm_map_footprint_query_page_info(
413 vm_map_t map,
414 vm_map_entry_t map_entry,
415 vm_map_offset_t curr_s_offset,
416 int *disposition_p);
417
418 pid_t find_largest_process_vm_map_entries(void);
419
420 extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
421 mach_exception_data_type_t subcode);
422
423 /*
424 * Macros to copy a vm_map_entry. We must be careful to correctly
425 * manage the wired page count. vm_map_entry_copy() creates a new
426 * map entry to the same memory - the wired count in the new entry
427 * must be set to zero. vm_map_entry_copy_full() creates a new
428 * entry that is identical to the old entry. This preserves the
429 * wire count; it's used for map splitting and zone changing in
430 * vm_map_copyout.
431 */
432
/*
 * vm_map_entry_copy_pmap_cs_assoc:
 *	Called from vm_map_entry_copy() after the raw copy.  In this
 *	build (pmap_cs compiled out) there is nothing to transfer, so we
 *	only sanity-check that the copied entry carries no pmap_cs
 *	association.
 */
static inline void
vm_map_entry_copy_pmap_cs_assoc(
	vm_map_t map __unused,
	vm_map_entry_t new __unused,
	vm_map_entry_t old __unused)
{
	/* when pmap_cs is not enabled, assert as a sanity check */
	assert(new->pmap_cs_associated == FALSE);
}
442
443 /*
444 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
445 * But for security reasons on some platforms, we don't want the
446 * new mapping to be "used for jit", so we reset the flag here.
447 */
448 static inline void
vm_map_entry_copy_code_signing(vm_map_t map,vm_map_entry_t new,vm_map_entry_t old __unused)449 vm_map_entry_copy_code_signing(
450 vm_map_t map,
451 vm_map_entry_t new,
452 vm_map_entry_t old __unused)
453 {
454 if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
455 assert(new->used_for_jit == old->used_for_jit);
456 } else {
457 new->used_for_jit = FALSE;
458 }
459 }
460
/*
 * vm_map_entry_copy_full:
 *	Copy "old" into "new" in its entirety, preserving the wired
 *	counts (unlike vm_map_entry_copy()); used for map splitting and
 *	zone changing in vm_map_copyout (see block comment above).
 */
static inline void
vm_map_entry_copy_full(
	vm_map_entry_t new,
	vm_map_entry_t old)
{
#if MAP_ENTRY_CREATION_DEBUG
	/*
	 * Drop the backtrace ref "new" currently holds; the struct copy
	 * below duplicates old's btref into "new", so take an extra
	 * retain on it to account for the second holder.
	 */
	btref_put(new->vme_creation_bt);
	btref_retain(old->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(new->vme_insertion_bt);
	btref_retain(old->vme_insertion_bt);
#endif
	*new = *old;
}
476
/*
 * vm_map_entry_copy:
 *	Copy "old" into "new" for a new mapping of the same memory.
 *	The wired counts in the copy must start at zero, and various
 *	per-mapping attributes (sharing, transition, permanence,
 *	accounting, resilient/atomic flags) must not be inherited.
 */
static inline void
vm_map_entry_copy(
	vm_map_t map,
	vm_map_entry_t new,
	vm_map_entry_t old)
{
	vm_map_entry_copy_full(new, old);

	/* the copy starts out unshared, quiescent and unwired */
	new->is_shared = FALSE;
	new->needs_wakeup = FALSE;
	new->in_transition = FALSE;
	new->wired_count = 0;
	new->user_wired_count = 0;
	new->permanent = FALSE;
	vm_map_entry_copy_code_signing(map, new, old);
	vm_map_entry_copy_pmap_cs_assoc(map, new, old);
	if (new->iokit_acct) {
		/* IOKit accounting does not follow the copy: revert the
		 * new entry to regular pmap-based accounting */
		assertf(!new->use_pmap, "old %p new %p\n", old, new);
		new->iokit_acct = FALSE;
		new->use_pmap = TRUE;
	}
	new->vme_resilient_codesign = FALSE;
	new->vme_resilient_media = FALSE;
	new->vme_atomic = FALSE;
	new->vme_no_copy_on_read = FALSE;
}
503
504 /*
505 * Normal lock_read_to_write() returns FALSE/0 on failure.
506 * These functions evaluate to zero on success and non-zero value on failure.
507 */
508 __attribute__((always_inline))
509 int
vm_map_lock_read_to_write(vm_map_t map)510 vm_map_lock_read_to_write(vm_map_t map)
511 {
512 if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
513 DTRACE_VM(vm_map_lock_upgrade);
514 return 0;
515 }
516 return 1;
517 }
518
519 __attribute__((always_inline))
520 boolean_t
vm_map_try_lock(vm_map_t map)521 vm_map_try_lock(vm_map_t map)
522 {
523 if (lck_rw_try_lock_exclusive(&(map)->lock)) {
524 DTRACE_VM(vm_map_lock_w);
525 return TRUE;
526 }
527 return FALSE;
528 }
529
530 __attribute__((always_inline))
531 boolean_t
vm_map_try_lock_read(vm_map_t map)532 vm_map_try_lock_read(vm_map_t map)
533 {
534 if (lck_rw_try_lock_shared(&(map)->lock)) {
535 DTRACE_VM(vm_map_lock_r);
536 return TRUE;
537 }
538 return FALSE;
539 }
540
541 /*
542 * Routines to get the page size the caller should
543 * use while inspecting the target address space.
544 * Use the "_safely" variant if the caller is dealing with a user-provided
545 * array whose size depends on the page size, to avoid any overflow or
546 * underflow of a user-allocated buffer.
547 */
548 int
vm_self_region_page_shift_safely(vm_map_t target_map)549 vm_self_region_page_shift_safely(
550 vm_map_t target_map)
551 {
552 int effective_page_shift = 0;
553
554 if (PAGE_SIZE == (4096)) {
555 /* x86_64 and 4k watches: always use 4k */
556 return PAGE_SHIFT;
557 }
558 /* did caller provide an explicit page size for this thread to use? */
559 effective_page_shift = thread_self_region_page_shift();
560 if (effective_page_shift) {
561 /* use the explicitly-provided page size */
562 return effective_page_shift;
563 }
564 /* no explicit page size: use the caller's page size... */
565 effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
566 if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
567 /* page size match: safe to use */
568 return effective_page_shift;
569 }
570 /* page size mismatch */
571 return -1;
572 }
573 int
vm_self_region_page_shift(vm_map_t target_map)574 vm_self_region_page_shift(
575 vm_map_t target_map)
576 {
577 int effective_page_shift;
578
579 effective_page_shift = vm_self_region_page_shift_safely(target_map);
580 if (effective_page_shift == -1) {
581 /* no safe value but OK to guess for caller */
582 effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
583 VM_MAP_PAGE_SHIFT(target_map));
584 }
585 return effective_page_shift;
586 }
587
588
589 /*
590 * Decide if we want to allow processes to execute from their data or stack areas.
591 * override_nx() returns true if we do. Data/stack execution can be enabled independently
592 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
593 * or allow_stack_exec to enable data execution for that type of data area for that particular
594 * ABI (or both by or'ing the flags together). These are initialized in the architecture
595 * specific pmap files since the default behavior varies according to architecture. The
596 * main reason it varies is because of the need to provide binary compatibility with old
597 * applications that were written before these restrictions came into being. In the old
598 * days, an app could execute anything it could read, but this has slowly been tightened
599 * up over time. The default behavior is:
600 *
601 * 32-bit PPC apps may execute from both stack and data areas
 * 32-bit Intel apps may execute from data areas but not stack
603 * 64-bit PPC/Intel apps may not execute from either data or stack
604 *
605 * An application on any architecture may override these defaults by explicitly
606 * adding PROT_EXEC permission to the page in question with the mprotect(2)
607 * system call. This code here just determines what happens when an app tries to
608 * execute from a page that lacks execute permission.
609 *
610 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
611 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
612 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
613 * execution from data areas for a particular binary even if the arch normally permits it. As
614 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
615 * to support some complicated use cases, notably browsers with out-of-process plugins that
616 * are not all NX-safe.
617 */
618
619 extern int allow_data_exec, allow_stack_exec;
620
621 int
override_nx(vm_map_t map,uint32_t user_tag)622 override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
623 {
624 int current_abi;
625
626 if (map->pmap == kernel_pmap) {
627 return FALSE;
628 }
629
630 /*
631 * Determine if the app is running in 32 or 64 bit mode.
632 */
633
634 if (vm_map_is_64bit(map)) {
635 current_abi = VM_ABI_64;
636 } else {
637 current_abi = VM_ABI_32;
638 }
639
640 /*
641 * Determine if we should allow the execution based on whether it's a
642 * stack or data area and the current architecture.
643 */
644
645 if (user_tag == VM_MEMORY_STACK) {
646 return allow_stack_exec & current_abi;
647 }
648
649 return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
650 }
651
652
653 /*
654 * Virtual memory maps provide for the mapping, protection,
655 * and sharing of virtual memory objects. In addition,
656 * this module provides for an efficient virtual copy of
657 * memory from one map to another.
658 *
659 * Synchronization is required prior to most operations.
660 *
661 * Maps consist of an ordered doubly-linked list of simple
662 * entries; a single hint is used to speed up lookups.
663 *
664 * Sharing maps have been deleted from this version of Mach.
665 * All shared objects are now mapped directly into the respective
666 * maps. This requires a change in the copy on write strategy;
667 * the asymmetric (delayed) strategy is used for shared temporary
668 * objects instead of the symmetric (shadow) strategy. All maps
669 * are now "top level" maps (either task map, kernel map or submap
670 * of the kernel map).
671 *
 * Since portions of maps are specified by start/end addresses,
673 * which may not align with existing map entries, all
674 * routines merely "clip" entries to these start/end values.
675 * [That is, an entry is split into two, bordering at a
676 * start or end value.] Note that these clippings may not
677 * always be necessary (as the two resulting entries are then
678 * not changed); however, the clipping is done for convenience.
679 * No attempt is currently made to "glue back together" two
680 * abutting entries.
681 *
682 * The symmetric (shadow) copy strategy implements virtual copy
683 * by copying VM object references from one map to
684 * another, and then marking both regions as copy-on-write.
685 * It is important to note that only one writeable reference
686 * to a VM object region exists in any map when this strategy
687 * is used -- this means that shadow object creation can be
688 * delayed until a write operation occurs. The symmetric (delayed)
689 * strategy allows multiple maps to have writeable references to
690 * the same region of a vm object, and hence cannot delay creating
691 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
692 * Copying of permanent objects is completely different; see
693 * vm_object_copy_strategically() in vm_object.c.
694 */
695
696 static SECURITY_READ_ONLY_LATE(zone_t) vm_map_zone; /* zone for vm_map structures */
697 static SECURITY_READ_ONLY_LATE(zone_t) vm_map_copy_zone; /* zone for vm_map_copy structures */
698
699 SECURITY_READ_ONLY_LATE(zone_t) vm_map_entry_zone; /* zone for vm_map_entry structures */
700 SECURITY_READ_ONLY_LATE(zone_t) vm_map_holes_zone; /* zone for vm map holes (vm_map_links) structures */
701 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
702 SECURITY_READ_ONLY_LATE(zone_t) vm_map_entry_reserved_zone;
703 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
704
705 #define VM_MAP_ZONE_NAME "maps"
706 #define VM_MAP_ZFLAGS ( \
707 ZC_NOENCRYPT | \
708 ZC_NOGZALLOC | \
709 ZC_VM_LP64)
710
711 #define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
712 #define VM_MAP_ENTRY_ZFLAGS ( \
713 ZC_NOENCRYPT | \
714 ZC_CACHING | \
715 ZC_NOGZALLOC | \
716 ZC_KASAN_NOQUARANTINE | \
717 ZC_VM_LP64)
718
719 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
720 #define VM_MAP_ENTRY_RESERVED_ZONE_NAME "Reserved VM map entries"
721 #define VM_MAP_ENTRY_RESERVED_ZFLAGS ( \
722 ZC_NOENCRYPT | \
723 ZC_NOCACHING | \
724 ZC_NOGZALLOC | \
725 ZC_KASAN_NOQUARANTINE | \
726 ZC_VM)
727 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
728
729 #define VM_MAP_HOLES_ZONE_NAME "VM map holes"
730 #define VM_MAP_HOLES_ZFLAGS ( \
731 ZC_NOENCRYPT | \
732 ZC_CACHING | \
733 ZC_NOGZALLOC | \
734 ZC_KASAN_NOQUARANTINE | \
735 ZC_VM_LP64)
736
737 /*
738 * Asserts that a vm_map_copy object is coming from the
739 * vm_map_copy_zone to ensure that it isn't a fake constructed
740 * anywhere else.
741 */
void
vm_map_copy_require(struct vm_map_copy *copy)
{
	/* verify "copy" was allocated from the vm_map_copy zone
	 * (see comment above); fails the check otherwise */
	zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
}
747
748 /*
749 * vm_map_require:
750 *
751 * Ensures that the argument is memory allocated from the genuine
752 * vm map zone. (See zone_id_require_allow_foreign).
753 */
void
vm_map_require(vm_map_t map)
{
	/* verify "map" was allocated from the genuine vm map zone */
	zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
}
759
760 #define VM_MAP_EARLY_COUNT_MAX 16
761 static __startup_data vm_offset_t map_data;
762 static __startup_data vm_size_t map_data_size;
763 static __startup_data vm_offset_t kentry_data;
764 static __startup_data vm_size_t kentry_data_size;
765 static __startup_data vm_offset_t map_holes_data;
766 static __startup_data vm_size_t map_holes_data_size;
767 static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
768 static __startup_data uint32_t early_map_count;
769
770 #if XNU_TARGET_OS_OSX
771 #define NO_COALESCE_LIMIT ((1024 * 128) - 1)
772 #else /* XNU_TARGET_OS_OSX */
773 #define NO_COALESCE_LIMIT 0
774 #endif /* XNU_TARGET_OS_OSX */
775
776 /* Skip acquiring locks if we're in the midst of a kernel core dump */
777 unsigned int not_in_kdp = 1;
778
779 unsigned int vm_map_set_cache_attr_count = 0;
780
781 kern_return_t
vm_map_set_cache_attr(vm_map_t map,vm_map_offset_t va)782 vm_map_set_cache_attr(
783 vm_map_t map,
784 vm_map_offset_t va)
785 {
786 vm_map_entry_t map_entry;
787 vm_object_t object;
788 kern_return_t kr = KERN_SUCCESS;
789
790 vm_map_lock_read(map);
791
792 if (!vm_map_lookup_entry(map, va, &map_entry) ||
793 map_entry->is_sub_map) {
794 /*
795 * that memory is not properly mapped
796 */
797 kr = KERN_INVALID_ARGUMENT;
798 goto done;
799 }
800 object = VME_OBJECT(map_entry);
801
802 if (object == VM_OBJECT_NULL) {
803 /*
804 * there should be a VM object here at this point
805 */
806 kr = KERN_INVALID_ARGUMENT;
807 goto done;
808 }
809 vm_object_lock(object);
810 object->set_cache_attr = TRUE;
811 vm_object_unlock(object);
812
813 vm_map_set_cache_attr_count++;
814 done:
815 vm_map_unlock_read(map);
816
817 return kr;
818 }
819
820
821 #if CONFIG_CODE_DECRYPTION
822 /*
823 * vm_map_apple_protected:
824 * This remaps the requested part of the object with an object backed by
825 * the decrypting pager.
826 * crypt_info contains entry points and session data for the crypt module.
827 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
828 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
829 */
/*
 * vm_map_apple_protected:
 *	Remap [start, end) in "map" with memory objects backed by the
 *	decrypting ("apple protect") pager, one per underlying map entry.
 *	A crypto_backing_offset of (vm_object_offset_t)-1 means "use the
 *	entry's own VM object offset".  See the header comment above for
 *	the lifetime requirements on "crypt_info".
 *
 *	Returns KERN_SUCCESS, KERN_INVALID_ARGUMENT if part of the range
 *	is not suitably mapped, or KERN_FAILURE if the pager cannot be
 *	set up.
 */
kern_return_t
vm_map_apple_protected(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_object_offset_t crypto_backing_offset,
	struct pager_crypt_info *crypt_info,
	uint32_t cryptid)
{
	boolean_t map_locked;
	kern_return_t kr;
	vm_map_entry_t map_entry;
	struct vm_map_entry tmp_entry;
	memory_object_t unprotected_mem_obj;
	vm_object_t protected_object;
	vm_map_offset_t map_addr;
	vm_map_offset_t start_aligned, end_aligned;
	vm_object_offset_t crypto_start, crypto_end;
	int vm_flags;
	vm_map_kernel_flags_t vmk_flags;
	boolean_t cache_pager;

	vm_flags = 0;
	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;

	map_locked = FALSE;
	unprotected_mem_obj = MEMORY_OBJECT_NULL;

	/* align the range to both the kernel and the map's page size */
	start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
	end_aligned = vm_map_round_page(end, PAGE_MASK_64);
	start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
	end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));

#if __arm64__
	/*
	 * "start" and "end" might be 4K-aligned but not 16K-aligned,
	 * so we might have to loop and establish up to 3 mappings:
	 *
	 * + the first 16K-page, which might overlap with the previous
	 *   4K-aligned mapping,
	 * + the center,
	 * + the last 16K-page, which might overlap with the next
	 *   4K-aligned mapping.
	 * Each of these mapping might be backed by a vnode pager (if
	 * properly page-aligned) or a "fourk_pager", itself backed by a
	 * vnode pager (if 4K-aligned but not page-aligned).
	 */
#endif /* __arm64__ */

	/* one iteration per map entry; advance to the entry's end */
	map_addr = start_aligned;
	for (map_addr = start_aligned;
	    map_addr < end;
	    map_addr = tmp_entry.vme_end) {
		vm_map_lock(map);
		map_locked = TRUE;

		/* lookup the protected VM object */
		if (!vm_map_lookup_entry(map,
		    map_addr,
		    &map_entry) ||
		    map_entry->is_sub_map ||
		    VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
			/* that memory is not properly mapped */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* ensure mapped memory is mapped as executable, except
		 * for the model decryption flow */
		if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
		    !(map_entry->protection & VM_PROT_EXECUTE)) {
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		/* get the protected object to be decrypted */
		protected_object = VME_OBJECT(map_entry);
		if (protected_object == VM_OBJECT_NULL) {
			/* there should be a VM object here at this point */
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}
		/* ensure protected object stays alive while map is unlocked */
		vm_object_reference(protected_object);

		/* limit the map entry to the area we want to cover */
		vm_map_clip_start(map, map_entry, start_aligned);
		vm_map_clip_end(map, map_entry, end_aligned);

		/* snapshot the entry: it can't be used once the map is unlocked */
		tmp_entry = *map_entry;
		map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
		vm_map_unlock(map);
		map_locked = FALSE;

		/*
		 * This map entry might be only partially encrypted
		 * (if not fully "page-aligned").
		 */
		crypto_start = 0;
		crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
		if (tmp_entry.vme_start < start) {
			if (tmp_entry.vme_start != start_aligned) {
				/* NOTE(review): kr is set here but not
				 * returned; it is overwritten below —
				 * confirm this is intentional. */
				kr = KERN_INVALID_ADDRESS;
			}
			crypto_start += (start - tmp_entry.vme_start);
		}
		if (tmp_entry.vme_end > end) {
			if (tmp_entry.vme_end != end_aligned) {
				/* NOTE(review): same as above — kr is
				 * overwritten before being returned. */
				kr = KERN_INVALID_ADDRESS;
			}
			crypto_end -= (tmp_entry.vme_end - end);
		}

		/*
		 * This "extra backing offset" is needed to get the decryption
		 * routine to use the right key.  It adjusts for the possibly
		 * relative offset of an interposed "4K" pager...
		 */
		if (crypto_backing_offset == (vm_object_offset_t) -1) {
			crypto_backing_offset = VME_OFFSET(&tmp_entry);
		}

		cache_pager = TRUE;
#if XNU_TARGET_OS_OSX
		if (vm_map_is_alien(map)) {
			cache_pager = FALSE;
		}
#endif /* XNU_TARGET_OS_OSX */

		/*
		 * Lookup (and create if necessary) the protected memory object
		 * matching that VM object.
		 * If successful, this also grabs a reference on the memory object,
		 * to guarantee that it doesn't go away before we get a chance to map
		 * it.
		 */
		unprotected_mem_obj = apple_protect_pager_setup(
			protected_object,
			VME_OFFSET(&tmp_entry),
			crypto_backing_offset,
			crypt_info,
			crypto_start,
			crypto_end,
			cache_pager);

		/* release extra ref on protected object */
		vm_object_deallocate(protected_object);

		if (unprotected_mem_obj == NULL) {
			kr = KERN_FAILURE;
			goto done;
		}

		vm_flags = VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE;
		/* can overwrite an immutable mapping */
		vmk_flags.vmkf_overwrite_immutable = TRUE;
#if __arm64__
		if (tmp_entry.used_for_jit &&
		    (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
		    PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
		    fourk_binary_compatibility_unsafe &&
		    fourk_binary_compatibility_allow_wx) {
			printf("** FOURK_COMPAT [%d]: "
			    "allowing write+execute at 0x%llx\n",
			    proc_selfpid(), tmp_entry.vme_start);
			vmk_flags.vmkf_map_jit = TRUE;
		}
#endif /* __arm64__ */

		/* map this memory object in place of the current one */
		map_addr = tmp_entry.vme_start;
		kr = vm_map_enter_mem_object(map,
		    &map_addr,
		    (tmp_entry.vme_end -
		    tmp_entry.vme_start),
		    (mach_vm_offset_t) 0,
		    vm_flags,
		    vmk_flags,
		    VM_KERN_MEMORY_NONE,
		    (ipc_port_t)(uintptr_t) unprotected_mem_obj,
		    0,
		    TRUE,
		    tmp_entry.protection,
		    tmp_entry.max_protection,
		    tmp_entry.inheritance);
		assertf(kr == KERN_SUCCESS,
		    "kr = 0x%x\n", kr);
		assertf(map_addr == tmp_entry.vme_start,
		    "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
		    (uint64_t)map_addr,
		    (uint64_t) tmp_entry.vme_start,
		    &tmp_entry);

#if VM_MAP_DEBUG_APPLE_PROTECT
		if (vm_map_debug_apple_protect) {
			printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
			    " backing:[object:%p,offset:0x%llx,"
			    "crypto_backing_offset:0x%llx,"
			    "crypto_start:0x%llx,crypto_end:0x%llx]\n",
			    map,
			    (uint64_t) map_addr,
			    (uint64_t) (map_addr + (tmp_entry.vme_end -
			    tmp_entry.vme_start)),
			    unprotected_mem_obj,
			    protected_object,
			    VME_OFFSET(&tmp_entry),
			    crypto_backing_offset,
			    crypto_start,
			    crypto_end);
		}
#endif /* VM_MAP_DEBUG_APPLE_PROTECT */

		/*
		 * Release the reference obtained by
		 * apple_protect_pager_setup().
		 * The mapping (if it succeeded) is now holding a reference on
		 * the memory object.
		 */
		memory_object_deallocate(unprotected_mem_obj);
		unprotected_mem_obj = MEMORY_OBJECT_NULL;

		/* continue with next map entry */
		crypto_backing_offset += (tmp_entry.vme_end -
		    tmp_entry.vme_start);
		crypto_backing_offset -= crypto_start;
	}
	kr = KERN_SUCCESS;

done:
	if (map_locked) {
		vm_map_unlock(map);
	}
	return kr;
}
1064 #endif /* CONFIG_CODE_DECRYPTION */
1065
1066
/* Lock group and lock attributes shared by all VM map locks. */
LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);

/*
 * When non-zero, vm_memory_malloc_no_cow_mask is populated in vm_map_init()
 * with the VM_MEMORY_MALLOC* tags listed there (boot-arg overridable).
 * Default: off on macOS, on for other targets.
 */
#if XNU_TARGET_OS_OSX
int malloc_no_cow = 0;
#else /* XNU_TARGET_OS_OSX */
int malloc_no_cow = 1;
#endif /* XNU_TARGET_OS_OSX */
/* bitmask of VM tags affected by the malloc_no_cow policy (built in vm_map_init()) */
uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
#if DEBUG
/* boot-arg controlled in vm_map_init(); enables extra map sanity checking */
int vm_check_map_sanity = 0;
#endif
1080
1081 /*
1082 * vm_map_init:
1083 *
1084 * Initialize the vm_map module. Must be called before
1085 * any other vm_map routines.
1086 *
1087 * Map and entry structures are allocated from zones -- we must
1088 * initialize those zones.
1089 *
1090 * There are three zones of interest:
1091 *
1092 * vm_map_zone: used to allocate maps.
1093 * vm_map_entry_zone: used to allocate map entries.
1094 *
1095 * LP32:
1096 * vm_map_entry_reserved_zone: fallback zone for kernel map entries
1097 *
1098 * The kernel allocates map entries from a special zone that is initially
1099 * "crammed" with memory. It would be difficult (perhaps impossible) for
1100 * the kernel to allocate more memory to a entry zone when it became
1101 * empty since the very act of allocating memory implies the creation
1102 * of a new entry.
1103 */
1104 __startup_func
1105 void
vm_map_init(void)1106 vm_map_init(void)
1107 {
1108
1109 #if MACH_ASSERT
1110 PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1111 sizeof(debug4k_filter));
1112 #endif /* MACH_ASSERT */
1113
1114 vm_map_zone = zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
1115 VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1116
1117 /*
1118 * Don't quarantine because we always need elements available
1119 * Disallow GC on this zone... to aid the GC.
1120 */
1121 vm_map_entry_zone = zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
1122 sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
1123 ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
1124 z->z_elems_rsv = (uint16_t)(32 *
1125 (ml_early_cpu_max_number() + 1));
1126 });
1127 #if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
1128 vm_map_entry_reserved_zone = zone_create(VM_MAP_ENTRY_RESERVED_ZONE_NAME,
1129 sizeof(struct vm_map_entry), VM_MAP_ENTRY_RESERVED_ZFLAGS);
1130 #endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
1131
1132 vm_map_holes_zone = zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
1133 sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
1134 ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
1135 z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_size(z));
1136 });
1137
1138 vm_map_copy_zone = zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
1139 ZC_NOENCRYPT | ZC_CACHING, ZONE_ID_VM_MAP_COPY, NULL);
1140
1141 /*
1142 * Add the stolen memory to zones, adjust zone size and stolen counts.
1143 */
1144 zone_cram_early(vm_map_zone, map_data, map_data_size);
1145 zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
1146 zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
1147 printf("VM boostrap: %d maps, %d entries and %d holes available\n",
1148 vm_map_zone->z_elems_free,
1149 vm_map_entry_zone->z_elems_free,
1150 vm_map_holes_zone->z_elems_free);
1151
1152 /*
1153 * Since these are covered by zones, remove them from stolen page accounting.
1154 */
1155 VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1156
1157 #if VM_MAP_DEBUG_APPLE_PROTECT
1158 PE_parse_boot_argn("vm_map_debug_apple_protect",
1159 &vm_map_debug_apple_protect,
1160 sizeof(vm_map_debug_apple_protect));
1161 #endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1162 #if VM_MAP_DEBUG_APPLE_FOURK
1163 PE_parse_boot_argn("vm_map_debug_fourk",
1164 &vm_map_debug_fourk,
1165 sizeof(vm_map_debug_fourk));
1166 #endif /* VM_MAP_DEBUG_FOURK */
1167
1168 PE_parse_boot_argn("malloc_no_cow",
1169 &malloc_no_cow,
1170 sizeof(malloc_no_cow));
1171 if (malloc_no_cow) {
1172 vm_memory_malloc_no_cow_mask = 0ULL;
1173 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1174 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1175 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1176 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1177 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1178 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1179 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1180 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1181 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1182 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1183 // vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
1184 PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
1185 &vm_memory_malloc_no_cow_mask,
1186 sizeof(vm_memory_malloc_no_cow_mask));
1187 }
1188
1189 #if DEBUG
1190 PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1191 if (vm_check_map_sanity) {
1192 kprintf("VM sanity checking enabled\n");
1193 } else {
1194 kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1195 }
1196 #endif /* DEBUG */
1197
1198 #if DEVELOPMENT || DEBUG
1199 PE_parse_boot_argn("panic_on_unsigned_execute",
1200 &panic_on_unsigned_execute,
1201 sizeof(panic_on_unsigned_execute));
1202 PE_parse_boot_argn("panic_on_mlock_failure",
1203 &panic_on_mlock_failure,
1204 sizeof(panic_on_mlock_failure));
1205 #endif /* DEVELOPMENT || DEBUG */
1206 }
1207
__startup_func
static void
vm_map_steal_memory(void)
{
	/*
	 * We need to reserve enough memory to support bootstrapping VM maps
	 * and the zone subsystem.
	 *
	 * The VM Maps that need to function before zones can support them
	 * are the ones registered with vm_map_will_allocate_early_map(),
	 * which are:
	 * - the kernel map
	 * - the various submaps used by zones (pgz, meta, ...)
	 *
	 * We also need enough entries and holes to support them
	 * until zone_metadata_init() is called, which is when
	 * the zone allocator becomes capable of expanding dynamically.
	 *
	 * We need:
	 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
	 * - To allow for 3-4 entries per map, but the kernel map
	 *   needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
	 *   to describe the submaps, so double it (and make it 8x too)
	 * - To allow for holes between entries,
	 *   hence needs the same budget as entries
	 */
	map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
	    sizeof(struct _vm_map), VM_MAP_ZFLAGS,
	    VM_MAP_EARLY_COUNT_MAX);

	kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
	    sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
	    sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
	    8 * VM_MAP_EARLY_COUNT_MAX);

	/*
	 * Steal a contiguous range of memory so that a simple range check
	 * can validate early addresses being freed/crammed to these
	 * zones
	 */
	map_data = zone_early_mem_init(map_data_size + kentry_data_size +
	    map_holes_data_size);
	/* the three regions are carved back-to-back from the one stolen range */
	kentry_data = map_data + map_data_size;
	map_holes_data = kentry_data + kentry_data_size;
}
STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
1257
1258 __startup_func
1259 static void
vm_kernel_boostraped(void)1260 vm_kernel_boostraped(void)
1261 {
1262 printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
1263 vm_map_zone->z_elems_free,
1264 vm_map_entry_zone->z_elems_free,
1265 vm_map_holes_zone->z_elems_free);
1266 }
1267 STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1268
1269 void
vm_map_disable_hole_optimization(vm_map_t map)1270 vm_map_disable_hole_optimization(vm_map_t map)
1271 {
1272 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1273
1274 if (map->holelistenabled) {
1275 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1276
1277 while (hole_entry != NULL) {
1278 next_hole_entry = hole_entry->vme_next;
1279
1280 hole_entry->vme_next = NULL;
1281 hole_entry->vme_prev = NULL;
1282 zfree(vm_map_holes_zone, hole_entry);
1283
1284 if (next_hole_entry == head_entry) {
1285 hole_entry = NULL;
1286 } else {
1287 hole_entry = next_hole_entry;
1288 }
1289 }
1290
1291 map->holes_list = NULL;
1292 map->holelistenabled = FALSE;
1293
1294 map->first_free = vm_map_first_entry(map);
1295 SAVE_HINT_HOLE_WRITE(map, NULL);
1296 }
1297 }
1298
1299 boolean_t
vm_kernel_map_is_kernel(vm_map_t map)1300 vm_kernel_map_is_kernel(vm_map_t map)
1301 {
1302 return map->pmap == kernel_pmap;
1303 }
1304
1305 /*
1306 * vm_map_create:
1307 *
1308 * Creates and returns a new empty VM map with
1309 * the given physical map structure, and having
1310 * the given lower and upper address bounds.
1311 */
1312
1313 extern vm_map_t vm_map_create_external(
1314 pmap_t pmap,
1315 vm_map_offset_t min_off,
1316 vm_map_offset_t max_off,
1317 boolean_t pageable);
1318
1319 vm_map_t
vm_map_create_external(pmap_t pmap,vm_map_offset_t min,vm_map_offset_t max,boolean_t pageable)1320 vm_map_create_external(
1321 pmap_t pmap,
1322 vm_map_offset_t min,
1323 vm_map_offset_t max,
1324 boolean_t pageable)
1325 {
1326 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1327
1328 if (pageable) {
1329 options |= VM_MAP_CREATE_PAGEABLE;
1330 }
1331 return vm_map_create_options(pmap, min, max, options);
1332 }
1333
1334 __startup_func
1335 void
vm_map_will_allocate_early_map(vm_map_t * owner)1336 vm_map_will_allocate_early_map(vm_map_t *owner)
1337 {
1338 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1339 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1340 }
1341
1342 early_map_owners[early_map_count++] = owner;
1343 }
1344
1345 __startup_func
1346 void
vm_map_relocate_early_maps(vm_offset_t delta)1347 vm_map_relocate_early_maps(vm_offset_t delta)
1348 {
1349 for (uint32_t i = 0; i < early_map_count; i++) {
1350 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1351
1352 *early_map_owners[i] = (vm_map_t)(addr + delta);
1353 }
1354
1355 early_map_count = ~0u;
1356 }
1357
/*
 * Routine: vm_map_relocate_early_elem
 *
 * Purpose:
 *	Early zone elements are allocated in a temporary part
 *	of the address space.
 *
 *	Once the zones live in their final place, the early
 *	VM maps, map entries and map holes need to be relocated.
 *
 *	It involves rewriting any vm_map_t, vm_map_entry_t or
 *	pointers to vm_map_links. Other pointers to other types
 *	are fine.
 *
 *	Fortunately, pointers to those types are self-contained
 *	in those zones, _except_ for pointers to VM maps,
 *	which are tracked during early boot and fixed with
 *	vm_map_relocate_early_maps().
 */
__startup_func
void
vm_map_relocate_early_elem(
	uint32_t zone_id,
	vm_offset_t new_addr,
	vm_offset_t delta)
{
	/* slide "field" of the element at new_addr by delta, if non-NULL */
#define relocate(type_t, field) ({ \
	typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
	if (*__field) { \
	        *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
	} \
})

	switch (zone_id) {
	case ZONE_ID_VM_MAP:
	case ZONE_ID_VM_MAP_ENTRY:
	case ZONE_ID_VM_MAP_HOLES:
		break;

	default:
		panic("Unexpected zone ID %d", zone_id);
	}

	if (zone_id == ZONE_ID_VM_MAP) {
		relocate(vm_map_t, hdr.links.prev);
		relocate(vm_map_t, hdr.links.next);
		/* all early maps use the kernel pmap (see vm_map_create_options()) */
		((vm_map_t)new_addr)->pmap = kernel_pmap;
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_t, hdr.rb_head_store.rbh_root);
#endif /* VM_MAP_STORE_USE_RB */
		relocate(vm_map_t, hint);
		relocate(vm_map_t, hole_hint);
		relocate(vm_map_t, first_free);
		return;
	}

	/* map entries and holes both begin with a struct vm_map_links */
	relocate(struct vm_map_links *, prev);
	relocate(struct vm_map_links *, next);

	if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
#ifdef VM_MAP_STORE_USE_RB
		relocate(vm_map_entry_t, store.entry.rbe_left);
		relocate(vm_map_entry_t, store.entry.rbe_right);
		relocate(vm_map_entry_t, store.entry.rbe_parent);
#endif /* VM_MAP_STORE_USE_RB */
		if (((vm_map_entry_t)new_addr)->is_sub_map) {
			/* no object to relocate because we haven't made any */
			relocate(vm_map_entry_t, vme_object.vmo_submap);
		}
#if MAP_ENTRY_CREATION_DEBUG
		relocate(vm_map_entry_t, vme_creation_maphdr);
#endif /* MAP_ENTRY_CREATION_DEBUG */
	}

#undef relocate
}
1434
/*
 * Allocate and initialize an empty VM map with the given pmap and
 * [min, max) address bounds, per the requested creation options.
 */
vm_map_t
vm_map_create_options(
	pmap_t pmap,
	vm_map_offset_t min,
	vm_map_offset_t max,
	vm_map_create_options_t options)
{
	vm_map_t result;

#if DEBUG || DEVELOPMENT
	if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
		/*
		 * Maps allocated before zalloc is fully up must have been
		 * announced via vm_map_will_allocate_early_map() and must
		 * use the kernel pmap, so they can be relocated later.
		 */
		if (early_map_count != ~0u && early_map_count !=
		    zone_count_allocated(vm_map_zone) + 1) {
			panic("allocating %dth early map, owner not known",
			    zone_count_allocated(vm_map_zone) + 1);
		}
		if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
			panic("allocating %dth early map for non kernel pmap",
			    early_map_count);
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	result = zalloc_flags(vm_map_zone, Z_WAITOK | Z_NOFAIL | Z_ZERO);

	/* an empty map: the header links point back at the map itself */
	vm_map_first_entry(result) = vm_map_to_entry(result);
	vm_map_last_entry(result) = vm_map_to_entry(result);

	vm_map_store_init(&result->hdr);
	result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
	vm_map_set_page_shift(result, PAGE_SHIFT);

	result->size_limit = RLIM_INFINITY;             /* default unlimited */
	result->data_limit = RLIM_INFINITY;             /* default unlimited */
	result->user_wire_limit = MACH_VM_MAX_ADDRESS;  /* default limit is unlimited */
	os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
	result->pmap = pmap;
	result->min_offset = min;
	result->max_offset = max;
	result->first_free = vm_map_to_entry(result);
	result->hint = vm_map_to_entry(result);

	if (options & VM_MAP_CREATE_NEVER_FAULTS) {
		assert(pmap == kernel_pmap);
		result->never_faults = true;
	}

	/* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
	if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
		result->has_corpse_footprint = true;
	} else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
		/* start with a single hole spanning the whole range */
		struct vm_map_links *hole_entry = zalloc(vm_map_holes_zone);

		hole_entry->start = min;
#if defined(__arm__) || defined(__arm64__)
		hole_entry->end = result->max_offset;
#else
		hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
#endif
		result->holes_list = result->hole_hint = hole_entry;
		/* the hole list is circular: a single hole links to itself */
		hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
		result->holelistenabled = true;
	}

	vm_map_lock_init(result);

	return result;
}
1503
1504 /*
1505 * Adjusts a submap that was made by kmem_suballoc()
1506 * before it knew where it would be mapped,
1507 * so that it has the right min/max offsets.
1508 *
1509 * We do not need to hold any locks:
1510 * only the caller knows about this map,
1511 * and it is not published on any entry yet.
1512 */
1513 static void
vm_map_adjust_offsets(vm_map_t map,vm_map_offset_t min_off,vm_map_offset_t max_off)1514 vm_map_adjust_offsets(
1515 vm_map_t map,
1516 vm_map_offset_t min_off,
1517 vm_map_offset_t max_off)
1518 {
1519 assert(map->min_offset == 0);
1520 assert(map->max_offset == max_off - min_off);
1521 assert(map->hdr.nentries == 0);
1522 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1523
1524 map->min_offset = min_off;
1525 map->max_offset = max_off;
1526
1527 if (map->holelistenabled) {
1528 struct vm_map_links *hole = map->holes_list;
1529
1530 hole->start = min_off;
1531 #if defined(__arm__) || defined(__arm64__)
1532 hole->end = max_off;
1533 #else
1534 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1535 #endif
1536 }
1537 }
1538
1539
1540 vm_map_size_t
vm_map_adjusted_size(vm_map_t map)1541 vm_map_adjusted_size(vm_map_t map)
1542 {
1543 struct vm_reserved_region *regions = NULL;
1544 size_t num_regions = 0;
1545 mach_vm_size_t reserved_size = 0, map_size = 0;
1546
1547 if (map == NULL || (map->size == 0)) {
1548 return 0;
1549 }
1550
1551 map_size = map->size;
1552
1553 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1554 /*
1555 * No special reserved regions or not an exotic map or the task
1556 * is terminating and these special regions might have already
1557 * been deallocated.
1558 */
1559 return map_size;
1560 }
1561
1562 num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), ®ions);
1563 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1564
1565 while (num_regions) {
1566 reserved_size += regions[--num_regions].vmrr_size;
1567 }
1568
1569 /*
1570 * There are a few places where the map is being switched out due to
1571 * 'termination' without that bit being set (e.g. exec and corpse purging).
1572 * In those cases, we could have the map's regions being deallocated on
1573 * a core while some accounting process is trying to get the map's size.
1574 * So this assert can't be enabled till all those places are uniform in
1575 * their use of the 'map->terminated' bit.
1576 *
1577 * assert(map_size >= reserved_size);
1578 */
1579
1580 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1581 }
1582
/*
 * vm_map_entry_create:	[ internal use only ]
 *
 *	Allocates a VM map entry for insertion in the
 *	given map (or map copy). No fields are filled.
 *
 *	The VM entry will be zero initialized, except for:
 *	- behavior set to VM_BEHAVIOR_DEFAULT
 *	- inheritance set to VM_INHERIT_DEFAULT
 */
#define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)

#define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)

static vm_map_entry_t
_vm_map_entry_create(
	struct vm_map_header *map_header __unused)
{
	vm_map_entry_t entry = NULL;
	zone_t zone = vm_map_entry_zone;

#if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
	zone_security_flags_t zsflags = zone_security_array[ZONE_ID_VM_MAP_ENTRY];
	if (map_header == &zone_submap(zsflags)->hdr) {
		/*
		 * If we are trying to allocate an entry for the submap
		 * of the vm_map_entry_zone, then this can cause recursive
		 * locking of this map.
		 *
		 * Try to allocate _without blocking_ from this zone,
		 * but if it is depleted, we need to go to the
		 * vm_map_entry_reserved_zone which is in the zalloc
		 * "VM" submap, which can grow without taking any map lock.
		 *
		 * Note: the vm_map_entry_zone has a rather high "reserve"
		 * setup in order to minimize usage of the reserved one.
		 */
		entry = zalloc_flags(vm_map_entry_zone, Z_NOWAIT | Z_ZERO);
		zone = vm_map_entry_reserved_zone;
	}
#endif
	if (entry == NULL) {
		/* blocking, zeroing allocation from the chosen zone */
		entry = zalloc_flags(zone, Z_WAITOK | Z_ZERO);
	}

	/*
	 * Help the compiler with what we know to be true,
	 * so that the further bitfields inits have good codegen.
	 *
	 * See rdar://87041299
	 */
	__builtin_assume(entry->vme_object.vmo_object == NULL);
#if __LP64__
	__builtin_assume(*(uint64_t *)(&entry->vme_object + 1) == 0);
	__builtin_assume(*(uint64_t *)(&entry->vme_object + 2) == 0);
#else
	__builtin_assume(*(uint32_t *)(&entry->vme_object + 1) == 0);
	__builtin_assume(*(uint32_t *)(&entry->vme_object + 2) == 0);
	__builtin_assume(*(uint32_t *)(&entry->vme_object + 3) == 0);
	__builtin_assume(*(uint32_t *)(&entry->vme_object + 4) == 0);
#endif

	static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
	    "VME_ALIAS_MASK covers tags");

	static_assert(VM_BEHAVIOR_DEFAULT == 0,
	    "can skip zeroing of the behavior field");
	entry->inheritance = VM_INHERIT_DEFAULT;

	vm_map_store_update((vm_map_t) NULL, entry, VM_MAP_ENTRY_CREATE);

#if MAP_ENTRY_CREATION_DEBUG
	/* remember where (and by whom) this entry was created */
	entry->vme_creation_maphdr = map_header;
	entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
	    BTREF_GET_NOWAIT);
#endif
	return entry;
}
1661
/*
 * vm_map_entry_dispose:	[ internal use only ]
 *
 *	Inverse of vm_map_entry_create.
 *
 *	write map lock held so no need to
 *	do anything special to insure correctness
 *	of the stores
 */
static void
vm_map_entry_dispose(
	vm_map_entry_t entry)
{
#if MAP_ENTRY_CREATION_DEBUG
	btref_put(entry->vme_creation_bt);
#endif
#if MAP_ENTRY_INSERTION_DEBUG
	btref_put(entry->vme_insertion_bt);
#endif
#if HAVE_VM_MAP_RESERVED_ENTRY_ZONE
	/* entries not from the regular entry zone came from the reserved zone */
	if (zone_id_for_element(entry, sizeof(*entry)) != ZONE_ID_VM_MAP_ENTRY) {
		zfree(vm_map_entry_reserved_zone, entry);
		return;
	}
#endif /* HAVE_VM_MAP_RESERVED_ENTRY_ZONE */
	zfree(vm_map_entry_zone, entry);
}

/* copy entries are allocated from the same zones as map entries */
#define vm_map_copy_entry_dispose(copy_entry) \
	vm_map_entry_dispose(copy_entry)
1692
/* Return the first entry of a zap list (NULL when the list is empty). */
static vm_map_entry_t
vm_map_zap_first_entry(
	vm_map_zap_t list)
{
	return list->vmz_head;
}
1699
/*
 * Return the last entry of a zap list.
 * The list must not be empty (asserted below).
 */
static vm_map_entry_t
vm_map_zap_last_entry(
	vm_map_zap_t list)
{
	assert(vm_map_zap_first_entry(list));
	/* vmz_tail points at the last entry's vme_next field */
	return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
}
1707
/* Append "entry" at the tail of the zap list. */
static void
vm_map_zap_append(
	vm_map_zap_t list,
	vm_map_entry_t entry)
{
	entry->vme_next = VM_MAP_ENTRY_NULL;
	*list->vmz_tail = entry;
	/* keep vmz_tail pointing at the terminating vme_next field */
	list->vmz_tail = &entry->vme_next;
}
1717
1718 static vm_map_entry_t
vm_map_zap_pop(vm_map_zap_t list)1719 vm_map_zap_pop(
1720 vm_map_zap_t list)
1721 {
1722 vm_map_entry_t head = list->vmz_head;
1723
1724 if (head != VM_MAP_ENTRY_NULL &&
1725 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1726 list->vmz_tail = &list->vmz_head;
1727 }
1728
1729 return head;
1730 }
1731
1732 static void
vm_map_zap_dispose(vm_map_zap_t list)1733 vm_map_zap_dispose(
1734 vm_map_zap_t list)
1735 {
1736 vm_map_entry_t entry;
1737
1738 while ((entry = vm_map_zap_pop(list))) {
1739 if (entry->is_sub_map) {
1740 vm_map_deallocate(VME_SUBMAP(entry));
1741 } else {
1742 vm_object_deallocate(VME_OBJECT(entry));
1743 }
1744
1745 vm_map_entry_dispose(entry);
1746 }
1747 }
1748
#if MACH_ASSERT
static boolean_t first_free_check = FALSE;
/*
 * Validate the map's "first_free" hint.  The (expensive) store-level
 * validation is opt-in via first_free_check; otherwise trivially TRUE.
 */
boolean_t
first_free_is_valid(
	vm_map_t map)
{
	return first_free_check ? first_free_is_valid_store(map) : TRUE;
}
#endif /* MACH_ASSERT */
1762
1763
/* Link/unlink a map entry within a vm_map_copy's private entry list. */
#define vm_map_copy_entry_link(copy, after_where, entry) \
	_vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))

#define vm_map_copy_entry_unlink(copy, entry) \
	_vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry))
1769
/*
 * vm_map_destroy:
 *
 *	Actually destroy a map.
 */
void
vm_map_destroy(
	vm_map_t map)
{
	/* final cleanup: this is not allowed to fail */
	vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;

	/* collects the deleted entries so they can be freed after unlocking */
	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);

	/* other paths (e.g. vm_map_adjusted_size()) check this bit */
	map->terminated = true;
	/* clean up regular map entries */
	(void)vm_map_delete(map, map->min_offset, map->max_offset, flags, &zap);
	/* clean up leftover special mappings (commpage, GPU carveout, etc...) */
#if !defined(__arm__)
	(void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags, &zap);
#endif /* !__arm__ */

	vm_map_disable_hole_optimization(map);
	vm_map_corpse_footprint_destroy(map);

	vm_map_unlock(map);

	/* drop the zapped entries' object/submap references outside the map lock */
	vm_map_zap_dispose(&zap);

	assert(map->hdr.nentries == 0);

	if (map->pmap) {
		pmap_destroy(map->pmap);
	}

#if LOCKS_INDIRECT_ALLOW
	if (vm_map_lck_attr.lck_attr_val & LCK_ATTR_DEBUG) {
		/*
		 * If lock debugging is enabled the mutexes get tagged as LCK_MTX_TAG_INDIRECT.
		 * And this is regardless of whether the lck_mtx_ext_t is embedded in the
		 * structure or kalloc'ed via lck_mtx_init.
		 * An example is s_lock_ext within struct _vm_map.
		 *
		 * A lck_mtx_destroy on such a mutex will attempt a kfree and panic. We
		 * can add another tag to detect embedded vs alloc'ed indirect external
		 * mutexes but that'll be additional checks in the lock path and require
		 * updating dependencies for the old vs new tag.
		 *
		 * Since the kfree() is for LCK_MTX_TAG_INDIRECT mutexes and that tag is applied
		 * just when lock debugging is ON, we choose to forego explicitly destroying
		 * the vm_map mutex and rw lock. Because the vm_map_lck_grp is
		 * permanent, this has no serious side-effect.
		 */
	} else
#endif /* LOCKS_INDIRECT_ALLOW */
	{
		lck_rw_destroy(&(map)->lock, &vm_map_lck_grp);
	}

	zfree(vm_map_zone, map);
}
1833
1834 /*
1835 * Returns pid of the task with the largest number of VM map entries.
1836 * Used in the zone-map-exhaustion jetsam path.
1837 */
1838 pid_t
find_largest_process_vm_map_entries(void)1839 find_largest_process_vm_map_entries(void)
1840 {
1841 pid_t victim_pid = -1;
1842 int max_vm_map_entries = 0;
1843 task_t task = TASK_NULL;
1844 queue_head_t *task_list = &tasks;
1845
1846 lck_mtx_lock(&tasks_threads_lock);
1847 queue_iterate(task_list, task, task_t, tasks) {
1848 if (task == kernel_task || !task->active) {
1849 continue;
1850 }
1851
1852 vm_map_t task_map = task->map;
1853 if (task_map != VM_MAP_NULL) {
1854 int task_vm_map_entries = task_map->hdr.nentries;
1855 if (task_vm_map_entries > max_vm_map_entries) {
1856 max_vm_map_entries = task_vm_map_entries;
1857 victim_pid = pid_from_task(task);
1858 }
1859 }
1860 }
1861 lck_mtx_unlock(&tasks_threads_lock);
1862
1863 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1864 return victim_pid;
1865 }
1866
1867
/*
 * vm_map_lookup_entry:	[ internal use only ]
 *
 *	Calls into the vm map store layer to find the map
 *	entry containing (or immediately preceding) the
 *	specified address in the given map; the entry is returned
 *	in the "entry" parameter.  The boolean
 *	result indicates whether the address is
 *	actually contained in the map.
 */
boolean_t
vm_map_lookup_entry(
	vm_map_t map,
	vm_map_offset_t address,
	vm_map_entry_t *entry)          /* OUT */
{
#if CONFIG_KERNEL_TBI
	/* strip any pointer tag bits so we compare canonical addresses */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
#endif /* CONFIG_KERNEL_TBI */
#if CONFIG_PROB_GZALLOC
	if (map->pmap == kernel_pmap) {
		assertf(!pgz_owned(address),
		    "it is the responsibility of callers to unguard PGZ addresses");
	}
#endif /* CONFIG_PROB_GZALLOC */
	return vm_map_store_lookup_entry( map, address, entry );
}
1897
1898 boolean_t
vm_map_lookup_entry_or_next(vm_map_t map,vm_map_offset_t address,vm_map_entry_t * entry)1899 vm_map_lookup_entry_or_next(
1900 vm_map_t map,
1901 vm_map_offset_t address,
1902 vm_map_entry_t *entry) /* OUT */
1903 {
1904 if (vm_map_lookup_entry(map, address, entry)) {
1905 return true;
1906 }
1907
1908 *entry = (*entry)->vme_next;
1909 return false;
1910 }
1911
#if CONFIG_PROB_GZALLOC
/*
 * Same as vm_map_lookup_entry(), but skips the pgz_owned() assertion:
 * for callers that may legitimately pass PGZ-guarded addresses.
 */
boolean_t
vm_map_lookup_entry_allow_pgz(
	vm_map_t map,
	vm_map_offset_t address,
	vm_map_entry_t *entry)          /* OUT */
{
#if CONFIG_KERNEL_TBI
	/* strip any pointer tag bits so we compare canonical addresses */
	if (VM_KERNEL_ADDRESS(address)) {
		address = VM_KERNEL_STRIP_UPTR(address);
	}
#endif /* CONFIG_KERNEL_TBI */
	return vm_map_store_lookup_entry( map, address, entry );
}
#endif /* CONFIG_PROB_GZALLOC */
1927
1928 #if !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
1929 /*
1930 * Routine: vm_map_adjust_direction
1931 * Purpose:
1932 * Overrides direction to reduce fragmentation. Allocate small
1933 * allocations from the end and large allocations from the right.
1934 */
1935 static void
vm_map_adjust_direction(vm_map_kernel_flags_t * vmk_flags,vm_map_size_t size)1936 vm_map_adjust_direction(
1937 vm_map_kernel_flags_t *vmk_flags,
1938 vm_map_size_t size)
1939 {
1940 if (size < KMEM_SMALLMAP_THRESHOLD) {
1941 vmk_flags->vmkf_last_free = true;
1942 } else {
1943 vmk_flags->vmkf_last_free = false;
1944 }
1945 }
1946 #endif /* !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) || !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT) */
1947
/*
 * Routine: vm_map_get_range
 * Purpose:
 *	Adjust bounds based on security policy.
 *	For the kernel map, picks the kmem range matching the request's
 *	range id (and may zero the caller's address hint); for user maps,
 *	returns the map's own bounds with PAGEZERO carved out.
 */
static struct kmem_range
vm_map_get_range(
	vm_map_t map,
	vm_map_offset_t *address,
	vm_map_kernel_flags_t *vmk_flags,
	vm_map_size_t size)
{
	struct kmem_range effective_range = {};
	if (map == kernel_map) {
		/* kernel VA is partitioned into ranges selected by range_id */
		kmem_range_id_t range_id = vmk_flags->vmkf_range_id;
		effective_range = kmem_ranges[range_id];

		if (startup_phase > STARTUP_SUB_KMEM) {
			/*
			 * Hint provided by caller is zeroed as the range is restricted to a
			 * subset of the entire kernel_map VA, which could put the hint outside
			 * the range, causing vm_map_store_find_space to fail.
			 */
			*address = 0ull;
#if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
			/*
			 * Each allocation front looks like [ S | L ]
			 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
			 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
			 * use the entire range.
			 */
			if (size >= KMEM_SMALLMAP_THRESHOLD) {
				effective_range = kmem_large_ranges[range_id];
			}
#else /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
			vm_map_adjust_direction(vmk_flags, size);
#endif /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
		}
	} else {
		/*
		 * If minimum is 0, bump it up by PAGE_SIZE. We want to limit
		 * allocations of PAGEZERO to explicit requests since its
		 * normal use is to catch dereferences of NULL and many
		 * applications also treat pointers with a value of 0 as
		 * special and suddenly having address 0 contain useable
		 * memory would tend to confuse those applications.
		 */
		effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
		effective_range.max_address = map->max_offset;
	}

	return effective_range;
}
2001
2002 /*
2003 * Routine: vm_map_locate_space
2004 * Purpose:
2005 * Finds a range in the specified virtual address map,
2006 * returning the start of that range,
2007 * as well as the entry right before it.
2008 */
kern_return_t
vm_map_locate_space(
	vm_map_t map,
	vm_map_size_t size,
	vm_map_offset_t mask,
	vm_map_kernel_flags_t vmk_flags,
	vm_map_offset_t *start_inout,   /* IN: hint, OUT: start of found range */
	vm_map_entry_t *entry_out)      /* OUT: entry preceding the range */
{
	struct kmem_range effective_range = {};
	vm_map_size_t guard_offset;
	vm_map_offset_t hint, limit;
	vm_map_entry_t entry;

	/*
	 * Only supported by vm_map_enter() with a fixed address.
	 */
	assert(!vmk_flags.vmkf_beyond_max);

	if (__improbable(map->wait_for_space)) {
		/*
		 * support for "wait_for_space" is minimal,
		 * its only consumer is the ipc_kernel_copy_map.
		 */
		assert(!map->holelistenabled &&
		    !vmk_flags.vmkf_last_free &&
		    !vmk_flags.vmkf_keep_map_locked &&
		    !vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmkf_random_address &&
		    *start_inout <= map->min_offset);
	} else if (vmk_flags.vmkf_last_free) {
		assert(!vmk_flags.vmkf_map_jit &&
		    !vmk_flags.vmkf_random_address);
	}

	if (vmk_flags.vmkf_guard_before) {
		/* Carve one page out of "size" for a leading guard page. */
		guard_offset = VM_MAP_PAGE_SIZE(map);
		assert(size > guard_offset);
		size -= guard_offset;
	} else {
		assert(size != 0);
		guard_offset = 0;
	}

	/* May zero *start_inout when the range restricts the hint. */
	effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size);
#if XNU_TARGET_OS_OSX
	if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
		assert(map != kernel_map);
		effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
	}
#endif /* XNU_TARGET_OS_OSX */

again:
	if (vmk_flags.vmkf_last_free) {
		/* Top-down search: scan from the hint down to min_address. */
		hint = *start_inout;

		if (hint == 0 || hint > effective_range.max_address) {
			hint = effective_range.max_address;
		}
		if (hint <= effective_range.min_address) {
			return KERN_NO_SPACE;
		}
		limit = effective_range.min_address;
	} else {
		/* Bottom-up search: scan from the hint up to max_address. */
		hint = *start_inout;

		if (vmk_flags.vmkf_map_jit) {
			/* At most one JIT region unless policy allows more. */
			if (map->jit_entry_exists &&
			    !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
				return KERN_INVALID_ARGUMENT;
			}
			if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
				vmk_flags.vmkf_random_address = true;
			}
		}

		if (vmk_flags.vmkf_random_address) {
			kern_return_t kr;

			kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
			if (kr != KERN_SUCCESS) {
				return kr;
			}
		}
#if XNU_TARGET_OS_OSX
		else if ((hint == 0 || hint == vm_map_min(map)) &&
		    !map->disable_vmentry_reuse &&
		    map->vmmap_high_start != 0) {
			hint = map->vmmap_high_start;
		}
#endif /* XNU_TARGET_OS_OSX */

		/* Clamp the hint into the effective range. */
		if (hint < effective_range.min_address) {
			hint = effective_range.min_address;
		}
		if (effective_range.max_address <= hint) {
			return KERN_NO_SPACE;
		}

		limit = effective_range.max_address;
	}
	entry = vm_map_store_find_space(map,
	    hint, limit, vmk_flags.vmkf_last_free,
	    guard_offset, size, mask,
	    start_inout);

	if (__improbable(entry == NULL)) {
		if (map->wait_for_space &&
		    guard_offset + size <=
		    effective_range.max_address - effective_range.min_address) {
			/*
			 * The request could fit eventually: drop the map
			 * lock, sleep until space is freed, then retry.
			 */
			assert_wait((event_t)map, THREAD_ABORTSAFE);
			vm_map_unlock(map);
			thread_block(THREAD_CONTINUE_NULL);
			vm_map_lock(map);
			goto again;
		}
		return KERN_NO_SPACE;
	}

	if (entry_out) {
		*entry_out = entry;
	}
	return KERN_SUCCESS;
}
2133
2134
2135 /*
2136 * Routine: vm_map_find_space
2137 * Purpose:
2138 * Allocate a range in the specified virtual address map,
2139 * returning the entry allocated for that range.
2140 * Used by kmem_alloc, etc.
2141 *
 *	The map must NOT be locked. It will be returned locked
2143 * on KERN_SUCCESS, unlocked on failure.
2144 *
2145 * If an entry is allocated, the object/offset fields
2146 * are initialized to zero.
2147 */
kern_return_t
vm_map_find_space(
	vm_map_t map,
	vm_map_offset_t hint_address,
	vm_map_size_t size,
	vm_map_offset_t mask,
	vm_map_kernel_flags_t vmk_flags,
	vm_map_entry_t *o_entry)        /* OUT */
{
	vm_map_entry_t new_entry, entry;
	kern_return_t kr;

	if (size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Pre-allocate and initialize the entry before taking the map lock. */
	new_entry = vm_map_entry_create(map);
	new_entry->use_pmap = true;
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_ALL;

	if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
		new_entry->map_aligned = true;
	}
	if (vmk_flags.vmkf_atomic_entry) {
		new_entry->vme_atomic = true;
	}
	if (vmk_flags.vmkf_permanent) {
		new_entry->permanent = true;
	}

	vm_map_lock(map);

	kr = vm_map_locate_space(map, size, mask, vmk_flags,
	    &hint_address, &entry);
	if (kr != KERN_SUCCESS) {
		/* Failure: unlock and discard the unused pre-allocated entry. */
		vm_map_unlock(map);
		vm_map_entry_dispose(new_entry);
		return kr;
	}
	/* hint_address now holds the start of the located range. */
	new_entry->vme_start = hint_address;
	new_entry->vme_end = hint_address + size;

	/*
	 * At this point,
	 *
	 * - new_entry's "vme_start" and "vme_end" should define
	 * the endpoints of the available new range,
	 *
	 * - and "entry" should refer to the region before
	 * the new range,
	 *
	 * - and the map should still be locked.
	 */

	assert(page_aligned(new_entry->vme_start));
	assert(page_aligned(new_entry->vme_end));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));

	/*
	 * Insert the new entry into the list
	 */

	vm_map_store_entry_link(map, entry, new_entry, VM_MAP_KERNEL_FLAGS_NONE);
	map->size += size;

	/*
	 * Update the lookup hint
	 */
	SAVE_HINT_MAP_WRITE(map, new_entry);

	/* Success: map is returned locked, per the contract above. */
	*o_entry = new_entry;
	return KERN_SUCCESS;
}
2223
/*
 * Debug knobs for vm_map_pmap_enter(): "print" gates the printf trace in
 * that routine.  NOTE(review): "_enable" is not referenced in this chunk;
 * presumably consulted by a caller elsewhere -- confirm before removing.
 */
int vm_map_pmap_enter_print = FALSE;
int vm_map_pmap_enter_enable = FALSE;
2226
2227 /*
2228 * Routine: vm_map_pmap_enter [internal only]
2229 *
2230 * Description:
2231 * Force pages from the specified object to be entered into
2232 * the pmap at the specified address if they are present.
 *	The scan ends as soon as a page is not found in the object.
2234 *
2235 * Returns:
2236 * Nothing.
2237 *
2238 * In/out conditions:
2239 * The source map should not be locked on entry.
2240 */
__unused static void
vm_map_pmap_enter(
	vm_map_t map,
	vm_map_offset_t addr,
	vm_map_offset_t end_addr,
	vm_object_t object,
	vm_object_offset_t offset,
	vm_prot_t protection)
{
	int type_of_fault;
	kern_return_t kr;
	struct vm_object_fault_info fault_info = {};

	if (map->pmap == 0) {
		return;
	}

	/* Only native page-size maps are supported here. */
	assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);

	while (addr < end_addr) {
		vm_page_t m;


		/*
		 * TODO:
		 * From vm_map_enter(), we come into this function without the map
		 * lock held or the object lock held.
		 * We haven't taken a reference on the object either.
		 * We should do a proper lookup on the map to make sure
		 * that things are sane before we go locking objects that
		 * could have been deallocated from under us.
		 */

		vm_object_lock(object);

		m = vm_page_lookup(object, offset);

		/* Stop at the first page that is absent or not safely mappable. */
		if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
		    (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_absent))) {
			vm_object_unlock(object);
			return;
		}

		if (vm_map_pmap_enter_print) {
			printf("vm_map_pmap_enter:");
			printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
			    map, (unsigned long long)addr, object, (unsigned long long)offset);
		}
		type_of_fault = DBG_CACHE_HIT_FAULT;
		/* NOTE(review): kr is never checked; failures are silently ignored. */
		kr = vm_fault_enter(m, map->pmap,
		    addr,
		    PAGE_SIZE, 0,
		    protection, protection,
		    VM_PAGE_WIRED(m),
		    FALSE, /* change_wiring */
		    VM_KERN_MEMORY_NONE, /* tag - not wiring */
		    &fault_info,
		    NULL, /* need_retry */
		    &type_of_fault);

		vm_object_unlock(object);

		/* Advance to the next page of both the object and the map. */
		offset += PAGE_SIZE_64;
		addr += PAGE_SIZE;
	}
}
2307
#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
/*
 * Pick a random, page-aligned start address for a mapping of "size" bytes
 * that falls inside an unmapped hole of the map's effective range.
 * Returns KERN_NO_SPACE after MAX_TRIES_TO_GET_RANDOM_ADDRESS failed
 * attempts or when the range cannot fit "size" at all.
 * NOTE(review): vm_map_locate_space() calls this with the map locked;
 * presumably all callers do -- confirm before relying on it.
 */
kern_return_t
vm_map_random_address_for_size(
	vm_map_t map,
	vm_map_offset_t *address,       /* OUT: chosen address on success */
	vm_map_size_t size,
	vm_map_kernel_flags_t vmk_flags)
{
	kern_return_t kr = KERN_SUCCESS;
	int tries = 0;
	vm_map_offset_t random_addr = 0;
	vm_map_offset_t hole_end;

	vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL;
	vm_map_size_t vm_hole_size = 0;
	vm_map_size_t addr_space_size;
	struct kmem_range effective_range = vm_map_get_range(map, address, &vmk_flags, size);

	addr_space_size = effective_range.max_address - effective_range.min_address;
	if (size >= addr_space_size) {
		return KERN_NO_SPACE;
	}
	/* Only start offsets in [min, max - size] are viable candidates. */
	addr_space_size -= size;

	assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));

	while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		/* Use early_random() until the random subsystem is up. */
		if (startup_phase < STARTUP_SUB_ZALLOC) {
			random_addr = (vm_map_offset_t)early_random();
		} else {
			random_addr = (vm_map_offset_t)random();
		}
		/* Scale to page granularity, then fold into the range. */
		random_addr <<= VM_MAP_PAGE_SHIFT(map);
		random_addr = vm_map_trunc_page(
			effective_range.min_address + (random_addr % addr_space_size),
			VM_MAP_PAGE_MASK(map));

#if CONFIG_PROB_GZALLOC
		/*
		 * Skip PGZ-owned addresses.
		 * NOTE(review): this "continue" bypasses the tries++ below,
		 * so PGZ rejections do not count against the retry budget.
		 */
		if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
			continue;
		}
#endif /* CONFIG_PROB_GZALLOC */

		if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
			/* Address is in a hole: compute where the hole ends. */
			if (prev_entry == vm_map_to_entry(map)) {
				next_entry = vm_map_first_entry(map);
			} else {
				next_entry = prev_entry->vme_next;
			}
			if (next_entry == vm_map_to_entry(map)) {
				hole_end = vm_map_max(map);
			} else {
				hole_end = next_entry->vme_start;
			}
			vm_hole_size = hole_end - random_addr;
			if (vm_hole_size >= size) {
				/* The hole fits the request: accept it. */
				*address = random_addr;
				break;
			}
		}
		tries++;
	}

	if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
		kr = KERN_NO_SPACE;
	}
	return kr;
}
2377
2378 static boolean_t
vm_memory_malloc_no_cow(int alias)2379 vm_memory_malloc_no_cow(
2380 int alias)
2381 {
2382 uint64_t alias_mask;
2383
2384 if (alias > 63) {
2385 return FALSE;
2386 }
2387
2388 alias_mask = 1ULL << alias;
2389 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2390 return TRUE;
2391 }
2392 return FALSE;
2393 }
2394
/*
 * Counters for vm_map_enter() resource-limit rejections.
 * NOTE(review): not incremented in this chunk; presumably bumped when
 * RLIMIT_AS / RLIMIT_DATA checks fail elsewhere -- confirm against callers.
 */
uint64_t vm_map_enter_RLIMIT_AS_count = 0;
uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
/*
 * Routine: vm_map_enter
 *
 * Description:
 *	Allocate a range in the specified virtual address map.
 *	The resulting range will refer to memory defined by
 *	the given memory object and offset into that object.
 *
 *	Arguments are as defined in the vm_map call.
 */
/* Stats for the "restore old mappings after a failed overwrite" path. */
static unsigned int vm_map_enter_restore_successes = 0;
static unsigned int vm_map_enter_restore_failures = 0;
2409 kern_return_t
vm_map_enter(vm_map_t map,vm_map_offset_t * address,vm_map_size_t size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t alias,vm_object_t object,vm_object_offset_t offset,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)2410 vm_map_enter(
2411 vm_map_t map,
2412 vm_map_offset_t *address, /* IN/OUT */
2413 vm_map_size_t size,
2414 vm_map_offset_t mask,
2415 int flags,
2416 vm_map_kernel_flags_t vmk_flags,
2417 vm_tag_t alias,
2418 vm_object_t object,
2419 vm_object_offset_t offset,
2420 boolean_t needs_copy,
2421 vm_prot_t cur_protection,
2422 vm_prot_t max_protection,
2423 vm_inherit_t inheritance)
2424 {
2425 vm_map_entry_t entry, new_entry;
2426 vm_map_offset_t start, tmp_start, tmp_offset;
2427 vm_map_offset_t end, tmp_end;
2428 vm_map_offset_t tmp2_start, tmp2_end;
2429 vm_map_offset_t step;
2430 kern_return_t result = KERN_SUCCESS;
2431 boolean_t map_locked = FALSE;
2432 boolean_t pmap_empty = TRUE;
2433 boolean_t new_mapping_established = FALSE;
2434 boolean_t keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2435 boolean_t anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0);
2436 boolean_t purgable = ((flags & VM_FLAGS_PURGABLE) != 0);
2437 boolean_t overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
2438 boolean_t no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
2439 boolean_t is_submap = vmk_flags.vmkf_submap;
2440 boolean_t permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
2441 boolean_t no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2442 boolean_t entry_for_jit = vmk_flags.vmkf_map_jit;
2443 boolean_t iokit_acct = vmk_flags.vmkf_iokit_acct;
2444 boolean_t translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
2445 boolean_t resilient_codesign = ((flags & VM_FLAGS_RESILIENT_CODESIGN) != 0);
2446 boolean_t resilient_media = ((flags & VM_FLAGS_RESILIENT_MEDIA) != 0);
2447 unsigned int superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
2448 vm_tag_t user_alias;
2449 kern_return_t kr;
2450 boolean_t clear_map_aligned = FALSE;
2451 vm_map_size_t chunk_size = 0;
2452 vm_object_t caller_object;
2453 VM_MAP_ZAP_DECLARE(zap_old_list);
2454 VM_MAP_ZAP_DECLARE(zap_new_list);
2455
2456 caller_object = object;
2457
2458 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2459
2460 if (flags & VM_FLAGS_4GB_CHUNK) {
2461 #if defined(__LP64__)
2462 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2463 #else /* __LP64__ */
2464 chunk_size = ANON_CHUNK_SIZE;
2465 #endif /* __LP64__ */
2466 } else {
2467 chunk_size = ANON_CHUNK_SIZE;
2468 }
2469
2470 if (superpage_size) {
2471 switch (superpage_size) {
2472 /*
2473 * Note that the current implementation only supports
2474 * a single size for superpages, SUPERPAGE_SIZE, per
2475 * architecture. As soon as more sizes are supposed
2476 * to be supported, SUPERPAGE_SIZE has to be replaced
2477 * with a lookup of the size depending on superpage_size.
2478 */
2479 #ifdef __x86_64__
2480 case SUPERPAGE_SIZE_ANY:
2481 /* handle it like 2 MB and round up to page size */
2482 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2483 OS_FALLTHROUGH;
2484 case SUPERPAGE_SIZE_2MB:
2485 break;
2486 #endif
2487 default:
2488 return KERN_INVALID_ARGUMENT;
2489 }
2490 mask = SUPERPAGE_SIZE - 1;
2491 if (size & (SUPERPAGE_SIZE - 1)) {
2492 return KERN_INVALID_ARGUMENT;
2493 }
2494 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2495 }
2496
2497
2498 if ((cur_protection & VM_PROT_WRITE) &&
2499 (cur_protection & VM_PROT_EXECUTE) &&
2500 #if XNU_TARGET_OS_OSX
2501 map->pmap != kernel_pmap &&
2502 (cs_process_global_enforcement() ||
2503 (vmk_flags.vmkf_cs_enforcement_override
2504 ? vmk_flags.vmkf_cs_enforcement
2505 : (vm_map_cs_enforcement(map)
2506 #if __arm64__
2507 || !VM_MAP_IS_EXOTIC(map)
2508 #endif /* __arm64__ */
2509 ))) &&
2510 #endif /* XNU_TARGET_OS_OSX */
2511 (VM_MAP_POLICY_WX_FAIL(map) ||
2512 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2513 !entry_for_jit) {
2514 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2515
2516 DTRACE_VM3(cs_wx,
2517 uint64_t, 0,
2518 uint64_t, 0,
2519 vm_prot_t, cur_protection);
2520 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2521 proc_selfpid(),
2522 (current_task()->bsd_info
2523 ? proc_name_address(current_task()->bsd_info)
2524 : "?"),
2525 __FUNCTION__,
2526 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2527 cur_protection &= ~VM_PROT_EXECUTE;
2528 if (vm_protect_wx_fail) {
2529 return KERN_PROTECTION_FAILURE;
2530 }
2531 }
2532
2533 /*
2534 * If the task has requested executable lockdown,
2535 * deny any new executable mapping.
2536 */
2537 if (map->map_disallow_new_exec == TRUE) {
2538 if (cur_protection & VM_PROT_EXECUTE) {
2539 return KERN_PROTECTION_FAILURE;
2540 }
2541 }
2542
2543 if (resilient_codesign) {
2544 assert(!is_submap);
2545 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2546 if ((cur_protection | max_protection) & reject_prot) {
2547 return KERN_PROTECTION_FAILURE;
2548 }
2549 }
2550
2551 if (resilient_media) {
2552 assert(!is_submap);
2553 // assert(!needs_copy);
2554 if (object != VM_OBJECT_NULL &&
2555 !object->internal) {
2556 /*
2557 * This mapping is directly backed by an external
2558 * memory manager (e.g. a vnode pager for a file):
2559 * we would not have any safe place to inject
2560 * a zero-filled page if an actual page is not
2561 * available, without possibly impacting the actual
2562 * contents of the mapped object (e.g. the file),
2563 * so we can't provide any media resiliency here.
2564 */
2565 return KERN_INVALID_ARGUMENT;
2566 }
2567 }
2568
2569 if (is_submap) {
2570 if (purgable) {
2571 /* submaps can not be purgeable */
2572 return KERN_INVALID_ARGUMENT;
2573 }
2574 if (object == VM_OBJECT_NULL) {
2575 /* submaps can not be created lazily */
2576 return KERN_INVALID_ARGUMENT;
2577 }
2578 }
2579 if (vmk_flags.vmkf_already) {
2580 /*
2581 * VM_FLAGS_ALREADY says that it's OK if the same mapping
		 * is already present. For it to be meaningful, the requested
2583 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2584 * we shouldn't try and remove what was mapped there first
2585 * (!VM_FLAGS_OVERWRITE).
2586 */
2587 if ((flags & VM_FLAGS_ANYWHERE) ||
2588 (flags & VM_FLAGS_OVERWRITE)) {
2589 return KERN_INVALID_ARGUMENT;
2590 }
2591 }
2592
2593 if (size == 0 ||
2594 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2595 *address = 0;
2596 return KERN_INVALID_ARGUMENT;
2597 }
2598
2599 if (map->pmap == kernel_pmap) {
2600 user_alias = VM_KERN_MEMORY_NONE;
2601 } else {
2602 user_alias = alias;
2603 }
2604
2605 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2606 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2607 }
2608
2609 #define RETURN(value) { result = value; goto BailOut; }
2610
2611 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2612 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2613 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2614 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2615 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2616 }
2617
2618 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2619 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2620 /*
2621 * In most cases, the caller rounds the size up to the
2622 * map's page size.
2623 * If we get a size that is explicitly not map-aligned here,
2624 * we'll have to respect the caller's wish and mark the
2625 * mapping as "not map-aligned" to avoid tripping the
2626 * map alignment checks later.
2627 */
2628 clear_map_aligned = TRUE;
2629 }
2630 if (!anywhere &&
2631 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2632 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2633 /*
2634 * We've been asked to map at a fixed address and that
2635 * address is not aligned to the map's specific alignment.
2636 * The caller should know what it's doing (i.e. most likely
2637 * mapping some fragmented copy map, transferring memory from
2638 * a VM map with a different alignment), so clear map_aligned
2639 * for this new VM map entry and proceed.
2640 */
2641 clear_map_aligned = TRUE;
2642 }
2643
2644 /*
2645 * Only zero-fill objects are allowed to be purgable.
2646 * LP64todo - limit purgable objects to 32-bits for now
2647 */
2648 if (purgable &&
2649 (offset != 0 ||
2650 (object != VM_OBJECT_NULL &&
2651 (object->vo_size != size ||
2652 object->purgable == VM_PURGABLE_DENY))
2653 || size > ANON_MAX_SIZE)) { /* LP64todo: remove when dp capable */
2654 return KERN_INVALID_ARGUMENT;
2655 }
2656
2657 start = *address;
2658
2659 if (anywhere) {
2660 vm_map_lock(map);
2661 map_locked = TRUE;
2662
2663 if (flags & VM_FLAGS_RANDOM_ADDR) {
2664 vmk_flags.vmkf_random_address = true;
2665 }
2666
2667 /*
2668 * Default to data range for kernel_map
2669 */
2670 if (map == kernel_map) {
2671 vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
2672 }
2673
2674 result = vm_map_locate_space(map, size, mask, vmk_flags,
2675 &start, &entry);
2676 if (result != KERN_SUCCESS) {
2677 goto BailOut;
2678 }
2679
2680 *address = start;
2681 end = start + size;
2682 assert(VM_MAP_PAGE_ALIGNED(*address,
2683 VM_MAP_PAGE_MASK(map)));
2684 } else {
2685 vm_map_offset_t effective_min_offset, effective_max_offset;
2686
2687 effective_min_offset = map->min_offset;
2688 effective_max_offset = map->max_offset;
2689
2690 if (vmk_flags.vmkf_beyond_max) {
2691 /*
2692 * Allow an insertion beyond the map's max offset.
2693 */
2694 effective_max_offset = 0x00000000FFFFF000ULL;
2695 #if !defined(__arm__)
2696 if (vm_map_is_64bit(map)) {
2697 effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2698 }
2699 #endif /* __arm__ */
2700 #if XNU_TARGET_OS_OSX
2701 } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2702 effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2703 #endif /* XNU_TARGET_OS_OSX */
2704 }
2705
2706 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2707 !overwrite &&
2708 user_alias == VM_MEMORY_REALLOC) {
2709 /*
2710 * Force realloc() to switch to a new allocation,
2711 * to prevent 4k-fragmented virtual ranges.
2712 */
2713 // DEBUG4K_ERROR("no realloc in place");
2714 return KERN_NO_SPACE;
2715 }
2716
2717 /*
2718 * Verify that:
2719 * the address doesn't itself violate
2720 * the mask requirement.
2721 */
2722
2723 vm_map_lock(map);
2724 map_locked = TRUE;
2725 if ((start & mask) != 0) {
2726 RETURN(KERN_NO_SPACE);
2727 }
2728
2729 /*
2730 * ... the address is within bounds
2731 */
2732
2733 end = start + size;
2734
2735 if ((start < effective_min_offset) ||
2736 (end > effective_max_offset) ||
2737 (start >= end)) {
2738 RETURN(KERN_INVALID_ADDRESS);
2739 }
2740
2741 if (overwrite) {
2742 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
2743
2744 /*
2745 * Fixed mapping and "overwrite" flag: attempt to
2746 * remove all existing mappings in the specified
2747 * address range, saving them in our "zap_old_list".
2748 *
2749 * This avoids releasing the VM map lock in
2750 * vm_map_entry_delete() and allows atomicity
2751 * when we want to replace some mappings with a new one.
2752 * It also allows us to restore the old VM mappings if the
2753 * new mapping fails.
2754 */
2755 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2756
2757 if (vmk_flags.vmkf_overwrite_immutable) {
2758 /* we can overwrite immutable mappings */
2759 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2760 }
2761 (void)vm_map_delete(map, start, end,
2762 remove_flags, &zap_old_list);
2763 }
2764
2765 /*
2766 * ... the starting address isn't allocated
2767 */
2768
2769 if (vm_map_lookup_entry(map, start, &entry)) {
2770 if (!(vmk_flags.vmkf_already)) {
2771 RETURN(KERN_NO_SPACE);
2772 }
2773 /*
2774 * Check if what's already there is what we want.
2775 */
2776 tmp_start = start;
2777 tmp_offset = offset;
2778 if (entry->vme_start < start) {
2779 tmp_start -= start - entry->vme_start;
2780 tmp_offset -= start - entry->vme_start;
2781 }
2782 for (; entry->vme_start < end;
2783 entry = entry->vme_next) {
2784 /*
2785 * Check if the mapping's attributes
2786 * match the existing map entry.
2787 */
2788 if (entry == vm_map_to_entry(map) ||
2789 entry->vme_start != tmp_start ||
2790 entry->is_sub_map != is_submap ||
2791 VME_OFFSET(entry) != tmp_offset ||
2792 entry->needs_copy != needs_copy ||
2793 entry->protection != cur_protection ||
2794 entry->max_protection != max_protection ||
2795 entry->inheritance != inheritance ||
2796 entry->iokit_acct != iokit_acct ||
2797 VME_ALIAS(entry) != alias) {
2798 /* not the same mapping ! */
2799 RETURN(KERN_NO_SPACE);
2800 }
2801 /*
2802 * Check if the same object is being mapped.
2803 */
2804 if (is_submap) {
2805 if (VME_SUBMAP(entry) !=
2806 (vm_map_t) object) {
2807 /* not the same submap */
2808 RETURN(KERN_NO_SPACE);
2809 }
2810 } else {
2811 if (VME_OBJECT(entry) != object) {
2812 /* not the same VM object... */
2813 vm_object_t obj2;
2814
2815 obj2 = VME_OBJECT(entry);
2816 if ((obj2 == VM_OBJECT_NULL ||
2817 obj2->internal) &&
2818 (object == VM_OBJECT_NULL ||
2819 object->internal)) {
2820 /*
2821 * ... but both are
2822 * anonymous memory,
2823 * so equivalent.
2824 */
2825 } else {
2826 RETURN(KERN_NO_SPACE);
2827 }
2828 }
2829 }
2830
2831 tmp_offset += entry->vme_end - entry->vme_start;
2832 tmp_start += entry->vme_end - entry->vme_start;
2833 if (entry->vme_end >= end) {
2834 /* reached the end of our mapping */
2835 break;
2836 }
2837 }
2838 /* it all matches: let's use what's already there ! */
2839 RETURN(KERN_MEMORY_PRESENT);
2840 }
2841
2842 /*
2843 * ... the next region doesn't overlap the
2844 * end point.
2845 */
2846
2847 if ((entry->vme_next != vm_map_to_entry(map)) &&
2848 (entry->vme_next->vme_start < end)) {
2849 RETURN(KERN_NO_SPACE);
2850 }
2851 }
2852
2853 /*
2854 * At this point,
2855 * "start" and "end" should define the endpoints of the
2856 * available new range, and
2857 * "entry" should refer to the region before the new
2858 * range, and
2859 *
2860 * the map should be locked.
2861 */
2862
2863 /*
2864 * See whether we can avoid creating a new entry (and object) by
2865 * extending one of our neighbors. [So far, we only attempt to
2866 * extend from below.] Note that we can never extend/join
2867 * purgable objects because they need to remain distinct
2868 * entities in order to implement their "volatile object"
2869 * semantics.
2870 */
2871
2872 if (purgable ||
2873 entry_for_jit ||
2874 vm_memory_malloc_no_cow(user_alias)) {
2875 if (object == VM_OBJECT_NULL) {
2876 object = vm_object_allocate(size);
2877 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2878 object->true_share = FALSE;
2879 if (purgable) {
2880 task_t owner;
2881 object->purgable = VM_PURGABLE_NONVOLATILE;
2882 if (map->pmap == kernel_pmap) {
2883 /*
2884 * Purgeable mappings made in a kernel
2885 * map are "owned" by the kernel itself
2886 * rather than the current user task
2887 * because they're likely to be used by
2888 * more than this user task (see
2889 * execargs_purgeable_allocate(), for
2890 * example).
2891 */
2892 owner = kernel_task;
2893 } else {
2894 owner = current_task();
2895 }
2896 assert(object->vo_owner == NULL);
2897 assert(object->resident_page_count == 0);
2898 assert(object->wired_page_count == 0);
2899 vm_object_lock(object);
2900 vm_purgeable_nonvolatile_enqueue(object, owner);
2901 vm_object_unlock(object);
2902 }
2903 offset = (vm_object_offset_t)0;
2904 }
2905 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
2906 /* no coalescing if address space uses sub-pages */
2907 } else if ((is_submap == FALSE) &&
2908 (object == VM_OBJECT_NULL) &&
2909 (entry != vm_map_to_entry(map)) &&
2910 (entry->vme_end == start) &&
2911 (!entry->is_shared) &&
2912 (!entry->is_sub_map) &&
2913 (!entry->in_transition) &&
2914 (!entry->needs_wakeup) &&
2915 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
2916 (entry->protection == cur_protection) &&
2917 (entry->max_protection == max_protection) &&
2918 (entry->inheritance == inheritance) &&
2919 ((user_alias == VM_MEMORY_REALLOC) ||
2920 (VME_ALIAS(entry) == alias)) &&
2921 (entry->no_cache == no_cache) &&
2922 (entry->permanent == permanent) &&
2923 /* no coalescing for immutable executable mappings */
2924 !((entry->protection & VM_PROT_EXECUTE) &&
2925 entry->permanent) &&
2926 (!entry->superpage_size && !superpage_size) &&
2927 /*
2928 * No coalescing if not map-aligned, to avoid propagating
2929 * that condition any further than needed:
2930 */
2931 (!entry->map_aligned || !clear_map_aligned) &&
2932 (!entry->zero_wired_pages) &&
2933 (!entry->used_for_jit && !entry_for_jit) &&
2934 (!entry->pmap_cs_associated) &&
2935 (entry->iokit_acct == iokit_acct) &&
2936 (!entry->vme_resilient_codesign) &&
2937 (!entry->vme_resilient_media) &&
2938 (!entry->vme_atomic) &&
2939 (entry->vme_no_copy_on_read == no_copy_on_read) &&
2940
2941 ((entry->vme_end - entry->vme_start) + size <=
2942 (user_alias == VM_MEMORY_REALLOC ?
2943 ANON_CHUNK_SIZE :
2944 NO_COALESCE_LIMIT)) &&
2945
2946 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
2947 if (vm_object_coalesce(VME_OBJECT(entry),
2948 VM_OBJECT_NULL,
2949 VME_OFFSET(entry),
2950 (vm_object_offset_t) 0,
2951 (vm_map_size_t)(entry->vme_end - entry->vme_start),
2952 (vm_map_size_t)(end - entry->vme_end))) {
2953 /*
2954 * Coalesced the two objects - can extend
2955 * the previous map entry to include the
2956 * new range.
2957 */
2958 map->size += (end - entry->vme_end);
2959 assert(entry->vme_start < end);
2960 assert(VM_MAP_PAGE_ALIGNED(end,
2961 VM_MAP_PAGE_MASK(map)));
2962 if (__improbable(vm_debug_events)) {
2963 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
2964 }
2965 entry->vme_end = end;
2966 if (map->holelistenabled) {
2967 vm_map_store_update_first_free(map, entry, TRUE);
2968 } else {
2969 vm_map_store_update_first_free(map, map->first_free, TRUE);
2970 }
2971 new_mapping_established = TRUE;
2972 RETURN(KERN_SUCCESS);
2973 }
2974 }
2975
2976 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
2977 new_entry = NULL;
2978
2979 if (vmk_flags.vmkf_submap_adjust) {
2980 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
2981 offset = start;
2982 }
2983
2984 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
2985 tmp2_end = tmp2_start + step;
2986 /*
2987 * Create a new entry
2988 *
2989 * XXX FBDP
2990 * The reserved "page zero" in each process's address space can
2991 * be arbitrarily large. Splitting it into separate objects and
2992 * therefore different VM map entries serves no purpose and just
2993 * slows down operations on the VM map, so let's not split the
2994 * allocation into chunks if the max protection is NONE. That
2995 * memory should never be accessible, so it will never get to the
2996 * default pager.
2997 */
2998 tmp_start = tmp2_start;
2999 if (object == VM_OBJECT_NULL &&
3000 size > chunk_size &&
3001 max_protection != VM_PROT_NONE &&
3002 superpage_size == 0) {
3003 tmp_end = tmp_start + chunk_size;
3004 } else {
3005 tmp_end = tmp2_end;
3006 }
3007 do {
3008 if (!is_submap &&
3009 object != VM_OBJECT_NULL &&
3010 object->internal &&
3011 offset + (tmp_end - tmp_start) > object->vo_size) {
3012 // printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3013 DTRACE_VM5(vm_map_enter_overmap,
3014 vm_map_t, map,
3015 vm_map_address_t, tmp_start,
3016 vm_map_address_t, tmp_end,
3017 vm_object_offset_t, offset,
3018 vm_object_size_t, object->vo_size);
3019 }
3020 new_entry = vm_map_entry_insert(map,
3021 entry, tmp_start, tmp_end,
3022 object, offset, vmk_flags,
3023 needs_copy,
3024 cur_protection, max_protection,
3025 (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3026 VM_INHERIT_NONE : inheritance),
3027 no_cache,
3028 permanent,
3029 no_copy_on_read,
3030 superpage_size,
3031 clear_map_aligned,
3032 is_submap,
3033 entry_for_jit,
3034 alias,
3035 translated_allow_execute);
3036
3037 assert((object != kernel_object) || (VM_KERN_MEMORY_NONE != alias));
3038
3039 if (resilient_codesign) {
3040 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3041 if (!((cur_protection | max_protection) & reject_prot)) {
3042 new_entry->vme_resilient_codesign = TRUE;
3043 }
3044 }
3045
3046 if (resilient_media &&
3047 (object == VM_OBJECT_NULL ||
3048 object->internal)) {
3049 new_entry->vme_resilient_media = TRUE;
3050 }
3051
3052 assert(!new_entry->iokit_acct);
3053 if (!is_submap &&
3054 object != VM_OBJECT_NULL &&
3055 (object->purgable != VM_PURGABLE_DENY ||
3056 object->vo_ledger_tag)) {
3057 assert(new_entry->use_pmap);
3058 assert(!new_entry->iokit_acct);
3059 /*
3060 * Turn off pmap accounting since
3061 * purgeable (or tagged) objects have their
3062 * own ledgers.
3063 */
3064 new_entry->use_pmap = FALSE;
3065 } else if (!is_submap &&
3066 iokit_acct &&
3067 object != VM_OBJECT_NULL &&
3068 object->internal) {
3069 /* alternate accounting */
3070 assert(!new_entry->iokit_acct);
3071 assert(new_entry->use_pmap);
3072 new_entry->iokit_acct = TRUE;
3073 new_entry->use_pmap = FALSE;
3074 DTRACE_VM4(
3075 vm_map_iokit_mapped_region,
3076 vm_map_t, map,
3077 vm_map_offset_t, new_entry->vme_start,
3078 vm_map_offset_t, new_entry->vme_end,
3079 int, VME_ALIAS(new_entry));
3080 vm_map_iokit_mapped_region(
3081 map,
3082 (new_entry->vme_end -
3083 new_entry->vme_start));
3084 } else if (!is_submap) {
3085 assert(!new_entry->iokit_acct);
3086 assert(new_entry->use_pmap);
3087 }
3088
3089 if (is_submap) {
3090 vm_map_t submap;
3091 boolean_t submap_is_64bit;
3092 boolean_t use_pmap;
3093
3094 assert(new_entry->is_sub_map);
3095 assert(!new_entry->use_pmap);
3096 assert(!new_entry->iokit_acct);
3097 submap = (vm_map_t) object;
3098 submap_is_64bit = vm_map_is_64bit(submap);
3099 use_pmap = vmk_flags.vmkf_nested_pmap;
3100 #ifndef NO_NESTED_PMAP
3101 if (use_pmap && submap->pmap == NULL) {
3102 ledger_t ledger = map->pmap->ledger;
3103 /* we need a sub pmap to nest... */
3104 submap->pmap = pmap_create_options(ledger, 0,
3105 submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3106 if (submap->pmap == NULL) {
3107 /* let's proceed without nesting... */
3108 }
3109 #if defined(__arm__) || defined(__arm64__)
3110 else {
3111 pmap_set_nested(submap->pmap);
3112 }
3113 #endif
3114 }
3115 if (use_pmap && submap->pmap != NULL) {
3116 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3117 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3118 kr = KERN_FAILURE;
3119 } else {
3120 kr = pmap_nest(map->pmap,
3121 submap->pmap,
3122 tmp_start,
3123 tmp_end - tmp_start);
3124 }
3125 if (kr != KERN_SUCCESS) {
3126 printf("vm_map_enter: "
3127 "pmap_nest(0x%llx,0x%llx) "
3128 "error 0x%x\n",
3129 (long long)tmp_start,
3130 (long long)tmp_end,
3131 kr);
3132 } else {
3133 /* we're now nested ! */
3134 new_entry->use_pmap = TRUE;
3135 pmap_empty = FALSE;
3136 }
3137 }
3138 #endif /* NO_NESTED_PMAP */
3139 }
3140 entry = new_entry;
3141
3142 if (superpage_size) {
3143 vm_page_t pages, m;
3144 vm_object_t sp_object;
3145 vm_object_offset_t sp_offset;
3146
3147 VME_OFFSET_SET(entry, 0);
3148
3149 /* allocate one superpage */
3150 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3151 if (kr != KERN_SUCCESS) {
3152 /* deallocate whole range... */
3153 new_mapping_established = TRUE;
3154 /* ... but only up to "tmp_end" */
3155 size -= end - tmp_end;
3156 RETURN(kr);
3157 }
3158
3159 /* create one vm_object per superpage */
3160 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3161 sp_object->phys_contiguous = TRUE;
3162 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3163 VME_OBJECT_SET(entry, sp_object);
3164 assert(entry->use_pmap);
3165
3166 /* enter the base pages into the object */
3167 vm_object_lock(sp_object);
3168 for (sp_offset = 0;
3169 sp_offset < SUPERPAGE_SIZE;
3170 sp_offset += PAGE_SIZE) {
3171 m = pages;
3172 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3173 pages = NEXT_PAGE(m);
3174 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3175 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3176 }
3177 vm_object_unlock(sp_object);
3178 }
3179 } while (tmp_end != tmp2_end &&
3180 (tmp_start = tmp_end) &&
3181 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3182 tmp_end + chunk_size : tmp2_end));
3183 }
3184
3185 new_mapping_established = TRUE;
3186
3187 BailOut:
3188 assert(map_locked == TRUE);
3189
3190 /*
3191 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3192 * If we have identified and possibly established the new mapping(s),
3193 * make sure we did not go beyond the address space limit.
3194 */
3195 if (result == KERN_SUCCESS) {
3196 if (map->size_limit != RLIM_INFINITY &&
3197 map->size > map->size_limit) {
3198 /*
3199 * Establishing the requested mappings would exceed
3200 * the process's RLIMIT_AS limit: fail with
3201 * KERN_NO_SPACE.
3202 */
3203 result = KERN_NO_SPACE;
3204 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3205 proc_selfpid(),
3206 (current_task()->bsd_info
3207 ? proc_name_address(current_task()->bsd_info)
3208 : "?"),
3209 __FUNCTION__,
3210 (uint64_t) map->size,
3211 (uint64_t) map->size_limit);
3212 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3213 vm_map_size_t, map->size,
3214 uint64_t, map->size_limit);
3215 vm_map_enter_RLIMIT_AS_count++;
3216 } else if (map->data_limit != RLIM_INFINITY &&
3217 map->size > map->data_limit) {
3218 /*
3219 * Establishing the requested mappings would exceed
3220 * the process's RLIMIT_DATA limit: fail with
3221 * KERN_NO_SPACE.
3222 */
3223 result = KERN_NO_SPACE;
3224 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3225 proc_selfpid(),
3226 (current_task()->bsd_info
3227 ? proc_name_address(current_task()->bsd_info)
3228 : "?"),
3229 __FUNCTION__,
3230 (uint64_t) map->size,
3231 (uint64_t) map->data_limit);
3232 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3233 vm_map_size_t, map->size,
3234 uint64_t, map->data_limit);
3235 vm_map_enter_RLIMIT_DATA_count++;
3236 }
3237 }
3238
3239 if (result == KERN_SUCCESS) {
3240 vm_prot_t pager_prot;
3241 memory_object_t pager;
3242
3243 #if DEBUG
3244 if (pmap_empty &&
3245 !(vmk_flags.vmkf_no_pmap_check)) {
3246 assert(pmap_is_empty(map->pmap,
3247 *address,
3248 *address + size));
3249 }
3250 #endif /* DEBUG */
3251
3252 /*
3253 * For "named" VM objects, let the pager know that the
3254 * memory object is being mapped. Some pagers need to keep
3255 * track of this, to know when they can reclaim the memory
3256 * object, for example.
3257 * VM calls memory_object_map() for each mapping (specifying
3258 * the protection of each mapping) and calls
3259 * memory_object_last_unmap() when all the mappings are gone.
3260 */
3261 pager_prot = max_protection;
3262 if (needs_copy) {
3263 /*
3264 * Copy-On-Write mapping: won't modify
3265 * the memory object.
3266 */
3267 pager_prot &= ~VM_PROT_WRITE;
3268 }
3269 if (!is_submap &&
3270 object != VM_OBJECT_NULL &&
3271 object->named &&
3272 object->pager != MEMORY_OBJECT_NULL) {
3273 vm_object_lock(object);
3274 pager = object->pager;
3275 if (object->named &&
3276 pager != MEMORY_OBJECT_NULL) {
3277 assert(object->pager_ready);
3278 vm_object_mapping_wait(object, THREAD_UNINT);
3279 vm_object_mapping_begin(object);
3280 vm_object_unlock(object);
3281
3282 kr = memory_object_map(pager, pager_prot);
3283 assert(kr == KERN_SUCCESS);
3284
3285 vm_object_lock(object);
3286 vm_object_mapping_end(object);
3287 }
3288 vm_object_unlock(object);
3289 }
3290 }
3291
3292 assert(map_locked == TRUE);
3293
3294 if (!keep_map_locked) {
3295 vm_map_unlock(map);
3296 map_locked = FALSE;
3297 }
3298
3299 /*
3300 * We can't hold the map lock if we enter this block.
3301 */
3302
3303 if (result == KERN_SUCCESS) {
3304 /* Wire down the new entry if the user
3305 * requested all new map entries be wired.
3306 */
3307 if ((map->wiring_required) || (superpage_size)) {
3308 assert(!keep_map_locked);
3309 pmap_empty = FALSE; /* pmap won't be empty */
3310 kr = vm_map_wire_kernel(map, start, end,
3311 new_entry->protection, VM_KERN_MEMORY_MLOCK,
3312 TRUE);
3313 result = kr;
3314 }
3315
3316 }
3317
3318 if (result != KERN_SUCCESS) {
3319 if (new_mapping_established) {
3320 /*
3321 * The caller had an extra reference on the VM object
3322 * it gave us.
3323 * We've transferred that reference to the mapping we
3324 * just established but we're about to undo that mapping
3325 * and release that reference.
3326 * The caller expects its reference to be consumed on
3327 * success only, so we have to get the extra reference
3328 * back for the caller.
3329 */
3330 vm_object_reference(caller_object);
3331
3332 /*
3333 * We have to get rid of the new mappings since we
3334 * won't make them available to the user.
3335 * Try and do that atomically, to minimize the risk
3336 * that someone else create new mappings that range.
3337 */
3338
3339 if (!map_locked) {
3340 vm_map_lock(map);
3341 map_locked = TRUE;
3342 }
3343 (void)vm_map_delete(map, *address, *address + size,
3344 VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
3345 &zap_new_list);
3346 }
3347
3348 if (vm_map_zap_first_entry(&zap_old_list)) {
3349 vm_map_entry_t entry1, entry2;
3350
3351 /*
3352 * The new mapping failed. Attempt to restore
3353 * the old mappings, saved in the "zap_old_map".
3354 */
3355 if (!map_locked) {
3356 vm_map_lock(map);
3357 map_locked = TRUE;
3358 }
3359
3360 /* first check if the coast is still clear */
3361 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3362 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3363
3364 if (vm_map_lookup_entry(map, start, &entry1) ||
3365 vm_map_lookup_entry(map, end, &entry2) ||
3366 entry1 != entry2) {
3367 /*
3368 * Part of that range has already been
3369 * re-mapped: we can't restore the old
3370 * mappings...
3371 */
3372 vm_map_enter_restore_failures++;
3373 } else {
3374 /*
3375 * Transfer the saved map entries from
3376 * "zap_old_map" to the original "map",
3377 * inserting them all after "entry1".
3378 */
3379 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3380 vm_map_size_t entry_size;
3381
3382 entry_size = (entry2->vme_end -
3383 entry2->vme_start);
3384 vm_map_store_entry_link(map, entry1, entry2,
3385 VM_MAP_KERNEL_FLAGS_NONE);
3386 map->size += entry_size;
3387 entry1 = entry2;
3388 }
3389 if (map->wiring_required) {
3390 /*
3391 * XXX TODO: we should rewire the
3392 * old pages here...
3393 */
3394 }
3395 vm_map_enter_restore_successes++;
3396 }
3397 }
3398 }
3399
3400 /*
3401 * The caller is responsible for releasing the lock if it requested to
3402 * keep the map locked.
3403 */
3404 if (map_locked && !keep_map_locked) {
3405 vm_map_unlock(map);
3406 }
3407
3408 vm_map_zap_dispose(&zap_old_list);
3409 vm_map_zap_dispose(&zap_new_list);
3410
3411 return result;
3412
3413 #undef RETURN
3414 }
3415
3416 #if __arm64__
3417 extern const struct memory_object_pager_ops fourk_pager_ops;
/*
 * vm_map_enter_fourk:
 *
 *	Establish a fixed-address mapping with 4K granularity in a map whose
 *	native page size is larger (the SIXTEENK_* constants and the clamp to
 *	4 sub-pages below assume a 16K native page).  The range is backed by
 *	a "fourk" pager that multiplexes up to four 4K sub-pages per native
 *	page; the original "object" is installed into the relevant sub-page
 *	slots of that pager via fourk_pager_populate().
 *
 *	This path intentionally supports only a subset of vm_map_enter():
 *	no VM_FLAGS_ANYWHERE, no overwrite, no submaps, no "already", no
 *	purgable, no JIT, no superpages (each rejected with
 *	KERN_NOT_SUPPORTED below).
 *
 *	On success the caller's object reference has been consumed by the
 *	pager population loop; on failure any partially established mapping
 *	is zapped in the BailOut path.
 */
kern_return_t
vm_map_enter_fourk(
	vm_map_t map,
	vm_map_offset_t *address,       /* IN/OUT */
	vm_map_size_t size,
	vm_map_offset_t mask,
	int flags,
	vm_map_kernel_flags_t vmk_flags,
	vm_tag_t alias,
	vm_object_t object,
	vm_object_offset_t offset,
	boolean_t needs_copy,
	vm_prot_t cur_protection,
	vm_prot_t max_protection,
	vm_inherit_t inheritance)
{
	vm_map_entry_t entry, new_entry;
	vm_map_offset_t start, fourk_start;
	vm_map_offset_t end, fourk_end;
	vm_map_size_t fourk_size;
	kern_return_t result = KERN_SUCCESS;
	boolean_t map_locked = FALSE;
	boolean_t pmap_empty = TRUE;
	boolean_t new_mapping_established = FALSE;
	boolean_t keep_map_locked = vmk_flags.vmkf_keep_map_locked;
	boolean_t anywhere = ((flags & VM_FLAGS_ANYWHERE) != 0);
	boolean_t purgable = ((flags & VM_FLAGS_PURGABLE) != 0);
	boolean_t overwrite = ((flags & VM_FLAGS_OVERWRITE) != 0);
	boolean_t no_cache = ((flags & VM_FLAGS_NO_CACHE) != 0);
	boolean_t is_submap = vmk_flags.vmkf_submap;
	boolean_t permanent = (((flags & VM_FLAGS_PERMANENT) != 0) || vmk_flags.vmkf_permanent);
	boolean_t no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
	boolean_t entry_for_jit = vmk_flags.vmkf_map_jit;
	// boolean_t iokit_acct = vmk_flags.vmkf_iokit_acct;
	boolean_t translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
	unsigned int superpage_size = ((flags & VM_FLAGS_SUPERPAGE_MASK) >> VM_FLAGS_SUPERPAGE_SHIFT);
	vm_map_offset_t effective_min_offset, effective_max_offset;
	kern_return_t kr;
	boolean_t clear_map_aligned = FALSE;
	memory_object_t fourk_mem_obj;
	vm_object_t fourk_object;
	vm_map_offset_t fourk_pager_offset;
	int fourk_pager_index_start, fourk_pager_index_num;
	int cur_idx;
	boolean_t fourk_copy;
	vm_object_t copy_object;
	vm_object_offset_t copy_offset;
	VM_MAP_ZAP_DECLARE(zap_list);

	/*
	 * 4K sub-page mappings only make sense when the map's page size is
	 * at least the native kernel page size.
	 */
	if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
		panic("%s:%d", __FUNCTION__, __LINE__);
	}
	fourk_mem_obj = MEMORY_OBJECT_NULL;
	fourk_object = VM_OBJECT_NULL;

	if (superpage_size) {
		return KERN_NOT_SUPPORTED;
	}

	/*
	 * Code-signing W^X enforcement: strip execute permission from
	 * write+execute requests (except for JIT entries, and subject to the
	 * platform conditions below), mirroring the policy in vm_map_enter().
	 */
	if ((cur_protection & VM_PROT_WRITE) &&
	    (cur_protection & VM_PROT_EXECUTE) &&
#if XNU_TARGET_OS_OSX
	    map->pmap != kernel_pmap &&
	    (vm_map_cs_enforcement(map)
#if __arm64__
	    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
	    ) &&
#endif /* XNU_TARGET_OS_OSX */
	    !entry_for_jit) {
		DTRACE_VM3(cs_wx,
		    uint64_t, 0,
		    uint64_t, 0,
		    vm_prot_t, cur_protection);
		printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
		    "turning off execute\n",
		    proc_selfpid(),
		    (current_task()->bsd_info
		    ? proc_name_address(current_task()->bsd_info)
		    : "?"),
		    __FUNCTION__);
		cur_protection &= ~VM_PROT_EXECUTE;
	}

	/*
	 * If the task has requested executable lockdown,
	 * deny any new executable mapping.
	 */
	if (map->map_disallow_new_exec == TRUE) {
		if (cur_protection & VM_PROT_EXECUTE) {
			return KERN_PROTECTION_FAILURE;
		}
	}

	/* unsupported combinations for the "fourk" path */
	if (is_submap) {
		return KERN_NOT_SUPPORTED;
	}
	if (vmk_flags.vmkf_already) {
		return KERN_NOT_SUPPORTED;
	}
	if (purgable || entry_for_jit) {
		return KERN_NOT_SUPPORTED;
	}

	effective_min_offset = map->min_offset;

	if (vmk_flags.vmkf_beyond_max) {
		return KERN_NOT_SUPPORTED;
	} else {
		effective_max_offset = map->max_offset;
	}

	/* the backing offset must be 4K-aligned and the size non-zero */
	if (size == 0 ||
	    (offset & FOURK_PAGE_MASK) != 0) {
		*address = 0;
		return KERN_INVALID_ARGUMENT;
	}

/* once the map is locked, failures funnel through BailOut for cleanup */
#define RETURN(value) { result = value; goto BailOut; }

	assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
	assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));

	if (!anywhere && overwrite) {
		return KERN_NOT_SUPPORTED;
	}

	/* remember the 4K-granularity range the caller asked for... */
	fourk_start = *address;
	fourk_size = size;
	fourk_end = fourk_start + fourk_size;

	/* ...and widen it to the map's native page granularity */
	start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
	end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
	size = end - start;

	if (anywhere) {
		return KERN_NOT_SUPPORTED;
	} else {
		/*
		 * Verify that:
		 * the address doesn't itself violate
		 * the mask requirement.
		 */

		vm_map_lock(map);
		map_locked = TRUE;
		if ((start & mask) != 0) {
			RETURN(KERN_NO_SPACE);
		}

		/*
		 * ... the address is within bounds
		 */

		end = start + size;

		if ((start < effective_min_offset) ||
		    (end > effective_max_offset) ||
		    (start >= end)) {
			RETURN(KERN_INVALID_ADDRESS);
		}

		/*
		 * ... the starting address isn't allocated
		 */
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_object_t cur_object, shadow_object;

			/*
			 * We might already have some 4K mappings
			 * in a 16K page here: only proceed if the
			 * existing entry is exactly one 16K page
			 * backed by a "fourk" pager we can reuse.
			 */

			if (entry->vme_end - entry->vme_start
			    != SIXTEENK_PAGE_SIZE) {
				RETURN(KERN_NO_SPACE);
			}
			if (entry->is_sub_map) {
				RETURN(KERN_NO_SPACE);
			}
			if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
				RETURN(KERN_NO_SPACE);
			}

			/* go all the way down the shadow chain */
			cur_object = VME_OBJECT(entry);
			vm_object_lock(cur_object);
			while (cur_object->shadow != VM_OBJECT_NULL) {
				/* lock child before releasing parent (hand-over-hand) */
				shadow_object = cur_object->shadow;
				vm_object_lock(shadow_object);
				vm_object_unlock(cur_object);
				cur_object = shadow_object;
				shadow_object = VM_OBJECT_NULL;
			}
			/* the bottom object must be a pager-backed "fourk" object */
			if (cur_object->internal ||
			    cur_object->pager == NULL) {
				vm_object_unlock(cur_object);
				RETURN(KERN_NO_SPACE);
			}
			if (cur_object->pager->mo_pager_ops
			    != &fourk_pager_ops) {
				vm_object_unlock(cur_object);
				RETURN(KERN_NO_SPACE);
			}
			fourk_object = cur_object;
			fourk_mem_obj = fourk_object->pager;

			/* keep the "4K" object alive */
			vm_object_reference_locked(fourk_object);
			memory_object_reference(fourk_mem_obj);
			vm_object_unlock(fourk_object);

			/* merge permissions */
			entry->protection |= cur_protection;
			entry->max_protection |= max_protection;

			if ((entry->protection & VM_PROT_WRITE) &&
			    (entry->protection & VM_PROT_ALLEXEC) &&
			    fourk_binary_compatibility_unsafe &&
			    fourk_binary_compatibility_allow_wx) {
				/* write+execute: need to be "jit" */
				entry->used_for_jit = TRUE;
			}
			/* reuse the existing pager; skip entry creation */
			goto map_in_fourk_pager;
		}

		/*
		 * ... the next region doesn't overlap the
		 * end point.
		 */

		if ((entry->vme_next != vm_map_to_entry(map)) &&
		    (entry->vme_next->vme_start < end)) {
			RETURN(KERN_NO_SPACE);
		}
	}

	/*
	 * At this point,
	 * "start" and "end" should define the endpoints of the
	 * available new range, and
	 * "entry" should refer to the region before the new
	 * range, and
	 *
	 * the map should be locked.
	 */

	/* create a new "4K" pager */
	fourk_mem_obj = fourk_pager_create();
	fourk_object = fourk_pager_to_vm_object(fourk_mem_obj);
	assert(fourk_object);

	/* keep the "4K" object alive */
	vm_object_reference(fourk_object);

	/* create a "copy" object, to map the "4K" object copy-on-write */
	fourk_copy = TRUE;
	result = vm_object_copy_strategically(fourk_object,
	    0,
	    end - start,
	    &copy_object,
	    &copy_offset,
	    &fourk_copy);
	assert(result == KERN_SUCCESS);
	assert(copy_object != VM_OBJECT_NULL);
	assert(copy_offset == 0);

	/* map the "4K" pager's copy object */
	new_entry = vm_map_entry_insert(map,
	    entry,
	    vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
	    vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
	    copy_object,
	    0, /* offset */
	    vmk_flags,
	    FALSE, /* needs_copy */
	    cur_protection, max_protection,
	    (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
	    VM_INHERIT_NONE : inheritance),
	    no_cache,
	    permanent,
	    no_copy_on_read,
	    superpage_size,
	    clear_map_aligned,
	    is_submap,
	    FALSE, /* jit */
	    alias,
	    translated_allow_execute);
	entry = new_entry;

#if VM_MAP_DEBUG_FOURK
	if (vm_map_debug_fourk) {
		printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
		    map,
		    (uint64_t) entry->vme_start,
		    (uint64_t) entry->vme_end,
		    fourk_mem_obj);
	}
#endif /* VM_MAP_DEBUG_FOURK */

	new_mapping_established = TRUE;

map_in_fourk_pager:
	/* "map" the original "object" where it belongs in the "4K" pager */
	fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
	fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
	if (fourk_size > SIXTEENK_PAGE_SIZE) {
		fourk_pager_index_num = 4;
	} else {
		fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
	}
	/* never populate past the pager's 4 sub-page slots */
	if (fourk_pager_index_start + fourk_pager_index_num > 4) {
		fourk_pager_index_num = 4 - fourk_pager_index_start;
	}
	for (cur_idx = 0;
	    cur_idx < fourk_pager_index_num;
	    cur_idx++) {
		vm_object_t old_object;
		vm_object_offset_t old_offset;

		/* install "object" in this 4K slot, displacing any previous backing */
		kr = fourk_pager_populate(fourk_mem_obj,
		    TRUE, /* overwrite */
		    fourk_pager_index_start + cur_idx,
		    object,
		    (object
		    ? (offset +
		    (cur_idx * FOURK_PAGE_SIZE))
		    : 0),
		    &old_object,
		    &old_offset);
#if VM_MAP_DEBUG_FOURK
		if (vm_map_debug_fourk) {
			if (old_object == (vm_object_t) -1 &&
			    old_offset == (vm_object_offset_t) -1) {
				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
				    "pager [%p:0x%llx] "
				    "populate[%d] "
				    "[object:%p,offset:0x%llx]\n",
				    map,
				    (uint64_t) entry->vme_start,
				    (uint64_t) entry->vme_end,
				    fourk_mem_obj,
				    VME_OFFSET(entry),
				    fourk_pager_index_start + cur_idx,
				    object,
				    (object
				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
				    : 0));
			} else {
				printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
				    "pager [%p:0x%llx] "
				    "populate[%d] [object:%p,offset:0x%llx] "
				    "old [%p:0x%llx]\n",
				    map,
				    (uint64_t) entry->vme_start,
				    (uint64_t) entry->vme_end,
				    fourk_mem_obj,
				    VME_OFFSET(entry),
				    fourk_pager_index_start + cur_idx,
				    object,
				    (object
				    ? (offset + (cur_idx * FOURK_PAGE_SIZE))
				    : 0),
				    old_object,
				    old_offset);
			}
		}
#endif /* VM_MAP_DEBUG_FOURK */

		assert(kr == KERN_SUCCESS);
		/*
		 * Adjust references: the pager now holds a reference on
		 * "object" and has dropped its claim on "old_object"
		 * ((vm_object_t) -1 means "slot was empty / unchanged").
		 */
		if (object != old_object &&
		    object != VM_OBJECT_NULL &&
		    object != (vm_object_t) -1) {
			vm_object_reference(object);
		}
		if (object != old_object &&
		    old_object != VM_OBJECT_NULL &&
		    old_object != (vm_object_t) -1) {
			vm_object_deallocate(old_object);
		}
	}

BailOut:
	assert(map_locked == TRUE);

	if (result == KERN_SUCCESS) {
		vm_prot_t pager_prot;
		memory_object_t pager;

#if DEBUG
		if (pmap_empty &&
		    !(vmk_flags.vmkf_no_pmap_check)) {
			assert(pmap_is_empty(map->pmap,
			    *address,
			    *address + size));
		}
#endif /* DEBUG */

		/*
		 * For "named" VM objects, let the pager know that the
		 * memory object is being mapped. Some pagers need to keep
		 * track of this, to know when they can reclaim the memory
		 * object, for example.
		 * VM calls memory_object_map() for each mapping (specifying
		 * the protection of each mapping) and calls
		 * memory_object_last_unmap() when all the mappings are gone.
		 */
		pager_prot = max_protection;
		if (needs_copy) {
			/*
			 * Copy-On-Write mapping: won't modify
			 * the memory object.
			 */
			pager_prot &= ~VM_PROT_WRITE;
		}
		if (!is_submap &&
		    object != VM_OBJECT_NULL &&
		    object->named &&
		    object->pager != MEMORY_OBJECT_NULL) {
			vm_object_lock(object);
			pager = object->pager;
			/* re-check under the object lock */
			if (object->named &&
			    pager != MEMORY_OBJECT_NULL) {
				assert(object->pager_ready);
				vm_object_mapping_wait(object, THREAD_UNINT);
				vm_object_mapping_begin(object);
				vm_object_unlock(object);

				kr = memory_object_map(pager, pager_prot);
				assert(kr == KERN_SUCCESS);

				vm_object_lock(object);
				vm_object_mapping_end(object);
			}
			vm_object_unlock(object);
		}
		/* same notification for the "4K" object (mapped read-only) */
		if (!is_submap &&
		    fourk_object != VM_OBJECT_NULL &&
		    fourk_object->named &&
		    fourk_object->pager != MEMORY_OBJECT_NULL) {
			vm_object_lock(fourk_object);
			pager = fourk_object->pager;
			if (fourk_object->named &&
			    pager != MEMORY_OBJECT_NULL) {
				assert(fourk_object->pager_ready);
				vm_object_mapping_wait(fourk_object,
				    THREAD_UNINT);
				vm_object_mapping_begin(fourk_object);
				vm_object_unlock(fourk_object);

				kr = memory_object_map(pager, VM_PROT_READ);
				assert(kr == KERN_SUCCESS);

				vm_object_lock(fourk_object);
				vm_object_mapping_end(fourk_object);
			}
			vm_object_unlock(fourk_object);
		}
	}

	/* drop the references taken above to keep the "4K" object alive */
	if (fourk_object != VM_OBJECT_NULL) {
		vm_object_deallocate(fourk_object);
		fourk_object = VM_OBJECT_NULL;
		memory_object_deallocate(fourk_mem_obj);
		fourk_mem_obj = MEMORY_OBJECT_NULL;
	}

	assert(map_locked == TRUE);

	if (!keep_map_locked) {
		vm_map_unlock(map);
		map_locked = FALSE;
	}

	/*
	 * We can't hold the map lock if we enter this block.
	 */

	if (result == KERN_SUCCESS) {
		/* Wire down the new entry if the user
		 * requested all new map entries be wired.
		 */
		if ((map->wiring_required) || (superpage_size)) {
			assert(!keep_map_locked);
			pmap_empty = FALSE; /* pmap won't be empty */
			kr = vm_map_wire_kernel(map, start, end,
			    new_entry->protection, VM_KERN_MEMORY_MLOCK,
			    TRUE);
			result = kr;
		}
	}

	if (result != KERN_SUCCESS) {
		if (new_mapping_established) {
			/*
			 * We have to get rid of the new mappings since we
			 * won't make them available to the user.
			 * Try and do that atomically, to minimize the risk
			 * that someone else create new mappings that range.
			 */

			if (!map_locked) {
				vm_map_lock(map);
				map_locked = TRUE;
			}
			(void)vm_map_delete(map, *address, *address + size,
			    VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
			    &zap_list);
		}
	}

	/*
	 * The caller is responsible for releasing the lock if it requested to
	 * keep the map locked.
	 */
	if (map_locked && !keep_map_locked) {
		vm_map_unlock(map);
	}

	vm_map_zap_dispose(&zap_list);

	return result;

#undef RETURN
}
3944 #endif /* __arm64__ */
3945
/*
 * Counters for the prefault optimization.
 * NOTE(review): maintained by the mem-object mapping path below (not fully
 * visible here); presumably "nb_pages" counts pages prefaulted and
 * "nb_bailout" counts prefault loops abandoned early — confirm against the
 * code that increments them.
 */
int64_t vm_prefault_nb_pages = 0;
int64_t vm_prefault_nb_bailout = 0;
3951
3952 static kern_return_t
vm_map_enter_mem_object_helper(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,upl_page_list_ptr_t page_list,unsigned int page_list_count)3953 vm_map_enter_mem_object_helper(
3954 vm_map_t target_map,
3955 vm_map_offset_t *address,
3956 vm_map_size_t initial_size,
3957 vm_map_offset_t mask,
3958 int flags,
3959 vm_map_kernel_flags_t vmk_flags,
3960 vm_tag_t tag,
3961 ipc_port_t port,
3962 vm_object_offset_t offset,
3963 boolean_t copy,
3964 vm_prot_t cur_protection,
3965 vm_prot_t max_protection,
3966 vm_inherit_t inheritance,
3967 upl_page_list_ptr_t page_list,
3968 unsigned int page_list_count)
3969 {
3970 vm_map_address_t map_addr;
3971 vm_map_size_t map_size;
3972 vm_object_t object;
3973 vm_object_size_t size;
3974 kern_return_t result;
3975 boolean_t mask_cur_protection, mask_max_protection;
3976 boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
3977 vm_map_offset_t offset_in_mapping = 0;
3978 #if __arm64__
3979 boolean_t fourk = vmk_flags.vmkf_fourk;
3980 #endif /* __arm64__ */
3981
3982 if (VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
3983 /* XXX TODO4K prefaulting depends on page size... */
3984 try_prefault = FALSE;
3985 }
3986
3987 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
3988
3989 mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
3990 mask_max_protection = max_protection & VM_PROT_IS_MASK;
3991 cur_protection &= ~VM_PROT_IS_MASK;
3992 max_protection &= ~VM_PROT_IS_MASK;
3993
3994 /*
3995 * Check arguments for validity
3996 */
3997 if ((target_map == VM_MAP_NULL) ||
3998 (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
3999 (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4000 (inheritance > VM_INHERIT_LAST_VALID) ||
4001 (try_prefault && (copy || !page_list)) ||
4002 initial_size == 0) {
4003 return KERN_INVALID_ARGUMENT;
4004 }
4005
4006 /*
4007 * Redirect to kmem_ranges[data]
4008 */
4009 if (target_map == kernel_map) {
4010 vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
4011 }
4012
4013 #if __arm64__
4014 if (fourk && VM_MAP_PAGE_SHIFT(target_map) < PAGE_SHIFT) {
4015 /* no "fourk" if map is using a sub-page page size */
4016 fourk = FALSE;
4017 }
4018 if (fourk) {
4019 map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4020 map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4021 } else
4022 #endif /* __arm64__ */
4023 {
4024 map_addr = vm_map_trunc_page(*address,
4025 VM_MAP_PAGE_MASK(target_map));
4026 map_size = vm_map_round_page(initial_size,
4027 VM_MAP_PAGE_MASK(target_map));
4028 }
4029 size = vm_object_round_page(initial_size);
4030
4031 /*
4032 * Find the vm object (if any) corresponding to this port.
4033 */
4034 if (!IP_VALID(port)) {
4035 object = VM_OBJECT_NULL;
4036 offset = 0;
4037 copy = FALSE;
4038 } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4039 vm_named_entry_t named_entry;
4040 vm_object_offset_t data_offset;
4041
4042 named_entry = mach_memory_entry_from_port(port);
4043
4044 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4045 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4046 data_offset = named_entry->data_offset;
4047 offset += named_entry->data_offset;
4048 } else {
4049 data_offset = 0;
4050 }
4051
4052 /* a few checks to make sure user is obeying rules */
4053 if (size == 0) {
4054 if (offset >= named_entry->size) {
4055 return KERN_INVALID_RIGHT;
4056 }
4057 size = named_entry->size - offset;
4058 }
4059 if (mask_max_protection) {
4060 max_protection &= named_entry->protection;
4061 }
4062 if (mask_cur_protection) {
4063 cur_protection &= named_entry->protection;
4064 }
4065 if ((named_entry->protection & max_protection) !=
4066 max_protection) {
4067 return KERN_INVALID_RIGHT;
4068 }
4069 if ((named_entry->protection & cur_protection) !=
4070 cur_protection) {
4071 return KERN_INVALID_RIGHT;
4072 }
4073 if (offset + size < offset) {
4074 /* overflow */
4075 return KERN_INVALID_ARGUMENT;
4076 }
4077 if (named_entry->size < (offset + initial_size)) {
4078 return KERN_INVALID_ARGUMENT;
4079 }
4080
4081 if (named_entry->is_copy) {
4082 /* for a vm_map_copy, we can only map it whole */
4083 if ((size != named_entry->size) &&
4084 (vm_map_round_page(size,
4085 VM_MAP_PAGE_MASK(target_map)) ==
4086 named_entry->size)) {
4087 /* XXX FBDP use the rounded size... */
4088 size = vm_map_round_page(
4089 size,
4090 VM_MAP_PAGE_MASK(target_map));
4091 }
4092 }
4093
4094 /* the callers parameter offset is defined to be the */
4095 /* offset from beginning of named entry offset in object */
4096 offset = offset + named_entry->offset;
4097
4098 if (!VM_MAP_PAGE_ALIGNED(size,
4099 VM_MAP_PAGE_MASK(target_map))) {
4100 /*
4101 * Let's not map more than requested;
4102 * vm_map_enter() will handle this "not map-aligned"
4103 * case.
4104 */
4105 map_size = size;
4106 }
4107
4108 named_entry_lock(named_entry);
4109 if (named_entry->is_sub_map) {
4110 vm_map_t submap;
4111
4112 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4113 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4114 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4115 }
4116
4117 submap = named_entry->backing.map;
4118 vm_map_reference(submap);
4119 named_entry_unlock(named_entry);
4120
4121 vmk_flags.vmkf_submap = TRUE;
4122
4123 result = vm_map_enter(target_map,
4124 &map_addr,
4125 map_size,
4126 mask,
4127 flags,
4128 vmk_flags,
4129 tag,
4130 (vm_object_t)(uintptr_t) submap,
4131 offset,
4132 copy,
4133 cur_protection,
4134 max_protection,
4135 inheritance);
4136 if (result != KERN_SUCCESS) {
4137 vm_map_deallocate(submap);
4138 } else {
4139 /*
4140 * No need to lock "submap" just to check its
4141 * "mapped" flag: that flag is never reset
4142 * once it's been set and if we race, we'll
4143 * just end up setting it twice, which is OK.
4144 */
4145 if (submap->mapped_in_other_pmaps == FALSE &&
4146 vm_map_pmap(submap) != PMAP_NULL &&
4147 vm_map_pmap(submap) !=
4148 vm_map_pmap(target_map)) {
4149 /*
4150 * This submap is being mapped in a map
4151 * that uses a different pmap.
4152 * Set its "mapped_in_other_pmaps" flag
4153 * to indicate that we now need to
4154 * remove mappings from all pmaps rather
4155 * than just the submap's pmap.
4156 */
4157 vm_map_lock(submap);
4158 submap->mapped_in_other_pmaps = TRUE;
4159 vm_map_unlock(submap);
4160 }
4161 *address = map_addr;
4162 }
4163 return result;
4164 } else if (named_entry->is_copy) {
4165 kern_return_t kr;
4166 vm_map_copy_t copy_map;
4167 vm_map_entry_t copy_entry;
4168 vm_map_offset_t copy_addr;
4169 vm_map_copy_t target_copy_map;
4170 vm_map_offset_t overmap_start, overmap_end;
4171 vm_map_offset_t trimmed_start;
4172 vm_map_size_t target_size;
4173
4174 if (flags & ~(VM_FLAGS_FIXED |
4175 VM_FLAGS_ANYWHERE |
4176 VM_FLAGS_OVERWRITE |
4177 VM_FLAGS_RETURN_4K_DATA_ADDR |
4178 VM_FLAGS_RETURN_DATA_ADDR |
4179 VM_FLAGS_ALIAS_MASK)) {
4180 named_entry_unlock(named_entry);
4181 return KERN_INVALID_ARGUMENT;
4182 }
4183
4184 copy_map = named_entry->backing.copy;
4185 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4186 if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4187 /* unsupported type; should not happen */
4188 printf("vm_map_enter_mem_object: "
4189 "memory_entry->backing.copy "
4190 "unsupported type 0x%x\n",
4191 copy_map->type);
4192 named_entry_unlock(named_entry);
4193 return KERN_INVALID_ARGUMENT;
4194 }
4195
4196 if (VM_MAP_PAGE_SHIFT(target_map) != copy_map->cpy_hdr.page_shift) {
4197 DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4198 }
4199
4200 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4201 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4202 offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4203 if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) {
4204 offset_in_mapping &= ~((signed)(0xFFF));
4205 }
4206 }
4207
4208 target_copy_map = VM_MAP_COPY_NULL;
4209 target_size = copy_map->size;
4210 overmap_start = 0;
4211 overmap_end = 0;
4212 trimmed_start = 0;
4213 if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(target_map)) {
4214 DEBUG4K_ADJUST("adjusting...\n");
4215 kr = vm_map_copy_adjust_to_target(
4216 copy_map,
4217 offset /* includes data_offset */,
4218 initial_size,
4219 target_map,
4220 copy,
4221 &target_copy_map,
4222 &overmap_start,
4223 &overmap_end,
4224 &trimmed_start);
4225 if (kr != KERN_SUCCESS) {
4226 named_entry_unlock(named_entry);
4227 return kr;
4228 }
4229 target_size = target_copy_map->size;
4230 if (trimmed_start >= data_offset) {
4231 data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4232 } else {
4233 data_offset -= trimmed_start;
4234 }
4235 } else {
4236 /*
4237 * Assert that the vm_map_copy is coming from the right
4238 * zone and hasn't been forged
4239 */
4240 vm_map_copy_require(copy_map);
4241 target_copy_map = copy_map;
4242 }
4243
4244 /* reserve a contiguous range */
4245 kr = vm_map_enter(target_map,
4246 &map_addr,
4247 vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4248 mask,
4249 flags & (VM_FLAGS_ANYWHERE |
4250 VM_FLAGS_OVERWRITE |
4251 VM_FLAGS_RETURN_4K_DATA_ADDR |
4252 VM_FLAGS_RETURN_DATA_ADDR),
4253 vmk_flags,
4254 tag,
4255 VM_OBJECT_NULL,
4256 0,
4257 FALSE, /* copy */
4258 cur_protection,
4259 max_protection,
4260 inheritance);
4261 if (kr != KERN_SUCCESS) {
4262 DEBUG4K_ERROR("kr 0x%x\n", kr);
4263 if (target_copy_map != copy_map) {
4264 vm_map_copy_discard(target_copy_map);
4265 target_copy_map = VM_MAP_COPY_NULL;
4266 }
4267 named_entry_unlock(named_entry);
4268 return kr;
4269 }
4270
4271 copy_addr = map_addr;
4272
4273 for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4274 copy_entry != vm_map_copy_to_entry(target_copy_map);
4275 copy_entry = copy_entry->vme_next) {
4276 int remap_flags;
4277 vm_map_kernel_flags_t vmk_remap_flags;
4278 vm_map_t copy_submap;
4279 vm_object_t copy_object;
4280 vm_map_size_t copy_size;
4281 vm_object_offset_t copy_offset;
4282 int copy_vm_alias;
4283 boolean_t do_copy;
4284
4285 do_copy = FALSE;
4286 remap_flags = 0;
4287 vmk_remap_flags = VM_MAP_KERNEL_FLAGS_NONE;
4288
4289 copy_object = VME_OBJECT(copy_entry);
4290 copy_offset = VME_OFFSET(copy_entry);
4291 copy_size = (copy_entry->vme_end -
4292 copy_entry->vme_start);
4293 VM_GET_FLAGS_ALIAS(flags, copy_vm_alias);
4294 if (copy_vm_alias == 0) {
4295 /*
4296 * Caller does not want a specific
4297 * alias for this new mapping: use
4298 * the alias of the original mapping.
4299 */
4300 copy_vm_alias = VME_ALIAS(copy_entry);
4301 }
4302
4303 /* sanity check */
4304 if ((copy_addr + copy_size) >
4305 (map_addr +
4306 overmap_start + overmap_end +
4307 named_entry->size /* XXX full size */)) {
4308 /* over-mapping too much !? */
4309 kr = KERN_INVALID_ARGUMENT;
4310 DEBUG4K_ERROR("kr 0x%x\n", kr);
4311 /* abort */
4312 break;
4313 }
4314
4315 /* take a reference on the object */
4316 if (copy_entry->is_sub_map) {
4317 vmk_remap_flags.vmkf_submap = TRUE;
4318 copy_submap = VME_SUBMAP(copy_entry);
4319 vm_map_lock(copy_submap);
4320 vm_map_reference(copy_submap);
4321 vm_map_unlock(copy_submap);
4322 copy_object = (vm_object_t)(uintptr_t) copy_submap;
4323 } else {
4324 if (!copy &&
4325 copy_object != VM_OBJECT_NULL &&
4326 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4327 /*
4328 * We need to resolve our side of this
4329 * "symmetric" copy-on-write now; we
4330 * need a new object to map and share,
4331 * instead of the current one which
4332 * might still be shared with the
4333 * original mapping.
4334 *
4335 * Note: A "vm_map_copy_t" does not
4336 * have a lock but we're protected by
4337 * the named entry's lock here.
4338 */
4339 // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4340 VME_OBJECT_SHADOW(copy_entry, copy_size);
4341 assert(copy_object != VME_OBJECT(copy_entry));
4342 if (!copy_entry->needs_copy &&
4343 copy_entry->protection & VM_PROT_WRITE) {
4344 vm_prot_t prot;
4345
4346 prot = copy_entry->protection & ~VM_PROT_WRITE;
4347 vm_object_pmap_protect(copy_object,
4348 copy_offset,
4349 copy_size,
4350 PMAP_NULL,
4351 PAGE_SIZE,
4352 0,
4353 prot);
4354 }
4355
4356 copy_entry->needs_copy = FALSE;
4357 copy_entry->is_shared = TRUE;
4358 copy_object = VME_OBJECT(copy_entry);
4359 copy_offset = VME_OFFSET(copy_entry);
4360 vm_object_lock(copy_object);
4361 /* we're about to make a shared mapping of this object */
4362 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4363 copy_object->true_share = TRUE;
4364 vm_object_unlock(copy_object);
4365 }
4366
4367 if (copy_object != VM_OBJECT_NULL &&
4368 copy_object->named &&
4369 copy_object->pager != MEMORY_OBJECT_NULL &&
4370 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4371 memory_object_t pager;
4372 vm_prot_t pager_prot;
4373
4374 /*
4375 * For "named" VM objects, let the pager know that the
4376 * memory object is being mapped. Some pagers need to keep
4377 * track of this, to know when they can reclaim the memory
4378 * object, for example.
4379 * VM calls memory_object_map() for each mapping (specifying
4380 * the protection of each mapping) and calls
4381 * memory_object_last_unmap() when all the mappings are gone.
4382 */
4383 pager_prot = max_protection;
4384 if (copy) {
4385 /*
4386 * Copy-On-Write mapping: won't modify the
4387 * memory object.
4388 */
4389 pager_prot &= ~VM_PROT_WRITE;
4390 }
4391 vm_object_lock(copy_object);
4392 pager = copy_object->pager;
4393 if (copy_object->named &&
4394 pager != MEMORY_OBJECT_NULL &&
4395 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4396 assert(copy_object->pager_ready);
4397 vm_object_mapping_wait(copy_object, THREAD_UNINT);
4398 vm_object_mapping_begin(copy_object);
4399 vm_object_unlock(copy_object);
4400
4401 kr = memory_object_map(pager, pager_prot);
4402 assert(kr == KERN_SUCCESS);
4403
4404 vm_object_lock(copy_object);
4405 vm_object_mapping_end(copy_object);
4406 }
4407 vm_object_unlock(copy_object);
4408 }
4409
4410 /*
4411 * Perform the copy if requested
4412 */
4413
4414 if (copy && copy_object != VM_OBJECT_NULL) {
4415 vm_object_t new_object;
4416 vm_object_offset_t new_offset;
4417
4418 result = vm_object_copy_strategically(copy_object, copy_offset,
4419 copy_size,
4420 &new_object, &new_offset,
4421 &do_copy);
4422
4423
4424 if (result == KERN_MEMORY_RESTART_COPY) {
4425 boolean_t success;
4426 boolean_t src_needs_copy;
4427
4428 /*
4429 * XXX
4430 * We currently ignore src_needs_copy.
4431 * This really is the issue of how to make
4432 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4433 * non-kernel users to use. Solution forthcoming.
4434 * In the meantime, since we don't allow non-kernel
4435 * memory managers to specify symmetric copy,
4436 * we won't run into problems here.
4437 */
4438 new_object = copy_object;
4439 new_offset = copy_offset;
4440 success = vm_object_copy_quickly(new_object,
4441 new_offset,
4442 copy_size,
4443 &src_needs_copy,
4444 &do_copy);
4445 assert(success);
4446 result = KERN_SUCCESS;
4447 }
4448 if (result != KERN_SUCCESS) {
4449 kr = result;
4450 break;
4451 }
4452
4453 copy_object = new_object;
4454 copy_offset = new_offset;
4455 /*
4456 * No extra object reference for the mapping:
4457 * the mapping should be the only thing keeping
4458 * this new object alive.
4459 */
4460 } else {
4461 /*
4462 * We already have the right object
4463 * to map.
4464 */
4465 copy_object = VME_OBJECT(copy_entry);
4466 /* take an extra ref for the mapping below */
4467 vm_object_reference(copy_object);
4468 }
4469 }
4470
4471 /* over-map the object into destination */
4472 remap_flags |= flags;
4473 remap_flags |= VM_FLAGS_FIXED;
4474 remap_flags |= VM_FLAGS_OVERWRITE;
4475 remap_flags &= ~VM_FLAGS_ANYWHERE;
4476 if (!copy && !copy_entry->is_sub_map) {
4477 /*
4478 * copy-on-write should have been
4479 * resolved at this point, or we would
4480 * end up sharing instead of copying.
4481 */
4482 assert(!copy_entry->needs_copy);
4483 }
4484 #if XNU_TARGET_OS_OSX
4485 if (copy_entry->used_for_jit) {
4486 vmk_remap_flags.vmkf_map_jit = TRUE;
4487 }
4488 #endif /* XNU_TARGET_OS_OSX */
4489
4490 assertf((copy_vm_alias & VME_ALIAS_MASK) == copy_vm_alias,
4491 "VM Tag truncated from 0x%x to 0x%x\n", copy_vm_alias, (copy_vm_alias & VME_ALIAS_MASK));
4492 kr = vm_map_enter(target_map,
4493 ©_addr,
4494 copy_size,
4495 (vm_map_offset_t) 0,
4496 remap_flags,
4497 vmk_remap_flags,
4498 (vm_tag_t) copy_vm_alias, /* see comment at end of vm_fault_unwire re. cast*/
4499 copy_object,
4500 copy_offset,
4501 ((copy_object == NULL)
4502 ? FALSE
4503 : (copy || copy_entry->needs_copy)),
4504 cur_protection,
4505 max_protection,
4506 inheritance);
4507 if (kr != KERN_SUCCESS) {
4508 DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4509 if (copy_entry->is_sub_map) {
4510 vm_map_deallocate(copy_submap);
4511 } else {
4512 vm_object_deallocate(copy_object);
4513 }
4514 /* abort */
4515 break;
4516 }
4517
4518 /* next mapping */
4519 copy_addr += copy_size;
4520 }
4521
4522 if (kr == KERN_SUCCESS) {
4523 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4524 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4525 *address = map_addr + offset_in_mapping;
4526 } else {
4527 *address = map_addr;
4528 }
4529 if (overmap_start) {
4530 *address += overmap_start;
4531 DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4532 }
4533 }
4534 named_entry_unlock(named_entry);
4535 if (target_copy_map != copy_map) {
4536 vm_map_copy_discard(target_copy_map);
4537 target_copy_map = VM_MAP_COPY_NULL;
4538 }
4539
4540 if (kr != KERN_SUCCESS) {
4541 if (!(flags & VM_FLAGS_OVERWRITE)) {
4542 /* deallocate the contiguous range */
4543 (void) vm_deallocate(target_map,
4544 map_addr,
4545 map_size);
4546 }
4547 }
4548
4549 return kr;
4550 }
4551
4552 if (named_entry->is_object) {
4553 unsigned int access;
4554 vm_prot_t protections;
4555 unsigned int wimg_mode;
4556
4557 /* we are mapping a VM object */
4558
4559 protections = named_entry->protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
4560 access = GET_MAP_MEM(named_entry->protection);
4561
4562 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4563 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4564 offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4565 if (flags & VM_FLAGS_RETURN_4K_DATA_ADDR) {
4566 offset_in_mapping &= ~((signed)(0xFFF));
4567 }
4568 offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4569 map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4570 }
4571
4572 object = vm_named_entry_to_vm_object(named_entry);
4573 assert(object != VM_OBJECT_NULL);
4574 vm_object_lock(object);
4575 named_entry_unlock(named_entry);
4576
4577 vm_object_reference_locked(object);
4578
4579 wimg_mode = object->wimg_bits;
4580 vm_prot_to_wimg(access, &wimg_mode);
4581 if (object->wimg_bits != wimg_mode) {
4582 vm_object_change_wimg_mode(object, wimg_mode);
4583 }
4584
4585 vm_object_unlock(object);
4586 } else {
4587 panic("invalid VM named entry %p", named_entry);
4588 }
4589 } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4590 /*
4591 * JMM - This is temporary until we unify named entries
4592 * and raw memory objects.
4593 *
4594 * Detected fake ip_kotype for a memory object. In
4595 * this case, the port isn't really a port at all, but
4596 * instead is just a raw memory object.
4597 */
4598 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4599 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4600 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4601 }
4602
4603 object = memory_object_to_vm_object((memory_object_t)port);
4604 if (object == VM_OBJECT_NULL) {
4605 return KERN_INVALID_OBJECT;
4606 }
4607 vm_object_reference(object);
4608
4609 /* wait for object (if any) to be ready */
4610 if (object != VM_OBJECT_NULL) {
4611 if (object == kernel_object) {
4612 printf("Warning: Attempt to map kernel object"
4613 " by a non-private kernel entity\n");
4614 return KERN_INVALID_OBJECT;
4615 }
4616 if (!object->pager_ready) {
4617 vm_object_lock(object);
4618
4619 while (!object->pager_ready) {
4620 vm_object_wait(object,
4621 VM_OBJECT_EVENT_PAGER_READY,
4622 THREAD_UNINT);
4623 vm_object_lock(object);
4624 }
4625 vm_object_unlock(object);
4626 }
4627 }
4628 } else {
4629 return KERN_INVALID_OBJECT;
4630 }
4631
4632 if (object != VM_OBJECT_NULL &&
4633 object->named &&
4634 object->pager != MEMORY_OBJECT_NULL &&
4635 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4636 memory_object_t pager;
4637 vm_prot_t pager_prot;
4638 kern_return_t kr;
4639
4640 /*
4641 * For "named" VM objects, let the pager know that the
4642 * memory object is being mapped. Some pagers need to keep
4643 * track of this, to know when they can reclaim the memory
4644 * object, for example.
4645 * VM calls memory_object_map() for each mapping (specifying
4646 * the protection of each mapping) and calls
4647 * memory_object_last_unmap() when all the mappings are gone.
4648 */
4649 pager_prot = max_protection;
4650 if (copy) {
4651 /*
4652 * Copy-On-Write mapping: won't modify the
4653 * memory object.
4654 */
4655 pager_prot &= ~VM_PROT_WRITE;
4656 }
4657 vm_object_lock(object);
4658 pager = object->pager;
4659 if (object->named &&
4660 pager != MEMORY_OBJECT_NULL &&
4661 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4662 assert(object->pager_ready);
4663 vm_object_mapping_wait(object, THREAD_UNINT);
4664 vm_object_mapping_begin(object);
4665 vm_object_unlock(object);
4666
4667 kr = memory_object_map(pager, pager_prot);
4668 assert(kr == KERN_SUCCESS);
4669
4670 vm_object_lock(object);
4671 vm_object_mapping_end(object);
4672 }
4673 vm_object_unlock(object);
4674 }
4675
4676 /*
4677 * Perform the copy if requested
4678 */
4679
4680 if (copy) {
4681 vm_object_t new_object;
4682 vm_object_offset_t new_offset;
4683
4684 result = vm_object_copy_strategically(object, offset,
4685 map_size,
4686 &new_object, &new_offset,
4687 ©);
4688
4689
4690 if (result == KERN_MEMORY_RESTART_COPY) {
4691 boolean_t success;
4692 boolean_t src_needs_copy;
4693
4694 /*
4695 * XXX
4696 * We currently ignore src_needs_copy.
4697 * This really is the issue of how to make
4698 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4699 * non-kernel users to use. Solution forthcoming.
4700 * In the meantime, since we don't allow non-kernel
4701 * memory managers to specify symmetric copy,
4702 * we won't run into problems here.
4703 */
4704 new_object = object;
4705 new_offset = offset;
4706 success = vm_object_copy_quickly(new_object,
4707 new_offset,
4708 map_size,
4709 &src_needs_copy,
4710 ©);
4711 assert(success);
4712 result = KERN_SUCCESS;
4713 }
4714 /*
4715 * Throw away the reference to the
4716 * original object, as it won't be mapped.
4717 */
4718
4719 vm_object_deallocate(object);
4720
4721 if (result != KERN_SUCCESS) {
4722 return result;
4723 }
4724
4725 object = new_object;
4726 offset = new_offset;
4727 }
4728
4729 /*
4730 * If non-kernel users want to try to prefault pages, the mapping and prefault
4731 * needs to be atomic.
4732 */
4733 kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(target_map));
4734 vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4735
4736 #if __arm64__
4737 if (fourk) {
4738 /* map this object in a "4K" pager */
4739 result = vm_map_enter_fourk(target_map,
4740 &map_addr,
4741 map_size,
4742 (vm_map_offset_t) mask,
4743 flags,
4744 vmk_flags,
4745 tag,
4746 object,
4747 offset,
4748 copy,
4749 cur_protection,
4750 max_protection,
4751 inheritance);
4752 } else
4753 #endif /* __arm64__ */
4754 {
4755 result = vm_map_enter(target_map,
4756 &map_addr, map_size,
4757 (vm_map_offset_t)mask,
4758 flags,
4759 vmk_flags,
4760 tag,
4761 object, offset,
4762 copy,
4763 cur_protection, max_protection,
4764 inheritance);
4765 }
4766 if (result != KERN_SUCCESS) {
4767 vm_object_deallocate(object);
4768 }
4769
4770 /*
4771 * Try to prefault, and do not forget to release the vm map lock.
4772 */
4773 if (result == KERN_SUCCESS && try_prefault) {
4774 mach_vm_address_t va = map_addr;
4775 kern_return_t kr = KERN_SUCCESS;
4776 unsigned int i = 0;
4777 int pmap_options;
4778
4779 pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4780 if (object->internal) {
4781 pmap_options |= PMAP_OPTIONS_INTERNAL;
4782 }
4783
4784 for (i = 0; i < page_list_count; ++i) {
4785 if (!UPL_VALID_PAGE(page_list, i)) {
4786 if (kernel_prefault) {
4787 assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4788 result = KERN_MEMORY_ERROR;
4789 break;
4790 }
4791 } else {
4792 /*
4793 * If this function call failed, we should stop
4794 * trying to optimize, other calls are likely
4795 * going to fail too.
4796 *
4797 * We are not gonna report an error for such
4798 * failure though. That's an optimization, not
4799 * something critical.
4800 */
4801 kr = pmap_enter_options(target_map->pmap,
4802 va, UPL_PHYS_PAGE(page_list, i),
4803 cur_protection, VM_PROT_NONE,
4804 0, TRUE, pmap_options, NULL);
4805 if (kr != KERN_SUCCESS) {
4806 OSIncrementAtomic64(&vm_prefault_nb_bailout);
4807 if (kernel_prefault) {
4808 result = kr;
4809 }
4810 break;
4811 }
4812 OSIncrementAtomic64(&vm_prefault_nb_pages);
4813 }
4814
4815 /* Next virtual address */
4816 va += PAGE_SIZE;
4817 }
4818 if (vmk_flags.vmkf_keep_map_locked) {
4819 vm_map_unlock(target_map);
4820 }
4821 }
4822
4823 if (flags & (VM_FLAGS_RETURN_DATA_ADDR |
4824 VM_FLAGS_RETURN_4K_DATA_ADDR)) {
4825 *address = map_addr + offset_in_mapping;
4826 } else {
4827 *address = map_addr;
4828 }
4829 return result;
4830 }
4831
4832 kern_return_t
vm_map_enter_mem_object(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,ipc_port_t port,vm_object_offset_t offset,boolean_t copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance)4833 vm_map_enter_mem_object(
4834 vm_map_t target_map,
4835 vm_map_offset_t *address,
4836 vm_map_size_t initial_size,
4837 vm_map_offset_t mask,
4838 int flags,
4839 vm_map_kernel_flags_t vmk_flags,
4840 vm_tag_t tag,
4841 ipc_port_t port,
4842 vm_object_offset_t offset,
4843 boolean_t copy,
4844 vm_prot_t cur_protection,
4845 vm_prot_t max_protection,
4846 vm_inherit_t inheritance)
4847 {
4848 kern_return_t ret;
4849
4850 ret = vm_map_enter_mem_object_helper(target_map,
4851 address,
4852 initial_size,
4853 mask,
4854 flags,
4855 vmk_flags,
4856 tag,
4857 port,
4858 offset,
4859 copy,
4860 cur_protection,
4861 max_protection,
4862 inheritance,
4863 NULL,
4864 0);
4865
4866 #if KASAN
4867 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4868 kasan_notify_address(*address, initial_size);
4869 }
4870 #endif
4871
4872 return ret;
4873 }
4874
4875 kern_return_t
vm_map_enter_mem_object_prefault(vm_map_t target_map,vm_map_offset_t * address,vm_map_size_t initial_size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,ipc_port_t port,vm_object_offset_t offset,vm_prot_t cur_protection,vm_prot_t max_protection,upl_page_list_ptr_t page_list,unsigned int page_list_count)4876 vm_map_enter_mem_object_prefault(
4877 vm_map_t target_map,
4878 vm_map_offset_t *address,
4879 vm_map_size_t initial_size,
4880 vm_map_offset_t mask,
4881 int flags,
4882 vm_map_kernel_flags_t vmk_flags,
4883 vm_tag_t tag,
4884 ipc_port_t port,
4885 vm_object_offset_t offset,
4886 vm_prot_t cur_protection,
4887 vm_prot_t max_protection,
4888 upl_page_list_ptr_t page_list,
4889 unsigned int page_list_count)
4890 {
4891 kern_return_t ret;
4892
4893 ret = vm_map_enter_mem_object_helper(target_map,
4894 address,
4895 initial_size,
4896 mask,
4897 flags,
4898 vmk_flags,
4899 tag,
4900 port,
4901 offset,
4902 FALSE,
4903 cur_protection,
4904 max_protection,
4905 VM_INHERIT_DEFAULT,
4906 page_list,
4907 page_list_count);
4908
4909 #if KASAN
4910 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
4911 kasan_notify_address(*address, initial_size);
4912 }
4913 #endif
4914
4915 return ret;
4916 }
4917
4918
/*
 * vm_map_enter_mem_object_control:
 *
 * Map the VM object backing the given memory object "control" into
 * "target_map".  Similar to vm_map_enter_mem_object(), but the memory
 * object is designated by a memory_object_control_t rather than an
 * ipc_port_t.
 *
 * *address is in/out: the requested address (for VM_FLAGS_FIXED) on the
 * way in, the actual mapping address on success.  If "copy" is TRUE,
 * the mapping is backed by a copy of the object's contents rather than
 * the object itself.
 *
 * Returns KERN_SUCCESS or a Mach error code; on failure the object
 * reference taken here is dropped before returning.
 */
kern_return_t
vm_map_enter_mem_object_control(
	vm_map_t                target_map,
	vm_map_offset_t         *address,
	vm_map_size_t           initial_size,
	vm_map_offset_t         mask,
	int                     flags,
	vm_map_kernel_flags_t   vmk_flags,
	vm_tag_t                tag,
	memory_object_control_t control,
	vm_object_offset_t      offset,
	boolean_t               copy,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_address_t        map_addr;
	vm_map_size_t           map_size;
	vm_object_t             object;
	vm_object_size_t        size;
	kern_return_t           result;
	memory_object_t         pager;
	vm_prot_t               pager_prot;
	kern_return_t           kr;
#if __arm64__
	boolean_t               fourk = vmk_flags.vmkf_fourk;
#endif /* __arm64__ */

	/*
	 * Check arguments for validity
	 */
	if ((target_map == VM_MAP_NULL) ||
	    (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
	    (inheritance > VM_INHERIT_LAST_VALID) ||
	    initial_size == 0) {
		return KERN_INVALID_ARGUMENT;
	}

#if __arm64__
	/* no "fourk" mapping if the target map uses a sub-page page size */
	if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
		fourk = FALSE;
	}

	if (fourk) {
		/* align the requested range to 4K boundaries */
		map_addr = vm_map_trunc_page(*address,
		    FOURK_PAGE_MASK);
		map_size = vm_map_round_page(initial_size,
		    FOURK_PAGE_MASK);
	} else
#endif /* __arm64__ */
	{
		/* align the requested range to the target map's page size */
		map_addr = vm_map_trunc_page(*address,
		    VM_MAP_PAGE_MASK(target_map));
		map_size = vm_map_round_page(initial_size,
		    VM_MAP_PAGE_MASK(target_map));
	}
	size = vm_object_round_page(initial_size);

	object = memory_object_control_to_vm_object(control);

	if (object == VM_OBJECT_NULL) {
		return KERN_INVALID_OBJECT;
	}

	if (object == kernel_object) {
		printf("Warning: Attempt to map kernel object"
		    " by a non-private kernel entity\n");
		return KERN_INVALID_OBJECT;
	}

	vm_object_lock(object);
	/*
	 * Take a reference for this mapping; bare increment is done while
	 * holding the object lock.  Dropped via vm_object_deallocate() if
	 * the mapping attempt fails below.
	 */
	object->ref_count++;

	/*
	 * For "named" VM objects, let the pager know that the
	 * memory object is being mapped. Some pagers need to keep
	 * track of this, to know when they can reclaim the memory
	 * object, for example.
	 * VM calls memory_object_map() for each mapping (specifying
	 * the protection of each mapping) and calls
	 * memory_object_last_unmap() when all the mappings are gone.
	 */
	pager_prot = max_protection;
	if (copy) {
		/* copy-on-write mapping won't modify the memory object */
		pager_prot &= ~VM_PROT_WRITE;
	}
	pager = object->pager;
	if (object->named &&
	    pager != MEMORY_OBJECT_NULL &&
	    object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
		assert(object->pager_ready);
		/* serialize with other mapping/unmapping notifications */
		vm_object_mapping_wait(object, THREAD_UNINT);
		vm_object_mapping_begin(object);
		vm_object_unlock(object);

		/* notify the pager outside the object lock */
		kr = memory_object_map(pager, pager_prot);
		assert(kr == KERN_SUCCESS);

		vm_object_lock(object);
		vm_object_mapping_end(object);
	}
	vm_object_unlock(object);

	/*
	 * Perform the copy if requested
	 */

	if (copy) {
		vm_object_t new_object;
		vm_object_offset_t new_offset;

		result = vm_object_copy_strategically(object, offset, size,
		    &new_object, &new_offset,
		    &copy);


		if (result == KERN_MEMORY_RESTART_COPY) {
			boolean_t success;
			boolean_t src_needs_copy;

			/*
			 * XXX
			 * We currently ignore src_needs_copy.
			 * This really is the issue of how to make
			 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
			 * non-kernel users to use. Solution forthcoming.
			 * In the meantime, since we don't allow non-kernel
			 * memory managers to specify symmetric copy,
			 * we won't run into problems here.
			 */
			new_object = object;
			new_offset = offset;
			success = vm_object_copy_quickly(new_object,
			    new_offset, size,
			    &src_needs_copy,
			    &copy);
			assert(success);
			result = KERN_SUCCESS;
		}
		/*
		 * Throw away the reference to the
		 * original object, as it won't be mapped.
		 */

		vm_object_deallocate(object);

		if (result != KERN_SUCCESS) {
			return result;
		}

		/* map the freshly made copy instead of the original */
		object = new_object;
		offset = new_offset;
	}

#if __arm64__
	if (fourk) {
		/* map this object through the "4K" adapter pager */
		result = vm_map_enter_fourk(target_map,
		    &map_addr,
		    map_size,
		    (vm_map_offset_t)mask,
		    flags,
		    vmk_flags,
		    tag,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	} else
#endif /* __arm64__ */
	{
		result = vm_map_enter(target_map,
		    &map_addr, map_size,
		    (vm_map_offset_t)mask,
		    flags,
		    vmk_flags,
		    tag,
		    object, offset,
		    copy,
		    cur_protection, max_protection,
		    inheritance);
	}
	if (result != KERN_SUCCESS) {
		/* drop the mapping reference taken above */
		vm_object_deallocate(object);
	}
	/*
	 * NOTE(review): *address is updated even on failure, mirroring the
	 * pre-existing behavior; callers appear to check the return code
	 * first — confirm before changing.
	 */
	*address = map_addr;

	return result;
}
5108
5109
5110 #if VM_CPM
5111
5112 #ifdef MACH_ASSERT
5113 extern pmap_paddr_t avail_start, avail_end;
5114 #endif
5115
5116 /*
5117 * Allocate memory in the specified map, with the caveat that
5118 * the memory is physically contiguous. This call may fail
5119 * if the system can't find sufficient contiguous memory.
5120 * This call may cause or lead to heart-stopping amounts of
5121 * paging activity.
5122 *
5123 * Memory obtained from this call should be freed in the
5124 * normal way, viz., via vm_deallocate.
5125 */
5126 kern_return_t
vm_map_enter_cpm(vm_map_t map,vm_map_offset_t * addr,vm_map_size_t size,int flags)5127 vm_map_enter_cpm(
5128 vm_map_t map,
5129 vm_map_offset_t *addr,
5130 vm_map_size_t size,
5131 int flags)
5132 {
5133 vm_object_t cpm_obj;
5134 pmap_t pmap;
5135 vm_page_t m, pages;
5136 kern_return_t kr;
5137 vm_map_offset_t va, start, end, offset;
5138 #if MACH_ASSERT
5139 vm_map_offset_t prev_addr = 0;
5140 #endif /* MACH_ASSERT */
5141
5142 boolean_t anywhere = ((VM_FLAGS_ANYWHERE & flags) != 0);
5143 vm_tag_t tag;
5144
5145 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5146 /* XXX TODO4K do we need to support this? */
5147 *addr = 0;
5148 return KERN_NOT_SUPPORTED;
5149 }
5150
5151 VM_GET_FLAGS_ALIAS(flags, tag);
5152
5153 if (size == 0) {
5154 *addr = 0;
5155 return KERN_SUCCESS;
5156 }
5157 if (anywhere) {
5158 *addr = vm_map_min(map);
5159 } else {
5160 *addr = vm_map_trunc_page(*addr,
5161 VM_MAP_PAGE_MASK(map));
5162 }
5163 size = vm_map_round_page(size,
5164 VM_MAP_PAGE_MASK(map));
5165
5166 /*
5167 * LP64todo - cpm_allocate should probably allow
5168 * allocations of >4GB, but not with the current
5169 * algorithm, so just cast down the size for now.
5170 */
5171 if (size > VM_MAX_ADDRESS) {
5172 return KERN_RESOURCE_SHORTAGE;
5173 }
5174 if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5175 &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5176 return kr;
5177 }
5178
5179 cpm_obj = vm_object_allocate((vm_object_size_t)size);
5180 assert(cpm_obj != VM_OBJECT_NULL);
5181 assert(cpm_obj->internal);
5182 assert(cpm_obj->vo_size == (vm_object_size_t)size);
5183 assert(cpm_obj->can_persist == FALSE);
5184 assert(cpm_obj->pager_created == FALSE);
5185 assert(cpm_obj->pageout == FALSE);
5186 assert(cpm_obj->shadow == VM_OBJECT_NULL);
5187
5188 /*
5189 * Insert pages into object.
5190 */
5191
5192 vm_object_lock(cpm_obj);
5193 for (offset = 0; offset < size; offset += PAGE_SIZE) {
5194 m = pages;
5195 pages = NEXT_PAGE(m);
5196 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5197
5198 assert(!m->vmp_gobbled);
5199 assert(!m->vmp_wanted);
5200 assert(!m->vmp_pageout);
5201 assert(!m->vmp_tabled);
5202 assert(VM_PAGE_WIRED(m));
5203 assert(m->vmp_busy);
5204 assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5205
5206 m->vmp_busy = FALSE;
5207 vm_page_insert(m, cpm_obj, offset);
5208 }
5209 assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5210 vm_object_unlock(cpm_obj);
5211
5212 /*
5213 * Hang onto a reference on the object in case a
5214 * multi-threaded application for some reason decides
5215 * to deallocate the portion of the address space into
5216 * which we will insert this object.
5217 *
5218 * Unfortunately, we must insert the object now before
5219 * we can talk to the pmap module about which addresses
5220 * must be wired down. Hence, the race with a multi-
5221 * threaded app.
5222 */
5223 vm_object_reference(cpm_obj);
5224
5225 /*
5226 * Insert object into map.
5227 */
5228
5229 kr = vm_map_enter(
5230 map,
5231 addr,
5232 size,
5233 (vm_map_offset_t)0,
5234 flags,
5235 VM_MAP_KERNEL_FLAGS_NONE,
5236 cpm_obj,
5237 (vm_object_offset_t)0,
5238 FALSE,
5239 VM_PROT_ALL,
5240 VM_PROT_ALL,
5241 VM_INHERIT_DEFAULT);
5242
5243 if (kr != KERN_SUCCESS) {
5244 /*
5245 * A CPM object doesn't have can_persist set,
5246 * so all we have to do is deallocate it to
5247 * free up these pages.
5248 */
5249 assert(cpm_obj->pager_created == FALSE);
5250 assert(cpm_obj->can_persist == FALSE);
5251 assert(cpm_obj->pageout == FALSE);
5252 assert(cpm_obj->shadow == VM_OBJECT_NULL);
5253 vm_object_deallocate(cpm_obj); /* kill acquired ref */
5254 vm_object_deallocate(cpm_obj); /* kill creation ref */
5255 }
5256
5257 /*
5258 * Inform the physical mapping system that the
5259 * range of addresses may not fault, so that
5260 * page tables and such can be locked down as well.
5261 */
5262 start = *addr;
5263 end = start + size;
5264 pmap = vm_map_pmap(map);
5265 pmap_pageable(pmap, start, end, FALSE);
5266
5267 /*
5268 * Enter each page into the pmap, to avoid faults.
5269 * Note that this loop could be coded more efficiently,
5270 * if the need arose, rather than looking up each page
5271 * again.
5272 */
5273 for (offset = 0, va = start; offset < size;
5274 va += PAGE_SIZE, offset += PAGE_SIZE) {
5275 int type_of_fault;
5276
5277 vm_object_lock(cpm_obj);
5278 m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5279 assert(m != VM_PAGE_NULL);
5280
5281 vm_page_zero_fill(m);
5282
5283 type_of_fault = DBG_ZERO_FILL_FAULT;
5284
5285 vm_fault_enter(m, pmap, va,
5286 PAGE_SIZE, 0,
5287 VM_PROT_ALL, VM_PROT_WRITE,
5288 VM_PAGE_WIRED(m),
5289 FALSE, /* change_wiring */
5290 VM_KERN_MEMORY_NONE, /* tag - not wiring */
5291 FALSE, /* no_cache */
5292 FALSE, /* cs_bypass */
5293 0, /* user_tag */
5294 0, /* pmap_options */
5295 NULL, /* need_retry */
5296 &type_of_fault);
5297
5298 vm_object_unlock(cpm_obj);
5299 }
5300
5301 #if MACH_ASSERT
5302 /*
5303 * Verify ordering in address space.
5304 */
5305 for (offset = 0; offset < size; offset += PAGE_SIZE) {
5306 vm_object_lock(cpm_obj);
5307 m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5308 vm_object_unlock(cpm_obj);
5309 if (m == VM_PAGE_NULL) {
5310 panic("vm_allocate_cpm: obj %p off 0x%llx no page",
5311 cpm_obj, (uint64_t)offset);
5312 }
5313 assert(m->vmp_tabled);
5314 assert(!m->vmp_busy);
5315 assert(!m->vmp_wanted);
5316 assert(!m->vmp_fictitious);
5317 assert(!m->vmp_private);
5318 assert(!m->vmp_absent);
5319 assert(!m->vmp_error);
5320 assert(!m->vmp_cleaning);
5321 assert(!m->vmp_laundry);
5322 assert(!m->vmp_precious);
5323 assert(!m->vmp_clustered);
5324 if (offset != 0) {
5325 if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5326 printf("start 0x%llx end 0x%llx va 0x%llx\n",
5327 (uint64_t)start, (uint64_t)end, (uint64_t)va);
5328 printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5329 printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5330 panic("vm_allocate_cpm: pages not contig!");
5331 }
5332 }
5333 prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5334 }
5335 #endif /* MACH_ASSERT */
5336
5337 vm_object_deallocate(cpm_obj); /* kill extra ref */
5338
5339 return kr;
5340 }
5341
5342
5343 #else /* VM_CPM */
5344
5345 /*
5346 * Interface is defined in all cases, but unless the kernel
5347 * is built explicitly for this option, the interface does
5348 * nothing.
5349 */
5350
kern_return_t
vm_map_enter_cpm(
	__unused vm_map_t map,
	__unused vm_map_offset_t *addr,
	__unused vm_map_size_t size,
	__unused int flags)
{
	/*
	 * VM_CPM is not configured in this kernel: contiguous physical
	 * memory allocations are unavailable, so always report failure
	 * without touching "*addr".
	 */
	return KERN_FAILURE;
}
5360 #endif /* VM_CPM */
5361
5362 /* Not used without nested pmaps */
5363 #ifndef NO_NESTED_PMAP
5364 /*
5365 * Clip and unnest a portion of a nested submap mapping.
5366 */
5367
5368
static void
vm_map_clip_unnest(
	vm_map_t map,
	vm_map_entry_t entry,
	vm_map_offset_t start_unnest,
	vm_map_offset_t end_unnest)
{
	/* Keep the caller's range so we can report it if the platform adjusts it. */
	vm_map_offset_t old_start_unnest = start_unnest;
	vm_map_offset_t old_end_unnest = end_unnest;

	/* Only a nested (use_pmap) submap entry can be unnested. */
	assert(entry->is_sub_map);
	assert(VME_SUBMAP(entry) != NULL);
	assert(entry->use_pmap);

	/*
	 * Query the platform for the optimal unnest range.
	 * DRK: There's some duplication of effort here, since
	 * callers may have adjusted the range to some extent. This
	 * routine was introduced to support 1GiB subtree nesting
	 * for x86 platforms, which can also nest on 2MiB boundaries
	 * depending on size/alignment.
	 */
	if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
		assert(VME_SUBMAP(entry)->is_nested_map);
		assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
		log_unnest_badness(map,
		    old_start_unnest,
		    old_end_unnest,
		    VME_SUBMAP(entry)->is_nested_map,
		    (entry->vme_start +
		    VME_SUBMAP(entry)->lowest_unnestable_start -
		    VME_OFFSET(entry)));
	}

	/* The unnest range must lie entirely within this entry. */
	if (entry->vme_start > start_unnest ||
	    entry->vme_end < end_unnest) {
		panic("vm_map_clip_unnest(0x%llx,0x%llx): "
		    "bad nested entry: start=0x%llx end=0x%llx\n",
		    (long long)start_unnest, (long long)end_unnest,
		    (long long)entry->vme_start, (long long)entry->vme_end);
	}

	/*
	 * Clip the entry down to the unnest range; the map header's entry
	 * list changed, so refresh the free-space hint after each split.
	 */
	if (start_unnest > entry->vme_start) {
		_vm_map_clip_start(&map->hdr,
		    entry,
		    start_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
	if (entry->vme_end > end_unnest) {
		_vm_map_clip_end(&map->hdr,
		    entry,
		    end_unnest);
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}

	/* Detach the nested pmap for this entry's (now clipped) range. */
	pmap_unnest(map->pmap,
	    entry->vme_start,
	    entry->vme_end - entry->vme_start);
	if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(&map->map_refcnt) != 0) {
		/* clean up parent map/maps */
		vm_map_submap_pmap_clean(
			map, entry->vme_start,
			entry->vme_end,
			VME_SUBMAP(entry),
			VME_OFFSET(entry));
	}
	/* The entry no longer shares the submap's pmap. */
	entry->use_pmap = FALSE;
	if ((map->pmap != kernel_pmap) &&
	    (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
		/* An unnested shared-region entry is no longer shared. */
		VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
	}
}
5449 #endif /* NO_NESTED_PMAP */
5450
/*
 * Abort path: clipping would split a VM map entry marked atomic
 * (vme_atomic), which must never be subdivided.
 */
__abortlike
static void
__vm_map_clip_atomic_entry_panic(
	vm_map_t map,
	vm_map_entry_t entry,
	vm_map_offset_t where)
{
	panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
	    "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end,
	    (uint64_t)where);
}
5464
5465 /*
5466 * vm_map_clip_start: [ internal use only ]
5467 *
5468 * Asserts that the given entry begins at or after
5469 * the specified address; if necessary,
5470 * it splits the entry into two.
5471 */
void
vm_map_clip_start(
	vm_map_t map,
	vm_map_entry_t entry,
	vm_map_offset_t startaddr)
{
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map &&
	    entry->use_pmap &&
	    startaddr >= entry->vme_start) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure "startaddr" is no longer in a nested range
		 * before we clip. Unnest only the minimum range the platform
		 * can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (startaddr > entry->vme_start) {
		/*
		 * NOTE(review): physically-contiguous objects get all their
		 * pmap mappings removed before the split — presumably they
		 * cannot be partially unmapped and are re-entered on fault.
		 */
		if (VME_OBJECT(entry) &&
		    !entry->is_sub_map &&
		    VME_OBJECT(entry)->phys_contiguous) {
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		/* Atomic entries must never be split. */
		if (entry->vme_atomic) {
			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
		}

		DTRACE_VM5(
			vm_map_clip_start,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, startaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_start(&map->hdr, entry, startaddr);
		/* The entry list changed: refresh the free-space hint. */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5524
5525
/*
 * vm_map_copy_clip_start: [ internal use only ]
 *
 * Split "entry" at "startaddr" if the address falls strictly inside
 * the entry; no-op otherwise.  Operates on the copy's entry-list
 * header (cpy_hdr), so no pmap or unnesting work is involved.
 */
#define vm_map_copy_clip_start(copy, entry, startaddr) \
	MACRO_BEGIN \
	if ((startaddr) > (entry)->vme_start) \
	        _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
	MACRO_END
5531
5532 /*
5533 * This routine is called only when it is known that
5534 * the entry must be split.
5535 */
static void
_vm_map_clip_start(
	struct vm_map_header *map_header,
	vm_map_entry_t entry,
	vm_map_offset_t start)
{
	vm_map_entry_t new_entry;

	/*
	 * Split off the front portion --
	 * note that we must insert the new
	 * entry BEFORE this one, so that
	 * this entry has the specified starting
	 * address.
	 */

	/* Map-aligned entries may only be split on map page boundaries. */
	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(start,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* The new (front) entry is a full copy covering [vme_start, start). */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	new_entry->vme_end = start;
	assert(new_entry->vme_start < new_entry->vme_end);
	/* Advance the original entry's object offset past the split point. */
	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
	assert(start < entry->vme_end);
	entry->vme_start = start;

	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);

	/* Both halves now reference the same backing submap/object. */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5574
5575
5576 /*
5577 * vm_map_clip_end: [ internal use only ]
5578 *
5579 * Asserts that the given entry ends at or before
5580 * the specified address; if necessary,
5581 * it splits the entry into two.
5582 */
void
vm_map_clip_end(
	vm_map_t map,
	vm_map_entry_t entry,
	vm_map_offset_t endaddr)
{
	if (endaddr > entry->vme_end) {
		/*
		 * Within the scope of this clipping, limit "endaddr" to
		 * the end of this map entry...
		 */
		endaddr = entry->vme_end;
	}
#ifndef NO_NESTED_PMAP
	if (entry->is_sub_map && entry->use_pmap) {
		vm_map_offset_t start_unnest, end_unnest;

		/*
		 * Make sure the range between the start of this entry and
		 * the new "endaddr" is no longer nested before we clip.
		 * Unnest only the minimum range the platform can handle.
		 * vm_map_clip_unnest may perform additional adjustments to
		 * the unnest range.
		 */
		start_unnest = entry->vme_start;
		end_unnest =
		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
		    ~(pmap_shared_region_size_min(map->pmap) - 1);
		vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
	}
#endif /* NO_NESTED_PMAP */
	if (endaddr < entry->vme_end) {
		/*
		 * NOTE(review): physically-contiguous objects get all their
		 * pmap mappings removed before the split — presumably they
		 * cannot be partially unmapped and are re-entered on fault.
		 */
		if (VME_OBJECT(entry) &&
		    !entry->is_sub_map &&
		    VME_OBJECT(entry)->phys_contiguous) {
			pmap_remove(map->pmap,
			    (addr64_t)(entry->vme_start),
			    (addr64_t)(entry->vme_end));
		}
		/* Atomic entries must never be split. */
		if (entry->vme_atomic) {
			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
		}
		DTRACE_VM5(
			vm_map_clip_end,
			vm_map_t, map,
			vm_map_offset_t, entry->vme_start,
			vm_map_offset_t, entry->vme_end,
			vm_map_offset_t, endaddr,
			int, VME_ALIAS(entry));

		_vm_map_clip_end(&map->hdr, entry, endaddr);
		/* The entry list changed: refresh the free-space hint. */
		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, NULL, FALSE);
		} else {
			vm_map_store_update_first_free(map, map->first_free, FALSE);
		}
	}
}
5641
5642
/*
 * vm_map_copy_clip_end: [ internal use only ]
 *
 * Split "entry" at "endaddr" if the address falls strictly inside
 * the entry; no-op otherwise.  Operates on the copy's entry-list
 * header (cpy_hdr), so no pmap or unnesting work is involved.
 */
#define vm_map_copy_clip_end(copy, entry, endaddr) \
	MACRO_BEGIN \
	if ((endaddr) < (entry)->vme_end) \
	        _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
	MACRO_END
5648
5649 /*
5650 * This routine is called only when it is known that
5651 * the entry must be split.
5652 */
static void
_vm_map_clip_end(
	struct vm_map_header *map_header,
	vm_map_entry_t entry,
	vm_map_offset_t end)
{
	vm_map_entry_t new_entry;

	/*
	 * Create a new entry and insert it
	 * AFTER the specified entry
	 */

	/* Map-aligned entries may only be split on map page boundaries. */
	if (entry->map_aligned) {
		assert(VM_MAP_PAGE_ALIGNED(end,
		    VM_MAP_HDR_PAGE_MASK(map_header)));
	}

	/* The new (tail) entry is a full copy covering [end, vme_end). */
	new_entry = _vm_map_entry_create(map_header);
	vm_map_entry_copy_full(new_entry, entry);

	assert(entry->vme_start < end);
	new_entry->vme_start = entry->vme_end = end;
	/* Advance the tail entry's object offset past the split point. */
	VME_OFFSET_SET(new_entry,
	    VME_OFFSET(new_entry) + (end - entry->vme_start));
	assert(new_entry->vme_start < new_entry->vme_end);

	_vm_map_store_entry_link(map_header, entry, new_entry);

	/* Both halves now reference the same backing submap/object. */
	if (entry->is_sub_map) {
		vm_map_reference(VME_SUBMAP(new_entry));
	} else {
		vm_object_reference(VME_OBJECT(new_entry));
	}
}
5688
5689
5690 /*
5691 * VM_MAP_RANGE_CHECK: [ internal use only ]
5692 *
5693 * Asserts that the starting and ending region
5694 * addresses fall within the valid range of the map.
5695 */
/*
 * Note: clamps silently (possibly to an empty range) rather than
 * reporting an error, and evaluates the "start"/"end" lvalues more
 * than once — do not pass expressions with side effects.
 */
#define VM_MAP_RANGE_CHECK(map, start, end) \
	MACRO_BEGIN \
	if (start < vm_map_min(map)) \
	        start = vm_map_min(map); \
	if (end > vm_map_max(map)) \
	        end = vm_map_max(map); \
	if (start > end) \
	        start = end; \
	MACRO_END
5705
5706 /*
5707 * vm_map_range_check: [ internal use only ]
5708 *
5709 * Check that the region defined by the specified start and
5710 * end addresses are wholly contained within a single map
 * entry or set of adjacent map entries of the specified map,
5712 * i.e. the specified region contains no unmapped space.
5713 * If any or all of the region is unmapped, FALSE is returned.
5714 * Otherwise, TRUE is returned and if the output argument 'entry'
5715 * is not NULL it points to the map entry containing the start
5716 * of the region.
5717 *
5718 * The map is locked for reading on entry and is left locked.
5719 */
5720 static boolean_t
vm_map_range_check(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_map_entry_t * entry)5721 vm_map_range_check(
5722 vm_map_t map,
5723 vm_map_offset_t start,
5724 vm_map_offset_t end,
5725 vm_map_entry_t *entry)
5726 {
5727 vm_map_entry_t cur;
5728 vm_map_offset_t prev;
5729
5730 /*
5731 * Basic sanity checks first
5732 */
5733 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5734 return FALSE;
5735 }
5736
5737 /*
5738 * Check first if the region starts within a valid
5739 * mapping for the map.
5740 */
5741 if (!vm_map_lookup_entry(map, start, &cur)) {
5742 return FALSE;
5743 }
5744
5745 /*
5746 * Optimize for the case that the region is contained
5747 * in a single map entry.
5748 */
5749 if (entry != (vm_map_entry_t *) NULL) {
5750 *entry = cur;
5751 }
5752 if (end <= cur->vme_end) {
5753 return TRUE;
5754 }
5755
5756 /*
5757 * If the region is not wholly contained within a
5758 * single entry, walk the entries looking for holes.
5759 */
5760 prev = cur->vme_end;
5761 cur = cur->vme_next;
5762 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5763 if (end <= cur->vme_end) {
5764 return TRUE;
5765 }
5766 prev = cur->vme_end;
5767 cur = cur->vme_next;
5768 }
5769 return FALSE;
5770 }
5771
5772 /*
5773 * vm_map_protect:
5774 *
5775 * Sets the protection of the specified address
5776 * region in the target map. If "set_max" is
5777 * specified, the maximum protection is to be set;
5778 * otherwise, only the current protection is affected.
5779 */
kern_return_t
vm_map_protect(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_prot_t new_prot,
	boolean_t set_max)
{
	vm_map_entry_t current;
	vm_map_offset_t prev;
	vm_map_entry_t entry;
	vm_prot_t new_max;
	int pmap_options = 0;
	kern_return_t kr;

	/*
	 * VM_PROT_COPY: the caller wants a writable private copy.
	 * Remap the range copy-on-write over itself (vm_map_remap with
	 * vmkf_remap_prot_copy also widens the max protections to allow
	 * the write), then fall through to the plain protection change.
	 */
	if (new_prot & VM_PROT_COPY) {
		vm_map_offset_t new_start;
		vm_prot_t cur_prot, max_prot;
		vm_map_kernel_flags_t kflags;

		/* LP64todo - see below */
		if (start >= map->max_offset) {
			return KERN_INVALID_ADDRESS;
		}

		/* W^X policy: refuse write+exec on CS-enforced user maps. */
		if ((new_prot & VM_PROT_ALLEXEC) &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if XNU_TARGET_OS_OSX && __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* XNU_TARGET_OS_OSX && __arm64__ */
		    ) &&
		    VM_MAP_POLICY_WX_FAIL(map)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) start,
			    uint64_t, (uint64_t) end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (current_task()->bsd_info
			    ? proc_name_address(current_task()->bsd_info)
			    : "?"),
			    __FUNCTION__);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * Let vm_map_remap_extract() know that it will need to:
		 * + make a copy of the mapping
		 * + add VM_PROT_WRITE to the max protections
		 * + remove any protections that are no longer allowed from the
		 *   max protections (to avoid any WRITE/EXECUTE conflict, for
		 *   example).
		 * Note that "max_prot" is an IN/OUT parameter only for this
		 * specific (VM_PROT_COPY) case. It's usually an OUT parameter
		 * only.
		 */
		max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
		cur_prot = VM_PROT_NONE;
		kflags = VM_MAP_KERNEL_FLAGS_NONE;
		kflags.vmkf_remap_prot_copy = TRUE;
		kflags.vmkf_overwrite_immutable = TRUE;
		new_start = start;
		kr = vm_map_remap(map,
		    &new_start,
		    end - start,
		    0, /* mask */
		    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
		    kflags,
		    0,
		    map,
		    start,
		    TRUE, /* copy-on-write remapping! */
		    &cur_prot, /* IN/OUT */
		    &max_prot, /* IN/OUT */
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		new_prot &= ~VM_PROT_COPY;
	}

	vm_map_lock(map);

	/* LP64todo - remove this check when vm_map_commpage64()
	 * no longer has to stuff in a map_entry for the commpage
	 * above the map's max_offset.
	 */
	if (start >= map->max_offset) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	while (1) {
		/*
		 * Lookup the entry. If it doesn't start in a valid
		 * entry, return an error.
		 */
		if (!vm_map_lookup_entry(map, start, &entry)) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
			start = SUPERPAGE_ROUND_DOWN(start);
			continue;
		}
		break;
	}
	/* Superpage mappings can only change protection as a whole. */
	if (entry->superpage_size) {
		end = SUPERPAGE_ROUND_UP(end);
	}

	/*
	 * Make a first pass to check for protection and address
	 * violations.
	 */

	current = entry;
	prev = current->vme_start;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		/*
		 * If there is a hole, return an error.
		 */
		if (current->vme_start != prev) {
			vm_map_unlock(map);
			return KERN_INVALID_ADDRESS;
		}

		new_max = current->max_protection;

#if defined(__x86_64__)
		/* Allow max mask to include execute prot bits if this map doesn't enforce CS */
		if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
			new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
		}
#endif
		/* The requested protections must fit within the entry's max. */
		if ((new_prot & new_max) != new_prot) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * JIT regions whose current protection is governed by a
		 * pmap protection policy cannot be changed here.
		 */
		if (current->used_for_jit &&
		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
			vm_map_unlock(map);
			return KERN_PROTECTION_FAILURE;
		}

		/* W^X: strip exec (or fail outright, per map policy). */
		if ((new_prot & VM_PROT_WRITE) &&
		    (new_prot & VM_PROT_ALLEXEC) &&
#if XNU_TARGET_OS_OSX
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    ) &&
#endif /* XNU_TARGET_OS_OSX */
		    !(current->used_for_jit)) {
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t) current->vme_start,
			    uint64_t, (uint64_t) current->vme_end,
			    vm_prot_t, new_prot);
			printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (current_task()->bsd_info
			    ? proc_name_address(current_task()->bsd_info)
			    : "?"),
			    __FUNCTION__);
			new_prot &= ~VM_PROT_ALLEXEC;
			if (VM_MAP_POLICY_WX_FAIL(map)) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		/*
		 * If the task has requested executable lockdown,
		 * deny both:
		 * - adding executable protections OR
		 * - adding write protections to an existing executable mapping.
		 */
		if (map->map_disallow_new_exec == TRUE) {
			if ((new_prot & VM_PROT_ALLEXEC) ||
			    ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
				vm_map_unlock(map);
				return KERN_PROTECTION_FAILURE;
			}
		}

		prev = current->vme_end;
		current = current->vme_next;
	}

#if __arm64__
	if (end > prev &&
	    end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
		vm_map_entry_t prev_entry;

		prev_entry = current->vme_prev;
		if (prev_entry != vm_map_to_entry(map) &&
		    !prev_entry->map_aligned &&
		    (vm_map_round_page(prev_entry->vme_end,
		    VM_MAP_PAGE_MASK(map))
		    == end)) {
			/*
			 * The last entry in our range is not "map-aligned"
			 * but it would have reached all the way to "end"
			 * if it had been map-aligned, so this is not really
			 * a hole in the range and we can proceed.
			 */
			prev = end;
		}
	}
#endif /* __arm64__ */

	/* The walk above must have covered the whole [start, end) range. */
	if (end > prev) {
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Go back and fix up protections.
	 * Clip to start here if the range starts within
	 * the entry.
	 */

	current = entry;
	if (current != vm_map_to_entry(map)) {
		/* clip and unnest if necessary */
		vm_map_clip_start(map, current, start);
	}

	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start < end)) {
		vm_prot_t old_prot;

		vm_map_clip_end(map, current, end);

		if (current->is_sub_map) {
			/* clipping did unnest if needed */
			assert(!current->use_pmap);
		}

		old_prot = current->protection;

		if (set_max) {
			current->max_protection = new_prot;
			/* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
			current->protection = (new_prot & old_prot);
		} else {
			current->protection = new_prot;
		}

		/*
		 * Update physical map if necessary.
		 * If the request is to turn off write protection,
		 * we won't do it for real (in pmap). This is because
		 * it would cause copy-on-write to fail. We've already
		 * set the new protection in the map, so if a
		 * write-protect fault occurred, it will be fixed up
		 * properly, COW or not.
		 */
		if (current->protection != old_prot) {
			/* Look one level in; we support nested pmaps */
			/* from mapped submaps which are direct entries */
			/* in our map */

			vm_prot_t prot;

			prot = current->protection;
			if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
				/* keep write off in pmap so COW faults still occur */
				prot &= ~VM_PROT_WRITE;
			} else {
				assert(!VME_OBJECT(current)->code_signed);
				assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
			}

			if (override_nx(map, VME_ALIAS(current)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}

#if DEVELOPMENT || DEBUG
			if (!(old_prot & VM_PROT_EXECUTE) &&
			    (prot & VM_PROT_EXECUTE) &&
			    panic_on_unsigned_execute &&
			    (proc_selfcsflags() & CS_KILL)) {
				panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
			}
#endif /* DEVELOPMENT || DEBUG */

			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
				/* protection-policy entries can't be wired. */
				if (current->wired_count) {
					panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
					    map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
				}

				/* If the pmap layer cares about this
				 * protection type, force a fault for
				 * each page so that vm_fault will
				 * repopulate the page with the full
				 * set of protections.
				 */
				/*
				 * TODO: We don't seem to need this,
				 * but this is due to an internal
				 * implementation detail of
				 * pmap_protect. Do we want to rely
				 * on this?
				 */
				prot = VM_PROT_NONE;
			}

			if (current->is_sub_map && current->use_pmap) {
				pmap_protect(VME_SUBMAP(current)->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot);
			} else {
				if (prot & VM_PROT_WRITE) {
					if (VME_OBJECT(current) == compressor_object) {
						/*
						 * For write requests on the
						 * compressor, we will ask the
						 * pmap layer to prevent us from
						 * taking a write fault when we
						 * attempt to access the mapping
						 * next.
						 */
						pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
					}
				}

				pmap_protect_options(map->pmap,
				    current->vme_start,
				    current->vme_end,
				    prot,
				    pmap_options,
				    NULL);
			}
		}
		current = current->vme_next;
	}

	/* Coalesce any entries that became identical after the change. */
	current = entry;
	while ((current != vm_map_to_entry(map)) &&
	    (current->vme_start <= end)) {
		vm_map_simplify_entry(map, current);
		current = current->vme_next;
	}

	vm_map_unlock(map);
	return KERN_SUCCESS;
}
6135
6136 /*
6137 * vm_map_inherit:
6138 *
6139 * Sets the inheritance of the specified address
6140 * range in the target map. Inheritance
6141 * affects how the map will be shared with
6142 * child maps at the time of vm_map_fork.
6143 */
6144 kern_return_t
vm_map_inherit(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_inherit_t new_inheritance)6145 vm_map_inherit(
6146 vm_map_t map,
6147 vm_map_offset_t start,
6148 vm_map_offset_t end,
6149 vm_inherit_t new_inheritance)
6150 {
6151 vm_map_entry_t entry;
6152 vm_map_entry_t temp_entry;
6153
6154 vm_map_lock(map);
6155
6156 VM_MAP_RANGE_CHECK(map, start, end);
6157
6158 if (vm_map_lookup_entry(map, start, &temp_entry)) {
6159 entry = temp_entry;
6160 } else {
6161 temp_entry = temp_entry->vme_next;
6162 entry = temp_entry;
6163 }
6164
6165 /* first check entire range for submaps which can't support the */
6166 /* given inheritance. */
6167 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6168 if (entry->is_sub_map) {
6169 if (new_inheritance == VM_INHERIT_COPY) {
6170 vm_map_unlock(map);
6171 return KERN_INVALID_ARGUMENT;
6172 }
6173 }
6174
6175 entry = entry->vme_next;
6176 }
6177
6178 entry = temp_entry;
6179 if (entry != vm_map_to_entry(map)) {
6180 /* clip and unnest if necessary */
6181 vm_map_clip_start(map, entry, start);
6182 }
6183
6184 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6185 vm_map_clip_end(map, entry, end);
6186 if (entry->is_sub_map) {
6187 /* clip did unnest if needed */
6188 assert(!entry->use_pmap);
6189 }
6190
6191 entry->inheritance = new_inheritance;
6192
6193 entry = entry->vme_next;
6194 }
6195
6196 vm_map_unlock(map);
6197 return KERN_SUCCESS;
6198 }
6199
6200 /*
6201 * Update the accounting for the amount of wired memory in this map. If the user has
6202 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6203 */
6204
6205 static kern_return_t
add_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6206 add_wire_counts(
6207 vm_map_t map,
6208 vm_map_entry_t entry,
6209 boolean_t user_wire)
6210 {
6211 vm_map_size_t size;
6212
6213 if (user_wire) {
6214 unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;
6215
6216 /*
6217 * We're wiring memory at the request of the user. Check if this is the first time the user is wiring
6218 * this map entry.
6219 */
6220
6221 if (entry->user_wired_count == 0) {
6222 size = entry->vme_end - entry->vme_start;
6223
6224 /*
6225 * Since this is the first time the user is wiring this map entry, check to see if we're
6226 * exceeding the user wire limits. There is a per map limit which is the smaller of either
6227 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also
6228 * a system-wide limit on the amount of memory all users can wire. If the user is over either
6229 * limit, then we fail.
6230 */
6231
6232 if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6233 size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6234 if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6235 #if DEVELOPMENT || DEBUG
6236 if (panic_on_mlock_failure) {
6237 panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6238 }
6239 #endif /* DEVELOPMENT || DEBUG */
6240 os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6241 } else {
6242 os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6243 #if DEVELOPMENT || DEBUG
6244 if (panic_on_mlock_failure) {
6245 panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6246 }
6247 #endif /* DEVELOPMENT || DEBUG */
6248 }
6249 return KERN_RESOURCE_SHORTAGE;
6250 }
6251
6252 /*
6253 * The first time the user wires an entry, we also increment the wired_count and add this to
6254 * the total that has been wired in the map.
6255 */
6256
6257 if (entry->wired_count >= MAX_WIRE_COUNT) {
6258 return KERN_FAILURE;
6259 }
6260
6261 entry->wired_count++;
6262 map->user_wire_size += size;
6263 }
6264
6265 if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6266 return KERN_FAILURE;
6267 }
6268
6269 entry->user_wired_count++;
6270 } else {
6271 /*
6272 * The kernel's wiring the memory. Just bump the count and continue.
6273 */
6274
6275 if (entry->wired_count >= MAX_WIRE_COUNT) {
6276 panic("vm_map_wire: too many wirings");
6277 }
6278
6279 entry->wired_count++;
6280 }
6281
6282 return KERN_SUCCESS;
6283 }
6284
6285 /*
6286 * Update the memory wiring accounting now that the given map entry is being unwired.
6287 */
6288
6289 static void
subtract_wire_counts(vm_map_t map,vm_map_entry_t entry,boolean_t user_wire)6290 subtract_wire_counts(
6291 vm_map_t map,
6292 vm_map_entry_t entry,
6293 boolean_t user_wire)
6294 {
6295 if (user_wire) {
6296 /*
6297 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6298 */
6299
6300 if (entry->user_wired_count == 1) {
6301 /*
6302 * We're removing the last user wire reference. Decrement the wired_count and the total
6303 * user wired memory for this map.
6304 */
6305
6306 assert(entry->wired_count >= 1);
6307 entry->wired_count--;
6308 map->user_wire_size -= entry->vme_end - entry->vme_start;
6309 }
6310
6311 assert(entry->user_wired_count >= 1);
6312 entry->user_wired_count--;
6313 } else {
6314 /*
6315 * The kernel is unwiring the memory. Just update the count.
6316 */
6317
6318 assert(entry->wired_count >= 1);
6319 entry->wired_count--;
6320 }
6321 }
6322
/*
 * Count of wire requests rejected because they targeted executable
 * memory in a code-signing-enforced map (see vm_map_wire_nested).
 */
int cs_executable_wire = 0;
6324
6325 /*
6326 * vm_map_wire:
6327 *
6328 * Sets the pageability of the specified address range in the
6329 * target map as wired. Regions specified as not pageable require
6330 * locked-down physical memory and physical page maps. The
6331 * access_type variable indicates types of accesses that must not
6332 * generate page faults. This is checked against protection of
6333 * memory being locked-down.
6334 *
6335 * The map must not be locked, but a reference must remain to the
6336 * map throughout the call.
6337 */
/*
 * Common implementation behind all vm_map_wire* entry points.
 *
 *	map:          map whose range [start, end) is to be wired
 *	caller_prot:  access types that must not fault once wired
 *	tag:          VM tag used to account the wired memory
 *	user_wire:    TRUE for user (mlock-style) wiring: per-task/global
 *	              limits apply and waits are interruptible
 *	map_pmap:     non-NULL when recursing into a submap: the physical
 *	              map (with base address pmap_addr) the translations
 *	              should be entered into; NULL for the top-level call
 *	physpage_p:   if non-NULL, caller wants the physical page number of
 *	              the (single) wired page; set to 0 on failure
 */
static kern_return_t
vm_map_wire_nested(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_prot_t caller_prot,
	vm_tag_t tag,
	boolean_t user_wire,
	pmap_t map_pmap,
	vm_map_offset_t pmap_addr,
	ppnum_t *physpage_p)
{
	vm_map_entry_t entry;
	vm_prot_t access_type;
	struct vm_map_entry *first_entry, tmp_entry;
	vm_map_t real_map;
	vm_map_offset_t s, e;
	kern_return_t rc;
	boolean_t need_wakeup;
	boolean_t main_map = FALSE;
	wait_interrupt_t interruptible_state;
	thread_t cur_thread;
	unsigned int last_timestamp;
	vm_map_size_t size;
	boolean_t wire_and_extract;
	vm_prot_t extra_prots;

	/*
	 * Extra protection bits passed to vm_map_lookup_locked() below so
	 * that copy-on-write is resolved now (we can't move pages once
	 * they are wired).
	 */
	extra_prots = VM_PROT_COPY;
	extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
#if XNU_TARGET_OS_OSX
	/* kernel map and non-CS-enforced maps may copy executable pages */
	if (map->pmap == kernel_pmap ||
	    !vm_map_cs_enforcement(map)) {
		extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
	}
#endif /* XNU_TARGET_OS_OSX */

	access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));

	wire_and_extract = FALSE;
	if (physpage_p != NULL) {
		/*
		 * The caller wants the physical page number of the
		 * wired page. We return only one physical page number
		 * so this works for only one page at a time.
		 */
		if ((end - start) != PAGE_SIZE) {
			return KERN_INVALID_ARGUMENT;
		}
		wire_and_extract = TRUE;
		*physpage_p = 0;
	}

	vm_map_lock(map);
	if (map_pmap == NULL) {
		/* top-level (non-submap) invocation */
		main_map = TRUE;
	}
	/* used later to detect whether the map changed while unlocked */
	last_timestamp = map->timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We wired what the caller asked for, zero pages */
		vm_map_unlock(map);
		return KERN_SUCCESS;
	}

	need_wakeup = FALSE;
	cur_thread = current_thread();

	s = start;
	rc = KERN_SUCCESS;

	if (vm_map_lookup_entry(map, s, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested submaps here !
		 */
	} else {
		/* Start address is not in map */
		rc = KERN_INVALID_ADDRESS;
		goto done;
	}

	while ((entry != vm_map_to_entry(map)) && (s < end)) {
		/*
		 * At this point, we have wired from "start" to "s".
		 * We still need to wire from "s" to "end".
		 *
		 * "entry" hasn't been clipped, so it could start before "s"
		 * and/or end after "end".
		 */

		/* "e" is how far we want to wire in this entry */
		e = entry->vme_end;
		if (e > end) {
			e = end;
		}

		/*
		 * If another thread is wiring/unwiring this entry then
		 * block after informing other thread to wake us up.
		 */
		if (entry->in_transition) {
			wait_result_t wait_result;

			/*
			 * We have not clipped the entry.  Make sure that
			 * the start address is in range so that the lookup
			 * below will succeed.
			 * "s" is the current starting point: we've already
			 * wired from "start" to "s" and we still have
			 * to wire from "s" to "end".
			 */

			entry->needs_wakeup = TRUE;

			/*
			 * wake up anybody waiting on entries that we have
			 * already wired.
			 */
			if (need_wakeup) {
				vm_map_entry_wakeup(map);
				need_wakeup = FALSE;
			}
			/*
			 * User wiring is interruptible
			 */
			wait_result = vm_map_entry_wait(map,
			    (user_wire) ? THREAD_ABORTSAFE :
			    THREAD_UNINT);
			if (user_wire && wait_result == THREAD_INTERRUPTED) {
				/*
				 * undo the wirings we have done so far
				 * We do not clear the needs_wakeup flag,
				 * because we cannot tell if we were the
				 * only one waiting.
				 */
				rc = KERN_FAILURE;
				goto done;
			}

			/*
			 * Cannot avoid a lookup here. reset timestamp.
			 */
			last_timestamp = map->timestamp;

			/*
			 * The entry could have been clipped, look it up again.
			 * Worst that can happen is, it may not exist anymore.
			 */
			if (!vm_map_lookup_entry(map, s, &first_entry)) {
				/*
				 * User: undo everything up to the previous
				 * entry.  let vm_map_unwire worry about
				 * checking the validity of the range.
				 */
				rc = KERN_FAILURE;
				goto done;
			}
			entry = first_entry;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_start;
			vm_map_offset_t local_end;
			pmap_t pmap;

			if (wire_and_extract) {
				/*
				 * Wiring would result in copy-on-write
				 * which would not be compatible with
				 * the sharing we have with the original
				 * provider of this memory.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}

			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);

			/* translate [s, e) into the submap's address space */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end;
			sub_end += VME_OFFSET(entry) - entry->vme_start;

			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				vm_object_t object;
				vm_object_offset_t offset;
				vm_prot_t prot;
				boolean_t wired;
				vm_map_entry_t local_entry;
				vm_map_version_t version;
				vm_map_t lookup_map;

				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					/* ppc implementation requires that */
					/* submaps pmap address ranges line */
					/* up with parent map */
#ifdef notdef
					pmap_addr = sub_start;
#endif
					pmap_addr = s;
				} else {
					pmap = map->pmap;
					pmap_addr = s;
				}

				if (entry->wired_count) {
					/* already wired: just take another reference */
					if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
						goto done;
					}

					/*
					 * The map was not unlocked:
					 * no need to goto re-lookup.
					 * Just go directly to next entry.
					 */
					entry = entry->vme_next;
					s = entry->vme_start;
					continue;
				}

				/* call vm_map_lookup_locked to */
				/* cause any needs copy to be   */
				/* evaluated */
				local_start = entry->vme_start;
				lookup_map = map;
				vm_map_lock_write_to_read(map);
				rc = vm_map_lookup_locked(
					&lookup_map, local_start,
					(access_type | extra_prots),
					OBJECT_LOCK_EXCLUSIVE,
					&version, &object,
					&offset, &prot, &wired,
					NULL,
					&real_map, NULL);
				if (rc != KERN_SUCCESS) {
					vm_map_unlock_read(lookup_map);
					assert(map_pmap == NULL);
					/* undo what was wired before this entry */
					vm_map_unwire(map, start,
					    s, user_wire);
					return rc;
				}
				vm_object_unlock(object);
				if (real_map != lookup_map) {
					vm_map_unlock(real_map);
				}
				vm_map_unlock_read(lookup_map);
				vm_map_lock(map);

				/* we unlocked, so must re-lookup */
				if (!vm_map_lookup_entry(map,
				    local_start,
				    &local_entry)) {
					rc = KERN_FAILURE;
					goto done;
				}

				/*
				 * entry could have been "simplified",
				 * so re-clip
				 */
				entry = local_entry;
				assert(s == local_start);
				vm_map_clip_start(map, entry, s);
				vm_map_clip_end(map, entry, end);
				/* re-compute "e" */
				e = entry->vme_end;
				if (e > end) {
					e = end;
				}

				/* did we have a change of type? */
				if (!entry->is_sub_map) {
					/* no longer a submap: restart the loop on it */
					last_timestamp = map->timestamp;
					continue;
				}
			} else {
				local_start = entry->vme_start;
				pmap = map_pmap;
			}

			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
				goto done;
			}

			/* keep the entry alive/stable while the map is unlocked */
			entry->in_transition = TRUE;

			vm_map_unlock(map);
			rc = vm_map_wire_nested(VME_SUBMAP(entry),
			    sub_start, sub_end,
			    caller_prot, tag,
			    user_wire, pmap, pmap_addr,
			    NULL);
			vm_map_lock(map);

			/*
			 * Find the entry again.  It could have been clipped
			 * after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, local_start,
			    &first_entry)) {
				panic("vm_map_wire: re-lookup failed");
			}
			entry = first_entry;

			assert(local_start == s);
			/* re-compute "e" */
			e = entry->vme_end;
			if (e > end) {
				e = end;
			}

			last_timestamp = map->timestamp;
			/*
			 * Clear in_transition (and undo counts on failure)
			 * on every constituent entry of the wired range.
			 */
			while ((entry != vm_map_to_entry(map)) &&
			    (entry->vme_start < e)) {
				assert(entry->in_transition);
				entry->in_transition = FALSE;
				if (entry->needs_wakeup) {
					entry->needs_wakeup = FALSE;
					need_wakeup = TRUE;
				}
				if (rc != KERN_SUCCESS) {/* from vm_*_wire */
					subtract_wire_counts(map, entry, user_wire);
				}
				entry = entry->vme_next;
			}
			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
				goto done;
			}

			/* no need to relookup again */
			s = entry->vme_start;
			continue;
		}

		/*
		 * If this entry is already wired then increment
		 * the appropriate wire reference count.
		 */
		if (entry->wired_count) {
			if ((entry->protection & access_type) != access_type) {
				/* found a protection problem */

				/*
				 * XXX FBDP
				 * We should always return an error
				 * in this case but since we didn't
				 * enforce it before, let's do
				 * it only for the new "wire_and_extract"
				 * code path for now...
				 */
				if (wire_and_extract) {
					rc = KERN_PROTECTION_FAILURE;
					goto done;
				}
			}

			/*
			 * entry is already wired down, get our reference
			 * after clipping to our range.
			 */
			vm_map_clip_start(map, entry, s);
			vm_map_clip_end(map, entry, end);

			if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
				goto done;
			}

			if (wire_and_extract) {
				vm_object_t object;
				vm_object_offset_t offset;
				vm_page_t m;

				/*
				 * We don't have to "wire" the page again
				 * but we still have to "extract" its
				 * physical page number, after some sanity
				 * checks.
				 */
				assert((entry->vme_end - entry->vme_start)
				    == PAGE_SIZE);
				assert(!entry->needs_copy);
				assert(!entry->is_sub_map);
				assert(VME_OBJECT(entry));
				if (((entry->vme_end - entry->vme_start)
				    != PAGE_SIZE) ||
				    entry->needs_copy ||
				    entry->is_sub_map ||
				    VME_OBJECT(entry) == VM_OBJECT_NULL) {
					rc = KERN_INVALID_ARGUMENT;
					goto done;
				}

				object = VME_OBJECT(entry);
				offset = VME_OFFSET(entry);
				/* need exclusive lock to update m->dirty */
				if (entry->protection & VM_PROT_WRITE) {
					vm_object_lock(object);
				} else {
					vm_object_lock_shared(object);
				}
				m = vm_page_lookup(object, offset);
				assert(m != VM_PAGE_NULL);
				assert(VM_PAGE_WIRED(m));
				if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
					*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
					if (entry->protection & VM_PROT_WRITE) {
						vm_object_lock_assert_exclusive(
							object);
						m->vmp_dirty = TRUE;
					}
				} else {
					/* not already wired !? */
					*physpage_p = 0;
				}
				vm_object_unlock(object);
			}

			/* map was not unlocked: no need to relookup */
			entry = entry->vme_next;
			s = entry->vme_start;
			continue;
		}

		/*
		 * Unwired entry or wire request transmitted via submap
		 */

		/*
		 * Wiring would copy the pages to the shadow object.
		 * The shadow object would not be code-signed so
		 * attempting to execute code from these copied pages
		 * would trigger a code-signing violation.
		 */

		if ((entry->protection & VM_PROT_EXECUTE)
#if XNU_TARGET_OS_OSX
		    &&
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    )
#endif /* XNU_TARGET_OS_OSX */
		    ) {
#if MACH_ASSERT
			printf("pid %d[%s] wiring executable range from "
			    "0x%llx to 0x%llx: rejected to preserve "
			    "code-signing\n",
			    proc_selfpid(),
			    (current_task()->bsd_info
			    ? proc_name_address(current_task()->bsd_info)
			    : "?"),
			    (uint64_t) entry->vme_start,
			    (uint64_t) entry->vme_end);
#endif /* MACH_ASSERT */
			DTRACE_VM2(cs_executable_wire,
			    uint64_t, (uint64_t)entry->vme_start,
			    uint64_t, (uint64_t)entry->vme_end);
			cs_executable_wire++;
			rc = KERN_PROTECTION_FAILURE;
			goto done;
		}

		/*
		 * Perform actions of vm_map_lookup that need the write
		 * lock on the map: create a shadow object for a
		 * copy-on-write region, or an object for a zero-fill
		 * region.
		 */
		size = entry->vme_end - entry->vme_start;
		/*
		 * If wiring a copy-on-write page, we need to copy it now
		 * even if we're only (currently) requesting read access.
		 * This is aggressive, but once it's wired we can't move it.
		 */
		if (entry->needs_copy) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should not be "needs_copy"
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}

			VME_OBJECT_SHADOW(entry, size);
			entry->needs_copy = FALSE;
		} else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
			if (wire_and_extract) {
				/*
				 * We're supposed to share with the original
				 * provider so should already have an object.
				 */
				rc = KERN_INVALID_ARGUMENT;
				goto done;
			}
			VME_OBJECT_SET(entry, vm_object_allocate(size));
			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
			assert(entry->use_pmap);
		}

		vm_map_clip_start(map, entry, s);
		vm_map_clip_end(map, entry, end);

		/* re-compute "e" */
		e = entry->vme_end;
		if (e > end) {
			e = end;
		}

		/*
		 * Check for holes and protection mismatch.
		 * Holes: Next entry should be contiguous unless this
		 * is the end of the region.
		 * Protection: Access requested must be allowed, unless
		 * wiring is by protection class
		 */
		if ((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end))) {
			/* found a hole */
			rc = KERN_INVALID_ADDRESS;
			goto done;
		}
		if ((entry->protection & access_type) != access_type) {
			/* found a protection problem */
			rc = KERN_PROTECTION_FAILURE;
			goto done;
		}

		assert(entry->wired_count == 0 && entry->user_wired_count == 0);

		if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
			goto done;
		}

		entry->in_transition = TRUE;

		/*
		 * This entry might get split once we unlock the map.
		 * In vm_fault_wire(), we need the current range as
		 * defined by this entry.  In order for this to work
		 * along with a simultaneous clip operation, we make a
		 * temporary copy of this entry and use that for the
		 * wiring.  Note that the underlying objects do not
		 * change during a clip.
		 */
		tmp_entry = *entry;

		/*
		 * The in_transition state guarantees that the entry
		 * (or entries for this range, if split occurred) will be
		 * there when the map lock is acquired for the second time.
		 */
		vm_map_unlock(map);

		if (!user_wire && cur_thread != THREAD_NULL) {
			interruptible_state = thread_interrupt_level(THREAD_UNINT);
		} else {
			/* value is unused in this case; keep it initialized */
			interruptible_state = THREAD_UNINT;
		}

		if (map_pmap) {
			rc = vm_fault_wire(map,
			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
			    physpage_p);
		} else {
			rc = vm_fault_wire(map,
			    &tmp_entry, caller_prot, tag, map->pmap,
			    tmp_entry.vme_start,
			    physpage_p);
		}

		if (!user_wire && cur_thread != THREAD_NULL) {
			thread_interrupt_level(interruptible_state);
		}

		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				panic("vm_map_wire: re-lookup failed");
			}

			entry = first_entry;
		}

		last_timestamp = map->timestamp;

		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			if (rc != KERN_SUCCESS) {       /* from vm_*_wire */
				subtract_wire_counts(map, entry, user_wire);
			}
			entry = entry->vme_next;
		}

		if (rc != KERN_SUCCESS) {               /* from vm_*_wire */
			goto done;
		}

		if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
		    (tmp_entry.vme_end != end) &&       /* AND, we are not at the end of the requested range */
		    (entry->vme_start != tmp_entry.vme_end)) {  /* AND, the next entry is not contiguous. */
			/* found a "new" hole */
			s = tmp_entry.vme_end;
			rc = KERN_INVALID_ADDRESS;
			goto done;
		}

		s = entry->vme_start;
	} /* end while loop through map entries */

done:
	if (rc == KERN_SUCCESS) {
		/* repair any damage we may have made to the VM map */
		vm_map_simplify_range(map, start, end);
	}

	vm_map_unlock(map);

	/*
	 * wake up anybody waiting on entries we wired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}

	if (rc != KERN_SUCCESS) {
		/* undo what has been wired so far */
		vm_map_unwire_nested(map, start, s, user_wire,
		    map_pmap, pmap_addr);
		if (physpage_p) {
			*physpage_p = 0;
		}
	}

	return rc;
}
6999
7000 kern_return_t
vm_map_wire_external(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,boolean_t user_wire)7001 vm_map_wire_external(
7002 vm_map_t map,
7003 vm_map_offset_t start,
7004 vm_map_offset_t end,
7005 vm_prot_t caller_prot,
7006 boolean_t user_wire)
7007 {
7008 kern_return_t kret;
7009
7010 kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7011 user_wire, (pmap_t)NULL, 0, NULL);
7012 return kret;
7013 }
7014
7015 kern_return_t
vm_map_wire_kernel(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire)7016 vm_map_wire_kernel(
7017 vm_map_t map,
7018 vm_map_offset_t start,
7019 vm_map_offset_t end,
7020 vm_prot_t caller_prot,
7021 vm_tag_t tag,
7022 boolean_t user_wire)
7023 {
7024 kern_return_t kret;
7025
7026 kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7027 user_wire, (pmap_t)NULL, 0, NULL);
7028 return kret;
7029 }
7030
7031 kern_return_t
vm_map_wire_and_extract_external(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,boolean_t user_wire,ppnum_t * physpage_p)7032 vm_map_wire_and_extract_external(
7033 vm_map_t map,
7034 vm_map_offset_t start,
7035 vm_prot_t caller_prot,
7036 boolean_t user_wire,
7037 ppnum_t *physpage_p)
7038 {
7039 kern_return_t kret;
7040
7041 kret = vm_map_wire_nested(map,
7042 start,
7043 start + VM_MAP_PAGE_SIZE(map),
7044 caller_prot,
7045 vm_tag_bt(),
7046 user_wire,
7047 (pmap_t)NULL,
7048 0,
7049 physpage_p);
7050 if (kret != KERN_SUCCESS &&
7051 physpage_p != NULL) {
7052 *physpage_p = 0;
7053 }
7054 return kret;
7055 }
7056
7057 kern_return_t
vm_map_wire_and_extract_kernel(vm_map_t map,vm_map_offset_t start,vm_prot_t caller_prot,vm_tag_t tag,boolean_t user_wire,ppnum_t * physpage_p)7058 vm_map_wire_and_extract_kernel(
7059 vm_map_t map,
7060 vm_map_offset_t start,
7061 vm_prot_t caller_prot,
7062 vm_tag_t tag,
7063 boolean_t user_wire,
7064 ppnum_t *physpage_p)
7065 {
7066 kern_return_t kret;
7067
7068 kret = vm_map_wire_nested(map,
7069 start,
7070 start + VM_MAP_PAGE_SIZE(map),
7071 caller_prot,
7072 tag,
7073 user_wire,
7074 (pmap_t)NULL,
7075 0,
7076 physpage_p);
7077 if (kret != KERN_SUCCESS &&
7078 physpage_p != NULL) {
7079 *physpage_p = 0;
7080 }
7081 return kret;
7082 }
7083
7084 /*
7085 * vm_map_unwire:
7086 *
7087 * Sets the pageability of the specified address range in the target
7088 * as pageable. Regions specified must have been wired previously.
7089 *
7090 * The map must not be locked, but a reference must remain to the map
7091 * throughout the call.
7092 *
7093 * Kernel will panic on failures. User unwire ignores holes and
7094 * unwired and intransition entries to avoid losing memory by leaving
7095 * it unwired.
7096 */
/*
 * Common implementation behind vm_map_unwire().
 *
 *	map:          map whose range [start, end) is to be unwired
 *	user_wire:    TRUE when unwiring on behalf of a user request;
 *	              user unwire tolerates holes and unwired entries,
 *	              kernel unwire panics on them
 *	map_pmap:     non-NULL when recursing for a submap: the physical
 *	              map (base address pmap_addr) from which the
 *	              translations should be removed; NULL at top level
 */
static kern_return_t
vm_map_unwire_nested(
	vm_map_t map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	boolean_t user_wire,
	pmap_t map_pmap,
	vm_map_offset_t pmap_addr)
{
	vm_map_entry_t entry;
	struct vm_map_entry *first_entry, tmp_entry;
	boolean_t need_wakeup;
	boolean_t main_map = FALSE;
	unsigned int last_timestamp;

	vm_map_lock(map);
	if (map_pmap == NULL) {
		/* top-level (non-submap) invocation */
		main_map = TRUE;
	}
	/* used to detect whether the map changed while unlocked */
	last_timestamp = map->timestamp;

	VM_MAP_RANGE_CHECK(map, start, end);
	assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
	assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));

	if (start == end) {
		/* We unwired what the caller asked for: zero pages */
		vm_map_unlock(map);
		return KERN_SUCCESS;
	}

	if (vm_map_lookup_entry(map, start, &first_entry)) {
		entry = first_entry;
		/*
		 * vm_map_clip_start will be done later.
		 * We don't want to unnest any nested sub maps here !
		 */
	} else {
		if (!user_wire) {
			panic("vm_map_unwire: start not found");
		}
		/* Start address is not in map. */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	if (entry->superpage_size) {
		/* superpages are always wired */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	need_wakeup = FALSE;
	while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
		if (entry->in_transition) {
			/*
			 * 1)
			 * Another thread is wiring down this entry. Note
			 * that if it is not for the other thread we would
			 * be unwiring an unwired entry.  This is not
			 * permitted.  If we wait, we will be unwiring memory
			 * we did not wire.
			 *
			 * 2)
			 * Another thread is unwiring this entry.  We did not
			 * have a reference to it, because if we did, this
			 * entry will not be getting unwired now.
			 */
			if (!user_wire) {
				/*
				 * XXX FBDP
				 * This could happen: there could be some
				 * overlapping vslock/vsunlock operations
				 * going on.
				 * We should probably just wait and retry,
				 * but then we have to be careful that this
				 * entry could get "simplified" after
				 * "in_transition" gets unset and before
				 * we re-lookup the entry, so we would
				 * have to re-clip the entry to avoid
				 * re-unwiring what we have already unwired...
				 * See vm_map_wire_nested().
				 *
				 * Or we could just ignore "in_transition"
				 * here and proceed to decrement the wired
				 * count(s) on this entry.  That should be fine
				 * as long as "wired_count" doesn't drop all
				 * the way to 0 (and we should panic if THAT
				 * happens).
				 */
				panic("vm_map_unwire: in_transition entry");
			}

			/* user unwire: skip in-transition entries */
			entry = entry->vme_next;
			continue;
		}

		if (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;
			pmap_t pmap;

			vm_map_clip_start(map, entry, start);
			vm_map_clip_end(map, entry, end);

			/* translate the clipped range into the submap's space */
			sub_start = VME_OFFSET(entry);
			sub_end = entry->vme_end - entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			if (map_pmap == NULL) {
				if (entry->use_pmap) {
					pmap = VME_SUBMAP(entry)->pmap;
					pmap_addr = sub_start;
				} else {
					pmap = map->pmap;
					pmap_addr = start;
				}
				if (entry->wired_count == 0 ||
				    (user_wire && entry->user_wired_count == 0)) {
					if (!user_wire) {
						panic("vm_map_unwire: entry is unwired");
					}
					entry = entry->vme_next;
					continue;
				}

				/*
				 * Check for holes
				 * Holes: Next entry should be contiguous unless
				 * this is the end of the region.
				 */
				if (((entry->vme_end < end) &&
				    ((entry->vme_next == vm_map_to_entry(map)) ||
				    (entry->vme_next->vme_start
				    > entry->vme_end)))) {
					if (!user_wire) {
						panic("vm_map_unwire: non-contiguous region");
					}
					/*
					 * entry = entry->vme_next;
					 * continue;
					 */
				}

				subtract_wire_counts(map, entry, user_wire);

				if (entry->wired_count != 0) {
					/* still wired by someone else: done with this entry */
					entry = entry->vme_next;
					continue;
				}

				entry->in_transition = TRUE;
				tmp_entry = *entry;/* see comment in vm_map_wire() */

				/*
				 * We can unlock the map now. The in_transition state
				 * guarantees existence of the entry.
				 */
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, pmap, pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;

				/*
				 * clear transition bit for all constituent entries
				 * that were in the original entry (saved in
				 * tmp_entry).  Also check for waiters.
				 */
				while ((entry != vm_map_to_entry(map)) &&
				    (entry->vme_start < tmp_entry.vme_end)) {
					assert(entry->in_transition);
					entry->in_transition = FALSE;
					if (entry->needs_wakeup) {
						entry->needs_wakeup = FALSE;
						need_wakeup = TRUE;
					}
					entry = entry->vme_next;
				}
				continue;
			} else {
				/* recursive submap case: pass the caller's pmap down */
				tmp_entry = *entry;
				vm_map_unlock(map);
				vm_map_unwire_nested(VME_SUBMAP(entry),
				    sub_start, sub_end, user_wire, map_pmap,
				    pmap_addr);
				vm_map_lock(map);

				if (last_timestamp + 1 != map->timestamp) {
					/*
					 * Find the entry again.  It could have been
					 * clipped or deleted after we unlocked the map.
					 */
					if (!vm_map_lookup_entry(map,
					    tmp_entry.vme_start,
					    &first_entry)) {
						if (!user_wire) {
							panic("vm_map_unwire: re-lookup failed");
						}
						entry = first_entry->vme_next;
					} else {
						entry = first_entry;
					}
				}
				last_timestamp = map->timestamp;
			}
		}


		if ((entry->wired_count == 0) ||
		    (user_wire && entry->user_wired_count == 0)) {
			if (!user_wire) {
				panic("vm_map_unwire: entry is unwired");
			}

			entry = entry->vme_next;
			continue;
		}

		assert(entry->wired_count > 0 &&
		    (!user_wire || entry->user_wired_count > 0));

		vm_map_clip_start(map, entry, start);
		vm_map_clip_end(map, entry, end);

		/*
		 * Check for holes
		 * Holes: Next entry should be contiguous unless
		 * this is the end of the region.
		 */
		if (((entry->vme_end < end) &&
		    ((entry->vme_next == vm_map_to_entry(map)) ||
		    (entry->vme_next->vme_start > entry->vme_end)))) {
			if (!user_wire) {
				panic("vm_map_unwire: non-contiguous region");
			}
			entry = entry->vme_next;
			continue;
		}

		subtract_wire_counts(map, entry, user_wire);

		if (entry->wired_count != 0) {
			/* still wired by someone else: keep translations */
			entry = entry->vme_next;
			continue;
		}

		if (entry->zero_wired_pages) {
			entry->zero_wired_pages = FALSE;
		}

		entry->in_transition = TRUE;
		tmp_entry = *entry;     /* see comment in vm_map_wire() */

		/*
		 * We can unlock the map now. The in_transition state
		 * guarantees existence of the entry.
		 */
		vm_map_unlock(map);
		if (map_pmap) {
			vm_fault_unwire(map,
			    &tmp_entry, FALSE, map_pmap, pmap_addr);
		} else {
			vm_fault_unwire(map,
			    &tmp_entry, FALSE, map->pmap,
			    tmp_entry.vme_start);
		}
		vm_map_lock(map);

		if (last_timestamp + 1 != map->timestamp) {
			/*
			 * Find the entry again.  It could have been clipped
			 * or deleted after we unlocked the map.
			 */
			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
			    &first_entry)) {
				if (!user_wire) {
					panic("vm_map_unwire: re-lookup failed");
				}
				entry = first_entry->vme_next;
			} else {
				entry = first_entry;
			}
		}
		last_timestamp = map->timestamp;

		/*
		 * clear transition bit for all constituent entries that
		 * were in the original entry (saved in tmp_entry).  Also
		 * check for waiters.
		 */
		while ((entry != vm_map_to_entry(map)) &&
		    (entry->vme_start < tmp_entry.vme_end)) {
			assert(entry->in_transition);
			entry->in_transition = FALSE;
			if (entry->needs_wakeup) {
				entry->needs_wakeup = FALSE;
				need_wakeup = TRUE;
			}
			entry = entry->vme_next;
		}
	}

	/*
	 * We might have fragmented the address space when we wired this
	 * range of addresses.  Attempt to re-coalesce these VM map entries
	 * with their neighbors now that they're no longer wired.
	 * Under some circumstances, address space fragmentation can
	 * prevent VM object shadow chain collapsing, which can cause
	 * swap space leaks.
	 */
	vm_map_simplify_range(map, start, end);

	vm_map_unlock(map);
	/*
	 * wake up anybody waiting on entries that we have unwired.
	 */
	if (need_wakeup) {
		vm_map_entry_wakeup(map);
	}
	return KERN_SUCCESS;
}
7437
7438 kern_return_t
vm_map_unwire(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,boolean_t user_wire)7439 vm_map_unwire(
7440 vm_map_t map,
7441 vm_map_offset_t start,
7442 vm_map_offset_t end,
7443 boolean_t user_wire)
7444 {
7445 return vm_map_unwire_nested(map, start, end,
7446 user_wire, (pmap_t)NULL, 0);
7447 }
7448
7449
7450 /*
7451 * vm_map_entry_zap: [ internal use only ]
7452 *
7453 * Remove the entry from the target map
7454 * and put it on a zap list.
7455 */
7456 static void
vm_map_entry_zap(vm_map_t map,vm_map_entry_t entry,vm_map_zap_t zap)7457 vm_map_entry_zap(
7458 vm_map_t map,
7459 vm_map_entry_t entry,
7460 vm_map_zap_t zap)
7461 {
7462 vm_map_offset_t s, e;
7463
7464 s = entry->vme_start;
7465 e = entry->vme_end;
7466 assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7467 assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7468 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7469 assert(page_aligned(s));
7470 assert(page_aligned(e));
7471 }
7472 if (entry->map_aligned == TRUE) {
7473 assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7474 assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7475 }
7476 assert(entry->wired_count == 0);
7477 assert(entry->user_wired_count == 0);
7478 assert(!entry->permanent);
7479
7480 vm_map_store_entry_unlink(map, entry);
7481 map->size -= e - s;
7482
7483 vm_map_zap_append(zap, entry);
7484 }
7485
/*
 * vm_map_submap_pmap_clean:
 *
 * Remove the physical-map translations for the portion of "sub_map"
 * that backs the parent-map range [start, end), where "offset" is the
 * address within the submap corresponding to "start" in the parent.
 * Recurses through nested submaps.  Takes the submap's lock for read;
 * the caller holds whatever locks protect the parent range.
 */
static void
vm_map_submap_pmap_clean(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_t        sub_map,
	vm_map_offset_t offset)
{
	vm_map_offset_t submap_start;
	vm_map_offset_t submap_end;
	vm_map_size_t   remove_size;
	vm_map_entry_t  entry;

	/* translate the parent range into submap address space */
	submap_end = offset + (end - start);
	submap_start = offset;

	vm_map_lock_read(sub_map);
	if (vm_map_lookup_entry(sub_map, offset, &entry)) {
		/*
		 * First entry: clip the amount to clean to the overlap
		 * between this entry and [submap_start, submap_end).
		 */
		remove_size = (entry->vme_end - entry->vme_start);
		if (offset > entry->vme_start) {
			remove_size -= offset - entry->vme_start;
		}


		if (submap_end < entry->vme_end) {
			remove_size -=
			    entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			/* nested submap: recurse with translated range */
			vm_map_submap_pmap_clean(
				sub_map,
				start,
				start + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/*
				 * The parent map may be nested in other pmaps:
				 * strip the mappings at the object level so
				 * every pmap that maps the object is cleaned.
				 */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					(VME_OFFSET(entry) +
					offset -
					entry->vme_start),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				/* only our own pmap maps this: remove directly */
				pmap_remove(map->pmap,
				    (addr64_t)start,
				    (addr64_t)(start + remove_size));
			}
		}
	}

	/*
	 * Walk the remaining entries overlapping the range.  If the
	 * lookup above failed, "entry" is the entry preceding "offset",
	 * so stepping to vme_next lands on the first candidate.
	 */
	entry = entry->vme_next;

	while ((entry != vm_map_to_entry(sub_map))
	    && (entry->vme_start < submap_end)) {
		remove_size = (entry->vme_end - entry->vme_start);
		if (submap_end < entry->vme_end) {
			remove_size -= entry->vme_end - submap_end;
		}
		if (entry->is_sub_map) {
			vm_map_submap_pmap_clean(
				sub_map,
				(start + entry->vme_start) - offset,
				((start + entry->vme_start) - offset) + remove_size,
				VME_SUBMAP(entry),
				VME_OFFSET(entry));
		} else {
			if (map->mapped_in_other_pmaps &&
			    os_ref_get_count_raw(&map->map_refcnt) != 0 &&
			    VME_OBJECT(entry) != NULL) {
				/* same reasoning as above: clean via the object */
				vm_object_pmap_protect_options(
					VME_OBJECT(entry),
					VME_OFFSET(entry),
					remove_size,
					PMAP_NULL,
					PAGE_SIZE,
					entry->vme_start,
					VM_PROT_NONE,
					PMAP_OPTIONS_REMOVE);
			} else {
				pmap_remove(map->pmap,
				    (addr64_t)((start + entry->vme_start)
				    - offset),
				    (addr64_t)(((start + entry->vme_start)
				    - offset) + remove_size));
			}
		}
		entry = entry->vme_next;
	}
	vm_map_unlock_read(sub_map);
	return;
}
7585
7586 /*
7587 * virt_memory_guard_ast:
7588 *
7589 * Handle the AST callout for a virtual memory guard.
7590 * raise an EXC_GUARD exception and terminate the task
7591 * if configured to do so.
7592 */
7593 void
virt_memory_guard_ast(thread_t thread,mach_exception_data_type_t code,mach_exception_data_type_t subcode)7594 virt_memory_guard_ast(
7595 thread_t thread,
7596 mach_exception_data_type_t code,
7597 mach_exception_data_type_t subcode)
7598 {
7599 task_t task = get_threadtask(thread);
7600 assert(task != kernel_task);
7601 assert(task == current_task());
7602 kern_return_t sync_exception_result;
7603 uint32_t behavior;
7604
7605 behavior = task->task_exc_guard;
7606
7607 /* Is delivery enabled */
7608 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7609 return;
7610 }
7611
7612 /* If only once, make sure we're that once */
7613 while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7614 uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7615
7616 if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7617 break;
7618 }
7619 behavior = task->task_exc_guard;
7620 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7621 return;
7622 }
7623 }
7624
7625 /* Raise exception synchronously and see if handler claimed it */
7626 sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode);
7627
7628 if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7629 /*
7630 * If Synchronous EXC_GUARD delivery was successful then
7631 * kill the process and return, else kill the process
7632 * and deliver the exception via EXC_CORPSE_NOTIFY.
7633 */
7634 if (sync_exception_result == KERN_SUCCESS) {
7635 task_bsdtask_kill(current_task());
7636 } else {
7637 exit_with_guard_exception(current_proc(), code, subcode);
7638 }
7639 } else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
7640 /*
7641 * If the synchronous EXC_GUARD delivery was not successful,
7642 * raise a simulated crash.
7643 */
7644 if (sync_exception_result != KERN_SUCCESS) {
7645 task_violated_guard(code, subcode, NULL);
7646 }
7647 }
7648 }
7649
7650 /*
7651 * vm_map_guard_exception:
7652 *
7653 * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
7654 *
7655 * Right now, we do this when we find nothing mapped, or a
7656 * gap in the mapping when a user address space deallocate
7657 * was requested. We report the address of the first gap found.
7658 */
7659 static void
vm_map_guard_exception(vm_map_offset_t gap_start,unsigned reason)7660 vm_map_guard_exception(
7661 vm_map_offset_t gap_start,
7662 unsigned reason)
7663 {
7664 mach_exception_code_t code = 0;
7665 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
7666 unsigned int target = 0; /* should we pass in pid associated with map? */
7667 mach_exception_data_type_t subcode = (uint64_t)gap_start;
7668 boolean_t fatal = FALSE;
7669
7670 task_t task = current_task_early();
7671
7672 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
7673 if (task == NULL || task == kernel_task) {
7674 return;
7675 }
7676
7677 EXC_GUARD_ENCODE_TYPE(code, guard_type);
7678 EXC_GUARD_ENCODE_FLAVOR(code, reason);
7679 EXC_GUARD_ENCODE_TARGET(code, target);
7680
7681 if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
7682 fatal = TRUE;
7683 }
7684 thread_guard_violation(current_thread(), code, subcode, fatal);
7685 }
7686
__abortlike
static void
__vm_map_delete_gap_panic(
	vm_map_t        map,
	vm_map_offset_t where,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	/*
	 * Fatal: vm_map_delete() found no entry at "where" while tearing
	 * down [start, end) in a kernel-pmap map, where gaps are illegal.
	 */
	panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
	    map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
}
7698
__abortlike
static void
__vm_map_delete_permanent_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_entry_t  entry)
{
	/*
	 * Fatal: in the kernel map (and its submaps) permanent entries
	 * may never be removed, regardless of the removal flags.
	 */
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
7713
__abortlike
static void
__vm_map_delete_loose_atomic_panic(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end,
	vm_map_entry_t  entry)
{
	/*
	 * Fatal: an atomic entry in a kernel-pmap map may only be removed
	 * by a request that targets exactly its [vme_start, vme_end) range;
	 * this request merely overlapped it.
	 */
	panic("vm_map_delete(%p,0x%llx,0x%llx): "
	    "request loosely encompasses atomic entry %p at (0x%llx,0x%llx)",
	    map, (uint64_t)start, (uint64_t)end, entry,
	    (uint64_t)entry->vme_start,
	    (uint64_t)entry->vme_end);
}
7728
/* Internal state flags carried across iterations of vm_map_delete(). */
__options_decl(vm_map_delete_state_t, uint32_t, {
	VMDS_NONE          = 0x0000,

	VMDS_FOUND_GAP     = 0x0001, /* a hole was found in the requested range */
	VMDS_GAPS_OK       = 0x0002, /* map is being torn down: gaps are expected */

	VMDS_KERNEL_PMAP   = 0x0004, /* map uses the kernel pmap: strict rules apply */
	VMDS_NEEDS_LOOKUP  = 0x0008, /* map lock was dropped: re-lookup the entry */
	VMDS_NEEDS_WAKEUP  = 0x0010, /* some entry had waiters: wake them before exit */
});
7739
7740 /*
7741 * vm_map_delete: [ internal use only ]
7742 *
7743 * Deallocates the given address range from the target map.
7744 * Removes all user wirings. Unwires one kernel wiring if
7745 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
7746 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
7747 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
7748 *
7749 *
7750 * When VM_MAP_REMOVE_RETURN_ERRORS is not passed,
7751 * then any error in removing mappings will lead to a panic
7752 * so that clients do not have to repeat the panic code
7753 * at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
7754 * is also passed, then KERN_ABORTED will not lead to a panic.
7755 *
7756 * Note: at this time, there is no such condition,
7757 * that isn't already causing a panic.
7758 *
7759 * If the code is changed to add such errors later,
7760 * then the flag must be honored.
7761 *
7762 * This routine is called with map locked and leaves map locked.
7763 */
7764 static kern_return_t
vm_map_delete(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags,vm_map_zap_t zap_list)7765 vm_map_delete(
7766 vm_map_t map,
7767 vm_map_offset_t start,
7768 vm_map_offset_t end,
7769 vmr_flags_t flags,
7770 vm_map_zap_t zap_list)
7771 {
7772 vm_map_entry_t entry, next;
7773 int interruptible;
7774 vm_map_offset_t gap_start = 0;
7775 vm_map_offset_t clear_in_transition_end = 0;
7776 __unused vm_map_offset_t save_start = start;
7777 __unused vm_map_offset_t save_end = end;
7778 vm_map_delete_state_t state = VMDS_NONE;
7779
7780 if (vm_map_pmap(map) == kernel_pmap) {
7781 state |= VMDS_KERNEL_PMAP;
7782 }
7783
7784 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
7785 state |= VMDS_GAPS_OK;
7786 }
7787
7788 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
7789 THREAD_ABORTSAFE : THREAD_UNINT;
7790
7791 /*
7792 * Find the start of the region.
7793 *
7794 * If in a superpage, extend the range
7795 * to include the start of the mapping.
7796 */
7797 if (vm_map_lookup_entry_or_next(map, start, &entry)) {
7798 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
7799 start = SUPERPAGE_ROUND_DOWN(start);
7800 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
7801 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
7802 start = SUPERPAGE_ROUND_DOWN(start);
7803 continue;
7804 }
7805 break;
7806 }
7807 }
7808 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
7809 } else {
7810 if (!(state & VMDS_GAPS_OK)) {
7811 state |= VMDS_FOUND_GAP;
7812 gap_start = start;
7813 }
7814 }
7815
7816 if (entry->superpage_size) {
7817 end = SUPERPAGE_ROUND_UP(end);
7818 }
7819
7820 /*
7821 * Step through all entries in this region
7822 */
7823 for (vm_map_offset_t s = start; s < end;) {
7824 /*
7825 * At this point, we have deleted all the memory entries
7826 * in [start, s) and are proceeding with the [s, end) range.
7827 *
7828 * This loop might drop the map lock, and it is possible that
7829 * some memory was already reallocated within [start, s)
7830 * and we don't want to mess with those entries.
7831 *
7832 * Some of those entries could even have been re-assembled
7833 * with an entry after "s" (in vm_map_simplify_entry()), so
7834 * we may have to vm_map_clip_start() again.
7835 *
7836 * When clear_in_transition_end is set, the we had marked
7837 * [start, clear_in_transition_end) as "in_transition"
7838 * during a previous iteration and we need to clear it.
7839 */
7840
7841 /*
7842 * Step 1: If needed (because we dropped locks),
7843 * lookup the entry again.
7844 *
7845 * If we're coming back from unwiring (Step 5),
7846 * we also need to mark the entries as no longer
7847 * in transition after that.
7848 */
7849
7850 if (state & VMDS_NEEDS_LOOKUP) {
7851 state &= ~VMDS_NEEDS_LOOKUP;
7852
7853 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
7854 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
7855 }
7856 }
7857
7858 if (clear_in_transition_end) {
7859 for (vm_map_entry_t it = entry;
7860 it != vm_map_to_entry(map) &&
7861 it->vme_start < clear_in_transition_end;
7862 it = it->vme_next) {
7863 assert(it->in_transition);
7864 it->in_transition = FALSE;
7865 if (it->needs_wakeup) {
7866 it->needs_wakeup = FALSE;
7867 state |= VMDS_NEEDS_WAKEUP;
7868 }
7869 }
7870
7871 clear_in_transition_end = 0;
7872 }
7873
7874
7875 /*
7876 * Step 2: Perform various policy checks
7877 * before we do _anything_ to this entry.
7878 */
7879
7880 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
7881 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
7882 /*
7883 * Either we found a gap already,
7884 * or we are tearing down a map,
7885 * keep going.
7886 */
7887 } else if (state & VMDS_KERNEL_PMAP) {
7888 __vm_map_delete_gap_panic(map, s, start, end);
7889 } else if (vm_map_round_page(s, VM_MAP_PAGE_MASK(map)) < end) {
7890 /*
7891 * The vm_map_round_page() is needed since an entry
7892 * can be less than VM_MAP_PAGE_MASK() sized.
7893 *
7894 * For example, devices which have h/w 4K pages,
7895 * but entry sizes are all now 16K.
7896 */
7897 state |= VMDS_FOUND_GAP;
7898 gap_start = s;
7899 }
7900
7901 if (entry == vm_map_to_entry(map) ||
7902 end <= entry->vme_start) {
7903 break;
7904 }
7905
7906 s = entry->vme_start;
7907 }
7908
7909 if (state & VMDS_KERNEL_PMAP) {
7910 /*
7911 * In the kernel map and its submaps,
7912 * permanent entries never die, even
7913 * if VM_MAP_REMOVE_IMMUTABLE is passed.
7914 */
7915 if (entry->permanent) {
7916 __vm_map_delete_permanent_panic(map, start, end, entry);
7917 }
7918
7919 /*
7920 * In the kernel map and its submaps,
7921 * the removal of an atomic entry is strict.
7922 *
7923 * An atomic entry is processed only if it was
7924 * specifically targeted.
7925 *
7926 * We might have deleted non-atomic entries before
7927 * we reach this this point however...
7928 */
7929 if (entry->vme_atomic &&
7930 (entry->vme_start != start || entry->vme_end != end)) {
7931 __vm_map_delete_loose_atomic_panic(map,
7932 start, end, entry);
7933 }
7934 }
7935
7936
7937 /*
7938 * Step 3: Perform any clipping needed.
7939 *
7940 * After this, "entry" starts at "s", ends before "end"
7941 */
7942
7943 if (entry->vme_start < s) {
7944 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
7945 entry->map_aligned &&
7946 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
7947 /*
7948 * The entry will no longer be map-aligned
7949 * after clipping and the caller said it's OK.
7950 */
7951 entry->map_aligned = FALSE;
7952 }
7953 vm_map_clip_start(map, entry, s);
7954 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
7955 }
7956
7957 if (end < entry->vme_end) {
7958 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
7959 entry->map_aligned &&
7960 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
7961 /*
7962 * The entry will no longer be map-aligned
7963 * after clipping and the caller said it's OK.
7964 */
7965 entry->map_aligned = FALSE;
7966 }
7967 vm_map_clip_end(map, entry, end);
7968 }
7969
7970 assert(s == entry->vme_start);
7971 assert(entry->vme_end <= end);
7972
7973
7974 /*
7975 * Step 4: If the entry is in flux, wait for this to resolve.
7976 */
7977
7978 if (entry->in_transition) {
7979 wait_result_t wait_result;
7980
7981 /*
7982 * Another thread is wiring/unwiring this entry.
7983 * Let the other thread know we are waiting.
7984 */
7985
7986 entry->needs_wakeup = TRUE;
7987
7988 /*
7989 * wake up anybody waiting on entries that we have
7990 * already unwired/deleted.
7991 */
7992 if (state & VMDS_NEEDS_WAKEUP) {
7993 vm_map_entry_wakeup(map);
7994 state &= ~VMDS_NEEDS_WAKEUP;
7995 }
7996
7997 wait_result = vm_map_entry_wait(map, interruptible);
7998
7999 if (interruptible &&
8000 wait_result == THREAD_INTERRUPTED) {
8001 /*
8002 * We do not clear the needs_wakeup flag,
8003 * since we cannot tell if we were the only one.
8004 */
8005 return KERN_ABORTED;
8006 }
8007
8008 /*
8009 * The entry could have been clipped or it
8010 * may not exist anymore. Look it up again.
8011 */
8012 state |= VMDS_NEEDS_LOOKUP;
8013 continue;
8014 }
8015
8016
8017 /*
8018 * Step 5: Handle wiring
8019 */
8020
8021 if (entry->wired_count) {
8022 struct vm_map_entry tmp_entry;
8023 boolean_t user_wire;
8024 unsigned int last_timestamp;
8025
8026 user_wire = entry->user_wired_count > 0;
8027
8028 /*
8029 * Remove a kernel wiring if requested
8030 */
8031 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8032 entry->wired_count--;
8033 }
8034
8035 /*
8036 * Remove all user wirings for proper accounting
8037 */
8038 while (entry->user_wired_count) {
8039 subtract_wire_counts(map, entry, user_wire);
8040 }
8041
8042 /*
8043 * All our DMA I/O operations in IOKit are currently
8044 * done by wiring through the map entries of the task
8045 * requesting the I/O.
8046 *
8047 * Because of this, we must always wait for kernel wirings
8048 * to go away on the entries before deleting them.
8049 *
8050 * Any caller who wants to actually remove a kernel wiring
8051 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8052 * properly remove one wiring instead of blasting through
8053 * them all.
8054 */
8055 if (entry->wired_count != 0) {
8056 assert(map != kernel_map);
8057 /*
8058 * Cannot continue. Typical case is when
8059 * a user thread has physical io pending on
8060 * on this page. Either wait for the
8061 * kernel wiring to go away or return an
8062 * error.
8063 */
8064 wait_result_t wait_result;
8065
8066 entry->needs_wakeup = TRUE;
8067 wait_result = vm_map_entry_wait(map,
8068 interruptible);
8069
8070 if (interruptible &&
8071 wait_result == THREAD_INTERRUPTED) {
8072 /*
8073 * We do not clear the
8074 * needs_wakeup flag, since we
8075 * cannot tell if we were the
8076 * only one.
8077 */
8078 return KERN_ABORTED;
8079 }
8080
8081
8082 /*
8083 * The entry could have been clipped or
8084 * it may not exist anymore. Look it
8085 * up again.
8086 */
8087 state |= VMDS_NEEDS_LOOKUP;
8088 continue;
8089 }
8090
8091 /*
8092 * We can unlock the map now.
8093 *
8094 * The entry might be split once we unlock the map,
8095 * but we need the range as defined by this entry
8096 * to be stable. So we must make a local copy.
8097 *
8098 * The underlying objects do not change during clips,
8099 * and the in_transition state guarentees existence
8100 * of the entry.
8101 */
8102 last_timestamp = map->timestamp;
8103 entry->in_transition = TRUE;
8104 tmp_entry = *entry;
8105 vm_map_unlock(map);
8106
8107 if (tmp_entry.is_sub_map) {
8108 vm_map_t sub_map;
8109 vm_map_offset_t sub_start, sub_end;
8110 pmap_t pmap;
8111 vm_map_offset_t pmap_addr;
8112
8113
8114 sub_map = VME_SUBMAP(&tmp_entry);
8115 sub_start = VME_OFFSET(&tmp_entry);
8116 sub_end = sub_start + (tmp_entry.vme_end -
8117 tmp_entry.vme_start);
8118 if (tmp_entry.use_pmap) {
8119 pmap = sub_map->pmap;
8120 pmap_addr = tmp_entry.vme_start;
8121 } else {
8122 pmap = map->pmap;
8123 pmap_addr = tmp_entry.vme_start;
8124 }
8125 (void) vm_map_unwire_nested(sub_map,
8126 sub_start, sub_end,
8127 user_wire,
8128 pmap, pmap_addr);
8129 } else {
8130 if (VME_OBJECT(&tmp_entry) == kernel_object) {
8131 pmap_protect_options(
8132 map->pmap,
8133 tmp_entry.vme_start,
8134 tmp_entry.vme_end,
8135 VM_PROT_NONE,
8136 PMAP_OPTIONS_REMOVE,
8137 NULL);
8138 }
8139 vm_fault_unwire(map, &tmp_entry,
8140 VME_OBJECT(&tmp_entry) == kernel_object,
8141 map->pmap, tmp_entry.vme_start);
8142 }
8143
8144 vm_map_lock(map);
8145
8146 /*
8147 * Unwiring happened, we can now go back to deleting
8148 * them (after we clear the in_transition bit for the range).
8149 */
8150 if (last_timestamp + 1 != map->timestamp) {
8151 state |= VMDS_NEEDS_LOOKUP;
8152 }
8153 clear_in_transition_end = tmp_entry.vme_end;
8154 continue;
8155 }
8156
8157 assert(entry->wired_count == 0);
8158 assert(entry->user_wired_count == 0);
8159
8160
8161 /*
8162 * Step 6: Entry is unwired and ready for us to delete !
8163 */
8164
8165 if (!entry->permanent) {
8166 /*
8167 * Typical case: the entry really shouldn't be permanent
8168 */
8169 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8170 #if 0
8171 printf("FBDP %d[%s] removing permanent entry "
8172 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8173 proc_selfpid(),
8174 (current_task()->bsd_info
8175 ? proc_name_address(current_task()->bsd_info)
8176 : "?"), entry,
8177 (uint64_t)entry->vme_start,
8178 (uint64_t)entry->vme_end,
8179 entry->protection,
8180 entry->max_protection);
8181 #endif
8182 entry->permanent = FALSE;
8183 } else {
8184 /*
8185 * dtrace -n 'vm_map_delete_permanent {
8186 * print("start=0x%llx end=0x%llx prot=0x%x/0x%x\n", arg0, arg1, arg2, arg3);
8187 * stack();
8188 * ustack();
8189 * }'
8190 */
8191 DTRACE_VM5(vm_map_delete_permanent,
8192 vm_map_offset_t, entry->vme_start,
8193 vm_map_offset_t, entry->vme_end,
8194 vm_prot_t, entry->protection,
8195 vm_prot_t, entry->max_protection,
8196 int, VME_ALIAS(entry));
8197 }
8198
8199 if (entry->is_sub_map) {
8200 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8201 "map %p (%d) entry %p submap %p (%d)\n",
8202 map, VM_MAP_PAGE_SHIFT(map), entry,
8203 VME_SUBMAP(entry),
8204 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8205 if (entry->use_pmap) {
8206 #ifndef NO_NESTED_PMAP
8207 int pmap_flags;
8208
8209 if (map->terminated) {
8210 /*
8211 * This is the final cleanup of the
8212 * address space being terminated.
8213 * No new mappings are expected and
8214 * we don't really need to unnest the
8215 * shared region (and lose the "global"
8216 * pmap mappings, if applicable).
8217 *
8218 * Tell the pmap layer that we're
8219 * "clean" wrt nesting.
8220 */
8221 pmap_flags = PMAP_UNNEST_CLEAN;
8222 } else {
8223 /*
8224 * We're unmapping part of the nested
8225 * shared region, so we can't keep the
8226 * nested pmap.
8227 */
8228 pmap_flags = 0;
8229 }
8230 pmap_unnest_options(
8231 map->pmap,
8232 (addr64_t)entry->vme_start,
8233 entry->vme_end - entry->vme_start,
8234 pmap_flags);
8235 #endif /* NO_NESTED_PMAP */
8236 if (map->mapped_in_other_pmaps &&
8237 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8238 /* clean up parent map/maps */
8239 vm_map_submap_pmap_clean(
8240 map, entry->vme_start,
8241 entry->vme_end,
8242 VME_SUBMAP(entry),
8243 VME_OFFSET(entry));
8244 }
8245 } else {
8246 vm_map_submap_pmap_clean(
8247 map, entry->vme_start, entry->vme_end,
8248 VME_SUBMAP(entry),
8249 VME_OFFSET(entry));
8250 }
8251 } else if (VME_OBJECT(entry) == kernel_object ||
8252 VME_OBJECT(entry) == compressor_object) {
8253 /*
8254 * nothing to do
8255 */
8256 } else if (map->mapped_in_other_pmaps &&
8257 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8258 vm_object_pmap_protect_options(
8259 VME_OBJECT(entry), VME_OFFSET(entry),
8260 entry->vme_end - entry->vme_start,
8261 PMAP_NULL,
8262 PAGE_SIZE,
8263 entry->vme_start,
8264 VM_PROT_NONE,
8265 PMAP_OPTIONS_REMOVE);
8266 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8267 (state & VMDS_KERNEL_PMAP)) {
8268 /* Remove translations associated
8269 * with this range unless the entry
8270 * does not have an object, or
8271 * it's the kernel map or a descendant
8272 * since the platform could potentially
8273 * create "backdoor" mappings invisible
8274 * to the VM. It is expected that
8275 * objectless, non-kernel ranges
8276 * do not have such VM invisible
8277 * translations.
8278 */
8279 pmap_remove_options(map->pmap,
8280 (addr64_t)entry->vme_start,
8281 (addr64_t)entry->vme_end,
8282 PMAP_OPTIONS_REMOVE);
8283 }
8284
8285 #if DEBUG
8286 /*
8287 * All pmap mappings for this map entry must have been
8288 * cleared by now.
8289 */
8290 assert(pmap_is_empty(map->pmap,
8291 entry->vme_start,
8292 entry->vme_end));
8293 #endif /* DEBUG */
8294
8295 if (entry->iokit_acct) {
8296 /* alternate accounting */
8297 DTRACE_VM4(vm_map_iokit_unmapped_region,
8298 vm_map_t, map,
8299 vm_map_offset_t, entry->vme_start,
8300 vm_map_offset_t, entry->vme_end,
8301 int, VME_ALIAS(entry));
8302 vm_map_iokit_unmapped_region(map,
8303 (entry->vme_end -
8304 entry->vme_start));
8305 entry->iokit_acct = FALSE;
8306 entry->use_pmap = FALSE;
8307 }
8308
8309 s = entry->vme_end;
8310 next = entry->vme_next;
8311
8312 if (entry->permanent) {
8313 /*
8314 * A permanent entry can not be removed, so leave it
8315 * in place but remove all access permissions.
8316 */
8317 entry->protection = VM_PROT_NONE;
8318 entry->max_protection = VM_PROT_NONE;
8319 } else {
8320 vm_map_entry_zap(map, entry, zap_list);
8321 }
8322
8323 entry = next;
8324
8325 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8326 unsigned int last_timestamp = map->timestamp++;
8327
8328 if (lck_rw_lock_yield_exclusive(&map->lock,
8329 LCK_RW_YIELD_ANY_WAITER)) {
8330 if (last_timestamp != map->timestamp + 1) {
8331 state |= VMDS_NEEDS_LOOKUP;
8332 }
8333 } else {
8334 /* we didn't yield, undo our change */
8335 map->timestamp--;
8336 }
8337 }
8338 }
8339
8340 if (map->wait_for_space) {
8341 thread_wakeup((event_t) map);
8342 }
8343
8344 if (state & VMDS_NEEDS_WAKEUP) {
8345 vm_map_entry_wakeup(map);
8346 }
8347
8348 if (state & VMDS_FOUND_GAP) {
8349 DTRACE_VM3(kern_vm_deallocate_gap,
8350 vm_map_offset_t, gap_start,
8351 vm_map_offset_t, save_start,
8352 vm_map_offset_t, save_end);
8353 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
8354 return KERN_INVALID_VALUE;
8355 } else {
8356 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
8357 }
8358 }
8359
8360 return KERN_SUCCESS;
8361 }
8362
8363
8364 /*
8365 * vm_map_terminate:
8366 *
8367 * Clean out a task's map.
8368 */
8369 kern_return_t
vm_map_terminate(vm_map_t map)8370 vm_map_terminate(
8371 vm_map_t map)
8372 {
8373 vm_map_lock(map);
8374 map->terminated = TRUE;
8375 vm_map_disable_hole_optimization(map);
8376 vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
8377 VM_MAP_REMOVE_NO_FLAGS);
8378 return KERN_SUCCESS;
8379 }
8380
8381 /*
8382 * vm_map_remove:
8383 *
8384 * Remove the given address range from the target map.
8385 * This is the exported form of vm_map_delete.
8386 */
8387 kern_return_t
vm_map_remove_flags(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags)8388 vm_map_remove_flags(
8389 vm_map_t map,
8390 vm_map_offset_t start,
8391 vm_map_offset_t end,
8392 vmr_flags_t flags)
8393 {
8394 vm_map_lock(map);
8395 return vm_map_remove_and_unlock(map, start, end, flags);
8396 }
8397
8398 /*
8399 * vm_map_remove_locked:
8400 *
8401 * Remove the given address range from the target locked map.
8402 * This is the exported form of vm_map_delete.
8403 */
8404 kern_return_t
vm_map_remove_and_unlock(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vmr_flags_t flags)8405 vm_map_remove_and_unlock(
8406 vm_map_t map,
8407 vm_map_offset_t start,
8408 vm_map_offset_t end,
8409 vmr_flags_t flags)
8410 {
8411 VM_MAP_ZAP_DECLARE(zap);
8412 kern_return_t result;
8413
8414 VM_MAP_RANGE_CHECK(map, start, end);
8415 result = vm_map_delete(map, start, end, flags, &zap);
8416 vm_map_unlock(map);
8417
8418 vm_map_zap_dispose(&zap);
8419
8420 return result;
8421 }
8422
8423
8424 /*
8425 * Routine: vm_map_copy_allocate
8426 *
8427 * Description:
8428 * Allocates and initializes a map copy object.
8429 */
8430 static vm_map_copy_t
vm_map_copy_allocate(void)8431 vm_map_copy_allocate(void)
8432 {
8433 vm_map_copy_t new_copy;
8434
8435 new_copy = zalloc_flags(vm_map_copy_zone, Z_WAITOK | Z_ZERO);
8436 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
8437 vm_map_copy_first_entry(new_copy) = vm_map_copy_to_entry(new_copy);
8438 vm_map_copy_last_entry(new_copy) = vm_map_copy_to_entry(new_copy);
8439 return new_copy;
8440 }
8441
8442 /*
8443 * Routine: vm_map_copy_discard
8444 *
8445 * Description:
8446 * Dispose of a map copy object (returned by
8447 * vm_map_copyin).
8448 */
8449 void
vm_map_copy_discard(vm_map_copy_t copy)8450 vm_map_copy_discard(
8451 vm_map_copy_t copy)
8452 {
8453 if (copy == VM_MAP_COPY_NULL) {
8454 return;
8455 }
8456
8457 /*
8458 * Assert that the vm_map_copy is coming from the right
8459 * zone and hasn't been forged
8460 */
8461 vm_map_copy_require(copy);
8462
8463 switch (copy->type) {
8464 case VM_MAP_COPY_ENTRY_LIST:
8465 while (vm_map_copy_first_entry(copy) !=
8466 vm_map_copy_to_entry(copy)) {
8467 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
8468
8469 vm_map_copy_entry_unlink(copy, entry);
8470 if (entry->is_sub_map) {
8471 vm_map_deallocate(VME_SUBMAP(entry));
8472 } else {
8473 vm_object_deallocate(VME_OBJECT(entry));
8474 }
8475 vm_map_copy_entry_dispose(entry);
8476 }
8477 break;
8478 case VM_MAP_COPY_OBJECT:
8479 vm_object_deallocate(copy->cpy_object);
8480 break;
8481 case VM_MAP_COPY_KERNEL_BUFFER:
8482
8483 /*
8484 * The vm_map_copy_t and possibly the data buffer were
8485 * allocated by a single call to kalloc_data(), i.e. the
8486 * vm_map_copy_t was not allocated out of the zone.
8487 */
8488 if (copy->size > msg_ool_size_small || copy->offset) {
8489 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
8490 (long long)copy->size, (long long)copy->offset);
8491 }
8492 kfree_data(copy->cpy_kdata, copy->size);
8493 }
8494 zfree(vm_map_copy_zone, copy);
8495 }
8496
8497 /*
8498 * Routine: vm_map_copy_copy
8499 *
8500 * Description:
8501 * Move the information in a map copy object to
8502 * a new map copy object, leaving the old one
8503 * empty.
8504 *
8505 * This is used by kernel routines that need
8506 * to look at out-of-line data (in copyin form)
8507 * before deciding whether to return SUCCESS.
8508 * If the routine returns FAILURE, the original
8509 * copy object will be deallocated; therefore,
8510 * these routines must make a copy of the copy
8511 * object and leave the original empty so that
8512 * deallocation will not fail.
8513 */
8514 vm_map_copy_t
vm_map_copy_copy(vm_map_copy_t copy)8515 vm_map_copy_copy(
8516 vm_map_copy_t copy)
8517 {
8518 vm_map_copy_t new_copy;
8519
8520 if (copy == VM_MAP_COPY_NULL) {
8521 return VM_MAP_COPY_NULL;
8522 }
8523
8524 /*
8525 * Assert that the vm_map_copy is coming from the right
8526 * zone and hasn't been forged
8527 */
8528 vm_map_copy_require(copy);
8529
8530 /*
8531 * Allocate a new copy object, and copy the information
8532 * from the old one into it.
8533 */
8534
8535 new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
8536 memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
8537 #if __has_feature(ptrauth_calls)
8538 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
8539 new_copy->cpy_kdata = copy->cpy_kdata;
8540 }
8541 #endif
8542
8543 if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
8544 /*
8545 * The links in the entry chain must be
8546 * changed to point to the new copy object.
8547 */
8548 vm_map_copy_first_entry(copy)->vme_prev
8549 = vm_map_copy_to_entry(new_copy);
8550 vm_map_copy_last_entry(copy)->vme_next
8551 = vm_map_copy_to_entry(new_copy);
8552 }
8553
8554 /*
8555 * Change the old copy object into one that contains
8556 * nothing to be deallocated.
8557 */
8558 copy->type = VM_MAP_COPY_OBJECT;
8559 copy->cpy_object = VM_OBJECT_NULL;
8560
8561 /*
8562 * Return the new object.
8563 */
8564 return new_copy;
8565 }
8566
8567 static boolean_t
vm_map_entry_is_overwritable(vm_map_t dst_map __unused,vm_map_entry_t entry)8568 vm_map_entry_is_overwritable(
8569 vm_map_t dst_map __unused,
8570 vm_map_entry_t entry)
8571 {
8572 if (!(entry->protection & VM_PROT_WRITE)) {
8573 /* can't overwrite if not writable */
8574 return FALSE;
8575 }
8576 #if !__x86_64__
8577 if (entry->used_for_jit &&
8578 vm_map_cs_enforcement(dst_map) &&
8579 !dst_map->cs_debugged) {
8580 /*
8581 * Can't overwrite a JIT region while cs_enforced
8582 * and not cs_debugged.
8583 */
8584 return FALSE;
8585 }
8586 #endif /* !__x86_64__ */
8587 return TRUE;
8588 }
8589
/*
 * Recursively verify that [dst_addr, dst_addr + dst_size) in "dst_map"
 * — including any ranges that fall inside nested submaps — is entirely
 * mapped, contiguous, writable and overwritable, so that a subsequent
 * copy-overwrite can proceed.  Returns KERN_SUCCESS when the whole
 * range qualifies; KERN_INVALID_ADDRESS, KERN_PROTECTION_FAILURE or
 * KERN_FAILURE otherwise.  Takes and releases the dst_map lock itself.
 */
static kern_return_t
vm_map_overwrite_submap_recurse(
	vm_map_t        dst_map,
	vm_map_offset_t dst_addr,
	vm_map_size_t   dst_size)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   result;
	boolean_t       encountered_sub_map = FALSE;



	/*
	 * Verify that the destination is all writeable
	 * initially. We have to trunc the destination
	 * address and round the copy size or we'll end up
	 * splitting entries in strange ways.
	 */

	dst_end = vm_map_round_page(dst_addr + dst_size,
	    VM_MAP_PAGE_MASK(dst_map));
	vm_map_lock(dst_map);

	/* restart point: taken whenever we had to sleep on an in-transition
	 * entry — anything may have changed while the map was unlocked */
start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	if (tmp_entry->is_sub_map) {
		/* clipping did unnest if needed */
		assert(!tmp_entry->use_pmap);
	}

	for (entry = tmp_entry;;) {
		vm_map_entry_t  next;

		next = entry->vme_next;
		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			encountered_sub_map = TRUE;
			sub_start = VME_OFFSET(entry);

			/* clamp the submap range to the portion we care about,
			 * then rebase it into submap offsets */
			if (entry->vme_end < dst_end) {
				sub_end = entry->vme_end;
			} else {
				sub_end = dst_end;
			}
			sub_end -= entry->vme_start;
			sub_end += VME_OFFSET(entry);
			local_end = entry->vme_end;
			vm_map_unlock(dst_map);

			result = vm_map_overwrite_submap_recurse(
				VME_SUBMAP(entry),
				sub_start,
				sub_end - sub_start);

			if (result != KERN_SUCCESS) {
				return result;
			}
			/*
			 * NOTE(review): "entry" is dereferenced here after the
			 * map was unlocked above; this relies on the entry not
			 * being torn down concurrently — confirm the locking
			 * assumptions of the callers.
			 */
			if (dst_end <= entry->vme_end) {
				return KERN_SUCCESS;
			}
			vm_map_lock(dst_map);
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			entry = tmp_entry;
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * If the entry is in transition, we must wait
		 * for it to exit that state.  Anything could happen
		 * when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

		/*
		 * our range is contained completely within this map entry
		 */
		if (dst_end <= entry->vme_end) {
			vm_map_unlock(dst_map);
			return KERN_SUCCESS;
		}
		/*
		 * check that range specified is contiguous region
		 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}

		/*
		 * Check for permanent objects in the destination.
		 * A range that crossed a submap cannot be overwritten
		 * if it also covers permanent (non-internal or
		 * truly-shared) objects.
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			if (encountered_sub_map) {
				vm_map_unlock(dst_map);
				return KERN_FAILURE;
			}
		}


		entry = next;
	}/* for */
	/* NOTREACHED: the loop above has no break and only exits via return */
	vm_map_unlock(dst_map);
	return KERN_SUCCESS;
}
8742
8743 /*
8744 * Routine: vm_map_copy_overwrite
8745 *
8746 * Description:
8747 * Copy the memory described by the map copy
8748 * object (copy; returned by vm_map_copyin) onto
8749 * the specified destination region (dst_map, dst_addr).
8750 * The destination must be writeable.
8751 *
8752 * Unlike vm_map_copyout, this routine actually
8753 * writes over previously-mapped memory. If the
8754 * previous mapping was to a permanent (user-supplied)
8755 * memory object, it is preserved.
8756 *
8757 * The attributes (protection and inheritance) of the
8758 * destination region are preserved.
8759 *
8760 * If successful, consumes the copy object.
8761 * Otherwise, the caller is responsible for it.
8762 *
8763 * Implementation notes:
8764 * To overwrite aligned temporary virtual memory, it is
8765 * sufficient to remove the previous mapping and insert
8766 * the new copy. This replacement is done either on
8767 * the whole region (if no permanent virtual memory
8768 * objects are embedded in the destination region) or
8769 * in individual map entries.
8770 *
 * To overwrite permanent virtual memory, it is necessary
8772 * to copy each page, as the external memory management
8773 * interface currently does not provide any optimizations.
8774 *
8775 * Unaligned memory also has to be copied. It is possible
8776 * to use 'vm_trickery' to copy the aligned data. This is
8777 * not done but not hard to implement.
8778 *
8779 * Once a page of permanent memory has been overwritten,
8780 * it is impossible to interrupt this function; otherwise,
8781 * the call would be neither atomic nor location-independent.
8782 * The kernel-state portion of a user thread must be
8783 * interruptible.
8784 *
8785 * It may be expensive to forward all requests that might
8786 * overwrite permanent memory (vm_write, vm_copy) to
8787 * uninterruptible kernel threads. This routine may be
8788 * called by interruptible threads; however, success is
8789 * not guaranteed -- if the request cannot be performed
8790 * atomically and interruptibly, an error indication is
8791 * returned.
8792 *
8793 * Callers of this function must call vm_map_copy_require on
8794 * previously created vm_map_copy_t or pass a newly created
8795 * one to ensure that it hasn't been forged.
8796 */
8797
/*
 * Core of vm_map_copy_overwrite (see the block comment above): overwrite
 * [dst_addr, dst_addr + copy->size) in "dst_map" with the contents of
 * "copy", recursing into submaps and splitting the copy object as needed.
 * When "pmap" is non-NULL it overrides the destination map's pmap for the
 * physical copy.  On success the copy object is consumed iff
 * "discard_on_success"; on failure the copy object is reassembled and
 * left to the caller.
 */
static kern_return_t
vm_map_copy_overwrite_nested(
	vm_map_t                dst_map,
	vm_map_address_t        dst_addr,
	vm_map_copy_t           copy,
	boolean_t               interruptible,
	pmap_t                  pmap,
	boolean_t               discard_on_success)
{
	vm_map_offset_t dst_end;
	vm_map_entry_t  tmp_entry;
	vm_map_entry_t  entry;
	kern_return_t   kr;
	boolean_t       aligned = TRUE;
	boolean_t       contains_permanent_objects = FALSE;
	boolean_t       encountered_sub_map = FALSE;
	vm_map_offset_t base_addr;
	vm_map_size_t   copy_size;
	vm_map_size_t   total_size;
	uint16_t        copy_page_shift;

	/*
	 * Check for special kernel buffer allocated
	 * by new_ipc_kmsg_copyin.
	 */

	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		return vm_map_copyout_kernel_buffer(
			dst_map, &dst_addr,
			copy, copy->size, TRUE, discard_on_success);
	}

	/*
	 * Only works for entry lists at the moment.  Will
	 * support page lists later.
	 */

	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);

	if (copy->size == 0) {
		/* nothing to copy; consume the copy object if asked to */
		if (discard_on_success) {
			vm_map_copy_discard(copy);
		}
		return KERN_SUCCESS;
	}

	copy_page_shift = copy->cpy_hdr.page_shift;

	/*
	 * Verify that the destination is all writeable
	 * initially. We have to trunc the destination
	 * address and round the copy size or we'll end up
	 * splitting entries in strange ways.
	 */

	if (!VM_MAP_PAGE_ALIGNED(copy->size,
	    VM_MAP_PAGE_MASK(dst_map)) ||
	    !VM_MAP_PAGE_ALIGNED(copy->offset,
	    VM_MAP_PAGE_MASK(dst_map)) ||
	    !VM_MAP_PAGE_ALIGNED(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)) ||
	    copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
		aligned = FALSE;
		dst_end = vm_map_round_page(dst_addr + copy->size,
		    VM_MAP_PAGE_MASK(dst_map));
	} else {
		dst_end = dst_addr + copy->size;
	}

	vm_map_lock(dst_map);

	/* LP64todo - remove this check when vm_map_commpage64()
	 * no longer has to stuff in a map_entry for the commpage
	 * above the map's max_offset.
	 */
	if (dst_addr >= dst_map->max_offset) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * First pass: validate the destination range (writable, contiguous,
	 * overwritable) and note whether it contains submaps or permanent
	 * objects.  Restarted from here whenever we slept on an
	 * in-transition entry.
	 */
start_pass_1:
	if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
		vm_map_unlock(dst_map);
		return KERN_INVALID_ADDRESS;
	}
	vm_map_clip_start(dst_map,
	    tmp_entry,
	    vm_map_trunc_page(dst_addr,
	    VM_MAP_PAGE_MASK(dst_map)));
	for (entry = tmp_entry;;) {
		vm_map_entry_t  next = entry->vme_next;

		while (entry->is_sub_map) {
			vm_map_offset_t sub_start;
			vm_map_offset_t sub_end;
			vm_map_offset_t local_end;

			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				goto start_pass_1;
			}

			local_end = entry->vme_end;
			if (!(entry->needs_copy)) {
				/* if needs_copy we are a COW submap */
				/* in such a case we just replace so */
				/* there is no need for the follow-  */
				/* ing check.                        */
				encountered_sub_map = TRUE;
				sub_start = VME_OFFSET(entry);

				if (entry->vme_end < dst_end) {
					sub_end = entry->vme_end;
				} else {
					sub_end = dst_end;
				}
				sub_end -= entry->vme_start;
				sub_end += VME_OFFSET(entry);
				vm_map_unlock(dst_map);

				/* validate the covered part of the submap */
				kr = vm_map_overwrite_submap_recurse(
					VME_SUBMAP(entry),
					sub_start,
					sub_end - sub_start);
				if (kr != KERN_SUCCESS) {
					return kr;
				}
				vm_map_lock(dst_map);
			}

			if (dst_end <= entry->vme_end) {
				goto start_overwrite;
			}
			if (!vm_map_lookup_entry(dst_map, local_end,
			    &entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			next = entry->vme_next;
		}

		if (!(entry->protection & VM_PROT_WRITE)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		if (!vm_map_entry_is_overwritable(dst_map, entry)) {
			vm_map_unlock(dst_map);
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * If the entry is in transition, we must wait
		 * for it to exit that state.  Anything could happen
		 * when we unlock the map, so start over.
		 */
		if (entry->in_transition) {
			/*
			 * Say that we are waiting, and wait for entry.
			 */
			entry->needs_wakeup = TRUE;
			vm_map_entry_wait(dst_map, THREAD_UNINT);

			goto start_pass_1;
		}

		/*
		 * our range is contained completely within this map entry
		 */
		if (dst_end <= entry->vme_end) {
			break;
		}
		/*
		 * check that range specified is contiguous region
		 */
		if ((next == vm_map_to_entry(dst_map)) ||
		    (next->vme_start != entry->vme_end)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}


		/*
		 * Check for permanent objects in the destination.
		 */
		if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
		    ((!VME_OBJECT(entry)->internal) ||
		    (VME_OBJECT(entry)->true_share))) {
			contains_permanent_objects = TRUE;
		}

		entry = next;
	}/* for */

start_overwrite:
	/*
	 * If there are permanent objects in the destination, then
	 * the copy cannot be interrupted.
	 */

	if (interruptible && contains_permanent_objects) {
		vm_map_unlock(dst_map);
		return KERN_FAILURE;    /* XXX */
	}

	/*
	 *
	 * Make a second pass, overwriting the data
	 * At the beginning of each loop iteration,
	 * the next entry to be overwritten is "tmp_entry"
	 * (initially, the value returned from the lookup above),
	 * and the starting address expected in that entry
	 * is "start".
	 */

	total_size = copy->size;
	if (encountered_sub_map) {
		copy_size = 0;
		/* re-calculate tmp_entry since we've had the map */
		/* unlocked */
		if (!vm_map_lookup_entry( dst_map, dst_addr, &tmp_entry)) {
			vm_map_unlock(dst_map);
			return KERN_INVALID_ADDRESS;
		}
	} else {
		copy_size = copy->size;
	}

	base_addr = dst_addr;
	while (TRUE) {
		/* deconstruct the copy object and do in parts */
		/* only in sub_map, interruptable case */
		vm_map_entry_t  copy_entry;
		vm_map_entry_t  previous_prev = VM_MAP_ENTRY_NULL;
		vm_map_entry_t  next_copy = VM_MAP_ENTRY_NULL;
		int             nentries;
		int             remaining_entries = 0;
		vm_map_offset_t new_offset = 0;

		/* copy_size == 0 means: keep scanning until we determine the
		 * next chunk (up to the next submap or the end of the range) */
		for (entry = tmp_entry; copy_size == 0;) {
			vm_map_entry_t  next;

			next = entry->vme_next;

			/* tmp_entry and base address are moved along */
			/* each time we encounter a sub-map.  Otherwise */
			/* entry can outpase tmp_entry, and the copy_size */
			/* may reflect the distance between them */
			/* if the current entry is found to be in transition */
			/* we will start over at the beginning or the last */
			/* encounter of a submap as dictated by base_addr */
			/* we will zero copy_size accordingly. */
			if (entry->in_transition) {
				/*
				 * Say that we are waiting, and wait for entry.
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);

				if (!vm_map_lookup_entry(dst_map, base_addr,
				    &tmp_entry)) {
					vm_map_unlock(dst_map);
					return KERN_INVALID_ADDRESS;
				}
				copy_size = 0;
				entry = tmp_entry;
				continue;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;
				vm_map_offset_t local_end;

				if (entry->needs_copy) {
					/* if this is a COW submap */
					/* just back the range with a */
					/* anonymous entry */
					if (entry->vme_end < dst_end) {
						sub_end = entry->vme_end;
					} else {
						sub_end = dst_end;
					}
					if (entry->vme_start < base_addr) {
						sub_start = base_addr;
					} else {
						sub_start = entry->vme_start;
					}
					vm_map_clip_end(
						dst_map, entry, sub_end);
					vm_map_clip_start(
						dst_map, entry, sub_start);
					assert(!entry->use_pmap);
					assert(!entry->iokit_acct);
					/* turn the clipped submap entry into a
					 * plain anonymous entry with default
					 * attributes */
					entry->use_pmap = TRUE;
					entry->is_sub_map = FALSE;
					vm_map_deallocate(
						VME_SUBMAP(entry));
					VME_OBJECT_SET(entry, VM_OBJECT_NULL);
					VME_OFFSET_SET(entry, 0);
					entry->is_shared = FALSE;
					entry->needs_copy = FALSE;
					entry->protection = VM_PROT_DEFAULT;
					entry->max_protection = VM_PROT_ALL;
					entry->wired_count = 0;
					entry->user_wired_count = 0;
					if (entry->inheritance
					    == VM_INHERIT_SHARE) {
						entry->inheritance = VM_INHERIT_COPY;
					}
					continue;
				}
				/* first take care of any non-sub_map */
				/* entries to send */
				if (base_addr < entry->vme_start) {
					/* stuff to send */
					copy_size =
					    entry->vme_start - base_addr;
					break;
				}
				sub_start = VME_OFFSET(entry);

				if (entry->vme_end < dst_end) {
					sub_end = entry->vme_end;
				} else {
					sub_end = dst_end;
				}
				sub_end -= entry->vme_start;
				sub_end += VME_OFFSET(entry);
				local_end = entry->vme_end;
				vm_map_unlock(dst_map);
				copy_size = sub_end - sub_start;

				/* adjust the copy object: split off the first
				 * copy_size bytes of "copy" so only that much
				 * is handed to the recursive call; the tail is
				 * remembered in next_copy/previous_prev and
				 * spliced back afterwards (or on error) */
				if (total_size > copy_size) {
					vm_map_size_t   local_size = 0;
					vm_map_size_t   entry_size;

					nentries = 1;
					new_offset = copy->offset;
					copy_entry = vm_map_copy_first_entry(copy);
					while (copy_entry !=
					    vm_map_copy_to_entry(copy)) {
						entry_size = copy_entry->vme_end -
						    copy_entry->vme_start;
						if ((local_size < copy_size) &&
						    ((local_size + entry_size)
						    >= copy_size)) {
							vm_map_copy_clip_end(copy,
							    copy_entry,
							    copy_entry->vme_start +
							    (copy_size - local_size));
							entry_size = copy_entry->vme_end -
							    copy_entry->vme_start;
							local_size += entry_size;
							new_offset += entry_size;
						}
						if (local_size >= copy_size) {
							next_copy = copy_entry->vme_next;
							copy_entry->vme_next =
							    vm_map_copy_to_entry(copy);
							previous_prev =
							    copy->cpy_hdr.links.prev;
							copy->cpy_hdr.links.prev = copy_entry;
							copy->size = copy_size;
							remaining_entries =
							    copy->cpy_hdr.nentries;
							remaining_entries -= nentries;
							copy->cpy_hdr.nentries = nentries;
							break;
						} else {
							local_size += entry_size;
							new_offset += entry_size;
							nentries++;
						}
						copy_entry = copy_entry->vme_next;
					}
				}

				/* recurse into the submap, choosing which pmap
				 * receives the physical copy */
				if ((entry->use_pmap) && (pmap == NULL)) {
					kr = vm_map_copy_overwrite_nested(
						VME_SUBMAP(entry),
						sub_start,
						copy,
						interruptible,
						VME_SUBMAP(entry)->pmap,
						TRUE);
				} else if (pmap != NULL) {
					kr = vm_map_copy_overwrite_nested(
						VME_SUBMAP(entry),
						sub_start,
						copy,
						interruptible, pmap,
						TRUE);
				} else {
					kr = vm_map_copy_overwrite_nested(
						VME_SUBMAP(entry),
						sub_start,
						copy,
						interruptible,
						dst_map->pmap,
						TRUE);
				}
				if (kr != KERN_SUCCESS) {
					/* re-attach the saved tail so the caller
					 * gets the whole copy object back */
					if (next_copy != NULL) {
						copy->cpy_hdr.nentries +=
						    remaining_entries;
						copy->cpy_hdr.links.prev->vme_next =
						    next_copy;
						copy->cpy_hdr.links.prev
						        = previous_prev;
						copy->size = total_size;
					}
					return kr;
				}
				if (dst_end <= local_end) {
					return KERN_SUCCESS;
				}
				/* otherwise copy no longer exists, it was */
				/* destroyed after successful copy_overwrite */
				copy = vm_map_copy_allocate();
				copy->type = VM_MAP_COPY_ENTRY_LIST;
				copy->offset = new_offset;
				copy->cpy_hdr.page_shift = copy_page_shift;

				/*
				 * XXX FBDP
				 * this does not seem to deal with
				 * the VM map store (R&B tree)
				 */

				total_size -= copy_size;
				copy_size = 0;
				/* put back remainder of copy in container */
				if (next_copy != NULL) {
					copy->cpy_hdr.nentries = remaining_entries;
					copy->cpy_hdr.links.next = next_copy;
					copy->cpy_hdr.links.prev = previous_prev;
					copy->size = total_size;
					next_copy->vme_prev =
					    vm_map_copy_to_entry(copy);
					next_copy = NULL;
				}
				base_addr = local_end;
				vm_map_lock(dst_map);
				if (!vm_map_lookup_entry(dst_map,
				    local_end, &tmp_entry)) {
					vm_map_unlock(dst_map);
					return KERN_INVALID_ADDRESS;
				}
				entry = tmp_entry;
				continue;
			}
			if (dst_end <= entry->vme_end) {
				copy_size = dst_end - base_addr;
				break;
			}

			if ((next == vm_map_to_entry(dst_map)) ||
			    (next->vme_start != entry->vme_end)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}

			entry = next;
		}/* for */

		next_copy = NULL;
		nentries = 1;

		/* adjust the copy object: as above, split off copy_size bytes
		 * when the remaining work is larger than this chunk */
		if (total_size > copy_size) {
			vm_map_size_t   local_size = 0;
			vm_map_size_t   entry_size;

			new_offset = copy->offset;
			copy_entry = vm_map_copy_first_entry(copy);
			while (copy_entry != vm_map_copy_to_entry(copy)) {
				entry_size = copy_entry->vme_end -
				    copy_entry->vme_start;
				if ((local_size < copy_size) &&
				    ((local_size + entry_size)
				    >= copy_size)) {
					vm_map_copy_clip_end(copy, copy_entry,
					    copy_entry->vme_start +
					    (copy_size - local_size));
					entry_size = copy_entry->vme_end -
					    copy_entry->vme_start;
					local_size += entry_size;
					new_offset += entry_size;
				}
				if (local_size >= copy_size) {
					next_copy = copy_entry->vme_next;
					copy_entry->vme_next =
					    vm_map_copy_to_entry(copy);
					previous_prev =
					    copy->cpy_hdr.links.prev;
					copy->cpy_hdr.links.prev = copy_entry;
					copy->size = copy_size;
					remaining_entries =
					    copy->cpy_hdr.nentries;
					remaining_entries -= nentries;
					copy->cpy_hdr.nentries = nentries;
					break;
				} else {
					local_size += entry_size;
					new_offset += entry_size;
					nentries++;
				}
				copy_entry = copy_entry->vme_next;
			}
		}

		if (aligned) {
			pmap_t  local_pmap;

			if (pmap) {
				local_pmap = pmap;
			} else {
				local_pmap = dst_map->pmap;
			}

			if ((kr = vm_map_copy_overwrite_aligned(
				    dst_map, tmp_entry, copy,
				    base_addr, local_pmap)) != KERN_SUCCESS) {
				if (next_copy != NULL) {
					copy->cpy_hdr.nentries +=
					    remaining_entries;
					copy->cpy_hdr.links.prev->vme_next =
					    next_copy;
					copy->cpy_hdr.links.prev =
					    previous_prev;
					copy->size += copy_size;
				}
				return kr;
			}
			vm_map_unlock(dst_map);
		} else {
			/*
			 * Performance gain:
			 *
			 * if the copy and dst address are misaligned but the same
			 * offset within the page we can copy_not_aligned the
			 * misaligned parts and copy aligned the rest.  If they are
			 * aligned but len is unaligned we simply need to copy
			 * the end bit unaligned.  We'll need to split the misaligned
			 * bits of the region in this case !
			 */
			/* ALWAYS UNLOCKS THE dst_map MAP */
			kr = vm_map_copy_overwrite_unaligned(
				dst_map,
				tmp_entry,
				copy,
				base_addr,
				discard_on_success);
			if (kr != KERN_SUCCESS) {
				if (next_copy != NULL) {
					copy->cpy_hdr.nentries +=
					    remaining_entries;
					copy->cpy_hdr.links.prev->vme_next =
					    next_copy;
					copy->cpy_hdr.links.prev =
					    previous_prev;
					copy->size += copy_size;
				}
				return kr;
			}
		}
		total_size -= copy_size;
		if (total_size == 0) {
			break;
		}
		base_addr += copy_size;
		copy_size = 0;
		copy->offset = new_offset;
		/* splice the saved tail back in before the next chunk */
		if (next_copy != NULL) {
			copy->cpy_hdr.nentries = remaining_entries;
			copy->cpy_hdr.links.next = next_copy;
			copy->cpy_hdr.links.prev = previous_prev;
			next_copy->vme_prev = vm_map_copy_to_entry(copy);
			copy->size = total_size;
		}
		vm_map_lock(dst_map);
		while (TRUE) {
			if (!vm_map_lookup_entry(dst_map,
			    base_addr, &tmp_entry)) {
				vm_map_unlock(dst_map);
				return KERN_INVALID_ADDRESS;
			}
			if (tmp_entry->in_transition) {
				/*
				 * NOTE(review): this flags "entry", not the
				 * in-transition "tmp_entry" we are about to
				 * wait on — presumably tmp_entry was intended;
				 * verify before changing (kept as-is here).
				 */
				entry->needs_wakeup = TRUE;
				vm_map_entry_wait(dst_map, THREAD_UNINT);
			} else {
				break;
			}
		}
		vm_map_clip_start(dst_map,
		    tmp_entry,
		    vm_map_trunc_page(base_addr,
		    VM_MAP_PAGE_MASK(dst_map)));

		entry = tmp_entry;
	} /* while */

	/*
	 *	Throw away the vm_map_copy object
	 */
	if (discard_on_success) {
		vm_map_copy_discard(copy);
	}

	return KERN_SUCCESS;
}/* vm_map_copy_overwrite */
9415
9416 kern_return_t
vm_map_copy_overwrite(vm_map_t dst_map,vm_map_offset_t dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size,boolean_t interruptible)9417 vm_map_copy_overwrite(
9418 vm_map_t dst_map,
9419 vm_map_offset_t dst_addr,
9420 vm_map_copy_t copy,
9421 vm_map_size_t copy_size,
9422 boolean_t interruptible)
9423 {
9424 vm_map_size_t head_size, tail_size;
9425 vm_map_copy_t head_copy, tail_copy;
9426 vm_map_offset_t head_addr, tail_addr;
9427 vm_map_entry_t entry;
9428 kern_return_t kr;
9429 vm_map_offset_t effective_page_mask, effective_page_size;
9430 uint16_t copy_page_shift;
9431
9432 head_size = 0;
9433 tail_size = 0;
9434 head_copy = NULL;
9435 tail_copy = NULL;
9436 head_addr = 0;
9437 tail_addr = 0;
9438
9439 /*
9440 * Check for null copy object.
9441 */
9442 if (copy == VM_MAP_COPY_NULL) {
9443 return KERN_SUCCESS;
9444 }
9445
9446 /*
9447 * Assert that the vm_map_copy is coming from the right
9448 * zone and hasn't been forged
9449 */
9450 vm_map_copy_require(copy);
9451
9452 if (interruptible ||
9453 copy->type != VM_MAP_COPY_ENTRY_LIST) {
9454 /*
9455 * We can't split the "copy" map if we're interruptible
9456 * or if we don't have a "copy" map...
9457 */
9458 blunt_copy:
9459 return vm_map_copy_overwrite_nested(dst_map,
9460 dst_addr,
9461 copy,
9462 interruptible,
9463 (pmap_t) NULL,
9464 TRUE);
9465 }
9466
9467 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
9468 if (copy_page_shift < PAGE_SHIFT ||
9469 VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
9470 goto blunt_copy;
9471 }
9472
9473 if (VM_MAP_PAGE_SHIFT(dst_map) < PAGE_SHIFT) {
9474 effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
9475 } else {
9476 effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
9477 effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
9478 effective_page_mask);
9479 }
9480 effective_page_size = effective_page_mask + 1;
9481
9482 if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
9483 /*
9484 * Too small to bother with optimizing...
9485 */
9486 goto blunt_copy;
9487 }
9488
9489 if ((dst_addr & effective_page_mask) !=
9490 (copy->offset & effective_page_mask)) {
9491 /*
9492 * Incompatible mis-alignment of source and destination...
9493 */
9494 goto blunt_copy;
9495 }
9496
9497 /*
9498 * Proper alignment or identical mis-alignment at the beginning.
9499 * Let's try and do a small unaligned copy first (if needed)
9500 * and then an aligned copy for the rest.
9501 */
9502 if (!vm_map_page_aligned(dst_addr, effective_page_mask)) {
9503 head_addr = dst_addr;
9504 head_size = (effective_page_size -
9505 (copy->offset & effective_page_mask));
9506 head_size = MIN(head_size, copy_size);
9507 }
9508 if (!vm_map_page_aligned(copy->offset + copy_size,
9509 effective_page_mask)) {
9510 /*
9511 * Mis-alignment at the end.
9512 * Do an aligned copy up to the last page and
9513 * then an unaligned copy for the remaining bytes.
9514 */
9515 tail_size = ((copy->offset + copy_size) &
9516 effective_page_mask);
9517 tail_size = MIN(tail_size, copy_size);
9518 tail_addr = dst_addr + copy_size - tail_size;
9519 assert(tail_addr >= head_addr + head_size);
9520 }
9521 assert(head_size + tail_size <= copy_size);
9522
9523 if (head_size + tail_size == copy_size) {
9524 /*
9525 * It's all unaligned, no optimization possible...
9526 */
9527 goto blunt_copy;
9528 }
9529
9530 /*
9531 * Can't optimize if there are any submaps in the
9532 * destination due to the way we free the "copy" map
9533 * progressively in vm_map_copy_overwrite_nested()
9534 * in that case.
9535 */
9536 vm_map_lock_read(dst_map);
9537 if (!vm_map_lookup_entry(dst_map, dst_addr, &entry)) {
9538 vm_map_unlock_read(dst_map);
9539 goto blunt_copy;
9540 }
9541 for (;
9542 (entry != vm_map_copy_to_entry(copy) &&
9543 entry->vme_start < dst_addr + copy_size);
9544 entry = entry->vme_next) {
9545 if (entry->is_sub_map) {
9546 vm_map_unlock_read(dst_map);
9547 goto blunt_copy;
9548 }
9549 }
9550 vm_map_unlock_read(dst_map);
9551
9552 if (head_size) {
9553 /*
9554 * Unaligned copy of the first "head_size" bytes, to reach
9555 * a page boundary.
9556 */
9557
9558 /*
9559 * Extract "head_copy" out of "copy".
9560 */
9561 head_copy = vm_map_copy_allocate();
9562 head_copy->type = VM_MAP_COPY_ENTRY_LIST;
9563 head_copy->cpy_hdr.entries_pageable =
9564 copy->cpy_hdr.entries_pageable;
9565 vm_map_store_init(&head_copy->cpy_hdr);
9566 head_copy->cpy_hdr.page_shift = copy_page_shift;
9567
9568 entry = vm_map_copy_first_entry(copy);
9569 if (entry->vme_end < copy->offset + head_size) {
9570 head_size = entry->vme_end - copy->offset;
9571 }
9572
9573 head_copy->offset = copy->offset;
9574 head_copy->size = head_size;
9575 copy->offset += head_size;
9576 copy->size -= head_size;
9577 copy_size -= head_size;
9578 assert(copy_size > 0);
9579
9580 vm_map_copy_clip_end(copy, entry, copy->offset);
9581 vm_map_copy_entry_unlink(copy, entry);
9582 vm_map_copy_entry_link(head_copy,
9583 vm_map_copy_to_entry(head_copy),
9584 entry);
9585
9586 /*
9587 * Do the unaligned copy.
9588 */
9589 kr = vm_map_copy_overwrite_nested(dst_map,
9590 head_addr,
9591 head_copy,
9592 interruptible,
9593 (pmap_t) NULL,
9594 FALSE);
9595 if (kr != KERN_SUCCESS) {
9596 goto done;
9597 }
9598 }
9599
9600 if (tail_size) {
9601 /*
9602 * Extract "tail_copy" out of "copy".
9603 */
9604 tail_copy = vm_map_copy_allocate();
9605 tail_copy->type = VM_MAP_COPY_ENTRY_LIST;
9606 tail_copy->cpy_hdr.entries_pageable =
9607 copy->cpy_hdr.entries_pageable;
9608 vm_map_store_init(&tail_copy->cpy_hdr);
9609 tail_copy->cpy_hdr.page_shift = copy_page_shift;
9610
9611 tail_copy->offset = copy->offset + copy_size - tail_size;
9612 tail_copy->size = tail_size;
9613
9614 copy->size -= tail_size;
9615 copy_size -= tail_size;
9616 assert(copy_size > 0);
9617
9618 entry = vm_map_copy_last_entry(copy);
9619 vm_map_copy_clip_start(copy, entry, tail_copy->offset);
9620 entry = vm_map_copy_last_entry(copy);
9621 vm_map_copy_entry_unlink(copy, entry);
9622 vm_map_copy_entry_link(tail_copy,
9623 vm_map_copy_last_entry(tail_copy),
9624 entry);
9625 }
9626
9627 /*
9628 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
9629 * we want to avoid TOCTOU issues w.r.t copy->size but
9630 * we don't need to change vm_map_copy_overwrite_nested()
9631 * and all other vm_map_copy_overwrite variants.
9632 *
9633 * So we assign the original copy_size that was passed into
9634 * this routine back to copy.
9635 *
9636 * This use of local 'copy_size' passed into this routine is
9637 * to try and protect against TOCTOU attacks where the kernel
9638 * has been exploited. We don't expect this to be an issue
9639 * during normal system operation.
9640 */
9641 assertf(copy->size == copy_size,
9642 "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
9643 copy->size = copy_size;
9644
9645 /*
9646 * Copy most (or possibly all) of the data.
9647 */
9648 kr = vm_map_copy_overwrite_nested(dst_map,
9649 dst_addr + head_size,
9650 copy,
9651 interruptible,
9652 (pmap_t) NULL,
9653 FALSE);
9654 if (kr != KERN_SUCCESS) {
9655 goto done;
9656 }
9657
9658 if (tail_size) {
9659 kr = vm_map_copy_overwrite_nested(dst_map,
9660 tail_addr,
9661 tail_copy,
9662 interruptible,
9663 (pmap_t) NULL,
9664 FALSE);
9665 }
9666
9667 done:
9668 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9669 if (kr == KERN_SUCCESS) {
9670 /*
9671 * Discard all the copy maps.
9672 */
9673 if (head_copy) {
9674 vm_map_copy_discard(head_copy);
9675 head_copy = NULL;
9676 }
9677 vm_map_copy_discard(copy);
9678 if (tail_copy) {
9679 vm_map_copy_discard(tail_copy);
9680 tail_copy = NULL;
9681 }
9682 } else {
9683 /*
9684 * Re-assemble the original copy map.
9685 */
9686 if (head_copy) {
9687 entry = vm_map_copy_first_entry(head_copy);
9688 vm_map_copy_entry_unlink(head_copy, entry);
9689 vm_map_copy_entry_link(copy,
9690 vm_map_copy_to_entry(copy),
9691 entry);
9692 copy->offset -= head_size;
9693 copy->size += head_size;
9694 vm_map_copy_discard(head_copy);
9695 head_copy = NULL;
9696 }
9697 if (tail_copy) {
9698 entry = vm_map_copy_last_entry(tail_copy);
9699 vm_map_copy_entry_unlink(tail_copy, entry);
9700 vm_map_copy_entry_link(copy,
9701 vm_map_copy_last_entry(copy),
9702 entry);
9703 copy->size += tail_size;
9704 vm_map_copy_discard(tail_copy);
9705 tail_copy = NULL;
9706 }
9707 }
9708 return kr;
9709 }
9710
9711
9712 /*
9713 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
9714 *
 *	Description:
 *		Physically copy unaligned data
 *
 *	Implementation:
 *		Unaligned parts of pages have to be physically copied.  We use
 *		a modified form of vm_fault_copy (which understands non-aligned
 *		page offsets and sizes) to do the copy.  We attempt to copy as
 *		much memory in one go as possible, however vm_fault_copy copies
 *		within 1 memory object so we have to find the smaller of "amount left"
 *		"source object data size" and "target object data size".  With
 *		unaligned data we don't need to split regions, therefore the source
 *		(copy) object should be one map entry, the target range may be split
 *		over multiple map entries however.  In any event we are pessimistic
 *		about these assumptions.
9729 *
9730 * Callers of this function must call vm_map_copy_require on
9731 * previously created vm_map_copy_t or pass a newly created
9732 * one to ensure that it hasn't been forged.
9733 *
9734 * Assumptions:
9735 * dst_map is locked on entry and is return locked on success,
9736 * unlocked on error.
9737 */
9738
9739 static kern_return_t
vm_map_copy_overwrite_unaligned(vm_map_t dst_map,vm_map_entry_t entry,vm_map_copy_t copy,vm_map_offset_t start,boolean_t discard_on_success)9740 vm_map_copy_overwrite_unaligned(
9741 vm_map_t dst_map,
9742 vm_map_entry_t entry,
9743 vm_map_copy_t copy,
9744 vm_map_offset_t start,
9745 boolean_t discard_on_success)
9746 {
9747 vm_map_entry_t copy_entry;
9748 vm_map_entry_t copy_entry_next;
9749 vm_map_version_t version;
9750 vm_object_t dst_object;
9751 vm_object_offset_t dst_offset;
9752 vm_object_offset_t src_offset;
9753 vm_object_offset_t entry_offset;
9754 vm_map_offset_t entry_end;
9755 vm_map_size_t src_size,
9756 dst_size,
9757 copy_size,
9758 amount_left;
9759 kern_return_t kr = KERN_SUCCESS;
9760
9761
9762 copy_entry = vm_map_copy_first_entry(copy);
9763
9764 vm_map_lock_write_to_read(dst_map);
9765
9766 src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
9767 amount_left = copy->size;
9768 /*
9769 * unaligned so we never clipped this entry, we need the offset into
9770 * the vm_object not just the data.
9771 */
9772 while (amount_left > 0) {
9773 if (entry == vm_map_to_entry(dst_map)) {
9774 vm_map_unlock_read(dst_map);
9775 return KERN_INVALID_ADDRESS;
9776 }
9777
9778 /* "start" must be within the current map entry */
9779 assert((start >= entry->vme_start) && (start < entry->vme_end));
9780
9781 dst_offset = start - entry->vme_start;
9782
9783 dst_size = entry->vme_end - start;
9784
9785 src_size = copy_entry->vme_end -
9786 (copy_entry->vme_start + src_offset);
9787
9788 if (dst_size < src_size) {
9789 /*
9790 * we can only copy dst_size bytes before
9791 * we have to get the next destination entry
9792 */
9793 copy_size = dst_size;
9794 } else {
9795 /*
9796 * we can only copy src_size bytes before
9797 * we have to get the next source copy entry
9798 */
9799 copy_size = src_size;
9800 }
9801
9802 if (copy_size > amount_left) {
9803 copy_size = amount_left;
9804 }
9805 /*
9806 * Entry needs copy, create a shadow shadow object for
9807 * Copy on write region.
9808 */
9809 if (entry->needs_copy &&
9810 ((entry->protection & VM_PROT_WRITE) != 0)) {
9811 if (vm_map_lock_read_to_write(dst_map)) {
9812 vm_map_lock_read(dst_map);
9813 goto RetryLookup;
9814 }
9815 VME_OBJECT_SHADOW(entry,
9816 (vm_map_size_t)(entry->vme_end
9817 - entry->vme_start));
9818 entry->needs_copy = FALSE;
9819 vm_map_lock_write_to_read(dst_map);
9820 }
9821 dst_object = VME_OBJECT(entry);
9822 /*
9823 * unlike with the virtual (aligned) copy we're going
9824 * to fault on it therefore we need a target object.
9825 */
9826 if (dst_object == VM_OBJECT_NULL) {
9827 if (vm_map_lock_read_to_write(dst_map)) {
9828 vm_map_lock_read(dst_map);
9829 goto RetryLookup;
9830 }
9831 dst_object = vm_object_allocate((vm_map_size_t)
9832 entry->vme_end - entry->vme_start);
9833 VME_OBJECT_SET(entry, dst_object);
9834 VME_OFFSET_SET(entry, 0);
9835 assert(entry->use_pmap);
9836 vm_map_lock_write_to_read(dst_map);
9837 }
9838 /*
9839 * Take an object reference and unlock map. The "entry" may
9840 * disappear or change when the map is unlocked.
9841 */
9842 vm_object_reference(dst_object);
9843 version.main_timestamp = dst_map->timestamp;
9844 entry_offset = VME_OFFSET(entry);
9845 entry_end = entry->vme_end;
9846 vm_map_unlock_read(dst_map);
9847 /*
9848 * Copy as much as possible in one pass
9849 */
9850 kr = vm_fault_copy(
9851 VME_OBJECT(copy_entry),
9852 VME_OFFSET(copy_entry) + src_offset,
9853 ©_size,
9854 dst_object,
9855 entry_offset + dst_offset,
9856 dst_map,
9857 &version,
9858 THREAD_UNINT );
9859
9860 start += copy_size;
9861 src_offset += copy_size;
9862 amount_left -= copy_size;
9863 /*
9864 * Release the object reference
9865 */
9866 vm_object_deallocate(dst_object);
9867 /*
9868 * If a hard error occurred, return it now
9869 */
9870 if (kr != KERN_SUCCESS) {
9871 return kr;
9872 }
9873
9874 if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
9875 || amount_left == 0) {
9876 /*
9877 * all done with this copy entry, dispose.
9878 */
9879 copy_entry_next = copy_entry->vme_next;
9880
9881 if (discard_on_success) {
9882 vm_map_copy_entry_unlink(copy, copy_entry);
9883 assert(!copy_entry->is_sub_map);
9884 vm_object_deallocate(VME_OBJECT(copy_entry));
9885 vm_map_copy_entry_dispose(copy_entry);
9886 }
9887
9888 if (copy_entry_next == vm_map_copy_to_entry(copy) &&
9889 amount_left) {
9890 /*
9891 * not finished copying but run out of source
9892 */
9893 return KERN_INVALID_ADDRESS;
9894 }
9895
9896 copy_entry = copy_entry_next;
9897
9898 src_offset = 0;
9899 }
9900
9901 if (amount_left == 0) {
9902 return KERN_SUCCESS;
9903 }
9904
9905 vm_map_lock_read(dst_map);
9906 if (version.main_timestamp == dst_map->timestamp) {
9907 if (start == entry_end) {
9908 /*
9909 * destination region is split. Use the version
9910 * information to avoid a lookup in the normal
9911 * case.
9912 */
9913 entry = entry->vme_next;
9914 /*
9915 * should be contiguous. Fail if we encounter
9916 * a hole in the destination.
9917 */
9918 if (start != entry->vme_start) {
9919 vm_map_unlock_read(dst_map);
9920 return KERN_INVALID_ADDRESS;
9921 }
9922 }
9923 } else {
9924 /*
9925 * Map version check failed.
9926 * we must lookup the entry because somebody
9927 * might have changed the map behind our backs.
9928 */
9929 RetryLookup:
9930 if (!vm_map_lookup_entry(dst_map, start, &entry)) {
9931 vm_map_unlock_read(dst_map);
9932 return KERN_INVALID_ADDRESS;
9933 }
9934 }
9935 }/* while */
9936
9937 return KERN_SUCCESS;
9938 }/* vm_map_copy_overwrite_unaligned */
9939
9940 /*
9941 * Routine: vm_map_copy_overwrite_aligned [internal use only]
9942 *
9943 * Description:
9944 * Does all the vm_trickery possible for whole pages.
9945 *
9946 * Implementation:
9947 *
9948 * If there are no permanent objects in the destination,
9949 * and the source and destination map entry zones match,
9950 * and the destination map entry is not shared,
9951 * then the map entries can be deleted and replaced
9952 * with those from the copy. The following code is the
9953 * basic idea of what to do, but there are lots of annoying
9954 * little details about getting protection and inheritance
9955 * right. Should add protection, inheritance, and sharing checks
9956 * to the above pass and make sure that no wiring is involved.
9957 *
9958 * Callers of this function must call vm_map_copy_require on
9959 * previously created vm_map_copy_t or pass a newly created
9960 * one to ensure that it hasn't been forged.
9961 */
9962
/*
 * Diagnostic counters: how often vm_map_copy_overwrite_aligned() had to
 * abandon the entry-substitution fast path and fall back to a physical
 * (vm_fault_copy) copy, broken down by reason.
 */
int vm_map_copy_overwrite_aligned_src_not_internal = 0;
int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
int vm_map_copy_overwrite_aligned_src_large = 0;
9966
9967 static kern_return_t
vm_map_copy_overwrite_aligned(vm_map_t dst_map,vm_map_entry_t tmp_entry,vm_map_copy_t copy,vm_map_offset_t start,__unused pmap_t pmap)9968 vm_map_copy_overwrite_aligned(
9969 vm_map_t dst_map,
9970 vm_map_entry_t tmp_entry,
9971 vm_map_copy_t copy,
9972 vm_map_offset_t start,
9973 __unused pmap_t pmap)
9974 {
9975 vm_object_t object;
9976 vm_map_entry_t copy_entry;
9977 vm_map_size_t copy_size;
9978 vm_map_size_t size;
9979 vm_map_entry_t entry;
9980
9981 while ((copy_entry = vm_map_copy_first_entry(copy))
9982 != vm_map_copy_to_entry(copy)) {
9983 copy_size = (copy_entry->vme_end - copy_entry->vme_start);
9984
9985 entry = tmp_entry;
9986 if (entry->is_sub_map) {
9987 /* unnested when clipped earlier */
9988 assert(!entry->use_pmap);
9989 }
9990 if (entry == vm_map_to_entry(dst_map)) {
9991 vm_map_unlock(dst_map);
9992 return KERN_INVALID_ADDRESS;
9993 }
9994 size = (entry->vme_end - entry->vme_start);
9995 /*
9996 * Make sure that no holes popped up in the
9997 * address map, and that the protection is
9998 * still valid, in case the map was unlocked
9999 * earlier.
10000 */
10001
10002 if ((entry->vme_start != start) || ((entry->is_sub_map)
10003 && !entry->needs_copy)) {
10004 vm_map_unlock(dst_map);
10005 return KERN_INVALID_ADDRESS;
10006 }
10007 assert(entry != vm_map_to_entry(dst_map));
10008
10009 /*
10010 * Check protection again
10011 */
10012
10013 if (!(entry->protection & VM_PROT_WRITE)) {
10014 vm_map_unlock(dst_map);
10015 return KERN_PROTECTION_FAILURE;
10016 }
10017
10018 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10019 vm_map_unlock(dst_map);
10020 return KERN_PROTECTION_FAILURE;
10021 }
10022
10023 /*
10024 * Adjust to source size first
10025 */
10026
10027 if (copy_size < size) {
10028 if (entry->map_aligned &&
10029 !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10030 VM_MAP_PAGE_MASK(dst_map))) {
10031 /* no longer map-aligned */
10032 entry->map_aligned = FALSE;
10033 }
10034 vm_map_clip_end(dst_map, entry, entry->vme_start + copy_size);
10035 size = copy_size;
10036 }
10037
10038 /*
10039 * Adjust to destination size
10040 */
10041
10042 if (size < copy_size) {
10043 vm_map_copy_clip_end(copy, copy_entry,
10044 copy_entry->vme_start + size);
10045 copy_size = size;
10046 }
10047
10048 assert((entry->vme_end - entry->vme_start) == size);
10049 assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10050 assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10051
10052 /*
10053 * If the destination contains temporary unshared memory,
10054 * we can perform the copy by throwing it away and
10055 * installing the source data.
10056 */
10057
10058 object = VME_OBJECT(entry);
10059 if ((!entry->is_shared &&
10060 ((object == VM_OBJECT_NULL) ||
10061 (object->internal && !object->true_share))) ||
10062 entry->needs_copy) {
10063 vm_object_t old_object = VME_OBJECT(entry);
10064 vm_object_offset_t old_offset = VME_OFFSET(entry);
10065 vm_object_offset_t offset;
10066
10067 /*
10068 * Ensure that the source and destination aren't
10069 * identical
10070 */
10071 if (old_object == VME_OBJECT(copy_entry) &&
10072 old_offset == VME_OFFSET(copy_entry)) {
10073 vm_map_copy_entry_unlink(copy, copy_entry);
10074 vm_map_copy_entry_dispose(copy_entry);
10075
10076 if (old_object != VM_OBJECT_NULL) {
10077 vm_object_deallocate(old_object);
10078 }
10079
10080 start = tmp_entry->vme_end;
10081 tmp_entry = tmp_entry->vme_next;
10082 continue;
10083 }
10084
10085 #if XNU_TARGET_OS_OSX
10086 #define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10087 #define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */
10088 if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10089 VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10090 copy_size <= __TRADEOFF1_COPY_SIZE) {
10091 /*
10092 * Virtual vs. Physical copy tradeoff #1.
10093 *
10094 * Copying only a few pages out of a large
10095 * object: do a physical copy instead of
10096 * a virtual copy, to avoid possibly keeping
10097 * the entire large object alive because of
10098 * those few copy-on-write pages.
10099 */
10100 vm_map_copy_overwrite_aligned_src_large++;
10101 goto slow_copy;
10102 }
10103 #endif /* XNU_TARGET_OS_OSX */
10104
10105 if ((dst_map->pmap != kernel_pmap) &&
10106 (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10107 (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10108 vm_object_t new_object, new_shadow;
10109
10110 /*
10111 * We're about to map something over a mapping
10112 * established by malloc()...
10113 */
10114 new_object = VME_OBJECT(copy_entry);
10115 if (new_object != VM_OBJECT_NULL) {
10116 vm_object_lock_shared(new_object);
10117 }
10118 while (new_object != VM_OBJECT_NULL &&
10119 #if XNU_TARGET_OS_OSX
10120 !new_object->true_share &&
10121 new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10122 #endif /* XNU_TARGET_OS_OSX */
10123 new_object->internal) {
10124 new_shadow = new_object->shadow;
10125 if (new_shadow == VM_OBJECT_NULL) {
10126 break;
10127 }
10128 vm_object_lock_shared(new_shadow);
10129 vm_object_unlock(new_object);
10130 new_object = new_shadow;
10131 }
10132 if (new_object != VM_OBJECT_NULL) {
10133 if (!new_object->internal) {
10134 /*
10135 * The new mapping is backed
10136 * by an external object. We
10137 * don't want malloc'ed memory
10138 * to be replaced with such a
10139 * non-anonymous mapping, so
10140 * let's go off the optimized
10141 * path...
10142 */
10143 vm_map_copy_overwrite_aligned_src_not_internal++;
10144 vm_object_unlock(new_object);
10145 goto slow_copy;
10146 }
10147 #if XNU_TARGET_OS_OSX
10148 if (new_object->true_share ||
10149 new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10150 /*
10151 * Same if there's a "true_share"
10152 * object in the shadow chain, or
10153 * an object with a non-default
10154 * (SYMMETRIC) copy strategy.
10155 */
10156 vm_map_copy_overwrite_aligned_src_not_symmetric++;
10157 vm_object_unlock(new_object);
10158 goto slow_copy;
10159 }
10160 #endif /* XNU_TARGET_OS_OSX */
10161 vm_object_unlock(new_object);
10162 }
10163 /*
10164 * The new mapping is still backed by
10165 * anonymous (internal) memory, so it's
10166 * OK to substitute it for the original
10167 * malloc() mapping.
10168 */
10169 }
10170
10171 if (old_object != VM_OBJECT_NULL) {
10172 if (entry->is_sub_map) {
10173 if (entry->use_pmap) {
10174 #ifndef NO_NESTED_PMAP
10175 pmap_unnest(dst_map->pmap,
10176 (addr64_t)entry->vme_start,
10177 entry->vme_end - entry->vme_start);
10178 #endif /* NO_NESTED_PMAP */
10179 if (dst_map->mapped_in_other_pmaps) {
10180 /* clean up parent */
10181 /* map/maps */
10182 vm_map_submap_pmap_clean(
10183 dst_map, entry->vme_start,
10184 entry->vme_end,
10185 VME_SUBMAP(entry),
10186 VME_OFFSET(entry));
10187 }
10188 } else {
10189 vm_map_submap_pmap_clean(
10190 dst_map, entry->vme_start,
10191 entry->vme_end,
10192 VME_SUBMAP(entry),
10193 VME_OFFSET(entry));
10194 }
10195 vm_map_deallocate(VME_SUBMAP(entry));
10196 } else {
10197 if (dst_map->mapped_in_other_pmaps) {
10198 vm_object_pmap_protect_options(
10199 VME_OBJECT(entry),
10200 VME_OFFSET(entry),
10201 entry->vme_end
10202 - entry->vme_start,
10203 PMAP_NULL,
10204 PAGE_SIZE,
10205 entry->vme_start,
10206 VM_PROT_NONE,
10207 PMAP_OPTIONS_REMOVE);
10208 } else {
10209 pmap_remove_options(
10210 dst_map->pmap,
10211 (addr64_t)(entry->vme_start),
10212 (addr64_t)(entry->vme_end),
10213 PMAP_OPTIONS_REMOVE);
10214 }
10215 vm_object_deallocate(old_object);
10216 }
10217 }
10218
10219 if (entry->iokit_acct) {
10220 /* keep using iokit accounting */
10221 entry->use_pmap = FALSE;
10222 } else {
10223 /* use pmap accounting */
10224 entry->use_pmap = TRUE;
10225 }
10226 entry->is_sub_map = FALSE;
10227 VME_OBJECT_SET(entry, VME_OBJECT(copy_entry));
10228 object = VME_OBJECT(entry);
10229 entry->needs_copy = copy_entry->needs_copy;
10230 entry->wired_count = 0;
10231 entry->user_wired_count = 0;
10232 offset = VME_OFFSET(copy_entry);
10233 VME_OFFSET_SET(entry, offset);
10234
10235 vm_map_copy_entry_unlink(copy, copy_entry);
10236 vm_map_copy_entry_dispose(copy_entry);
10237
10238 /*
10239 * we could try to push pages into the pmap at this point, BUT
10240 * this optimization only saved on average 2 us per page if ALL
10241 * the pages in the source were currently mapped
10242 * and ALL the pages in the dest were touched, if there were fewer
10243 * than 2/3 of the pages touched, this optimization actually cost more cycles
10244 * it also puts a lot of pressure on the pmap layer w/r to mapping structures
10245 */
10246
10247 /*
10248 * Set up for the next iteration. The map
10249 * has not been unlocked, so the next
10250 * address should be at the end of this
10251 * entry, and the next map entry should be
10252 * the one following it.
10253 */
10254
10255 start = tmp_entry->vme_end;
10256 tmp_entry = tmp_entry->vme_next;
10257 } else {
10258 vm_map_version_t version;
10259 vm_object_t dst_object;
10260 vm_object_offset_t dst_offset;
10261 kern_return_t r;
10262
10263 slow_copy:
10264 if (entry->needs_copy) {
10265 VME_OBJECT_SHADOW(entry,
10266 (entry->vme_end -
10267 entry->vme_start));
10268 entry->needs_copy = FALSE;
10269 }
10270
10271 dst_object = VME_OBJECT(entry);
10272 dst_offset = VME_OFFSET(entry);
10273
10274 /*
10275 * Take an object reference, and record
10276 * the map version information so that the
10277 * map can be safely unlocked.
10278 */
10279
10280 if (dst_object == VM_OBJECT_NULL) {
10281 /*
10282 * We would usually have just taken the
10283 * optimized path above if the destination
10284 * object has not been allocated yet. But we
10285 * now disable that optimization if the copy
10286 * entry's object is not backed by anonymous
10287 * memory to avoid replacing malloc'ed
10288 * (i.e. re-usable) anonymous memory with a
10289 * not-so-anonymous mapping.
10290 * So we have to handle this case here and
10291 * allocate a new VM object for this map entry.
10292 */
10293 dst_object = vm_object_allocate(
10294 entry->vme_end - entry->vme_start);
10295 dst_offset = 0;
10296 VME_OBJECT_SET(entry, dst_object);
10297 VME_OFFSET_SET(entry, dst_offset);
10298 assert(entry->use_pmap);
10299 }
10300
10301 vm_object_reference(dst_object);
10302
10303 /* account for unlock bumping up timestamp */
10304 version.main_timestamp = dst_map->timestamp + 1;
10305
10306 vm_map_unlock(dst_map);
10307
10308 /*
10309 * Copy as much as possible in one pass
10310 */
10311
10312 copy_size = size;
10313 r = vm_fault_copy(
10314 VME_OBJECT(copy_entry),
10315 VME_OFFSET(copy_entry),
10316 ©_size,
10317 dst_object,
10318 dst_offset,
10319 dst_map,
10320 &version,
10321 THREAD_UNINT );
10322
10323 /*
10324 * Release the object reference
10325 */
10326
10327 vm_object_deallocate(dst_object);
10328
10329 /*
10330 * If a hard error occurred, return it now
10331 */
10332
10333 if (r != KERN_SUCCESS) {
10334 return r;
10335 }
10336
10337 if (copy_size != 0) {
10338 /*
10339 * Dispose of the copied region
10340 */
10341
10342 vm_map_copy_clip_end(copy, copy_entry,
10343 copy_entry->vme_start + copy_size);
10344 vm_map_copy_entry_unlink(copy, copy_entry);
10345 vm_object_deallocate(VME_OBJECT(copy_entry));
10346 vm_map_copy_entry_dispose(copy_entry);
10347 }
10348
10349 /*
10350 * Pick up in the destination map where we left off.
10351 *
10352 * Use the version information to avoid a lookup
10353 * in the normal case.
10354 */
10355
10356 start += copy_size;
10357 vm_map_lock(dst_map);
10358 if (version.main_timestamp == dst_map->timestamp &&
10359 copy_size != 0) {
10360 /* We can safely use saved tmp_entry value */
10361
10362 if (tmp_entry->map_aligned &&
10363 !VM_MAP_PAGE_ALIGNED(
10364 start,
10365 VM_MAP_PAGE_MASK(dst_map))) {
10366 /* no longer map-aligned */
10367 tmp_entry->map_aligned = FALSE;
10368 }
10369 vm_map_clip_end(dst_map, tmp_entry, start);
10370 tmp_entry = tmp_entry->vme_next;
10371 } else {
10372 /* Must do lookup of tmp_entry */
10373
10374 if (!vm_map_lookup_entry(dst_map, start, &tmp_entry)) {
10375 vm_map_unlock(dst_map);
10376 return KERN_INVALID_ADDRESS;
10377 }
10378 if (tmp_entry->map_aligned &&
10379 !VM_MAP_PAGE_ALIGNED(
10380 start,
10381 VM_MAP_PAGE_MASK(dst_map))) {
10382 /* no longer map-aligned */
10383 tmp_entry->map_aligned = FALSE;
10384 }
10385 vm_map_clip_start(dst_map, tmp_entry, start);
10386 }
10387 }
10388 }/* while */
10389
10390 return KERN_SUCCESS;
10391 }/* vm_map_copy_overwrite_aligned */
10392
10393 /*
10394 * Routine: vm_map_copyin_kernel_buffer [internal use only]
10395 *
10396 * Description:
10397 * Copy in data to a kernel buffer from space in the
10398 * source map. The original space may be optionally
10399 * deallocated.
10400 *
10401 * If successful, returns a new copy object.
10402 */
10403 static kern_return_t
vm_map_copyin_kernel_buffer(vm_map_t src_map,vm_map_offset_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)10404 vm_map_copyin_kernel_buffer(
10405 vm_map_t src_map,
10406 vm_map_offset_t src_addr,
10407 vm_map_size_t len,
10408 boolean_t src_destroy,
10409 vm_map_copy_t *copy_result)
10410 {
10411 kern_return_t kr;
10412 vm_map_copy_t copy;
10413
10414 if (len > msg_ool_size_small) {
10415 return KERN_INVALID_ARGUMENT;
10416 }
10417
10418 copy = zalloc_flags(vm_map_copy_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
10419 copy->cpy_kdata = kalloc_data(len, Z_WAITOK);
10420 if (copy->cpy_kdata == NULL) {
10421 zfree(vm_map_copy_zone, copy);
10422 return KERN_RESOURCE_SHORTAGE;
10423 }
10424
10425 copy->type = VM_MAP_COPY_KERNEL_BUFFER;
10426 copy->size = len;
10427 copy->offset = 0;
10428
10429 kr = copyinmap(src_map, src_addr, copy->cpy_kdata, (vm_size_t)len);
10430 if (kr != KERN_SUCCESS) {
10431 kfree_data(copy->cpy_kdata, len);
10432 zfree(vm_map_copy_zone, copy);
10433 return kr;
10434 }
10435
10436 if (src_destroy) {
10437 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
10438
10439 if (src_map == kernel_map) {
10440 flags |= VM_MAP_REMOVE_KUNWIRE;
10441 }
10442
10443 (void)vm_map_remove_flags(src_map,
10444 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
10445 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
10446 flags);
10447 }
10448
10449 *copy_result = copy;
10450 return KERN_SUCCESS;
10451 }
10452
10453 /*
10454 * Routine: vm_map_copyout_kernel_buffer [internal use only]
10455 *
10456 * Description:
10457 * Copy out data from a kernel buffer into space in the
 *		destination map. The space may be optionally dynamically
10459 * allocated.
10460 *
10461 * If successful, consumes the copy object.
10462 * Otherwise, the caller is responsible for it.
10463 *
10464 * Callers of this function must call vm_map_copy_require on
10465 * previously created vm_map_copy_t or pass a newly created
10466 * one to ensure that it hasn't been forged.
10467 */
/* Diagnostic counter: copyout() failures while running under the target map. */
static int vm_map_copyout_kernel_buffer_failures = 0;
/*
 * Copy the kernel buffer held by "copy" out to user space in "map".
 * If "overwrite" is FALSE, new space is allocated in the target map first.
 * On success, the copy object is freed only when "consume_on_success" is set;
 * on failure the copy object is left intact for the caller.
 */
static kern_return_t
vm_map_copyout_kernel_buffer(
	vm_map_t                map,
	vm_map_address_t        *addr,  /* IN/OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               overwrite,
	boolean_t               consume_on_success)
{
	kern_return_t kr = KERN_SUCCESS;
	thread_t thread = current_thread();

	assert(copy->size == copy_size);

	/*
	 * check for corrupted vm_map_copy structure
	 */
	if (copy_size > msg_ool_size_small || copy->offset) {
		panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
		    (long long)copy->size, (long long)copy->offset);
	}

	if (!overwrite) {
		/*
		 * Allocate space in the target map for the data
		 */
		*addr = 0;
		kr = vm_map_enter(map,
		    addr,
		    vm_map_round_page(copy_size,
		    VM_MAP_PAGE_MASK(map)),
		    (vm_map_offset_t) 0,
		    VM_FLAGS_ANYWHERE,
		    VM_MAP_KERNEL_FLAGS_NONE,
		    VM_KERN_MEMORY_NONE,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0,
		    FALSE,
		    VM_PROT_DEFAULT,
		    VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
#if KASAN
		if (map->pmap == kernel_pmap) {
			kasan_notify_address(*addr, copy->size);
		}
#endif
	}

	/*
	 * Copyout the data from the kernel buffer to the target map.
	 */
	if (thread->map == map) {
		/*
		 * If the target map is the current map, just do
		 * the copy.
		 */
		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_t oldmap;

		/*
		 * If the target map is another map, assume the
		 * target's address space identity for the duration
		 * of the copy.
		 */
		/* hold a reference so the map can't go away while switched */
		vm_map_reference(map);
		oldmap = vm_map_switch(map);

		assert((vm_size_t)copy_size == copy_size);
		if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
			vm_map_copyout_kernel_buffer_failures++;
			kr = KERN_INVALID_ADDRESS;
		}

		/* restore the original address space identity */
		(void) vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}

	if (kr != KERN_SUCCESS) {
		/* the copy failed, clean up */
		if (!overwrite) {
			/*
			 * Deallocate the space we allocated in the target map.
			 */
			vm_map_remove(map,
			    vm_map_trunc_page(*addr,
			    VM_MAP_PAGE_MASK(map)),
			    vm_map_round_page((*addr +
			    vm_map_round_page(copy_size,
			    VM_MAP_PAGE_MASK(map))),
			    VM_MAP_PAGE_MASK(map)));
			*addr = 0;
		}
	} else {
		/* copy was successful, discard the copy structure */
		if (consume_on_success) {
			kfree_data(copy->cpy_kdata, copy_size);
			zfree(vm_map_copy_zone, copy);
		}
	}

	return kr;
}
10578
10579 /*
10580 * Routine: vm_map_copy_insert [internal use only]
10581 *
10582 * Description:
10583 * Link a copy chain ("copy") into a map at the
10584 * specified location (after "where").
10585 *
10586 * Callers of this function must call vm_map_copy_require on
10587 * previously created vm_map_copy_t or pass a newly created
10588 * one to ensure that it hasn't been forged.
10589 * Side effects:
10590 * The copy chain is destroyed.
10591 */
10592 static void
vm_map_copy_insert(vm_map_t map,vm_map_entry_t after_where,vm_map_copy_t copy)10593 vm_map_copy_insert(
10594 vm_map_t map,
10595 vm_map_entry_t after_where,
10596 vm_map_copy_t copy)
10597 {
10598 vm_map_entry_t entry;
10599
10600 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
10601 entry = vm_map_copy_first_entry(copy);
10602 vm_map_copy_entry_unlink(copy, entry);
10603 vm_map_store_entry_link(map, after_where, entry,
10604 VM_MAP_KERNEL_FLAGS_NONE);
10605 after_where = entry;
10606 }
10607 zfree(vm_map_copy_zone, copy);
10608 }
10609
10610 /*
10611 * Callers of this function must call vm_map_copy_require on
10612 * previously created vm_map_copy_t or pass a newly created
10613 * one to ensure that it hasn't been forged.
10614 */
10615 void
vm_map_copy_remap(vm_map_t map,vm_map_entry_t where,vm_map_copy_t copy,vm_map_offset_t adjustment,vm_prot_t cur_prot,vm_prot_t max_prot,vm_inherit_t inheritance)10616 vm_map_copy_remap(
10617 vm_map_t map,
10618 vm_map_entry_t where,
10619 vm_map_copy_t copy,
10620 vm_map_offset_t adjustment,
10621 vm_prot_t cur_prot,
10622 vm_prot_t max_prot,
10623 vm_inherit_t inheritance)
10624 {
10625 vm_map_entry_t copy_entry, new_entry;
10626
10627 for (copy_entry = vm_map_copy_first_entry(copy);
10628 copy_entry != vm_map_copy_to_entry(copy);
10629 copy_entry = copy_entry->vme_next) {
10630 /* get a new VM map entry for the map */
10631 new_entry = vm_map_entry_create(map);
10632 /* copy the "copy entry" to the new entry */
10633 vm_map_entry_copy(map, new_entry, copy_entry);
10634 /* adjust "start" and "end" */
10635 new_entry->vme_start += adjustment;
10636 new_entry->vme_end += adjustment;
10637 /* clear some attributes */
10638 new_entry->inheritance = inheritance;
10639 new_entry->protection = cur_prot;
10640 new_entry->max_protection = max_prot;
10641 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
10642 /* take an extra reference on the entry's "object" */
10643 if (new_entry->is_sub_map) {
10644 assert(!new_entry->use_pmap); /* not nested */
10645 vm_map_lock(VME_SUBMAP(new_entry));
10646 vm_map_reference(VME_SUBMAP(new_entry));
10647 vm_map_unlock(VME_SUBMAP(new_entry));
10648 } else {
10649 vm_object_reference(VME_OBJECT(new_entry));
10650 }
10651 /* insert the new entry in the map */
10652 vm_map_store_entry_link(map, where, new_entry,
10653 VM_MAP_KERNEL_FLAGS_NONE);
10654 /* continue inserting the "copy entries" after the new entry */
10655 where = new_entry;
10656 }
10657 }
10658
10659
10660 /*
10661 * Returns true if *size matches (or is in the range of) copy->size.
10662 * Upon returning true, the *size field is updated with the actual size of the
10663 * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
10664 */
10665 boolean_t
vm_map_copy_validate_size(vm_map_t dst_map,vm_map_copy_t copy,vm_map_size_t * size)10666 vm_map_copy_validate_size(
10667 vm_map_t dst_map,
10668 vm_map_copy_t copy,
10669 vm_map_size_t *size)
10670 {
10671 if (copy == VM_MAP_COPY_NULL) {
10672 return FALSE;
10673 }
10674
10675 /*
10676 * Assert that the vm_map_copy is coming from the right
10677 * zone and hasn't been forged
10678 */
10679 vm_map_copy_require(copy);
10680
10681 vm_map_size_t copy_sz = copy->size;
10682 vm_map_size_t sz = *size;
10683 switch (copy->type) {
10684 case VM_MAP_COPY_OBJECT:
10685 case VM_MAP_COPY_KERNEL_BUFFER:
10686 if (sz == copy_sz) {
10687 return TRUE;
10688 }
10689 break;
10690 case VM_MAP_COPY_ENTRY_LIST:
10691 /*
10692 * potential page-size rounding prevents us from exactly
10693 * validating this flavor of vm_map_copy, but we can at least
10694 * assert that it's within a range.
10695 */
10696 if (copy_sz >= sz &&
10697 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
10698 *size = copy_sz;
10699 return TRUE;
10700 }
10701 break;
10702 default:
10703 break;
10704 }
10705 return FALSE;
10706 }
10707
10708 /*
10709 * Routine: vm_map_copyout_size
10710 *
10711 * Description:
10712 * Copy out a copy chain ("copy") into newly-allocated
10713 * space in the destination map. Uses a prevalidated
10714 * size for the copy object (vm_map_copy_validate_size).
10715 *
10716 * If successful, consumes the copy object.
10717 * Otherwise, the caller is responsible for it.
10718 */
10719 kern_return_t
vm_map_copyout_size(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy,vm_map_size_t copy_size)10720 vm_map_copyout_size(
10721 vm_map_t dst_map,
10722 vm_map_address_t *dst_addr, /* OUT */
10723 vm_map_copy_t copy,
10724 vm_map_size_t copy_size)
10725 {
10726 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
10727 TRUE, /* consume_on_success */
10728 VM_PROT_DEFAULT,
10729 VM_PROT_ALL,
10730 VM_INHERIT_DEFAULT);
10731 }
10732
10733 /*
10734 * Routine: vm_map_copyout
10735 *
10736 * Description:
10737 * Copy out a copy chain ("copy") into newly-allocated
10738 * space in the destination map.
10739 *
10740 * If successful, consumes the copy object.
10741 * Otherwise, the caller is responsible for it.
10742 */
10743 kern_return_t
vm_map_copyout(vm_map_t dst_map,vm_map_address_t * dst_addr,vm_map_copy_t copy)10744 vm_map_copyout(
10745 vm_map_t dst_map,
10746 vm_map_address_t *dst_addr, /* OUT */
10747 vm_map_copy_t copy)
10748 {
10749 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
10750 TRUE, /* consume_on_success */
10751 VM_PROT_DEFAULT,
10752 VM_PROT_ALL,
10753 VM_INHERIT_DEFAULT);
10754 }
10755
/*
 * Routine:	vm_map_copyout_internal
 *
 * Description:
 *	Copy out a copy chain ("copy") into newly-allocated space in
 *	"dst_map", returning the chosen address in *dst_addr.
 *	Handles all three copy object flavors: VM_MAP_COPY_OBJECT,
 *	VM_MAP_COPY_KERNEL_BUFFER, and entry-list copies.
 *
 *	If "consume_on_success" is TRUE, the copy object is consumed
 *	(its entries are linked directly into dst_map) on success;
 *	otherwise its entries are remapped and the caller keeps
 *	ownership of "copy".
 *
 *	Returns KERN_SUCCESS, or an error from vm_map_enter /
 *	vm_map_locate_space / the page-size adjustment path, in which
 *	case the caller remains responsible for "copy".
 */
kern_return_t
vm_map_copyout_internal(
	vm_map_t                dst_map,
	vm_map_address_t        *dst_addr,      /* OUT */
	vm_map_copy_t           copy,
	vm_map_size_t           copy_size,
	boolean_t               consume_on_success,
	vm_prot_t               cur_protection,
	vm_prot_t               max_protection,
	vm_inherit_t            inheritance)
{
	vm_map_size_t           size;
	vm_map_size_t           adjustment;
	vm_map_offset_t         start;
	vm_object_offset_t      vm_copy_start;
	vm_map_entry_t          last;
	vm_map_entry_t          entry;
	vm_map_copy_t           original_copy;
	kern_return_t           kr;
	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;

	/*
	 * Check for null copy object.
	 */

	if (copy == VM_MAP_COPY_NULL) {
		*dst_addr = 0;
		return KERN_SUCCESS;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/* reject a caller-supplied size that disagrees with the copy object */
	if (copy->size != copy_size) {
		*dst_addr = 0;
		return KERN_FAILURE;
	}

	/*
	 * Check for special copy object, created
	 * by vm_map_copyin_object.
	 */

	if (copy->type == VM_MAP_COPY_OBJECT) {
		vm_object_t             object = copy->cpy_object;
		vm_object_offset_t      offset;

		/* map the whole pages covering the (possibly unaligned) range */
		offset = vm_object_trunc_page(copy->offset);
		size = vm_map_round_page((copy_size +
		    (vm_map_size_t)(copy->offset -
		    offset)),
		    VM_MAP_PAGE_MASK(dst_map));
		*dst_addr = 0;
		kr = vm_map_enter(dst_map, dst_addr, size,
		    (vm_map_offset_t) 0, VM_FLAGS_ANYWHERE,
		    VM_MAP_KERNEL_FLAGS_NONE,
		    VM_KERN_MEMORY_NONE,
		    object, offset, FALSE,
		    VM_PROT_DEFAULT, VM_PROT_ALL,
		    VM_INHERIT_DEFAULT);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		/* Account for non-pagealigned copy object */
		*dst_addr += (vm_map_offset_t)(copy->offset - offset);
		if (consume_on_success) {
			/* the object reference was donated to the map entry */
			zfree(vm_map_copy_zone, copy);
		}
		return KERN_SUCCESS;
	}

	/*
	 * Check for special kernel buffer allocated
	 * by new_ipc_kmsg_copyin.
	 */

	if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
		return vm_map_copyout_kernel_buffer(dst_map, dst_addr,
		    copy, copy_size, FALSE,
		    consume_on_success);
	}

	/* remember the caller's copy so we can discard it if we substitute one */
	original_copy = copy;
	if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
		/*
		 * The copy was built with a different page size than
		 * dst_map uses; re-shape it to dst_map's alignment.
		 */
		vm_map_copy_t target_copy;
		vm_map_offset_t overmap_start, overmap_end, trimmed_start;

		target_copy = VM_MAP_COPY_NULL;
		DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy,
			0,               /* offset */
			copy->size,      /* size */
			dst_map,
			TRUE,            /* copy */
			&target_copy,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
		if (kr != KERN_SUCCESS) {
			DEBUG4K_COPY("adjust failed 0x%x\n", kr);
			return kr;
		}
		DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
		if (target_copy != copy) {
			/* work on the adjusted copy from here on */
			copy = target_copy;
		}
		copy_size = copy->size;
	}

	/*
	 * Find space for the data
	 */

	vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
	    VM_MAP_COPY_PAGE_MASK(copy));
	size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
	    VM_MAP_COPY_PAGE_MASK(copy))
	    - vm_copy_start;


	if (dst_map == kernel_map) {
		/* kernel_map allocations for copied-in data go in the data range */
		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
	}

	/* dst_map stays locked until the copy is linked/remapped below */
	vm_map_lock(dst_map);
	kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
	    &start, &last);
	if (kr != KERN_SUCCESS) {
		vm_map_unlock(dst_map);
		return kr;
	}

	/* delta to relocate the copy's entries to their new addresses */
	adjustment = start - vm_copy_start;
	if (!consume_on_success) {
		/*
		 * We're not allowed to consume "copy", so we'll have to
		 * copy its map entries into the destination map below.
		 * No need to re-allocate map entries from the correct
		 * (pageable or not) zone, since we'll get new map entries
		 * during the transfer.
		 * We'll also adjust the map entries's "start" and "end"
		 * during the transfer, to keep "copy"'s entries consistent
		 * with its "offset".
		 */
		goto after_adjustments;
	}

	/*
	 * Since we're going to just drop the map
	 * entries from the copy into the destination
	 * map, they must come from the same pool.
	 */

	if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
		/*
		 * Mismatches occur when dealing with the default
		 * pager.
		 */
		vm_map_entry_t  next, new;

		/*
		 * Find the zone that the copies were allocated from
		 */

		entry = vm_map_copy_first_entry(copy);

		/*
		 * Reinitialize the copy so that vm_map_copy_entry_link
		 * will work.
		 */
		vm_map_store_copy_reset(copy, entry);
		copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;

		/*
		 * Copy each entry.
		 */
		while (entry != vm_map_copy_to_entry(copy)) {
			new = vm_map_copy_entry_create(copy);
			vm_map_entry_copy_full(new, entry);
			new->vme_no_copy_on_read = FALSE;
			assert(!new->iokit_acct);
			if (new->is_sub_map) {
				/* clr address space specifics */
				new->use_pmap = FALSE;
			}
			vm_map_copy_entry_link(copy,
			    vm_map_copy_last_entry(copy),
			    new);
			next = entry->vme_next;
			/* old entry goes back to its own zone */
			vm_map_entry_dispose(entry);
			entry = next;
		}
	}

	/*
	 * Adjust the addresses in the copy chain, and
	 * reset the region attributes.
	 */

	for (entry = vm_map_copy_first_entry(copy);
	    entry != vm_map_copy_to_entry(copy);
	    entry = entry->vme_next) {
		if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
			/*
			 * We're injecting this copy entry into a map that
			 * has the standard page alignment, so clear
			 * "map_aligned" (which might have been inherited
			 * from the original map entry).
			 */
			entry->map_aligned = FALSE;
		}

		entry->vme_start += adjustment;
		entry->vme_end += adjustment;

		if (entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
			    VM_MAP_PAGE_MASK(dst_map)));
			assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
			    VM_MAP_PAGE_MASK(dst_map)));
		}

		/* fresh mapping gets the default region attributes */
		entry->inheritance = VM_INHERIT_DEFAULT;
		entry->protection = VM_PROT_DEFAULT;
		entry->max_protection = VM_PROT_ALL;
		entry->behavior = VM_BEHAVIOR_DEFAULT;

		/*
		 * If the entry is now wired,
		 * map the pages into the destination map.
		 */
		if (entry->wired_count != 0) {
			vm_map_offset_t va;
			vm_object_offset_t       offset;
			vm_object_t object;
			vm_prot_t prot;
			int     type_of_fault;

			/* TODO4K would need to use actual page size */
			assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);

			object = VME_OBJECT(entry);
			offset = VME_OFFSET(entry);
			va = entry->vme_start;

			pmap_pageable(dst_map->pmap,
			    entry->vme_start,
			    entry->vme_end,
			    TRUE);

			/* enter each wired page into dst_map's pmap now */
			while (va < entry->vme_end) {
				vm_page_t       m;
				struct vm_object_fault_info fault_info = {};

				/*
				 * Look up the page in the object.
				 * Assert that the page will be found in the
				 * top object:
				 * either
				 *	the object was newly created by
				 *	vm_object_copy_slowly, and has
				 *	copies of all of the pages from
				 *	the source object
				 * or
				 *	the object was moved from the old
				 *	map entry; because the old map
				 *	entry was wired, all of the pages
				 *	were in the top-level object.
				 *	(XXX not true if we wire pages for
				 *	 reading)
				 */
				vm_object_lock(object);

				m = vm_page_lookup(object, offset);
				if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
				    m->vmp_absent) {
					panic("vm_map_copyout: wiring %p", m);
				}

				prot = entry->protection;

				if (override_nx(dst_map, VME_ALIAS(entry)) &&
				    prot) {
					prot |= VM_PROT_EXECUTE;
				}

				type_of_fault = DBG_CACHE_HIT_FAULT;

				fault_info.user_tag = VME_ALIAS(entry);
				fault_info.pmap_options = 0;
				if (entry->iokit_acct ||
				    (!entry->is_sub_map && !entry->use_pmap)) {
					/* alternate accounting for this mapping */
					fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
				}

				vm_fault_enter(m,
				    dst_map->pmap,
				    va,
				    PAGE_SIZE, 0,
				    prot,
				    prot,
				    VM_PAGE_WIRED(m),
				    FALSE,            /* change_wiring */
				    VM_KERN_MEMORY_NONE,            /* tag - not wiring */
				    &fault_info,
				    NULL,             /* need_retry */
				    &type_of_fault);

				vm_object_unlock(object);

				offset += PAGE_SIZE_64;
				va += PAGE_SIZE;
			}
		}
	}

after_adjustments:

	/*
	 * Correct the page alignment for the result
	 */

	*dst_addr = start + (copy->offset - vm_copy_start);

#if KASAN
	kasan_notify_address(*dst_addr, size);
#endif

	/*
	 * Update the hints and the map size
	 */

	if (consume_on_success) {
		SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
	} else {
		SAVE_HINT_MAP_WRITE(dst_map, last);
	}

	dst_map->size += size;

	/*
	 * Link in the copy
	 */

	if (consume_on_success) {
		vm_map_copy_insert(dst_map, last, copy);
		if (copy != original_copy) {
			/* we substituted an adjusted copy; drop the original */
			vm_map_copy_discard(original_copy);
			original_copy = VM_MAP_COPY_NULL;
		}
	} else {
		vm_map_copy_remap(dst_map, last, copy, adjustment,
		    cur_protection, max_protection,
		    inheritance);
		if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
			/* caller keeps the original; drop the adjusted copy */
			vm_map_copy_discard(copy);
			copy = original_copy;
		}
	}


	vm_map_unlock(dst_map);

	/*
	 * XXX	If wiring_required, call vm_map_pageable
	 */

	return KERN_SUCCESS;
}
11129
11130 /*
11131 * Routine: vm_map_copyin
11132 *
11133 * Description:
11134 * see vm_map_copyin_common. Exported via Unsupported.exports.
11135 *
11136 */
11137
11138 #undef vm_map_copyin
11139
11140 kern_return_t
vm_map_copyin(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,vm_map_copy_t * copy_result)11141 vm_map_copyin(
11142 vm_map_t src_map,
11143 vm_map_address_t src_addr,
11144 vm_map_size_t len,
11145 boolean_t src_destroy,
11146 vm_map_copy_t *copy_result) /* OUT */
11147 {
11148 return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11149 FALSE, copy_result, FALSE);
11150 }
11151
11152 /*
11153 * Routine: vm_map_copyin_common
11154 *
11155 * Description:
11156 * Copy the specified region (src_addr, len) from the
11157 * source address space (src_map), possibly removing
11158 * the region from the source address space (src_destroy).
11159 *
11160 * Returns:
11161 * A vm_map_copy_t object (copy_result), suitable for
11162 * insertion into another address space (using vm_map_copyout),
11163 * copying over another address space region (using
11164 * vm_map_copy_overwrite). If the copy is unused, it
11165 * should be destroyed (using vm_map_copy_discard).
11166 *
11167 * In/out conditions:
11168 * The source map should not be locked on entry.
11169 */
11170
/*
 * Per-level bookkeeping for descending through nested submaps during
 * vm_map_copyin_internal: one node is pushed for each submap entered,
 * so the walk can be unwound back to the parent map.
 */
typedef struct submap_map {
	vm_map_t        parent_map;     /* map we descended from */
	vm_map_offset_t base_start;     /* copy range start, in parent_map's address space */
	vm_map_offset_t base_end;       /* copy range end, in parent_map's address space */
	vm_map_size_t   base_len;       /* length of the range covered by this submap entry */
	struct submap_map *next;        /* next (shallower) level; NULL at the base map */
} submap_map_t;
11178
11179 kern_return_t
vm_map_copyin_common(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t src_destroy,__unused boolean_t src_volatile,vm_map_copy_t * copy_result,boolean_t use_maxprot)11180 vm_map_copyin_common(
11181 vm_map_t src_map,
11182 vm_map_address_t src_addr,
11183 vm_map_size_t len,
11184 boolean_t src_destroy,
11185 __unused boolean_t src_volatile,
11186 vm_map_copy_t *copy_result, /* OUT */
11187 boolean_t use_maxprot)
11188 {
11189 int flags;
11190
11191 flags = 0;
11192 if (src_destroy) {
11193 flags |= VM_MAP_COPYIN_SRC_DESTROY;
11194 }
11195 if (use_maxprot) {
11196 flags |= VM_MAP_COPYIN_USE_MAXPROT;
11197 }
11198 return vm_map_copyin_internal(src_map,
11199 src_addr,
11200 len,
11201 flags,
11202 copy_result);
11203 }
11204 kern_return_t
vm_map_copyin_internal(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,int flags,vm_map_copy_t * copy_result)11205 vm_map_copyin_internal(
11206 vm_map_t src_map,
11207 vm_map_address_t src_addr,
11208 vm_map_size_t len,
11209 int flags,
11210 vm_map_copy_t *copy_result) /* OUT */
11211 {
11212 vm_map_entry_t tmp_entry; /* Result of last map lookup --
11213 * in multi-level lookup, this
11214 * entry contains the actual
11215 * vm_object/offset.
11216 */
11217 vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */
11218
11219 vm_map_offset_t src_start; /* Start of current entry --
11220 * where copy is taking place now
11221 */
11222 vm_map_offset_t src_end; /* End of entire region to be
11223 * copied */
11224 vm_map_offset_t src_base;
11225 vm_map_t base_map = src_map;
11226 boolean_t map_share = FALSE;
11227 submap_map_t *parent_maps = NULL;
11228
11229 vm_map_copy_t copy; /* Resulting copy */
11230 vm_map_address_t copy_addr;
11231 vm_map_size_t copy_size;
11232 boolean_t src_destroy;
11233 boolean_t use_maxprot;
11234 boolean_t preserve_purgeable;
11235 boolean_t entry_was_shared;
11236 vm_map_entry_t saved_src_entry;
11237
11238 if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11239 return KERN_INVALID_ARGUMENT;
11240 }
11241
11242 src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11243 use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11244 preserve_purgeable =
11245 (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
11246
11247 /*
11248 * Check for copies of zero bytes.
11249 */
11250
11251 if (len == 0) {
11252 *copy_result = VM_MAP_COPY_NULL;
11253 return KERN_SUCCESS;
11254 }
11255
11256 /*
11257 * Check that the end address doesn't overflow
11258 */
11259 src_end = src_addr + len;
11260 if (src_end < src_addr) {
11261 return KERN_INVALID_ADDRESS;
11262 }
11263
11264 /*
11265 * Compute (page aligned) start and end of region
11266 */
11267 src_start = vm_map_trunc_page(src_addr,
11268 VM_MAP_PAGE_MASK(src_map));
11269 src_end = vm_map_round_page(src_end,
11270 VM_MAP_PAGE_MASK(src_map));
11271
11272 /*
11273 * If the copy is sufficiently small, use a kernel buffer instead
11274 * of making a virtual copy. The theory being that the cost of
11275 * setting up VM (and taking C-O-W faults) dominates the copy costs
11276 * for small regions.
11277 */
11278 if ((len <= msg_ool_size_small) &&
11279 !use_maxprot &&
11280 !preserve_purgeable &&
11281 !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
11282 /*
11283 * Since the "msg_ool_size_small" threshold was increased and
11284 * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
11285 * address space limits, we revert to doing a virtual copy if the
11286 * copied range goes beyond those limits. Otherwise, mach_vm_read()
11287 * of the commpage would now fail when it used to work.
11288 */
11289 (src_start >= vm_map_min(src_map) &&
11290 src_start < vm_map_max(src_map) &&
11291 src_end >= vm_map_min(src_map) &&
11292 src_end < vm_map_max(src_map))) {
11293 return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
11294 src_destroy, copy_result);
11295 }
11296
11297 /*
11298 * Allocate a header element for the list.
11299 *
11300 * Use the start and end in the header to
11301 * remember the endpoints prior to rounding.
11302 */
11303
11304 copy = vm_map_copy_allocate();
11305 copy->type = VM_MAP_COPY_ENTRY_LIST;
11306 copy->cpy_hdr.entries_pageable = TRUE;
11307 copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
11308
11309 vm_map_store_init( &(copy->cpy_hdr));
11310
11311 copy->offset = src_addr;
11312 copy->size = len;
11313
11314 new_entry = vm_map_copy_entry_create(copy);
11315
11316 #define RETURN(x) \
11317 MACRO_BEGIN \
11318 vm_map_unlock(src_map); \
11319 if(src_map != base_map) \
11320 vm_map_deallocate(src_map); \
11321 if (new_entry != VM_MAP_ENTRY_NULL) \
11322 vm_map_copy_entry_dispose(new_entry); \
11323 vm_map_copy_discard(copy); \
11324 { \
11325 submap_map_t *_ptr; \
11326 \
11327 for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
11328 parent_maps=parent_maps->next; \
11329 if (_ptr->parent_map != base_map) \
11330 vm_map_deallocate(_ptr->parent_map); \
11331 kfree_type(submap_map_t, _ptr); \
11332 } \
11333 } \
11334 MACRO_RETURN(x); \
11335 MACRO_END
11336
11337 /*
11338 * Find the beginning of the region.
11339 */
11340
11341 vm_map_lock(src_map);
11342
11343 /*
11344 * Lookup the original "src_addr" rather than the truncated
11345 * "src_start", in case "src_start" falls in a non-map-aligned
11346 * map entry *before* the map entry that contains "src_addr"...
11347 */
11348 if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
11349 RETURN(KERN_INVALID_ADDRESS);
11350 }
11351 if (!tmp_entry->is_sub_map) {
11352 /*
11353 * ... but clip to the map-rounded "src_start" rather than
11354 * "src_addr" to preserve map-alignment. We'll adjust the
11355 * first copy entry at the end, if needed.
11356 */
11357 vm_map_clip_start(src_map, tmp_entry, src_start);
11358 }
11359 if (src_start < tmp_entry->vme_start) {
11360 /*
11361 * Move "src_start" up to the start of the
11362 * first map entry to copy.
11363 */
11364 src_start = tmp_entry->vme_start;
11365 }
11366 /* set for later submap fix-up */
11367 copy_addr = src_start;
11368
11369 /*
11370 * Go through entries until we get to the end.
11371 */
11372
11373 while (TRUE) {
11374 vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */
11375 vm_map_size_t src_size; /* Size of source
11376 * map entry (in both
11377 * maps)
11378 */
11379
11380 vm_object_t src_object; /* Object to copy */
11381 vm_object_offset_t src_offset;
11382
11383 vm_object_t new_copy_object;/* vm_object_copy_* result */
11384
11385 boolean_t src_needs_copy; /* Should source map
11386 * be made read-only
11387 * for copy-on-write?
11388 */
11389
11390 boolean_t new_entry_needs_copy; /* Will new entry be COW? */
11391
11392 boolean_t was_wired; /* Was source wired? */
11393 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
11394 vm_map_version_t version; /* Version before locks
11395 * dropped to make copy
11396 */
11397 kern_return_t result; /* Return value from
11398 * copy_strategically.
11399 */
11400 while (tmp_entry->is_sub_map) {
11401 vm_map_size_t submap_len;
11402 submap_map_t *ptr;
11403
11404 ptr = kalloc_type(submap_map_t, Z_WAITOK);
11405 ptr->next = parent_maps;
11406 parent_maps = ptr;
11407 ptr->parent_map = src_map;
11408 ptr->base_start = src_start;
11409 ptr->base_end = src_end;
11410 submap_len = tmp_entry->vme_end - src_start;
11411 if (submap_len > (src_end - src_start)) {
11412 submap_len = src_end - src_start;
11413 }
11414 ptr->base_len = submap_len;
11415
11416 src_start -= tmp_entry->vme_start;
11417 src_start += VME_OFFSET(tmp_entry);
11418 src_end = src_start + submap_len;
11419 src_map = VME_SUBMAP(tmp_entry);
11420 vm_map_lock(src_map);
11421 /* keep an outstanding reference for all maps in */
11422 /* the parents tree except the base map */
11423 vm_map_reference(src_map);
11424 vm_map_unlock(ptr->parent_map);
11425 if (!vm_map_lookup_entry(
11426 src_map, src_start, &tmp_entry)) {
11427 RETURN(KERN_INVALID_ADDRESS);
11428 }
11429 map_share = TRUE;
11430 if (!tmp_entry->is_sub_map) {
11431 vm_map_clip_start(src_map, tmp_entry, src_start);
11432 }
11433 src_entry = tmp_entry;
11434 }
11435 /* we are now in the lowest level submap... */
11436
11437 if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
11438 (VME_OBJECT(tmp_entry)->phys_contiguous)) {
11439 /* This is not, supported for now.In future */
11440 /* we will need to detect the phys_contig */
11441 /* condition and then upgrade copy_slowly */
11442 /* to do physical copy from the device mem */
11443 /* based object. We can piggy-back off of */
11444 /* the was wired boolean to set-up the */
11445 /* proper handling */
11446 RETURN(KERN_PROTECTION_FAILURE);
11447 }
11448 /*
11449 * Create a new address map entry to hold the result.
11450 * Fill in the fields from the appropriate source entries.
11451 * We must unlock the source map to do this if we need
11452 * to allocate a map entry.
11453 */
11454 if (new_entry == VM_MAP_ENTRY_NULL) {
11455 version.main_timestamp = src_map->timestamp;
11456 vm_map_unlock(src_map);
11457
11458 new_entry = vm_map_copy_entry_create(copy);
11459
11460 vm_map_lock(src_map);
11461 if ((version.main_timestamp + 1) != src_map->timestamp) {
11462 if (!vm_map_lookup_entry(src_map, src_start,
11463 &tmp_entry)) {
11464 RETURN(KERN_INVALID_ADDRESS);
11465 }
11466 if (!tmp_entry->is_sub_map) {
11467 vm_map_clip_start(src_map, tmp_entry, src_start);
11468 }
11469 continue; /* restart w/ new tmp_entry */
11470 }
11471 }
11472
11473 /*
11474 * Verify that the region can be read.
11475 */
11476 if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
11477 !use_maxprot) ||
11478 (src_entry->max_protection & VM_PROT_READ) == 0) {
11479 RETURN(KERN_PROTECTION_FAILURE);
11480 }
11481
11482 /*
11483 * Clip against the endpoints of the entire region.
11484 */
11485
11486 vm_map_clip_end(src_map, src_entry, src_end);
11487
11488 src_size = src_entry->vme_end - src_start;
11489 src_object = VME_OBJECT(src_entry);
11490 src_offset = VME_OFFSET(src_entry);
11491 was_wired = (src_entry->wired_count != 0);
11492
11493 vm_map_entry_copy(src_map, new_entry, src_entry);
11494 if (new_entry->is_sub_map) {
11495 /* clr address space specifics */
11496 new_entry->use_pmap = FALSE;
11497 } else {
11498 /*
11499 * We're dealing with a copy-on-write operation,
11500 * so the resulting mapping should not inherit the
11501 * original mapping's accounting settings.
11502 * "iokit_acct" should have been cleared in
11503 * vm_map_entry_copy().
11504 * "use_pmap" should be reset to its default (TRUE)
11505 * so that the new mapping gets accounted for in
11506 * the task's memory footprint.
11507 */
11508 assert(!new_entry->iokit_acct);
11509 new_entry->use_pmap = TRUE;
11510 }
11511
11512 /*
11513 * Attempt non-blocking copy-on-write optimizations.
11514 */
11515
11516 /*
11517 * If we are destroying the source, and the object
11518 * is internal, we could move the object reference
11519 * from the source to the copy. The copy is
11520 * copy-on-write only if the source is.
11521 * We make another reference to the object, because
11522 * destroying the source entry will deallocate it.
11523 *
11524 * This memory transfer has to be atomic, (to prevent
11525 * the VM object from being shared or copied while
11526 * it's being moved here), so we could only do this
11527 * if we won't have to unlock the VM map until the
11528 * original mapping has been fully removed.
11529 */
11530
11531 RestartCopy:
11532 if ((src_object == VM_OBJECT_NULL ||
11533 (!was_wired && !map_share && !tmp_entry->is_shared
11534 && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
11535 vm_object_copy_quickly(
11536 VME_OBJECT(new_entry),
11537 src_offset,
11538 src_size,
11539 &src_needs_copy,
11540 &new_entry_needs_copy)) {
11541 new_entry->needs_copy = new_entry_needs_copy;
11542
11543 /*
11544 * Handle copy-on-write obligations
11545 */
11546
11547 if (src_needs_copy && !tmp_entry->needs_copy) {
11548 vm_prot_t prot;
11549
11550 prot = src_entry->protection & ~VM_PROT_WRITE;
11551
11552 if (override_nx(src_map, VME_ALIAS(src_entry))
11553 && prot) {
11554 prot |= VM_PROT_EXECUTE;
11555 }
11556
11557 vm_object_pmap_protect(
11558 src_object,
11559 src_offset,
11560 src_size,
11561 (src_entry->is_shared ?
11562 PMAP_NULL
11563 : src_map->pmap),
11564 VM_MAP_PAGE_SIZE(src_map),
11565 src_entry->vme_start,
11566 prot);
11567
11568 assert(tmp_entry->wired_count == 0);
11569 tmp_entry->needs_copy = TRUE;
11570 }
11571
11572 /*
11573 * The map has never been unlocked, so it's safe
11574 * to move to the next entry rather than doing
11575 * another lookup.
11576 */
11577
11578 goto CopySuccessful;
11579 }
11580
11581 entry_was_shared = tmp_entry->is_shared;
11582
11583 /*
11584 * Take an object reference, so that we may
11585 * release the map lock(s).
11586 */
11587
11588 assert(src_object != VM_OBJECT_NULL);
11589 vm_object_reference(src_object);
11590
11591 /*
11592 * Record the timestamp for later verification.
11593 * Unlock the map.
11594 */
11595
11596 version.main_timestamp = src_map->timestamp;
11597 vm_map_unlock(src_map); /* Increments timestamp once! */
11598 saved_src_entry = src_entry;
11599 tmp_entry = VM_MAP_ENTRY_NULL;
11600 src_entry = VM_MAP_ENTRY_NULL;
11601
11602 /*
11603 * Perform the copy
11604 */
11605
11606 if (was_wired ||
11607 (debug4k_no_cow_copyin &&
11608 VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
11609 CopySlowly:
11610 vm_object_lock(src_object);
11611 result = vm_object_copy_slowly(
11612 src_object,
11613 src_offset,
11614 src_size,
11615 THREAD_UNINT,
11616 &new_copy_object);
11617 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
11618 saved_used_for_jit = new_entry->used_for_jit;
11619 VME_OBJECT_SET(new_entry, new_copy_object);
11620 new_entry->used_for_jit = saved_used_for_jit;
11621 VME_OFFSET_SET(new_entry,
11622 src_offset - vm_object_trunc_page(src_offset));
11623 new_entry->needs_copy = FALSE;
11624 } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
11625 (entry_was_shared || map_share)) {
11626 vm_object_t new_object;
11627
11628 vm_object_lock_shared(src_object);
11629 new_object = vm_object_copy_delayed(
11630 src_object,
11631 src_offset,
11632 src_size,
11633 TRUE);
11634 if (new_object == VM_OBJECT_NULL) {
11635 goto CopySlowly;
11636 }
11637
11638 VME_OBJECT_SET(new_entry, new_object);
11639 assert(new_entry->wired_count == 0);
11640 new_entry->needs_copy = TRUE;
11641 assert(!new_entry->iokit_acct);
11642 assert(new_object->purgable == VM_PURGABLE_DENY);
11643 assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
11644 result = KERN_SUCCESS;
11645 } else {
11646 vm_object_offset_t new_offset;
11647 new_offset = VME_OFFSET(new_entry);
11648 result = vm_object_copy_strategically(src_object,
11649 src_offset,
11650 src_size,
11651 &new_copy_object,
11652 &new_offset,
11653 &new_entry_needs_copy);
11654 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
11655 saved_used_for_jit = new_entry->used_for_jit;
11656 VME_OBJECT_SET(new_entry, new_copy_object);
11657 new_entry->used_for_jit = saved_used_for_jit;
11658 if (new_offset != VME_OFFSET(new_entry)) {
11659 VME_OFFSET_SET(new_entry, new_offset);
11660 }
11661
11662 new_entry->needs_copy = new_entry_needs_copy;
11663 }
11664
11665 if (result == KERN_SUCCESS &&
11666 ((preserve_purgeable &&
11667 src_object->purgable != VM_PURGABLE_DENY) ||
11668 new_entry->used_for_jit)) {
11669 /*
11670 * Purgeable objects should be COPY_NONE, true share;
11671 * this should be propogated to the copy.
11672 *
11673 * Also force mappings the pmap specially protects to
11674 * be COPY_NONE; trying to COW these mappings would
11675 * change the effective protections, which could have
11676 * side effects if the pmap layer relies on the
11677 * specified protections.
11678 */
11679
11680 vm_object_t new_object;
11681
11682 new_object = VME_OBJECT(new_entry);
11683 assert(new_object != src_object);
11684 vm_object_lock(new_object);
11685 assert(new_object->ref_count == 1);
11686 assert(new_object->shadow == VM_OBJECT_NULL);
11687 assert(new_object->copy == VM_OBJECT_NULL);
11688 assert(new_object->vo_owner == NULL);
11689
11690 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
11691
11692 if (preserve_purgeable &&
11693 src_object->purgable != VM_PURGABLE_DENY) {
11694 new_object->true_share = TRUE;
11695
11696 /* start as non-volatile with no owner... */
11697 new_object->purgable = VM_PURGABLE_NONVOLATILE;
11698 vm_purgeable_nonvolatile_enqueue(new_object, NULL);
11699 /* ... and move to src_object's purgeable state */
11700 if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
11701 int state;
11702 state = src_object->purgable;
11703 vm_object_purgable_control(
11704 new_object,
11705 VM_PURGABLE_SET_STATE_FROM_KERNEL,
11706 &state);
11707 }
11708 /* no pmap accounting for purgeable objects */
11709 new_entry->use_pmap = FALSE;
11710 }
11711
11712 vm_object_unlock(new_object);
11713 new_object = VM_OBJECT_NULL;
11714 }
11715
11716 if (result != KERN_SUCCESS &&
11717 result != KERN_MEMORY_RESTART_COPY) {
11718 vm_map_lock(src_map);
11719 RETURN(result);
11720 }
11721
11722 /*
11723 * Throw away the extra reference
11724 */
11725
11726 vm_object_deallocate(src_object);
11727
11728 /*
11729 * Verify that the map has not substantially
11730 * changed while the copy was being made.
11731 */
11732
11733 vm_map_lock(src_map);
11734
11735 if ((version.main_timestamp + 1) == src_map->timestamp) {
11736 /* src_map hasn't changed: src_entry is still valid */
11737 src_entry = saved_src_entry;
11738 goto VerificationSuccessful;
11739 }
11740
11741 /*
11742 * Simple version comparison failed.
11743 *
11744 * Retry the lookup and verify that the
11745 * same object/offset are still present.
11746 *
11747 * [Note: a memory manager that colludes with
11748 * the calling task can detect that we have
11749 * cheated. While the map was unlocked, the
11750 * mapping could have been changed and restored.]
11751 */
11752
11753 if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
11754 if (result != KERN_MEMORY_RESTART_COPY) {
11755 vm_object_deallocate(VME_OBJECT(new_entry));
11756 VME_OBJECT_SET(new_entry, VM_OBJECT_NULL);
11757 /* reset accounting state */
11758 new_entry->iokit_acct = FALSE;
11759 new_entry->use_pmap = TRUE;
11760 }
11761 RETURN(KERN_INVALID_ADDRESS);
11762 }
11763
11764 src_entry = tmp_entry;
11765 vm_map_clip_start(src_map, src_entry, src_start);
11766
11767 if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
11768 !use_maxprot) ||
11769 ((src_entry->max_protection & VM_PROT_READ) == 0)) {
11770 goto VerificationFailed;
11771 }
11772
11773 if (src_entry->vme_end < new_entry->vme_end) {
11774 /*
11775 * This entry might have been shortened
11776 * (vm_map_clip_end) or been replaced with
11777 * an entry that ends closer to "src_start"
11778 * than before.
11779 * Adjust "new_entry" accordingly; copying
11780 * less memory would be correct but we also
11781 * redo the copy (see below) if the new entry
11782 * no longer points at the same object/offset.
11783 */
11784 assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
11785 VM_MAP_COPY_PAGE_MASK(copy)));
11786 new_entry->vme_end = src_entry->vme_end;
11787 src_size = new_entry->vme_end - src_start;
11788 } else if (src_entry->vme_end > new_entry->vme_end) {
11789 /*
11790 * This entry might have been extended
11791 * (vm_map_entry_simplify() or coalesce)
11792 * or been replaced with an entry that ends farther
11793 * from "src_start" than before.
11794 *
11795 * We've called vm_object_copy_*() only on
11796 * the previous <start:end> range, so we can't
11797 * just extend new_entry. We have to re-do
11798 * the copy based on the new entry as if it was
11799 * pointing at a different object/offset (see
11800 * "Verification failed" below).
11801 */
11802 }
11803
11804 if ((VME_OBJECT(src_entry) != src_object) ||
11805 (VME_OFFSET(src_entry) != src_offset) ||
11806 (src_entry->vme_end > new_entry->vme_end)) {
11807 /*
11808 * Verification failed.
11809 *
11810 * Start over with this top-level entry.
11811 */
11812
11813 VerificationFailed: ;
11814
11815 vm_object_deallocate(VME_OBJECT(new_entry));
11816 tmp_entry = src_entry;
11817 continue;
11818 }
11819
11820 /*
11821 * Verification succeeded.
11822 */
11823
11824 VerificationSuccessful:;
11825
11826 if (result == KERN_MEMORY_RESTART_COPY) {
11827 goto RestartCopy;
11828 }
11829
11830 /*
11831 * Copy succeeded.
11832 */
11833
11834 CopySuccessful: ;
11835
11836 /*
11837 * Link in the new copy entry.
11838 */
11839
11840 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
11841 new_entry);
11842
11843 /*
11844 * Determine whether the entire region
11845 * has been copied.
11846 */
11847 src_base = src_start;
11848 src_start = new_entry->vme_end;
11849 new_entry = VM_MAP_ENTRY_NULL;
11850 while ((src_start >= src_end) && (src_end != 0)) {
11851 submap_map_t *ptr;
11852
11853 if (src_map == base_map) {
11854 /* back to the top */
11855 break;
11856 }
11857
11858 ptr = parent_maps;
11859 assert(ptr != NULL);
11860 parent_maps = parent_maps->next;
11861
11862 /* fix up the damage we did in that submap */
11863 vm_map_simplify_range(src_map,
11864 src_base,
11865 src_end);
11866
11867 vm_map_unlock(src_map);
11868 vm_map_deallocate(src_map);
11869 vm_map_lock(ptr->parent_map);
11870 src_map = ptr->parent_map;
11871 src_base = ptr->base_start;
11872 src_start = ptr->base_start + ptr->base_len;
11873 src_end = ptr->base_end;
11874 if (!vm_map_lookup_entry(src_map,
11875 src_start,
11876 &tmp_entry) &&
11877 (src_end > src_start)) {
11878 RETURN(KERN_INVALID_ADDRESS);
11879 }
11880 kfree_type(submap_map_t, ptr);
11881 if (parent_maps == NULL) {
11882 map_share = FALSE;
11883 }
11884 src_entry = tmp_entry->vme_prev;
11885 }
11886
11887 if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
11888 (src_start >= src_addr + len) &&
11889 (src_addr + len != 0)) {
11890 /*
11891 * Stop copying now, even though we haven't reached
11892 * "src_end". We'll adjust the end of the last copy
11893 * entry at the end, if needed.
11894 *
		 * If src_map's alignment is different from the
11896 * system's page-alignment, there could be
11897 * extra non-map-aligned map entries between
11898 * the original (non-rounded) "src_addr + len"
11899 * and the rounded "src_end".
11900 * We do not want to copy those map entries since
11901 * they're not part of the copied range.
11902 */
11903 break;
11904 }
11905
11906 if ((src_start >= src_end) && (src_end != 0)) {
11907 break;
11908 }
11909
11910 /*
11911 * Verify that there are no gaps in the region
11912 */
11913
11914 tmp_entry = src_entry->vme_next;
11915 if ((tmp_entry->vme_start != src_start) ||
11916 (tmp_entry == vm_map_to_entry(src_map))) {
11917 RETURN(KERN_INVALID_ADDRESS);
11918 }
11919 }
11920
11921 /*
11922 * If the source should be destroyed, do it now, since the
11923 * copy was successful.
11924 */
11925 if (src_destroy) {
11926 (void)vm_map_remove_and_unlock(src_map,
11927 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11928 src_end,
11929 ((src_map == kernel_map) ?
11930 VM_MAP_REMOVE_KUNWIRE :
11931 VM_MAP_REMOVE_NO_FLAGS));
11932 } else {
11933 /* fix up the damage we did in the base map */
11934 vm_map_simplify_range(
11935 src_map,
11936 vm_map_trunc_page(src_addr,
11937 VM_MAP_PAGE_MASK(src_map)),
11938 vm_map_round_page(src_end,
11939 VM_MAP_PAGE_MASK(src_map)));
11940 vm_map_unlock(src_map);
11941 }
11942
11943 tmp_entry = VM_MAP_ENTRY_NULL;
11944
11945 if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
11946 VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
11947 vm_map_offset_t original_start, original_offset, original_end;
11948
11949 assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
11950
11951 /* adjust alignment of first copy_entry's "vme_start" */
11952 tmp_entry = vm_map_copy_first_entry(copy);
11953 if (tmp_entry != vm_map_copy_to_entry(copy)) {
11954 vm_map_offset_t adjustment;
11955
11956 original_start = tmp_entry->vme_start;
11957 original_offset = VME_OFFSET(tmp_entry);
11958
11959 /* map-align the start of the first copy entry... */
11960 adjustment = (tmp_entry->vme_start -
11961 vm_map_trunc_page(
11962 tmp_entry->vme_start,
11963 VM_MAP_PAGE_MASK(src_map)));
11964 tmp_entry->vme_start -= adjustment;
11965 VME_OFFSET_SET(tmp_entry,
11966 VME_OFFSET(tmp_entry) - adjustment);
11967 copy_addr -= adjustment;
11968 assert(tmp_entry->vme_start < tmp_entry->vme_end);
11969 /* ... adjust for mis-aligned start of copy range */
11970 adjustment =
11971 (vm_map_trunc_page(copy->offset,
11972 PAGE_MASK) -
11973 vm_map_trunc_page(copy->offset,
11974 VM_MAP_PAGE_MASK(src_map)));
11975 if (adjustment) {
11976 assert(page_aligned(adjustment));
11977 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
11978 tmp_entry->vme_start += adjustment;
11979 VME_OFFSET_SET(tmp_entry,
11980 (VME_OFFSET(tmp_entry) +
11981 adjustment));
11982 copy_addr += adjustment;
11983 assert(tmp_entry->vme_start < tmp_entry->vme_end);
11984 }
11985
11986 /*
11987 * Assert that the adjustments haven't exposed
11988 * more than was originally copied...
11989 */
11990 assert(tmp_entry->vme_start >= original_start);
11991 assert(VME_OFFSET(tmp_entry) >= original_offset);
11992 /*
			 * ... and that it did not adjust outside of
			 * a single 16K page.
11995 */
11996 assert(vm_map_trunc_page(tmp_entry->vme_start,
11997 VM_MAP_PAGE_MASK(src_map)) ==
11998 vm_map_trunc_page(original_start,
11999 VM_MAP_PAGE_MASK(src_map)));
12000 }
12001
12002 /* adjust alignment of last copy_entry's "vme_end" */
12003 tmp_entry = vm_map_copy_last_entry(copy);
12004 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12005 vm_map_offset_t adjustment;
12006
12007 original_end = tmp_entry->vme_end;
12008
12009 /* map-align the end of the last copy entry... */
12010 tmp_entry->vme_end =
12011 vm_map_round_page(tmp_entry->vme_end,
12012 VM_MAP_PAGE_MASK(src_map));
12013 /* ... adjust for mis-aligned end of copy range */
12014 adjustment =
12015 (vm_map_round_page((copy->offset +
12016 copy->size),
12017 VM_MAP_PAGE_MASK(src_map)) -
12018 vm_map_round_page((copy->offset +
12019 copy->size),
12020 PAGE_MASK));
12021 if (adjustment) {
12022 assert(page_aligned(adjustment));
12023 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12024 tmp_entry->vme_end -= adjustment;
12025 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12026 }
12027
12028 /*
12029 * Assert that the adjustments haven't exposed
12030 * more than was originally copied...
12031 */
12032 assert(tmp_entry->vme_end <= original_end);
12033 /*
			 * ... and that it did not adjust outside of
			 * a single 16K page.
12036 */
12037 assert(vm_map_round_page(tmp_entry->vme_end,
12038 VM_MAP_PAGE_MASK(src_map)) ==
12039 vm_map_round_page(original_end,
12040 VM_MAP_PAGE_MASK(src_map)));
12041 }
12042 }
12043
12044 /* Fix-up start and end points in copy. This is necessary */
12045 /* when the various entries in the copy object were picked */
12046 /* up from different sub-maps */
12047
12048 tmp_entry = vm_map_copy_first_entry(copy);
12049 copy_size = 0; /* compute actual size */
12050 while (tmp_entry != vm_map_copy_to_entry(copy)) {
12051 assert(VM_MAP_PAGE_ALIGNED(
12052 copy_addr + (tmp_entry->vme_end -
12053 tmp_entry->vme_start),
12054 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12055 assert(VM_MAP_PAGE_ALIGNED(
12056 copy_addr,
12057 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12058
12059 /*
12060 * The copy_entries will be injected directly into the
12061 * destination map and might not be "map aligned" there...
12062 */
12063 tmp_entry->map_aligned = FALSE;
12064
12065 tmp_entry->vme_end = copy_addr +
12066 (tmp_entry->vme_end - tmp_entry->vme_start);
12067 tmp_entry->vme_start = copy_addr;
12068 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12069 copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12070 copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12071 tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12072 }
12073
12074 if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12075 copy_size < copy->size) {
12076 /*
12077 * The actual size of the VM map copy is smaller than what
12078 * was requested by the caller. This must be because some
12079 * PAGE_SIZE-sized pages are missing at the end of the last
12080 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12081 * The caller might not have been aware of those missing
12082 * pages and might not want to be aware of it, which is
12083 * fine as long as they don't try to access (and crash on)
12084 * those missing pages.
12085 * Let's adjust the size of the "copy", to avoid failing
12086 * in vm_map_copyout() or vm_map_copy_overwrite().
12087 */
12088 assert(vm_map_round_page(copy_size,
12089 VM_MAP_PAGE_MASK(src_map)) ==
12090 vm_map_round_page(copy->size,
12091 VM_MAP_PAGE_MASK(src_map)));
12092 copy->size = copy_size;
12093 }
12094
12095 *copy_result = copy;
12096 return KERN_SUCCESS;
12097
12098 #undef RETURN
12099 }
12100
12101 kern_return_t
vm_map_copy_extract(vm_map_t src_map,vm_map_address_t src_addr,vm_map_size_t len,boolean_t do_copy,vm_map_copy_t * copy_result,vm_prot_t * cur_prot,vm_prot_t * max_prot,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)12102 vm_map_copy_extract(
12103 vm_map_t src_map,
12104 vm_map_address_t src_addr,
12105 vm_map_size_t len,
12106 boolean_t do_copy,
12107 vm_map_copy_t *copy_result, /* OUT */
12108 vm_prot_t *cur_prot, /* IN/OUT */
12109 vm_prot_t *max_prot, /* IN/OUT */
12110 vm_inherit_t inheritance,
12111 vm_map_kernel_flags_t vmk_flags)
12112 {
12113 vm_map_copy_t copy;
12114 kern_return_t kr;
12115 vm_prot_t required_cur_prot, required_max_prot;
12116
12117 /*
12118 * Check for copies of zero bytes.
12119 */
12120
12121 if (len == 0) {
12122 *copy_result = VM_MAP_COPY_NULL;
12123 return KERN_SUCCESS;
12124 }
12125
12126 /*
12127 * Check that the end address doesn't overflow
12128 */
12129 if (src_addr + len < src_addr) {
12130 return KERN_INVALID_ADDRESS;
12131 }
12132
12133 if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12134 DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12135 }
12136
12137 required_cur_prot = *cur_prot;
12138 required_max_prot = *max_prot;
12139
12140 /*
12141 * Allocate a header element for the list.
12142 *
12143 * Use the start and end in the header to
12144 * remember the endpoints prior to rounding.
12145 */
12146
12147 copy = vm_map_copy_allocate();
12148 copy->type = VM_MAP_COPY_ENTRY_LIST;
12149 copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12150
12151 vm_map_store_init(©->cpy_hdr);
12152
12153 copy->offset = 0;
12154 copy->size = len;
12155
12156 kr = vm_map_remap_extract(src_map,
12157 src_addr,
12158 len,
12159 do_copy, /* copy */
12160 ©->cpy_hdr,
12161 cur_prot, /* IN/OUT */
12162 max_prot, /* IN/OUT */
12163 inheritance,
12164 vmk_flags);
12165 if (kr != KERN_SUCCESS) {
12166 vm_map_copy_discard(copy);
12167 return kr;
12168 }
12169 if (required_cur_prot != VM_PROT_NONE) {
12170 assert((*cur_prot & required_cur_prot) == required_cur_prot);
12171 assert((*max_prot & required_max_prot) == required_max_prot);
12172 }
12173
12174 *copy_result = copy;
12175 return KERN_SUCCESS;
12176 }
12177
12178 /*
12179 * vm_map_copyin_object:
12180 *
12181 * Create a copy object from an object.
12182 * Our caller donates an object reference.
12183 */
12184
12185 kern_return_t
vm_map_copyin_object(vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_map_copy_t * copy_result)12186 vm_map_copyin_object(
12187 vm_object_t object,
12188 vm_object_offset_t offset, /* offset of region in object */
12189 vm_object_size_t size, /* size of region in object */
12190 vm_map_copy_t *copy_result) /* OUT */
12191 {
12192 vm_map_copy_t copy; /* Resulting copy */
12193
12194 /*
12195 * We drop the object into a special copy object
12196 * that contains the object directly.
12197 */
12198
12199 copy = vm_map_copy_allocate();
12200 copy->type = VM_MAP_COPY_OBJECT;
12201 copy->cpy_object = object;
12202 copy->offset = offset;
12203 copy->size = size;
12204
12205 *copy_result = copy;
12206 return KERN_SUCCESS;
12207 }
12208
/*
 * vm_map_fork_share:
 *
 * Handle a VM_INHERIT_SHARE entry during vm_map_fork(): clone
 * "old_entry" from "old_map" into "new_map" so that both maps share
 * the same backing object (or submap).  May replace the entry's
 * object with a shadow first, to preserve pending symmetric copies.
 *
 * Called with both maps locked; appends the clone at the end of
 * "new_map".
 */
static void
vm_map_fork_share(
	vm_map_t        old_map,
	vm_map_entry_t  old_entry,
	vm_map_t        new_map)
{
	vm_object_t object;
	vm_map_entry_t new_entry;

	/*
	 * New sharing code. New map entry
	 * references original object. Internal
	 * objects use asynchronous copy algorithm for
	 * future copies. First make sure we have
	 * the right object. If we need a shadow,
	 * or someone else already has one, then
	 * make a new shadow and share it.
	 */

	object = VME_OBJECT(old_entry);
	if (old_entry->is_sub_map) {
		/* submaps are never wired */
		assert(old_entry->wired_count == 0);
#ifndef NO_NESTED_PMAP
		if (old_entry->use_pmap) {
			kern_return_t result;

			/* share the submap's translations in the new pmap */
			result = pmap_nest(new_map->pmap,
			    (VME_SUBMAP(old_entry))->pmap,
			    (addr64_t)old_entry->vme_start,
			    (uint64_t)(old_entry->vme_end - old_entry->vme_start));
			if (result) {
				panic("vm_map_fork_share: pmap_nest failed!");
			}
		}
#endif  /* NO_NESTED_PMAP */
	} else if (object == VM_OBJECT_NULL) {
		/* lazily-allocated entry: materialize a zero-fill object now */
		object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
		    old_entry->vme_start));
		VME_OFFSET_SET(old_entry, 0);
		VME_OBJECT_SET(old_entry, object);
		old_entry->use_pmap = TRUE;
//		assert(!old_entry->needs_copy);
	} else if (object->copy_strategy !=
	    MEMORY_OBJECT_COPY_SYMMETRIC) {
		/*
		 * We are already using an asymmetric
		 * copy, and therefore we already have
		 * the right object.
		 */

		assert(!old_entry->needs_copy);
	} else if (old_entry->needs_copy ||     /* case 1 */
	    object->shadowed ||                 /* case 2 */
	    (!object->true_share &&             /* case 3 */
	    !old_entry->is_shared &&
	    (object->vo_size >
	    (vm_map_size_t)(old_entry->vme_end -
	    old_entry->vme_start)))) {
		/*
		 * We need to create a shadow.
		 * There are three cases here.
		 * In the first case, we need to
		 * complete a deferred symmetrical
		 * copy that we participated in.
		 * In the second and third cases,
		 * we need to create the shadow so
		 * that changes that we make to the
		 * object do not interfere with
		 * any symmetrical copies which
		 * have occurred (case 2) or which
		 * might occur (case 3).
		 *
		 * The first case is when we had
		 * deferred shadow object creation
		 * via the entry->needs_copy mechanism.
		 * This mechanism only works when
		 * only one entry points to the source
		 * object, and we are about to create
		 * a second entry pointing to the
		 * same object. The problem is that
		 * there is no way of mapping from
		 * an object to the entries pointing
		 * to it. (Deferred shadow creation
		 * works with one entry because it occurs
		 * at fault time, and we walk from the
		 * entry to the object when handling
		 * the fault.)
		 *
		 * The second case is when the object
		 * to be shared has already been copied
		 * with a symmetric copy, but we point
		 * directly to the object without
		 * needs_copy set in our entry. (This
		 * can happen because different ranges
		 * of an object can be pointed to by
		 * different entries. In particular,
		 * a single entry pointing to an object
		 * can be split by a call to vm_inherit,
		 * which, combined with task_create, can
		 * result in the different entries
		 * having different needs_copy values.)
		 * The shadowed flag in the object allows
		 * us to detect this case. The problem
		 * with this case is that if this object
		 * has or will have shadows, then we
		 * must not perform an asymmetric copy
		 * of this object, since such a copy
		 * allows the object to be changed, which
		 * will break the previous symmetrical
		 * copies (which rely upon the object
		 * not changing). In a sense, the shadowed
		 * flag says "don't change this object".
		 * We fix this by creating a shadow
		 * object for this object, and sharing
		 * that. This works because we are free
		 * to change the shadow object (and thus
		 * to use an asymmetric copy strategy);
		 * this is also semantically correct,
		 * since this object is temporary, and
		 * therefore a copy of the object is
		 * as good as the object itself. (This
		 * is not true for permanent objects,
		 * since the pager needs to see changes,
		 * which won't happen if the changes
		 * are made to a copy.)
		 *
		 * The third case is when the object
		 * to be shared has parts sticking
		 * outside of the entry we're working
		 * with, and thus may in the future
		 * be subject to a symmetrical copy.
		 * (This is a preemptive version of
		 * case 2.)
		 */
		VME_OBJECT_SHADOW(old_entry,
		    (vm_map_size_t) (old_entry->vme_end -
		    old_entry->vme_start));

		/*
		 * If we're making a shadow for other than
		 * copy on write reasons, then we have
		 * to remove write permission.
		 */

		if (!old_entry->needs_copy &&
		    (old_entry->protection & VM_PROT_WRITE)) {
			vm_prot_t prot;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

			prot = old_entry->protection & ~VM_PROT_WRITE;

			assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

			if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
				prot |= VM_PROT_EXECUTE;
			}


			if (old_map->mapped_in_other_pmaps) {
				/* can't tell which pmaps map it: protect via the object */
				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					PMAP_NULL,
					PAGE_SIZE,
					old_entry->vme_start,
					prot);
			} else {
				pmap_protect(old_map->pmap,
				    old_entry->vme_start,
				    old_entry->vme_end,
				    prot);
			}
		}

		/* the shadow now satisfies any deferred copy obligation */
		old_entry->needs_copy = FALSE;
		object = VME_OBJECT(old_entry);
	}


	/*
	 * If object was using a symmetric copy strategy,
	 * change its copy strategy to the default
	 * asymmetric copy strategy, which is copy_delay
	 * in the non-norma case and copy_call in the
	 * norma case. Bump the reference count for the
	 * new entry.
	 */

	if (old_entry->is_sub_map) {
		vm_map_lock(VME_SUBMAP(old_entry));
		vm_map_reference(VME_SUBMAP(old_entry));
		vm_map_unlock(VME_SUBMAP(old_entry));
	} else {
		vm_object_lock(object);
		vm_object_reference_locked(object);
		if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
			object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		}
		vm_object_unlock(object);
	}

	/*
	 * Clone the entry, using object ref from above.
	 * Mark both entries as shared.
	 */

	new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
	vm_map_entry_copy(old_map, new_entry, old_entry);
	old_entry->is_shared = TRUE;
	new_entry->is_shared = TRUE;

	/*
	 * We're dealing with a shared mapping, so the resulting mapping
	 * should inherit some of the original mapping's accounting settings.
	 * "iokit_acct" should have been cleared in vm_map_entry_copy().
	 * "use_pmap" should stay the same as before (if it hasn't been reset
	 * to TRUE when we cleared "iokit_acct").
	 */
	assert(!new_entry->iokit_acct);

	/*
	 * If old entry's inheritance is VM_INHERIT_NONE,
	 * the new entry is for corpse fork, remove the
	 * write permission from the new entry.
	 */
	if (old_entry->inheritance == VM_INHERIT_NONE) {
		new_entry->protection &= ~VM_PROT_WRITE;
		new_entry->max_protection &= ~VM_PROT_WRITE;
	}

	/*
	 * Insert the entry into the new map -- we
	 * know we're inserting at the end of the new
	 * map.
	 */

	vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
	    VM_MAP_KERNEL_FLAGS_NONE);

	/*
	 * Update the physical map
	 */

	if (old_entry->is_sub_map) {
		/* Bill Angell pmap support goes here */
	} else {
		/* pre-copy the translations so the child avoids soft faults */
		pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
		    old_entry->vme_end - old_entry->vme_start,
		    old_entry->vme_start);
	}
}
12463
/*
 * vm_map_fork_copy:
 *
 * Slow-path helper for vm_map_fork()'s VM_INHERIT_COPY case: copy the
 * entry at *old_entry_p via vm_map_copyin_internal() and insert the
 * result at the end of "new_map".
 *
 * Called and returns with "old_map" locked, but the lock is dropped
 * while the copy is made, so the map may change underneath us.
 * On return, *old_entry_p is advanced to the entry where the fork
 * traversal should resume.
 *
 * Returns TRUE if the entry was copied, FALSE if it was skipped
 * (e.g. the range became unreadable or disappeared while unlocked).
 */
static boolean_t
vm_map_fork_copy(
	vm_map_t        old_map,
	vm_map_entry_t  *old_entry_p,
	vm_map_t        new_map,
	int             vm_map_copyin_flags)
{
	vm_map_entry_t old_entry = *old_entry_p;
	/* capture the range now: "old_entry" may be clipped/freed while unlocked */
	vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
	vm_map_offset_t start = old_entry->vme_start;
	vm_map_copy_t copy;
	vm_map_entry_t last = vm_map_last_entry(new_map);

	vm_map_unlock(old_map);
	/*
	 * Use maxprot version of copyin because we
	 * care about whether this memory can ever
	 * be accessed, not just whether it's accessible
	 * right now.
	 */
	vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
	if (vm_map_copyin_internal(old_map, start, entry_size,
	    vm_map_copyin_flags, &copy)
	    != KERN_SUCCESS) {
		/*
		 * The map might have changed while it
		 * was unlocked, check it again. Skip
		 * any blank space or permanently
		 * unreadable region.
		 */
		vm_map_lock(old_map);
		if (!vm_map_lookup_entry(old_map, start, &last) ||
		    (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
			last = last->vme_next;
		}
		*old_entry_p = last;

		/*
		 * XXX For some error returns, want to
		 * XXX skip to the next element. Note
		 * that INVALID_ADDRESS and
		 * PROTECTION_FAILURE are handled above.
		 */

		return FALSE;
	}

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(copy);

	/*
	 * Insert the copy into the new map
	 */
	vm_map_copy_insert(new_map, last, copy);

	/*
	 * Pick up the traversal at the end of
	 * the copied region.
	 */

	vm_map_lock(old_map);
	start += entry_size;
	if (!vm_map_lookup_entry(old_map, start, &last)) {
		/* no entry contains "start": resume at the next one */
		last = last->vme_next;
	} else {
		if (last->vme_start == start) {
			/*
			 * No need to clip here and we don't
			 * want to cause any unnecessary
			 * unnesting...
			 */
		} else {
			vm_map_clip_start(old_map, last, start);
		}
	}
	*old_entry_p = last;

	return TRUE;
}
12546
12547 /*
12548 * vm_map_fork:
12549 *
12550 * Create and return a new map based on the old
12551 * map, according to the inheritance values on the
12552 * regions in that map and the options.
12553 *
12554 * The source map must not be locked.
12555 */
vm_map_t
vm_map_fork(
	ledger_t        ledger,
	vm_map_t        old_map,
	int             options)
{
	pmap_t new_pmap;
	vm_map_t new_map;
	vm_map_entry_t old_entry;       /* traversal cursor over old_map */
	vm_map_size_t new_size = 0, entry_size;
	vm_map_entry_t new_entry;
	boolean_t src_needs_copy;
	boolean_t new_entry_needs_copy;
	boolean_t pmap_is64bit;
	int vm_map_copyin_flags;
	vm_inherit_t old_entry_inheritance;
	int map_create_options;
	kern_return_t footprint_collect_kr;

	/* reject any option bits we don't understand */
	if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
	    VM_MAP_FORK_PRESERVE_PURGEABLE |
	    VM_MAP_FORK_CORPSE_FOOTPRINT)) {
		/* unsupported option */
		return VM_MAP_NULL;
	}

	/* the child pmap must match the parent's address-space width */
	pmap_is64bit =
#if defined(__i386__) || defined(__x86_64__)
	    old_map->pmap->pm_task_map != TASK_MAP_32BIT;
#elif defined(__arm64__)
	    old_map->pmap->is_64bit;
#elif defined(__arm__)
	    FALSE;
#else
#error Unknown architecture.
#endif

	unsigned int pmap_flags = 0;
	pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
#if defined(HAS_APPLE_PAC)
	/* inherit the parent's JOP-disable state on PAC hardware */
	pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
#endif
#if PMAP_CREATE_FORCE_4K_PAGES
	if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
	    PAGE_SIZE != FOURK_PAGE_SIZE) {
		pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
	}
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
	if (new_pmap == NULL) {
		return VM_MAP_NULL;
	}

	/* hold a reference on old_map for the duration of the fork */
	vm_map_reference(old_map);
	vm_map_lock(old_map);

	map_create_options = 0;
	if (old_map->hdr.entries_pageable) {
		map_create_options |= VM_MAP_CREATE_PAGEABLE;
	}
	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
		/* only initialized when footprint collection is requested */
		footprint_collect_kr = KERN_SUCCESS;
	}
	new_map = vm_map_create_options(new_pmap,
	    old_map->min_offset,
	    old_map->max_offset,
	    map_create_options);
	/* inherit cs_enforcement */
	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
	vm_map_lock(new_map);
	vm_commit_pagezero_status(new_map);
	/* inherit the parent map's page size */
	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));

	/* ensure PMAP_CS structures are prepared for the fork */
	pmap_cs_fork_prepare(old_map->pmap, new_pmap);

	for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
		/*
		 * Abort any corpse collection if the system is shutting down.
		 */
		if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    get_system_inshutdown()) {
			vm_map_corpse_footprint_collect_done(new_map);
			vm_map_unlock(new_map);
			vm_map_unlock(old_map);
			vm_map_deallocate(new_map);
			vm_map_deallocate(old_map);
			printf("Aborting corpse map due to system shutdown\n");
			return VM_MAP_NULL;
		}

		entry_size = old_entry->vme_end - old_entry->vme_start;

		old_entry_inheritance = old_entry->inheritance;
		/*
		 * If caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
		 * share VM_INHERIT_NONE entries that are not backed by a
		 * device pager.
		 */
		if (old_entry_inheritance == VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
		    (old_entry->protection & VM_PROT_READ) &&
		    !(!old_entry->is_sub_map &&
		    VME_OBJECT(old_entry) != NULL &&
		    VME_OBJECT(old_entry)->pager != NULL &&
		    is_device_pager_ops(
			    VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
			old_entry_inheritance = VM_INHERIT_SHARE;
		}

		/* short-circuit keeps footprint_collect_kr unread unless initialized */
		if (old_entry_inheritance != VM_INHERIT_NONE &&
		    (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
		    footprint_collect_kr == KERN_SUCCESS) {
			/*
			 * The corpse won't have old_map->pmap to query
			 * footprint information, so collect that data now
			 * and store it in new_map->vmmap_corpse_footprint
			 * for later autopsy.
			 */
			footprint_collect_kr =
			    vm_map_corpse_footprint_collect(old_map,
			    old_entry,
			    new_map);
		}

		switch (old_entry_inheritance) {
		case VM_INHERIT_NONE:
			break;

		case VM_INHERIT_SHARE:
			vm_map_fork_share(old_map, old_entry, new_map);
			new_size += entry_size;
			break;

		case VM_INHERIT_COPY:

			/*
			 * Inline the copy_quickly case;
			 * upon failure, fall back on call
			 * to vm_map_fork_copy.
			 */

			if (old_entry->is_sub_map) {
				break;
			}
			if ((old_entry->wired_count != 0) ||
			    ((VME_OBJECT(old_entry) != NULL) &&
			    (VME_OBJECT(old_entry)->true_share))) {
				goto slow_vm_map_fork_copy;
			}

			new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
			vm_map_entry_copy(old_map, new_entry, old_entry);
			if (old_entry->permanent) {
				/* inherit "permanent" on fork() */
				new_entry->permanent = TRUE;
			}

			if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
				new_map->jit_entry_exists = TRUE;
			}

			if (new_entry->is_sub_map) {
				/* clear address space specifics */
				new_entry->use_pmap = FALSE;
			} else {
				/*
				 * We're dealing with a copy-on-write operation,
				 * so the resulting mapping should not inherit
				 * the original mapping's accounting settings.
				 * "iokit_acct" should have been cleared in
				 * vm_map_entry_copy().
				 * "use_pmap" should be reset to its default
				 * (TRUE) so that the new mapping gets
				 * accounted for in the task's memory footprint.
				 */
				assert(!new_entry->iokit_acct);
				new_entry->use_pmap = TRUE;
			}

			if (!vm_object_copy_quickly(
				    VME_OBJECT(new_entry),
				    VME_OFFSET(old_entry),
				    (old_entry->vme_end -
				    old_entry->vme_start),
				    &src_needs_copy,
				    &new_entry_needs_copy)) {
				/* quick copy refused: discard and take the slow path */
				vm_map_entry_dispose(new_entry);
				goto slow_vm_map_fork_copy;
			}

			/*
			 * Handle copy-on-write obligations
			 */

			if (src_needs_copy && !old_entry->needs_copy) {
				vm_prot_t prot;

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection));

				/* write-protect the parent's mapping to trigger COW */
				prot = old_entry->protection & ~VM_PROT_WRITE;

				if (override_nx(old_map, VME_ALIAS(old_entry))
				    && prot) {
					prot |= VM_PROT_EXECUTE;
				}

				assert(!pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot));

				vm_object_pmap_protect(
					VME_OBJECT(old_entry),
					VME_OFFSET(old_entry),
					(old_entry->vme_end -
					old_entry->vme_start),
					((old_entry->is_shared
					|| old_map->mapped_in_other_pmaps)
					? PMAP_NULL :
					old_map->pmap),
					VM_MAP_PAGE_SIZE(old_map),
					old_entry->vme_start,
					prot);

				assert(old_entry->wired_count == 0);
				old_entry->needs_copy = TRUE;
			}
			new_entry->needs_copy = new_entry_needs_copy;

			/*
			 * Insert the entry at the end
			 * of the map.
			 */

			vm_map_store_entry_link(new_map,
			    vm_map_last_entry(new_map),
			    new_entry,
			    VM_MAP_KERNEL_FLAGS_NONE);
			new_size += entry_size;
			break;

slow_vm_map_fork_copy:
			vm_map_copyin_flags = 0;
			if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
				vm_map_copyin_flags |=
				    VM_MAP_COPYIN_PRESERVE_PURGEABLE;
			}
			if (vm_map_fork_copy(old_map,
			    &old_entry,
			    new_map,
			    vm_map_copyin_flags)) {
				new_size += entry_size;
			}
			/* vm_map_fork_copy() already advanced old_entry */
			continue;
		}
		old_entry = old_entry->vme_next;
	}

#if defined(__arm64__)
	pmap_insert_sharedpage(new_map->pmap);
#endif /* __arm64__ */

	new_map->size = new_size;

	if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
		vm_map_corpse_footprint_collect_done(new_map);
	}

	/* Propagate JIT entitlement for the pmap layer. */
	if (pmap_get_jit_entitled(old_map->pmap)) {
		/* Tell the pmap that it supports JIT. */
		pmap_set_jit_entitled(new_map->pmap);
	}

	vm_map_unlock(new_map);
	vm_map_unlock(old_map);
	/* drop the reference taken at the top */
	vm_map_deallocate(old_map);

	return new_map;
}
12836
12837 /*
12838 * vm_map_exec:
12839 *
12840 * Setup the "new_map" with the proper execution environment according
12841 * to the type of executable (platform, 64bit, chroot environment).
12842 * Map the comm page and shared region, etc...
12843 */
12844 kern_return_t
vm_map_exec(vm_map_t new_map,task_t task,boolean_t is64bit,void * fsroot,cpu_type_t cpu,cpu_subtype_t cpu_subtype,boolean_t reslide,boolean_t is_driverkit)12845 vm_map_exec(
12846 vm_map_t new_map,
12847 task_t task,
12848 boolean_t is64bit,
12849 void *fsroot,
12850 cpu_type_t cpu,
12851 cpu_subtype_t cpu_subtype,
12852 boolean_t reslide,
12853 boolean_t is_driverkit)
12854 {
12855 SHARED_REGION_TRACE_DEBUG(
12856 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
12857 (void *)VM_KERNEL_ADDRPERM(current_task()),
12858 (void *)VM_KERNEL_ADDRPERM(new_map),
12859 (void *)VM_KERNEL_ADDRPERM(task),
12860 (void *)VM_KERNEL_ADDRPERM(fsroot),
12861 cpu,
12862 cpu_subtype));
12863 (void) vm_commpage_enter(new_map, task, is64bit);
12864
12865 (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit);
12866
12867 SHARED_REGION_TRACE_DEBUG(
12868 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
12869 (void *)VM_KERNEL_ADDRPERM(current_task()),
12870 (void *)VM_KERNEL_ADDRPERM(new_map),
12871 (void *)VM_KERNEL_ADDRPERM(task),
12872 (void *)VM_KERNEL_ADDRPERM(fsroot),
12873 cpu,
12874 cpu_subtype));
12875
12876 /*
12877 * Some devices have region(s) of memory that shouldn't get allocated by
12878 * user processes. The following code creates dummy vm_map_entry_t's for each
12879 * of the regions that needs to be reserved to prevent any allocations in
12880 * those regions.
12881 */
12882 kern_return_t kr = KERN_FAILURE;
12883 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
12884 vmk_flags.vmkf_permanent = TRUE;
12885 vmk_flags.vmkf_beyond_max = TRUE;
12886
12887 struct vm_reserved_region *regions = NULL;
12888 size_t num_regions = ml_get_vm_reserved_regions(is64bit, ®ions);
12889 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
12890
12891 for (size_t i = 0; i < num_regions; ++i) {
12892 kr = vm_map_enter(
12893 new_map,
12894 ®ions[i].vmrr_addr,
12895 regions[i].vmrr_size,
12896 (vm_map_offset_t)0,
12897 VM_FLAGS_FIXED,
12898 vmk_flags,
12899 VM_KERN_MEMORY_NONE,
12900 VM_OBJECT_NULL,
12901 (vm_object_offset_t)0,
12902 FALSE,
12903 VM_PROT_NONE,
12904 VM_PROT_NONE,
12905 VM_INHERIT_COPY);
12906
12907 if (kr != KERN_SUCCESS) {
12908 panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
12909 }
12910 }
12911
12912 new_map->reserved_regions = (num_regions ? TRUE : FALSE);
12913
12914 return KERN_SUCCESS;
12915 }
12916
/*
 * Telemetry counters for the three copy-on-write resolution paths taken
 * by vm_map_lookup_locked() when a write fault hits a COW submap entry:
 * "copy_slowly" (wired sub-object copied via vm_object_copy_slowly),
 * "copy_strategically" (non-symmetric copy strategy, via
 * vm_object_copy_strategically), and "shadow" (symmetric strategy,
 * sub-object shadowed and marked needs_copy).
 */
uint64_t vm_map_lookup_locked_copy_slowly_count = 0;            /* # of slow copies performed */
uint64_t vm_map_lookup_locked_copy_slowly_size = 0;             /* total bytes copied slowly */
uint64_t vm_map_lookup_locked_copy_slowly_max = 0;              /* largest single slow copy */
uint64_t vm_map_lookup_locked_copy_slowly_restart = 0;          /* map changed during copy; lookup restarted */
uint64_t vm_map_lookup_locked_copy_slowly_error = 0;            /* slow copies that failed */
uint64_t vm_map_lookup_locked_copy_strategically_count = 0;     /* # of strategic copies performed */
uint64_t vm_map_lookup_locked_copy_strategically_size = 0;      /* total bytes copied strategically */
uint64_t vm_map_lookup_locked_copy_strategically_max = 0;       /* largest single strategic copy */
uint64_t vm_map_lookup_locked_copy_strategically_restart = 0;   /* KERN_MEMORY_RESTART_COPY; lookup restarted */
uint64_t vm_map_lookup_locked_copy_strategically_error = 0;     /* strategic copies that failed */
uint64_t vm_map_lookup_locked_copy_shadow_count = 0;            /* # of shadow setups */
uint64_t vm_map_lookup_locked_copy_shadow_size = 0;             /* total bytes shadowed */
uint64_t vm_map_lookup_locked_copy_shadow_max = 0;              /* largest single shadow setup */
12930 /*
12931 * vm_map_lookup_locked:
12932 *
12933 * Finds the VM object, offset, and
12934 * protection for a given virtual address in the
12935 * specified map, assuming a page fault of the
12936 * type specified.
12937 *
12938 * Returns the (object, offset, protection) for
12939 * this address, whether it is wired down, and whether
12940 * this map has the only reference to the data in question.
12941 * In order to later verify this lookup, a "version"
12942 * is returned.
12943 * If contended != NULL, *contended will be set to
12944 * true iff the thread had to spin or block to acquire
12945 * an exclusive lock.
12946 *
12947 * The map MUST be locked by the caller and WILL be
12948 * locked on exit. In order to guarantee the
12949 * existence of the returned object, it is returned
12950 * locked.
12951 *
12952 * If a lookup is requested with "write protection"
12953 * specified, the map may be changed to perform virtual
12954 * copying operations, although the data referenced will
12955 * remain the same.
12956 */
kern_return_t
vm_map_lookup_locked(
	vm_map_t                *var_map,       /* IN/OUT */
	vm_map_offset_t         vaddr,
	vm_prot_t               fault_type,
	int                     object_lock_type,
	vm_map_version_t        *out_version,   /* OUT */
	vm_object_t             *object,        /* OUT */
	vm_object_offset_t      *offset,        /* OUT */
	vm_prot_t               *out_prot,      /* OUT */
	boolean_t               *wired,         /* OUT */
	vm_object_fault_info_t  fault_info,     /* OUT */
	vm_map_t                *real_map,      /* OUT */
	bool                    *contended)     /* OUT */
{
	vm_map_entry_t                  entry;
	vm_map_t                        map = *var_map;
	vm_map_t                        old_map = *var_map;     /* top map we were entered with */
	vm_map_t                        cow_sub_map_parent = VM_MAP_NULL;
	vm_map_offset_t                 cow_parent_vaddr = 0;
	vm_map_offset_t                 old_start = 0;
	vm_map_offset_t                 old_end = 0;
	vm_prot_t                       prot;
	boolean_t                       mask_protections;
	boolean_t                       force_copy;
	boolean_t                       no_force_copy_if_executable;
	boolean_t                       submap_needed_copy;
	vm_prot_t                       original_fault_type;
	vm_map_size_t                   fault_page_mask;

	/*
	 * VM_PROT_MASK means that the caller wants us to use "fault_type"
	 * as a mask against the mapping's actual protections, not as an
	 * absolute value.
	 */
	mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
	force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
	no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
	fault_type &= VM_PROT_ALL;
	original_fault_type = fault_type;
	if (contended) {
		*contended = false;
	}

	*real_map = map;

	/* fault on the smaller of the map's page size and the kernel's */
	fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
	vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);

RetryLookup:
	fault_type = original_fault_type;

	/*
	 * If the map has an interesting hint, try it before calling
	 * full blown lookup routine.
	 */
	entry = map->hint;

	if ((entry == vm_map_to_entry(map)) ||
	    (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
		vm_map_entry_t tmp_entry;

		/*
		 * Entry was either not a valid hint, or the vaddr
		 * was not contained in the entry, so do a full lookup.
		 */
		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
			/* drop any extra locks we picked up on the way down */
			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
				vm_map_unlock(cow_sub_map_parent);
			}
			if ((*real_map != map)
			    && (*real_map != cow_sub_map_parent)) {
				vm_map_unlock(*real_map);
			}
			return KERN_INVALID_ADDRESS;
		}

		entry = tmp_entry;
	}
	if (map == old_map) {
		/* remember the top-level entry's range for COW clipping below */
		old_start = entry->vme_start;
		old_end = entry->vme_end;
	}

	/*
	 * Handle submaps.  Drop lock on upper map, submap is
	 * returned locked.
	 */

	submap_needed_copy = FALSE;
submap_recurse:
	if (entry->is_sub_map) {
		vm_map_offset_t         local_vaddr;
		vm_map_offset_t         end_delta;
		vm_map_offset_t         start_delta;
		vm_map_entry_t          submap_entry, saved_submap_entry;
		vm_object_offset_t      submap_entry_offset;
		vm_object_size_t        submap_entry_size;
		vm_prot_t               subentry_protection;
		vm_prot_t               subentry_max_protection;
		boolean_t               subentry_no_copy_on_read;
		boolean_t               mapped_needs_copy = FALSE;
		vm_map_version_t        version;

		/* submaps must not use a larger page size than their parent */
		assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
		    "map %p (%d) entry %p submap %p (%d)\n",
		    map, VM_MAP_PAGE_SHIFT(map), entry,
		    VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));

		local_vaddr = vaddr;

		if ((entry->use_pmap &&
		    !((fault_type & VM_PROT_WRITE) ||
		    force_copy))) {
			/* if real_map equals map we unlock below */
			if ((*real_map != map) &&
			    (*real_map != cow_sub_map_parent)) {
				vm_map_unlock(*real_map);
			}
			*real_map = VME_SUBMAP(entry);
		}

		if (entry->needs_copy &&
		    ((fault_type & VM_PROT_WRITE) ||
		    force_copy)) {
			/* write fault on a COW submap: keep the parent locked */
			if (!mapped_needs_copy) {
				if (vm_map_lock_read_to_write(map)) {
					/* lost the lock; start over */
					vm_map_lock_read(map);
					*real_map = map;
					goto RetryLookup;
				}
				vm_map_lock_read(VME_SUBMAP(entry));
				*var_map = VME_SUBMAP(entry);
				cow_sub_map_parent = map;
				/* reset base to map before cow object */
				/* this is the map which will accept */
				/* the new cow object */
				old_start = entry->vme_start;
				old_end = entry->vme_end;
				cow_parent_vaddr = vaddr;
				mapped_needs_copy = TRUE;
			} else {
				vm_map_lock_read(VME_SUBMAP(entry));
				*var_map = VME_SUBMAP(entry);
				if ((cow_sub_map_parent != map) &&
				    (*real_map != map)) {
					vm_map_unlock(map);
				}
			}
		} else {
			if (entry->needs_copy) {
				/* read through a COW submap: note it so we can
				 * strip write permission at the end */
				submap_needed_copy = TRUE;
			}
			vm_map_lock_read(VME_SUBMAP(entry));
			*var_map = VME_SUBMAP(entry);
			/* leave map locked if it is a target */
			/* cow sub_map above otherwise, just  */
			/* follow the maps down to the object */
			/* here we unlock knowing we are not  */
			/* revisiting the map.                */
			if ((*real_map != map) && (map != cow_sub_map_parent)) {
				vm_map_unlock_read(map);
			}
		}

		map = *var_map;

		/* calculate the offset in the submap for vaddr */
		local_vaddr = (local_vaddr - entry->vme_start) + VME_OFFSET(entry);
		assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
		    "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
		    (uint64_t)local_vaddr, (uint64_t)entry->vme_start, (uint64_t)fault_page_mask);

RetrySubMap:
	if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
		if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
			vm_map_unlock(cow_sub_map_parent);
		}
		if ((*real_map != map)
		    && (*real_map != cow_sub_map_parent)) {
			vm_map_unlock(*real_map);
		}
		*real_map = map;
		return KERN_INVALID_ADDRESS;
	}

	/* find the attenuated shadow of the underlying object */
	/* on our target map */

	/* in english the submap object may extend beyond the     */
	/* region mapped by the entry or, may only fill a portion */
	/* of it.  For our purposes, we only care if the object   */
	/* doesn't fill.  In this case the area which will        */
	/* ultimately be clipped in the top map will only need    */
	/* to be as big as the portion of the underlying entry    */
	/* which is mapped */
	start_delta = submap_entry->vme_start > VME_OFFSET(entry) ?
	    submap_entry->vme_start - VME_OFFSET(entry) : 0;

	end_delta =
	    (VME_OFFSET(entry) + start_delta + (old_end - old_start)) <=
	    submap_entry->vme_end ?
	    0 : (VME_OFFSET(entry) +
	    (old_end - old_start))
	    - submap_entry->vme_end;

	old_start += start_delta;
	old_end -= end_delta;

	if (submap_entry->is_sub_map) {
		/* nested submap: descend another level */
		entry = submap_entry;
		vaddr = local_vaddr;
		goto submap_recurse;
	}

	if (((fault_type & VM_PROT_WRITE) ||
	    force_copy)
	    && cow_sub_map_parent) {
		vm_object_t     sub_object, copy_object;
		vm_object_offset_t copy_offset;
		vm_map_offset_t local_start;
		vm_map_offset_t local_end;
		boolean_t       object_copied = FALSE;
		vm_object_offset_t object_copied_offset = 0;
		boolean_t       object_copied_needs_copy = FALSE;
		kern_return_t   kr = KERN_SUCCESS;

		if (vm_map_lock_read_to_write(map)) {
			/* lost the lock; undo the deltas and retry the submap lookup */
			vm_map_lock_read(map);
			old_start -= start_delta;
			old_end += end_delta;
			goto RetrySubMap;
		}


		sub_object = VME_OBJECT(submap_entry);
		if (sub_object == VM_OBJECT_NULL) {
			/* no backing object yet: create an empty one */
			sub_object =
			    vm_object_allocate(
				(vm_map_size_t)
				(submap_entry->vme_end -
				submap_entry->vme_start));
			VME_OBJECT_SET(submap_entry, sub_object);
			VME_OFFSET_SET(submap_entry, 0);
			assert(!submap_entry->is_sub_map);
			assert(submap_entry->use_pmap);
		}
		local_start = local_vaddr -
		    (cow_parent_vaddr - old_start);
		local_end = local_vaddr +
		    (old_end - cow_parent_vaddr);
		vm_map_clip_start(map, submap_entry, local_start);
		vm_map_clip_end(map, submap_entry, local_end);
		if (submap_entry->is_sub_map) {
			/* unnesting was done when clipping */
			assert(!submap_entry->use_pmap);
		}

		/* This is the COW case, lets connect */
		/* an entry in our space to the underlying */
		/* object in the submap, bypassing the  */
		/* submap. */
		submap_entry_offset = VME_OFFSET(submap_entry);
		submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;

		if ((submap_entry->wired_count != 0 ||
		    sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
		    (submap_entry->protection & VM_PROT_EXECUTE) &&
		    no_force_copy_if_executable) {
			/* caller forbids forced copies of executable mappings */
//			printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
				vm_map_unlock(cow_sub_map_parent);
			}
			if ((*real_map != map)
			    && (*real_map != cow_sub_map_parent)) {
				vm_map_unlock(*real_map);
			}
			*real_map = map;
			kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
			vm_map_lock_write_to_read(map);
			kr = KERN_PROTECTION_FAILURE;
			DTRACE_VM4(submap_no_copy_executable,
			    vm_map_t, map,
			    vm_object_offset_t, submap_entry_offset,
			    vm_object_size_t, submap_entry_size,
			    int, kr);
			return kr;
		}

		if (submap_entry->wired_count != 0) {
			/*
			 * Wired mapping: can't just shadow the object, so
			 * copy the wired range up front with
			 * vm_object_copy_slowly().
			 */
			vm_object_reference(sub_object);

			assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
			    "submap_entry %p offset 0x%llx\n",
			    submap_entry, VME_OFFSET(submap_entry));

			DTRACE_VM6(submap_copy_slowly,
			    vm_map_t, cow_sub_map_parent,
			    vm_map_offset_t, vaddr,
			    vm_map_t, map,
			    vm_object_size_t, submap_entry_size,
			    int, submap_entry->wired_count,
			    int, sub_object->copy_strategy);

			saved_submap_entry = submap_entry;
			version.main_timestamp = map->timestamp;
			vm_map_unlock(map); /* Increments timestamp by 1 */
			submap_entry = VM_MAP_ENTRY_NULL;

			vm_object_lock(sub_object);
			kr = vm_object_copy_slowly(sub_object,
			    submap_entry_offset,
			    submap_entry_size,
			    FALSE,
			    &copy_object);
			object_copied = TRUE;
			object_copied_offset = 0;
			/* 4k: account for extra offset in physical page */
			object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
			object_copied_needs_copy = FALSE;
			vm_object_deallocate(sub_object);

			vm_map_lock(map);

			if (kr != KERN_SUCCESS &&
			    kr != KERN_MEMORY_RESTART_COPY) {
				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
					vm_map_unlock(cow_sub_map_parent);
				}
				if ((*real_map != map)
				    && (*real_map != cow_sub_map_parent)) {
					vm_map_unlock(*real_map);
				}
				*real_map = map;
				vm_object_deallocate(copy_object);
				copy_object = VM_OBJECT_NULL;
				kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
				vm_map_lock_write_to_read(map);
				DTRACE_VM4(submap_copy_error_slowly,
				    vm_object_t, sub_object,
				    vm_object_offset_t, submap_entry_offset,
				    vm_object_size_t, submap_entry_size,
				    int, kr);
				vm_map_lookup_locked_copy_slowly_error++;
				return kr;
			}

			if ((kr == KERN_SUCCESS) &&
			    (version.main_timestamp + 1) == map->timestamp) {
				/* map didn't change while unlocked: entry still valid */
				submap_entry = saved_submap_entry;
			} else {
				/* map changed (or restart requested): retry */
				saved_submap_entry = NULL;
				old_start -= start_delta;
				old_end += end_delta;
				vm_object_deallocate(copy_object);
				copy_object = VM_OBJECT_NULL;
				vm_map_lock_write_to_read(map);
				vm_map_lookup_locked_copy_slowly_restart++;
				goto RetrySubMap;
			}
			vm_map_lookup_locked_copy_slowly_count++;
			vm_map_lookup_locked_copy_slowly_size += submap_entry_size;
			if (submap_entry_size > vm_map_lookup_locked_copy_slowly_max) {
				vm_map_lookup_locked_copy_slowly_max = submap_entry_size;
			}
		} else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
			/* non-symmetric strategy: let the object decide how to copy */
			submap_entry_offset = VME_OFFSET(submap_entry);
			copy_object = VM_OBJECT_NULL;
			object_copied_offset = submap_entry_offset;
			object_copied_needs_copy = FALSE;
			DTRACE_VM6(submap_copy_strategically,
			    vm_map_t, cow_sub_map_parent,
			    vm_map_offset_t, vaddr,
			    vm_map_t, map,
			    vm_object_size_t, submap_entry_size,
			    int, submap_entry->wired_count,
			    int, sub_object->copy_strategy);
			kr = vm_object_copy_strategically(
				sub_object,
				submap_entry_offset,
				submap_entry->vme_end - submap_entry->vme_start,
				&copy_object,
				&object_copied_offset,
				&object_copied_needs_copy);
			if (kr == KERN_MEMORY_RESTART_COPY) {
				old_start -= start_delta;
				old_end += end_delta;
				vm_object_deallocate(copy_object);
				copy_object = VM_OBJECT_NULL;
				vm_map_lock_write_to_read(map);
				vm_map_lookup_locked_copy_strategically_restart++;
				goto RetrySubMap;
			}
			if (kr != KERN_SUCCESS) {
				if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
					vm_map_unlock(cow_sub_map_parent);
				}
				if ((*real_map != map)
				    && (*real_map != cow_sub_map_parent)) {
					vm_map_unlock(*real_map);
				}
				*real_map = map;
				vm_object_deallocate(copy_object);
				copy_object = VM_OBJECT_NULL;
				kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
				vm_map_lock_write_to_read(map);
				DTRACE_VM4(submap_copy_error_strategically,
				    vm_object_t, sub_object,
				    vm_object_offset_t, submap_entry_offset,
				    vm_object_size_t, submap_entry_size,
				    int, kr);
				vm_map_lookup_locked_copy_strategically_error++;
				return kr;
			}
			assert(copy_object != VM_OBJECT_NULL);
			assert(copy_object != sub_object);
			object_copied = TRUE;
			vm_map_lookup_locked_copy_strategically_count++;
			vm_map_lookup_locked_copy_strategically_size += submap_entry_size;
			if (submap_entry_size > vm_map_lookup_locked_copy_strategically_max) {
				vm_map_lookup_locked_copy_strategically_max = submap_entry_size;
			}
		} else {
			/* set up shadow object */
			object_copied = FALSE;
			copy_object = sub_object;
			vm_object_lock(sub_object);
			vm_object_reference_locked(sub_object);
			sub_object->shadowed = TRUE;
			vm_object_unlock(sub_object);

			assert(submap_entry->wired_count == 0);
			submap_entry->needs_copy = TRUE;

			prot = submap_entry->protection;
			assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));
			prot = prot & ~VM_PROT_WRITE;
			assert(!pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot));

			if (override_nx(old_map,
			    VME_ALIAS(submap_entry))
			    && prot) {
				prot |= VM_PROT_EXECUTE;
			}

			/* write-protect the existing mappings so the first
			 * write after this point faults and copies */
			vm_object_pmap_protect(
				sub_object,
				VME_OFFSET(submap_entry),
				submap_entry->vme_end -
				submap_entry->vme_start,
				(submap_entry->is_shared
				|| map->mapped_in_other_pmaps) ?
				PMAP_NULL : map->pmap,
				VM_MAP_PAGE_SIZE(map),
				submap_entry->vme_start,
				prot);
			vm_map_lookup_locked_copy_shadow_count++;
			vm_map_lookup_locked_copy_shadow_size += submap_entry_size;
			if (submap_entry_size > vm_map_lookup_locked_copy_shadow_max) {
				vm_map_lookup_locked_copy_shadow_max = submap_entry_size;
			}
		}

		/*
		 * Adjust the fault offset to the submap entry.
		 */
		copy_offset = (local_vaddr -
		    submap_entry->vme_start +
		    VME_OFFSET(submap_entry));

		/* This works diffently than the   */
		/* normal submap case. We go back  */
		/* to the parent of the cow map and*/
		/* clip out the target portion of  */
		/* the sub_map, substituting the   */
		/* new copy object,                */

		subentry_protection = submap_entry->protection;
		subentry_max_protection = submap_entry->max_protection;
		subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
		vm_map_unlock(map);
		submap_entry = NULL; /* not valid after map unlock */

		local_start = old_start;
		local_end = old_end;
		map = cow_sub_map_parent;
		*var_map = cow_sub_map_parent;
		vaddr = cow_parent_vaddr;
		cow_sub_map_parent = NULL;

		if (!vm_map_lookup_entry(map,
		    vaddr, &entry)) {
			/* NOTE(review): "entry" here is the predecessor left by the
			 * failed lookup; the DTRACE probe below reads it — presumably
			 * intentional for diagnostics. */
			if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
				vm_map_unlock(cow_sub_map_parent);
			}
			if ((*real_map != map)
			    && (*real_map != cow_sub_map_parent)) {
				vm_map_unlock(*real_map);
			}
			*real_map = map;
			vm_object_deallocate(
				copy_object);
			copy_object = VM_OBJECT_NULL;
			vm_map_lock_write_to_read(map);
			DTRACE_VM4(submap_lookup_post_unlock,
			    uint64_t, (uint64_t)entry->vme_start,
			    uint64_t, (uint64_t)entry->vme_end,
			    vm_map_offset_t, vaddr,
			    int, object_copied);
			return KERN_INVALID_ADDRESS;
		}

		/* clip out the portion of space */
		/* mapped by the sub map which   */
		/* corresponds to the underlying */
		/* object */

		/*
		 * Clip (and unnest) the smallest nested chunk
		 * possible around the faulting address...
		 */
		local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
		local_end = local_start + pmap_shared_region_size_min(map->pmap);
		/*
		 * ... but don't go beyond the "old_start" to "old_end"
		 * range, to avoid spanning over another VM region
		 * with a possibly different VM object and/or offset.
		 */
		if (local_start < old_start) {
			local_start = old_start;
		}
		if (local_end > old_end) {
			local_end = old_end;
		}
		/*
		 * Adjust copy_offset to the start of the range.
		 */
		copy_offset -= (vaddr - local_start);

		vm_map_clip_start(map, entry, local_start);
		vm_map_clip_end(map, entry, local_end);
		if (entry->is_sub_map) {
			/* unnesting was done when clipping */
			assert(!entry->use_pmap);
		}

		/* substitute copy object for */
		/* shared map entry           */
		vm_map_deallocate(VME_SUBMAP(entry));
		assert(!entry->iokit_acct);
		entry->is_sub_map = FALSE;
		entry->use_pmap = TRUE;
		VME_OBJECT_SET(entry, copy_object);

		/* propagate the submap entry's protections */
		if (entry->protection != VM_PROT_READ) {
			/*
			 * Someone has already altered the top entry's
			 * protections via vm_protect(VM_PROT_COPY).
			 * Respect these new values and ignore the
			 * submap entry's protections.
			 */
		} else {
			/*
			 * Regular copy-on-write: propagate the submap
			 * entry's protections to the top map entry.
			 */
			entry->protection |= subentry_protection;
		}
		entry->max_protection |= subentry_max_protection;
		/* propagate no_copy_on_read */
		entry->vme_no_copy_on_read = subentry_no_copy_on_read;

		if ((entry->protection & VM_PROT_WRITE) &&
		    (entry->protection & VM_PROT_EXECUTE) &&
#if XNU_TARGET_OS_OSX
		    map->pmap != kernel_pmap &&
		    (vm_map_cs_enforcement(map)
#if __arm64__
		    || !VM_MAP_IS_EXOTIC(map)
#endif /* __arm64__ */
		    ) &&
#endif /* XNU_TARGET_OS_OSX */
		    !(entry->used_for_jit) &&
		    VM_MAP_POLICY_WX_STRIP_X(map)) {
			/* W^X policy: strip execute from a writable mapping */
			DTRACE_VM3(cs_wx,
			    uint64_t, (uint64_t)entry->vme_start,
			    uint64_t, (uint64_t)entry->vme_end,
			    vm_prot_t, entry->protection);
			printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
			    proc_selfpid(),
			    (current_task()->bsd_info
			    ? proc_name_address(current_task()->bsd_info)
			    : "?"),
			    __FUNCTION__);
			entry->protection &= ~VM_PROT_EXECUTE;
		}

		if (object_copied) {
			VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
			entry->needs_copy = object_copied_needs_copy;
			entry->is_shared = FALSE;
		} else {
			/* shadow case: defer the actual copy to the next write fault */
			assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
			assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
			assert(entry->wired_count == 0);
			VME_OFFSET_SET(entry, copy_offset);
			entry->needs_copy = TRUE;
			if (map != old_map) {
				entry->is_shared = TRUE;
			}
		}
		if (entry->inheritance == VM_INHERIT_SHARE) {
			entry->inheritance = VM_INHERIT_COPY;
		}

		vm_map_lock_write_to_read(map);
	} else {
		if ((cow_sub_map_parent)
		    && (cow_sub_map_parent != *real_map)
		    && (cow_sub_map_parent != map)) {
			vm_map_unlock(cow_sub_map_parent);
		}
		entry = submap_entry;
		vaddr = local_vaddr;
	}
	}

	/*
	 * Check whether this task is allowed to have
	 * this page.
	 */

	prot = entry->protection;

	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
		/*
		 * HACK -- if not a stack, then allow execution
		 */
		prot |= VM_PROT_EXECUTE;
	}

	if (mask_protections) {
		fault_type &= prot;
		if (fault_type == VM_PROT_NONE) {
			goto protection_failure;
		}
	}
	if (((fault_type & prot) != fault_type)
#if __arm64__
	    /* prefetch abort in execute-only page */
	    && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
#elif defined(__x86_64__)
	    /* Consider the UEXEC bit when handling an EXECUTE fault */
	    && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
#endif
	    ) {
protection_failure:
		if (*real_map != map) {
			vm_map_unlock(*real_map);
		}
		*real_map = map;

		if ((fault_type & VM_PROT_EXECUTE) && prot) {
			log_stack_execution_failure((addr64_t)vaddr, prot);
		}

		DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
		DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
		/*
		 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
		 *
		 * kernel_triage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
		 */
		return KERN_PROTECTION_FAILURE;
	}

	/*
	 * If this page is not pageable, we have to get
	 * it for all possible accesses.
	 */

	*wired = (entry->wired_count != 0);
	if (*wired) {
		fault_type = prot;
	}

	/*
	 * If the entry was copy-on-write, we either ...
	 */

	if (entry->needs_copy) {
		/*
		 * If we want to write the page, we may as well
		 * handle that now since we've got the map locked.
		 *
		 * If we don't need to write the page, we just
		 * demote the permissions allowed.
		 */

		if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
			/*
			 * Make a new object, and place it in the
			 * object chain.  Note that no new references
			 * have appeared -- one just moved from the
			 * map to the new object.
			 */

			if (vm_map_lock_read_to_write(map)) {
				vm_map_lock_read(map);
				goto RetryLookup;
			}

			if (VME_OBJECT(entry)->shadowed == FALSE) {
				vm_object_lock(VME_OBJECT(entry));
				VME_OBJECT(entry)->shadowed = TRUE;
				vm_object_unlock(VME_OBJECT(entry));
			}
			VME_OBJECT_SHADOW(entry,
			    (vm_map_size_t) (entry->vme_end -
			    entry->vme_start));
			entry->needs_copy = FALSE;

			vm_map_lock_write_to_read(map);
		}
		if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
			/*
			 * We're attempting to read a copy-on-write
			 * page -- don't allow writes.
			 */

			prot &= (~VM_PROT_WRITE);
		}
	}

	if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
		/*
		 * We went through a "needs_copy" submap without triggering
		 * a copy, so granting write access to the page would bypass
		 * that submap's "needs_copy".
		 */
		assert(!(fault_type & VM_PROT_WRITE));
		assert(!*wired);
		assert(!force_copy);
		// printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
		prot &= ~VM_PROT_WRITE;
	}

	/*
	 * Create an object if necessary.
	 */
	if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
		if (vm_map_lock_read_to_write(map)) {
			vm_map_lock_read(map);
			goto RetryLookup;
		}

		VME_OBJECT_SET(entry,
		    vm_object_allocate(
			    (vm_map_size_t)(entry->vme_end -
			    entry->vme_start)));
		VME_OFFSET_SET(entry, 0);
		assert(entry->use_pmap);
		vm_map_lock_write_to_read(map);
	}

	/*
	 * Return the object/offset from this entry.  If the entry
	 * was copy-on-write or empty, it has been fixed up.  Also
	 * return the protection.
	 */

	*offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
	*object = VME_OBJECT(entry);
	*out_prot = prot;
	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);

	if (fault_info) {
		fault_info->interruptible = THREAD_UNINT; /* for now... */
		/* ... the caller will change "interruptible" if needed */
		fault_info->cluster_size = 0;
		fault_info->user_tag = VME_ALIAS(entry);
		fault_info->pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}
		fault_info->behavior = entry->behavior;
		fault_info->lo_offset = VME_OFFSET(entry);
		fault_info->hi_offset =
		    (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
		fault_info->no_cache = entry->no_cache;
		fault_info->stealth = FALSE;
		fault_info->io_sync = FALSE;
		if (entry->used_for_jit ||
		    entry->vme_resilient_codesign) {
			/* JIT / resilient-codesign pages skip CS validation */
			fault_info->cs_bypass = TRUE;
		} else {
			fault_info->cs_bypass = FALSE;
		}
		fault_info->pmap_cs_associated = FALSE;
#if CONFIG_PMAP_CS
		if (entry->pmap_cs_associated) {
			/*
			 * The pmap layer will validate this page
			 * before allowing it to be executed from.
			 */
			fault_info->pmap_cs_associated = TRUE;
		}
#endif /* CONFIG_PMAP_CS */
		fault_info->mark_zf_absent = FALSE;
		fault_info->batch_pmap_op = FALSE;
		fault_info->resilient_media = entry->vme_resilient_media;
		fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
		if (entry->translated_allow_execute) {
			fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
		}
	}

	/*
	 * Lock the object to prevent it from disappearing
	 */
	if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
		if (contended == NULL) {
			vm_object_lock(*object);
		} else {
			/* report whether we spun/blocked for the lock */
			*contended = vm_object_lock_check_contended(*object);
		}
	} else {
		vm_object_lock_shared(*object);
	}

	/*
	 * Save the version number
	 */

	out_version->main_timestamp = map->timestamp;

	return KERN_SUCCESS;
}
13797
13798
13799 /*
13800 * vm_map_verify:
13801 *
13802 * Verifies that the map in question has not changed
13803 * since the given version. The map has to be locked
13804 * ("shared" mode is fine) before calling this function
13805 * and it will be returned locked too.
13806 */
13807 boolean_t
vm_map_verify(vm_map_t map,vm_map_version_t * version)13808 vm_map_verify(
13809 vm_map_t map,
13810 vm_map_version_t *version) /* REF */
13811 {
13812 boolean_t result;
13813
13814 vm_map_lock_assert_held(map);
13815 result = (map->timestamp == version->main_timestamp);
13816
13817 return result;
13818 }
13819
13820 /*
13821 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
13822 * Goes away after regular vm_region_recurse function migrates to
13823 * 64 bits
13824 * vm_region_recurse: A form of vm_region which follows the
13825 * submaps in a target map
13826 *
13827 */
13828
/*
 * Look up the VM region containing "*address" in "map", recursing into
 * submaps up to "*nesting_depth" levels deep, and fill in either a full
 * vm_region_submap_info_64 or the short variant (chosen by the incoming
 * "*count").  If the address itself is unmapped, information about the
 * next region after it is returned instead.
 *
 * In/Out:
 *	address       - in: address to look up; out: start of region found
 *	size          - out: size of the region found
 *	nesting_depth - in: max submap depth to descend; out: depth reached
 *	submap_info   - out: region info (reinterpreted as the short info
 *	                structure when "*count" is too small for the full one)
 *	count         - in: capacity of the info buffer; out: fields filled
 */
kern_return_t
vm_map_region_recurse_64(
	vm_map_t map,
	vm_map_offset_t *address, /* IN/OUT */
	vm_map_size_t *size, /* OUT */
	natural_t *nesting_depth, /* IN/OUT */
	vm_region_submap_info_64_t submap_info, /* IN/OUT */
	mach_msg_type_number_t *count) /* IN/OUT */
{
	mach_msg_type_number_t original_count;
	vm_region_extended_info_data_t extended;
	vm_map_entry_t tmp_entry;
	vm_map_offset_t user_address;
	unsigned int user_max_depth;

	/*
	 * "curr_entry" is the VM map entry preceding or including the
	 * address we're looking for.
	 * "curr_map" is the map or sub-map containing "curr_entry".
	 * "curr_address" is the equivalent of the top map's "user_address"
	 * in the current map.
	 * "curr_offset" is the cumulated offset of "curr_map" in the
	 * target task's address space.
	 * "curr_depth" is the depth of "curr_map" in the chain of
	 * sub-maps.
	 *
	 * "curr_max_below" and "curr_max_above" limit the range (around
	 * "curr_address") we should take into account in the current (sub)map.
	 * They limit the range to what's visible through the map entries
	 * we've traversed from the top map to the current map.
	 *
	 */
	vm_map_entry_t curr_entry;
	vm_map_address_t curr_address;
	vm_map_offset_t curr_offset;
	vm_map_t curr_map;
	unsigned int curr_depth;
	vm_map_offset_t curr_max_below, curr_max_above;
	vm_map_offset_t curr_skip;

	/*
	 * "next_" is the same as "curr_" but for the VM region immediately
	 * after the address we're looking for. We need to keep track of this
	 * too because we want to return info about that region if the
	 * address we're looking for is not mapped.
	 */
	vm_map_entry_t next_entry;
	vm_map_offset_t next_offset;
	vm_map_offset_t next_address;
	vm_map_t next_map;
	unsigned int next_depth;
	vm_map_offset_t next_max_below, next_max_above;
	vm_map_offset_t next_skip;

	boolean_t look_for_pages;
	vm_region_submap_short_info_64_t short_info;
	boolean_t do_region_footprint;
	int effective_page_size, effective_page_shift;
	boolean_t submap_needed_copy;

	if (map == VM_MAP_NULL) {
		/* no address space to work on */
		return KERN_INVALID_ARGUMENT;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);

	if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
		/*
		 * "info" structure is not big enough and
		 * would overflow
		 */
		return KERN_INVALID_ARGUMENT;
	}

	do_region_footprint = task_self_region_footprint();
	original_count = *count;

	/*
	 * Negotiate the info flavor: the caller's buffer size selects
	 * either the short structure or one of the V0/V1/V2 full layouts.
	 */
	if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
		*count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
		look_for_pages = FALSE;
		short_info = (vm_region_submap_short_info_64_t) submap_info;
		submap_info = NULL;
	} else {
		look_for_pages = TRUE;
		*count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
		short_info = NULL;

		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			*count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
		}
	}

	user_address = *address;
	user_max_depth = *nesting_depth;
	submap_needed_copy = FALSE;

	if (not_in_kdp) {
		vm_map_lock_read(map);
	}

	/*
	 * (Re)start the lookup from the top map.  We may come back here
	 * after surfacing from a submap that had nothing mapped at the
	 * address of interest.
	 */
recurse_again:
	curr_entry = NULL;
	curr_map = map;
	curr_address = user_address;
	curr_offset = 0;
	curr_skip = 0;
	curr_depth = 0;
	curr_max_above = ((vm_map_offset_t) -1) - curr_address;
	curr_max_below = curr_address;

	next_entry = NULL;
	next_map = NULL;
	next_address = 0;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_above = (vm_map_offset_t) -1;
	next_max_below = (vm_map_offset_t) -1;

	for (;;) {
		if (vm_map_lookup_entry(curr_map,
		    curr_address,
		    &tmp_entry)) {
			/* tmp_entry contains the address we're looking for */
			curr_entry = tmp_entry;
		} else {
			vm_map_offset_t skip;
			/*
			 * The address is not mapped.  "tmp_entry" is the
			 * map entry preceding the address.  We want the next
			 * one, if it exists.
			 */
			curr_entry = tmp_entry->vme_next;

			if (curr_entry == vm_map_to_entry(curr_map) ||
			    (curr_entry->vme_start >=
			    curr_address + curr_max_above)) {
				/* no next entry at this level: stop looking */
				if (not_in_kdp) {
					vm_map_unlock_read(curr_map);
				}
				curr_entry = NULL;
				curr_map = NULL;
				curr_skip = 0;
				curr_offset = 0;
				curr_depth = 0;
				curr_max_above = 0;
				curr_max_below = 0;
				break;
			}

			/* adjust current address and offset */
			skip = curr_entry->vme_start - curr_address;
			curr_address = curr_entry->vme_start;
			curr_skip += skip;
			curr_offset += skip;
			curr_max_above -= skip;
			curr_max_below = 0;
		}

		/*
		 * Is the next entry at this level closer to the address (or
		 * deeper in the submap chain) than the one we had
		 * so far ?
		 */
		tmp_entry = curr_entry->vme_next;
		if (tmp_entry == vm_map_to_entry(curr_map)) {
			/* no next entry at this level */
		} else if (tmp_entry->vme_start >=
		    curr_address + curr_max_above) {
			/*
			 * tmp_entry is beyond the scope of what we mapped of
			 * this submap in the upper level: ignore it.
			 */
		} else if ((next_entry == NULL) ||
		    (tmp_entry->vme_start + curr_offset <=
		    next_entry->vme_start + next_offset)) {
			/*
			 * We didn't have a "next_entry" or this one is
			 * closer to the address we're looking for:
			 * use this "tmp_entry" as the new "next_entry".
			 */
			if (next_entry != NULL) {
				/* unlock the last "next_map" */
				if (next_map != curr_map && not_in_kdp) {
					vm_map_unlock_read(next_map);
				}
			}
			next_entry = tmp_entry;
			next_map = curr_map;
			next_depth = curr_depth;
			next_address = next_entry->vme_start;
			next_skip = curr_skip;
			next_skip += (next_address - curr_address);
			next_offset = curr_offset;
			next_offset += (next_address - curr_address);
			next_max_above = MIN(next_max_above, curr_max_above);
			next_max_above = MIN(next_max_above,
			    next_entry->vme_end - next_address);
			next_max_below = MIN(next_max_below, curr_max_below);
			next_max_below = MIN(next_max_below,
			    next_address - next_entry->vme_start);
		}

		/*
		 * "curr_max_{above,below}" allow us to keep track of the
		 * portion of the submap that is actually mapped at this level:
		 * the rest of that submap is irrelevant to us, since it's not
		 * mapped here.
		 * The relevant portion of the map starts at
		 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
		 */
		curr_max_above = MIN(curr_max_above,
		    curr_entry->vme_end - curr_address);
		curr_max_below = MIN(curr_max_below,
		    curr_address - curr_entry->vme_start);

		if (!curr_entry->is_sub_map ||
		    curr_depth >= user_max_depth) {
			/*
			 * We hit a leaf map or we reached the maximum depth
			 * we could, so stop looking.  Keep the current map
			 * locked.
			 */
			break;
		}

		/*
		 * Get down to the next submap level.
		 */

		if (curr_entry->needs_copy) {
			/* everything below this is effectively copy-on-write */
			submap_needed_copy = TRUE;
		}

		/*
		 * Lock the next level and unlock the current level,
		 * unless we need to keep it locked to access the "next_entry"
		 * later.
		 */
		if (not_in_kdp) {
			vm_map_lock_read(VME_SUBMAP(curr_entry));
		}
		if (curr_map == next_map) {
			/* keep "next_map" locked in case we need it */
		} else {
			/* release this map */
			if (not_in_kdp) {
				vm_map_unlock_read(curr_map);
			}
		}

		/*
		 * Adjust the offset.  "curr_entry" maps the submap
		 * at relative address "curr_entry->vme_start" in the
		 * curr_map but skips the first "VME_OFFSET(curr_entry)"
		 * bytes of the submap.
		 * "curr_offset" always represents the offset of a virtual
		 * address in the curr_map relative to the absolute address
		 * space (i.e. the top-level VM map).
		 */
		curr_offset +=
		    (VME_OFFSET(curr_entry) - curr_entry->vme_start);
		curr_address = user_address + curr_offset;
		/* switch to the submap */
		curr_map = VME_SUBMAP(curr_entry);
		curr_depth++;
		curr_entry = NULL;
	}

	// LP64todo: all the current tools are 32bit, obviously never worked for 64b
	// so probably should be a real 32b ID vs. ptr.
	// Current users just check for equality

	if (curr_entry == NULL) {
		/* no VM region contains the address... */

		if (do_region_footprint && /* we want footprint numbers */
		    next_entry == NULL && /* & there are no more regions */
		    /* & we haven't already provided our fake region: */
		    user_address <= vm_map_last_entry(map)->vme_end) {
			ledger_amount_t ledger_resident, ledger_compressed;

			/*
			 * Add a fake memory region to account for
			 * purgeable and/or ledger-tagged memory that
			 * counts towards this task's memory footprint,
			 * i.e. the resident/compressed pages of non-volatile
			 * objects owned by that task.
			 */
			task_ledgers_footprint(map->pmap->ledger,
			    &ledger_resident,
			    &ledger_compressed);
			if (ledger_resident + ledger_compressed == 0) {
				/* no purgeable memory usage to report */
				return KERN_INVALID_ADDRESS;
			}
			/* fake region to show nonvolatile footprint */
			if (look_for_pages) {
				submap_info->protection = VM_PROT_DEFAULT;
				submap_info->max_protection = VM_PROT_DEFAULT;
				submap_info->inheritance = VM_INHERIT_DEFAULT;
				submap_info->offset = 0;
				submap_info->user_tag = -1;
				submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
				submap_info->pages_shared_now_private = 0;
				submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
				submap_info->pages_dirtied = submap_info->pages_resident;
				submap_info->ref_count = 1;
				submap_info->shadow_depth = 0;
				submap_info->external_pager = 0;
				submap_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					submap_info->share_mode = SM_COW;
				}
				submap_info->is_submap = 0;
				submap_info->behavior = VM_BEHAVIOR_DEFAULT;
				submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				submap_info->user_wired_count = 0;
				submap_info->pages_reusable = 0;
			} else {
				short_info->user_tag = -1;
				short_info->offset = 0;
				short_info->protection = VM_PROT_DEFAULT;
				short_info->inheritance = VM_INHERIT_DEFAULT;
				short_info->max_protection = VM_PROT_DEFAULT;
				short_info->behavior = VM_BEHAVIOR_DEFAULT;
				short_info->user_wired_count = 0;
				short_info->is_submap = 0;
				short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				short_info->external_pager = 0;
				short_info->shadow_depth = 0;
				short_info->share_mode = SM_PRIVATE;
				if (submap_needed_copy) {
					short_info->share_mode = SM_COW;
				}
				short_info->ref_count = 1;
			}
			*nesting_depth = 0;
			*size = (vm_map_size_t) (ledger_resident + ledger_compressed);
			// *address = user_address;
			*address = vm_map_last_entry(map)->vme_end;
			return KERN_SUCCESS;
		}

		if (next_entry == NULL) {
			/* ... and no VM region follows it either */
			return KERN_INVALID_ADDRESS;
		}
		/* ... gather info about the next VM region */
		curr_entry = next_entry;
		curr_map = next_map; /* still locked ... */
		curr_address = next_address;
		curr_skip = next_skip;
		curr_offset = next_offset;
		curr_depth = next_depth;
		curr_max_above = next_max_above;
		curr_max_below = next_max_below;
	} else {
		/* we won't need "next_entry" after all */
		if (next_entry != NULL) {
			/* release "next_map" */
			if (next_map != curr_map && not_in_kdp) {
				vm_map_unlock_read(next_map);
			}
		}
	}
	next_entry = NULL;
	next_map = NULL;
	next_offset = 0;
	next_skip = 0;
	next_depth = 0;
	next_max_below = -1;
	next_max_above = -1;

	if (curr_entry->is_sub_map &&
	    curr_depth < user_max_depth) {
		/*
		 * We're not as deep as we could be: we must have
		 * gone back up after not finding anything mapped
		 * below the original top-level map entry's.
		 * Let's move "curr_address" forward and recurse again.
		 */
		user_address = curr_address;
		goto recurse_again;
	}

	*nesting_depth = curr_depth;
	*size = curr_max_above + curr_max_below;
	*address = user_address + curr_skip - curr_max_below;

	/*
	 * Fill in the entry-level fields; "curr_map" is still read-locked
	 * here, protecting "curr_entry".
	 */
	if (look_for_pages) {
		submap_info->user_tag = VME_ALIAS(curr_entry);
		submap_info->offset = VME_OFFSET(curr_entry);
		submap_info->protection = curr_entry->protection;
		submap_info->inheritance = curr_entry->inheritance;
		submap_info->max_protection = curr_entry->max_protection;
		submap_info->behavior = curr_entry->behavior;
		submap_info->user_wired_count = curr_entry->user_wired_count;
		submap_info->is_submap = curr_entry->is_sub_map;
		submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
	} else {
		short_info->user_tag = VME_ALIAS(curr_entry);
		short_info->offset = VME_OFFSET(curr_entry);
		short_info->protection = curr_entry->protection;
		short_info->inheritance = curr_entry->inheritance;
		short_info->max_protection = curr_entry->max_protection;
		short_info->behavior = curr_entry->behavior;
		short_info->user_wired_count = curr_entry->user_wired_count;
		short_info->is_submap = curr_entry->is_sub_map;
		short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
	}

	extended.pages_resident = 0;
	extended.pages_swapped_out = 0;
	extended.pages_shared_now_private = 0;
	extended.pages_dirtied = 0;
	extended.pages_reusable = 0;
	extended.external_pager = 0;
	extended.shadow_depth = 0;
	extended.share_mode = SM_EMPTY;
	extended.ref_count = 0;

	if (not_in_kdp) {
		if (!curr_entry->is_sub_map) {
			vm_map_offset_t range_start, range_end;
			/* clip the walk to the part of the entry we can see */
			range_start = MAX((curr_address - curr_max_below),
			    curr_entry->vme_start);
			range_end = MIN((curr_address + curr_max_above),
			    curr_entry->vme_end);
			vm_map_region_walk(curr_map,
			    range_start,
			    curr_entry,
			    (VME_OFFSET(curr_entry) +
			    (range_start -
			    curr_entry->vme_start)),
			    range_end - range_start,
			    &extended,
			    look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
			if (extended.external_pager &&
			    extended.ref_count == 2 &&
			    extended.share_mode == SM_SHARED) {
				extended.share_mode = SM_PRIVATE;
			}
			if (submap_needed_copy) {
				extended.share_mode = SM_COW;
			}
		} else {
			if (curr_entry->use_pmap) {
				extended.share_mode = SM_TRUESHARED;
			} else {
				extended.share_mode = SM_PRIVATE;
			}
			extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
		}
	}

	if (look_for_pages) {
		submap_info->pages_resident = extended.pages_resident;
		submap_info->pages_swapped_out = extended.pages_swapped_out;
		submap_info->pages_shared_now_private =
		    extended.pages_shared_now_private;
		submap_info->pages_dirtied = extended.pages_dirtied;
		submap_info->external_pager = extended.external_pager;
		submap_info->shadow_depth = extended.shadow_depth;
		submap_info->share_mode = extended.share_mode;
		submap_info->ref_count = extended.ref_count;

		if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
			submap_info->pages_reusable = extended.pages_reusable;
		}
		if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
			/*
			 * NOTE(review): the cast binds to the "!= NULL"
			 * comparison, not to the whole ternary; the result is
			 * still correct because the condition is only tested
			 * for truth, but the cast placement is misleading.
			 */
			submap_info->object_id_full = (vm_object_id_t) (VME_OBJECT(curr_entry) != NULL) ? VM_KERNEL_ADDRPERM(VME_OBJECT(curr_entry)) : 0ULL;
		}
	} else {
		short_info->external_pager = extended.external_pager;
		short_info->shadow_depth = extended.shadow_depth;
		short_info->share_mode = extended.share_mode;
		short_info->ref_count = extended.ref_count;
	}

	if (not_in_kdp) {
		vm_map_unlock_read(curr_map);
	}

	return KERN_SUCCESS;
}
14322
14323 /*
14324 * vm_region:
14325 *
14326 * User call to obtain information about a region in
14327 * a task's address map. Currently, only one flavor is
14328 * supported.
14329 *
14330 * XXX The reserved and behavior fields cannot be filled
14331 * in until the vm merge from the IK is completed, and
14332 * vm_reserve is implemented.
14333 */
14334
/*
 * Return information about the region of "map" containing (or, if the
 * address is unmapped, following) "*address".  The flavor selects which
 * info structure is filled in; "*count" is validated against the
 * flavor's required size and updated to the number of fields returned.
 * "*address"/"*size" are set to the bounds of the region found.
 */
kern_return_t
vm_map_region(
	vm_map_t map,
	vm_map_offset_t *address, /* IN/OUT */
	vm_map_size_t *size, /* OUT */
	vm_region_flavor_t flavor, /* IN */
	vm_region_info_t info, /* OUT */
	mach_msg_type_number_t *count, /* IN/OUT */
	mach_port_t *object_name) /* OUT */
{
	vm_map_entry_t tmp_entry;
	vm_map_entry_t entry;
	vm_map_offset_t start;

	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	switch (flavor) {
	case VM_REGION_BASIC_INFO:
		/* legacy for old 32-bit objects info */
	{
		vm_region_basic_info_t basic;

		if (*count < VM_REGION_BASIC_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		basic = (vm_region_basic_info_t) info;
		*count = VM_REGION_BASIC_INFO_COUNT;

		vm_map_lock_read(map);

		/* find the entry containing "start", or the next one */
		start = *address;
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}

		start = entry->vme_start;

		/* legacy structure: offset is truncated to 32 bits */
		basic->offset = (uint32_t)VME_OFFSET(entry);
		basic->protection = entry->protection;
		basic->inheritance = entry->inheritance;
		basic->max_protection = entry->max_protection;
		basic->behavior = entry->behavior;
		basic->user_wired_count = entry->user_wired_count;
		basic->reserved = entry->is_sub_map;
		*address = start;
		*size = (entry->vme_end - start);

		if (object_name) {
			*object_name = IP_NULL;
		}
		if (entry->is_sub_map) {
			basic->shared = FALSE;
		} else {
			basic->shared = entry->is_shared;
		}

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}

	case VM_REGION_BASIC_INFO_64:
	{
		vm_region_basic_info_64_t basic;

		if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
			return KERN_INVALID_ARGUMENT;
		}

		basic = (vm_region_basic_info_64_t) info;
		*count = VM_REGION_BASIC_INFO_COUNT_64;

		vm_map_lock_read(map);

		/* find the entry containing "start", or the next one */
		start = *address;
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}

		start = entry->vme_start;

		/* 64-bit variant: offset is returned in full */
		basic->offset = VME_OFFSET(entry);
		basic->protection = entry->protection;
		basic->inheritance = entry->inheritance;
		basic->max_protection = entry->max_protection;
		basic->behavior = entry->behavior;
		basic->user_wired_count = entry->user_wired_count;
		basic->reserved = entry->is_sub_map;
		*address = start;
		*size = (entry->vme_end - start);

		if (object_name) {
			*object_name = IP_NULL;
		}
		if (entry->is_sub_map) {
			basic->shared = FALSE;
		} else {
			basic->shared = entry->is_shared;
		}

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}
	case VM_REGION_EXTENDED_INFO:
		if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}
		/* same handling as the legacy flavor, with a smaller count */
		OS_FALLTHROUGH;
	case VM_REGION_EXTENDED_INFO__legacy:
		if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
			return KERN_INVALID_ARGUMENT;
		}

		{
			vm_region_extended_info_t extended;
			mach_msg_type_number_t original_count;
			int effective_page_size, effective_page_shift;

			extended = (vm_region_extended_info_t) info;

			effective_page_shift = vm_self_region_page_shift(map);
			effective_page_size = (1 << effective_page_shift);

			vm_map_lock_read(map);

			/* find the entry containing "start", or the next one */
			start = *address;
			if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
				if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
					vm_map_unlock_read(map);
					return KERN_INVALID_ADDRESS;
				}
			} else {
				entry = tmp_entry;
			}
			start = entry->vme_start;

			extended->protection = entry->protection;
			extended->user_tag = VME_ALIAS(entry);
			extended->pages_resident = 0;
			extended->pages_swapped_out = 0;
			extended->pages_shared_now_private = 0;
			extended->pages_dirtied = 0;
			extended->external_pager = 0;
			extended->shadow_depth = 0;

			original_count = *count;
			if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
				*count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
			} else {
				/* pages_reusable only exists in the non-legacy layout */
				extended->pages_reusable = 0;
				*count = VM_REGION_EXTENDED_INFO_COUNT;
			}

			vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);

			if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
				extended->share_mode = SM_PRIVATE;
			}

			if (object_name) {
				*object_name = IP_NULL;
			}
			*address = start;
			*size = (entry->vme_end - start);

			vm_map_unlock_read(map);
			return KERN_SUCCESS;
		}
	case VM_REGION_TOP_INFO:
	{
		vm_region_top_info_t top;

		if (*count < VM_REGION_TOP_INFO_COUNT) {
			return KERN_INVALID_ARGUMENT;
		}

		top = (vm_region_top_info_t) info;
		*count = VM_REGION_TOP_INFO_COUNT;

		vm_map_lock_read(map);

		/* find the entry containing "start", or the next one */
		start = *address;
		if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
			if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}
		} else {
			entry = tmp_entry;
		}
		start = entry->vme_start;

		top->private_pages_resident = 0;
		top->shared_pages_resident = 0;

		vm_map_region_top_walk(entry, top);

		if (object_name) {
			*object_name = IP_NULL;
		}
		*address = start;
		*size = (entry->vme_end - start);

		vm_map_unlock_read(map);
		return KERN_SUCCESS;
	}
	default:
		return KERN_INVALID_ARGUMENT;
	}
}
14557
/*
 * OBJ_RESIDENT_COUNT(obj, entry_size):
 * Number of resident pages of "obj" attributable to a mapping of
 * "entry_size" pages, capped at the mapping's size.  If the object is
 * entirely reusable, only its wired pages count; otherwise reusable
 * pages are subtracted from the resident count.
 */
#define OBJ_RESIDENT_COUNT(obj, entry_size)                     \
	MIN((entry_size),                                       \
	((obj)->all_reusable ?                                  \
	(obj)->wired_page_count :                               \
	(obj)->resident_page_count - (obj)->reusable_page_count))
14563
/*
 * Fill in "top" (VM_REGION_TOP_INFO) for the given map entry:
 * private/shared resident page counts, share mode, reference count
 * and an (obfuscated, truncated) object id, derived by walking the
 * entry's object and its shadow chain.
 */
void
vm_map_region_top_walk(
	vm_map_entry_t entry,
	vm_region_top_info_t top)
{
	/* submaps and entries with no object have nothing to report */
	if (VME_OBJECT(entry) == 0 || entry->is_sub_map) {
		top->share_mode = SM_EMPTY;
		top->ref_count = 0;
		top->obj_id = 0;
		return;
	}

	{
		struct vm_object *obj, *tmp_obj;
		int ref_count;
		uint32_t entry_size;

		/* size of the mapping, in pages */
		entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);

		obj = VME_OBJECT(entry);

		vm_object_lock(obj);

		/* don't count the reference held by an in-progress pageout */
		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}

		assert(obj->reusable_page_count <= obj->resident_page_count);
		if (obj->shadow) {
			/* shadowed object: this mapping is copy-on-write */
			if (ref_count == 1) {
				top->private_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			}
			top->ref_count = ref_count;
			top->share_mode = SM_COW;

			/*
			 * Walk down the shadow chain, hand-over-hand:
			 * lock the shadow before unlocking the current object.
			 */
			while ((tmp_obj = obj->shadow)) {
				vm_object_lock(tmp_obj);
				vm_object_unlock(obj);
				obj = tmp_obj;

				if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
					ref_count--;
				}

				assert(obj->reusable_page_count <= obj->resident_page_count);
				top->shared_pages_resident +=
				    OBJ_RESIDENT_COUNT(obj, entry_size);
				/* each shadow holds one of the refs; don't double-count it */
				top->ref_count += ref_count - 1;
			}
		} else {
			if (entry->superpage_size) {
				top->share_mode = SM_LARGE_PAGE;
				top->shared_pages_resident = 0;
				top->private_pages_resident = entry_size;
			} else if (entry->needs_copy) {
				top->share_mode = SM_COW;
				top->shared_pages_resident =
				    OBJ_RESIDENT_COUNT(obj, entry_size);
			} else {
				if (ref_count == 1 ||
				    (ref_count == 2 && obj->named)) {
					top->share_mode = SM_PRIVATE;
					top->private_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				} else {
					top->share_mode = SM_SHARED;
					top->shared_pages_resident =
					    OBJ_RESIDENT_COUNT(obj,
					    entry_size);
				}
			}
			top->ref_count = ref_count;
		}
		/* XXX K64: obj_id will be truncated */
		top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRPERM(obj);

		vm_object_unlock(obj);
	}
}
14648
/*
 * Gather extended region info for the range ["offset", "offset"+"range")
 * of "entry"'s object: page counts (when "look_for_pages"), shadow
 * depth, external pager presence, reference count, and share mode.
 *
 *	map           - map containing "entry" ("va" is its address there)
 *	va            - starting virtual address of the range in "map"
 *	entry         - map entry being examined (not a submap on the
 *	                page-walking paths; see the early-out below)
 *	offset        - starting offset within the entry's VM object
 *	range         - size of the range to examine, in bytes
 *	extended      - out: accumulated statistics
 *	look_for_pages - whether to count individual pages
 *	count         - caller's info size, forwarded to the per-page walk
 */
void
vm_map_region_walk(
	vm_map_t map,
	vm_map_offset_t va,
	vm_map_entry_t entry,
	vm_object_offset_t offset,
	vm_object_size_t range,
	vm_region_extended_info_t extended,
	boolean_t look_for_pages,
	mach_msg_type_number_t count)
{
	struct vm_object *obj, *tmp_obj;
	vm_map_offset_t last_offset;
	int i;
	int ref_count;
	struct vm_object *shadow_object;
	unsigned short shadow_depth;
	boolean_t do_region_footprint;
	int effective_page_size, effective_page_shift;
	vm_map_offset_t effective_page_mask;

	do_region_footprint = task_self_region_footprint();

	/* nothing to report for submaps, empty entries, or (non-superpage)
	 * physically-contiguous objects */
	if ((VME_OBJECT(entry) == 0) ||
	    (entry->is_sub_map) ||
	    (VME_OBJECT(entry)->phys_contiguous &&
	    !entry->superpage_size)) {
		extended->share_mode = SM_EMPTY;
		extended->ref_count = 0;
		return;
	}

	if (entry->superpage_size) {
		extended->shadow_depth = 0;
		extended->share_mode = SM_LARGE_PAGE;
		extended->ref_count = 1;
		extended->external_pager = 0;

		/* TODO4K: Superpage in 4k mode? */
		extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
		extended->shadow_depth = 0;
		return;
	}

	effective_page_shift = vm_self_region_page_shift(map);
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	offset = vm_map_trunc_page(offset, effective_page_mask);

	obj = VME_OBJECT(entry);

	vm_object_lock(obj);

	/* don't count the reference held by an in-progress pageout */
	if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
		ref_count--;
	}

	if (look_for_pages) {
		/* examine each page of the range individually */
		for (last_offset = offset + range;
		    offset < last_offset;
		    offset += effective_page_size, va += effective_page_size) {
			if (do_region_footprint) {
				int disp;

				disp = 0;
				if (map->has_corpse_footprint) {
					/*
					 * Query the page info data we saved
					 * while forking the corpse.
					 */
					vm_map_corpse_footprint_query_page_info(
						map,
						va,
						&disp);
				} else {
					/*
					 * Query the pmap.
					 */
					vm_map_footprint_query_page_info(
						map,
						entry,
						va,
						&disp);
				}
				if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
					extended->pages_resident++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
					extended->pages_reusable++;
				}
				if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
					extended->pages_dirtied++;
				}
				if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
					extended->pages_swapped_out++;
				}
				continue;
			}

			vm_map_region_look_for_page(map, va, obj,
			    vm_object_trunc_page(offset), ref_count,
			    0, extended, count);
		}

		if (do_region_footprint) {
			/* footprint mode still needs the object-level info below */
			goto collect_object_info;
		}
	} else {
collect_object_info:
		/*
		 * Measure the shadow-chain depth and detect an external
		 * pager anywhere along the chain, locking hand-over-hand.
		 */
		shadow_object = obj->shadow;
		shadow_depth = 0;

		if (!(obj->internal)) {
			extended->external_pager = 1;
		}

		if (shadow_object != VM_OBJECT_NULL) {
			vm_object_lock(shadow_object);
			for (;
			    shadow_object != VM_OBJECT_NULL;
			    shadow_depth++) {
				vm_object_t next_shadow;

				if (!(shadow_object->internal)) {
					extended->external_pager = 1;
				}

				next_shadow = shadow_object->shadow;
				if (next_shadow) {
					vm_object_lock(next_shadow);
				}
				vm_object_unlock(shadow_object);
				shadow_object = next_shadow;
			}
		}
		extended->shadow_depth = shadow_depth;
	}

	if (extended->shadow_depth || entry->needs_copy) {
		extended->share_mode = SM_COW;
	} else {
		if (ref_count == 1) {
			extended->share_mode = SM_PRIVATE;
		} else {
			if (obj->true_share) {
				extended->share_mode = SM_TRUESHARED;
			} else {
				extended->share_mode = SM_SHARED;
			}
		}
	}
	extended->ref_count = ref_count - extended->shadow_depth;

	/* accumulate the reference counts down the shadow chain */
	for (i = 0; i < extended->shadow_depth; i++) {
		if ((tmp_obj = obj->shadow) == 0) {
			break;
		}
		vm_object_lock(tmp_obj);
		vm_object_unlock(obj);

		if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
			ref_count--;
		}

		extended->ref_count += ref_count;
		obj = tmp_obj;
	}
	vm_object_unlock(obj);

	if (extended->share_mode == SM_SHARED) {
		vm_map_entry_t cur;
		vm_map_entry_t last;
		int my_refs;

		/*
		 * Refine SM_SHARED: if all (or more than one) of the
		 * object's references come from this very map, report the
		 * region as aliased instead.
		 */
		obj = VME_OBJECT(entry);
		last = vm_map_to_entry(map);
		my_refs = 0;

		if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
			ref_count--;
		}
		for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
			my_refs += vm_map_region_count_obj_refs(cur, obj);
		}

		if (my_refs == ref_count) {
			extended->share_mode = SM_PRIVATE_ALIASED;
		} else if (my_refs > 1) {
			extended->share_mode = SM_SHARED_ALIASED;
		}
	}
}
14842
14843
14844 /* object is locked on entry and locked on return */
14845
14846
/*
 * vm_map_region_look_for_page:
 *
 * Look for the page at "offset" in "object" and, failing that, down
 * its shadow chain, updating the region statistics in "extended"
 * (resident / dirtied / swapped-out / reusable page counts, the
 * "shared now private" count and the maximum shadow depth seen).
 *
 * "object" (== "caller_object") is locked on entry and still locked
 * on return; objects deeper in the shadow chain are locked
 * hand-over-hand and unlocked before this routine returns.
 */
static void
vm_map_region_look_for_page(
	__unused vm_map_t               map,
	__unused vm_map_offset_t        va,
	vm_object_t                     object,
	vm_object_offset_t              offset,
	int                             max_refcnt,
	unsigned short                  depth,
	vm_region_extended_info_t       extended,
	mach_msg_type_number_t          count)
{
	vm_page_t       p;
	vm_object_t     shadow;
	int             ref_count;
	vm_object_t     caller_object;

	shadow = object->shadow;
	caller_object = object;


	while (TRUE) {
		if (!(object->internal)) {
			/* backed by an external (e.g. file) pager */
			extended->external_pager = 1;
		}

		if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
			if (shadow && (max_refcnt == 1)) {
				/*
				 * A shadowed page with only one reference
				 * to the chain: a copy-on-write fault would
				 * make this page private to the map.
				 */
				extended->pages_shared_now_private++;
			}

			if (!p->vmp_fictitious &&
			    (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
				extended->pages_dirtied++;
			} else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
				/*
				 * Caller's info structure is large enough to
				 * carry the "reusable" counter.
				 */
				if (p->vmp_reusable || object->all_reusable) {
					extended->pages_reusable++;
				}
			}

			extended->pages_resident++;

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			return;
		}
		if (object->internal &&
		    object->alive &&
		    !object->terminating &&
		    object->pager_ready) {
			if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
			    == VM_EXTERNAL_STATE_EXISTS) {
				/* the pager has that page */
				extended->pages_swapped_out++;
				if (object != caller_object) {
					vm_object_unlock(object);
				}
				return;
			}
		}

		if (shadow) {
			/*
			 * Page not found at this level: descend into the
			 * shadow object, locking hand-over-hand.
			 */
			vm_object_lock(shadow);

			if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
				/* discount the reference held by the paging activity */
				ref_count--;
			}

			if (++depth > extended->shadow_depth) {
				extended->shadow_depth = depth;
			}

			if (ref_count > max_refcnt) {
				max_refcnt = ref_count;
			}

			if (object != caller_object) {
				vm_object_unlock(object);
			}

			/* translate the offset into the shadow's name space */
			offset = offset + object->vo_shadow_offset;
			object = shadow;
			shadow = object->shadow;
			continue;
		}
		/* end of the shadow chain: page is neither resident nor paged out */
		if (object != caller_object) {
			vm_object_unlock(object);
		}
		break;
	}
}
14939
14940 static int
vm_map_region_count_obj_refs(vm_map_entry_t entry,vm_object_t object)14941 vm_map_region_count_obj_refs(
14942 vm_map_entry_t entry,
14943 vm_object_t object)
14944 {
14945 int ref_count;
14946 vm_object_t chk_obj;
14947 vm_object_t tmp_obj;
14948
14949 if (VME_OBJECT(entry) == 0) {
14950 return 0;
14951 }
14952
14953 if (entry->is_sub_map) {
14954 return 0;
14955 } else {
14956 ref_count = 0;
14957
14958 chk_obj = VME_OBJECT(entry);
14959 vm_object_lock(chk_obj);
14960
14961 while (chk_obj) {
14962 if (chk_obj == object) {
14963 ref_count++;
14964 }
14965 tmp_obj = chk_obj->shadow;
14966 if (tmp_obj) {
14967 vm_object_lock(tmp_obj);
14968 }
14969 vm_object_unlock(chk_obj);
14970
14971 chk_obj = tmp_obj;
14972 }
14973 }
14974 return ref_count;
14975 }
14976
14977
14978 /*
14979 * Routine: vm_map_simplify
14980 *
14981 * Description:
14982 * Attempt to simplify the map representation in
14983 * the vicinity of the given starting address.
14984 * Note:
14985 * This routine is intended primarily to keep the
14986 * kernel maps more compact -- they generally don't
14987 * benefit from the "expand a map entry" technology
14988 * at allocation time because the adjacent entry
14989 * is often wired down.
14990 */
/*
 * vm_map_simplify_entry:
 *
 * Attempt to coalesce "this_entry" with the immediately preceding
 * map entry.  The merge happens only when the two entries are
 * adjacent in the address space, map contiguous offsets of the same
 * object (or submap) and agree on every attribute that affects how
 * the mapping behaves.  On success the previous entry is unlinked
 * and disposed of, and "this_entry" grows backwards to cover the
 * combined range.
 *
 * The caller must hold the map lock for writing.
 */
void
vm_map_simplify_entry(
	vm_map_t        map,
	vm_map_entry_t  this_entry)
{
	vm_map_entry_t  prev_entry;

	prev_entry = this_entry->vme_prev;

	if ((this_entry != vm_map_to_entry(map)) &&
	    (prev_entry != vm_map_to_entry(map)) &&

	    /* the two mappings must be virtually adjacent ... */
	    (prev_entry->vme_end == this_entry->vme_start) &&

	    /* ... and map contiguous offsets of the same object/submap */
	    (prev_entry->is_sub_map == this_entry->is_sub_map) &&
	    (VME_OBJECT(prev_entry) == VME_OBJECT(this_entry)) &&
	    ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
	    prev_entry->vme_start))
	    == VME_OFFSET(this_entry)) &&

	    /* every behavior-affecting attribute must match */
	    (prev_entry->behavior == this_entry->behavior) &&
	    (prev_entry->needs_copy == this_entry->needs_copy) &&
	    (prev_entry->protection == this_entry->protection) &&
	    (prev_entry->max_protection == this_entry->max_protection) &&
	    (prev_entry->inheritance == this_entry->inheritance) &&
	    (prev_entry->use_pmap == this_entry->use_pmap) &&
	    (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
	    (prev_entry->no_cache == this_entry->no_cache) &&
	    (prev_entry->permanent == this_entry->permanent) &&
	    (prev_entry->map_aligned == this_entry->map_aligned) &&
	    (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
	    (prev_entry->used_for_jit == this_entry->used_for_jit) &&
	    (prev_entry->pmap_cs_associated == this_entry->pmap_cs_associated) &&
	    (prev_entry->iokit_acct == this_entry->iokit_acct) &&
	    (prev_entry->vme_resilient_codesign ==
	    this_entry->vme_resilient_codesign) &&
	    (prev_entry->vme_resilient_media ==
	    this_entry->vme_resilient_media) &&
	    (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&

	    (prev_entry->wired_count == this_entry->wired_count) &&
	    (prev_entry->user_wired_count == this_entry->user_wired_count) &&

	    /* entries being operated on or waited upon cannot be merged */
	    ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
	    (prev_entry->in_transition == FALSE) &&
	    (this_entry->in_transition == FALSE) &&
	    (prev_entry->needs_wakeup == FALSE) &&
	    (this_entry->needs_wakeup == FALSE) &&
	    (prev_entry->is_shared == this_entry->is_shared) &&
	    (prev_entry->superpage_size == FALSE) &&
	    (this_entry->superpage_size == FALSE)
	    ) {
		vm_map_store_entry_unlink(map, prev_entry);
		assert(prev_entry->vme_start < this_entry->vme_end);
		if (prev_entry->map_aligned) {
			assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
			    VM_MAP_PAGE_MASK(map)));
		}
		/* extend "this_entry" backwards over the merged range */
		this_entry->vme_start = prev_entry->vme_start;
		VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));

		if (map->holelistenabled) {
			vm_map_store_update_first_free(map, this_entry, TRUE);
		}

		if (prev_entry->is_sub_map) {
			/* drop the reference "prev_entry" held on its submap */
			vm_map_deallocate(VME_SUBMAP(prev_entry));
		} else {
			/* drop the reference "prev_entry" held on its object */
			vm_object_deallocate(VME_OBJECT(prev_entry));
		}
		vm_map_entry_dispose(prev_entry);
		SAVE_HINT_MAP_WRITE(map, this_entry);
	}
}
15065
15066 void
vm_map_simplify(vm_map_t map,vm_map_offset_t start)15067 vm_map_simplify(
15068 vm_map_t map,
15069 vm_map_offset_t start)
15070 {
15071 vm_map_entry_t this_entry;
15072
15073 vm_map_lock(map);
15074 if (vm_map_lookup_entry(map, start, &this_entry)) {
15075 vm_map_simplify_entry(map, this_entry);
15076 vm_map_simplify_entry(map, this_entry->vme_next);
15077 }
15078 vm_map_unlock(map);
15079 }
15080
15081 static void
vm_map_simplify_range(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15082 vm_map_simplify_range(
15083 vm_map_t map,
15084 vm_map_offset_t start,
15085 vm_map_offset_t end)
15086 {
15087 vm_map_entry_t entry;
15088
15089 /*
15090 * The map should be locked (for "write") by the caller.
15091 */
15092
15093 if (start >= end) {
15094 /* invalid address range */
15095 return;
15096 }
15097
15098 start = vm_map_trunc_page(start,
15099 VM_MAP_PAGE_MASK(map));
15100 end = vm_map_round_page(end,
15101 VM_MAP_PAGE_MASK(map));
15102
15103 if (!vm_map_lookup_entry(map, start, &entry)) {
15104 /* "start" is not mapped and "entry" ends before "start" */
15105 if (entry == vm_map_to_entry(map)) {
15106 /* start with first entry in the map */
15107 entry = vm_map_first_entry(map);
15108 } else {
15109 /* start with next entry */
15110 entry = entry->vme_next;
15111 }
15112 }
15113
15114 while (entry != vm_map_to_entry(map) &&
15115 entry->vme_start <= end) {
15116 /* try and coalesce "entry" with its previous entry */
15117 vm_map_simplify_entry(map, entry);
15118 entry = entry->vme_next;
15119 }
15120 }
15121
15122
15123 /*
15124 * Routine: vm_map_machine_attribute
15125 * Purpose:
15126 * Provide machine-specific attributes to mappings,
15127 * such as cachability etc. for machines that provide
15128 * them. NUMA architectures and machines with big/strange
15129 * caches will use this.
15130 * Note:
15131 * Responsibilities for locking and checking are handled here,
15132 * everything else in the pmap module. If any non-volatile
15133 * information must be kept, the pmap module should handle
15134 * it itself. [This assumes that attributes do not
15135 * need to be inherited, which seems ok to me]
15136 */
/*
 * vm_map_machine_attribute:
 *
 * Apply/query a machine-specific attribute over [start, end) of "map".
 * Non-cache attributes are forwarded straight to the pmap layer; the
 * MATTR_CACHE case walks the range entry by entry so each resident
 * physical page's cache can be synchronized individually.
 *
 * "value" is IN/OUT, as defined by the pmap attribute interface.
 */
kern_return_t
vm_map_machine_attribute(
	vm_map_t                        map,
	vm_map_offset_t                 start,
	vm_map_offset_t                 end,
	vm_machine_attribute_t          attribute,
	vm_machine_attribute_val_t*     value)          /* IN/OUT */
{
	kern_return_t   ret;
	vm_map_size_t   sync_size;
	vm_map_entry_t  entry;

	if (start < vm_map_min(map) || end > vm_map_max(map)) {
		return KERN_INVALID_ADDRESS;
	}

	/* Figure how much memory we need to flush (in page increments) */
	sync_size = end - start;

	vm_map_lock(map);

	if (attribute != MATTR_CACHE) {
		/* If we don't have to find physical addresses, we */
		/* don't have to do an explicit traversal here. */
		ret = pmap_attribute(map->pmap, start, end - start,
		    attribute, value);
		vm_map_unlock(map);
		return ret;
	}

	ret = KERN_SUCCESS; /* Assume it all worked */

	/*
	 * MATTR_CACHE: walk the range one map entry at a time, finding
	 * the physical page behind each mapped page and syncing it.
	 */
	while (sync_size) {
		if (vm_map_lookup_entry(map, start, &entry)) {
			vm_map_size_t sub_size;
			/* portion of this entry that falls within the request */
			if ((entry->vme_end - start) > sync_size) {
				sub_size = sync_size;
				sync_size = 0;
			} else {
				sub_size = entry->vme_end - start;
				sync_size -= sub_size;
			}
			if (entry->is_sub_map) {
				vm_map_offset_t sub_start;
				vm_map_offset_t sub_end;

				sub_start = (start - entry->vme_start)
				    + VME_OFFSET(entry);
				sub_end = sub_start + sub_size;
				/*
				 * NOTE(review): the recursion's return value
				 * is discarded; "ret" keeps whatever status
				 * the top-level walk produced.
				 */
				vm_map_machine_attribute(
					VME_SUBMAP(entry),
					sub_start,
					sub_end,
					attribute, value);
			} else {
				if (VME_OBJECT(entry)) {
					vm_page_t               m;
					vm_object_t             object;
					vm_object_t             base_object;
					vm_object_t             last_object;
					vm_object_offset_t      offset;
					vm_object_offset_t      base_offset;
					vm_map_size_t           range;
					range = sub_size;
					offset = (start - entry->vme_start)
					    + VME_OFFSET(entry);
					offset = vm_object_trunc_page(offset);
					base_offset = offset;
					object = VME_OBJECT(entry);
					base_object = object;
					last_object = NULL;

					vm_object_lock(object);

					while (range) {
						m = vm_page_lookup(
							object, offset);

						if (m && !m->vmp_fictitious) {
							/* resident page: sync its cache */
							ret =
							    pmap_attribute_cache_sync(
								VM_PAGE_GET_PHYS_PAGE(m),
								PAGE_SIZE,
								attribute, value);
						} else if (object->shadow) {
							/*
							 * Page not here: retry in the
							 * shadow object, locking
							 * hand-over-hand.
							 */
							offset = offset + object->vo_shadow_offset;
							last_object = object;
							object = object->shadow;
							vm_object_lock(last_object->shadow);
							vm_object_unlock(last_object);
							continue;
						}
						if (range < PAGE_SIZE) {
							range = 0;
						} else {
							range -= PAGE_SIZE;
						}

						if (base_object != object) {
							/* climb back to the top of the chain */
							vm_object_unlock(object);
							vm_object_lock(base_object);
							object = base_object;
						}
						/* Bump to the next page */
						base_offset += PAGE_SIZE;
						offset = base_offset;
					}
					vm_object_unlock(object);
				}
			}
			start += sub_size;
		} else {
			/* hole in the requested range */
			vm_map_unlock(map);
			return KERN_FAILURE;
		}
	}

	vm_map_unlock(map);

	return ret;
}
15258
15259 /*
15260 * vm_map_behavior_set:
15261 *
15262 * Sets the paging reference behavior of the specified address
15263 * range in the target map. Paging reference behavior affects
15264 * how pagein operations resulting from faults on the map will be
15265 * clustered.
15266 */
15267 kern_return_t
vm_map_behavior_set(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_behavior_t new_behavior)15268 vm_map_behavior_set(
15269 vm_map_t map,
15270 vm_map_offset_t start,
15271 vm_map_offset_t end,
15272 vm_behavior_t new_behavior)
15273 {
15274 vm_map_entry_t entry;
15275 vm_map_entry_t temp_entry;
15276
15277 if (start > end ||
15278 start < vm_map_min(map) ||
15279 end > vm_map_max(map)) {
15280 return KERN_NO_SPACE;
15281 }
15282
15283 switch (new_behavior) {
15284 /*
15285 * This first block of behaviors all set a persistent state on the specified
15286 * memory range. All we have to do here is to record the desired behavior
15287 * in the vm_map_entry_t's.
15288 */
15289
15290 case VM_BEHAVIOR_DEFAULT:
15291 case VM_BEHAVIOR_RANDOM:
15292 case VM_BEHAVIOR_SEQUENTIAL:
15293 case VM_BEHAVIOR_RSEQNTL:
15294 case VM_BEHAVIOR_ZERO_WIRED_PAGES:
15295 vm_map_lock(map);
15296
15297 /*
15298 * The entire address range must be valid for the map.
15299 * Note that vm_map_range_check() does a
15300 * vm_map_lookup_entry() internally and returns the
15301 * entry containing the start of the address range if
15302 * the entire range is valid.
15303 */
15304 if (vm_map_range_check(map, start, end, &temp_entry)) {
15305 entry = temp_entry;
15306 vm_map_clip_start(map, entry, start);
15307 } else {
15308 vm_map_unlock(map);
15309 return KERN_INVALID_ADDRESS;
15310 }
15311
15312 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
15313 vm_map_clip_end(map, entry, end);
15314 if (entry->is_sub_map) {
15315 assert(!entry->use_pmap);
15316 }
15317
15318 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
15319 entry->zero_wired_pages = TRUE;
15320 } else {
15321 entry->behavior = new_behavior;
15322 }
15323 entry = entry->vme_next;
15324 }
15325
15326 vm_map_unlock(map);
15327 break;
15328
15329 /*
15330 * The rest of these are different from the above in that they cause
15331 * an immediate action to take place as opposed to setting a behavior that
15332 * affects future actions.
15333 */
15334
15335 case VM_BEHAVIOR_WILLNEED:
15336 return vm_map_willneed(map, start, end);
15337
15338 case VM_BEHAVIOR_DONTNEED:
15339 return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
15340
15341 case VM_BEHAVIOR_FREE:
15342 return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
15343
15344 case VM_BEHAVIOR_REUSABLE:
15345 return vm_map_reusable_pages(map, start, end);
15346
15347 case VM_BEHAVIOR_REUSE:
15348 return vm_map_reuse_pages(map, start, end);
15349
15350 case VM_BEHAVIOR_CAN_REUSE:
15351 return vm_map_can_reuse(map, start, end);
15352
15353 #if MACH_ASSERT
15354 case VM_BEHAVIOR_PAGEOUT:
15355 return vm_map_pageout(map, start, end);
15356 #endif /* MACH_ASSERT */
15357
15358 default:
15359 return KERN_INVALID_ARGUMENT;
15360 }
15361
15362 return KERN_SUCCESS;
15363 }
15364
15365
15366 /*
15367 * Internals for madvise(MADV_WILLNEED) system call.
15368 *
 * The implementation is to:
15370 * a) read-ahead if the mapping corresponds to a mapped regular file
15371 * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
15372 */
15373
15374
/*
 * vm_map_willneed:
 *
 * Back end for madvise(MADV_WILLNEED): for each entry overlapping
 * [start, end), either pre-fault anonymous pages or issue an
 * asynchronous read-ahead request to the backing file pager.
 * Returns KERN_INVALID_ADDRESS if the range contains a hole;
 * pager I/O failures are deliberately swallowed (advice only).
 */
static kern_return_t
vm_map_willneed(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end
	)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	memory_object_t                 pager;
	struct vm_object_fault_info     fault_info = {};
	kern_return_t                   kr;
	vm_object_size_t                len;
	vm_object_offset_t              offset;

	fault_info.interruptible = THREAD_UNINT; /* ignored value */
	fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
	fault_info.stealth = TRUE;

	/*
	 * The MADV_WILLNEED operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && start < end;) {
		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.  After that, the offset will always be zero to
		 * correspond to the beginning of the current vm_map_entry.
		 */
		offset = (start - entry->vme_start) + VME_OFFSET(entry);

		/*
		 * Set the length so we don't go beyond the end of the
		 * map_entry or beyond the end of the range we were given.
		 * This range could span also multiple map entries all of which
		 * map different files, so make sure we only do the right amount
		 * of I/O for each object.  Note that it's possible for there
		 * to be multiple map entries all referring to the same object
		 * but with different page permissions, but it's not worth
		 * trying to optimize that case.
		 */
		len = MIN(entry->vme_end - start, end - start);

		if ((vm_size_t) len != len) {
			/* 32-bit overflow */
			len = (vm_size_t) (0 - PAGE_SIZE);
		}
		fault_info.cluster_size = (vm_size_t) len;
		fault_info.lo_offset = offset;
		fault_info.hi_offset = offset + len;
		fault_info.user_tag = VME_ALIAS(entry);
		fault_info.pmap_options = 0;
		if (entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap)) {
			/* mapping uses alternate ("iokit") accounting */
			fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
		}

		/*
		 * If the entry is a submap OR there's no read permission
		 * to this mapping, then just skip it.
		 */
		if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
			entry = entry->vme_next;
			start = entry->vme_start;
			continue;
		}

		object = VME_OBJECT(entry);

		if (object == NULL ||
		    (object && object->internal)) {
			/*
			 * Memory range backed by anonymous memory:
			 * fault the pages in (zero-fill or decompress).
			 */
			vm_size_t region_size = 0, effective_page_size = 0;
			vm_map_offset_t addr = 0, effective_page_mask = 0;

			region_size = len;
			addr = start;

			/*
			 * NOTE(review): uses current_map()'s page mask rather
			 * than "map"'s — presumably map == current_map() for
			 * madvise; confirm against callers.
			 */
			effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
			effective_page_size = effective_page_mask + 1;

			vm_map_unlock_read(map);

			while (region_size) {
				vm_pre_fault(
					vm_map_trunc_page(addr, effective_page_mask),
					VM_PROT_READ | VM_PROT_WRITE);

				region_size -= effective_page_size;
				addr += effective_page_size;
			}
		} else {
			/*
			 * Find the file object backing this map entry.  If there is
			 * none, then we simply ignore the "will need" advice for this
			 * entry and go on to the next one.
			 */
			if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
				entry = entry->vme_next;
				start = entry->vme_start;
				continue;
			}

			vm_object_paging_begin(object);
			pager = object->pager;
			vm_object_unlock(object);

			/*
			 * The data_request() could take a long time, so let's
			 * release the map lock to avoid blocking other threads.
			 */
			vm_map_unlock_read(map);

			/*
			 * Get the data from the object asynchronously.
			 *
			 * Note that memory_object_data_request() places limits on the
			 * amount of I/O it will do.  Regardless of the len we
			 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
			 * silently truncates the len to that size.  This isn't
			 * necessarily bad since madvise shouldn't really be used to
			 * page in unlimited amounts of data.  Other Unix variants
			 * limit the willneed case as well.  If this turns out to be an
			 * issue for developers, then we can always adjust the policy
			 * here and still be backwards compatible since this is all
			 * just "advice".
			 */
			kr = memory_object_data_request(
				pager,
				vm_object_trunc_page(offset) + object->paging_offset,
				0, /* ignored */
				VM_PROT_READ,
				(memory_object_fault_info_t)&fault_info);

			vm_object_lock(object);
			vm_object_paging_end(object);
			vm_object_unlock(object);

			/*
			 * If we couldn't do the I/O for some reason, just give up on
			 * the madvise.  We still return success to the user since
			 * madvise isn't supposed to fail when the advice can't be
			 * taken.
			 */

			if (kr != KERN_SUCCESS) {
				return KERN_SUCCESS;
			}
		}

		start += len;
		if (start >= end) {
			/* done */
			return KERN_SUCCESS;
		}

		/* look up next entry */
		vm_map_lock_read(map);
		if (!vm_map_lookup_entry(map, start, &entry)) {
			/*
			 * There's a new hole in the address range.
			 */
			vm_map_unlock_read(map);
			return KERN_INVALID_ADDRESS;
		}
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
15565
/*
 * vm_map_entry_is_reusable:
 *
 * Decide whether "entry" maps memory whose pages may safely be
 * marked "reusable".  For malloc()-tagged regions we require the
 * entry and its VM object to still be in their original, unwired,
 * unshadowed state; any other tag is left to the caller's judgement
 * (returns TRUE).  Submap entries are never reusable.
 */
static boolean_t
vm_map_entry_is_reusable(
	vm_map_entry_t entry)
{
	/* Only user map entries */

	vm_object_t object;

	if (entry->is_sub_map) {
		return FALSE;
	}

	switch (VME_ALIAS(entry)) {
	case VM_MEMORY_MALLOC:
	case VM_MEMORY_MALLOC_SMALL:
	case VM_MEMORY_MALLOC_LARGE:
	case VM_MEMORY_REALLOC:
	case VM_MEMORY_MALLOC_TINY:
	case VM_MEMORY_MALLOC_LARGE_REUSABLE:
	case VM_MEMORY_MALLOC_LARGE_REUSED:
		/*
		 * This is a malloc() memory region: check if it's still
		 * in its original state and can be re-used for more
		 * malloc() allocations.
		 */
		break;
	default:
		/*
		 * Not a malloc() memory region: let the caller decide if
		 * it's re-usable.
		 */
		return TRUE;
	}

	/* the entry itself must be unmodified from its original state */
	if (/*entry->is_shared ||*/
	    entry->is_sub_map ||
	    entry->in_transition ||
	    entry->protection != VM_PROT_DEFAULT ||
	    entry->max_protection != VM_PROT_ALL ||
	    entry->inheritance != VM_INHERIT_DEFAULT ||
	    entry->no_cache ||
	    entry->permanent ||
	    entry->superpage_size != FALSE ||
	    entry->zero_wired_pages ||
	    entry->wired_count != 0 ||
	    entry->user_wired_count != 0) {
		return FALSE;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		return TRUE;
	}
	/* the backing object must be a plain, unshadowed anonymous object */
	if (
#if 0
		/*
		 * Let's proceed even if the VM object is potentially
		 * shared.
		 * We check for this later when processing the actual
		 * VM pages, so the contents will be safe if shared.
		 *
		 * But we can still mark this memory region as "reusable" to
		 * acknowledge that the caller did let us know that the memory
		 * could be re-used and should not be penalized for holding
		 * on to it.  This allows its "resident size" to not include
		 * the reusable range.
		 */
		object->ref_count == 1 &&
#endif
		object->wired_page_count == 0 &&
		object->copy == VM_OBJECT_NULL &&
		object->shadow == VM_OBJECT_NULL &&
		object->internal &&
		object->purgable == VM_PURGABLE_DENY &&
		object->wimg_bits == VM_WIMG_USE_DEFAULT &&
		!object->code_signed) {
		return TRUE;
	}
	return FALSE;
}
15646
15647 static kern_return_t
vm_map_reuse_pages(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15648 vm_map_reuse_pages(
15649 vm_map_t map,
15650 vm_map_offset_t start,
15651 vm_map_offset_t end)
15652 {
15653 vm_map_entry_t entry;
15654 vm_object_t object;
15655 vm_object_offset_t start_offset, end_offset;
15656
15657 /*
15658 * The MADV_REUSE operation doesn't require any changes to the
15659 * vm_map_entry_t's, so the read lock is sufficient.
15660 */
15661
15662 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
15663 /*
15664 * XXX TODO4K
15665 * need to figure out what reusable means for a
15666 * portion of a native page.
15667 */
15668 return KERN_SUCCESS;
15669 }
15670
15671 vm_map_lock_read(map);
15672 assert(map->pmap != kernel_pmap); /* protect alias access */
15673
15674 /*
15675 * The madvise semantics require that the address range be fully
15676 * allocated with no holes. Otherwise, we're required to return
15677 * an error.
15678 */
15679
15680 if (!vm_map_range_check(map, start, end, &entry)) {
15681 vm_map_unlock_read(map);
15682 vm_page_stats_reusable.reuse_pages_failure++;
15683 return KERN_INVALID_ADDRESS;
15684 }
15685
15686 /*
15687 * Examine each vm_map_entry_t in the range.
15688 */
15689 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
15690 entry = entry->vme_next) {
15691 /*
15692 * Sanity check on the VM map entry.
15693 */
15694 if (!vm_map_entry_is_reusable(entry)) {
15695 vm_map_unlock_read(map);
15696 vm_page_stats_reusable.reuse_pages_failure++;
15697 return KERN_INVALID_ADDRESS;
15698 }
15699
15700 /*
15701 * The first time through, the start address could be anywhere
15702 * within the vm_map_entry we found. So adjust the offset to
15703 * correspond.
15704 */
15705 if (entry->vme_start < start) {
15706 start_offset = start - entry->vme_start;
15707 } else {
15708 start_offset = 0;
15709 }
15710 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
15711 start_offset += VME_OFFSET(entry);
15712 end_offset += VME_OFFSET(entry);
15713
15714 assert(!entry->is_sub_map);
15715 object = VME_OBJECT(entry);
15716 if (object != VM_OBJECT_NULL) {
15717 vm_object_lock(object);
15718 vm_object_reuse_pages(object, start_offset, end_offset,
15719 TRUE);
15720 vm_object_unlock(object);
15721 }
15722
15723 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
15724 /*
15725 * XXX
15726 * We do not hold the VM map exclusively here.
15727 * The "alias" field is not that critical, so it's
15728 * safe to update it here, as long as it is the only
15729 * one that can be modified while holding the VM map
15730 * "shared".
15731 */
15732 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
15733 }
15734 }
15735
15736 vm_map_unlock_read(map);
15737 vm_page_stats_reusable.reuse_pages_success++;
15738 return KERN_SUCCESS;
15739 }
15740
15741
/*
 * vm_map_reusable_pages:
 *
 * Back end for madvise(MADV_REUSABLE): mark the pages backing
 * [start, end) as "reusable" so the system may reclaim their
 * contents.  Pages of objects that are (or may be) shared are only
 * counted, not deactivated.  Requires the whole range to be mapped,
 * writable and made of "reusable-eligible" entries.
 */
static kern_return_t
vm_map_reusable_pages(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t                  entry;
	vm_object_t                     object;
	vm_object_offset_t              start_offset, end_offset;
	vm_map_offset_t                 pmap_offset;

	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
		/*
		 * XXX TODO4K
		 * need to figure out what reusable means for a portion
		 * of a native page.
		 */
		return KERN_SUCCESS;
	}

	/*
	 * The MADV_REUSABLE operation doesn't require any changes to the
	 * vm_map_entry_t's, so the read lock is sufficient.
	 */

	vm_map_lock_read(map);
	assert(map->pmap != kernel_pmap); /* protect alias access */

	/*
	 * The madvise semantics require that the address range be fully
	 * allocated with no holes.  Otherwise, we're required to return
	 * an error.
	 */

	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		vm_page_stats_reusable.reusable_pages_failure++;
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Examine each vm_map_entry_t in the range.
	 */
	for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
	    entry = entry->vme_next) {
		/* 1 = discard page contents, -1 = count only (see below) */
		int kill_pages = 0;

		/*
		 * Sanity check on the VM map entry.
		 */
		if (!vm_map_entry_is_reusable(entry)) {
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_INVALID_ADDRESS;
		}

		if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit) {
			/* not writable: can't discard contents */
			vm_map_unlock_read(map);
			vm_page_stats_reusable.reusable_nonwritable++;
			vm_page_stats_reusable.reusable_pages_failure++;
			return KERN_PROTECTION_FAILURE;
		}

		/*
		 * The first time through, the start address could be anywhere
		 * within the vm_map_entry we found.  So adjust the offset to
		 * correspond.
		 */
		if (entry->vme_start < start) {
			start_offset = start - entry->vme_start;
			pmap_offset = start;
		} else {
			start_offset = 0;
			pmap_offset = entry->vme_start;
		}
		end_offset = MIN(end, entry->vme_end) - entry->vme_start;
		start_offset += VME_OFFSET(entry);
		end_offset += VME_OFFSET(entry);

		assert(!entry->is_sub_map);
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}


		vm_object_lock(object);
		/*
		 * Discard contents only when the object is effectively
		 * unshared (single reference, or no symmetric-copy
		 * obligations) and not under alternate accounting.
		 */
		if (((object->ref_count == 1) ||
		    (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
		    object->copy == VM_OBJECT_NULL)) &&
		    object->shadow == VM_OBJECT_NULL &&
		    /*
		     * "iokit_acct" entries are billed for their virtual size
		     * (rather than for their resident pages only), so they
		     * wouldn't benefit from making pages reusable, and it
		     * would be hard to keep track of pages that are both
		     * "iokit_acct" and "reusable" in the pmap stats and
		     * ledgers.
		     */
		    !(entry->iokit_acct ||
		    (!entry->is_sub_map && !entry->use_pmap))) {
			if (object->ref_count != 1) {
				vm_page_stats_reusable.reusable_shared++;
			}
			kill_pages = 1;
		} else {
			kill_pages = -1;
		}
		if (kill_pages != -1) {
			vm_object_deactivate_pages(object,
			    start_offset,
			    end_offset - start_offset,
			    kill_pages,
			    TRUE /*reusable_pages*/,
			    map->pmap,
			    pmap_offset);
		} else {
			/* shared object: just account for it, don't deactivate */
			vm_page_stats_reusable.reusable_pages_shared++;
			DTRACE_VM4(vm_map_reusable_pages_shared,
			    unsigned int, VME_ALIAS(entry),
			    vm_map_t, map,
			    vm_map_entry_t, entry,
			    vm_object_t, object);
		}
		vm_object_unlock(object);

		if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
		    VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
			/*
			 * XXX
			 * We do not hold the VM map exclusively here.
			 * The "alias" field is not that critical, so it's
			 * safe to update it here, as long as it is the only
			 * one that can be modified while holding the VM map
			 * "shared".
			 */
			VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
		}
	}

	vm_map_unlock_read(map);
	vm_page_stats_reusable.reusable_pages_success++;
	return KERN_SUCCESS;
}
15887
15888
15889 static kern_return_t
vm_map_can_reuse(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end)15890 vm_map_can_reuse(
15891 vm_map_t map,
15892 vm_map_offset_t start,
15893 vm_map_offset_t end)
15894 {
15895 vm_map_entry_t entry;
15896
15897 /*
15898 * The MADV_REUSABLE operation doesn't require any changes to the
15899 * vm_map_entry_t's, so the read lock is sufficient.
15900 */
15901
15902 vm_map_lock_read(map);
15903 assert(map->pmap != kernel_pmap); /* protect alias access */
15904
15905 /*
15906 * The madvise semantics require that the address range be fully
15907 * allocated with no holes. Otherwise, we're required to return
15908 * an error.
15909 */
15910
15911 if (!vm_map_range_check(map, start, end, &entry)) {
15912 vm_map_unlock_read(map);
15913 vm_page_stats_reusable.can_reuse_failure++;
15914 return KERN_INVALID_ADDRESS;
15915 }
15916
15917 /*
15918 * Examine each vm_map_entry_t in the range.
15919 */
15920 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
15921 entry = entry->vme_next) {
15922 /*
15923 * Sanity check on the VM map entry.
15924 */
15925 if (!vm_map_entry_is_reusable(entry)) {
15926 vm_map_unlock_read(map);
15927 vm_page_stats_reusable.can_reuse_failure++;
15928 return KERN_INVALID_ADDRESS;
15929 }
15930 }
15931
15932 vm_map_unlock_read(map);
15933 vm_page_stats_reusable.can_reuse_success++;
15934 return KERN_SUCCESS;
15935 }
15936
15937
#if MACH_ASSERT
/*
 * Routine:	vm_map_pageout
 *
 * Push the resident pages of each internal VM object mapped in
 * [start, end) out via vm_object_pageout().  Nothing in the map
 * itself changes, so the shared (read) map lock is sufficient.
 * Only built under MACH_ASSERT (debug/testing).
 */
static kern_return_t
vm_map_pageout(
	vm_map_t        map,
	vm_map_offset_t start,
	vm_map_offset_t end)
{
	vm_map_entry_t entry;

	vm_map_lock_read(map);

	/*
	 * The madvise semantics require the address range to be fully
	 * allocated with no holes; otherwise report an error.
	 */
	if (!vm_map_range_check(map, start, end, &entry)) {
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	/*
	 * Visit each vm_map_entry_t overlapping the range.
	 */
	while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
		vm_object_t object;

		if (entry->is_sub_map) {
			/*
			 * Descend one level: find the equivalent range in
			 * the submap and page out its backing object if it
			 * is an internal (anonymous) one.
			 */
			vm_map_t        submap = VME_SUBMAP(entry);
			vm_map_offset_t submap_start = VME_OFFSET(entry);
			vm_map_offset_t submap_end;
			vm_map_entry_t  submap_entry;

			submap_end = submap_start +
			    (entry->vme_end - entry->vme_start);

			vm_map_lock_read(submap);
			if (!vm_map_range_check(submap,
			    submap_start,
			    submap_end,
			    &submap_entry)) {
				/* hole in the submap range: error out */
				vm_map_unlock_read(submap);
				vm_map_unlock_read(map);
				return KERN_INVALID_ADDRESS;
			}

			object = VME_OBJECT(submap_entry);
			if (!submap_entry->is_sub_map &&
			    object != VM_OBJECT_NULL &&
			    object->internal) {
				vm_object_pageout(object);
			}
			vm_map_unlock_read(submap);
			entry = entry->vme_next;
			continue;
		}

		object = VME_OBJECT(entry);
		/* only internal (anonymous) objects are paged out here */
		if (object != VM_OBJECT_NULL && object->internal) {
			vm_object_pageout(object);
		}
		entry = entry->vme_next;
	}

	vm_map_unlock_read(map);
	return KERN_SUCCESS;
}
#endif /* MACH_ASSERT */
16027
16028
16029 /*
16030 * Routine: vm_map_entry_insert
16031 *
16032 * Description: This routine inserts a new vm_entry in a locked map.
16033 */
16034 static vm_map_entry_t
vm_map_entry_insert(vm_map_t map,vm_map_entry_t insp_entry,vm_map_offset_t start,vm_map_offset_t end,vm_object_t object,vm_object_offset_t offset,vm_map_kernel_flags_t vmk_flags,boolean_t needs_copy,vm_prot_t cur_protection,vm_prot_t max_protection,vm_inherit_t inheritance,boolean_t no_cache,boolean_t permanent,boolean_t no_copy_on_read,unsigned int superpage_size,boolean_t clear_map_aligned,boolean_t is_submap,boolean_t used_for_jit,int alias,boolean_t translated_allow_execute)16035 vm_map_entry_insert(
16036 vm_map_t map,
16037 vm_map_entry_t insp_entry,
16038 vm_map_offset_t start,
16039 vm_map_offset_t end,
16040 vm_object_t object,
16041 vm_object_offset_t offset,
16042 vm_map_kernel_flags_t vmk_flags,
16043 boolean_t needs_copy,
16044 vm_prot_t cur_protection,
16045 vm_prot_t max_protection,
16046 vm_inherit_t inheritance,
16047 boolean_t no_cache,
16048 boolean_t permanent,
16049 boolean_t no_copy_on_read,
16050 unsigned int superpage_size,
16051 boolean_t clear_map_aligned,
16052 boolean_t is_submap,
16053 boolean_t used_for_jit,
16054 int alias,
16055 boolean_t translated_allow_execute)
16056 {
16057 vm_map_entry_t new_entry;
16058 boolean_t map_aligned = FALSE;
16059
16060 assert(insp_entry != (vm_map_entry_t)0);
16061 vm_map_lock_assert_exclusive(map);
16062
16063 #if DEVELOPMENT || DEBUG
16064 vm_object_offset_t end_offset = 0;
16065 assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
16066 #endif /* DEVELOPMENT || DEBUG */
16067
16068 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
16069 map_aligned = TRUE;
16070 }
16071 if (clear_map_aligned &&
16072 (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
16073 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
16074 map_aligned = FALSE;
16075 }
16076 if (map_aligned) {
16077 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
16078 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
16079 } else {
16080 assert(page_aligned(start));
16081 assert(page_aligned(end));
16082 }
16083 assert(start < end);
16084
16085 new_entry = vm_map_entry_create(map);
16086
16087 new_entry->vme_start = start;
16088 new_entry->vme_end = end;
16089
16090 VME_OBJECT_SET(new_entry, object);
16091 VME_OFFSET_SET(new_entry, offset);
16092 VME_ALIAS_SET(new_entry, alias);
16093
16094 new_entry->map_aligned = map_aligned;
16095 new_entry->is_sub_map = is_submap;
16096 new_entry->needs_copy = needs_copy;
16097 new_entry->inheritance = inheritance;
16098 new_entry->protection = cur_protection;
16099 new_entry->max_protection = max_protection;
16100 /*
16101 * submap: "use_pmap" means "nested".
16102 * default: false.
16103 *
16104 * object: "use_pmap" means "use pmap accounting" for footprint.
16105 * default: true.
16106 */
16107 new_entry->use_pmap = !is_submap;
16108 new_entry->no_cache = no_cache;
16109 new_entry->permanent = permanent;
16110 new_entry->translated_allow_execute = translated_allow_execute;
16111 new_entry->vme_no_copy_on_read = no_copy_on_read;
16112 new_entry->superpage_size = (superpage_size != 0);
16113
16114 if (used_for_jit) {
16115 if (!(map->jit_entry_exists) ||
16116 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
16117 new_entry->used_for_jit = TRUE;
16118 map->jit_entry_exists = TRUE;
16119 }
16120 }
16121
16122 /*
16123 * Insert the new entry into the list.
16124 */
16125
16126 vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
16127 map->size += end - start;
16128
16129 /*
16130 * Update the free space hint and the lookup hint.
16131 */
16132
16133 SAVE_HINT_MAP_WRITE(map, new_entry);
16134 return new_entry;
16135 }
16136
16137 /*
16138 * Routine: vm_map_remap_extract
16139 *
16140 * Description: This routine returns a vm_entry list from a map.
16141 */
16142 static kern_return_t
vm_map_remap_extract(vm_map_t map,vm_map_offset_t addr,vm_map_size_t size,boolean_t copy,struct vm_map_header * map_header,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance,vm_map_kernel_flags_t vmk_flags)16143 vm_map_remap_extract(
16144 vm_map_t map,
16145 vm_map_offset_t addr,
16146 vm_map_size_t size,
16147 boolean_t copy,
16148 struct vm_map_header *map_header,
16149 vm_prot_t *cur_protection, /* IN/OUT */
16150 vm_prot_t *max_protection, /* IN/OUT */
16151 /* What, no behavior? */
16152 vm_inherit_t inheritance,
16153 vm_map_kernel_flags_t vmk_flags)
16154 {
16155 kern_return_t result;
16156 vm_map_size_t mapped_size;
16157 vm_map_size_t tmp_size;
16158 vm_map_entry_t src_entry; /* result of last map lookup */
16159 vm_map_entry_t new_entry;
16160 vm_object_offset_t offset;
16161 vm_map_offset_t map_address;
16162 vm_map_offset_t src_start; /* start of entry to map */
16163 vm_map_offset_t src_end; /* end of region to be mapped */
16164 vm_object_t object;
16165 vm_map_version_t version;
16166 boolean_t src_needs_copy;
16167 boolean_t new_entry_needs_copy;
16168 vm_map_entry_t saved_src_entry;
16169 boolean_t src_entry_was_wired;
16170 vm_prot_t max_prot_for_prot_copy;
16171 vm_map_offset_t effective_page_mask;
16172 boolean_t pageable, same_map;
16173 boolean_t vm_remap_legacy;
16174 vm_prot_t required_cur_prot, required_max_prot;
16175 vm_object_t new_copy_object; /* vm_object_copy_* result */
16176 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
16177
16178 pageable = vmk_flags.vmkf_copy_pageable;
16179 same_map = vmk_flags.vmkf_copy_same_map;
16180
16181 effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
16182
16183 assert(map != VM_MAP_NULL);
16184 assert(size != 0);
16185 assert(size == vm_map_round_page(size, effective_page_mask));
16186 assert(inheritance == VM_INHERIT_NONE ||
16187 inheritance == VM_INHERIT_COPY ||
16188 inheritance == VM_INHERIT_SHARE);
16189 assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16190 assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
16191 assert((*cur_protection & *max_protection) == *cur_protection);
16192
16193 /*
16194 * Compute start and end of region.
16195 */
16196 src_start = vm_map_trunc_page(addr, effective_page_mask);
16197 src_end = vm_map_round_page(src_start + size, effective_page_mask);
16198
16199 /*
16200 * Initialize map_header.
16201 */
16202 map_header->links.next = CAST_TO_VM_MAP_ENTRY(&map_header->links);
16203 map_header->links.prev = CAST_TO_VM_MAP_ENTRY(&map_header->links);
16204 map_header->nentries = 0;
16205 map_header->entries_pageable = pageable;
16206 // map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
16207 map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
16208 map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
16209
16210 vm_map_store_init( map_header );
16211
16212 if (copy && vmk_flags.vmkf_remap_prot_copy) {
16213 /*
16214 * Special case for vm_map_protect(VM_PROT_COPY):
16215 * we want to set the new mappings' max protection to the
16216 * specified *max_protection...
16217 */
16218 max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
16219 /* ... but we want to use the vm_remap() legacy mode */
16220 *max_protection = VM_PROT_NONE;
16221 *cur_protection = VM_PROT_NONE;
16222 } else {
16223 max_prot_for_prot_copy = VM_PROT_NONE;
16224 }
16225
16226 if (*cur_protection == VM_PROT_NONE &&
16227 *max_protection == VM_PROT_NONE) {
16228 /*
16229 * vm_remap() legacy mode:
16230 * Extract all memory regions in the specified range and
16231 * collect the strictest set of protections allowed on the
16232 * entire range, so the caller knows what they can do with
16233 * the remapped range.
16234 * We start with VM_PROT_ALL and we'll remove the protections
16235 * missing from each memory region.
16236 */
16237 vm_remap_legacy = TRUE;
16238 *cur_protection = VM_PROT_ALL;
16239 *max_protection = VM_PROT_ALL;
16240 required_cur_prot = VM_PROT_NONE;
16241 required_max_prot = VM_PROT_NONE;
16242 } else {
16243 /*
16244 * vm_remap_new() mode:
16245 * Extract all memory regions in the specified range and
16246 * ensure that they have at least the protections specified
16247 * by the caller via *cur_protection and *max_protection.
16248 * The resulting mapping should have these protections.
16249 */
16250 vm_remap_legacy = FALSE;
16251 if (copy) {
16252 required_cur_prot = VM_PROT_NONE;
16253 required_max_prot = VM_PROT_READ;
16254 } else {
16255 required_cur_prot = *cur_protection;
16256 required_max_prot = *max_protection;
16257 }
16258 }
16259
16260 map_address = 0;
16261 mapped_size = 0;
16262 result = KERN_SUCCESS;
16263
16264 /*
16265 * The specified source virtual space might correspond to
16266 * multiple map entries, need to loop on them.
16267 */
16268 vm_map_lock(map);
16269 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16270 /*
16271 * This address space uses sub-pages so the range might
16272 * not be re-mappable in an address space with larger
16273 * pages. Re-assemble any broken-up VM map entries to
16274 * improve our chances of making it work.
16275 */
16276 vm_map_simplify_range(map, src_start, src_end);
16277 }
16278 while (mapped_size != size) {
16279 vm_map_size_t entry_size;
16280
16281 /*
16282 * Find the beginning of the region.
16283 */
16284 if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
16285 result = KERN_INVALID_ADDRESS;
16286 break;
16287 }
16288
16289 if (src_start < src_entry->vme_start ||
16290 (mapped_size && src_start != src_entry->vme_start)) {
16291 result = KERN_INVALID_ADDRESS;
16292 break;
16293 }
16294
16295 tmp_size = size - mapped_size;
16296 if (src_end > src_entry->vme_end) {
16297 tmp_size -= (src_end - src_entry->vme_end);
16298 }
16299
16300 entry_size = (vm_map_size_t)(src_entry->vme_end -
16301 src_entry->vme_start);
16302
16303 if (src_entry->is_sub_map &&
16304 vmk_flags.vmkf_copy_single_object) {
16305 vm_map_t submap;
16306 vm_map_offset_t submap_start;
16307 vm_map_size_t submap_size;
16308 boolean_t submap_needs_copy;
16309
16310 /*
16311 * No check for "required protection" on "src_entry"
16312 * because the protections that matter are the ones
16313 * on the submap's VM map entry, which will be checked
16314 * during the call to vm_map_remap_extract() below.
16315 */
16316 submap_size = src_entry->vme_end - src_start;
16317 if (submap_size > size) {
16318 submap_size = size;
16319 }
16320 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16321 submap = VME_SUBMAP(src_entry);
16322 if (copy) {
16323 /*
16324 * The caller wants a copy-on-write re-mapping,
16325 * so let's extract from the submap accordingly.
16326 */
16327 submap_needs_copy = TRUE;
16328 } else if (src_entry->needs_copy) {
16329 /*
16330 * The caller wants a shared re-mapping but the
16331 * submap is mapped with "needs_copy", so its
16332 * contents can't be shared as is. Extract the
16333 * contents of the submap as "copy-on-write".
16334 * The re-mapping won't be shared with the
16335 * original mapping but this is equivalent to
16336 * what happened with the original "remap from
16337 * submap" code.
16338 * The shared region is mapped "needs_copy", for
16339 * example.
16340 */
16341 submap_needs_copy = TRUE;
16342 } else {
16343 /*
16344 * The caller wants a shared re-mapping and
16345 * this mapping can be shared (no "needs_copy"),
16346 * so let's extract from the submap accordingly.
16347 * Kernel submaps are mapped without
16348 * "needs_copy", for example.
16349 */
16350 submap_needs_copy = FALSE;
16351 }
16352 vm_map_reference(submap);
16353 vm_map_unlock(map);
16354 src_entry = NULL;
16355 if (vm_remap_legacy) {
16356 *cur_protection = VM_PROT_NONE;
16357 *max_protection = VM_PROT_NONE;
16358 }
16359
16360 DTRACE_VM7(remap_submap_recurse,
16361 vm_map_t, map,
16362 vm_map_offset_t, addr,
16363 vm_map_size_t, size,
16364 boolean_t, copy,
16365 vm_map_offset_t, submap_start,
16366 vm_map_size_t, submap_size,
16367 boolean_t, submap_needs_copy);
16368
16369 result = vm_map_remap_extract(submap,
16370 submap_start,
16371 submap_size,
16372 submap_needs_copy,
16373 map_header,
16374 cur_protection,
16375 max_protection,
16376 inheritance,
16377 vmk_flags);
16378 vm_map_deallocate(submap);
16379 return result;
16380 }
16381
16382 if (src_entry->is_sub_map) {
16383 /* protections for submap mapping are irrelevant here */
16384 } else if (((src_entry->protection & required_cur_prot) !=
16385 required_cur_prot) ||
16386 ((src_entry->max_protection & required_max_prot) !=
16387 required_max_prot)) {
16388 if (vmk_flags.vmkf_copy_single_object &&
16389 mapped_size != 0) {
16390 /*
16391 * Single object extraction.
16392 * We can't extract more with the required
16393 * protection but we've extracted some, so
16394 * stop there and declare success.
16395 * The caller should check the size of
16396 * the copy entry we've extracted.
16397 */
16398 result = KERN_SUCCESS;
16399 } else {
16400 /*
16401 * VM range extraction.
16402 * Required proctection is not available
16403 * for this part of the range: fail.
16404 */
16405 result = KERN_PROTECTION_FAILURE;
16406 }
16407 break;
16408 }
16409
16410 if (src_entry->is_sub_map) {
16411 vm_map_t submap;
16412 vm_map_offset_t submap_start;
16413 vm_map_size_t submap_size;
16414 vm_map_copy_t submap_copy;
16415 vm_prot_t submap_curprot, submap_maxprot;
16416 boolean_t submap_needs_copy;
16417
16418 /*
16419 * No check for "required protection" on "src_entry"
16420 * because the protections that matter are the ones
16421 * on the submap's VM map entry, which will be checked
16422 * during the call to vm_map_copy_extract() below.
16423 */
16424 object = VM_OBJECT_NULL;
16425 submap_copy = VM_MAP_COPY_NULL;
16426
16427 /* find equivalent range in the submap */
16428 submap = VME_SUBMAP(src_entry);
16429 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
16430 submap_size = tmp_size;
16431 if (copy) {
16432 /*
16433 * The caller wants a copy-on-write re-mapping,
16434 * so let's extract from the submap accordingly.
16435 */
16436 submap_needs_copy = TRUE;
16437 } else if (src_entry->needs_copy) {
16438 /*
16439 * The caller wants a shared re-mapping but the
16440 * submap is mapped with "needs_copy", so its
16441 * contents can't be shared as is. Extract the
16442 * contents of the submap as "copy-on-write".
16443 * The re-mapping won't be shared with the
16444 * original mapping but this is equivalent to
16445 * what happened with the original "remap from
16446 * submap" code.
16447 * The shared region is mapped "needs_copy", for
16448 * example.
16449 */
16450 submap_needs_copy = TRUE;
16451 } else {
16452 /*
16453 * The caller wants a shared re-mapping and
16454 * this mapping can be shared (no "needs_copy"),
16455 * so let's extract from the submap accordingly.
16456 * Kernel submaps are mapped without
16457 * "needs_copy", for example.
16458 */
16459 submap_needs_copy = FALSE;
16460 }
16461 /* extra ref to keep submap alive */
16462 vm_map_reference(submap);
16463
16464 DTRACE_VM7(remap_submap_recurse,
16465 vm_map_t, map,
16466 vm_map_offset_t, addr,
16467 vm_map_size_t, size,
16468 boolean_t, copy,
16469 vm_map_offset_t, submap_start,
16470 vm_map_size_t, submap_size,
16471 boolean_t, submap_needs_copy);
16472
16473 /*
16474 * The map can be safely unlocked since we
16475 * already hold a reference on the submap.
16476 *
16477 * No timestamp since we don't care if the map
16478 * gets modified while we're down in the submap.
16479 * We'll resume the extraction at src_start + tmp_size
16480 * anyway.
16481 */
16482 vm_map_unlock(map);
16483 src_entry = NULL; /* not valid once map is unlocked */
16484
16485 if (vm_remap_legacy) {
16486 submap_curprot = VM_PROT_NONE;
16487 submap_maxprot = VM_PROT_NONE;
16488 if (max_prot_for_prot_copy) {
16489 submap_maxprot = max_prot_for_prot_copy;
16490 }
16491 } else {
16492 assert(!max_prot_for_prot_copy);
16493 submap_curprot = *cur_protection;
16494 submap_maxprot = *max_protection;
16495 }
16496 result = vm_map_copy_extract(submap,
16497 submap_start,
16498 submap_size,
16499 submap_needs_copy,
16500 &submap_copy,
16501 &submap_curprot,
16502 &submap_maxprot,
16503 inheritance,
16504 vmk_flags);
16505
16506 /* release extra ref on submap */
16507 vm_map_deallocate(submap);
16508 submap = VM_MAP_NULL;
16509
16510 if (result != KERN_SUCCESS) {
16511 vm_map_lock(map);
16512 break;
16513 }
16514
16515 /* transfer submap_copy entries to map_header */
16516 while (vm_map_copy_first_entry(submap_copy) !=
16517 vm_map_copy_to_entry(submap_copy)) {
16518 vm_map_entry_t copy_entry;
16519 vm_map_size_t copy_entry_size;
16520
16521 copy_entry = vm_map_copy_first_entry(submap_copy);
16522 assert(!copy_entry->is_sub_map);
16523 object = VME_OBJECT(copy_entry);
16524
16525 /*
16526 * Prevent kernel_object from being exposed to
16527 * user space.
16528 */
16529 if (__improbable(object == kernel_object)) {
16530 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
16531 proc_selfpid(),
16532 (current_task()->bsd_info
16533 ? proc_name_address(current_task()->bsd_info)
16534 : "?"));
16535 DTRACE_VM(extract_kernel_only);
16536 result = KERN_INVALID_RIGHT;
16537 vm_map_copy_discard(submap_copy);
16538 submap_copy = VM_MAP_COPY_NULL;
16539 vm_map_lock(map);
16540 break;
16541 }
16542
16543 vm_map_copy_entry_unlink(submap_copy, copy_entry);
16544 copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
16545 copy_entry->vme_start = map_address;
16546 copy_entry->vme_end = map_address + copy_entry_size;
16547 map_address += copy_entry_size;
16548 mapped_size += copy_entry_size;
16549 src_start += copy_entry_size;
16550 assert(src_start <= src_end);
16551 _vm_map_store_entry_link(map_header,
16552 map_header->links.prev,
16553 copy_entry);
16554 }
16555 /* done with submap_copy */
16556 vm_map_copy_discard(submap_copy);
16557
16558 if (vm_remap_legacy) {
16559 *cur_protection &= submap_curprot;
16560 *max_protection &= submap_maxprot;
16561 }
16562
16563 /* re-acquire the map lock and continue to next entry */
16564 vm_map_lock(map);
16565 continue;
16566 } else {
16567 object = VME_OBJECT(src_entry);
16568
16569 /*
16570 * Prevent kernel_object from being exposed to
16571 * user space.
16572 */
16573 if (__improbable(object == kernel_object)) {
16574 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
16575 proc_selfpid(),
16576 (current_task()->bsd_info
16577 ? proc_name_address(current_task()->bsd_info)
16578 : "?"));
16579 DTRACE_VM(extract_kernel_only);
16580 result = KERN_INVALID_RIGHT;
16581 break;
16582 }
16583
16584 if (src_entry->iokit_acct) {
16585 /*
16586 * This entry uses "IOKit accounting".
16587 */
16588 } else if (object != VM_OBJECT_NULL &&
16589 (object->purgable != VM_PURGABLE_DENY ||
16590 object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
16591 /*
16592 * Purgeable objects have their own accounting:
16593 * no pmap accounting for them.
16594 */
16595 assertf(!src_entry->use_pmap,
16596 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
16597 map,
16598 src_entry,
16599 (uint64_t)src_entry->vme_start,
16600 (uint64_t)src_entry->vme_end,
16601 src_entry->protection,
16602 src_entry->max_protection,
16603 VME_ALIAS(src_entry));
16604 } else {
16605 /*
16606 * Not IOKit or purgeable:
16607 * must be accounted by pmap stats.
16608 */
16609 assertf(src_entry->use_pmap,
16610 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
16611 map,
16612 src_entry,
16613 (uint64_t)src_entry->vme_start,
16614 (uint64_t)src_entry->vme_end,
16615 src_entry->protection,
16616 src_entry->max_protection,
16617 VME_ALIAS(src_entry));
16618 }
16619
16620 if (object == VM_OBJECT_NULL) {
16621 assert(!src_entry->needs_copy);
16622 object = vm_object_allocate(entry_size);
16623 VME_OFFSET_SET(src_entry, 0);
16624 VME_OBJECT_SET(src_entry, object);
16625 assert(src_entry->use_pmap);
16626 assert(!map->mapped_in_other_pmaps);
16627 } else if (src_entry->wired_count ||
16628 object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
16629 /*
16630 * A wired memory region should not have
16631 * any pending copy-on-write and needs to
16632 * keep pointing at the VM object that
16633 * contains the wired pages.
16634 * If we're sharing this memory (copy=false),
16635 * we'll share this VM object.
16636 * If we're copying this memory (copy=true),
16637 * we'll call vm_object_copy_slowly() below
16638 * and use the new VM object for the remapping.
16639 *
16640 * Or, we are already using an asymmetric
16641 * copy, and therefore we already have
16642 * the right object.
16643 */
16644 assert(!src_entry->needs_copy);
16645 } else if (src_entry->needs_copy || object->shadowed ||
16646 (object->internal && !object->true_share &&
16647 !src_entry->is_shared &&
16648 object->vo_size > entry_size)) {
16649 VME_OBJECT_SHADOW(src_entry, entry_size);
16650 assert(src_entry->use_pmap);
16651
16652 if (!src_entry->needs_copy &&
16653 (src_entry->protection & VM_PROT_WRITE)) {
16654 vm_prot_t prot;
16655
16656 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
16657
16658 prot = src_entry->protection & ~VM_PROT_WRITE;
16659
16660 if (override_nx(map,
16661 VME_ALIAS(src_entry))
16662 && prot) {
16663 prot |= VM_PROT_EXECUTE;
16664 }
16665
16666 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
16667
16668 if (map->mapped_in_other_pmaps) {
16669 vm_object_pmap_protect(
16670 VME_OBJECT(src_entry),
16671 VME_OFFSET(src_entry),
16672 entry_size,
16673 PMAP_NULL,
16674 PAGE_SIZE,
16675 src_entry->vme_start,
16676 prot);
16677 #if MACH_ASSERT
16678 } else if (__improbable(map->pmap == PMAP_NULL)) {
16679 extern boolean_t vm_tests_in_progress;
16680 assert(vm_tests_in_progress);
16681 /*
16682 * Some VM tests (in vm_tests.c)
16683 * sometimes want to use a VM
16684 * map without a pmap.
16685 * Otherwise, this should never
16686 * happen.
16687 */
16688 #endif /* MACH_ASSERT */
16689 } else {
16690 pmap_protect(vm_map_pmap(map),
16691 src_entry->vme_start,
16692 src_entry->vme_end,
16693 prot);
16694 }
16695 }
16696
16697 object = VME_OBJECT(src_entry);
16698 src_entry->needs_copy = FALSE;
16699 }
16700
16701
16702 vm_object_lock(object);
16703 vm_object_reference_locked(object); /* object ref. for new entry */
16704 assert(!src_entry->needs_copy);
16705 if (object->copy_strategy ==
16706 MEMORY_OBJECT_COPY_SYMMETRIC) {
16707 /*
16708 * If we want to share this object (copy==0),
16709 * it needs to be COPY_DELAY.
16710 * If we want to copy this object (copy==1),
16711 * we can't just set "needs_copy" on our side
16712 * and expect the other side to do the same
16713 * (symmetrically), so we can't let the object
16714 * stay COPY_SYMMETRIC.
16715 * So we always switch from COPY_SYMMETRIC to
16716 * COPY_DELAY.
16717 */
16718 object->copy_strategy =
16719 MEMORY_OBJECT_COPY_DELAY;
16720 object->true_share = TRUE;
16721 }
16722 vm_object_unlock(object);
16723 }
16724
16725 offset = (VME_OFFSET(src_entry) +
16726 (src_start - src_entry->vme_start));
16727
16728 new_entry = _vm_map_entry_create(map_header);
16729 vm_map_entry_copy(map, new_entry, src_entry);
16730 if (new_entry->is_sub_map) {
16731 /* clr address space specifics */
16732 new_entry->use_pmap = FALSE;
16733 } else if (copy) {
16734 /*
16735 * We're dealing with a copy-on-write operation,
16736 * so the resulting mapping should not inherit the
16737 * original mapping's accounting settings.
16738 * "use_pmap" should be reset to its default (TRUE)
16739 * so that the new mapping gets accounted for in
16740 * the task's memory footprint.
16741 */
16742 new_entry->use_pmap = TRUE;
16743 }
16744 /* "iokit_acct" was cleared in vm_map_entry_copy() */
16745 assert(!new_entry->iokit_acct);
16746
16747 new_entry->map_aligned = FALSE;
16748
16749 new_entry->vme_start = map_address;
16750 new_entry->vme_end = map_address + tmp_size;
16751 assert(new_entry->vme_start < new_entry->vme_end);
16752 if (copy && vmk_flags.vmkf_remap_prot_copy) {
16753 /*
16754 * Remapping for vm_map_protect(VM_PROT_COPY)
16755 * to convert a read-only mapping into a
16756 * copy-on-write version of itself but
16757 * with write access:
16758 * keep the original inheritance and add
16759 * VM_PROT_WRITE to the max protection.
16760 */
16761 new_entry->inheritance = src_entry->inheritance;
16762 new_entry->protection &= max_prot_for_prot_copy;
16763 new_entry->max_protection |= VM_PROT_WRITE;
16764 } else {
16765 new_entry->inheritance = inheritance;
16766 if (!vm_remap_legacy) {
16767 new_entry->protection = *cur_protection;
16768 new_entry->max_protection = *max_protection;
16769 }
16770 }
16771 VME_OFFSET_SET(new_entry, offset);
16772
16773 /*
16774 * The new region has to be copied now if required.
16775 */
16776 RestartCopy:
16777 if (!copy) {
16778 if (src_entry->used_for_jit == TRUE) {
16779 if (same_map) {
16780 } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
16781 /*
16782 * Cannot allow an entry describing a JIT
16783 * region to be shared across address spaces.
16784 */
16785 result = KERN_INVALID_ARGUMENT;
16786 vm_object_deallocate(object);
16787 vm_map_entry_dispose(new_entry);
16788 new_entry = VM_MAP_ENTRY_NULL;
16789 break;
16790 }
16791 }
16792
16793 src_entry->is_shared = TRUE;
16794 new_entry->is_shared = TRUE;
16795 if (!(new_entry->is_sub_map)) {
16796 new_entry->needs_copy = FALSE;
16797 }
16798 } else if (src_entry->is_sub_map) {
16799 /* make this a COW sub_map if not already */
16800 assert(new_entry->wired_count == 0);
16801 new_entry->needs_copy = TRUE;
16802 object = VM_OBJECT_NULL;
16803 } else if (src_entry->wired_count == 0 &&
16804 !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
16805 vm_object_copy_quickly(VME_OBJECT(new_entry),
16806 VME_OFFSET(new_entry),
16807 (new_entry->vme_end -
16808 new_entry->vme_start),
16809 &src_needs_copy,
16810 &new_entry_needs_copy)) {
16811 new_entry->needs_copy = new_entry_needs_copy;
16812 new_entry->is_shared = FALSE;
16813 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
16814
16815 /*
16816 * Handle copy_on_write semantics.
16817 */
16818 if (src_needs_copy && !src_entry->needs_copy) {
16819 vm_prot_t prot;
16820
16821 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection));
16822
16823 prot = src_entry->protection & ~VM_PROT_WRITE;
16824
16825 if (override_nx(map,
16826 VME_ALIAS(src_entry))
16827 && prot) {
16828 prot |= VM_PROT_EXECUTE;
16829 }
16830
16831 assert(!pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot));
16832
16833 vm_object_pmap_protect(object,
16834 offset,
16835 entry_size,
16836 ((src_entry->is_shared
16837 || map->mapped_in_other_pmaps) ?
16838 PMAP_NULL : map->pmap),
16839 VM_MAP_PAGE_SIZE(map),
16840 src_entry->vme_start,
16841 prot);
16842
16843 assert(src_entry->wired_count == 0);
16844 src_entry->needs_copy = TRUE;
16845 }
16846 /*
16847 * Throw away the old object reference of the new entry.
16848 */
16849 vm_object_deallocate(object);
16850 } else {
16851 new_entry->is_shared = FALSE;
16852 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
16853
16854 src_entry_was_wired = (src_entry->wired_count > 0);
16855 saved_src_entry = src_entry;
16856 src_entry = VM_MAP_ENTRY_NULL;
16857
16858 /*
16859 * The map can be safely unlocked since we
16860 * already hold a reference on the object.
16861 *
16862 * Record the timestamp of the map for later
16863 * verification, and unlock the map.
16864 */
16865 version.main_timestamp = map->timestamp;
16866 vm_map_unlock(map); /* Increments timestamp once! */
16867
16868 /*
16869 * Perform the copy.
16870 */
16871 if (src_entry_was_wired > 0 ||
16872 (debug4k_no_cow_copyin &&
16873 VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
16874 vm_object_lock(object);
16875 result = vm_object_copy_slowly(
16876 object,
16877 offset,
16878 (new_entry->vme_end -
16879 new_entry->vme_start),
16880 THREAD_UNINT,
16881 &new_copy_object);
16882 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
16883 saved_used_for_jit = new_entry->used_for_jit;
16884 VME_OBJECT_SET(new_entry, new_copy_object);
16885 new_entry->used_for_jit = saved_used_for_jit;
16886 VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
16887 new_entry->needs_copy = FALSE;
16888 } else {
16889 vm_object_offset_t new_offset;
16890
16891 new_offset = VME_OFFSET(new_entry);
16892 result = vm_object_copy_strategically(
16893 object,
16894 offset,
16895 (new_entry->vme_end -
16896 new_entry->vme_start),
16897 &new_copy_object,
16898 &new_offset,
16899 &new_entry_needs_copy);
16900 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
16901 saved_used_for_jit = new_entry->used_for_jit;
16902 VME_OBJECT_SET(new_entry, new_copy_object);
16903 new_entry->used_for_jit = saved_used_for_jit;
16904 if (new_offset != VME_OFFSET(new_entry)) {
16905 VME_OFFSET_SET(new_entry, new_offset);
16906 }
16907
16908 new_entry->needs_copy = new_entry_needs_copy;
16909 }
16910
16911 /*
16912 * Throw away the old object reference of the new entry.
16913 */
16914 vm_object_deallocate(object);
16915
16916 if (result != KERN_SUCCESS &&
16917 result != KERN_MEMORY_RESTART_COPY) {
16918 vm_map_entry_dispose(new_entry);
16919 vm_map_lock(map);
16920 break;
16921 }
16922
16923 /*
16924 * Verify that the map has not substantially
16925 * changed while the copy was being made.
16926 */
16927
16928 vm_map_lock(map);
16929 if (version.main_timestamp + 1 != map->timestamp) {
16930 /*
16931 * Simple version comparison failed.
16932 *
16933 * Retry the lookup and verify that the
16934 * same object/offset are still present.
16935 */
16936 saved_src_entry = VM_MAP_ENTRY_NULL;
16937 vm_object_deallocate(VME_OBJECT(new_entry));
16938 vm_map_entry_dispose(new_entry);
16939 if (result == KERN_MEMORY_RESTART_COPY) {
16940 result = KERN_SUCCESS;
16941 }
16942 continue;
16943 }
16944 /* map hasn't changed: src_entry is still valid */
16945 src_entry = saved_src_entry;
16946 saved_src_entry = VM_MAP_ENTRY_NULL;
16947
16948 if (result == KERN_MEMORY_RESTART_COPY) {
16949 vm_object_reference(object);
16950 goto RestartCopy;
16951 }
16952 }
16953
16954 _vm_map_store_entry_link(map_header,
16955 map_header->links.prev, new_entry);
16956
16957 /* protections for submap mapping are irrelevant here */
16958 if (vm_remap_legacy && !src_entry->is_sub_map) {
16959 *cur_protection &= src_entry->protection;
16960 *max_protection &= src_entry->max_protection;
16961 }
16962
16963 map_address += tmp_size;
16964 mapped_size += tmp_size;
16965 src_start += tmp_size;
16966
16967 if (vmk_flags.vmkf_copy_single_object) {
16968 if (mapped_size != size) {
16969 DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n", map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
16970 if (src_entry->vme_next != vm_map_to_entry(map) &&
16971 VME_OBJECT(src_entry->vme_next) == VME_OBJECT(src_entry)) {
16972 /* XXX TODO4K */
16973 DEBUG4K_ERROR("could have extended copy to next entry...\n");
16974 }
16975 }
16976 break;
16977 }
16978 } /* end while */
16979
16980 vm_map_unlock(map);
16981 if (result != KERN_SUCCESS) {
16982 /*
16983 * Free all allocated elements.
16984 */
16985 for (src_entry = map_header->links.next;
16986 src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
16987 src_entry = new_entry) {
16988 new_entry = src_entry->vme_next;
16989 _vm_map_store_entry_unlink(map_header, src_entry);
16990 if (src_entry->is_sub_map) {
16991 vm_map_deallocate(VME_SUBMAP(src_entry));
16992 } else {
16993 vm_object_deallocate(VME_OBJECT(src_entry));
16994 }
16995 vm_map_entry_dispose(src_entry);
16996 }
16997 }
16998 return result;
16999 }
17000
17001 bool
vm_map_is_exotic(vm_map_t map)17002 vm_map_is_exotic(
17003 vm_map_t map)
17004 {
17005 return VM_MAP_IS_EXOTIC(map);
17006 }
17007
17008 bool
vm_map_is_alien(vm_map_t map)17009 vm_map_is_alien(
17010 vm_map_t map)
17011 {
17012 return VM_MAP_IS_ALIEN(map);
17013 }
17014
17015 #if XNU_TARGET_OS_OSX
void
vm_map_mark_alien(
	vm_map_t map)
{
	/*
	 * Set the map's "is_alien" flag under the exclusive map lock so
	 * the update is serialized with other mutations of the map.
	 */
	vm_map_lock(map);
	map->is_alien = true;
	vm_map_unlock(map);
}
17024
void
vm_map_single_jit(
	vm_map_t map)
{
	/*
	 * Set the map's "single_jit" flag under the exclusive map lock so
	 * the update is serialized with other mutations of the map.
	 */
	vm_map_lock(map);
	map->single_jit = true;
	vm_map_unlock(map);
}
17033 #endif /* XNU_TARGET_OS_OSX */
17034
17035 /*
17036 * Callers of this function must call vm_map_copy_require on
17037 * previously created vm_map_copy_t or pass a newly created
17038 * one to ensure that it hasn't been forged.
17039 */
/*
 * Replace the (possibly mis-aligned) mappings of "copy_map" with a single
 * entry backed by a freshly allocated VM object holding a physical copy of
 * the original contents.  On return, "copy_map" has "target_map"'s page
 * size and exactly one entry.
 *
 * To perform the copy, both the source mappings and the new object are
 * temporarily mapped into a throwaway pageable map ("new_map") built on a
 * private pmap, and the data is moved page by page through a kernel buffer
 * with copyinmap()/copyoutmap().
 */
static kern_return_t
vm_map_copy_to_physcopy(
	vm_map_copy_t copy_map,
	vm_map_t target_map)
{
	vm_map_size_t size;
	vm_map_entry_t entry;
	vm_map_entry_t new_entry;
	vm_object_t new_object;
	unsigned int pmap_flags;
	pmap_t new_pmap;
	vm_map_t new_map;
	vm_map_address_t src_start, src_end, src_cur;
	vm_map_address_t dst_start, dst_end, dst_cur;
	kern_return_t kr;
	void *kbuf;

	/*
	 * Perform the equivalent of vm_allocate() and memcpy().
	 * Replace the mappings in "copy_map" with the newly allocated mapping.
	 */
	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);

	assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));

	/* create a new pmap to map "copy_map" */
	pmap_flags = 0;
	/* this function only deals with 4K-page copy maps */
	assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
#if PMAP_CREATE_FORCE_4K_PAGES
	pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
#endif /* PMAP_CREATE_FORCE_4K_PAGES */
	pmap_flags |= PMAP_CREATE_64BIT;
	new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
	if (new_pmap == NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}

	/* allocate new VM object, rounded up to a whole number of kernel pages */
	size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
	new_object = vm_object_allocate(size);
	assert(new_object);

	/* allocate new VM map entry */
	new_entry = vm_map_copy_entry_create(copy_map);
	assert(new_entry);

	/* finish initializing new VM map entry */
	new_entry->protection = VM_PROT_DEFAULT;
	new_entry->max_protection = VM_PROT_DEFAULT;
	new_entry->use_pmap = TRUE;

	/* make new VM map entry point to new VM object */
	new_entry->vme_start = 0;
	new_entry->vme_end = size;
	VME_OBJECT_SET(new_entry, new_object);
	VME_OFFSET_SET(new_entry, 0);

	/* create a new pageable VM map to map "copy_map" */
	new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
	    VM_MAP_CREATE_PAGEABLE);
	assert(new_map);
	vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);

	/*
	 * Map "copy_map" in the new VM map, without consuming it, so that
	 * its entries remain intact while we copy their contents.
	 */
	src_start = 0;
	kr = vm_map_copyout_internal(
		new_map,
		&src_start,
		copy_map,
		copy_map->size,
		FALSE, /* consume_on_success */
		VM_PROT_DEFAULT,
		VM_PROT_DEFAULT,
		VM_INHERIT_DEFAULT);
	assert(kr == KERN_SUCCESS);
	src_end = src_start + copy_map->size;

	/* map "new_object" in the new VM map */
	vm_object_reference(new_object);
	dst_start = 0;
	kr = vm_map_enter(new_map,
	    &dst_start,
	    size,
	    0, /* mask */
	    VM_FLAGS_ANYWHERE,
	    VM_MAP_KERNEL_FLAGS_NONE,
	    VM_KERN_MEMORY_OSFMK,
	    new_object,
	    0, /* offset */
	    FALSE, /* needs copy */
	    VM_PROT_DEFAULT,
	    VM_PROT_DEFAULT,
	    VM_INHERIT_DEFAULT);
	assert(kr == KERN_SUCCESS);
	dst_end = dst_start + size;

	/* get a kernel buffer */
	kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);

	/* physically copy "copy_map" mappings to new VM object */
	for (src_cur = src_start, dst_cur = dst_start;
	    src_cur < src_end;
	    src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
		vm_size_t bytes;

		bytes = PAGE_SIZE;
		if (src_cur + PAGE_SIZE > src_end) {
			/* partial copy for last page */
			bytes = src_end - src_cur;
			assert(bytes > 0 && bytes < PAGE_SIZE);
			/* rest of dst page should be zero-filled */
		}
		/* get bytes from src mapping */
		kr = copyinmap(new_map, src_cur, kbuf, bytes);
		if (kr != KERN_SUCCESS) {
			/* best-effort: log the failure but keep copying */
			DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
		}
		/* put bytes in dst mapping */
		assert(dst_cur < dst_end);
		assert(dst_cur + bytes <= dst_end);
		kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
		if (kr != KERN_SUCCESS) {
			/* best-effort: log the failure but keep copying */
			DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
		}
	}

	/* free kernel buffer */
	kfree_data(kbuf, PAGE_SIZE);

	/* destroy new map: also drops the temporary mappings' references */
	vm_map_destroy(new_map);
	new_map = VM_MAP_NULL;

	/* dispose of the old map entries in "copy_map" */
	while (vm_map_copy_first_entry(copy_map) !=
	    vm_map_copy_to_entry(copy_map)) {
		entry = vm_map_copy_first_entry(copy_map);
		vm_map_copy_entry_unlink(copy_map, entry);
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
	}

	/* change "copy_map"'s page_size to match "target_map" */
	copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	copy_map->offset = 0;
	copy_map->size = size;

	/* insert new map entry in "copy_map" */
	assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
	vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);

	DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
	return KERN_SUCCESS;
}
17198
17199 void
17200 vm_map_copy_adjust_get_target_copy_map(
17201 vm_map_copy_t copy_map,
17202 vm_map_copy_t *target_copy_map_p);
17203 void
vm_map_copy_adjust_get_target_copy_map(vm_map_copy_t copy_map,vm_map_copy_t * target_copy_map_p)17204 vm_map_copy_adjust_get_target_copy_map(
17205 vm_map_copy_t copy_map,
17206 vm_map_copy_t *target_copy_map_p)
17207 {
17208 vm_map_copy_t target_copy_map;
17209 vm_map_entry_t entry, target_entry;
17210
17211 if (*target_copy_map_p != VM_MAP_COPY_NULL) {
17212 /* the caller already has a "target_copy_map": use it */
17213 return;
17214 }
17215
17216 /* the caller wants us to create a new copy of "copy_map" */
17217 target_copy_map = vm_map_copy_allocate();
17218 target_copy_map->type = copy_map->type;
17219 assert(target_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
17220 target_copy_map->offset = copy_map->offset;
17221 target_copy_map->size = copy_map->size;
17222 target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
17223 vm_map_store_init(&target_copy_map->cpy_hdr);
17224 for (entry = vm_map_copy_first_entry(copy_map);
17225 entry != vm_map_copy_to_entry(copy_map);
17226 entry = entry->vme_next) {
17227 target_entry = vm_map_copy_entry_create(target_copy_map);
17228 vm_map_entry_copy_full(target_entry, entry);
17229 if (target_entry->is_sub_map) {
17230 vm_map_reference(VME_SUBMAP(target_entry));
17231 } else {
17232 vm_object_reference(VME_OBJECT(target_entry));
17233 }
17234 vm_map_copy_entry_link(
17235 target_copy_map,
17236 vm_map_copy_last_entry(target_copy_map),
17237 target_entry);
17238 }
17239 entry = VM_MAP_ENTRY_NULL;
17240 *target_copy_map_p = target_copy_map;
17241 }
17242
17243 /*
17244 * Callers of this function must call vm_map_copy_require on
17245 * previously created vm_map_copy_t or pass a newly created
17246 * one to ensure that it hasn't been forged.
17247 */
/*
 * Remove the range ["trim_start", "trim_end") from "copy_map", clipping
 * entries at the range boundaries as needed.  The offsets are relative to
 * the start of the copy map's first entry.  Clipping is done at
 * "new_page_shift" granularity; the copy map's original page shift is
 * restored before returning.
 */
static void
vm_map_copy_trim(
	vm_map_copy_t copy_map,
	uint16_t new_page_shift,
	vm_map_offset_t trim_start,
	vm_map_offset_t trim_end)
{
	uint16_t copy_page_shift;
	vm_map_entry_t entry, next_entry;

	assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
	assert(copy_map->cpy_hdr.nentries > 0);

	/* convert relative offsets to the entries' address space */
	trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
	trim_end += vm_map_copy_first_entry(copy_map)->vme_start;

	/* use the new page_shift to do the clipping */
	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_map->cpy_hdr.page_shift = new_page_shift;

	/* capture vme_next before disposing of "entry" each iteration */
	for (entry = vm_map_copy_first_entry(copy_map);
	    entry != vm_map_copy_to_entry(copy_map);
	    entry = next_entry) {
		next_entry = entry->vme_next;
		if (entry->vme_end <= trim_start) {
			/* entry fully before trim range: skip */
			continue;
		}
		if (entry->vme_start >= trim_end) {
			/* entry fully after trim range: done */
			break;
		}
		/* clip entry if needed */
		vm_map_copy_clip_start(copy_map, entry, trim_start);
		vm_map_copy_clip_end(copy_map, entry, trim_end);
		/* dispose of entry and drop its sub-map/object reference */
		copy_map->size -= entry->vme_end - entry->vme_start;
		vm_map_copy_entry_unlink(copy_map, entry);
		if (entry->is_sub_map) {
			vm_map_deallocate(VME_SUBMAP(entry));
		} else {
			vm_object_deallocate(VME_OBJECT(entry));
		}
		vm_map_copy_entry_dispose(entry);
		entry = VM_MAP_ENTRY_NULL;
	}

	/* restore copy_map's original page_shift */
	copy_map->cpy_hdr.page_shift = copy_page_shift;
}
17298
17299 /*
17300 * Make any necessary adjustments to "copy_map" to allow it to be
17301 * mapped into "target_map".
17302 * If no changes were necessary, "target_copy_map" points to the
17303 * untouched "copy_map".
17304 * If changes are necessary, changes will be made to "target_copy_map".
17305 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
17306 * copy the original "copy_map" to it before applying the changes.
17307 * The caller should discard "target_copy_map" if it's not the same as
17308 * the original "copy_map".
17309 */
17310 /* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
kern_return_t
vm_map_copy_adjust_to_target(
	vm_map_copy_t src_copy_map,
	vm_map_offset_t offset,
	vm_map_size_t size,
	vm_map_t target_map,
	boolean_t copy,
	vm_map_copy_t *target_copy_map_p,
	vm_map_offset_t *overmap_start_p,
	vm_map_offset_t *overmap_end_p,
	vm_map_offset_t *trimmed_start_p)
{
	vm_map_copy_t copy_map, target_copy_map;
	vm_map_size_t target_size;
	vm_map_size_t src_copy_map_size;
	/* how much we over-map at each end to re-align to target pages */
	vm_map_size_t overmap_start, overmap_end;
	int misalignments;
	vm_map_entry_t entry, target_entry;
	vm_map_offset_t addr_adjustment;
	vm_map_offset_t new_start, new_end;
	int copy_page_mask, target_page_mask;
	uint16_t copy_page_shift, target_page_shift;
	vm_map_offset_t trimmed_end;

	/*
	 * Assert that the vm_map_copy is coming from the right
	 * zone and hasn't been forged
	 */
	vm_map_copy_require(src_copy_map);
	assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);

	/*
	 * Start working with "src_copy_map" but we'll switch
	 * to "target_copy_map" as soon as we start making adjustments.
	 */
	copy_map = src_copy_map;
	src_copy_map_size = src_copy_map->size;

	copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
	copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
	target_page_mask = VM_MAP_PAGE_MASK(target_map);

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);

	target_copy_map = *target_copy_map_p;
	if (target_copy_map != VM_MAP_COPY_NULL) {
		vm_map_copy_require(target_copy_map);
	}

	/* the requested sub-range must fit within the copy map */
	if (offset + size > copy_map->size) {
		DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Trim the end first so that the offsets used for trimming the
	 * start below remain valid.  Both trims round to the target map's
	 * page size.
	 */
	trimmed_end = 0;
	new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
	if (new_end < copy_map->size) {
		trimmed_end = src_copy_map_size - new_end;
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    new_end, copy_map->size);
	}

	/* trim the start */
	new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
	if (new_start != 0) {
		DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
		/* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    0, new_start);
	}
	*trimmed_start_p = new_start;

	/* target_size starts with what's left after trimming */
	target_size = copy_map->size;
	assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
	    "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
	    (uint64_t)target_size, (uint64_t)src_copy_map_size,
	    (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);

	/* check for misalignments but don't adjust yet */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	if (copy_page_shift < target_page_shift) {
		/*
		 * Remapping from 4K to 16K: check the VM object alignments
		 * throughout the range.
		 * If the start and end of the range are mis-aligned, we can
		 * over-map to re-align, and adjust the "overmap" start/end
		 * and "target_size" of the range accordingly.
		 * If there is any mis-alignment within the range:
		 *     if "copy":
		 *         we can do immediate-copy instead of copy-on-write,
		 *     else:
		 *         no way to remap and share; fail.
		 */
		for (entry = vm_map_copy_first_entry(copy_map);
		    entry != vm_map_copy_to_entry(copy_map);
		    entry = entry->vme_next) {
			vm_object_offset_t object_offset_start, object_offset_end;

			object_offset_start = VME_OFFSET(entry);
			object_offset_end = object_offset_start;
			object_offset_end += entry->vme_end - entry->vme_start;
			if (object_offset_start & target_page_mask) {
				/* only the first entry's start may be fixed by over-mapping */
				if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
					overmap_start++;
				} else {
					misalignments++;
				}
			}
			if (object_offset_end & target_page_mask) {
				/* only the last entry's end may be fixed by over-mapping */
				if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
					overmap_end++;
				} else {
					misalignments++;
				}
			}
		}
	}
	entry = VM_MAP_ENTRY_NULL;

	/* decide how to deal with misalignments */
	assert(overmap_start <= 1);
	assert(overmap_end <= 1);
	if (!overmap_start && !overmap_end && !misalignments) {
		/* copy_map is properly aligned for target_map ... */
		if (*trimmed_start_p) {
			/* ... but we trimmed it, so still need to adjust */
		} else {
			/* ... and we didn't trim anything: we're done */
			if (target_copy_map == VM_MAP_COPY_NULL) {
				target_copy_map = copy_map;
			}
			*target_copy_map_p = target_copy_map;
			*overmap_start_p = 0;
			*overmap_end_p = 0;
			DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
			return KERN_SUCCESS;
		}
	} else if (misalignments && !copy) {
		/* can't "share" if misaligned */
		DEBUG4K_ADJUST("unsupported sharing\n");
#if MACH_ASSERT
		if (debug4k_panic_on_misaligned_sharing) {
			panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
		}
#endif /* MACH_ASSERT */
		DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
		return KERN_NOT_SUPPORTED;
	} else {
		/* can't virtual-copy if misaligned (but can physical-copy) */
		DEBUG4K_ADJUST("mis-aligned copying\n");
	}

	/* get a "target_copy_map" if needed and switch to it */
	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
	copy_map = target_copy_map;

	if (misalignments && copy) {
		vm_map_size_t target_copy_map_size;

		/*
		 * Can't do copy-on-write with misaligned mappings.
		 * Replace the mappings with a physical copy of the original
		 * mappings' contents.
		 */
		target_copy_map_size = target_copy_map->size;
		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
		if (kr != KERN_SUCCESS) {
			return kr;
		}
		*target_copy_map_p = target_copy_map;
		*overmap_start_p = 0;
		/* physcopy rounded the size up: report the growth as overmap_end */
		*overmap_end_p = target_copy_map->size - target_copy_map_size;
		DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
		return KERN_SUCCESS;
	}

	/* apply the adjustments */
	misalignments = 0;
	overmap_start = 0;
	overmap_end = 0;
	/* remove copy_map->offset, so that everything starts at offset 0 */
	addr_adjustment = copy_map->offset;
	/* also remove whatever we trimmed from the start */
	addr_adjustment += *trimmed_start_p;
	for (target_entry = vm_map_copy_first_entry(target_copy_map);
	    target_entry != vm_map_copy_to_entry(target_copy_map);
	    target_entry = target_entry->vme_next) {
		vm_object_offset_t object_offset_start, object_offset_end;

		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
		object_offset_start = VME_OFFSET(target_entry);
		if (object_offset_start & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
				/*
				 * start of 1st entry is mis-aligned:
				 * re-adjust by over-mapping.
				 */
				overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				/* interior misalignment only tolerated when copying */
				assert(copy);
			}
		}

		if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
			target_size += overmap_start;
		} else {
			/* shift later entries by the over-mapping at the start */
			target_entry->vme_start += overmap_start;
		}
		target_entry->vme_end += overmap_start;

		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
		if (object_offset_end & target_page_mask) {
			DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
			if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
				/*
				 * end of last entry is mis-aligned: re-adjust by over-mapping.
				 */
				overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
				target_entry->vme_end += overmap_end;
				target_size += overmap_end;
			} else {
				misalignments++;
				DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
				/* interior misalignment only tolerated when copying */
				assert(copy);
			}
		}
		/* rebase the entry so the copy map starts at address 0 */
		target_entry->vme_start -= addr_adjustment;
		target_entry->vme_end -= addr_adjustment;
		DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
	}

	target_copy_map->size = target_size;
	target_copy_map->offset += overmap_start;
	target_copy_map->offset -= addr_adjustment;
	target_copy_map->cpy_hdr.page_shift = target_page_shift;

//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
//	assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
	assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
	assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));

	*target_copy_map_p = target_copy_map;
	*overmap_start_p = overmap_start;
	*overmap_end_p = overmap_end;

	DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
	return KERN_SUCCESS;
}
17578
17579 kern_return_t
vm_map_range_physical_size(vm_map_t map,vm_map_address_t start,mach_vm_size_t size,mach_vm_size_t * phys_size)17580 vm_map_range_physical_size(
17581 vm_map_t map,
17582 vm_map_address_t start,
17583 mach_vm_size_t size,
17584 mach_vm_size_t * phys_size)
17585 {
17586 kern_return_t kr;
17587 vm_map_copy_t copy_map, target_copy_map;
17588 vm_map_offset_t adjusted_start, adjusted_end;
17589 vm_map_size_t adjusted_size;
17590 vm_prot_t cur_prot, max_prot;
17591 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
17592 vm_map_kernel_flags_t vmk_flags;
17593
17594 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
17595 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
17596 adjusted_size = adjusted_end - adjusted_start;
17597 *phys_size = adjusted_size;
17598 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
17599 return KERN_SUCCESS;
17600 }
17601 if (start == 0) {
17602 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
17603 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
17604 adjusted_size = adjusted_end - adjusted_start;
17605 *phys_size = adjusted_size;
17606 return KERN_SUCCESS;
17607 }
17608 if (adjusted_size == 0) {
17609 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx adjusted 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_size);
17610 *phys_size = 0;
17611 return KERN_SUCCESS;
17612 }
17613
17614 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
17615 vmk_flags.vmkf_copy_pageable = TRUE;
17616 vmk_flags.vmkf_copy_same_map = TRUE;
17617 assert(adjusted_size != 0);
17618 cur_prot = VM_PROT_NONE; /* legacy mode */
17619 max_prot = VM_PROT_NONE; /* legacy mode */
17620 kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
17621 FALSE /* copy */,
17622 ©_map,
17623 &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
17624 vmk_flags);
17625 if (kr != KERN_SUCCESS) {
17626 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
17627 //assert(0);
17628 *phys_size = 0;
17629 return kr;
17630 }
17631 assert(copy_map != VM_MAP_COPY_NULL);
17632 target_copy_map = copy_map;
17633 DEBUG4K_ADJUST("adjusting...\n");
17634 kr = vm_map_copy_adjust_to_target(
17635 copy_map,
17636 start - adjusted_start, /* offset */
17637 size, /* size */
17638 kernel_map,
17639 FALSE, /* copy */
17640 &target_copy_map,
17641 &overmap_start,
17642 &overmap_end,
17643 &trimmed_start);
17644 if (kr == KERN_SUCCESS) {
17645 if (target_copy_map->size != *phys_size) {
17646 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
17647 }
17648 *phys_size = target_copy_map->size;
17649 } else {
17650 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
17651 //assert(0);
17652 *phys_size = 0;
17653 }
17654 vm_map_copy_discard(copy_map);
17655 copy_map = VM_MAP_COPY_NULL;
17656
17657 return kr;
17658 }
17659
17660
17661 kern_return_t
memory_entry_check_for_adjustment(vm_map_t src_map,ipc_port_t port,vm_map_offset_t * overmap_start,vm_map_offset_t * overmap_end)17662 memory_entry_check_for_adjustment(
17663 vm_map_t src_map,
17664 ipc_port_t port,
17665 vm_map_offset_t *overmap_start,
17666 vm_map_offset_t *overmap_end)
17667 {
17668 kern_return_t kr = KERN_SUCCESS;
17669 vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;
17670
17671 assert(port);
17672 assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));
17673
17674 vm_named_entry_t named_entry;
17675
17676 named_entry = mach_memory_entry_from_port(port);
17677 named_entry_lock(named_entry);
17678 copy_map = named_entry->backing.copy;
17679 target_copy_map = copy_map;
17680
17681 if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
17682 vm_map_offset_t trimmed_start;
17683
17684 trimmed_start = 0;
17685 DEBUG4K_ADJUST("adjusting...\n");
17686 kr = vm_map_copy_adjust_to_target(
17687 copy_map,
17688 0, /* offset */
17689 copy_map->size, /* size */
17690 src_map,
17691 FALSE, /* copy */
17692 &target_copy_map,
17693 overmap_start,
17694 overmap_end,
17695 &trimmed_start);
17696 assert(trimmed_start == 0);
17697 }
17698 named_entry_unlock(named_entry);
17699
17700 return kr;
17701 }
17702
17703
17704 /*
17705 * Routine: vm_remap
17706 *
17707 * Map portion of a task's address space.
17708 * Mapped region must not overlap more than
17709 * one vm memory object. Protections and
17710 * inheritance attributes remain the same
17711 * as in the original task and are out parameters.
17712 * Source and Target task can be identical
17713 * Other attributes are identical as for vm_map()
17714 */
17715 kern_return_t
vm_map_remap(vm_map_t target_map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,vm_tag_t tag,vm_map_t src_map,vm_map_offset_t memory_address,boolean_t copy,vm_prot_t * cur_protection,vm_prot_t * max_protection,vm_inherit_t inheritance)17716 vm_map_remap(
17717 vm_map_t target_map,
17718 vm_map_address_t *address,
17719 vm_map_size_t size,
17720 vm_map_offset_t mask,
17721 int flags,
17722 vm_map_kernel_flags_t vmk_flags,
17723 vm_tag_t tag,
17724 vm_map_t src_map,
17725 vm_map_offset_t memory_address,
17726 boolean_t copy,
17727 vm_prot_t *cur_protection, /* IN/OUT */
17728 vm_prot_t *max_protection, /* IN/OUT */
17729 vm_inherit_t inheritance)
17730 {
17731 kern_return_t result;
17732 vm_map_entry_t entry;
17733 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
17734 vm_map_entry_t new_entry;
17735 vm_map_copy_t copy_map;
17736 vm_map_offset_t offset_in_mapping;
17737 vm_map_size_t target_size = 0;
17738 vm_map_size_t src_page_mask, target_page_mask;
17739 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
17740 vm_map_offset_t initial_memory_address;
17741 vm_map_size_t initial_size;
17742 VM_MAP_ZAP_DECLARE(zap_list);
17743
17744 if (target_map == VM_MAP_NULL) {
17745 return KERN_INVALID_ARGUMENT;
17746 }
17747
17748 initial_memory_address = memory_address;
17749 initial_size = size;
17750 src_page_mask = VM_MAP_PAGE_MASK(src_map);
17751 target_page_mask = VM_MAP_PAGE_MASK(target_map);
17752
17753 switch (inheritance) {
17754 case VM_INHERIT_NONE:
17755 case VM_INHERIT_COPY:
17756 case VM_INHERIT_SHARE:
17757 if (size != 0 && src_map != VM_MAP_NULL) {
17758 break;
17759 }
17760 OS_FALLTHROUGH;
17761 default:
17762 return KERN_INVALID_ARGUMENT;
17763 }
17764
17765 if (src_page_mask != target_page_mask) {
17766 if (copy) {
17767 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
17768 } else {
17769 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
17770 }
17771 }
17772
17773 /*
17774 * If the user is requesting that we return the address of the
17775 * first byte of the data (rather than the base of the page),
17776 * then we use different rounding semantics: specifically,
17777 * we assume that (memory_address, size) describes a region
17778 * all of whose pages we must cover, rather than a base to be truncated
17779 * down and a size to be added to that base. So we figure out
17780 * the highest page that the requested region includes and make
17781 * sure that the size will cover it.
17782 *
17783 * The key example we're worried about it is of the form:
17784 *
17785 * memory_address = 0x1ff0, size = 0x20
17786 *
17787 * With the old semantics, we round down the memory_address to 0x1000
17788 * and round up the size to 0x1000, resulting in our covering *only*
17789 * page 0x1000. With the new semantics, we'd realize that the region covers
17790 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
17791 * 0x1000 and page 0x2000 in the region we remap.
17792 */
17793 if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) {
17794 vm_map_offset_t range_start, range_end;
17795
17796 range_start = vm_map_trunc_page(memory_address, src_page_mask);
17797 range_end = vm_map_round_page(memory_address + size, src_page_mask);
17798 memory_address = range_start;
17799 size = range_end - range_start;
17800 offset_in_mapping = initial_memory_address - memory_address;
17801 } else {
17802 /*
17803 * IMPORTANT:
17804 * This legacy code path is broken: for the range mentioned
17805 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
17806 * two 4k pages, it yields [ memory_address = 0x1000,
17807 * size = 0x1000 ], which covers only the first 4k page.
17808 * BUT some code unfortunately depends on this bug, so we
17809 * can't fix it without breaking something.
17810 * New code should get automatically opted in the new
17811 * behavior with the new VM_FLAGS_RETURN_DATA_ADDR flags.
17812 */
17813 offset_in_mapping = 0;
17814 memory_address = vm_map_trunc_page(memory_address, src_page_mask);
17815 size = vm_map_round_page(size, src_page_mask);
17816 initial_memory_address = memory_address;
17817 initial_size = size;
17818 }
17819
17820
17821 if (size == 0) {
17822 return KERN_INVALID_ARGUMENT;
17823 }
17824
17825 if (flags & VM_FLAGS_RESILIENT_MEDIA) {
17826 /* must be copy-on-write to be "media resilient" */
17827 if (!copy) {
17828 return KERN_INVALID_ARGUMENT;
17829 }
17830 }
17831
17832 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
17833 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
17834
17835 assert(size != 0);
17836 result = vm_map_copy_extract(src_map,
17837 memory_address,
17838 size,
17839 copy, ©_map,
17840 cur_protection, /* IN/OUT */
17841 max_protection, /* IN/OUT */
17842 inheritance,
17843 vmk_flags);
17844 if (result != KERN_SUCCESS) {
17845 return result;
17846 }
17847 assert(copy_map != VM_MAP_COPY_NULL);
17848
17849 overmap_start = 0;
17850 overmap_end = 0;
17851 trimmed_start = 0;
17852 target_size = size;
17853 if (src_page_mask != target_page_mask) {
17854 vm_map_copy_t target_copy_map;
17855
17856 target_copy_map = copy_map; /* can modify "copy_map" itself */
17857 DEBUG4K_ADJUST("adjusting...\n");
17858 result = vm_map_copy_adjust_to_target(
17859 copy_map,
17860 offset_in_mapping, /* offset */
17861 initial_size,
17862 target_map,
17863 copy,
17864 &target_copy_map,
17865 &overmap_start,
17866 &overmap_end,
17867 &trimmed_start);
17868 if (result != KERN_SUCCESS) {
17869 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
17870 vm_map_copy_discard(copy_map);
17871 return result;
17872 }
17873 if (trimmed_start == 0) {
17874 /* nothing trimmed: no adjustment needed */
17875 } else if (trimmed_start >= offset_in_mapping) {
17876 /* trimmed more than offset_in_mapping: nothing left */
17877 assert(overmap_start == 0);
17878 assert(overmap_end == 0);
17879 offset_in_mapping = 0;
17880 } else {
17881 /* trimmed some of offset_in_mapping: adjust */
17882 assert(overmap_start == 0);
17883 assert(overmap_end == 0);
17884 offset_in_mapping -= trimmed_start;
17885 }
17886 offset_in_mapping += overmap_start;
17887 target_size = target_copy_map->size;
17888 }
17889
17890 /*
17891 * Allocate/check a range of free virtual address
17892 * space for the target
17893 */
17894 *address = vm_map_trunc_page(*address, target_page_mask);
17895 vm_map_lock(target_map);
17896 target_size = vm_map_round_page(target_size, target_page_mask);
17897 result = vm_map_remap_range_allocate(target_map, address,
17898 target_size, mask, flags, vmk_flags, tag,
17899 &insp_entry, &zap_list);
17900
17901 for (entry = vm_map_copy_first_entry(copy_map);
17902 entry != vm_map_copy_to_entry(copy_map);
17903 entry = new_entry) {
17904 new_entry = entry->vme_next;
17905 vm_map_copy_entry_unlink(copy_map, entry);
17906 if (result == KERN_SUCCESS) {
17907 if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
17908 /* no codesigning -> read-only access */
17909 entry->max_protection = VM_PROT_READ;
17910 entry->protection = VM_PROT_READ;
17911 entry->vme_resilient_codesign = TRUE;
17912 }
17913 entry->vme_start += *address;
17914 entry->vme_end += *address;
17915 assert(!entry->map_aligned);
17916 if ((flags & VM_FLAGS_RESILIENT_MEDIA) &&
17917 !entry->is_sub_map &&
17918 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
17919 VME_OBJECT(entry)->internal)) {
17920 entry->vme_resilient_media = TRUE;
17921 }
17922 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
17923 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
17924 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
17925 vm_map_store_entry_link(target_map, insp_entry, entry,
17926 vmk_flags);
17927 insp_entry = entry;
17928 } else {
17929 if (!entry->is_sub_map) {
17930 vm_object_deallocate(VME_OBJECT(entry));
17931 } else {
17932 vm_map_deallocate(VME_SUBMAP(entry));
17933 }
17934 vm_map_copy_entry_dispose(entry);
17935 }
17936 }
17937
17938 if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
17939 *cur_protection = VM_PROT_READ;
17940 *max_protection = VM_PROT_READ;
17941 }
17942
17943 if (result == KERN_SUCCESS) {
17944 target_map->size += target_size;
17945 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
17946
17947 }
17948 vm_map_unlock(target_map);
17949
17950 vm_map_zap_dispose(&zap_list);
17951
17952 if (result == KERN_SUCCESS && target_map->wiring_required) {
17953 result = vm_map_wire_kernel(target_map, *address,
17954 *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
17955 TRUE);
17956 }
17957
17958 /*
17959 * If requested, return the address of the data pointed to by the
17960 * request, rather than the base of the resulting page.
17961 */
17962 if ((flags & VM_FLAGS_RETURN_DATA_ADDR) != 0) {
17963 *address += offset_in_mapping;
17964 }
17965
17966 if (src_page_mask != target_page_mask) {
17967 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
17968 }
17969 vm_map_copy_discard(copy_map);
17970 copy_map = VM_MAP_COPY_NULL;
17971
17972 return result;
17973 }
17974
17975 /*
17976 * Routine: vm_map_remap_range_allocate
17977 *
17978 * Description:
17979 * Allocate a range in the specified virtual address map.
17980 * returns the address and the map entry just before the allocated
17981 * range
17982 *
17983 * Map must be locked.
17984 */
17985
17986 static kern_return_t
vm_map_remap_range_allocate(vm_map_t map,vm_map_address_t * address,vm_map_size_t size,vm_map_offset_t mask,int flags,vm_map_kernel_flags_t vmk_flags,__unused vm_tag_t tag,vm_map_entry_t * map_entry,vm_map_zap_t zap_list)17987 vm_map_remap_range_allocate(
17988 vm_map_t map,
17989 vm_map_address_t *address, /* IN/OUT */
17990 vm_map_size_t size,
17991 vm_map_offset_t mask,
17992 int flags,
17993 vm_map_kernel_flags_t vmk_flags,
17994 __unused vm_tag_t tag,
17995 vm_map_entry_t *map_entry, /* OUT */
17996 vm_map_zap_t zap_list)
17997 {
17998 vm_map_entry_t entry;
17999 vm_map_offset_t start;
18000 kern_return_t kr;
18001
18002 start = *address;
18003
18004 if (flags & VM_FLAGS_ANYWHERE) {
18005 if (flags & VM_FLAGS_RANDOM_ADDR) {
18006 vmk_flags.vmkf_random_address = true;
18007 }
18008 if (start) {
18009 vmk_flags.vmkf_range_id = kmem_addr_get_range(start, size);
18010 }
18011
18012 kr = vm_map_locate_space(map, size, mask, vmk_flags,
18013 &start, &entry);
18014 if (kr != KERN_SUCCESS) {
18015 return kr;
18016 }
18017 *address = start;
18018 } else {
18019 vm_map_entry_t temp_entry;
18020 vm_map_offset_t end;
18021
18022 /*
18023 * Verify that:
18024 * the address doesn't itself violate
18025 * the mask requirement.
18026 */
18027
18028 if ((start & mask) != 0) {
18029 return KERN_NO_SPACE;
18030 }
18031
18032
18033 /*
18034 * ... the address is within bounds
18035 */
18036
18037 end = start + size;
18038
18039 if ((start < map->min_offset) ||
18040 (end > map->max_offset) ||
18041 (start >= end)) {
18042 return KERN_INVALID_ADDRESS;
18043 }
18044
18045 /*
18046 * If we're asked to overwrite whatever was mapped in that
18047 * range, first deallocate that range.
18048 */
18049 if (flags & VM_FLAGS_OVERWRITE) {
18050 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
18051
18052 /*
18053 * We use a "zap_list" to avoid having to unlock
18054 * the "map" in vm_map_delete(), which would compromise
18055 * the atomicity of the "deallocate" and then "remap"
18056 * combination.
18057 */
18058 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
18059
18060 if (vmk_flags.vmkf_overwrite_immutable) {
18061 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
18062 }
18063 (void)vm_map_delete(map, start, end,
18064 remove_flags, zap_list);
18065 }
18066
18067 /*
18068 * ... the starting address isn't allocated
18069 */
18070
18071 if (vm_map_lookup_entry(map, start, &temp_entry)) {
18072 return KERN_NO_SPACE;
18073 }
18074
18075 entry = temp_entry;
18076
18077 /*
18078 * ... the next region doesn't overlap the
18079 * end point.
18080 */
18081
18082 if ((entry->vme_next != vm_map_to_entry(map)) &&
18083 (entry->vme_next->vme_start < end)) {
18084 return KERN_NO_SPACE;
18085 }
18086 }
18087 *map_entry = entry;
18088 return KERN_SUCCESS;
18089 }
18090
18091 /*
18092 * vm_map_switch:
18093 *
18094 * Set the address map for the current thread to the specified map
18095 */
18096
18097 vm_map_t
vm_map_switch(vm_map_t map)18098 vm_map_switch(
18099 vm_map_t map)
18100 {
18101 int mycpu;
18102 thread_t thread = current_thread();
18103 vm_map_t oldmap = thread->map;
18104
18105 mp_disable_preemption();
18106 mycpu = cpu_number();
18107
18108 /*
18109 * Deactivate the current map and activate the requested map
18110 */
18111 PMAP_SWITCH_USER(thread, map, mycpu);
18112
18113 mp_enable_preemption();
18114 return oldmap;
18115 }
18116
18117
18118 /*
18119 * Routine: vm_map_write_user
18120 *
18121 * Description:
18122 * Copy out data from a kernel space into space in the
18123 * destination map. The space must already exist in the
18124 * destination map.
18125 * NOTE: This routine should only be called by threads
18126 * which can block on a page fault. i.e. kernel mode user
18127 * threads.
18128 *
18129 */
18130 kern_return_t
vm_map_write_user(vm_map_t map,void * src_p,vm_map_address_t dst_addr,vm_size_t size)18131 vm_map_write_user(
18132 vm_map_t map,
18133 void *src_p,
18134 vm_map_address_t dst_addr,
18135 vm_size_t size)
18136 {
18137 kern_return_t kr = KERN_SUCCESS;
18138
18139 if (current_map() == map) {
18140 if (copyout(src_p, dst_addr, size)) {
18141 kr = KERN_INVALID_ADDRESS;
18142 }
18143 } else {
18144 vm_map_t oldmap;
18145
18146 /* take on the identity of the target map while doing */
18147 /* the transfer */
18148
18149 vm_map_reference(map);
18150 oldmap = vm_map_switch(map);
18151 if (copyout(src_p, dst_addr, size)) {
18152 kr = KERN_INVALID_ADDRESS;
18153 }
18154 vm_map_switch(oldmap);
18155 vm_map_deallocate(map);
18156 }
18157 return kr;
18158 }
18159
18160 /*
18161 * Routine: vm_map_read_user
18162 *
18163 * Description:
18164 * Copy in data from a user space source map into the
18165 * kernel map. The space must already exist in the
18166 * kernel map.
18167 * NOTE: This routine should only be called by threads
18168 * which can block on a page fault. i.e. kernel mode user
18169 * threads.
18170 *
18171 */
18172 kern_return_t
vm_map_read_user(vm_map_t map,vm_map_address_t src_addr,void * dst_p,vm_size_t size)18173 vm_map_read_user(
18174 vm_map_t map,
18175 vm_map_address_t src_addr,
18176 void *dst_p,
18177 vm_size_t size)
18178 {
18179 kern_return_t kr = KERN_SUCCESS;
18180
18181 if (current_map() == map) {
18182 if (copyin(src_addr, dst_p, size)) {
18183 kr = KERN_INVALID_ADDRESS;
18184 }
18185 } else {
18186 vm_map_t oldmap;
18187
18188 /* take on the identity of the target map while doing */
18189 /* the transfer */
18190
18191 vm_map_reference(map);
18192 oldmap = vm_map_switch(map);
18193 if (copyin(src_addr, dst_p, size)) {
18194 kr = KERN_INVALID_ADDRESS;
18195 }
18196 vm_map_switch(oldmap);
18197 vm_map_deallocate(map);
18198 }
18199 return kr;
18200 }
18201
18202
18203 /*
18204 * vm_map_check_protection:
18205 *
18206 * Assert that the target map allows the specified
18207 * privilege on the entire address region given.
18208 * The entire region must be allocated.
18209 */
18210 boolean_t
vm_map_check_protection(vm_map_t map,vm_map_offset_t start,vm_map_offset_t end,vm_prot_t protection)18211 vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
18212 vm_map_offset_t end, vm_prot_t protection)
18213 {
18214 vm_map_entry_t entry;
18215 vm_map_entry_t tmp_entry;
18216
18217 vm_map_lock(map);
18218
18219 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
18220 vm_map_unlock(map);
18221 return FALSE;
18222 }
18223
18224 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
18225 vm_map_unlock(map);
18226 return FALSE;
18227 }
18228
18229 entry = tmp_entry;
18230
18231 while (start < end) {
18232 if (entry == vm_map_to_entry(map)) {
18233 vm_map_unlock(map);
18234 return FALSE;
18235 }
18236
18237 /*
18238 * No holes allowed!
18239 */
18240
18241 if (start < entry->vme_start) {
18242 vm_map_unlock(map);
18243 return FALSE;
18244 }
18245
18246 /*
18247 * Check protection associated with entry.
18248 */
18249
18250 if ((entry->protection & protection) != protection) {
18251 vm_map_unlock(map);
18252 return FALSE;
18253 }
18254
18255 /* go to next entry */
18256
18257 start = entry->vme_end;
18258 entry = entry->vme_next;
18259 }
18260 vm_map_unlock(map);
18261 return TRUE;
18262 }
18263
18264 kern_return_t
vm_map_purgable_control(vm_map_t map,vm_map_offset_t address,vm_purgable_t control,int * state)18265 vm_map_purgable_control(
18266 vm_map_t map,
18267 vm_map_offset_t address,
18268 vm_purgable_t control,
18269 int *state)
18270 {
18271 vm_map_entry_t entry;
18272 vm_object_t object;
18273 kern_return_t kr;
18274 boolean_t was_nonvolatile;
18275
18276 /*
18277 * Vet all the input parameters and current type and state of the
18278 * underlaying object. Return with an error if anything is amiss.
18279 */
18280 if (map == VM_MAP_NULL) {
18281 return KERN_INVALID_ARGUMENT;
18282 }
18283
18284 if (control != VM_PURGABLE_SET_STATE &&
18285 control != VM_PURGABLE_GET_STATE &&
18286 control != VM_PURGABLE_PURGE_ALL &&
18287 control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
18288 return KERN_INVALID_ARGUMENT;
18289 }
18290
18291 if (control == VM_PURGABLE_PURGE_ALL) {
18292 vm_purgeable_object_purge_all();
18293 return KERN_SUCCESS;
18294 }
18295
18296 if ((control == VM_PURGABLE_SET_STATE ||
18297 control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
18298 (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
18299 ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
18300 return KERN_INVALID_ARGUMENT;
18301 }
18302
18303 vm_map_lock_read(map);
18304
18305 if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
18306 /*
18307 * Must pass a valid non-submap address.
18308 */
18309 vm_map_unlock_read(map);
18310 return KERN_INVALID_ADDRESS;
18311 }
18312
18313 if ((entry->protection & VM_PROT_WRITE) == 0 &&
18314 control != VM_PURGABLE_GET_STATE) {
18315 /*
18316 * Can't apply purgable controls to something you can't write.
18317 */
18318 vm_map_unlock_read(map);
18319 return KERN_PROTECTION_FAILURE;
18320 }
18321
18322 object = VME_OBJECT(entry);
18323 if (object == VM_OBJECT_NULL ||
18324 object->purgable == VM_PURGABLE_DENY) {
18325 /*
18326 * Object must already be present and be purgeable.
18327 */
18328 vm_map_unlock_read(map);
18329 return KERN_INVALID_ARGUMENT;
18330 }
18331
18332 vm_object_lock(object);
18333
18334 #if 00
18335 if (VME_OFFSET(entry) != 0 ||
18336 entry->vme_end - entry->vme_start != object->vo_size) {
18337 /*
18338 * Can only apply purgable controls to the whole (existing)
18339 * object at once.
18340 */
18341 vm_map_unlock_read(map);
18342 vm_object_unlock(object);
18343 return KERN_INVALID_ARGUMENT;
18344 }
18345 #endif
18346
18347 assert(!entry->is_sub_map);
18348 assert(!entry->use_pmap); /* purgeable has its own accounting */
18349
18350 vm_map_unlock_read(map);
18351
18352 was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
18353
18354 kr = vm_object_purgable_control(object, control, state);
18355
18356 if (was_nonvolatile &&
18357 object->purgable != VM_PURGABLE_NONVOLATILE &&
18358 map->pmap == kernel_pmap) {
18359 #if DEBUG
18360 object->vo_purgeable_volatilizer = kernel_task;
18361 #endif /* DEBUG */
18362 }
18363
18364 vm_object_unlock(object);
18365
18366 return kr;
18367 }
18368
18369 void
vm_map_footprint_query_page_info(vm_map_t map,vm_map_entry_t map_entry,vm_map_offset_t curr_s_offset,int * disposition_p)18370 vm_map_footprint_query_page_info(
18371 vm_map_t map,
18372 vm_map_entry_t map_entry,
18373 vm_map_offset_t curr_s_offset,
18374 int *disposition_p)
18375 {
18376 int pmap_disp;
18377 vm_object_t object;
18378 int disposition;
18379 int effective_page_size;
18380
18381 vm_map_lock_assert_held(map);
18382 assert(!map->has_corpse_footprint);
18383 assert(curr_s_offset >= map_entry->vme_start);
18384 assert(curr_s_offset < map_entry->vme_end);
18385
18386 object = VME_OBJECT(map_entry);
18387 if (object == VM_OBJECT_NULL) {
18388 *disposition_p = 0;
18389 return;
18390 }
18391
18392 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
18393
18394 pmap_disp = 0;
18395 if (object == VM_OBJECT_NULL) {
18396 /* nothing mapped here: no need to ask */
18397 *disposition_p = 0;
18398 return;
18399 } else if (map_entry->is_sub_map &&
18400 !map_entry->use_pmap) {
18401 /* nested pmap: no footprint */
18402 *disposition_p = 0;
18403 return;
18404 }
18405
18406 /*
18407 * Query the pmap.
18408 */
18409 pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
18410
18411 /*
18412 * Compute this page's disposition.
18413 */
18414 disposition = 0;
18415
18416 /* deal with "alternate accounting" first */
18417 if (!map_entry->is_sub_map &&
18418 object->vo_no_footprint) {
18419 /* does not count in footprint */
18420 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18421 } else if (!map_entry->is_sub_map &&
18422 (object->purgable == VM_PURGABLE_NONVOLATILE ||
18423 (object->purgable == VM_PURGABLE_DENY &&
18424 object->vo_ledger_tag)) &&
18425 VM_OBJECT_OWNER(object) != NULL &&
18426 VM_OBJECT_OWNER(object)->map == map) {
18427 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18428 if ((((curr_s_offset
18429 - map_entry->vme_start
18430 + VME_OFFSET(map_entry))
18431 / effective_page_size) <
18432 (object->resident_page_count +
18433 vm_compressor_pager_get_count(object->pager)))) {
18434 /*
18435 * Non-volatile purgeable object owned
18436 * by this task: report the first
18437 * "#resident + #compressed" pages as
18438 * "resident" (to show that they
18439 * contribute to the footprint) but not
18440 * "dirty" (to avoid double-counting
18441 * with the fake "non-volatile" region
18442 * we'll report at the end of the
18443 * address space to account for all
18444 * (mapped or not) non-volatile memory
18445 * owned by this task.
18446 */
18447 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18448 }
18449 } else if (!map_entry->is_sub_map &&
18450 (object->purgable == VM_PURGABLE_VOLATILE ||
18451 object->purgable == VM_PURGABLE_EMPTY) &&
18452 VM_OBJECT_OWNER(object) != NULL &&
18453 VM_OBJECT_OWNER(object)->map == map) {
18454 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18455 if ((((curr_s_offset
18456 - map_entry->vme_start
18457 + VME_OFFSET(map_entry))
18458 / effective_page_size) <
18459 object->wired_page_count)) {
18460 /*
18461 * Volatile|empty purgeable object owned
18462 * by this task: report the first
18463 * "#wired" pages as "resident" (to
18464 * show that they contribute to the
18465 * footprint) but not "dirty" (to avoid
18466 * double-counting with the fake
18467 * "non-volatile" region we'll report
18468 * at the end of the address space to
18469 * account for all (mapped or not)
18470 * non-volatile memory owned by this
18471 * task.
18472 */
18473 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18474 }
18475 } else if (!map_entry->is_sub_map &&
18476 map_entry->iokit_acct &&
18477 object->internal &&
18478 object->purgable == VM_PURGABLE_DENY) {
18479 /*
18480 * Non-purgeable IOKit memory: phys_footprint
18481 * includes the entire virtual mapping.
18482 */
18483 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18484 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18485 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
18486 } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
18487 PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
18488 /* alternate accounting */
18489 #if (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG)
18490 if (map->pmap->footprint_was_suspended) {
18491 /*
18492 * The assertion below can fail if dyld
18493 * suspended footprint accounting
18494 * while doing some adjustments to
18495 * this page; the mapping would say
18496 * "use pmap accounting" but the page
18497 * would be marked "alternate
18498 * accounting".
18499 */
18500 } else
18501 #endif /* (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG) */
18502 {
18503 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18504 }
18505 disposition = 0;
18506 } else {
18507 if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
18508 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18509 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
18510 disposition |= VM_PAGE_QUERY_PAGE_REF;
18511 if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
18512 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
18513 } else {
18514 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
18515 }
18516 if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
18517 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
18518 }
18519 } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
18520 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
18521 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
18522 }
18523 }
18524
18525 *disposition_p = disposition;
18526 }
18527
18528 kern_return_t
vm_map_page_query_internal(vm_map_t target_map,vm_map_offset_t offset,int * disposition,int * ref_count)18529 vm_map_page_query_internal(
18530 vm_map_t target_map,
18531 vm_map_offset_t offset,
18532 int *disposition,
18533 int *ref_count)
18534 {
18535 kern_return_t kr;
18536 vm_page_info_basic_data_t info;
18537 mach_msg_type_number_t count;
18538
18539 count = VM_PAGE_INFO_BASIC_COUNT;
18540 kr = vm_map_page_info(target_map,
18541 offset,
18542 VM_PAGE_INFO_BASIC,
18543 (vm_page_info_t) &info,
18544 &count);
18545 if (kr == KERN_SUCCESS) {
18546 *disposition = info.disposition;
18547 *ref_count = info.ref_count;
18548 } else {
18549 *disposition = 0;
18550 *ref_count = 0;
18551 }
18552
18553 return kr;
18554 }
18555
18556 kern_return_t
vm_map_page_info(vm_map_t map,vm_map_offset_t offset,vm_page_info_flavor_t flavor,vm_page_info_t info,mach_msg_type_number_t * count)18557 vm_map_page_info(
18558 vm_map_t map,
18559 vm_map_offset_t offset,
18560 vm_page_info_flavor_t flavor,
18561 vm_page_info_t info,
18562 mach_msg_type_number_t *count)
18563 {
18564 return vm_map_page_range_info_internal(map,
18565 offset, /* start of range */
18566 (offset + 1), /* this will get rounded in the call to the page boundary */
18567 (int)-1, /* effective_page_shift: unspecified */
18568 flavor,
18569 info,
18570 count);
18571 }
18572
18573 kern_return_t
vm_map_page_range_info_internal(
	vm_map_t map,
	vm_map_offset_t start_offset,
	vm_map_offset_t end_offset,
	int effective_page_shift,
	vm_page_info_flavor_t flavor,
	vm_page_info_t info,
	mach_msg_type_number_t *count)
{
	/*
	 * Fill "info" with one record per page of [start_offset, end_offset)
	 * in "map": the page's disposition bits (present / paged-out / dirty /
	 * referenced / ...), an (obfuscated) id for the backing VM object, the
	 * offset within that object, the object's reference count and the
	 * depth in the shadow chain at which the page was found.
	 *
	 * effective_page_shift == -1 means "use the map's own page size"
	 * (resolved via vm_self_region_page_shift_safely() below).
	 * Only the VM_PAGE_INFO_BASIC flavor is supported; "*count" is
	 * validated against that flavor's record size.
	 */
	vm_map_entry_t map_entry = VM_MAP_ENTRY_NULL;
	vm_object_t object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
	vm_page_t m = VM_PAGE_NULL;
	kern_return_t retval = KERN_SUCCESS;
	int disposition = 0;
	int ref_count = 0;
	int depth = 0, info_idx = 0;
	vm_page_info_basic_t basic_info = 0;
	vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
	vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
	boolean_t do_region_footprint;
	ledger_amount_t ledger_resident, ledger_compressed;
	int effective_page_size;
	vm_map_offset_t effective_page_mask;

	switch (flavor) {
	case VM_PAGE_INFO_BASIC:
		if (*count != VM_PAGE_INFO_BASIC_COUNT) {
			/*
			 * The "vm_page_info_basic_data" structure was not
			 * properly padded, so allow the size to be off by
			 * one to maintain backwards binary compatibility...
			 */
			if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
				return KERN_INVALID_ARGUMENT;
			}
		}
		break;
	default:
		/* no other flavors are implemented */
		return KERN_INVALID_ARGUMENT;
	}

	if (effective_page_shift == -1) {
		/* caller asked us to derive the page size from the map itself */
		effective_page_shift = vm_self_region_page_shift_safely(map);
		if (effective_page_shift == -1) {
			return KERN_INVALID_ARGUMENT;
		}
	}
	effective_page_size = (1 << effective_page_shift);
	effective_page_mask = effective_page_size - 1;

	do_region_footprint = task_self_region_footprint();
	disposition = 0;
	ref_count = 0;
	depth = 0;
	info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
	retval = KERN_SUCCESS;

	/* remember the sub-page offset of the very first page queried */
	offset_in_page = start_offset & effective_page_mask;
	start = vm_map_trunc_page(start_offset, effective_page_mask);
	end = vm_map_round_page(end_offset, effective_page_mask);

	if (end < start) {
		return KERN_INVALID_ARGUMENT;
	}

	assert((end - start) <= MAX_PAGE_RANGE_QUERY);

	vm_map_lock_read(map);

	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);

	for (curr_s_offset = start; curr_s_offset < end;) {
		/*
		 * New lookup needs reset of these variables.
		 */
		curr_object = object = VM_OBJECT_NULL;
		offset_in_object = 0;
		ref_count = 0;
		depth = 0;

		if (do_region_footprint &&
		    curr_s_offset >= vm_map_last_entry(map)->vme_end) {
			/*
			 * Request for "footprint" info about a page beyond
			 * the end of address space: this must be for
			 * the fake region vm_map_region_recurse_64()
			 * reported to account for non-volatile purgeable
			 * memory owned by this task.
			 */
			disposition = 0;

			if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
			    (unsigned) ledger_compressed) {
				/*
				 * We haven't reported all the "non-volatile
				 * compressed" pages yet, so report this fake
				 * page as "compressed".
				 */
				disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
			} else {
				/*
				 * We've reported all the non-volatile
				 * compressed page but not all the non-volatile
				 * pages , so report this fake page as
				 * "resident dirty".
				 */
				disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
				disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
				disposition |= VM_PAGE_QUERY_PAGE_REF;
			}
			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = 1;
				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				basic_info->offset = 0;
				basic_info->depth = 0;

				info_idx++;
				break;
			}
			curr_s_offset += effective_page_size;
			continue;
		}

		/*
		 * First, find the map entry covering "curr_s_offset", going down
		 * submaps if necessary.
		 */
		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
			/* no entry -> no object -> no page */

			if (curr_s_offset < vm_map_min(map)) {
				/*
				 * Illegal address that falls below map min.
				 */
				curr_e_offset = MIN(end, vm_map_min(map));
			} else if (curr_s_offset >= vm_map_max(map)) {
				/*
				 * Illegal address that falls on/after map max.
				 */
				curr_e_offset = end;
			} else if (map_entry == vm_map_to_entry(map)) {
				/*
				 * Hit a hole.
				 */
				if (map_entry->vme_next == vm_map_to_entry(map)) {
					/*
					 * Empty map.
					 */
					curr_e_offset = MIN(map->max_offset, end);
				} else {
					/*
					 * Hole at start of the map.
					 */
					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
				}
			} else {
				if (map_entry->vme_next == vm_map_to_entry(map)) {
					/*
					 * Hole at the end of the map.
					 */
					curr_e_offset = MIN(map->max_offset, end);
				} else {
					curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
				}
			}

			assert(curr_e_offset >= curr_s_offset);

			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;

			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			/* zeroed records mean "no page" for the whole hole */
			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));

			curr_s_offset = curr_e_offset;

			info_idx += num_pages;

			continue;
		}

		/* compute offset from this map entry's start */
		offset_in_object = curr_s_offset - map_entry->vme_start;

		/* compute offset into this map entry's object (or submap) */
		offset_in_object += VME_OFFSET(map_entry);

		if (map_entry->is_sub_map) {
			/*
			 * Recurse into the submap for the portion of the
			 * range it covers. The parent map lock is dropped
			 * around the recursion, so "map_entry" must not be
			 * used after vm_map_unlock_read() below.
			 */
			vm_map_t sub_map = VM_MAP_NULL;
			vm_page_info_t submap_info = 0;
			vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;

			range_len = MIN(map_entry->vme_end, end) - curr_s_offset;

			submap_s_offset = offset_in_object;
			submap_e_offset = submap_s_offset + range_len;

			sub_map = VME_SUBMAP(map_entry);

			/* keep the submap alive while the parent is unlocked */
			vm_map_reference(sub_map);
			vm_map_unlock_read(map);

			submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
			    "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));

			retval = vm_map_page_range_info_internal(sub_map,
			    submap_s_offset,
			    submap_e_offset,
			    effective_page_shift,
			    VM_PAGE_INFO_BASIC,
			    (vm_page_info_t) submap_info,
			    count);

			assert(retval == KERN_SUCCESS);

			vm_map_lock_read(map);
			vm_map_deallocate(sub_map);

			/* Move the "info" index by the number of pages we inspected.*/
			info_idx += range_len >> effective_page_shift;

			/* Move our current offset by the size of the range we inspected.*/
			curr_s_offset += range_len;

			continue;
		}

		object = VME_OBJECT(map_entry);

		if (object == VM_OBJECT_NULL) {
			/*
			 * We don't have an object here and, hence,
			 * no pages to inspect. We'll fill up the
			 * info structure appropriately.
			 */

			curr_e_offset = MIN(map_entry->vme_end, end);

			uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;

			void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));

			bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));

			curr_s_offset = curr_e_offset;

			info_idx += num_pages;

			continue;
		}

		if (do_region_footprint) {
			disposition = 0;
			if (map->has_corpse_footprint) {
				/*
				 * Query the page info data we saved
				 * while forking the corpse.
				 */
				vm_map_corpse_footprint_query_page_info(
					map,
					curr_s_offset,
					&disposition);
			} else {
				/*
				 * Query the live pmap for footprint info
				 * about this page.
				 */
				vm_map_footprint_query_page_info(
					map,
					map_entry,
					curr_s_offset,
					&disposition);
			}
			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = 1;
				basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
				basic_info->offset = 0;
				basic_info->depth = 0;

				info_idx++;
				break;
			}
			curr_s_offset += effective_page_size;
			continue;
		}

		/*
		 * Take a reference and the object lock before dropping the
		 * map lock: the object must stay alive while we walk its
		 * pages (and its shadow chain) below.
		 */
		vm_object_reference(object);
		/*
		 * Shared mode -- so we can allow other readers
		 * to grab the lock too.
		 */
		vm_object_lock_shared(object);

		curr_e_offset = MIN(map_entry->vme_end, end);

		vm_map_unlock_read(map);

		map_entry = NULL; /* map is unlocked, the entry is no longer valid. */

		curr_object = object;

		for (; curr_s_offset < curr_e_offset;) {
			if (object == curr_object) {
				ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
			} else {
				ref_count = curr_object->ref_count;
			}

			curr_offset_in_object = offset_in_object;

			for (;;) {
				m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));

				if (m != VM_PAGE_NULL) {
					disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
					break;
				} else {
					if (curr_object->internal &&
					    curr_object->alive &&
					    !curr_object->terminating &&
					    curr_object->pager_ready) {
						if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
						    == VM_EXTERNAL_STATE_EXISTS) {
							/* the pager has that page */
							disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
							break;
						}
					}

					/*
					 * Go down the VM object shadow chain until we find the page
					 * we're looking for.
					 */

					if (curr_object->shadow != VM_OBJECT_NULL) {
						vm_object_t shadow = VM_OBJECT_NULL;

						curr_offset_in_object += curr_object->vo_shadow_offset;
						shadow = curr_object->shadow;

						/* lock the shadow before releasing its parent */
						vm_object_lock_shared(shadow);
						vm_object_unlock(curr_object);

						curr_object = shadow;
						depth++;
						continue;
					} else {
						break;
					}
				}
			}

			/* The ref_count is not strictly accurate, it measures the number */
			/* of entities holding a ref on the object, they may not be mapping */
			/* the object or may not be mapping the section holding the */
			/* target page but its still a ball park number and though an over- */
			/* count, it picks up the copy-on-write cases */

			/* We could also get a picture of page sharing from pmap_attributes */
			/* but this would under count as only faulted-in mappings would */
			/* show up. */

			if ((curr_object == object) && curr_object->shadow) {
				disposition |= VM_PAGE_QUERY_PAGE_COPIED;
			}

			if (!curr_object->internal) {
				disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
			}

			if (m != VM_PAGE_NULL) {
				if (m->vmp_fictitious) {
					disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
				} else {
					if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
						disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
					}

					if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
						disposition |= VM_PAGE_QUERY_PAGE_REF;
					}

					if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
						disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
					}

					/*
					 * XXX TODO4K:
					 * when this routine deals with 4k
					 * pages, check the appropriate CS bit
					 * here.
					 */
					if (m->vmp_cs_validated) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
					}
					if (m->vmp_cs_tainted) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
					}
					if (m->vmp_cs_nx) {
						disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
					}
					if (m->vmp_reusable || curr_object->all_reusable) {
						disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
					}
				}
			}

			switch (flavor) {
			case VM_PAGE_INFO_BASIC:
				basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
				basic_info->disposition = disposition;
				basic_info->ref_count = ref_count;
				basic_info->object_id = (vm_object_id_t) (uintptr_t)
				    VM_KERNEL_ADDRPERM(curr_object);
				basic_info->offset =
				    (memory_object_offset_t) curr_offset_in_object + offset_in_page;
				basic_info->depth = depth;

				info_idx++;
				break;
			}

			disposition = 0;
			offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.

			/*
			 * Move to next offset in the range and in our object.
			 */
			curr_s_offset += effective_page_size;
			offset_in_object += effective_page_size;
			curr_offset_in_object = offset_in_object;

			if (curr_object != object) {
				/* we ended deep in the shadow chain: restart at the top object */
				vm_object_unlock(curr_object);

				curr_object = object;

				vm_object_lock_shared(curr_object);
			} else {
				/* yield the shared lock to avoid starving writers */
				vm_object_lock_yield_shared(curr_object);
			}
		}

		vm_object_unlock(curr_object);
		vm_object_deallocate(curr_object);

		vm_map_lock_read(map);
	}

	vm_map_unlock_read(map);
	return retval;
}
19034
19035 /*
19036 * vm_map_msync
19037 *
19038 * Synchronises the memory range specified with its backing store
19039 * image by either flushing or cleaning the contents to the appropriate
19040 * memory manager engaging in a memory object synchronize dialog with
19041 * the manager. The client doesn't return until the manager issues
19042 * m_o_s_completed message. MIG Magically converts user task parameter
19043 * to the task's address map.
19044 *
19045 * interpretation of sync_flags
19046 * VM_SYNC_INVALIDATE - discard pages, only return precious
19047 * pages to manager.
19048 *
19049 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
19050 * - discard pages, write dirty or precious
19051 * pages back to memory manager.
19052 *
19053 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
19054 * - write dirty or precious pages back to
19055 * the memory manager.
19056 *
19057 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
19058 * is a hole in the region, and we would
19059 * have returned KERN_SUCCESS, return
19060 * KERN_INVALID_ADDRESS instead.
19061 *
19062 * NOTE
19063 * The memory object attributes have not yet been implemented, this
19064 * function will have to deal with the invalidate attribute
19065 *
19066 * RETURNS
19067 * KERN_INVALID_TASK Bad task parameter
19068 * KERN_INVALID_ARGUMENT both sync and async were specified.
19069 * KERN_SUCCESS The usual.
19070 * KERN_INVALID_ADDRESS There was a hole in the region.
19071 */
19072
19073 kern_return_t
vm_map_msync(vm_map_t map,vm_map_address_t address,vm_map_size_t size,vm_sync_t sync_flags)19074 vm_map_msync(
19075 vm_map_t map,
19076 vm_map_address_t address,
19077 vm_map_size_t size,
19078 vm_sync_t sync_flags)
19079 {
19080 vm_map_entry_t entry;
19081 vm_map_size_t amount_left;
19082 vm_object_offset_t offset;
19083 vm_object_offset_t start_offset, end_offset;
19084 boolean_t do_sync_req;
19085 boolean_t had_hole = FALSE;
19086 vm_map_offset_t pmap_offset;
19087
19088 if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
19089 (sync_flags & VM_SYNC_SYNCHRONOUS)) {
19090 return KERN_INVALID_ARGUMENT;
19091 }
19092
19093 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19094 DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
19095 }
19096
19097 /*
19098 * align address and size on page boundaries
19099 */
19100 size = (vm_map_round_page(address + size,
19101 VM_MAP_PAGE_MASK(map)) -
19102 vm_map_trunc_page(address,
19103 VM_MAP_PAGE_MASK(map)));
19104 address = vm_map_trunc_page(address,
19105 VM_MAP_PAGE_MASK(map));
19106
19107 if (map == VM_MAP_NULL) {
19108 return KERN_INVALID_TASK;
19109 }
19110
19111 if (size == 0) {
19112 return KERN_SUCCESS;
19113 }
19114
19115 amount_left = size;
19116
19117 while (amount_left > 0) {
19118 vm_object_size_t flush_size;
19119 vm_object_t object;
19120
19121 vm_map_lock(map);
19122 if (!vm_map_lookup_entry(map,
19123 address,
19124 &entry)) {
19125 vm_map_size_t skip;
19126
19127 /*
19128 * hole in the address map.
19129 */
19130 had_hole = TRUE;
19131
19132 if (sync_flags & VM_SYNC_KILLPAGES) {
19133 /*
19134 * For VM_SYNC_KILLPAGES, there should be
19135 * no holes in the range, since we couldn't
19136 * prevent someone else from allocating in
19137 * that hole and we wouldn't want to "kill"
19138 * their pages.
19139 */
19140 vm_map_unlock(map);
19141 break;
19142 }
19143
19144 /*
19145 * Check for empty map.
19146 */
19147 if (entry == vm_map_to_entry(map) &&
19148 entry->vme_next == entry) {
19149 vm_map_unlock(map);
19150 break;
19151 }
19152 /*
19153 * Check that we don't wrap and that
19154 * we have at least one real map entry.
19155 */
19156 if ((map->hdr.nentries == 0) ||
19157 (entry->vme_next->vme_start < address)) {
19158 vm_map_unlock(map);
19159 break;
19160 }
19161 /*
19162 * Move up to the next entry if needed
19163 */
19164 skip = (entry->vme_next->vme_start - address);
19165 if (skip >= amount_left) {
19166 amount_left = 0;
19167 } else {
19168 amount_left -= skip;
19169 }
19170 address = entry->vme_next->vme_start;
19171 vm_map_unlock(map);
19172 continue;
19173 }
19174
19175 offset = address - entry->vme_start;
19176 pmap_offset = address;
19177
19178 /*
19179 * do we have more to flush than is contained in this
19180 * entry ?
19181 */
19182 if (amount_left + entry->vme_start + offset > entry->vme_end) {
19183 flush_size = entry->vme_end -
19184 (entry->vme_start + offset);
19185 } else {
19186 flush_size = amount_left;
19187 }
19188 amount_left -= flush_size;
19189 address += flush_size;
19190
19191 if (entry->is_sub_map == TRUE) {
19192 vm_map_t local_map;
19193 vm_map_offset_t local_offset;
19194
19195 local_map = VME_SUBMAP(entry);
19196 local_offset = VME_OFFSET(entry);
19197 vm_map_reference(local_map);
19198 vm_map_unlock(map);
19199 if (vm_map_msync(
19200 local_map,
19201 local_offset,
19202 flush_size,
19203 sync_flags) == KERN_INVALID_ADDRESS) {
19204 had_hole = TRUE;
19205 }
19206 vm_map_deallocate(local_map);
19207 continue;
19208 }
19209 object = VME_OBJECT(entry);
19210
19211 /*
19212 * We can't sync this object if the object has not been
19213 * created yet
19214 */
19215 if (object == VM_OBJECT_NULL) {
19216 vm_map_unlock(map);
19217 continue;
19218 }
19219 offset += VME_OFFSET(entry);
19220
19221 vm_object_lock(object);
19222
19223 if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
19224 int kill_pages = 0;
19225 boolean_t reusable_pages = FALSE;
19226
19227 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19228 /*
19229 * This is a destructive operation and so we
19230 * err on the side of limiting the range of
19231 * the operation.
19232 */
19233 start_offset = vm_object_round_page(offset);
19234 end_offset = vm_object_trunc_page(offset + flush_size);
19235
19236 if (end_offset <= start_offset) {
19237 vm_object_unlock(object);
19238 vm_map_unlock(map);
19239 continue;
19240 }
19241
19242 pmap_offset += start_offset - offset;
19243 } else {
19244 start_offset = offset;
19245 end_offset = offset + flush_size;
19246 }
19247
19248 if (sync_flags & VM_SYNC_KILLPAGES) {
19249 if (((object->ref_count == 1) ||
19250 ((object->copy_strategy !=
19251 MEMORY_OBJECT_COPY_SYMMETRIC) &&
19252 (object->copy == VM_OBJECT_NULL))) &&
19253 (object->shadow == VM_OBJECT_NULL)) {
19254 if (object->ref_count != 1) {
19255 vm_page_stats_reusable.free_shared++;
19256 }
19257 kill_pages = 1;
19258 } else {
19259 kill_pages = -1;
19260 }
19261 }
19262 if (kill_pages != -1) {
19263 vm_object_deactivate_pages(
19264 object,
19265 start_offset,
19266 (vm_object_size_t) (end_offset - start_offset),
19267 kill_pages,
19268 reusable_pages,
19269 map->pmap,
19270 pmap_offset);
19271 }
19272 vm_object_unlock(object);
19273 vm_map_unlock(map);
19274 continue;
19275 }
19276 /*
19277 * We can't sync this object if there isn't a pager.
19278 * Don't bother to sync internal objects, since there can't
19279 * be any "permanent" storage for these objects anyway.
19280 */
19281 if ((object->pager == MEMORY_OBJECT_NULL) ||
19282 (object->internal) || (object->private)) {
19283 vm_object_unlock(object);
19284 vm_map_unlock(map);
19285 continue;
19286 }
19287 /*
19288 * keep reference on the object until syncing is done
19289 */
19290 vm_object_reference_locked(object);
19291 vm_object_unlock(object);
19292
19293 vm_map_unlock(map);
19294
19295 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
19296 start_offset = vm_object_trunc_page(offset);
19297 end_offset = vm_object_round_page(offset + flush_size);
19298 } else {
19299 start_offset = offset;
19300 end_offset = offset + flush_size;
19301 }
19302
19303 do_sync_req = vm_object_sync(object,
19304 start_offset,
19305 (end_offset - start_offset),
19306 sync_flags & VM_SYNC_INVALIDATE,
19307 ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
19308 (sync_flags & VM_SYNC_ASYNCHRONOUS)),
19309 sync_flags & VM_SYNC_SYNCHRONOUS);
19310
19311 if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
19312 /*
19313 * clear out the clustering and read-ahead hints
19314 */
19315 vm_object_lock(object);
19316
19317 object->pages_created = 0;
19318 object->pages_used = 0;
19319 object->sequential = 0;
19320 object->last_alloc = 0;
19321
19322 vm_object_unlock(object);
19323 }
19324 vm_object_deallocate(object);
19325 } /* while */
19326
19327 /* for proper msync() behaviour */
19328 if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
19329 return KERN_INVALID_ADDRESS;
19330 }
19331
19332 return KERN_SUCCESS;
19333 }/* vm_msync */
19334
19335 void
vm_named_entry_associate_vm_object(vm_named_entry_t named_entry,vm_object_t object,vm_object_offset_t offset,vm_object_size_t size,vm_prot_t prot)19336 vm_named_entry_associate_vm_object(
19337 vm_named_entry_t named_entry,
19338 vm_object_t object,
19339 vm_object_offset_t offset,
19340 vm_object_size_t size,
19341 vm_prot_t prot)
19342 {
19343 vm_map_copy_t copy;
19344 vm_map_entry_t copy_entry;
19345
19346 assert(!named_entry->is_sub_map);
19347 assert(!named_entry->is_copy);
19348 assert(!named_entry->is_object);
19349 assert(!named_entry->internal);
19350 assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
19351
19352 copy = vm_map_copy_allocate();
19353 copy->type = VM_MAP_COPY_ENTRY_LIST;
19354 copy->offset = offset;
19355 copy->size = size;
19356 copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
19357 vm_map_store_init(©->cpy_hdr);
19358
19359 copy_entry = vm_map_copy_entry_create(copy);
19360 copy_entry->protection = prot;
19361 copy_entry->max_protection = prot;
19362 copy_entry->use_pmap = TRUE;
19363 copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
19364 copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
19365 VME_OBJECT_SET(copy_entry, object);
19366 VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
19367 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
19368
19369 named_entry->backing.copy = copy;
19370 named_entry->is_object = TRUE;
19371 if (object->internal) {
19372 named_entry->internal = TRUE;
19373 }
19374
19375 DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
19376 named_entry, copy, object, offset, size, prot);
19377 }
19378
19379 vm_object_t
vm_named_entry_to_vm_object(vm_named_entry_t named_entry)19380 vm_named_entry_to_vm_object(
19381 vm_named_entry_t named_entry)
19382 {
19383 vm_map_copy_t copy;
19384 vm_map_entry_t copy_entry;
19385 vm_object_t object;
19386
19387 assert(!named_entry->is_sub_map);
19388 assert(!named_entry->is_copy);
19389 assert(named_entry->is_object);
19390 copy = named_entry->backing.copy;
19391 assert(copy != VM_MAP_COPY_NULL);
19392 /*
19393 * Assert that the vm_map_copy is coming from the right
19394 * zone and hasn't been forged
19395 */
19396 vm_map_copy_require(copy);
19397 assert(copy->cpy_hdr.nentries == 1);
19398 copy_entry = vm_map_copy_first_entry(copy);
19399 assert(!copy_entry->is_sub_map);
19400 object = VME_OBJECT(copy_entry);
19401
19402 DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
19403
19404 return object;
19405 }
19406
19407 /*
19408 * Routine: convert_port_entry_to_map
19409 * Purpose:
19410 * Convert from a port specifying an entry or a task
19411 * to a map. Doesn't consume the port ref; produces a map ref,
19412 * which may be null. Unlike convert_port_to_map, the
19413 * port may be task or a named entry backed.
19414 * Conditions:
19415 * Nothing locked.
19416 */
19417
19418 vm_map_t
convert_port_entry_to_map(ipc_port_t port)19419 convert_port_entry_to_map(
19420 ipc_port_t port)
19421 {
19422 vm_map_t map = VM_MAP_NULL;
19423 vm_named_entry_t named_entry;
19424
19425 if (!IP_VALID(port)) {
19426 return VM_MAP_NULL;
19427 }
19428
19429 if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
19430 return convert_port_to_map(port);
19431 }
19432
19433 named_entry = mach_memory_entry_from_port(port);
19434
19435 if ((named_entry->is_sub_map) &&
19436 (named_entry->protection & VM_PROT_WRITE)) {
19437 map = named_entry->backing.map;
19438 if (map->pmap != PMAP_NULL) {
19439 if (map->pmap == kernel_pmap) {
19440 panic("userspace has access "
19441 "to a kernel map %p", map);
19442 }
19443 pmap_require(map->pmap);
19444 }
19445 vm_map_reference(map);
19446 }
19447
19448 return map;
19449 }
19450
19451 /*
19452 * Export routines to other components for the things we access locally through
19453 * macros.
19454 */
19455 #undef current_map
19456 vm_map_t
current_map(void)19457 current_map(void)
19458 {
19459 return current_map_fast();
19460 }
19461
19462 /*
19463 * vm_map_reference:
19464 *
19465 * Takes a reference on the specified map.
19466 */
19467 void
vm_map_reference(vm_map_t map)19468 vm_map_reference(
19469 vm_map_t map)
19470 {
19471 if (__probable(map != VM_MAP_NULL)) {
19472 vm_map_require(map);
19473 os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
19474 }
19475 }
19476
19477 /*
19478 * vm_map_deallocate:
19479 *
19480 * Removes a reference from the specified map,
19481 * destroying it if no references remain.
19482 * The map should not be locked.
19483 */
19484 void
vm_map_deallocate(vm_map_t map)19485 vm_map_deallocate(
19486 vm_map_t map)
19487 {
19488 if (__probable(map != VM_MAP_NULL)) {
19489 vm_map_require(map);
19490 if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
19491 vm_map_destroy(map);
19492 }
19493 }
19494 }
19495
19496 void
vm_map_inspect_deallocate(vm_map_inspect_t map)19497 vm_map_inspect_deallocate(
19498 vm_map_inspect_t map)
19499 {
19500 vm_map_deallocate((vm_map_t)map);
19501 }
19502
19503 void
vm_map_read_deallocate(vm_map_read_t map)19504 vm_map_read_deallocate(
19505 vm_map_read_t map)
19506 {
19507 vm_map_deallocate((vm_map_t)map);
19508 }
19509
19510
19511 void
vm_map_disable_NX(vm_map_t map)19512 vm_map_disable_NX(vm_map_t map)
19513 {
19514 if (map == NULL) {
19515 return;
19516 }
19517 if (map->pmap == NULL) {
19518 return;
19519 }
19520
19521 pmap_disable_NX(map->pmap);
19522 }
19523
19524 void
vm_map_disallow_data_exec(vm_map_t map)19525 vm_map_disallow_data_exec(vm_map_t map)
19526 {
19527 if (map == NULL) {
19528 return;
19529 }
19530
19531 map->map_disallow_data_exec = TRUE;
19532 }
19533
19534 /* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
19535 * more descriptive.
19536 */
/* Constrain "map" to a 32-bit address space ceiling. */
void
vm_map_set_32bit(vm_map_t map)
{
#if defined(__arm__) || defined(__arm64__)
	/* ARM: ask the pmap layer for the device's 32-bit max offset */
	map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
#endif
}
19546
19547
/* Give "map" a 64-bit address space ceiling. */
void
vm_map_set_64bit(vm_map_t map)
{
#if defined(__arm__) || defined(__arm64__)
	/* ARM: ask the pmap layer for the device's 64-bit max offset */
	map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
#else
	map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
#endif
}
19557
19558 /*
19559 * Expand the maximum size of an existing map to the maximum supported.
19560 */
/*
 * Expand the maximum size of an existing map to the maximum supported.
 * Only meaningful on arm64 (without CONFIG_ARROW); elsewhere a no-op.
 */
void
vm_map_set_jumbo(vm_map_t map)
{
#if defined (__arm64__) && !defined(CONFIG_ARROW)
	/* ~0 is clamped to the pmap's jumbo maximum inside vm_map_set_max_addr() */
	vm_map_set_max_addr(map, ~0);
#else /* arm64 */
	(void) map;
#endif
}
19570
19571 /*
19572 * This map has a JIT entitlement
19573 */
/*
 * This map has a JIT entitlement: propagate that to the pmap
 * (arm64 only; a no-op elsewhere).
 */
void
vm_map_set_jit_entitled(vm_map_t map)
{
#if defined (__arm64__)
	pmap_set_jit_entitled(map->pmap);
#else /* arm64 */
	(void) map;
#endif
}
19583
19584 /*
19585 * Expand the maximum size of an existing map.
19586 */
/*
 * Expand the maximum size of an existing map.
 * Grows map->max_offset toward "new_max_offset" (clamped to the pmap's
 * jumbo maximum) and extends or creates the trailing hole so the new
 * space is allocatable. Never shrinks the map. arm64 only; a no-op
 * elsewhere.
 *
 * NOTE(review): callers appear to be expected to hold the map exclusively
 * while the holes list is edited — confirm against call sites.
 */
void
vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
{
#if defined(__arm64__)
	vm_map_offset_t max_supported_offset = 0;
	vm_map_offset_t old_max_offset = map->max_offset;
	max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);

	new_max_offset = trunc_page(new_max_offset);

	/* The address space cannot be shrunk using this routine. */
	if (old_max_offset >= new_max_offset) {
		return;
	}

	/* clamp the request to what the pmap supports */
	if (max_supported_offset < new_max_offset) {
		new_max_offset = max_supported_offset;
	}

	map->max_offset = new_max_offset;

	if (map->holes_list->prev->vme_end == old_max_offset) {
		/*
		 * There is already a hole at the end of the map; simply make it bigger.
		 */
		map->holes_list->prev->vme_end = map->max_offset;
	} else {
		/*
		 * There is no hole at the end, so we need to create a new hole
		 * for the new empty space we're creating.
		 */
		struct vm_map_links *new_hole = zalloc(vm_map_holes_zone);
		new_hole->start = old_max_offset;
		new_hole->end = map->max_offset;
		/* splice the new hole in as the last element of the circular list */
		new_hole->prev = map->holes_list->prev;
		new_hole->next = (struct vm_map_entry *)map->holes_list;
		map->holes_list->prev->links.next = (struct vm_map_entry *)new_hole;
		map->holes_list->prev = (struct vm_map_entry *)new_hole;
	}
#else
	(void)map;
	(void)new_max_offset;
#endif
}
19631
19632 vm_map_offset_t
vm_compute_max_offset(boolean_t is64)19633 vm_compute_max_offset(boolean_t is64)
19634 {
19635 #if defined(__arm__) || defined(__arm64__)
19636 return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
19637 #else
19638 return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
19639 #endif
19640 }
19641
/*
 * Report how the ASLR slide space is partitioned for "map":
 * "*max_sections" independent sections of "*section_size" bytes each.
 * On arm64 the sections align with translation-table twig granularity;
 * other platforms use a single unbounded section (size 0).
 */
void
vm_map_get_max_aslr_slide_section(
	vm_map_t map __unused,
	int64_t *max_sections,
	int64_t *section_size)
{
#if defined(__arm64__)
	*max_sections = 3;
	*section_size = ARM_TT_TWIG_SIZE;
#else
	*max_sections = 1;
	*section_size = 0;
#endif
}
19656
/* Maximum number of map-sized pages the main ASLR slide may span. */
uint64_t
vm_map_get_max_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
	 * limited embedded address space; this is also meant to minimize pmap
	 * memory usage on 16KB page systems.
	 */
	return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
#else
	/* 64-bit maps get 2^16 pages of slide, 32-bit maps 2^8 */
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
19670
/* Maximum number of map-sized pages the dynamic-loader ASLR slide may span. */
uint64_t
vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
{
#if defined(__arm64__)
	/* We limit the loader slide to 4MB, in order to ensure at least 8 bits
	 * of independent entropy on 16KB page systems.
	 */
	return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
#else
	/* 64-bit maps get 2^16 pages of slide, 32-bit maps 2^8 */
	return 1 << (vm_map_is_64bit(map) ? 16 : 8);
#endif
}
19683
19684 #ifndef __arm__
/* TRUE when "map"'s ceiling lies above the 32-bit VM_MAX_ADDRESS limit. */
boolean_t
vm_map_is_64bit(
	vm_map_t map)
{
	return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
}
19691 #endif
19692
/*
 * TRUE when "map" reserves at least "pagezero_size" bytes of inaccessible
 * low address space (a "hard" page zero), i.e. its min_offset starts at
 * or above that size.
 */
boolean_t
vm_map_has_hard_pagezero(
	vm_map_t map,
	vm_map_offset_t pagezero_size)
{
	/*
	 * XXX FBDP
	 * We should lock the VM map (for read) here but we can get away
	 * with it for now because there can't really be any race condition:
	 * the VM map's min_offset is changed only when the VM map is created
	 * and when the zero page is established (when the binary gets loaded),
	 * and this routine gets called only when the task terminates and the
	 * VM map is being torn down, and when a new map is created via
	 * load_machfile()/execve().
	 */
	return map->min_offset >= pagezero_size;
}
19710
/*
 * Raise a VM map's maximum offset.
 */
19714 kern_return_t
vm_map_raise_max_offset(vm_map_t map,vm_map_offset_t new_max_offset)19715 vm_map_raise_max_offset(
19716 vm_map_t map,
19717 vm_map_offset_t new_max_offset)
19718 {
19719 kern_return_t ret;
19720
19721 vm_map_lock(map);
19722 ret = KERN_INVALID_ADDRESS;
19723
19724 if (new_max_offset >= map->max_offset) {
19725 if (!vm_map_is_64bit(map)) {
19726 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
19727 map->max_offset = new_max_offset;
19728 ret = KERN_SUCCESS;
19729 }
19730 } else {
19731 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
19732 map->max_offset = new_max_offset;
19733 ret = KERN_SUCCESS;
19734 }
19735 }
19736 }
19737
19738 vm_map_unlock(map);
19739 return ret;
19740 }
19741
19742
19743 /*
19744 * Raise a VM map's minimum offset.
19745 * To strictly enforce "page zero" reservation.
19746 */
/*
 * vm_map_raise_min_offset:
 * Raise the map's minimum offset (rounded up to the map's page size) to
 * strictly enforce a "page zero" reservation. Fails if the new minimum
 * would move backwards, would pass the end of the address space, or if
 * memory is already mapped below it.
 */
kern_return_t
vm_map_raise_min_offset(
	vm_map_t        map,
	vm_map_offset_t new_min_offset)
{
	vm_map_entry_t first_entry;

	/* Round up to the map's page size before any comparison. */
	new_min_offset = vm_map_round_page(new_min_offset,
	    VM_MAP_PAGE_MASK(map));

	vm_map_lock(map);

	if (new_min_offset < map->min_offset) {
		/*
		 * Can't move min_offset backwards, as that would expose
		 * a part of the address space that was previously, and for
		 * possibly good reasons, inaccessible.
		 */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}
	if (new_min_offset >= map->max_offset) {
		/* can't go beyond the end of the address space */
		vm_map_unlock(map);
		return KERN_INVALID_ADDRESS;
	}

	first_entry = vm_map_first_entry(map);
	if (first_entry != vm_map_to_entry(map) &&
	    first_entry->vme_start < new_min_offset) {
		/*
		 * Some memory was already allocated below the new
		 * minimum offset. It's too late to change it now...
		 */
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	map->min_offset = new_min_offset;

	/* Keep the first hole consistent with the new bottom of the map. */
	assert(map->holes_list);
	map->holes_list->start = new_min_offset;
	assert(new_min_offset < map->holes_list->end);

	vm_map_unlock(map);

	return KERN_SUCCESS;
}
19795
19796 /*
19797 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
19798 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
19799 * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
19800 * have to reach over to the BSD data structures.
19801 */
19802
/* Number of times a finite (non-RLIM_INFINITY) size limit was set. */
uint64_t vm_map_set_size_limit_count = 0;

/*
 * vm_map_set_size_limit:
 * Set the map's copy of the RLIMIT_AS value. Fails if the new limit is
 * below the map's current size; setting the same limit again succeeds
 * without emitting a DTrace probe.
 */
kern_return_t
vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
{
	kern_return_t kr;

	vm_map_lock(map);
	if (new_size_limit < map->size) {
		/* new limit should not be lower than its current size */
		DTRACE_VM2(vm_map_set_size_limit_fail,
		    vm_map_size_t, map->size,
		    uint64_t, new_size_limit);
		kr = KERN_FAILURE;
	} else if (new_size_limit == map->size_limit) {
		/* no change */
		kr = KERN_SUCCESS;
	} else {
		/* set new limit */
		DTRACE_VM2(vm_map_set_size_limit,
		    vm_map_size_t, map->size,
		    uint64_t, new_size_limit);
		if (new_size_limit != RLIM_INFINITY) {
			vm_map_set_size_limit_count++;
		}
		map->size_limit = new_size_limit;
		kr = KERN_SUCCESS;
	}
	vm_map_unlock(map);
	return kr;
}
19833
/* Number of times a finite (non-RLIM_INFINITY) data limit was set. */
uint64_t vm_map_set_data_limit_count = 0;

/*
 * vm_map_set_data_limit:
 * Set the map's copy of the RLIMIT_DATA value. Mirrors
 * vm_map_set_size_limit(): fails if the new limit is below the map's
 * current size; a no-op change succeeds without a DTrace probe.
 */
kern_return_t
vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
{
	kern_return_t kr;

	vm_map_lock(map);
	if (new_data_limit < map->size) {
		/* new limit should not be lower than its current size */
		DTRACE_VM2(vm_map_set_data_limit_fail,
		    vm_map_size_t, map->size,
		    uint64_t, new_data_limit);
		kr = KERN_FAILURE;
	} else if (new_data_limit == map->data_limit) {
		/* no change */
		kr = KERN_SUCCESS;
	} else {
		/* set new limit */
		DTRACE_VM2(vm_map_set_data_limit,
		    vm_map_size_t, map->size,
		    uint64_t, new_data_limit);
		if (new_data_limit != RLIM_INFINITY) {
			vm_map_set_data_limit_count++;
		}
		map->data_limit = new_data_limit;
		kr = KERN_SUCCESS;
	}
	vm_map_unlock(map);
	return kr;
}
19864
/*
 * vm_map_set_user_wire_limit:
 * Set the cap on user-wired memory for this map (the kernel's copy of
 * the BSD RLIMIT_MEMLOCK value). Updated under the map lock.
 */
void
vm_map_set_user_wire_limit(vm_map_t map,
    vm_size_t limit)
{
	vm_map_lock(map);
	map->user_wire_limit = limit;
	vm_map_unlock(map);
}
19873
19874
/*
 * vm_map_switch_protect:
 * Set the map's switch_protect flag under the map lock.
 */
void
vm_map_switch_protect(vm_map_t map,
    boolean_t val)
{
	vm_map_lock(map);
	map->switch_protect = val;
	vm_map_unlock(map);
}
19883
19884 extern int cs_process_enforcement_enable;
19885 boolean_t
vm_map_cs_enforcement(vm_map_t map)19886 vm_map_cs_enforcement(
19887 vm_map_t map)
19888 {
19889 if (cs_process_enforcement_enable) {
19890 return TRUE;
19891 }
19892 return map->cs_enforcement;
19893 }
19894
/*
 * vm_map_cs_wx_enable:
 * Ask the pmap layer to allow "invalid" (code-signing exempt) pages in
 * this map's pmap; returns the pmap layer's result.
 */
kern_return_t
vm_map_cs_wx_enable(
	vm_map_t map)
{
	return pmap_cs_allow_invalid(vm_map_pmap(map));
}
19901
/*
 * vm_map_cs_debugged_set:
 * Set the map's cs_debugged flag under the map lock.
 */
void
vm_map_cs_debugged_set(
	vm_map_t  map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_debugged = val;
	vm_map_unlock(map);
}
19911
/*
 * vm_map_cs_enforcement_set:
 * Set the map's cs_enforcement flag and propagate it to the pmap layer,
 * both under the map lock so the two views stay consistent.
 */
void
vm_map_cs_enforcement_set(
	vm_map_t  map,
	boolean_t val)
{
	vm_map_lock(map);
	map->cs_enforcement = val;
	pmap_set_vm_map_cs_enforced(map->pmap, val);
	vm_map_unlock(map);
}
19922
19923 /*
19924 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
19925 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
19926 * bump both counters.
19927 */
19928 void
vm_map_iokit_mapped_region(vm_map_t map,vm_size_t bytes)19929 vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
19930 {
19931 pmap_t pmap = vm_map_pmap(map);
19932
19933 ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
19934 ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
19935 }
19936
19937 void
vm_map_iokit_unmapped_region(vm_map_t map,vm_size_t bytes)19938 vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
19939 {
19940 pmap_t pmap = vm_map_pmap(map);
19941
19942 ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
19943 ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
19944 }
19945
19946 /* Add (generate) code signature for memory range */
#if CONFIG_DYNAMIC_CODE_SIGNING
/*
 * vm_map_sign:
 * Mark every resident page of [start, end) in the entry's backing object
 * as code-signing validated, disconnecting each page from the pmap so
 * future modification attempts are noticed. The range must be covered
 * by a single non-submap entry with an existing object, and every page
 * in the range must already be resident and in a usable state.
 */
kern_return_t
vm_map_sign(vm_map_t map,
    vm_map_offset_t start,
    vm_map_offset_t end)
{
	vm_map_entry_t entry;
	vm_page_t m;
	vm_object_t object;

	/*
	 * Vet all the input parameters and current type and state of the
	 * underlying object. Return with an error if anything is amiss.
	 */
	if (map == VM_MAP_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
		/*
		 * Must pass a valid non-submap address.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ADDRESS;
	}

	if ((entry->vme_start > start) || (entry->vme_end < end)) {
		/*
		 * Map entry doesn't cover the requested range. Not handling
		 * this situation currently.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	object = VME_OBJECT(entry);
	if (object == VM_OBJECT_NULL) {
		/*
		 * Object must already be present or we can't sign.
		 */
		vm_map_unlock_read(map);
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Take the object lock before dropping the map lock.
	 * NOTE(review): "entry" is still dereferenced below after the map
	 * lock is released — presumably safe because the object lock pins
	 * what we need; confirm against vm_map locking rules.
	 */
	vm_object_lock(object);
	vm_map_unlock_read(map);

	while (start < end) {
		uint32_t refmod;

		m = vm_page_lookup(object,
		    start - entry->vme_start + VME_OFFSET(entry));
		if (m == VM_PAGE_NULL) {
			/* should we try to fault a page here? we can probably
			 * demand it exists and is locked for this request */
			vm_object_unlock(object);
			return KERN_FAILURE;
		}
		/* deal with special page status */
		if (m->vmp_busy ||
		    (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
			vm_object_unlock(object);
			return KERN_FAILURE;
		}

		/* Page is OK... now "validate" it */
		/* This is the place where we'll call out to create a code
		 * directory, later */
		/* XXX TODO4K: deal with 4k subpages individually? */
		m->vmp_cs_validated = VMP_CS_ALL_TRUE;

		/* The page is now "clean" for codesigning purposes. That means
		 * we don't consider it as modified (wpmapped) anymore. But
		 * we'll disconnect the page so we note any future modification
		 * attempts. */
		m->vmp_wpmapped = FALSE;
		refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));

		/* Pull the dirty status from the pmap, since we cleared the
		 * wpmapped bit */
		if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
			SET_PAGE_DIRTY(m, FALSE);
		}

		/* On to the next page */
		start += PAGE_SIZE;
	}
	vm_object_unlock(object);

	return KERN_SUCCESS;
}
#endif
20041
/*
 * vm_map_partial_reap:
 * Delete every map entry whose internal VM object is mapped only here
 * (ref_count == 1), accumulating how many resident and compressed pages
 * were reclaimed. Always returns KERN_SUCCESS.
 * NOTE(review): the counters are accumulated with "+=" — the caller is
 * presumably expected to zero them first; confirm at the call sites.
 */
kern_return_t
vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
{
	vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
	vm_map_entry_t next_entry;
	kern_return_t kr = KERN_SUCCESS;
	VM_MAP_ZAP_DECLARE(zap_list);

	vm_map_lock(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = next_entry) {
		/* capture the successor first: vm_map_delete may remove "entry" */
		next_entry = entry->vme_next;

		if (VME_OBJECT(entry) &&
		    !entry->is_sub_map &&
		    (VME_OBJECT(entry)->internal == TRUE) &&
		    (VME_OBJECT(entry)->ref_count == 1)) {
			*reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
			*reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);

			(void)vm_map_delete(map, entry->vme_start,
			    entry->vme_end,
			    VM_MAP_REMOVE_NO_YIELD,
			    &zap_list);
		}
	}

	vm_map_unlock(map);

	/* dispose of the zapped entries after dropping the map lock */
	vm_map_zap_dispose(&zap_list);

	return kr;
}
20077
20078
20079 #if DEVELOPMENT || DEBUG
20080
/*
 * vm_map_disconnect_page_mappings:
 * DEVELOPMENT/DEBUG helper: remove all pmap mappings for the map's
 * entries (optionally un-nesting shared submap regions first) and
 * return the number of pages that were resident according to the
 * phys_mem ledger, expressed in map pages.
 */
int
vm_map_disconnect_page_mappings(
	vm_map_t  map,
	boolean_t do_unnest)
{
	vm_map_entry_t entry;
	ledger_amount_t byte_count = 0;

	if (do_unnest == TRUE) {
#ifndef NO_NESTED_PMAP
		vm_map_lock(map);

		for (entry = vm_map_first_entry(map);
		    entry != vm_map_to_entry(map);
		    entry = entry->vme_next) {
			if (entry->is_sub_map && entry->use_pmap) {
				/*
				 * Make sure the range between the start of this entry and
				 * the end of this entry is no longer nested, so that
				 * we will only remove mappings from the pmap in use by
				 * this task
				 */
				vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
			}
		}
		vm_map_unlock(map);
#endif
	}
	vm_map_lock_read(map);

	/* snapshot resident bytes before tearing the mappings down */
	ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		/* skip object-less entries and physically-contiguous objects */
		if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
		    (VME_OBJECT(entry)->phys_contiguous))) {
			continue;
		}
		if (entry->is_sub_map) {
			assert(!entry->use_pmap);
		}

		pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
	}
	vm_map_unlock_read(map);

	return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
}
20130
/*
 * vm_map_inject_error:
 * DEVELOPMENT/DEBUG helper: inject a decompression error for the page
 * backing vaddr via the compressor pager. Returns KERN_MEMORY_ERROR if
 * no object backs the address, KERN_MEMORY_PRESENT if the object has no
 * pager, otherwise the pager's result.
 */
kern_return_t
vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
{
	vm_object_t object = NULL;
	vm_object_offset_t offset;
	vm_prot_t prot;
	boolean_t wired;
	vm_map_version_t version;
	vm_map_t real_map;
	int result = KERN_FAILURE;

	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
	vm_map_lock(map);

	/* on success the object comes back locked (OBJECT_LOCK_EXCLUSIVE) */
	result = vm_map_lookup_locked(&map, vaddr, VM_PROT_READ,
	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
	    NULL, &real_map, NULL);
	if (object == NULL) {
		result = KERN_MEMORY_ERROR;
	} else if (object->pager) {
		result = vm_compressor_pager_inject_error(object->pager,
		    offset);
	} else {
		result = KERN_MEMORY_PRESENT;
	}

	if (object != NULL) {
		vm_object_unlock(object);
	}

	/* the lookup may have descended into a submap; unlock it too */
	if (real_map != map) {
		vm_map_unlock(real_map);
	}
	vm_map_unlock(map);

	return result;
}
20168
20169 #endif
20170
20171
20172 #if CONFIG_FREEZE
20173
20174
20175 extern struct freezer_context freezer_context_global;
20176 AbsoluteTime c_freezer_last_yield_ts = 0;
20177
20178 extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
20179 extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
20180
/*
 * vm_map_freeze:
 * Compress ("freeze") the eligible anonymous memory of a task via the
 * compressor, up to dirty_budget pages.
 *
 * With swap-backed freezer configs this runs in two passes: an
 * evaluation pass that only tallies private vs. shared dirty pages and
 * enforces the shared-memory thresholds, then (unless eval_only) a real
 * pass that purges the task's volatile memory and pages out each
 * eligible object. Without swap, the evaluation pass is skipped and
 * eval_only is rejected.
 *
 * Outputs: the various *_count fields are zeroed on entry;
 * *freezer_error_code is set on failure. Runs with the map lock held
 * exclusively to block faults/lookups during the freeze.
 * NOTE(review): purgeable_count, clean_count and dirty_count are zeroed
 * but never incremented here — presumably filled elsewhere or vestigial.
 */
kern_return_t
vm_map_freeze(
	task_t       task,
	unsigned int *purgeable_count,
	unsigned int *wired_count,
	unsigned int *clean_count,
	unsigned int *dirty_count,
	unsigned int dirty_budget,
	unsigned int *shared_count,
	int          *freezer_error_code,
	boolean_t    eval_only)
{
	vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
	kern_return_t kr = KERN_SUCCESS;
	boolean_t evaluation_phase = TRUE;
	vm_object_t cur_shared_object = NULL;
	int cur_shared_obj_ref_cnt = 0;
	unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;

	*purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;

	/*
	 * We need the exclusive lock here so that we can
	 * block any page faults or lookups while we are
	 * in the middle of freezing this vm map.
	 */
	vm_map_t map = task->map;

	vm_map_lock(map);

	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);

	if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
		if (vm_compressor_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
		}

		if (vm_swap_low_on_space()) {
			*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
		}

		kr = KERN_NO_SPACE;
		goto done;
	}

	if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
		/*
		 * In-memory compressor backing the freezer. No disk.
		 * So no need to do the evaluation phase.
		 */
		evaluation_phase = FALSE;

		if (eval_only == TRUE) {
			/*
			 * We don't support 'eval_only' mode
			 * in this non-swap config.
			 */
			*freezer_error_code = FREEZER_ERROR_GENERIC;
			kr = KERN_INVALID_ARGUMENT;
			goto done;
		}

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);
	}
again:

	for (entry2 = vm_map_first_entry(map);
	    entry2 != vm_map_to_entry(map);
	    entry2 = entry2->vme_next) {
		vm_object_t src_object = VME_OBJECT(entry2);

		if (src_object &&
		    !entry2->is_sub_map &&
		    !src_object->phys_contiguous) {
			/* If eligible, scan the entry, moving eligible pages over to our parent object */

			if (src_object->internal == TRUE) {
				if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
					/*
					 * We skip purgeable objects during evaluation phase only.
					 * If we decide to freeze this process, we'll explicitly
					 * purge these objects before we go around again with
					 * 'evaluation_phase' set to FALSE.
					 */

					if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
						/*
						 * We want to purge objects that may not belong to this task but are mapped
						 * in this task alone. Since we already purged this task's purgeable memory
						 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
						 * on this task's purgeable objects. Hence the check for only volatile objects.
						 */
						if (evaluation_phase == FALSE &&
						    (src_object->purgable == VM_PURGABLE_VOLATILE) &&
						    (src_object->ref_count == 1)) {
							vm_object_lock(src_object);
							vm_object_purge(src_object, 0);
							vm_object_unlock(src_object);
						}
						continue;
					}

					/*
					 * Pages belonging to this object could be swapped to disk.
					 * Make sure it's not a shared object because we could end
					 * up just bringing it back in again.
					 *
					 * We try to optimize somewhat by checking for objects that are mapped
					 * more than once within our own map. But we don't do full searches,
					 * we just look at the entries following our current entry.
					 */

					if (src_object->ref_count > 1) {
						if (src_object != cur_shared_object) {
							/* first sighting of this shared object: count it as shared */
							obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
							dirty_shared_count += obj_pages_snapshot;

							cur_shared_object = src_object;
							cur_shared_obj_ref_cnt = 1;
							continue;
						} else {
							cur_shared_obj_ref_cnt++;
							if (src_object->ref_count == cur_shared_obj_ref_cnt) {
								/*
								 * Fall through to below and treat this object as private.
								 * So deduct its pages from our shared total and add it to the
								 * private total.
								 */

								dirty_shared_count -= obj_pages_snapshot;
								dirty_private_count += obj_pages_snapshot;
							} else {
								continue;
							}
						}
					}


					if (src_object->ref_count == 1) {
						dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
					}

					if (evaluation_phase == TRUE) {
						/* evaluation pass only tallies; no pageout */
						continue;
					}
				}

				uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
				*wired_count += src_object->wired_page_count;

				if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
					if (vm_compressor_low_on_space()) {
						*freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
					}

					if (vm_swap_low_on_space()) {
						*freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
					}

					kr = KERN_NO_SPACE;
					break;
				}
				if (paged_out_count >= dirty_budget) {
					break;
				}
				dirty_budget -= paged_out_count;
			}
		}
	}

	/* report shared dirty memory in MB */
	*shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
	if (evaluation_phase) {
		unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;

		if (dirty_shared_count > shared_pages_threshold) {
			*freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
			kr = KERN_FAILURE;
			goto done;
		}

		if (dirty_shared_count &&
		    ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
			*freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
			kr = KERN_FAILURE;
			goto done;
		}

		evaluation_phase = FALSE;
		dirty_shared_count = dirty_private_count = 0;

		freezer_context_global.freezer_ctx_uncompressed_pages = 0;
		clock_get_uptime(&c_freezer_last_yield_ts);

		if (eval_only) {
			kr = KERN_SUCCESS;
			goto done;
		}

		/* purge our own volatile memory before the real freeze pass */
		vm_purgeable_purge_task_owned(task);

		goto again;
	} else {
		kr = KERN_SUCCESS;
	}

done:
	vm_map_unlock(map);

	if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
		vm_object_compressed_freezer_done();
	}
	return kr;
}
20395
20396 #endif
20397
20398 /*
20399 * vm_map_entry_should_cow_for_true_share:
20400 *
20401 * Determines if the map entry should be clipped and setup for copy-on-write
20402 * to avoid applying "true_share" to a large VM object when only a subset is
20403 * targeted.
20404 *
20405 * For now, we target only the map entries created for the Objective C
20406 * Garbage Collector, which initially have the following properties:
20407 * - alias == VM_MEMORY_MALLOC
20408 * - wired_count == 0
20409 * - !needs_copy
20410 * and a VM object with:
20411 * - internal
20412 * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
20413 * - !true_share
20414 * - vo_size == ANON_CHUNK_SIZE
20415 *
20416 * Only non-kernel map entries.
20417 */
/*
 * Returns TRUE when all the criteria described in the comment above are
 * met; the caller should then clip the entry and set it up for
 * copy-on-write instead of marking the whole object "true_share".
 */
boolean_t
vm_map_entry_should_cow_for_true_share(
	vm_map_entry_t entry)
{
	vm_object_t object;

	if (entry->is_sub_map) {
		/* entry does not point at a VM object */
		return FALSE;
	}

	if (entry->needs_copy) {
		/* already set for copy_on_write: done! */
		return FALSE;
	}

	if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
	    VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
		/* not a malloc heap or Obj-C Garbage Collector heap */
		return FALSE;
	}

	if (entry->wired_count) {
		/* wired: can't change the map entry... */
		vm_counters.should_cow_but_wired++;
		return FALSE;
	}

	object = VME_OBJECT(entry);

	if (object == VM_OBJECT_NULL) {
		/* no object yet... */
		return FALSE;
	}

	if (!object->internal) {
		/* not an internal object */
		return FALSE;
	}

	if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
		/* not the default copy strategy */
		return FALSE;
	}

	if (object->true_share) {
		/* already true_share: too late to avoid it */
		return FALSE;
	}

	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
	    object->vo_size != ANON_CHUNK_SIZE) {
		/* ... not an object created for the ObjC Garbage Collector */
		return FALSE;
	}

	if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
	    object->vo_size != 2048 * 4096) {
		/* ... not a "MALLOC_SMALL" heap */
		return FALSE;
	}

	/*
	 * All the criteria match: we have a large object being targeted for "true_share".
	 * To limit the adverse side-effects linked with "true_share", tell the caller to
	 * try and avoid setting up the entire object for "true_share" by clipping the
	 * targeted range and setting it up for copy-on-write.
	 */
	return TRUE;
}
20488
/* Round offset up to the page boundary described by "mask". */
vm_map_offset_t
vm_map_round_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return VM_MAP_ROUND_PAGE(offset, mask);
}
20496
/* Truncate offset down to the page boundary described by "mask". */
vm_map_offset_t
vm_map_trunc_page_mask(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return VM_MAP_TRUNC_PAGE(offset, mask);
}
20504
/* TRUE when offset has none of the bits of "mask" set. */
boolean_t
vm_map_page_aligned(
	vm_map_offset_t offset,
	vm_map_offset_t mask)
{
	return ((offset) & mask) == 0;
}
20512
/* Return the map's page shift (log2 of its page size). */
int
vm_map_page_shift(
	vm_map_t map)
{
	return VM_MAP_PAGE_SHIFT(map);
}
20519
/* Return the map's page size in bytes. */
int
vm_map_page_size(
	vm_map_t map)
{
	return VM_MAP_PAGE_SIZE(map);
}
20526
/* Return the map's page mask (page size minus one). */
vm_map_offset_t
vm_map_page_mask(
	vm_map_t map)
{
	return VM_MAP_PAGE_MASK(map);
}
20533
20534 kern_return_t
vm_map_set_page_shift(vm_map_t map,int pageshift)20535 vm_map_set_page_shift(
20536 vm_map_t map,
20537 int pageshift)
20538 {
20539 if (map->hdr.nentries != 0) {
20540 /* too late to change page size */
20541 return KERN_FAILURE;
20542 }
20543
20544 map->hdr.page_shift = (uint16_t)pageshift;
20545
20546 return KERN_SUCCESS;
20547 }
20548
/*
 * vm_map_query_volatile:
 * Tally the virtual size, resident/compressed page counts (object view)
 * and pmap-level resident/compressed sizes of the map's writable
 * volatile/empty purgeable entries. The caller must hold the map lock;
 * it remains held on return. Always returns KERN_SUCCESS.
 */
kern_return_t
vm_map_query_volatile(
	vm_map_t       map,
	mach_vm_size_t *volatile_virtual_size_p,
	mach_vm_size_t *volatile_resident_size_p,
	mach_vm_size_t *volatile_compressed_size_p,
	mach_vm_size_t *volatile_pmap_size_p,
	mach_vm_size_t *volatile_compressed_pmap_size_p)
{
	mach_vm_size_t volatile_virtual_size;
	mach_vm_size_t volatile_resident_count;
	mach_vm_size_t volatile_compressed_count;
	mach_vm_size_t volatile_pmap_count;
	mach_vm_size_t volatile_compressed_pmap_count;
	mach_vm_size_t resident_count;
	vm_map_entry_t entry;
	vm_object_t object;

	/* map should be locked by caller */

	volatile_virtual_size = 0;
	volatile_resident_count = 0;
	volatile_compressed_count = 0;
	volatile_pmap_count = 0;
	volatile_compressed_pmap_count = 0;

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes;

		if (entry->is_sub_map) {
			continue;
		}
		if (!(entry->protection & VM_PROT_WRITE)) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == VM_OBJECT_NULL) {
			continue;
		}
		if (object->purgable != VM_PURGABLE_VOLATILE &&
		    object->purgable != VM_PURGABLE_EMPTY) {
			continue;
		}
		if (VME_OFFSET(entry)) {
			/*
			 * If the map entry has been split and the object now
			 * appears several times in the VM map, we don't want
			 * to count the object's resident_page_count more than
			 * once. We count it only for the first one, starting
			 * at offset 0 and ignore the other VM map entries.
			 */
			continue;
		}
		/*
		 * NOTE(review): VME_OFFSET(entry) is always 0 past the
		 * "continue" above, so this adjustment is effectively dead
		 * code kept for safety.
		 */
		resident_count = object->resident_page_count;
		if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
			resident_count = 0;
		} else {
			resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
		}

		volatile_virtual_size += entry->vme_end - entry->vme_start;
		volatile_resident_count += resident_count;
		if (object->pager) {
			volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
		}
		pmap_compressed_bytes = 0;
		pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
		volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
		volatile_compressed_pmap_count += (pmap_compressed_bytes
		    / PAGE_SIZE);
	}

	/* map is still locked on return */

	*volatile_virtual_size_p = volatile_virtual_size;
	*volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
	*volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
	*volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
	*volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;

	return KERN_SUCCESS;
}
20638
20639 void
vm_map_sizes(vm_map_t map,vm_map_size_t * psize,vm_map_size_t * pfree,vm_map_size_t * plargest_free)20640 vm_map_sizes(vm_map_t map,
20641 vm_map_size_t * psize,
20642 vm_map_size_t * pfree,
20643 vm_map_size_t * plargest_free)
20644 {
20645 vm_map_entry_t entry;
20646 vm_map_offset_t prev;
20647 vm_map_size_t free, total_free, largest_free;
20648 boolean_t end;
20649
20650 if (!map) {
20651 *psize = *pfree = *plargest_free = 0;
20652 return;
20653 }
20654 total_free = largest_free = 0;
20655
20656 vm_map_lock_read(map);
20657 if (psize) {
20658 *psize = map->max_offset - map->min_offset;
20659 }
20660
20661 prev = map->min_offset;
20662 for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
20663 end = (entry == vm_map_to_entry(map));
20664
20665 if (end) {
20666 free = entry->vme_end - prev;
20667 } else {
20668 free = entry->vme_start - prev;
20669 }
20670
20671 total_free += free;
20672 if (free > largest_free) {
20673 largest_free = free;
20674 }
20675
20676 if (end) {
20677 break;
20678 }
20679 prev = entry->vme_end;
20680 }
20681 vm_map_unlock_read(map);
20682 if (pfree) {
20683 *pfree = total_free;
20684 }
20685 if (plargest_free) {
20686 *plargest_free = largest_free;
20687 }
20688 }
20689
#if VM_SCAN_FOR_SHADOW_CHAIN
int vm_map_shadow_max(vm_map_t map);
/*
 * vm_map_shadow_max:
 * Return the length of the longest object shadow chain reachable from
 * any of the map's entries (0 for a NULL map or a map with no shadowed
 * objects).
 */
int
vm_map_shadow_max(
	vm_map_t map)
{
	int shadows, shadows_max;
	vm_map_entry_t entry;
	vm_object_t object, next_object;

	if (map == NULL) {
		return 0;
	}

	shadows_max = 0;

	vm_map_lock_read(map);

	for (entry = vm_map_first_entry(map);
	    entry != vm_map_to_entry(map);
	    entry = entry->vme_next) {
		if (entry->is_sub_map) {
			continue;
		}
		object = VME_OBJECT(entry);
		if (object == NULL) {
			continue;
		}
		vm_object_lock_shared(object);
		/* walk the chain hand-over-hand so each link stays stable */
		for (shadows = 0;
		    object->shadow != NULL;
		    shadows++, object = next_object) {
			next_object = object->shadow;
			vm_object_lock_shared(next_object);
			vm_object_unlock(object);
		}
		vm_object_unlock(object);
		if (shadows > shadows_max) {
			shadows_max = shadows;
		}
	}

	vm_map_unlock_read(map);

	return shadows_max;
}
#endif /* VM_SCAN_FOR_SHADOW_CHAIN */
20737
/*
 * Tell the pmap layer where the map's page-zero reservation ends so it
 * can optimize accordingly.
 */
void
vm_commit_pagezero_status(vm_map_t lmap)
{
	pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
}
20743
#if XNU_TARGET_OS_OSX
/*
 * Record the address above which "high" allocations should start for
 * this map.
 * NOTE(review): written without taking the map lock — presumably set
 * only during map setup before it is visible to others; confirm.
 */
void
vm_map_set_high_start(
	vm_map_t        map,
	vm_map_offset_t high_start)
{
	map->vmmap_high_start = high_start;
}
#endif /* XNU_TARGET_OS_OSX */
20753
20754
20755 /*
20756 * FORKED CORPSE FOOTPRINT
20757 *
20758 * A forked corpse gets a copy of the original VM map but its pmap is mostly
20759 * empty since it never ran and never got to fault in any pages.
20760 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
20761 * a forked corpse would therefore return very little information.
20762 *
20763 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
20764 * to vm_map_fork() to collect footprint information from the original VM map
20765 * and its pmap, and store it in the forked corpse's VM map. That information
 * is stored in place of the VM map's "hole list" since we'll never need
 * to look up holes in the corpse's map.
20768 *
20769 * The corpse's footprint info looks like this:
20770 *
20771 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
20772 * as follows:
20773 * +---------------------------------------+
20774 * header-> | cf_size |
20775 * +-------------------+-------------------+
20776 * | cf_last_region | cf_last_zeroes |
20777 * +-------------------+-------------------+
20778 * region1-> | cfr_vaddr |
20779 * +-------------------+-------------------+
20780 * | cfr_num_pages | d0 | d1 | d2 | d3 |
20781 * +---------------------------------------+
20782 * | d4 | d5 | ... |
20783 * +---------------------------------------+
20784 * | ... |
20785 * +-------------------+-------------------+
20786 * | dy | dz | na | na | cfr_vaddr... | <-region2
20787 * +-------------------+-------------------+
20788 * | cfr_vaddr (ctd) | cfr_num_pages |
20789 * +---------------------------------------+
20790 * | d0 | d1 ... |
20791 * +---------------------------------------+
20792 * ...
20793 * +---------------------------------------+
20794 * last region-> | cfr_vaddr |
20795 * +---------------------------------------+
20796 * + cfr_num_pages | d0 | d1 | d2 | d3 |
20797 * +---------------------------------------+
20798 * ...
20799 * +---------------------------------------+
20800 * | dx | dy | dz | na | na | na | na | na |
20801 * +---------------------------------------+
20802 *
20803 * where:
20804 * cf_size: total size of the buffer (rounded to page size)
20805 * cf_last_region: offset in the buffer of the last "region" sub-header
20806 * cf_last_zeroes: number of trailing "zero" dispositions at the end
20807 * of last region
20808 * cfr_vaddr: virtual address of the start of the covered "region"
20809 * cfr_num_pages: number of pages in the covered "region"
20810 * d*: disposition of the page at that virtual address
20811 * Regions in the buffer are word-aligned.
20812 *
20813 * We estimate the size of the buffer based on the number of memory regions
20814 * and the virtual size of the address space. While copying each memory region
20815 * during vm_map_fork(), we also collect the footprint info for that region
20816 * and store it in the buffer, packing it as much as possible (coalescing
20817 * contiguous memory regions to avoid having too many region headers and
20818 * avoiding long streaks of "zero" page dispositions by splitting footprint
20819 * "regions", so the number of regions in the footprint buffer might not match
20820 * the number of memory regions in the address space.
20821 *
20822 * We also have to copy the original task's "nonvolatile" ledgers since that's
20823 * part of the footprint and will need to be reported to any tool asking for
20824 * the footprint information of the forked corpse.
20825 */
20826
/* corpse footprint collection statistics */
uint64_t vm_map_corpse_footprint_count = 0;    /* footprints completed */
uint64_t vm_map_corpse_footprint_size_avg = 0; /* running average of actual footprint sizes */
uint64_t vm_map_corpse_footprint_size_max = 0; /* largest actual footprint size seen */
uint64_t vm_map_corpse_footprint_full = 0;     /* collections aborted: buffer full */
uint64_t vm_map_corpse_footprint_no_buf = 0;   /* collections aborted: buffer allocation failed */
20832
/*
 * Header at the start of a corpse footprint buffer (pointed to by
 * vm_map->vmmap_corpse_footprint); it is immediately followed by the
 * first "struct vm_map_corpse_footprint_region".
 * The union is used for two mutually-exclusive phases:
 * "cf_last_zeroes" only while the footprint is being collected,
 * "cf_hint_region" only once it is being looked up.
 */
struct vm_map_corpse_footprint_header {
	vm_size_t cf_size;      /* allocated buffer size */
	uint32_t cf_last_region; /* offset of last region in buffer */
	union {
		uint32_t cfu_last_zeroes; /* during creation:
		                           * number of "zero" dispositions at
		                           * end of last region */
		uint32_t cfu_hint_region; /* during lookup:
		                           * offset of last looked up region */
#define cf_last_zeroes cfu.cfu_last_zeroes
#define cf_hint_region cfu.cfu_hint_region
	} cfu;
};
/* compact 8-bit encoding of a page disposition (see the converters below) */
typedef uint8_t cf_disp_t;
struct vm_map_corpse_footprint_region {
	vm_map_offset_t cfr_vaddr; /* region start virtual address */
	uint32_t cfr_num_pages; /* number of pages in this "region" */
	cf_disp_t cfr_disposition[0]; /* disposition of each page */
} __attribute__((packed));
20852
20853 static cf_disp_t
vm_page_disposition_to_cf_disp(int disposition)20854 vm_page_disposition_to_cf_disp(
20855 int disposition)
20856 {
20857 assert(sizeof(cf_disp_t) == 1);
20858 /* relocate bits that don't fit in a "uint8_t" */
20859 if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
20860 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20861 }
20862 /* cast gets rid of extra bits */
20863 return (cf_disp_t) disposition;
20864 }
20865
20866 static int
vm_page_cf_disp_to_disposition(cf_disp_t cf_disp)20867 vm_page_cf_disp_to_disposition(
20868 cf_disp_t cf_disp)
20869 {
20870 int disposition;
20871
20872 assert(sizeof(cf_disp_t) == 1);
20873 disposition = (int) cf_disp;
20874 /* move relocated bits back in place */
20875 if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
20876 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20877 disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
20878 }
20879 return disposition;
20880 }
20881
20882 /*
20883 * vm_map_corpse_footprint_new_region:
20884 * closes the current footprint "region" and creates a new one
20885 *
20886 * Returns NULL if there's not enough space in the buffer for a new region.
20887 */
20888 static struct vm_map_corpse_footprint_region *
vm_map_corpse_footprint_new_region(struct vm_map_corpse_footprint_header * footprint_header)20889 vm_map_corpse_footprint_new_region(
20890 struct vm_map_corpse_footprint_header *footprint_header)
20891 {
20892 uintptr_t footprint_edge;
20893 uint32_t new_region_offset;
20894 struct vm_map_corpse_footprint_region *footprint_region;
20895 struct vm_map_corpse_footprint_region *new_footprint_region;
20896
20897 footprint_edge = ((uintptr_t)footprint_header +
20898 footprint_header->cf_size);
20899 footprint_region = ((struct vm_map_corpse_footprint_region *)
20900 ((char *)footprint_header +
20901 footprint_header->cf_last_region));
20902 assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
20903 footprint_edge);
20904
20905 /* get rid of trailing zeroes in the last region */
20906 assert(footprint_region->cfr_num_pages >=
20907 footprint_header->cf_last_zeroes);
20908 footprint_region->cfr_num_pages -=
20909 footprint_header->cf_last_zeroes;
20910 footprint_header->cf_last_zeroes = 0;
20911
20912 /* reuse this region if it's now empty */
20913 if (footprint_region->cfr_num_pages == 0) {
20914 return footprint_region;
20915 }
20916
20917 /* compute offset of new region */
20918 new_region_offset = footprint_header->cf_last_region;
20919 new_region_offset += sizeof(*footprint_region);
20920 new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
20921 new_region_offset = roundup(new_region_offset, sizeof(int));
20922
20923 /* check if we're going over the edge */
20924 if (((uintptr_t)footprint_header +
20925 new_region_offset +
20926 sizeof(*footprint_region)) >=
20927 footprint_edge) {
20928 /* over the edge: no new region */
20929 return NULL;
20930 }
20931
20932 /* adjust offset of last region in header */
20933 footprint_header->cf_last_region = new_region_offset;
20934
20935 new_footprint_region = (struct vm_map_corpse_footprint_region *)
20936 ((char *)footprint_header +
20937 footprint_header->cf_last_region);
20938 new_footprint_region->cfr_vaddr = 0;
20939 new_footprint_region->cfr_num_pages = 0;
20940 /* caller needs to initialize new region */
20941
20942 return new_footprint_region;
20943 }
20944
20945 /*
20946 * vm_map_corpse_footprint_collect:
20947 * collect footprint information for "old_entry" in "old_map" and
20948 * stores it in "new_map"'s vmmap_footprint_info.
20949 */
20950 kern_return_t
vm_map_corpse_footprint_collect(vm_map_t old_map,vm_map_entry_t old_entry,vm_map_t new_map)20951 vm_map_corpse_footprint_collect(
20952 vm_map_t old_map,
20953 vm_map_entry_t old_entry,
20954 vm_map_t new_map)
20955 {
20956 vm_map_offset_t va;
20957 kern_return_t kr;
20958 struct vm_map_corpse_footprint_header *footprint_header;
20959 struct vm_map_corpse_footprint_region *footprint_region;
20960 struct vm_map_corpse_footprint_region *new_footprint_region;
20961 cf_disp_t *next_disp_p;
20962 uintptr_t footprint_edge;
20963 uint32_t num_pages_tmp;
20964 int effective_page_size;
20965
20966 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
20967
20968 va = old_entry->vme_start;
20969
20970 vm_map_lock_assert_exclusive(old_map);
20971 vm_map_lock_assert_exclusive(new_map);
20972
20973 assert(new_map->has_corpse_footprint);
20974 assert(!old_map->has_corpse_footprint);
20975 if (!new_map->has_corpse_footprint ||
20976 old_map->has_corpse_footprint) {
20977 /*
20978 * This can only transfer footprint info from a
20979 * map with a live pmap to a map with a corpse footprint.
20980 */
20981 return KERN_NOT_SUPPORTED;
20982 }
20983
20984 if (new_map->vmmap_corpse_footprint == NULL) {
20985 vm_offset_t buf;
20986 vm_size_t buf_size;
20987
20988 buf = 0;
20989 buf_size = (sizeof(*footprint_header) +
20990 (old_map->hdr.nentries
20991 *
20992 (sizeof(*footprint_region) +
20993 +3)) /* potential alignment for each region */
20994 +
20995 ((old_map->size / effective_page_size)
20996 *
20997 sizeof(cf_disp_t))); /* disposition for each page */
20998 // printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
20999 buf_size = round_page(buf_size);
21000
21001 /* limit buffer to 1 page to validate overflow detection */
21002 // buf_size = PAGE_SIZE;
21003
21004 /* limit size to a somewhat sane amount */
21005 #if XNU_TARGET_OS_OSX
21006 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */
21007 #else /* XNU_TARGET_OS_OSX */
21008 #define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */
21009 #endif /* XNU_TARGET_OS_OSX */
21010 if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
21011 buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
21012 }
21013
21014 /*
21015 * Allocate the pageable buffer (with a trailing guard page).
21016 * It will be zero-filled on demand.
21017 */
21018 kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
21019 KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
21020 VM_KERN_MEMORY_DIAG);
21021 if (kr != KERN_SUCCESS) {
21022 vm_map_corpse_footprint_no_buf++;
21023 return kr;
21024 }
21025
21026 /* initialize header and 1st region */
21027 footprint_header = (struct vm_map_corpse_footprint_header *)buf;
21028 new_map->vmmap_corpse_footprint = footprint_header;
21029
21030 footprint_header->cf_size = buf_size;
21031 footprint_header->cf_last_region =
21032 sizeof(*footprint_header);
21033 footprint_header->cf_last_zeroes = 0;
21034
21035 footprint_region = (struct vm_map_corpse_footprint_region *)
21036 ((char *)footprint_header +
21037 footprint_header->cf_last_region);
21038 footprint_region->cfr_vaddr = 0;
21039 footprint_region->cfr_num_pages = 0;
21040 } else {
21041 /* retrieve header and last region */
21042 footprint_header = (struct vm_map_corpse_footprint_header *)
21043 new_map->vmmap_corpse_footprint;
21044 footprint_region = (struct vm_map_corpse_footprint_region *)
21045 ((char *)footprint_header +
21046 footprint_header->cf_last_region);
21047 }
21048 footprint_edge = ((uintptr_t)footprint_header +
21049 footprint_header->cf_size);
21050
21051 if ((footprint_region->cfr_vaddr +
21052 (((vm_map_offset_t)footprint_region->cfr_num_pages) *
21053 effective_page_size))
21054 != old_entry->vme_start) {
21055 uint64_t num_pages_delta, num_pages_delta_size;
21056 uint32_t region_offset_delta_size;
21057
21058 /*
21059 * Not the next contiguous virtual address:
21060 * start a new region or store "zero" dispositions for
21061 * the missing pages?
21062 */
21063 /* size of gap in actual page dispositions */
21064 num_pages_delta = ((old_entry->vme_start -
21065 footprint_region->cfr_vaddr) / effective_page_size)
21066 - footprint_region->cfr_num_pages;
21067 num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
21068 /* size of gap as a new footprint region header */
21069 region_offset_delta_size =
21070 (sizeof(*footprint_region) +
21071 roundup(((footprint_region->cfr_num_pages -
21072 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
21073 sizeof(int)) -
21074 ((footprint_region->cfr_num_pages -
21075 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
21076 // printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
21077 if (region_offset_delta_size < num_pages_delta_size ||
21078 os_add3_overflow(footprint_region->cfr_num_pages,
21079 (uint32_t) num_pages_delta,
21080 1,
21081 &num_pages_tmp)) {
21082 /*
21083 * Storing data for this gap would take more space
21084 * than inserting a new footprint region header:
21085 * let's start a new region and save space. If it's a
21086 * tie, let's avoid using a new region, since that
21087 * would require more region hops to find the right
21088 * range during lookups.
21089 *
21090 * If the current region's cfr_num_pages would overflow
21091 * if we added "zero" page dispositions for the gap,
21092 * no choice but to start a new region.
21093 */
21094 // printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
21095 new_footprint_region =
21096 vm_map_corpse_footprint_new_region(footprint_header);
21097 /* check that we're not going over the edge */
21098 if (new_footprint_region == NULL) {
21099 goto over_the_edge;
21100 }
21101 footprint_region = new_footprint_region;
21102 /* initialize new region as empty */
21103 footprint_region->cfr_vaddr = old_entry->vme_start;
21104 footprint_region->cfr_num_pages = 0;
21105 } else {
21106 /*
21107 * Store "zero" page dispositions for the missing
21108 * pages.
21109 */
21110 // printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
21111 for (; num_pages_delta > 0; num_pages_delta--) {
21112 next_disp_p = (cf_disp_t *)
21113 ((uintptr_t) footprint_region +
21114 sizeof(*footprint_region));
21115 next_disp_p += footprint_region->cfr_num_pages;
21116 /* check that we're not going over the edge */
21117 if ((uintptr_t)next_disp_p >= footprint_edge) {
21118 goto over_the_edge;
21119 }
21120 /* store "zero" disposition for this gap page */
21121 footprint_region->cfr_num_pages++;
21122 *next_disp_p = (cf_disp_t) 0;
21123 footprint_header->cf_last_zeroes++;
21124 }
21125 }
21126 }
21127
21128 for (va = old_entry->vme_start;
21129 va < old_entry->vme_end;
21130 va += effective_page_size) {
21131 int disposition;
21132 cf_disp_t cf_disp;
21133
21134 vm_map_footprint_query_page_info(old_map,
21135 old_entry,
21136 va,
21137 &disposition);
21138 cf_disp = vm_page_disposition_to_cf_disp(disposition);
21139
21140 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
21141
21142 if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
21143 /*
21144 * Ignore "zero" dispositions at start of
21145 * region: just move start of region.
21146 */
21147 footprint_region->cfr_vaddr += effective_page_size;
21148 continue;
21149 }
21150
21151 /* would region's cfr_num_pages overflow? */
21152 if (os_add_overflow(footprint_region->cfr_num_pages, 1,
21153 &num_pages_tmp)) {
21154 /* overflow: create a new region */
21155 new_footprint_region =
21156 vm_map_corpse_footprint_new_region(
21157 footprint_header);
21158 if (new_footprint_region == NULL) {
21159 goto over_the_edge;
21160 }
21161 footprint_region = new_footprint_region;
21162 footprint_region->cfr_vaddr = va;
21163 footprint_region->cfr_num_pages = 0;
21164 }
21165
21166 next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
21167 sizeof(*footprint_region));
21168 next_disp_p += footprint_region->cfr_num_pages;
21169 /* check that we're not going over the edge */
21170 if ((uintptr_t)next_disp_p >= footprint_edge) {
21171 goto over_the_edge;
21172 }
21173 /* store this dispostion */
21174 *next_disp_p = cf_disp;
21175 footprint_region->cfr_num_pages++;
21176
21177 if (cf_disp != 0) {
21178 /* non-zero disp: break the current zero streak */
21179 footprint_header->cf_last_zeroes = 0;
21180 /* done */
21181 continue;
21182 }
21183
21184 /* zero disp: add to the current streak of zeroes */
21185 footprint_header->cf_last_zeroes++;
21186 if ((footprint_header->cf_last_zeroes +
21187 roundup(((footprint_region->cfr_num_pages -
21188 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
21189 (sizeof(int) - 1),
21190 sizeof(int))) <
21191 (sizeof(*footprint_header))) {
21192 /*
21193 * There are not enough trailing "zero" dispositions
21194 * (+ the extra padding we would need for the previous
21195 * region); creating a new region would not save space
21196 * at this point, so let's keep this "zero" disposition
21197 * in this region and reconsider later.
21198 */
21199 continue;
21200 }
21201 /*
21202 * Create a new region to avoid having too many consecutive
21203 * "zero" dispositions.
21204 */
21205 new_footprint_region =
21206 vm_map_corpse_footprint_new_region(footprint_header);
21207 if (new_footprint_region == NULL) {
21208 goto over_the_edge;
21209 }
21210 footprint_region = new_footprint_region;
21211 /* initialize the new region as empty ... */
21212 footprint_region->cfr_num_pages = 0;
21213 /* ... and skip this "zero" disp */
21214 footprint_region->cfr_vaddr = va + effective_page_size;
21215 }
21216
21217 return KERN_SUCCESS;
21218
21219 over_the_edge:
21220 // printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
21221 vm_map_corpse_footprint_full++;
21222 return KERN_RESOURCE_SHORTAGE;
21223 }
21224
21225 /*
21226 * vm_map_corpse_footprint_collect_done:
21227 * completes the footprint collection by getting rid of any remaining
21228 * trailing "zero" dispositions and trimming the unused part of the
21229 * kernel buffer
21230 */
21231 void
vm_map_corpse_footprint_collect_done(vm_map_t new_map)21232 vm_map_corpse_footprint_collect_done(
21233 vm_map_t new_map)
21234 {
21235 struct vm_map_corpse_footprint_header *footprint_header;
21236 struct vm_map_corpse_footprint_region *footprint_region;
21237 vm_size_t buf_size, actual_size;
21238 kern_return_t kr;
21239
21240 assert(new_map->has_corpse_footprint);
21241 if (!new_map->has_corpse_footprint ||
21242 new_map->vmmap_corpse_footprint == NULL) {
21243 return;
21244 }
21245
21246 footprint_header = (struct vm_map_corpse_footprint_header *)
21247 new_map->vmmap_corpse_footprint;
21248 buf_size = footprint_header->cf_size;
21249
21250 footprint_region = (struct vm_map_corpse_footprint_region *)
21251 ((char *)footprint_header +
21252 footprint_header->cf_last_region);
21253
21254 /* get rid of trailing zeroes in last region */
21255 assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
21256 footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
21257 footprint_header->cf_last_zeroes = 0;
21258
21259 actual_size = (vm_size_t)(footprint_header->cf_last_region +
21260 sizeof(*footprint_region) +
21261 (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
21262
21263 // printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
21264 vm_map_corpse_footprint_size_avg =
21265 (((vm_map_corpse_footprint_size_avg *
21266 vm_map_corpse_footprint_count) +
21267 actual_size) /
21268 (vm_map_corpse_footprint_count + 1));
21269 vm_map_corpse_footprint_count++;
21270 if (actual_size > vm_map_corpse_footprint_size_max) {
21271 vm_map_corpse_footprint_size_max = actual_size;
21272 }
21273
21274 actual_size = round_page(actual_size);
21275 if (buf_size > actual_size) {
21276 kr = vm_deallocate(kernel_map,
21277 ((vm_address_t)footprint_header +
21278 actual_size +
21279 PAGE_SIZE), /* trailing guard page */
21280 (buf_size - actual_size));
21281 assertf(kr == KERN_SUCCESS,
21282 "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
21283 footprint_header,
21284 (uint64_t) buf_size,
21285 (uint64_t) actual_size,
21286 kr);
21287 kr = vm_protect(kernel_map,
21288 ((vm_address_t)footprint_header +
21289 actual_size),
21290 PAGE_SIZE,
21291 FALSE, /* set_maximum */
21292 VM_PROT_NONE);
21293 assertf(kr == KERN_SUCCESS,
21294 "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
21295 footprint_header,
21296 (uint64_t) buf_size,
21297 (uint64_t) actual_size,
21298 kr);
21299 }
21300
21301 footprint_header->cf_size = actual_size;
21302 }
21303
21304 /*
21305 * vm_map_corpse_footprint_query_page_info:
21306 * retrieves the disposition of the page at virtual address "vaddr"
21307 * in the forked corpse's VM map
21308 *
21309 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
21310 */
21311 kern_return_t
vm_map_corpse_footprint_query_page_info(vm_map_t map,vm_map_offset_t va,int * disposition_p)21312 vm_map_corpse_footprint_query_page_info(
21313 vm_map_t map,
21314 vm_map_offset_t va,
21315 int *disposition_p)
21316 {
21317 struct vm_map_corpse_footprint_header *footprint_header;
21318 struct vm_map_corpse_footprint_region *footprint_region;
21319 uint32_t footprint_region_offset;
21320 vm_map_offset_t region_start, region_end;
21321 int disp_idx;
21322 kern_return_t kr;
21323 int effective_page_size;
21324 cf_disp_t cf_disp;
21325
21326 if (!map->has_corpse_footprint) {
21327 *disposition_p = 0;
21328 kr = KERN_INVALID_ARGUMENT;
21329 goto done;
21330 }
21331
21332 footprint_header = map->vmmap_corpse_footprint;
21333 if (footprint_header == NULL) {
21334 *disposition_p = 0;
21335 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
21336 kr = KERN_INVALID_ARGUMENT;
21337 goto done;
21338 }
21339
21340 /* start looking at the hint ("cf_hint_region") */
21341 footprint_region_offset = footprint_header->cf_hint_region;
21342
21343 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
21344
21345 lookup_again:
21346 if (footprint_region_offset < sizeof(*footprint_header)) {
21347 /* hint too low: start from 1st region */
21348 footprint_region_offset = sizeof(*footprint_header);
21349 }
21350 if (footprint_region_offset >= footprint_header->cf_last_region) {
21351 /* hint too high: re-start from 1st region */
21352 footprint_region_offset = sizeof(*footprint_header);
21353 }
21354 footprint_region = (struct vm_map_corpse_footprint_region *)
21355 ((char *)footprint_header + footprint_region_offset);
21356 region_start = footprint_region->cfr_vaddr;
21357 region_end = (region_start +
21358 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
21359 effective_page_size));
21360 if (va < region_start &&
21361 footprint_region_offset != sizeof(*footprint_header)) {
21362 /* our range starts before the hint region */
21363
21364 /* reset the hint (in a racy way...) */
21365 footprint_header->cf_hint_region = sizeof(*footprint_header);
21366 /* lookup "va" again from 1st region */
21367 footprint_region_offset = sizeof(*footprint_header);
21368 goto lookup_again;
21369 }
21370
21371 while (va >= region_end) {
21372 if (footprint_region_offset >= footprint_header->cf_last_region) {
21373 break;
21374 }
21375 /* skip the region's header */
21376 footprint_region_offset += sizeof(*footprint_region);
21377 /* skip the region's page dispositions */
21378 footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
21379 /* align to next word boundary */
21380 footprint_region_offset =
21381 roundup(footprint_region_offset,
21382 sizeof(int));
21383 footprint_region = (struct vm_map_corpse_footprint_region *)
21384 ((char *)footprint_header + footprint_region_offset);
21385 region_start = footprint_region->cfr_vaddr;
21386 region_end = (region_start +
21387 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
21388 effective_page_size));
21389 }
21390 if (va < region_start || va >= region_end) {
21391 /* page not found */
21392 *disposition_p = 0;
21393 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
21394 kr = KERN_SUCCESS;
21395 goto done;
21396 }
21397
21398 /* "va" found: set the lookup hint for next lookup (in a racy way...) */
21399 footprint_header->cf_hint_region = footprint_region_offset;
21400
21401 /* get page disposition for "va" in this region */
21402 disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
21403 cf_disp = footprint_region->cfr_disposition[disp_idx];
21404 *disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
21405 kr = KERN_SUCCESS;
21406 done:
21407 // if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
21408 /* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
21409 DTRACE_VM4(footprint_query_page_info,
21410 vm_map_t, map,
21411 vm_map_offset_t, va,
21412 int, *disposition_p,
21413 kern_return_t, kr);
21414
21415 return kr;
21416 }
21417
21418 void
vm_map_corpse_footprint_destroy(vm_map_t map)21419 vm_map_corpse_footprint_destroy(
21420 vm_map_t map)
21421 {
21422 if (map->has_corpse_footprint &&
21423 map->vmmap_corpse_footprint != 0) {
21424 struct vm_map_corpse_footprint_header *footprint_header;
21425 vm_size_t buf_size;
21426 kern_return_t kr;
21427
21428 footprint_header = map->vmmap_corpse_footprint;
21429 buf_size = footprint_header->cf_size;
21430 kr = vm_deallocate(kernel_map,
21431 (vm_offset_t) map->vmmap_corpse_footprint,
21432 ((vm_size_t) buf_size
21433 + PAGE_SIZE)); /* trailing guard page */
21434 assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
21435 map->vmmap_corpse_footprint = 0;
21436 map->has_corpse_footprint = FALSE;
21437 }
21438 }
21439
21440 /*
21441 * vm_map_copy_footprint_ledgers:
21442 * copies any ledger that's relevant to the memory footprint of "old_task"
21443 * into the forked corpse's task ("new_task")
21444 */
21445 void
vm_map_copy_footprint_ledgers(task_t old_task,task_t new_task)21446 vm_map_copy_footprint_ledgers(
21447 task_t old_task,
21448 task_t new_task)
21449 {
21450 vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
21451 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
21452 vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
21453 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
21454 vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
21455 vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
21456 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
21457 vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
21458 vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
21459 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
21460 vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
21461 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
21462 vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
21463 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
21464 vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
21465 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
21466 vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
21467 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
21468 vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
21469 vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
21470 }
21471
21472 /*
21473 * vm_map_copy_ledger:
21474 * copy a single ledger from "old_task" to "new_task"
21475 */
21476 void
vm_map_copy_ledger(task_t old_task,task_t new_task,int ledger_entry)21477 vm_map_copy_ledger(
21478 task_t old_task,
21479 task_t new_task,
21480 int ledger_entry)
21481 {
21482 ledger_amount_t old_balance, new_balance, delta;
21483
21484 assert(new_task->map->has_corpse_footprint);
21485 if (!new_task->map->has_corpse_footprint) {
21486 return;
21487 }
21488
21489 /* turn off sanity checks for the ledger we're about to mess with */
21490 ledger_disable_panic_on_negative(new_task->ledger,
21491 ledger_entry);
21492
21493 /* adjust "new_task" to match "old_task" */
21494 ledger_get_balance(old_task->ledger,
21495 ledger_entry,
21496 &old_balance);
21497 ledger_get_balance(new_task->ledger,
21498 ledger_entry,
21499 &new_balance);
21500 if (new_balance == old_balance) {
21501 /* new == old: done */
21502 } else if (new_balance > old_balance) {
21503 /* new > old ==> new -= new - old */
21504 delta = new_balance - old_balance;
21505 ledger_debit(new_task->ledger,
21506 ledger_entry,
21507 delta);
21508 } else {
21509 /* new < old ==> new += old - new */
21510 delta = old_balance - new_balance;
21511 ledger_credit(new_task->ledger,
21512 ledger_entry,
21513 delta);
21514 }
21515 }
21516
21517 /*
21518 * vm_map_get_pmap:
21519 * returns the pmap associated with the vm_map
21520 */
21521 pmap_t
vm_map_get_pmap(vm_map_t map)21522 vm_map_get_pmap(vm_map_t map)
21523 {
21524 return vm_map_pmap(map);
21525 }
21526
#if MACH_ASSERT

extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

/*
 * LEDGER_DRIFT:
 * declares, for one ledger, counters of how many times its balance was
 * found over/under zero, plus the running totals and worst-case amounts.
 */
#define LEDGER_DRIFT(__LEDGER)                  \
	int __LEDGER##_over;                    \
	ledger_amount_t __LEDGER##_over_total;  \
	ledger_amount_t __LEDGER##_over_max;    \
	int __LEDGER##_under;                   \
	ledger_amount_t __LEDGER##_under_total; \
	ledger_amount_t __LEDGER##_under_max

/*
 * Global accumulator of non-zero ledger balances observed by
 * vm_map_pmap_check_ledgers() across all checked pmaps.
 */
struct {
	uint64_t num_pmaps_checked;

	LEDGER_DRIFT(phys_footprint);
	LEDGER_DRIFT(internal);
	LEDGER_DRIFT(internal_compressed);
	LEDGER_DRIFT(external);
	LEDGER_DRIFT(reusable);
	LEDGER_DRIFT(iokit_mapped);
	LEDGER_DRIFT(alternate_accounting);
	LEDGER_DRIFT(alternate_accounting_compressed);
	LEDGER_DRIFT(page_table);
	LEDGER_DRIFT(purgeable_volatile);
	LEDGER_DRIFT(purgeable_nonvolatile);
	LEDGER_DRIFT(purgeable_volatile_compressed);
	LEDGER_DRIFT(purgeable_nonvolatile_compressed);
	LEDGER_DRIFT(tagged_nofootprint);
	LEDGER_DRIFT(tagged_footprint);
	LEDGER_DRIFT(tagged_nofootprint_compressed);
	LEDGER_DRIFT(tagged_footprint_compressed);
	LEDGER_DRIFT(network_volatile);
	LEDGER_DRIFT(network_nonvolatile);
	LEDGER_DRIFT(network_volatile_compressed);
	LEDGER_DRIFT(network_nonvolatile_compressed);
	LEDGER_DRIFT(media_nofootprint);
	LEDGER_DRIFT(media_footprint);
	LEDGER_DRIFT(media_nofootprint_compressed);
	LEDGER_DRIFT(media_footprint_compressed);
	LEDGER_DRIFT(graphics_nofootprint);
	LEDGER_DRIFT(graphics_footprint);
	LEDGER_DRIFT(graphics_nofootprint_compressed);
	LEDGER_DRIFT(graphics_footprint_compressed);
	LEDGER_DRIFT(neural_nofootprint);
	LEDGER_DRIFT(neural_footprint);
	LEDGER_DRIFT(neural_nofootprint_compressed);
	LEDGER_DRIFT(neural_footprint_compressed);
} pmap_ledgers_drift;
21577
/*
 * vm_map_pmap_check_ledgers:
 *
 * MACH_ASSERT-only audit of the pmap-maintained task ledgers, run when a
 * pmap is being torn down.  At that point every such ledger entry should
 * have a zero balance; any non-zero balance is logged, accumulated into
 * the global pmap_ledgers_drift statistics, and -- depending on the
 * ledger's panic-on-negative flag and the pmap_ledgers_panic /
 * pmap_ledgers_panic_leeway settings -- may panic.
 *
 * pmap:      the pmap being destroyed (used only in the log/panic message).
 * ledger:    the owning task's ledger to audit.
 * pid, procname: identify the owning task in log output.
 */
void
vm_map_pmap_check_ledgers(
	pmap_t pmap,
	ledger_t ledger,
	int pid,
	char *procname)
{
	ledger_amount_t bal;
	boolean_t do_panic;

	do_panic = FALSE;

	pmap_ledgers_drift.num_pmaps_checked++;

/*
 * LEDGER_CHECK_BALANCE(name): fetch the balance of task ledger "name".
 * If non-zero: decide whether this warrants a panic -- either the ledger
 * is flagged panic-on-negative, or pmap_ledgers_panic is set and the
 * imbalance exceeds the configured leeway (in pages) in either direction
 * -- then log the balance and fold it into pmap_ledgers_drift.
 * Note: panic_on_negative defaults to TRUE so a failed
 * ledger_get_panic_on_negative() errs on the side of panicking.
 */
#define LEDGER_CHECK_BALANCE(__LEDGER) \
MACRO_BEGIN \
	int panic_on_negative = TRUE; \
	ledger_get_balance(ledger, \
	    task_ledgers.__LEDGER, \
	    &bal); \
	ledger_get_panic_on_negative(ledger, \
	    task_ledgers.__LEDGER, \
	    &panic_on_negative); \
	if (bal != 0) { \
	        if (panic_on_negative || \
	            (pmap_ledgers_panic && \
	            pmap_ledgers_panic_leeway > 0 && \
	            (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \
	            bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
	                do_panic = TRUE; \
	        } \
	        printf("LEDGER BALANCE proc %d (%s) " \
	            "\"%s\" = %lld\n", \
	            pid, procname, #__LEDGER, bal); \
	        if (bal > 0) { \
	                pmap_ledgers_drift.__LEDGER##_over++; \
	                pmap_ledgers_drift.__LEDGER##_over_total += bal; \
	                if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
	                        pmap_ledgers_drift.__LEDGER##_over_max = bal; \
	                } \
	        } else if (bal < 0) { \
	                pmap_ledgers_drift.__LEDGER##_under++; \
	                pmap_ledgers_drift.__LEDGER##_under_total += bal; \
	                if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
	                        pmap_ledgers_drift.__LEDGER##_under_max = bal; \
	                } \
	        } \
	} \
MACRO_END

	/* audit every pmap-maintained task ledger */
	LEDGER_CHECK_BALANCE(phys_footprint);
	LEDGER_CHECK_BALANCE(internal);
	LEDGER_CHECK_BALANCE(internal_compressed);
	LEDGER_CHECK_BALANCE(external);
	LEDGER_CHECK_BALANCE(reusable);
	LEDGER_CHECK_BALANCE(iokit_mapped);
	LEDGER_CHECK_BALANCE(alternate_accounting);
	LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
	LEDGER_CHECK_BALANCE(page_table);
	LEDGER_CHECK_BALANCE(purgeable_volatile);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
	LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
	LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(tagged_nofootprint);
	LEDGER_CHECK_BALANCE(tagged_footprint);
	LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
	LEDGER_CHECK_BALANCE(network_volatile);
	LEDGER_CHECK_BALANCE(network_nonvolatile);
	LEDGER_CHECK_BALANCE(network_volatile_compressed);
	LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
	LEDGER_CHECK_BALANCE(media_nofootprint);
	LEDGER_CHECK_BALANCE(media_footprint);
	LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(media_footprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_nofootprint);
	LEDGER_CHECK_BALANCE(graphics_footprint);
	LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
	LEDGER_CHECK_BALANCE(neural_nofootprint);
	LEDGER_CHECK_BALANCE(neural_footprint);
	LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
	LEDGER_CHECK_BALANCE(neural_footprint_compressed);

	/*
	 * All imbalances were already printed individually above; panic only
	 * if pmap_ledgers_panic is set, otherwise just log the summary.
	 */
	if (do_panic) {
		if (pmap_ledgers_panic) {
			panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
			    pmap, pid, procname);
		} else {
			printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
			    pmap, pid, procname);
		}
	}
}
21672 #endif /* MACH_ASSERT */
21673