/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	vm/vm_kern.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *	Date:	1985
 *
 *	Kernel memory management.
 */

#include <mach/kern_return.h>
#include <mach/vm_param.h>
#include <kern/assert.h>
#include <kern/thread.h>
#include <vm/vm_kern.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_compressor.h>
#include <vm/vm_pageout.h>
#include <vm/vm_init.h>
#include <vm/vm_fault.h>
#include <kern/misc_protos.h>
#include <vm/cpm.h>
#include <kern/ledger.h>
#include <kern/bits.h>
#include <kern/startup.h>

#include <string.h>

#include <libkern/OSDebug.h>
#include <libkern/crypto/sha2.h>
#include <libkern/section_keywords.h>
#include <sys/kdebug.h>

#include <san/kasan.h>
#include <kern/kext_alloc.h>
#include <kern/backtrace.h>
#include <os/hash.h>
#include <kern/zalloc_internal.h>

/*
 * Variables exported by this module.
 */

SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map;
SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_ranges[KMEM_RANGE_COUNT];
#if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_large_ranges[KMEM_RANGE_COUNT];
#endif
TUNABLE(uint32_t, kmem_ptr_ranges, "kmem_ptr_ranges", KMEM_RANGE_ID_NUM_PTR);
TUNABLE(uint32_t, kmem_sprayqtn_range, "kmem_sprayqtn_range", 1);

#if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
__startup_data static vm_map_size_t data_range_size;
__startup_data static vm_map_size_t ptr_range_size;
#endif
__startup_data static vm_map_size_t sprayqtn_range_size;

#pragma mark helpers

__attribute__((overloadable))
__header_always_inline kmem_flags_t
ANYF(kma_flags_t flags)
{
	return (kmem_flags_t)flags;
}

__attribute__((overloadable))
__header_always_inline kmem_flags_t
ANYF(kmr_flags_t flags)
{
	return (kmem_flags_t)flags;
}

__attribute__((overloadable))
__header_always_inline kmem_flags_t
ANYF(kmf_flags_t flags)
{
	return (kmem_flags_t)flags;
}

__abortlike
static void
__kmem_invalid_size_panic(
	vm_map_t map,
	vm_size_t size,
	uint32_t flags)
{
	panic("kmem(map=%p, flags=0x%x): invalid size %zd",
	    map, flags, (size_t)size);
}

__abortlike
static void
__kmem_invalid_arguments_panic(
	const char *what,
	vm_map_t map,
	vm_address_t address,
	vm_size_t size,
	uint32_t flags)
{
	panic("kmem_%s(map=%p, addr=%p, size=%zd, flags=0x%x): "
	    "invalid arguments passed",
	    what, map, (void *)address, (size_t)size, flags);
}

__abortlike
static void
__kmem_failed_panic(
	vm_map_t map,
	vm_size_t size,
	uint32_t flags,
	kern_return_t kr,
	const char *what)
{
	panic("kmem_%s(%p, %zd, 0x%x): failed with %d",
	    what, map, (size_t)size, flags, kr);
}

__abortlike
static void
__kmem_entry_not_found_panic(
	vm_map_t map,
	vm_offset_t addr)
{
	panic("kmem(map=%p) no entry found at %p", map, (void *)addr);
}

__abortlike
static void
__kmem_invalid_object_panic(uint32_t flags)
{
	if (flags == 0) {
		panic("KMEM_KOBJECT or KMEM_COMPRESSOR is required");
	}
	panic("more than one of KMEM_KOBJECT or KMEM_COMPRESSOR specified");
}

static inline vm_object_t
__kmem_object(kmem_flags_t flags)
{
	flags &= (KMEM_KOBJECT | KMEM_COMPRESSOR);
	if (flags == 0 || (flags & (flags - 1))) {
		__kmem_invalid_object_panic(flags);
	}

	return (flags & KMEM_KOBJECT) ? kernel_object : compressor_object;
}

static inline vm_size_t
__kmem_guard_left(kmem_flags_t flags)
{
	return (flags & KMEM_GUARD_FIRST) ? PAGE_SIZE : 0;
}

static inline vm_size_t
__kmem_guard_right(kmem_flags_t flags)
{
	return (flags & KMEM_GUARD_LAST) ? PAGE_SIZE : 0;
}

static inline vm_size_t
__kmem_guard_size(kmem_flags_t flags)
{
	return __kmem_guard_left(flags) + __kmem_guard_right(flags);
}
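
/*
 * Worked example (illustrative): with
 *	kmem_flags_t f = KMEM_GUARD_FIRST | KMEM_GUARD_LAST;
 * __kmem_guard_left(f) and __kmem_guard_right(f) each return PAGE_SIZE,
 * so __kmem_guard_size(f) == 2 * PAGE_SIZE.  kmem_alloc_guard() below
 * uses this to carve the guard pages out of the requested size before
 * deciding how much of the range to actually populate.
 */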


#pragma mark kmem range methods

#if __arm64__
// <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
#define mach_vm_range_load(r, r_min, r_max) \
	asm("ldp %[rmin], %[rmax], [%[range]]" \
	    : [rmin] "=r"(r_min), [rmax] "=r"(r_max) \
	    : [range] "r"(r), "m"((r)->min_address), "m"((r)->max_address))
#else
#define mach_vm_range_load(r, rmin, rmax) \
	({ rmin = (r)->min_address; rmax = (r)->max_address; })
#endif

__abortlike
static void
__mach_vm_range_overflow(
	mach_vm_offset_t addr,
	mach_vm_offset_t size)
{
	panic("invalid vm range: [0x%llx, 0x%llx + 0x%llx) wraps around",
	    addr, addr, size);
}

__abortlike
static void
__mach_vm_range_invalid(
	mach_vm_offset_t min_address,
	mach_vm_offset_t max_address)
{
	panic("invalid vm range: [0x%llx, 0x%llx) wraps around",
	    min_address, max_address);
}

__header_always_inline mach_vm_size_t
mach_vm_range_size(const struct mach_vm_range *r)
{
	mach_vm_offset_t rmin, rmax;

	mach_vm_range_load(r, rmin, rmax);
	return rmax - rmin;
}

__attribute__((overloadable))
__header_always_inline bool
mach_vm_range_contains(const struct mach_vm_range *r, mach_vm_offset_t addr)
{
	mach_vm_offset_t rmin, rmax;

#if CONFIG_KERNEL_TBI
	if (VM_KERNEL_ADDRESS(addr)) {
		addr = VM_KERNEL_TBI_FILL(addr);
	}
#endif /* CONFIG_KERNEL_TBI */

	/*
	 * The `&` is not a typo: we really expect the check to pass,
	 * so encourage the compiler to eagerly load and test without branches
	 */
	mach_vm_range_load(r, rmin, rmax);
	return (addr >= rmin) & (addr < rmax);
}

__attribute__((overloadable))
__header_always_inline bool
mach_vm_range_contains(
	const struct mach_vm_range *r,
	mach_vm_offset_t addr,
	mach_vm_offset_t size)
{
	mach_vm_offset_t rmin, rmax;

#if CONFIG_KERNEL_TBI
	if (VM_KERNEL_ADDRESS(addr)) {
		addr = VM_KERNEL_TBI_FILL(addr);
	}
#endif /* CONFIG_KERNEL_TBI */

	/*
	 * The `&` is not a typo: we really expect the check to pass,
	 * so encourage the compiler to eagerly load and test without branches
	 */
	mach_vm_range_load(r, rmin, rmax);
	return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
}
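
/*
 * Illustrative reading of the conjunction above: with rmin = 0x1000 and
 * rmax = 0x9000, (addr = 0x2000, size = 0x3000) passes all three terms:
 * 0x2000 >= 0x1000, 0x5000 >= 0x1000, 0x5000 <= 0x9000.  The middle
 * term is there for arithmetic wrap: if addr + size overflows, the sum
 * typically lands below rmin and the check fails even though the last
 * term on its own might still accept it.
 */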

__attribute__((overloadable))
__header_always_inline bool
mach_vm_range_intersects(
	const struct mach_vm_range *r1,
	const struct mach_vm_range *r2)
{
	mach_vm_offset_t r1_min, r1_max;
	mach_vm_offset_t r2_min, r2_max;

	mach_vm_range_load(r1, r1_min, r1_max);
	r2_min = r2->min_address;
	r2_max = r2->max_address;

	if (r1_min > r1_max) {
		__mach_vm_range_invalid(r1_min, r1_max);
	}

	if (r2_min > r2_max) {
		__mach_vm_range_invalid(r2_min, r2_max);
	}

	return r1_max > r2_min && r1_min < r2_max;
}

__attribute__((overloadable))
__header_always_inline bool
mach_vm_range_intersects(
	const struct mach_vm_range *r1,
	mach_vm_offset_t addr,
	mach_vm_offset_t size)
{
	struct mach_vm_range r2;

#if CONFIG_KERNEL_TBI
	addr = VM_KERNEL_STRIP_UPTR(addr);
#endif /* CONFIG_KERNEL_TBI */
	r2.min_address = addr;
	if (os_add_overflow(addr, size, &r2.max_address)) {
		__mach_vm_range_overflow(addr, size);
	}

	return mach_vm_range_intersects(r1, &r2);
}

bool
kmem_range_id_contains(
	kmem_range_id_t range_id,
	vm_map_offset_t addr,
	vm_map_size_t size)
{
	return mach_vm_range_contains(&kmem_ranges[range_id], addr, size);
}

vm_map_size_t
kmem_range_id_size(kmem_range_id_t range_id)
{
	return mach_vm_range_size(&kmem_ranges[range_id]);
}

kmem_range_id_t
kmem_addr_get_range(vm_map_offset_t addr, vm_map_size_t size)
{
	kmem_range_id_t range_id = 0;
	for (; range_id < KMEM_RANGE_COUNT; range_id++) {
		if (kmem_range_id_contains(range_id, addr, size)) {
			break;
		}
	}
	return range_id;
}
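
/*
 * Usage sketch (hypothetical caller): the loop above simply runs off
 * the end of the table on a miss, so a return value of KMEM_RANGE_COUNT
 * means no registered range contains [addr, addr + size):
 *
 *	kmem_range_id_t id = kmem_addr_get_range(addr, size);
 *	if (id == KMEM_RANGE_COUNT) {
 *		// address is outside every registered kmem range
 *	}
 */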


#pragma mark entry parameters


__abortlike
static void
__kmem_entry_validate_panic(
	vm_map_t map,
	vm_map_entry_t entry,
	vm_offset_t addr,
	vm_size_t size,
	uint32_t flags,
	kmem_guard_t guard)
{
	const char *what = "???";

	if (entry->vme_atomic != guard.kmg_atomic) {
		what = "atomicity";
	} else if (entry->is_sub_map != guard.kmg_submap) {
		what = "objectness";
	} else if (addr != entry->vme_start) {
		what = "left bound";
	} else if ((flags & KMF_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
		what = "right bound";
#if __LP64__
	} else if (guard.kmg_context != entry->vme_context) {
		what = "guard";
#endif
	}

	panic("kmem(map=%p, addr=%p, size=%zd, flags=0x%x): "
	    "entry:%p %s mismatch guard(0x%08x)",
	    map, (void *)addr, size, flags, entry,
	    what, guard.kmg_context);
}

static bool
__kmem_entry_validate_guard(
	vm_map_entry_t entry,
	vm_offset_t addr,
	vm_size_t size,
	kmem_flags_t flags,
	kmem_guard_t guard)
{
	if (entry->vme_atomic != guard.kmg_atomic) {
		return false;
	}

	if (!guard.kmg_atomic) {
		return true;
	}

	if (entry->is_sub_map != guard.kmg_submap) {
		return false;
	}

	if (addr != entry->vme_start) {
		return false;
	}

	if ((flags & KMEM_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
		return false;
	}

#if __LP64__
	if (!guard.kmg_submap && guard.kmg_context != entry->vme_context) {
		return false;
	}
#endif

	return true;
}
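
/*
 * Note: the early return above makes non-atomic guards deliberately
 * cheap.  When guard.kmg_atomic is clear, only the atomicity bit of the
 * entry is compared; the submap, bounds and context checks are
 * meaningful only for atomic entries, whose start and end are known to
 * match the original allocation exactly.
 */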

void
kmem_entry_validate_guard(
	vm_map_t map,
	vm_map_entry_t entry,
	vm_offset_t addr,
	vm_size_t size,
	kmem_guard_t guard)
{
	if (!__kmem_entry_validate_guard(entry, addr, size, KMEM_NONE, guard)) {
		__kmem_entry_validate_panic(map, entry, addr, size, KMEM_NONE, guard);
	}
}

__abortlike
static void
__kmem_entry_validate_object_panic(
	vm_map_t map,
	vm_map_entry_t entry,
	kmem_flags_t flags)
{
	const char *what;
	const char *verb;

	if (entry->is_sub_map) {
		panic("kmem(map=%p) entry %p is a submap", map, entry);
	}

	if (flags & KMEM_KOBJECT) {
		what = "kernel";
		verb = "isn't";
	} else if (flags & KMEM_COMPRESSOR) {
		what = "compressor";
		verb = "isn't";
	} else if (entry->vme_kernel_object) {
		what = "kernel";
		verb = "is unexpectedly";
	} else {
		what = "compressor";
		verb = "is unexpectedly";
	}

	panic("kmem(map=%p, flags=0x%x): entry %p %s for the %s object",
	    map, flags, entry, verb, what);
}

static bool
__kmem_entry_validate_object(
	vm_map_entry_t entry,
	kmem_flags_t flags)
{
	if (entry->is_sub_map) {
		return false;
	}
	if ((bool)(flags & KMEM_KOBJECT) != entry->vme_kernel_object) {
		return false;
	}

	return (bool)(flags & KMEM_COMPRESSOR) ==
	       (VME_OBJECT(entry) == compressor_object);
}

vm_size_t
kmem_size_guard(
	vm_map_t map,
	vm_offset_t addr,
	kmem_guard_t guard)
{
	kmem_flags_t flags = KMEM_GUESS_SIZE;
	vm_map_entry_t entry;
	vm_size_t size;

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		__kmem_entry_not_found_panic(map, addr);
	}

	if (!__kmem_entry_validate_guard(entry, addr, 0, flags, guard)) {
		__kmem_entry_validate_panic(map, entry, addr, 0, flags, guard);
	}

	size = (vm_size_t)(entry->vme_end - entry->vme_start);

	vm_map_unlock_read(map);

	return size;
}

#if ZSECURITY_CONFIG(KALLOC_TYPE)
static inline uint16_t
kmem_hash_backtrace(
	void *fp)
{
	uint64_t bt_count;
	uintptr_t bt[8] = {};

	struct backtrace_control ctl = {
		.btc_frame_addr = (uintptr_t)fp,
	};

	bt_count = backtrace(bt, sizeof(bt) / sizeof(bt[0]), &ctl, NULL);
	return (uint16_t) os_hash_jenkins(bt, bt_count * sizeof(bt[0]));
}
#endif

static_assert(KMEM_RANGE_ID_DATA - 1 <= KMEM_RANGE_MASK,
    "Insufficient bits to represent ptr ranges");

kmem_range_id_t
kmem_adjust_range_id(
	uint32_t hash)
{
	return (kmem_range_id_t) (KMEM_RANGE_ID_PTR_0 +
	       (hash & KMEM_RANGE_MASK) % kmem_ptr_ranges);
}
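
/*
 * Worked example (illustrative): with kmem_ptr_ranges == 2, a hash of
 * 0x1234 maps to KMEM_RANGE_ID_PTR_0 + ((0x1234 & KMEM_RANGE_MASK) % 2),
 * i.e. always one of the pointer ranges; the static_assert above checks
 * that KMEM_RANGE_MASK has enough bits to name every pointer range id.
 */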

static void
kmem_apply_security_policy(
	vm_map_t map,
	kma_flags_t kma_flags,
	kmem_guard_t guard,
	vm_map_kernel_flags_t *vmk_flags,
	bool assert_dir __unused)
{
	kmem_range_id_t range_id;
	bool direction;
	uint16_t type_hash = guard.kmg_type_hash;

	if (startup_phase < STARTUP_SUB_KMEM || map != kernel_map) {
		return;
	}

	/*
	 * When ZSECURITY_CONFIG(KALLOC_TYPE) is enabled, a non-zero type-hash
	 * must be passed by krealloc_type
	 */
#if (DEBUG || DEVELOPMENT) && ZSECURITY_CONFIG(KALLOC_TYPE)
	if (assert_dir && !(kma_flags & KMA_DATA)) {
		assert(type_hash != 0);
	}
#endif

	if (kma_flags & KMA_DATA) {
		range_id = KMEM_RANGE_ID_DATA;
		/*
		 * As an optimization in KMA_DATA to avoid fragmentation,
		 * allocate static carveouts at the end of the DATA range.
		 */
		direction = (bool)(kma_flags & KMA_PERMANENT);
	} else if (kma_flags & KMA_SPRAYQTN) {
		range_id = KMEM_RANGE_ID_SPRAYQTN;
		direction = (bool)(kma_flags & KMA_PERMANENT);
	} else if (type_hash) {
		range_id = type_hash & KMEM_RANGE_MASK;
		direction = type_hash & KMEM_DIRECTION_MASK;
	} else {
#if ZSECURITY_CONFIG(KALLOC_TYPE)
		type_hash = (uint16_t) kmem_hash_backtrace(__builtin_frame_address(0));
#endif
		/*
		 * Range id needs to correspond to one of the PTR ranges
		 */
		range_id = kmem_adjust_range_id(type_hash);
		direction = type_hash & KMEM_DIRECTION_MASK;
	}

	vmk_flags->vmkf_range_id = range_id;
	vmk_flags->vmkf_last_free = direction;
}
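
/*
 * Policy summary for the routine above (descriptive):
 *
 *	KMA_DATA        -> KMEM_RANGE_ID_DATA,     direction from KMA_PERMANENT
 *	KMA_SPRAYQTN    -> KMEM_RANGE_ID_SPRAYQTN, direction from KMA_PERMANENT
 *	type_hash != 0  -> range and direction decoded from the hash
 *	otherwise       -> a pointer range derived from a backtrace hash
 *
 * Early boot (startup_phase < STARTUP_SUB_KMEM) and non-kernel_map
 * allocations bypass the policy entirely.
 */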

#pragma mark allocation

kern_return_t
kmem_alloc_contig(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size,
	vm_offset_t mask,
	ppnum_t max_pnum,
	ppnum_t pnum_mask,
	kma_flags_t flags,
	vm_tag_t tag)
{
	vm_object_t object;
	vm_object_offset_t offset;
	vm_map_offset_t map_addr;
	vm_map_offset_t map_mask;
	vm_map_size_t map_size, i;
	vm_map_entry_t entry;
	vm_page_t m, pages;
	kern_return_t kr;
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;

	assert(VM_KERN_MEMORY_NONE != tag);
	assert(map);
	assert3u(flags & ~KMEM_ALLOC_CONTIG_FLAGS, ==, 0);

	map_size = vm_map_round_page(size, VM_MAP_PAGE_MASK(map));
	map_mask = (vm_map_offset_t)mask;

	/* Check for zero allocation size (either directly or via overflow) */
	if (map_size == 0) {
		*addrp = 0;
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 * Allocate a new object (if necessary) and the reference we
	 * will be donating to the map entry. We must do this before
	 * locking the map, or risk deadlock with the default pager.
	 */
	if ((flags & KMA_KOBJECT) != 0) {
		object = kernel_object;
		vm_object_reference(object);
	} else {
		object = vm_object_allocate(map_size);
		/* stabilize the object to prevent shadowing */
		object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		object->true_share = TRUE;
	}
	if (flags & KMA_PERMANENT) {
		vmk_flags.vmkf_permanent = true;
	}
	kmem_apply_security_policy(map, flags, KMEM_GUARD_NONE, &vmk_flags, false);

	kr = vm_map_find_space(map, 0, map_size, map_mask,
	    vmk_flags, &entry);
	if (KERN_SUCCESS != kr) {
		vm_object_deallocate(object);
		return kr;
	}

	map_addr = entry->vme_start;
	if (object == kernel_object) {
		offset = map_addr;
	} else {
		offset = 0;
	}
	VME_OBJECT_SET(entry, object, false, 0);
	VME_OFFSET_SET(entry, offset);
	VME_ALIAS_SET(entry, tag);

	/* Take an extra object ref in case the map entry gets deleted */
	vm_object_reference(object);
	vm_map_unlock(map);

	kr = cpm_allocate(CAST_DOWN(vm_size_t, map_size), &pages, max_pnum, pnum_mask, FALSE, flags);

	if (kr != KERN_SUCCESS) {
		vm_map_remove(map,
		    vm_map_trunc_page(map_addr,
		    VM_MAP_PAGE_MASK(map)),
		    vm_map_round_page(map_addr + map_size,
		    VM_MAP_PAGE_MASK(map)));
		vm_object_deallocate(object);
		*addrp = 0;
		return kr;
	}

	if (flags & KMA_ZERO) {
		for (m = pages; m; m = NEXT_PAGE(m)) {
			vm_page_zero_fill(m);
		}
	}


	vm_object_lock(object);
	for (i = 0; i < map_size; i += PAGE_SIZE) {
		m = pages;
		pages = NEXT_PAGE(m);
		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
		m->vmp_busy = FALSE;
		vm_page_insert(m, object, offset + i);
	}
	vm_object_unlock(object);

	kr = vm_map_wire_kernel(map,
	    vm_map_trunc_page(map_addr,
	    VM_MAP_PAGE_MASK(map)),
	    vm_map_round_page(map_addr + map_size,
	    VM_MAP_PAGE_MASK(map)),
	    VM_PROT_DEFAULT, tag,
	    FALSE);

	if (kr != KERN_SUCCESS) {
		if (object == kernel_object) {
			vm_object_lock(object);
			vm_object_page_remove(object, offset, offset + map_size);
			vm_object_unlock(object);
		}
		vm_map_remove(map,
		    vm_map_trunc_page(map_addr,
		    VM_MAP_PAGE_MASK(map)),
		    vm_map_round_page(map_addr + map_size,
		    VM_MAP_PAGE_MASK(map)));
		vm_object_deallocate(object);
		return kr;
	}
	vm_object_deallocate(object);

	if (object == kernel_object) {
		vm_map_simplify(map, map_addr);
		vm_tag_update_size(tag, map_size);
	}
	*addrp = (vm_offset_t) map_addr;
	assert((vm_map_offset_t) *addrp == map_addr);

	return KERN_SUCCESS;
}
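
/*
 * Usage sketch (hypothetical caller; the flag combination is assumed
 * to be permitted by KMEM_ALLOC_CONTIG_FLAGS): four physically
 * contiguous, zeroed, wired pages constrained below 4GB:
 *
 *	vm_offset_t addr;
 *	kern_return_t kr;
 *
 *	kr = kmem_alloc_contig(kernel_map, &addr, 4 * PAGE_SIZE,
 *	    0, atop(1ULL << 32), 0, KMA_KOBJECT | KMA_ZERO,
 *	    VM_KERN_MEMORY_DIAG);
 *	if (kr != KERN_SUCCESS) {
 *		// fall back or fail the caller
 *	}
 */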

kmem_return_t
kmem_alloc_guard(
	vm_map_t map,
	vm_size_t size,
	vm_offset_t mask,
	kma_flags_t flags,
	kmem_guard_t guard)
{
	vm_object_t object;
	vm_map_entry_t entry = NULL;
	vm_map_offset_t map_addr, fill_start;
	vm_map_size_t map_size, fill_size;
	vm_page_t guard_left = VM_PAGE_NULL;
	vm_page_t guard_right = VM_PAGE_NULL;
	vm_page_t wired_page_list = VM_PAGE_NULL;
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	bool skip_guards;
	kmem_return_t kmr = { };

	assert(kernel_map && map->pmap == kernel_pmap);

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
	    size, 0, 0, 0);
#endif

	if (size == 0 ||
	    (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) ||
	    (size < __kmem_guard_size(ANYF(flags)))) {
		__kmem_invalid_size_panic(map, size, flags);
	}

	/*
	 * limit the size of a single extent of wired memory
	 * to try and limit the damage to the system if
	 * too many pages get wired down
	 * limit raised to 2GB with 128GB max physical limit,
	 * but scaled by installed memory above this
	 */
	if (__improbable(!(flags & (KMA_VAONLY | KMA_PAGEABLE)) &&
	    size > MAX(1ULL << 31, sane_size / 64))) {
		kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
		goto out_error;
	}

	/*
	 * Guard pages:
	 *
	 * Guard pages are implemented as fictitious pages.
	 *
	 * However, some maps, and some objects are known
	 * to manage their memory explicitly, and do not need
	 * those to be materialized, which saves memory.
	 *
	 * By placing guard pages on either end of a stack,
	 * they can help detect cases where a thread walks
	 * off either end of its stack.
	 *
	 * They are allocated and set up here and attempts
	 * to access those pages are trapped in vm_fault_page().
	 *
	 * The map_size we were passed may include extra space for
	 * guard pages. fill_size represents the actual size to populate.
	 * Similarly, fill_start indicates where the actual pages
	 * will begin in the range.
	 */

	map_size = round_page(size);
	fill_start = 0;
	fill_size = map_size - __kmem_guard_size(ANYF(flags));

	skip_guards = (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) ||
	    map->never_faults;

	if (flags & KMA_GUARD_FIRST) {
		vmk_flags.vmkf_guard_before = true;
		fill_start += PAGE_SIZE;
	}
	if ((flags & KMA_GUARD_FIRST) && !skip_guards) {
		guard_left = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
		if (__improbable(guard_left == VM_PAGE_NULL)) {
			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
			goto out_error;
		}
	}
	if ((flags & KMA_GUARD_LAST) && !skip_guards) {
		guard_right = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
		if (__improbable(guard_right == VM_PAGE_NULL)) {
			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
			goto out_error;
		}
	}

	if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
		kmr.kmr_return = vm_page_alloc_list(atop(fill_size), flags,
		    &wired_page_list);
		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
			goto out_error;
		}
	}

	/*
	 * Allocate a new object (if necessary). We must do this before
	 * locking the map, or risk deadlock with the default pager.
	 */
	if (flags & KMA_KOBJECT) {
		object = kernel_object;
		vm_object_reference(object);
	} else if (flags & KMA_COMPRESSOR) {
		object = compressor_object;
		vm_object_reference(object);
	} else {
		object = vm_object_allocate(map_size);
		/* stabilize the object to prevent shadowing */
		object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		object->true_share = TRUE;
	}

	if (flags & KMA_LAST_FREE) {
		vmk_flags.vmkf_last_free = true;
	}
	if (flags & KMA_PERMANENT) {
		vmk_flags.vmkf_permanent = true;
	}
	kmem_apply_security_policy(map, flags, guard, &vmk_flags, false);

	kmr.kmr_return = vm_map_find_space(map, 0, map_size, mask,
	    vmk_flags, &entry);
	if (__improbable(KERN_SUCCESS != kmr.kmr_return)) {
		vm_object_deallocate(object);
		goto out_error;
	}

	map_addr = entry->vme_start;
	VME_OBJECT_SET(entry, object, guard.kmg_atomic, guard.kmg_context);
	VME_ALIAS_SET(entry, guard.kmg_tag);
	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
		VME_OFFSET_SET(entry, map_addr);
	} else {
		vm_object_reference(object);
	}

	if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE))) {
		entry->wired_count = 1;
	}

	if (guard_left || guard_right || wired_page_list) {
		vm_object_offset_t offset = 0ull;

		vm_object_lock(object);
		vm_map_unlock(map);

		if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
			offset = map_addr;
		}

		if (guard_left) {
			vm_page_insert(guard_left, object, offset);
			guard_left->vmp_busy = FALSE;
			guard_left = VM_PAGE_NULL;
		}

		if (guard_right) {
			vm_page_insert(guard_right, object,
			    offset + fill_start + fill_size);
			guard_right->vmp_busy = FALSE;
			guard_right = VM_PAGE_NULL;
		}

		if (wired_page_list) {
			kernel_memory_populate_object_and_unlock(object,
			    map_addr + fill_start, offset + fill_start, fill_size,
			    wired_page_list, flags, guard.kmg_tag, VM_PROT_DEFAULT);
		} else {
			vm_object_unlock(object);
		}
	} else {
		vm_map_unlock(map);
	}

#if KASAN
	if (flags & KMA_PAGEABLE) {
		/*
		 * We need to allow the range for pageable memory,
		 * or faulting will not be allowed.
		 */
		kasan_notify_address(map_addr, map_size);
	}
#endif
	/*
	 * now that the pages are wired, we no longer have to fear coalescing
	 */
	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
		vm_map_simplify(map, map_addr);
	} else {
		vm_object_deallocate(object);
	}

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
	    atop(fill_size), 0, 0, 0);
#endif
	kmr.kmr_address = CAST_DOWN(vm_offset_t, map_addr);
	return kmr;

out_error:
	if (flags & KMA_NOFAIL) {
		__kmem_failed_panic(map, size, flags, kmr.kmr_return, "alloc");
	}
	if (guard_left) {
		guard_left->vmp_snext = wired_page_list;
		wired_page_list = guard_left;
	}
	if (guard_right) {
		guard_right->vmp_snext = wired_page_list;
		wired_page_list = guard_right;
	}
	if (wired_page_list) {
		vm_page_free_list(wired_page_list, FALSE);
	}

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
	    0, 0, 0, 0);
#endif

	return kmr;
}
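
/*
 * Usage sketch (hypothetical): a wired, zeroed allocation with a
 * trailing guard page and an atomic guard, so it can only be freed
 * whole and with matching guard data (the requested size includes the
 * guard page):
 *
 *	kmem_guard_t guard = {
 *		.kmg_atomic  = true,
 *		.kmg_tag     = VM_KERN_MEMORY_OSFMK,
 *		.kmg_context = 0x5a5a,
 *	};
 *
 *	kmem_return_t kmr = kmem_alloc_guard(kernel_map, size, 0,
 *	    KMA_ZERO | KMA_GUARD_LAST | KMA_NOFAIL, guard);
 *	// with KMA_NOFAIL the call either succeeds or panics,
 *	// so kmr.kmr_address is valid here
 */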

kmem_return_t
kmem_suballoc(
	vm_map_t parent,
	mach_vm_offset_t *addr,
	vm_size_t size,
	vm_map_create_options_t vmc_options,
	int vm_flags,
	kms_flags_t flags,
	vm_tag_t tag)
{
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vm_map_offset_t map_addr = 0;
	kmem_return_t kmr = { };
	vm_map_t map;

	assert(page_aligned(size));
	assert(parent->pmap == kernel_pmap);

#if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
	if (parent == kernel_map) {
		assert((vm_flags & VM_FLAGS_FIXED_RANGE_SUBALLOC) ||
		    (flags & KMS_DATA));
	}
#endif /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */

	if ((vm_flags & VM_FLAGS_ANYWHERE) == 0) {
		map_addr = trunc_page(*addr);
	}

	pmap_reference(vm_map_pmap(parent));
	map = vm_map_create_options(vm_map_pmap(parent), 0, size, vmc_options);

	/*
	 * 1. vm_map_enter() will consume one ref on success.
	 *
	 * 2. make the entry atomic as kernel submaps should never be split.
	 *
	 * 3. instruct vm_map_enter() that it is a fresh submap
	 *    that needs to be taught its bounds as it inserted.
	 */
	vm_map_reference(map);
	vmk_flags.vmkf_submap = true;
	if ((flags & KMS_DATA) == 0) {
		/* FIXME: IOKit submaps get fragmented and can't be atomic */
		vmk_flags.vmkf_submap_atomic = true;
	}
	vmk_flags.vmkf_submap_adjust = true;
	if (flags & KMS_LAST_FREE) {
		vmk_flags.vmkf_last_free = true;
	}
	if (flags & KMS_PERMANENT) {
		vmk_flags.vmkf_permanent = true;
	}
	if (flags & KMS_DATA) {
		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
	}

	kmr.kmr_return = vm_map_enter(parent, &map_addr, size, 0,
	    vm_flags, vmk_flags, tag, (vm_object_t)map, 0, FALSE,
	    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);

	if (kmr.kmr_return != KERN_SUCCESS) {
		if (flags & KMS_NOFAIL) {
			panic("kmem_suballoc(map=%p, size=%zd) failed with %d",
			    parent, size, kmr.kmr_return);
		}
		assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
		vm_map_deallocate(map);
		vm_map_deallocate(map); /* also removes ref to pmap */
		return kmr;
	}

	/*
	 * For kmem_suballocs that register a claim and are assigned a range, ensure
	 * that the exact same range is returned.
	 */
	if (*addr != 0 && parent == kernel_map &&
	    startup_phase > STARTUP_SUB_KMEM) {
		assert(CAST_DOWN(vm_offset_t, map_addr) == *addr);
	} else {
		*addr = map_addr;
	}

	kmr.kmr_submap = map;
	return kmr;
}
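
/*
 * Usage sketch (hypothetical; VM_MAP_CREATE_PAGEABLE is assumed from
 * the vm_map_create_options_t namespace): carving a pageable data
 * submap out of the kernel map, letting the VM choose the address:
 *
 *	mach_vm_offset_t addr = 0;
 *	kmem_return_t kmr;
 *
 *	kmr = kmem_suballoc(kernel_map, &addr, size,
 *	    VM_MAP_CREATE_PAGEABLE, VM_FLAGS_ANYWHERE,
 *	    KMS_DATA | KMS_NOFAIL, VM_KERN_MEMORY_FILE);
 *	// kmr.kmr_submap is the child map, addr its base address
 */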

/*
 * kmem_alloc:
 *
 * Allocate wired-down memory in the kernel's address map
 * or a submap. The memory is not zero-filled.
 */

__exported kern_return_t
kmem_alloc_external(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size);
kern_return_t
kmem_alloc_external(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size)
{
	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
		return kmem_alloc(map, addrp, size, KMA_NONE, vm_tag_bt());
	}
	/* Maintain ABI compatibility: invalid sizes used to be allowed */
	return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
}


/*
 * kmem_alloc_kobject:
 *
 * Allocate wired-down memory in the kernel's address map
 * or a submap. The memory is not zero-filled.
 *
 * The memory is allocated in the kernel_object.
 * It may not be copied with vm_map_copy, and
 * it may not be reallocated with kmem_realloc.
 */

__exported kern_return_t
kmem_alloc_kobject_external(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size);
kern_return_t
kmem_alloc_kobject_external(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size)
{
	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
		return kmem_alloc(map, addrp, size, KMA_KOBJECT, vm_tag_bt());
	}
	/* Maintain ABI compatibility: invalid sizes used to be allowed */
	return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
}

/*
 * kmem_alloc_pageable:
 *
 * Allocate pageable memory in the kernel's address map.
 */

__exported kern_return_t
kmem_alloc_pageable_external(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size);
kern_return_t
kmem_alloc_pageable_external(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size)
{
	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
		return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA, vm_tag_bt());
	}
	/* Maintain ABI compatibility: invalid sizes used to be allowed */
	return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
}


#pragma mark population

static void
kernel_memory_populate_pmap_enter(
	vm_object_t object,
	vm_address_t addr,
	vm_object_offset_t offset,
	vm_page_t mem,
	vm_prot_t prot,
	int pe_flags)
{
	kern_return_t pe_result;
	int pe_options;

	PMAP_ENTER_CHECK(kernel_pmap, mem);

	pe_options = PMAP_OPTIONS_NOWAIT;
	if (object->internal) {
		pe_options |= PMAP_OPTIONS_INTERNAL;
	}
	if (mem->vmp_reusable || object->all_reusable) {
		pe_options |= PMAP_OPTIONS_REUSABLE;
	}

	pe_result = pmap_enter_options(kernel_pmap, addr + offset,
	    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
	    pe_flags, /* wired */ TRUE, pe_options, NULL);

	if (pe_result == KERN_RESOURCE_SHORTAGE) {
		vm_object_unlock(object);

		pe_options &= ~PMAP_OPTIONS_NOWAIT;

		pe_result = pmap_enter_options(kernel_pmap, addr + offset,
		    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
		    pe_flags, /* wired */ TRUE, pe_options, NULL);

		vm_object_lock(object);
	}

	assert(pe_result == KERN_SUCCESS);
}

void
kernel_memory_populate_object_and_unlock(
	vm_object_t object, /* must be locked */
	vm_address_t addr,
	vm_offset_t offset,
	vm_size_t size,
	vm_page_t page_list,
	kma_flags_t flags,
	vm_tag_t tag,
	vm_prot_t prot)
{
	vm_page_t mem;
	int pe_flags;

	assert3u((bool)(flags & KMA_KOBJECT), ==, object == kernel_object);
	assert3u((bool)(flags & KMA_COMPRESSOR), ==, object == compressor_object);
	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
		assert3u(offset, ==, addr);
	}

	if (flags & KMA_KSTACK) {
		pe_flags = VM_MEM_STACK;
	} else {
		pe_flags = 0;
	}

	for (vm_object_offset_t pg_offset = 0;
	    pg_offset < size;
	    pg_offset += PAGE_SIZE_64) {
		if (page_list == NULL) {
			panic("%s: page_list too short", __func__);
		}

		mem = page_list;
		page_list = mem->vmp_snext;
		mem->vmp_snext = NULL;

		assert(mem->vmp_wire_count == 0);
		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);

		if (flags & KMA_COMPRESSOR) {
			mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
			/*
			 * Background processes doing I/O accounting can call
			 * into NVME driver to do some work which results in
			 * an allocation here and so we want to make sure
			 * that the pages used by compressor, regardless of
			 * process context, are never on the special Q.
			 */
			mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;

			vm_page_insert(mem, object, offset + pg_offset);
		} else {
			mem->vmp_q_state = VM_PAGE_IS_WIRED;
			mem->vmp_wire_count = 1;

			vm_page_insert_wired(mem, object, offset + pg_offset, tag);
		}

		mem->vmp_busy = false;
		mem->vmp_pmapped = true;
		mem->vmp_wpmapped = true;

		/*
		 * Manual PMAP_ENTER_OPTIONS() with shortcuts
		 * for the kernel and compressor objects.
		 */

		kernel_memory_populate_pmap_enter(object, addr, pg_offset,
		    mem, prot, pe_flags);

		if (flags & KMA_NOENCRYPT) {
			pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
		}
	}

	if (page_list) {
		panic("%s: page_list too long", __func__);
	}

	vm_object_unlock(object);

	if (!(flags & KMA_COMPRESSOR)) {
		vm_page_lockspin_queues();
		vm_page_wire_count += atop(size);
		vm_page_unlock_queues();
	}

	if (flags & KMA_KOBJECT) {
		/* vm_page_insert_wired() handles regular objects already */
		vm_tag_update_size(tag, size);
	}

#if KASAN
	if (flags & KMA_COMPRESSOR) {
		kasan_notify_address_nopoison(addr, size);
	} else {
		kasan_notify_address(addr, size);
	}
#endif
}
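
/*
 * Accounting note (descriptive): compressor pages are deliberately left
 * out of vm_page_wire_count (they belong to the compressor rather than
 * being wired in the usual sense), and only KMA_KOBJECT allocations
 * need the explicit vm_tag_update_size(), because vm_page_insert_wired()
 * already attributes pages of regular objects to their tag.
 */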


kern_return_t
kernel_memory_populate(
	vm_offset_t addr,
	vm_size_t size,
	kma_flags_t flags,
	vm_tag_t tag)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_page_t page_list = NULL;
	vm_size_t page_count = atop_64(size);
	vm_object_t object = __kmem_object(ANYF(flags));

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
	    size, 0, 0, 0);
#endif

	kr = vm_page_alloc_list(page_count, flags, &page_list);
	if (kr == KERN_SUCCESS) {
		vm_object_lock(object);
		kernel_memory_populate_object_and_unlock(object, addr,
		    addr, size, page_list, flags, tag, VM_PROT_DEFAULT);
	}

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
	    page_count, 0, 0, 0);
#endif
	return kr;
}

void
kernel_memory_depopulate(
	vm_offset_t addr,
	vm_size_t size,
	kma_flags_t flags,
	vm_tag_t tag)
{
	vm_object_t object = __kmem_object(ANYF(flags));
	vm_object_offset_t offset = addr;
	vm_page_t mem;
	vm_page_t local_freeq = NULL;
	unsigned int pages_unwired = 0;

	vm_object_lock(object);

	pmap_protect(kernel_pmap, offset, offset + size, VM_PROT_NONE);

	for (vm_object_offset_t pg_offset = 0;
	    pg_offset < size;
	    pg_offset += PAGE_SIZE_64) {
		mem = vm_page_lookup(object, offset + pg_offset);

		assert(mem);

		if (flags & KMA_COMPRESSOR) {
			assert(mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
		} else {
			assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
			pages_unwired++;
		}

		mem->vmp_busy = TRUE;

		assert(mem->vmp_tabled);
		vm_page_remove(mem, TRUE);
		assert(mem->vmp_busy);

		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);

		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
		mem->vmp_snext = local_freeq;
		local_freeq = mem;
	}

	vm_object_unlock(object);

	vm_page_free_list(local_freeq, TRUE);

	if (!(flags & KMA_COMPRESSOR)) {
		vm_page_lockspin_queues();
		vm_page_wire_count -= pages_unwired;
		vm_page_unlock_queues();
	}

	if (flags & KMA_KOBJECT) {
		/* vm_page_remove() handles regular objects already */
		vm_tag_update_size(tag, -ptoa_64(pages_unwired));
	}
}
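
/*
 * Pairing note (descriptive): kernel_memory_depopulate() is the inverse
 * of kernel_memory_populate(); it unmaps and frees the pages but leaves
 * the virtual range itself allocated, so a caller that also wants the
 * VA back must still free it, for instance with kmem_free_guard().
 */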

#pragma mark reallocation

__abortlike
static void
__kmem_realloc_invalid_object_size_panic(
	vm_map_t map,
	vm_address_t address,
	vm_size_t size,
	vm_map_entry_t entry,
	vm_object_t object)
{
	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
	    "object %p has unexpected size %lld",
	    map, (void *)address, (size_t)size, entry, object, object->vo_size);
}

static kmem_return_t
kmem_realloc_shrink_guard(
	vm_map_t map,
	vm_offset_t oldaddr,
	vm_size_t oldsize,
	vm_size_t newsize,
	kmr_flags_t flags,
	kmem_guard_t guard,
	vm_map_entry_t entry)
{
	vm_object_t object;
	kmem_return_t kmr = { .kmr_address = oldaddr };
	bool was_atomic;

	vm_map_lock_assert_exclusive(map);

	if ((flags & KMR_KOBJECT) == 0) {
		object = VME_OBJECT(entry);
		vm_object_reference(object);
	}

	/*
	 * Shrinking an atomic entry starts with splitting it,
	 * and removing the second half.
	 */
	was_atomic = entry->vme_atomic;
	entry->vme_atomic = false;
	vm_map_clip_end(map, entry, entry->vme_start + newsize);
	entry->vme_atomic = was_atomic;

	(void)vm_map_remove_and_unlock(map,
	    oldaddr + newsize, oldaddr + oldsize,
	    VM_MAP_REMOVE_KUNWIRE, KMEM_GUARD_NONE);


	/*
	 * Lastly, if there are guard pages, deal with them.
	 *
	 * The kernel object just needs to depopulate,
	 * regular objects require freeing the last page
	 * and replacing it with a guard.
	 */
	if (flags & KMR_KOBJECT) {
		if (flags & KMR_GUARD_LAST) {
			kernel_memory_depopulate(oldaddr + newsize - PAGE_SIZE,
			    PAGE_SIZE, KMA_KOBJECT, guard.kmg_tag);
		}
	} else {
		vm_page_t guard_right = VM_PAGE_NULL;
		vm_offset_t remove_start = newsize;

		if (flags & KMR_GUARD_LAST) {
			if (!map->never_faults) {
				guard_right = vm_page_grab_guard(true);
			}
			remove_start -= PAGE_SIZE;
		}

		vm_object_lock(object);

		if (object->vo_size != oldsize) {
			__kmem_realloc_invalid_object_size_panic(map,
			    oldaddr, oldsize, entry, object);
		}
		object->vo_size = newsize;

		vm_object_page_remove(object, remove_start, oldsize);

		if (guard_right) {
			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
			guard_right->vmp_busy = false;
		}
		vm_object_unlock(object);
		vm_object_deallocate(object);
	}

	return kmr;
}

kmem_return_t
kmem_realloc_guard(
	vm_map_t map,
	vm_offset_t oldaddr,
	vm_size_t oldsize,
	vm_size_t newsize,
	kmr_flags_t flags,
	kmem_guard_t guard)
{
	vm_object_t object;
	vm_map_offset_t newaddr;
	vm_object_offset_t newoffs;
	vm_map_entry_t oldentry;
	vm_map_entry_t newentry;
	vm_page_t page_list = NULL;
	bool needs_wakeup = false;
	kmem_return_t kmr = { };
	unsigned int last_timestamp;
	vm_map_kernel_flags_t vmk_flags = {
		.vmkf_last_free = (bool)(flags & KMR_LAST_FREE),
	};

	assert(KMEM_REALLOC_FLAGS_VALID(flags));
	if (!guard.kmg_atomic && (flags & (KMR_DATA | KMR_KOBJECT)) != KMR_DATA) {
		__kmem_invalid_arguments_panic("realloc", map, oldaddr,
		    oldsize, flags);
	}

	if (oldaddr == 0ul) {
		return kmem_alloc_guard(map, newsize, 0, (kma_flags_t)flags, guard);
	}

	if (newsize == 0ul) {
		kmem_free_guard(map, oldaddr, oldsize, KMF_NONE, guard);
		return kmr;
	}

	if (newsize >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) {
		__kmem_invalid_size_panic(map, newsize, flags);
	}
	if (newsize < __kmem_guard_size(ANYF(flags))) {
		__kmem_invalid_size_panic(map, newsize, flags);
	}

	oldsize = round_page(oldsize);
	newsize = round_page(newsize);

	if (oldsize == newsize) {
		kmr.kmr_address = oldaddr;
		return kmr;
	}

	/*
	 * If we're growing the allocation,
	 * then reserve the pages we'll need,
	 * and find a spot for its new place.
	 */
	if (oldsize < newsize) {
#if DEBUG || DEVELOPMENT
		VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
		    VM_KERN_REQUEST, DBG_FUNC_START,
		    newsize - oldsize, 0, 0, 0);
#endif
		kmr.kmr_return = vm_page_alloc_list(atop(newsize - oldsize),
		    (kma_flags_t)flags, &page_list);
		if (kmr.kmr_return == KERN_SUCCESS) {
			kmem_apply_security_policy(map, (kma_flags_t)flags, guard,
			    &vmk_flags, true);
			kmr.kmr_return = vm_map_find_space(map, 0, newsize, 0,
			    vmk_flags, &newentry);
		}
		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
			if (flags & KMR_REALLOCF) {
				kmem_free_guard(map, oldaddr, oldsize,
				    KMF_NONE, guard);
			}
			if (page_list) {
				vm_page_free_list(page_list, FALSE);
			}
#if DEBUG || DEVELOPMENT
			VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
			    VM_KERN_REQUEST, DBG_FUNC_END,
			    0, 0, 0, 0);
#endif
			return kmr;
		}

		/* map is locked */
	} else {
		vm_map_lock(map);
	}


	/*
	 * Locate the entry:
	 * - wait for it to quiesce.
	 * - validate its guard,
	 * - learn its correct tag,
	 */
again:
	if (!vm_map_lookup_entry(map, oldaddr, &oldentry)) {
		__kmem_entry_not_found_panic(map, oldaddr);
	}
	if ((flags & KMR_KOBJECT) && oldentry->in_transition) {
		oldentry->needs_wakeup = true;
		vm_map_entry_wait(map, THREAD_UNINT);
		goto again;
	}
	kmem_entry_validate_guard(map, oldentry, oldaddr, oldsize, guard);
	if (!__kmem_entry_validate_object(oldentry, ANYF(flags))) {
		__kmem_entry_validate_object_panic(map, oldentry, ANYF(flags));
	}
	/*
	 * TODO: We should validate for non atomic entries that the range
	 *       we are acting on is what we expect here.
	 */

	guard.kmg_tag = VME_ALIAS(oldentry);

	if (newsize < oldsize) {
		return kmem_realloc_shrink_guard(map, oldaddr, oldsize, newsize,
		    flags, guard, oldentry);
	}

	/*
	 * We are growing the entry
	 *
	 * For regular objects we use the object `vo_size` updates
	 * as a guarantee that no 2 kmem_realloc() can happen
	 * concurrently (by doing it before the map is unlocked).
	 *
	 * For the kernel object, prevent the entry from being
	 * reallocated or changed by marking it "in_transition".
	 */

	object = VME_OBJECT(oldentry);
	vm_object_lock(object);
	vm_object_reference_locked(object);

	newaddr = newentry->vme_start;
	newoffs = oldsize;

	VME_OBJECT_SET(newentry, object, guard.kmg_atomic, guard.kmg_context);
	VME_ALIAS_SET(newentry, guard.kmg_tag);
	if (flags & KMR_KOBJECT) {
		oldentry->in_transition = true;
		VME_OFFSET_SET(newentry, newaddr);
		newentry->wired_count = 1;
		newoffs = newaddr + oldsize;
	} else {
		if (object->vo_size != oldsize) {
			__kmem_realloc_invalid_object_size_panic(map,
			    oldaddr, oldsize, oldentry, object);
		}
		object->vo_size = newsize;
	}

	last_timestamp = map->timestamp;
	vm_map_unlock(map);


	/*
	 * Now proceed with the population of pages.
	 *
	 * Kernel objects can use the kmem population helpers.
	 *
	 * Regular objects will insert pages manually,
	 * then wire the memory into the new range.
	 */

	vm_size_t guard_right_size = __kmem_guard_right(ANYF(flags));

	if (flags & KMR_KOBJECT) {
		assert(flags & KMR_FREEOLD);

		pmap_protect(kernel_pmap,
		    oldaddr, oldaddr + oldsize - guard_right_size,
		    VM_PROT_NONE);

		for (vm_object_offset_t offset = 0;
		    offset < oldsize - guard_right_size;
		    offset += PAGE_SIZE_64) {
			vm_page_t mem;

			mem = vm_page_lookup(object, oldaddr + offset);
			if (mem == VM_PAGE_NULL) {
				continue;
			}

			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));

			mem->vmp_busy = true;
			vm_page_remove(mem, true);
			vm_page_insert_wired(mem, object, newaddr + offset,
			    guard.kmg_tag);
			mem->vmp_busy = false;

			kernel_memory_populate_pmap_enter(object, newaddr,
			    offset, mem, VM_PROT_DEFAULT, 0);
		}

		kernel_memory_populate_object_and_unlock(object,
		    newaddr + oldsize - guard_right_size,
		    newoffs - guard_right_size,
		    newsize - oldsize,
		    page_list, (kma_flags_t)flags,
		    guard.kmg_tag, VM_PROT_DEFAULT);
	} else {
		vm_page_t guard_right = VM_PAGE_NULL;
		kern_return_t kr;

		/*
		 * Note: we are borrowing the new entry reference
		 * on the object for the duration of this code,
		 * which works because we keep the object locked
		 * throughout.
		 */
		if ((flags & KMR_GUARD_LAST) && !map->never_faults) {
			guard_right = vm_page_lookup(object, oldsize - PAGE_SIZE);
			assert(guard_right->vmp_fictitious);
			guard_right->vmp_busy = true;
			vm_page_remove(guard_right, true);
		}

		for (vm_object_offset_t offset = oldsize - guard_right_size;
		    offset < newsize - guard_right_size;
		    offset += PAGE_SIZE_64) {
			vm_page_t mem = page_list;

			page_list = mem->vmp_snext;
			mem->vmp_snext = VM_PAGE_NULL;

			vm_page_insert(mem, object, offset);
			mem->vmp_busy = false;
		}

		if (guard_right) {
			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
			guard_right->vmp_busy = false;
		}

		vm_object_unlock(object);

		kr = vm_map_wire_kernel(map, newaddr, newaddr + newsize,
		    VM_PROT_DEFAULT, guard.kmg_tag, FALSE);
		assert(kr == KERN_SUCCESS);
	}

#if KASAN
	kasan_notify_address(newaddr, newsize);
#endif


	/*
	 * Mark the entry as idle again,
	 * and honor KMR_FREEOLD if needed.
	 */

	vm_map_lock(map);
	if (last_timestamp + 1 != map->timestamp &&
	    !vm_map_lookup_entry(map, oldaddr, &oldentry)) {
		__kmem_entry_not_found_panic(map, oldaddr);
	}

	if (flags & KMR_KOBJECT) {
		assert(oldentry->in_transition);
		oldentry->in_transition = false;
		if (oldentry->needs_wakeup) {
			needs_wakeup = true;
			oldentry->needs_wakeup = false;
		}
	}

	if (flags & KMR_FREEOLD) {
		(void)vm_map_remove_and_unlock(map,
		    oldaddr, oldaddr + oldsize,
		    VM_MAP_REMOVE_KUNWIRE, guard);
	} else {
		vm_map_unlock(map);
	}

	if (needs_wakeup) {
		vm_map_entry_wakeup(map);
	}


#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
	    atop(newsize - oldsize), 0, 0, 0);
#endif
	kmr.kmr_address = newaddr;
	return kmr;
}
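
/*
 * Semantics recap for kmem_realloc_guard() (descriptive): oldaddr == 0
 * degenerates to kmem_alloc_guard(), newsize == 0 to kmem_free_guard(),
 * and equal rounded sizes return the original address unchanged.
 * Without KMR_FREEOLD (regular objects only; the kernel object path
 * asserts it), the old range stays mapped and the caller remains
 * responsible for [oldaddr, oldaddr + oldsize) as well as the new range.
 */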


#pragma mark free

vm_size_t
kmem_free_guard(
	vm_map_t map,
	vm_offset_t addr,
	vm_size_t size,
	kmf_flags_t flags,
	kmem_guard_t guard)
{
	vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;

	assert(addr >= VM_MIN_KERNEL_AND_KEXT_ADDRESS);
	assert(map->pmap == kernel_pmap);

	if (flags & KMF_GUESS_SIZE) {
		vmr_flags |= VM_MAP_REMOVE_GUESS_SIZE;
		size = PAGE_SIZE;
	} else if (size == 0) {
		__kmem_invalid_size_panic(map, size, flags);
	} else {
		size = round_page(size);
	}

	return vm_map_remove_guard(map, addr, addr + size,
	           vmr_flags, guard).kmr_size;
}
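
/*
 * Usage sketch (hypothetical, mirroring the allocation example near
 * kmem_alloc_guard()): freeing an atomic allocation must present the
 * same guard that allocated it, which is how mismatched frees get
 * caught:
 *
 *	kmem_guard_t guard = {
 *		.kmg_atomic  = true,
 *		.kmg_tag     = VM_KERN_MEMORY_OSFMK,
 *		.kmg_context = 0x5a5a,
 *	};
 *
 *	kmem_free_guard(kernel_map, addr, size, KMF_NONE, guard);
 */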

__exported void
kmem_free_external(
	vm_map_t map,
	vm_offset_t addr,
	vm_size_t size);
void
kmem_free_external(
	vm_map_t map,
	vm_offset_t addr,
	vm_size_t size)
{
	if (size) {
		kmem_free(map, trunc_page(addr), size);
#if MACH_ASSERT
	} else {
		printf("kmem_free(map=%p, addr=%p) called with size=0, lr: %p\n",
		    map, (void *)addr, __builtin_return_address(0));
#endif
	}
}


#pragma mark kmem init

/*
 * The default percentage of memory that can be mlocked is scaled based on the total
 * amount of memory in the system. These percentages are calculated
 * offline and stored in this table. We index this table by
 * log2(max_mem) - VM_USER_WIREABLE_MIN_CONFIG. We clamp this index in the range
 * [0, sizeof(wire_limit_percents) / sizeof(vm_map_size_t))
 *
 * Note that these values were picked for mac.
 * If we ever have very large memory config arm devices, we may want to revisit
 * since the kernel overhead is smaller there due to the larger page size.
 */

/* Start scaling iff we're managing > 2^32 = 4GB of RAM. */
#define VM_USER_WIREABLE_MIN_CONFIG 32
#if CONFIG_JETSAM
/* Systems with jetsam can wire a bit more b/c the system can relieve wired
 * pressure.
 */
static vm_map_size_t wire_limit_percents[] =
{ 80, 80, 80, 80, 82, 85, 88, 91, 94, 97};
#else
static vm_map_size_t wire_limit_percents[] =
{ 70, 73, 76, 79, 82, 85, 88, 91, 94, 97};
#endif /* CONFIG_JETSAM */

/*
 * Sets the default global user wire limit which limits the amount of
 * memory that can be locked via mlock() based on the above algorithm.
 * This can be overridden via a sysctl.
 */
static void
kmem_set_user_wire_limits(void)
{
	uint64_t available_mem_log;
	uint64_t max_wire_percent;
	size_t wire_limit_percents_length = sizeof(wire_limit_percents) /
	    sizeof(vm_map_size_t);
	vm_map_size_t limit;
	uint64_t config_memsize = max_mem;
#if defined(XNU_TARGET_OS_OSX)
	config_memsize = max_mem_actual;
#endif /* defined(XNU_TARGET_OS_OSX) */

	available_mem_log = bit_floor(config_memsize);

	if (available_mem_log < VM_USER_WIREABLE_MIN_CONFIG) {
		available_mem_log = 0;
	} else {
		available_mem_log -= VM_USER_WIREABLE_MIN_CONFIG;
	}
	if (available_mem_log >= wire_limit_percents_length) {
		available_mem_log = wire_limit_percents_length - 1;
	}
	max_wire_percent = wire_limit_percents[available_mem_log];

	limit = config_memsize * max_wire_percent / 100;
	/* Cap the number of non lockable bytes at VM_NOT_USER_WIREABLE_MAX */
	if (config_memsize - limit > VM_NOT_USER_WIREABLE_MAX) {
		limit = config_memsize - VM_NOT_USER_WIREABLE_MAX;
	}

	vm_global_user_wire_limit = limit;
	/* the default per task limit is the same as the global limit */
	vm_per_task_user_wire_limit = limit;
	vm_add_wire_count_over_global_limit = 0;
	vm_add_wire_count_over_user_limit = 0;
}
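
/*
 * Worked example (illustrative, assuming bit_floor() yields the index
 * of the most significant bit): on a 16GB configuration the log is 34,
 * the table index is 34 - 32 = 2, and the limit is 80% of memory with
 * CONFIG_JETSAM or 76% without, subject to the VM_NOT_USER_WIREABLE_MAX
 * cap on the non-wireable remainder.
 */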
1907
#define KMEM_MAX_CLAIMS 50
__startup_data
struct kmem_range_startup_spec kmem_claims[KMEM_MAX_CLAIMS] = {};
__startup_data
uint32_t kmem_claim_count = 0;

__startup_func
void
kmem_range_startup_init(
	struct kmem_range_startup_spec *sp)
{
	assert(kmem_claim_count < KMEM_MAX_CLAIMS - KMEM_RANGE_COUNT);
	if (sp->kc_calculate_sz) {
		sp->kc_size = (sp->kc_calculate_sz)();
	}
	if (sp->kc_size) {
		kmem_claims[kmem_claim_count] = *sp;
		kmem_claim_count++;
	}
}

#if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
static vm_offset_t
kmem_fuzz_start(void)
{
	vm_offset_t kmapoff_kaddr = 0;
	uint32_t kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */
	vm_map_size_t kmapoff_size = ptoa(kmapoff_pgcnt);

	kmem_alloc(kernel_map, &kmapoff_kaddr, kmapoff_size,
	    KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY,
	    VM_KERN_MEMORY_OSFMK);
	return kmapoff_kaddr + kmapoff_size;
}
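
/*
 * Example (illustrative): with 4K pages, kmapoff_pgcnt lands in
 * [1, 512], so the fuzz consumes between 4K and 2M of kernel VA,
 * about 1M on average.
 */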

/*
 * Returns a 16-bit random number between 0 and
 * upper_limit (inclusive).
 */
__startup_func
uint16_t
kmem_get_random16(
	uint16_t upper_limit)
{
	static uint64_t random_entropy;
	assert(upper_limit < UINT16_MAX);
	if (random_entropy == 0) {
		random_entropy = early_random();
	}
	uint32_t result = random_entropy & UINT32_MAX;
	random_entropy >>= 32;
	return (uint16_t)(result % (upper_limit + 1));
}
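
/*
 * Note: the modulo reduction above is slightly biased unless
 * (upper_limit + 1) evenly divides 2^32; for the small ranges used
 * during startup scrambling this bias is negligible.
 */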

/*
 * Generate a randomly shuffled array of indices from 0 to count - 1,
 * using an "inside-out" Fisher-Yates shuffle.
 */
__startup_func
void
kmem_shuffle(
	uint16_t *shuffle_buf,
	uint16_t count)
{
	for (uint16_t i = 0; i < count; i++) {
		uint16_t j = kmem_get_random16(i);
		if (j != i) {
			shuffle_buf[i] = shuffle_buf[j];
		}
		shuffle_buf[j] = i;
	}
}
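
/*
 * Example (illustrative):
 *
 *	uint16_t order[4];
 *	kmem_shuffle(order, 4);
 *	// order[] now holds a random permutation of { 0, 1, 2, 3 }
 */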

__startup_func
static void
kmem_shuffle_claims(void)
{
	uint16_t shuffle_buf[KMEM_MAX_CLAIMS] = {};
	uint16_t limit = (uint16_t)kmem_claim_count;

	kmem_shuffle(&shuffle_buf[0], limit);
	for (uint16_t i = 0; i < limit; i++) {
		struct kmem_range_startup_spec tmp = kmem_claims[i];
		kmem_claims[i] = kmem_claims[shuffle_buf[i]];
		kmem_claims[shuffle_buf[i]] = tmp;
	}
}

__startup_func
static void
kmem_readjust_ranges(
	uint32_t cur_idx)
{
	assert(cur_idx != 0);
	uint32_t j = cur_idx - 1, random;
	struct kmem_range_startup_spec sp = kmem_claims[cur_idx];
	struct mach_vm_range *sp_range = sp.kc_range;

	/*
	 * Find max index where restriction is met
	 */
	for (; j > 0; j--) {
		struct kmem_range_startup_spec spj = kmem_claims[j];
		vm_map_offset_t max_start = spj.kc_range->min_address;
		if (spj.kc_flags & KC_NO_MOVE) {
			panic("kmem_range_init: Can't scramble with multiple constraints");
		}
		if (max_start <= sp_range->min_address) {
			break;
		}
	}

	/*
	 * Pick a random index from 0 to max index and shift claims to the right
	 * to make room for restricted claim
	 */
	random = kmem_get_random16((uint16_t)j);
	assert(random <= j);

	sp_range->min_address = kmem_claims[random].kc_range->min_address;
	sp_range->max_address = sp_range->min_address + sp.kc_size;

	for (j = cur_idx - 1; j >= random && j != UINT32_MAX; j--) {
		struct kmem_range_startup_spec spj = kmem_claims[j];
		struct mach_vm_range *range = spj.kc_range;
		range->min_address += sp.kc_size;
		range->max_address += sp.kc_size;
		kmem_claims[j + 1] = spj;
	}

	sp.kc_flags = KC_NO_MOVE;
	kmem_claims[random] = sp;
}
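
/*
 * Example (illustrative): if the shuffled claims[0..3] occupy
 * consecutive VA and claims[3] is restricted to start at or below
 * claims[1]'s start, a random slot in [0, 1] is picked, the claims at
 * and after that slot slide up by claims[3].kc_size, and claims[3] is
 * pinned into the freed slot with KC_NO_MOVE.
 */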

__startup_func
static void
kmem_add_extra_claims(void)
{
	vm_map_size_t largest_free_size = 0, total_claims = 0;

	vm_map_sizes(kernel_map, NULL, NULL, &largest_free_size);
	largest_free_size = trunc_page(largest_free_size);

	/*
	 * Determine size of data and pointer kmem_ranges
	 */
	for (uint32_t i = 0; i < kmem_claim_count; i++) {
		total_claims += kmem_claims[i].kc_size;
	}
	assert((total_claims & PAGE_MASK) == 0);
	largest_free_size -= total_claims;

	/*
	 * Use half the total available VA for all pointer allocations (this
	 * includes the kmem_sprayqtn range). Given that we have 4 total
	 * ranges, divide the available VA by 8.
	 */
	ptr_range_size = sprayqtn_range_size = largest_free_size / 8;

	if (sprayqtn_range_size > (sane_size / 2)) {
		sprayqtn_range_size = sane_size / 2;
	}

	ptr_range_size = round_page(ptr_range_size);
	sprayqtn_range_size = round_page(sprayqtn_range_size);

	/*
	 * kasan and configs w/o *TRR need to have just one ptr range due to
	 * resource constraints.
	 */
#if !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
	kmem_ptr_ranges = 1;
	kmem_sprayqtn_range = 0;
	sprayqtn_range_size = 0;
#endif

	data_range_size = largest_free_size
	    - (ptr_range_size * kmem_ptr_ranges)
	    - sprayqtn_range_size;

	/*
	 * Add claims for kmem's ranges
	 */
	for (uint32_t i = 0; i < kmem_ptr_ranges; i++) {
		struct kmem_range_startup_spec kmem_spec_ptr = {
			.kc_name = "kmem_ptr_range",
			.kc_range = &kmem_ranges[KMEM_RANGE_ID_PTR_0 + i],
			.kc_size = ptr_range_size,
			.kc_flags = KC_NO_ENTRY,
		};
		kmem_claims[kmem_claim_count++] = kmem_spec_ptr;
	}

	if (kmem_sprayqtn_range) {
		struct kmem_range_startup_spec kmem_spec_sprayqtn = {
			.kc_name = "kmem_sprayqtn_range",
			.kc_range = &kmem_ranges[KMEM_RANGE_ID_SPRAYQTN],
			.kc_size = sprayqtn_range_size,
			.kc_flags = KC_NO_ENTRY,
		};
		kmem_claims[kmem_claim_count++] = kmem_spec_sprayqtn;
	}

	struct kmem_range_startup_spec kmem_spec_data = {
		.kc_name = "kmem_data_range",
		.kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA],
		.kc_size = data_range_size,
		.kc_flags = KC_NO_ENTRY,
	};
	kmem_claims[kmem_claim_count++] = kmem_spec_data;
}
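
/*
 * Sizing example (illustrative numbers): with 512G of free VA left
 * after subtracting registered claims, each pointer range and the
 * spray quarantine range would be sized at 512G / 8 = 64G (the
 * sprayqtn range further capped at sane_size / 2), and the data range
 * receives whatever VA remains.
 */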

__startup_func
static void
kmem_scramble_ranges(void)
{
	vm_map_offset_t start = 0;

	/*
	 * Initialize the KMEM_RANGE_ID_NONE range to use the entire map so
	 * that the vm can find the requested ranges.
	 */
	kmem_ranges[KMEM_RANGE_ID_NONE].min_address = MAX(kernel_map->min_offset,
	    VM_MAP_PAGE_SIZE(kernel_map));
	kmem_ranges[KMEM_RANGE_ID_NONE].max_address = kernel_map->max_offset;

	/*
	 * Allocate the g_kext_map prior to randomizing the remaining submaps,
	 * as this map is 2G in size and starts at the end of kernel_text on
	 * x86. It could overflow into the heap.
	 */
	kext_alloc_init();

	/*
	 * Eat a random amount of kernel_map to fuzz subsequent heap, zone and
	 * stack addresses. (With a 4K page and 9 bits of randomness, this
	 * eats about 2M of VA from the map)
	 *
	 * Note that we always need to slide by at least one page because the VM
	 * pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a base
	 * do not admit this address to be part of any zone submap.
	 */
	start = kmem_fuzz_start();

	/*
	 * Add claims for ptr and data kmem_ranges
	 */
	kmem_add_extra_claims();

	/*
	 * Shuffle registered claims
	 */
	assert(kmem_claim_count < UINT16_MAX);
	kmem_shuffle_claims();

	/*
	 * Apply restrictions and determine range for each claim
	 */
	for (uint32_t i = 0; i < kmem_claim_count; i++) {
		vm_map_offset_t end = 0;
		struct kmem_range_startup_spec sp = kmem_claims[i];
		struct mach_vm_range *sp_range = sp.kc_range;
		if (vm_map_locate_space(kernel_map, sp.kc_size, 0,
		    VM_MAP_KERNEL_FLAGS_NONE, &start, NULL) != KERN_SUCCESS) {
			panic("kmem_range_init: vm_map_locate_space failing for claim %s",
			    sp.kc_name);
		}

		end = start + sp.kc_size;
		/*
		 * Re-adjust ranges if restriction not met
		 */
		if (sp_range->min_address && start > sp_range->min_address) {
			kmem_readjust_ranges(i);
		} else {
			sp_range->min_address = start;
			sp_range->max_address = end;
		}
		start = end;
	}

	/*
	 * We have settled on the ranges, now create temporary entries for the
	 * claims
	 */
	for (uint32_t i = 0; i < kmem_claim_count; i++) {
		struct kmem_range_startup_spec sp = kmem_claims[i];
		vm_map_entry_t entry = NULL;
		if (sp.kc_flags & KC_NO_ENTRY) {
			continue;
		}
		if (vm_map_find_space(kernel_map, sp.kc_range->min_address, sp.kc_size, 0,
		    VM_MAP_KERNEL_FLAGS_NONE, &entry) != KERN_SUCCESS) {
			panic("kmem_range_init: vm_map_find_space failing for claim %s",
			    sp.kc_name);
		}
		vm_object_reference(kernel_object);
		VME_OBJECT_SET(entry, kernel_object, false, 0);
		VME_OFFSET_SET(entry, entry->vme_start);
		vm_map_unlock(kernel_map);
	}
	/*
	 * Now that we are done assigning all the ranges, reset
	 * kmem_ranges[KMEM_RANGE_ID_NONE]
	 */
	kmem_ranges[KMEM_RANGE_ID_NONE] = (struct mach_vm_range) {};

#if DEBUG || DEVELOPMENT
	for (uint32_t i = 0; i < kmem_claim_count; i++) {
		struct kmem_range_startup_spec sp = kmem_claims[i];

		printf("%-24s: %p - %p (%u%c)\n", sp.kc_name,
		    (void *)sp.kc_range->min_address,
		    (void *)sp.kc_range->max_address,
		    mach_vm_size_pretty(sp.kc_size),
		    mach_vm_size_unit(sp.kc_size));
	}
#endif /* DEBUG || DEVELOPMENT */
}

__startup_func
static void
kmem_range_init(void)
{
	vm_size_t range_adjustment;

	kmem_scramble_ranges();

	/* Initialize kmem_large_ranges.
	 * For pointer ranges : Skip 1/16th of range size on either side
	 * For sprayqtn range : Skip 1/8th only from left as we have a single front.
	 * For data range     : Skip 1/8th only from left as we have a single front.
	 * Permanent allocations use the right of the range for data and sprayqtn.
	 */
	range_adjustment = ptr_range_size >> 4;
	for (kmem_range_id_t i = 0; i < kmem_ptr_ranges; i++) {
		kmem_large_ranges[KMEM_RANGE_ID_PTR_0 + i].min_address =
		    kmem_ranges[KMEM_RANGE_ID_PTR_0 + i].min_address + range_adjustment;
		kmem_large_ranges[KMEM_RANGE_ID_PTR_0 + i].max_address =
		    kmem_ranges[KMEM_RANGE_ID_PTR_0 + i].max_address - range_adjustment;
	}

	range_adjustment = sprayqtn_range_size >> 3;
	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address =
	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address + range_adjustment;
	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address =
	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address;

	range_adjustment = data_range_size >> 3;
	kmem_large_ranges[KMEM_RANGE_ID_DATA].min_address =
	    kmem_ranges[KMEM_RANGE_ID_DATA].min_address + range_adjustment;
	kmem_large_ranges[KMEM_RANGE_ID_DATA].max_address =
	    kmem_ranges[KMEM_RANGE_ID_DATA].max_address;

	/*
	 * Redirect sprayqtn range to pointer range for configs that have
	 * kmem_sprayqtn_range disabled
	 */
	if (!kmem_sprayqtn_range) {
		kmem_ranges[KMEM_RANGE_ID_SPRAYQTN] = kmem_ranges[KMEM_RANGE_ID_PTR_0];
		kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN] =
		    kmem_large_ranges[KMEM_RANGE_ID_PTR_0];
	}

#if DEBUG || DEVELOPMENT
	for (kmem_range_id_t i = 1; i < KMEM_RANGE_COUNT; i++) {
		vm_size_t range_size = mach_vm_range_size(&kmem_large_ranges[i]);
		printf("kmem_large_ranges[%d] : %p - %p (%u%c)\n", i,
		    (void *)kmem_large_ranges[i].min_address,
		    (void *)kmem_large_ranges[i].max_address,
		    mach_vm_size_pretty(range_size),
		    mach_vm_size_unit(range_size));
	}
#endif /* DEBUG || DEVELOPMENT */
}
#else /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
__startup_func
static void
kmem_range_init(void)
{
	for (kmem_range_id_t i = 0; i < KMEM_RANGE_COUNT; i++) {
		kmem_ranges[i].min_address = kernel_map->min_offset;
		kmem_ranges[i].max_address = kernel_map->max_offset;
	}
	kext_alloc_init();
	kmem_fuzz_start();
}
#endif /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
STARTUP(KMEM, STARTUP_RANK_THIRD, kmem_range_init);

/*
 * kmem_init:
 *
 * Initialize the kernel's virtual memory map, taking
 * into account all memory allocated up to this time.
 */
__startup_func
void
kmem_init(
	vm_offset_t     start,
	vm_offset_t     end)
{
	vm_map_offset_t map_start;
	vm_map_offset_t map_end;
	vm_map_kernel_flags_t vmk_flags;

	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vmk_flags.vmkf_permanent = TRUE;
	vmk_flags.vmkf_no_pmap_check = TRUE;

	map_start = vm_map_trunc_page(start,
	    VM_MAP_PAGE_MASK(kernel_map));
	map_end = vm_map_round_page(end,
	    VM_MAP_PAGE_MASK(kernel_map));

	vm_map_will_allocate_early_map(&kernel_map);
#if defined(__arm64__)
	kernel_map = vm_map_create_options(pmap_kernel(),
	    VM_MIN_KERNEL_AND_KEXT_ADDRESS,
	    VM_MAX_KERNEL_ADDRESS,
	    VM_MAP_CREATE_DEFAULT);
	/*
	 * Reserve virtual memory allocated up to this time.
	 */
	{
		unsigned int    region_select = 0;
		vm_map_offset_t region_start;
		vm_map_size_t   region_size;
		vm_map_offset_t map_addr;
		kern_return_t   kr;

		while (pmap_virtual_region(region_select, &region_start, &region_size)) {
			map_addr = region_start;
			kr = vm_map_enter(kernel_map, &map_addr,
			    vm_map_round_page(region_size,
			    VM_MAP_PAGE_MASK(kernel_map)),
			    (vm_map_offset_t) 0,
			    VM_FLAGS_FIXED,
			    vmk_flags,
			    VM_KERN_MEMORY_NONE,
			    VM_OBJECT_NULL,
			    (vm_object_offset_t) 0, FALSE, VM_PROT_NONE, VM_PROT_NONE,
			    VM_INHERIT_DEFAULT);

			if (kr != KERN_SUCCESS) {
				panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
				    (uint64_t) start, (uint64_t) end, (uint64_t) region_start,
				    (uint64_t) region_size, kr);
			}

			region_select++;
		}
	}
#else
	kernel_map = vm_map_create_options(pmap_kernel(),
	    VM_MIN_KERNEL_AND_KEXT_ADDRESS, map_end,
	    VM_MAP_CREATE_DEFAULT);
	/*
	 * Reserve virtual memory allocated up to this time.
	 */
	if (start != VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
		vm_map_offset_t map_addr;
		kern_return_t kr;

		vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
		vmk_flags.vmkf_no_pmap_check = TRUE;

		map_addr = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
		kr = vm_map_enter(kernel_map,
		    &map_addr,
		    (vm_map_size_t)(map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
		    (vm_map_offset_t) 0,
		    VM_FLAGS_FIXED,
		    vmk_flags,
		    VM_KERN_MEMORY_NONE,
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0, FALSE,
		    VM_PROT_NONE, VM_PROT_NONE,
		    VM_INHERIT_DEFAULT);

		if (kr != KERN_SUCCESS) {
			panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
			    (uint64_t) start, (uint64_t) end,
			    (uint64_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS,
			    (uint64_t) (map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
			    kr);
		}
	}
#endif

	kmem_set_user_wire_limits();
}


#pragma mark map copyio

/*
 * Routine: copyinmap
 * Purpose:
 *	Like copyin, except that fromaddr is an address
 *	in the specified VM map. This implementation
 *	is incomplete; it handles the current user map
 *	and the kernel map/submaps.
 */
kern_return_t
copyinmap(
	vm_map_t        map,
	vm_map_offset_t fromaddr,
	void            *todata,
	vm_size_t       length)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_t oldmap;

	if (vm_map_pmap(map) == pmap_kernel()) {
		/* assume a correct copy */
		memcpy(todata, CAST_DOWN(void *, fromaddr), length);
	} else if (current_map() == map) {
		if (copyin(fromaddr, todata, length) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_reference(map);
		oldmap = vm_map_switch(map);
		if (copyin(fromaddr, todata, length) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
		vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}
	return kr;
}
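
/*
 * Usage sketch (illustrative, not a call site in this file): reading
 * a 64-bit value out of a foreign map would look like
 *
 *	uint64_t value;
 *	kern_return_t kr = copyinmap(map, uaddr, &value, sizeof(value));
 *
 * where `map` and `uaddr` are assumed to be a valid map and a valid
 * address within it.
 */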

/*
 * Routine: copyoutmap
 * Purpose:
 *	Like copyout, except that toaddr is an address
 *	in the specified VM map.
 */
kern_return_t
copyoutmap(
	vm_map_t                map,
	void                    *fromdata,
	vm_map_address_t        toaddr,
	vm_size_t               length)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_t oldmap;

	if (vm_map_pmap(map) == pmap_kernel()) {
		/* assume a correct copy */
		memcpy(CAST_DOWN(void *, toaddr), fromdata, length);
	} else if (current_map() == map) {
		if (copyout(fromdata, toaddr, length) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_reference(map);
		oldmap = vm_map_switch(map);
		if (copyout(fromdata, toaddr, length) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
		vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}
	return kr;
}

/*
 * Routine: copyoutmap_atomic{32, 64}
 * Purpose:
 *	Like copyoutmap, except that the operation is atomic.
 *	Takes in value rather than *fromdata pointer.
 */
kern_return_t
copyoutmap_atomic32(
	vm_map_t                map,
	uint32_t                value,
	vm_map_address_t        toaddr)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_t oldmap;

	if (vm_map_pmap(map) == pmap_kernel()) {
		/* assume a correct toaddr */
		*(uint32_t *)toaddr = value;
	} else if (current_map() == map) {
		if (copyout_atomic32(value, toaddr) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_reference(map);
		oldmap = vm_map_switch(map);
		if (copyout_atomic32(value, toaddr) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
		vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}
	return kr;
}

kern_return_t
copyoutmap_atomic64(
	vm_map_t                map,
	uint64_t                value,
	vm_map_address_t        toaddr)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_t oldmap;

	if (vm_map_pmap(map) == pmap_kernel()) {
		/* assume a correct toaddr */
		*(uint64_t *)toaddr = value;
	} else if (current_map() == map) {
		if (copyout_atomic64(value, toaddr) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_reference(map);
		oldmap = vm_map_switch(map);
		if (copyout_atomic64(value, toaddr) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
		vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}
	return kr;
}


#pragma mark pointer obfuscation / packing

/*
 * The following two functions are to be used when exposing kernel
 * addresses to userspace via any of the various debug or info
 * facilities that exist. These are basically the same as VM_KERNEL_ADDRPERM()
 * and VM_KERNEL_UNSLIDE_OR_PERM() except they use a different random seed and
 * are exported to KEXTs.
 *
 * NOTE: USE THE MACRO VERSIONS OF THESE FUNCTIONS (in vm_param.h) FROM WITHIN THE KERNEL
 */

vm_offset_t
vm_kernel_addrhash_internal(vm_offset_t addr, uint64_t salt)
{
	assert(salt != 0);

	if (addr == 0) {
		return 0ul;
	}

	if (VM_KERNEL_IS_SLID(addr)) {
		return VM_KERNEL_UNSLIDE(addr);
	}

	vm_offset_t sha_digest[SHA256_DIGEST_LENGTH / sizeof(vm_offset_t)];
	SHA256_CTX sha_ctx;

	SHA256_Init(&sha_ctx);
	SHA256_Update(&sha_ctx, &salt, sizeof(salt));
	SHA256_Update(&sha_ctx, &addr, sizeof(addr));
	SHA256_Final(sha_digest, &sha_ctx);
	return sha_digest[0];
}

__exported vm_offset_t
vm_kernel_addrhash_external(vm_offset_t addr);
vm_offset_t
vm_kernel_addrhash_external(vm_offset_t addr)
{
	return vm_kernel_addrhash_internal(addr, vm_kernel_addrhash_salt_ext);
}

void
vm_kernel_addrhide(
	vm_offset_t addr,
	vm_offset_t *hide_addr)
{
	*hide_addr = VM_KERNEL_ADDRHIDE(addr);
}

/*
 * vm_kernel_addrperm_external:
 * vm_kernel_unslide_or_perm_external:
 *
 * Use these functions when exposing an address to userspace that could come
 * from either kernel text/data *or* the heap.
 */
void
vm_kernel_addrperm_external(
	vm_offset_t addr,
	vm_offset_t *perm_addr)
{
	if (VM_KERNEL_IS_SLID(addr)) {
		*perm_addr = VM_KERNEL_UNSLIDE(addr);
	} else if (VM_KERNEL_ADDRESS(addr)) {
		*perm_addr = addr + vm_kernel_addrperm_ext;
	} else {
		*perm_addr = addr;
	}
}

void
vm_kernel_unslide_or_perm_external(
	vm_offset_t addr,
	vm_offset_t *up_addr)
{
	vm_kernel_addrperm_external(addr, up_addr);
}

void
vm_packing_pointer_invalid(vm_offset_t ptr, vm_packing_params_t params)
{
	if (ptr & ((1ul << params.vmpp_shift) - 1)) {
		panic("pointer %p can't be packed: low %d bits aren't 0",
		    (void *)ptr, params.vmpp_shift);
	} else if (ptr <= params.vmpp_base) {
		panic("pointer %p can't be packed: below base %p",
		    (void *)ptr, (void *)params.vmpp_base);
	} else {
		panic("pointer %p can't be packed: maximum encodable pointer is %p",
		    (void *)ptr, (void *)vm_packing_max_packable(params));
	}
}

void
vm_packing_verify_range(
	const char *subsystem,
	vm_offset_t min_address,
	vm_offset_t max_address,
	vm_packing_params_t params)
{
	if (min_address > max_address) {
		panic("%s: %s range invalid min:%p > max:%p",
		    __func__, subsystem, (void *)min_address, (void *)max_address);
	}

	if (!params.vmpp_base_relative) {
		return;
	}

	if (min_address <= params.vmpp_base) {
		panic("%s: %s range invalid min:%p <= base:%p",
		    __func__, subsystem, (void *)min_address, (void *)params.vmpp_base);
	}

	if (max_address > vm_packing_max_packable(params)) {
		panic("%s: %s range invalid max:%p > max packable:%p",
		    __func__, subsystem, (void *)max_address,
		    (void *)vm_packing_max_packable(params));
	}
}

#pragma mark tests
#if DEBUG || DEVELOPMENT
#include <sys/errno.h>

static void
kmem_test_for_entry(
	vm_map_t        map,
	vm_offset_t     addr,
	void          (^block)(vm_map_entry_t))
{
	vm_map_entry_t entry;

	vm_map_lock(map);
	block(vm_map_lookup_entry(map, addr, &entry) ? entry : NULL);
	vm_map_unlock(map);
}

#define kmem_test_assert_map(map, pg, entries) ({ \
	assert3u((map)->size, ==, ptoa(pg)); \
	assert3u((map)->hdr.nentries, ==, entries); \
})

static bool
can_write_at(vm_offset_t offs, uint32_t page)
{
	static const int zero;

	return verify_write(&zero, (void *)(offs + ptoa(page) + 128), 1) == 0;
}
#define assert_writeable(offs, page) \
	assertf(can_write_at(offs, page), \
	    "can't write at %p + ptoa(%d)", (void *)offs, page)

#define assert_faults(offs, page) \
	assertf(!can_write_at(offs, page), \
	    "can write at %p + ptoa(%d)", (void *)offs, page)

#define peek(offs, page) \
	(*(uint32_t *)((offs) + ptoa(page)))

#define poke(offs, page, v) \
	(*(uint32_t *)((offs) + ptoa(page)) = (v))

__attribute__((noinline))
static void
kmem_alloc_basic_test(vm_map_t map)
{
	kmem_guard_t guard = {
		.kmg_tag = VM_KERN_MEMORY_DIAG,
	};
	vm_offset_t addr;

	/*
	 * Test wired basics:
	 * - KMA_KOBJECT
	 * - KMA_GUARD_FIRST, KMA_GUARD_LAST
	 * - allocation alignment
	 */
	addr = kmem_alloc_guard(map, ptoa(10), ptoa(2) - 1,
	    KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard).kmr_address;
	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | GF | GL)", map);
	assert3u((addr + PAGE_SIZE) % ptoa(2), ==, 0);
	kmem_test_assert_map(map, 10, 1);

	kmem_test_for_entry(map, addr, ^(vm_map_entry_t e){
		assertf(e, "unable to find address %p in map %p", (void *)addr, map);
		assert(e->vme_kernel_object);
		assert(!e->vme_atomic);
		assert3u(e->vme_start, <=, addr);
		assert3u(addr + ptoa(10), <=, e->vme_end);
	});

	assert_faults(addr, 0);
	for (int i = 1; i < 9; i++) {
		assert_writeable(addr, i);
	}
	assert_faults(addr, 9);

	kmem_free(map, addr, ptoa(10));
	kmem_test_assert_map(map, 0, 0);

	/*
	 * Test pageable basics.
	 */
	addr = kmem_alloc_guard(map, ptoa(10), 0,
	    KMA_PAGEABLE, guard).kmr_address;
	assertf(addr != 0ull, "kma(%p, 10p, 0, PG)", map);
	kmem_test_assert_map(map, 10, 1);

	for (int i = 0; i < 9; i++) {
		assert_faults(addr, i);
		poke(addr, i, 42);
		assert_writeable(addr, i);
	}

	kmem_free(map, addr, ptoa(10));
	kmem_test_assert_map(map, 0, 0);
}

__attribute__((noinline))
static void
kmem_realloc_basic_test(vm_map_t map, kmr_flags_t kind)
{
	kmem_guard_t guard = {
		.kmg_atomic  = !(kind & KMR_DATA),
		.kmg_tag     = VM_KERN_MEMORY_DIAG,
		.kmg_context = 0xefface,
	};
	vm_offset_t addr, newaddr;
	const int N = 10;

	/*
	 * This isn't something kmem_realloc_guard() _needs_ to do,
	 * we could conceive an implementation where it grows in place
	 * if there's space after it.
	 *
	 * However, this is what the implementation does today.
	 */
	bool realloc_growth_changes_address = true;
	bool GL = (kind & KMR_GUARD_LAST);

	/*
	 * Initial N page allocation
	 */
	addr = kmem_alloc_guard(map, ptoa(N), 0,
	    (kind & (KMA_KOBJECT | KMA_GUARD_LAST)) | KMA_ZERO,
	    guard).kmr_address;
	assert3u(addr, !=, 0);
	kmem_test_assert_map(map, N, 1);
	for (int pg = 0; pg < N - GL; pg++) {
		poke(addr, pg, 42 + pg);
	}
	for (int pg = N - GL; pg < N; pg++) {
		assert_faults(addr, pg);
	}


	/*
	 * Grow to N + 3 pages
	 */
	newaddr = kmem_realloc_guard(map, addr, ptoa(N), ptoa(N + 3),
	    kind | KMR_ZERO, guard).kmr_address;
	assert3u(newaddr, !=, 0);
	if (realloc_growth_changes_address) {
		assert3u(addr, !=, newaddr);
	}
	if ((kind & KMR_FREEOLD) || (addr == newaddr)) {
		kmem_test_assert_map(map, N + 3, 1);
	} else {
		kmem_test_assert_map(map, 2 * N + 3, 2);
	}
	for (int pg = 0; pg < N - GL; pg++) {
		assert3u(peek(newaddr, pg), ==, 42 + pg);
	}
	if ((kind & KMR_FREEOLD) == 0) {
		for (int pg = 0; pg < N - GL; pg++) {
			assert3u(peek(addr, pg), ==, 42 + pg);
		}
		/* check for true sharing between the old and new mappings */
		poke(addr + 16, 0, 1234);
		assert3u(peek(newaddr + 16, 0), ==, 1234);
		kmem_free_guard(map, addr, ptoa(N), KMF_NONE, guard);
		kmem_test_assert_map(map, N + 3, 1);
	}
	if (addr != newaddr) {
		for (int pg = 0; pg < N - GL; pg++) {
			assert_faults(addr, pg);
		}
	}
	for (int pg = N - GL; pg < N + 3 - GL; pg++) {
		assert3u(peek(newaddr, pg), ==, 0);
	}
	for (int pg = N + 3 - GL; pg < N + 3; pg++) {
		assert_faults(newaddr, pg);
	}
	addr = newaddr;


	/*
	 * Shrink to N - 2 pages
	 */
	newaddr = kmem_realloc_guard(map, addr, ptoa(N + 3), ptoa(N - 2),
	    kind | KMR_ZERO, guard).kmr_address;
	assert3u(map->size, ==, ptoa(N - 2));
	assert3u(newaddr, ==, addr);
	kmem_test_assert_map(map, N - 2, 1);

	for (int pg = 0; pg < N - 2 - GL; pg++) {
		assert3u(peek(addr, pg), ==, 42 + pg);
	}
	for (int pg = N - 2 - GL; pg < N + 3; pg++) {
		assert_faults(addr, pg);
	}

	kmem_free_guard(map, addr, ptoa(N - 2), KMF_NONE, guard);
	kmem_test_assert_map(map, 0, 0);
}

static int
kmem_basic_test(__unused int64_t in, int64_t *out)
{
	mach_vm_offset_t addr;
	vm_map_t map;

	printf("%s: test running\n", __func__);

	map = kmem_suballoc(kernel_map, &addr, 64U << 20,
	    VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
	    KMS_NOFAIL | KMS_DATA, VM_KERN_MEMORY_DIAG).kmr_submap;

	printf("%s: kmem_alloc ...\n", __func__);
	kmem_alloc_basic_test(map);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_FREEOLD) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_FREEOLD);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_NONE) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_NONE);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_LAST);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_GUARD_LAST) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_GUARD_LAST);
	printf("%s: PASS\n", __func__);

	/* using KMR_DATA signals to test the non atomic realloc path */
	printf("%s: kmem_realloc (KMR_DATA | KMR_FREEOLD) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_DATA | KMR_FREEOLD);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_DATA) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_DATA);
	printf("%s: PASS\n", __func__);

	kmem_free_guard(kernel_map, addr, 64U << 20, KMF_NONE, KMEM_GUARD_SUBMAP);
	vm_map_deallocate(map);

	printf("%s: test passed\n", __func__);
	*out = 1;
	return 0;
}
SYSCTL_TEST_REGISTER(kmem_basic, kmem_basic_test);
#endif /* DEBUG || DEVELOPMENT */
