/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 * File:    vm/vm_kern.c
 * Author:  Avadis Tevanian, Jr., Michael Wayne Young
 * Date:    1985
 *
 * Kernel memory management.
 */

#include <mach/kern_return.h>
#include <mach/vm_param.h>
#include <kern/assert.h>
#include <kern/thread.h>
#include <vm/vm_kern.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_compressor.h>
#include <vm/vm_pageout.h>
#include <vm/vm_init.h>
#include <vm/vm_fault.h>
#include <vm/vm_memtag.h>
#include <kern/misc_protos.h>
#include <vm/cpm.h>
#include <kern/ledger.h>
#include <kern/bits.h>
#include <kern/startup.h>

#include <string.h>

#include <libkern/OSDebug.h>
#include <libkern/crypto/sha2.h>
#include <libkern/section_keywords.h>
#include <sys/kdebug.h>
#include <sys/kdebug_triage.h>

#include <san/kasan.h>
#include <kern/kext_alloc.h>
#include <kern/backtrace.h>
#include <os/hash.h>
#include <kern/zalloc_internal.h>
#include <libkern/crypto/rand.h>

/*
 * Variables exported by this module.
 */

SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map;
SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_ranges[KMEM_RANGE_COUNT];
SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_large_ranges[KMEM_RANGE_COUNT];

static TUNABLE(uint32_t, kmem_ptr_ranges, "kmem_ptr_ranges",
    KMEM_RANGE_ID_NUM_PTR);
#define KMEM_GOBJ_THRESHOLD     (32ULL << 20)
#if DEBUG || DEVELOPMENT
#define KMEM_OUTLIER_LOG_SIZE   (16ULL << 10)
#define KMEM_OUTLIER_SIZE       0
#define KMEM_OUTLIER_ALIGN      1
btlog_t kmem_outlier_log;
#endif /* DEBUG || DEVELOPMENT */

__startup_data static vm_map_size_t data_range_size;
__startup_data static vm_map_size_t ptr_range_size;
__startup_data static vm_map_size_t sprayqtn_range_size;

#pragma mark helpers

__attribute__((overloadable))
__header_always_inline kmem_flags_t
ANYF(kma_flags_t flags)
{
    return (kmem_flags_t)flags;
}

__attribute__((overloadable))
__header_always_inline kmem_flags_t
ANYF(kmr_flags_t flags)
{
    return (kmem_flags_t)flags;
}

__attribute__((overloadable))
__header_always_inline kmem_flags_t
ANYF(kmf_flags_t flags)
{
    return (kmem_flags_t)flags;
}

__abortlike
static void
__kmem_invalid_size_panic(
    vm_map_t        map,
    vm_size_t       size,
    uint32_t        flags)
{
    panic("kmem(map=%p, flags=0x%x): invalid size %zd",
        map, flags, (size_t)size);
}

__abortlike
static void
__kmem_invalid_arguments_panic(
    const char      *what,
    vm_map_t        map,
    vm_address_t    address,
    vm_size_t       size,
    uint32_t        flags)
{
    panic("kmem_%s(map=%p, addr=%p, size=%zd, flags=0x%x): "
        "invalid arguments passed",
        what, map, (void *)address, (size_t)size, flags);
}

__abortlike
static void
__kmem_failed_panic(
    vm_map_t        map,
    vm_size_t       size,
    uint32_t        flags,
    kern_return_t   kr,
    const char      *what)
{
    panic("kmem_%s(%p, %zd, 0x%x): failed with %d",
        what, map, (size_t)size, flags, kr);
}

__abortlike
static void
__kmem_entry_not_found_panic(
    vm_map_t        map,
    vm_offset_t     addr)
{
    panic("kmem(map=%p) no entry found at %p", map, (void *)addr);
}

static inline vm_object_t
__kmem_object(kmem_flags_t flags)
{
    if (flags & KMEM_COMPRESSOR) {
        if (flags & KMEM_KOBJECT) {
            panic("both KMEM_KOBJECT and KMEM_COMPRESSOR specified");
        }
        return compressor_object;
    }
    if (!(flags & KMEM_KOBJECT)) {
        panic("KMEM_KOBJECT or KMEM_COMPRESSOR is required");
    }
    return kernel_object_default;
}

static inline pmap_mapping_type_t
__kmem_mapping_type(kmem_flags_t flags)
{
    if (flags & (KMEM_DATA | KMEM_COMPRESSOR)) {
        return PMAP_MAPPING_TYPE_DEFAULT;
    } else {
        return PMAP_MAPPING_TYPE_RESTRICTED;
    }
}

static inline vm_size_t
__kmem_guard_left(kmem_flags_t flags)
{
    return (flags & KMEM_GUARD_FIRST) ? PAGE_SIZE : 0;
}

static inline vm_size_t
__kmem_guard_right(kmem_flags_t flags)
{
    return (flags & KMEM_GUARD_LAST) ? PAGE_SIZE : 0;
}

static inline vm_size_t
__kmem_guard_size(kmem_flags_t flags)
{
    return __kmem_guard_left(flags) + __kmem_guard_right(flags);
}

__pure2
static inline vm_size_t
__kmem_entry_orig_size(vm_map_entry_t entry)
{
    vm_object_t object = VME_OBJECT(entry);

    if (entry->vme_kernel_object) {
        return entry->vme_end - entry->vme_start -
               entry->vme_object_or_delta;
    } else {
        return object->vo_size - object->vo_size_delta;
    }
}
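
/*
 * Illustration (assuming 4 KiB pages, with KASAN setting the delta as
 * done in kmem_alloc_guard_internal()): for an atomic kernel-object
 * entry, vme_object_or_delta holds the slop between the mapped span
 * and the requested size, so the original request is recovered as
 * span - delta.  For a 0x1840 byte request:
 *
 *     delta = (-0x1840 & PAGE_MASK) == 0x7c0
 *     (vme_end - vme_start) - delta == 0x2000 - 0x7c0 == 0x1840
 */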


#pragma mark kmem range methods

#if __arm64__
// <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
#define mach_vm_range_load(r, r_min, r_max) \
    asm("ldp %[rmin], %[rmax], [%[range]]" \
        : [rmin] "=r"(r_min), [rmax] "=r"(r_max) \
        : [range] "r"(r), "m"((r)->min_address), "m"((r)->max_address))
#else
#define mach_vm_range_load(r, rmin, rmax) \
    ({ rmin = (r)->min_address; rmax = (r)->max_address; })
#endif

__abortlike
static void
__mach_vm_range_overflow(
    mach_vm_offset_t    addr,
    mach_vm_offset_t    size)
{
    panic("invalid vm range: [0x%llx, 0x%llx + 0x%llx) wraps around",
        addr, addr, size);
}

__abortlike
static void
__mach_vm_range_invalid(
    mach_vm_offset_t    min_address,
    mach_vm_offset_t    max_address)
{
    panic("invalid vm range: [0x%llx, 0x%llx) wraps around",
        min_address, max_address);
}

__header_always_inline mach_vm_size_t
mach_vm_range_size(const struct mach_vm_range *r)
{
    mach_vm_offset_t rmin, rmax;

    mach_vm_range_load(r, rmin, rmax);
    return rmax - rmin;
}

__attribute__((overloadable))
__header_always_inline bool
mach_vm_range_contains(const struct mach_vm_range *r, mach_vm_offset_t addr)
{
    mach_vm_offset_t rmin, rmax;

#if CONFIG_KERNEL_TAGGING
    if (VM_KERNEL_ADDRESS(addr)) {
        addr = vm_memtag_canonicalize_address(addr);
    }
#endif /* CONFIG_KERNEL_TAGGING */

    /*
     * The `&` is not a typo: we really expect the check to pass,
     * so encourage the compiler to eagerly load and test without branches.
     */
    mach_vm_range_load(r, rmin, rmax);
    return (addr >= rmin) & (addr < rmax);
}

__attribute__((overloadable))
__header_always_inline bool
mach_vm_range_contains(
    const struct mach_vm_range *r,
    mach_vm_offset_t addr,
    mach_vm_offset_t size)
{
    mach_vm_offset_t rmin, rmax;

#if CONFIG_KERNEL_TAGGING
    if (VM_KERNEL_ADDRESS(addr)) {
        addr = vm_memtag_canonicalize_address(addr);
    }
#endif /* CONFIG_KERNEL_TAGGING */

    /*
     * The `&` is not a typo: we really expect the check to pass,
     * so encourage the compiler to eagerly load and test without branches.
     */
    mach_vm_range_load(r, rmin, rmax);
    return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
}
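
/*
 * Illustration: the three-way test above also rejects (addr, size)
 * pairs whose sum wraps around, without a separate overflow check.
 * With rmin == 0xFFFFFE0000000000 and addr == rmin, a size of
 * 2^64 - 1 makes addr + size == rmin - 1 (mod 2^64), which fails
 * the `addr + size >= rmin` term.
 */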

__attribute__((overloadable))
__header_always_inline bool
mach_vm_range_intersects(
    const struct mach_vm_range *r1,
    const struct mach_vm_range *r2)
{
    mach_vm_offset_t r1_min, r1_max;
    mach_vm_offset_t r2_min, r2_max;

    mach_vm_range_load(r1, r1_min, r1_max);
    r2_min = r2->min_address;
    r2_max = r2->max_address;

    if (r1_min > r1_max) {
        __mach_vm_range_invalid(r1_min, r1_max);
    }

    if (r2_min > r2_max) {
        __mach_vm_range_invalid(r2_min, r2_max);
    }

    return r1_max > r2_min && r1_min < r2_max;
}

__attribute__((overloadable))
__header_always_inline bool
mach_vm_range_intersects(
    const struct mach_vm_range *r1,
    mach_vm_offset_t addr,
    mach_vm_offset_t size)
{
    struct mach_vm_range r2;

    addr = VM_KERNEL_STRIP_UPTR(addr);
    r2.min_address = addr;
    if (os_add_overflow(addr, size, &r2.max_address)) {
        __mach_vm_range_overflow(addr, size);
    }

    return mach_vm_range_intersects(r1, &r2);
}

bool
kmem_range_id_contains(
    kmem_range_id_t range_id,
    vm_map_offset_t addr,
    vm_map_size_t   size)
{
    return mach_vm_range_contains(&kmem_ranges[range_id], addr, size);
}

__abortlike
static void
kmem_range_invalid_panic(
    kmem_range_id_t range_id,
    vm_map_offset_t addr,
    vm_map_size_t   size)
{
    const struct mach_vm_range *r = &kmem_ranges[range_id];
    mach_vm_offset_t rmin, rmax;

    mach_vm_range_load(r, rmin, rmax);
    if (addr + size < rmin) {
        panic("addr %p + size %llu overflows %p", (void *)addr, size,
            (void *)(addr + size));
    }
    panic("addr %p + size %llu doesn't fit in one range (id: %u min: %p max: %p)",
        (void *)addr, size, range_id, (void *)rmin, (void *)rmax);
}

/*
 * Return whether the entire allocation is contained in the given range.
 */
static bool
kmem_range_contains_fully(
    kmem_range_id_t range_id,
    vm_map_offset_t addr,
    vm_map_size_t   size)
{
    const struct mach_vm_range *r = &kmem_ranges[range_id];
    mach_vm_offset_t rmin, rmax;
    bool result = false;

    if (VM_KERNEL_ADDRESS(addr)) {
        addr = vm_memtag_canonicalize_address(addr);
    }

    /*
     * The `&` is not a typo: we really expect the check to pass,
     * so encourage the compiler to eagerly load and test without branches.
     */
    mach_vm_range_load(r, rmin, rmax);
    result = (addr >= rmin) & (addr < rmax);
    if (__improbable(result
        && ((addr + size < rmin) || (addr + size > rmax)))) {
        kmem_range_invalid_panic(range_id, addr, size);
    }
    return result;
}

vm_map_size_t
kmem_range_id_size(kmem_range_id_t range_id)
{
    return mach_vm_range_size(&kmem_ranges[range_id]);
}

kmem_range_id_t
kmem_addr_get_range(vm_map_offset_t addr, vm_map_size_t size)
{
    kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;

    for (; range_id < KMEM_RANGE_COUNT; range_id++) {
        if (kmem_range_contains_fully(range_id, addr, size)) {
            return range_id;
        }
    }
    return KMEM_RANGE_ID_NONE;
}

bool
kmem_is_ptr_range(vm_map_range_id_t range_id)
{
    return (range_id >= KMEM_RANGE_ID_FIRST) &&
           (range_id <= KMEM_RANGE_ID_NUM_PTR);
}

__abortlike
static void
kmem_range_invalid_for_overwrite(vm_map_offset_t addr)
{
    panic("Can't overwrite mappings (addr: %p) in kmem ptr ranges",
        (void *)addr);
}

mach_vm_range_t
kmem_validate_range_for_overwrite(
    vm_map_offset_t addr,
    vm_map_size_t   size)
{
    vm_map_range_id_t range_id = kmem_addr_get_range(addr, size);

    if (kmem_is_ptr_range(range_id)) {
        kmem_range_invalid_for_overwrite(addr);
    }

    return &kmem_ranges[range_id];
}


#pragma mark entry parameters


__abortlike
static void
__kmem_entry_validate_panic(
    vm_map_t        map,
    vm_map_entry_t  entry,
    vm_offset_t     addr,
    vm_size_t       size,
    uint32_t        flags,
    kmem_guard_t    guard)
{
    const char *what = "???";

    if (entry->vme_atomic != guard.kmg_atomic) {
        what = "atomicity";
    } else if (entry->is_sub_map != guard.kmg_submap) {
        what = "objectness";
    } else if (addr != entry->vme_start) {
        what = "left bound";
    } else if ((flags & KMF_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
        what = "right bound";
    } else if (guard.kmg_context != entry->vme_context) {
        what = "guard";
    }

    panic("kmem(map=%p, addr=%p, size=%zd, flags=0x%x): "
        "entry:%p %s mismatch guard(0x%08x)",
        map, (void *)addr, size, flags, entry,
        what, guard.kmg_context);
}

static bool
__kmem_entry_validate_guard(
    vm_map_entry_t  entry,
    vm_offset_t     addr,
    vm_size_t       size,
    kmem_flags_t    flags,
    kmem_guard_t    guard)
{
    if (entry->vme_atomic != guard.kmg_atomic) {
        return false;
    }

    if (!guard.kmg_atomic) {
        return true;
    }

    if (entry->is_sub_map != guard.kmg_submap) {
        return false;
    }

    if (addr != entry->vme_start) {
        return false;
    }

    if ((flags & KMEM_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
        return false;
    }

    if (!guard.kmg_submap && guard.kmg_context != entry->vme_context) {
        return false;
    }

    return true;
}

void
kmem_entry_validate_guard(
    vm_map_t        map,
    vm_map_entry_t  entry,
    vm_offset_t     addr,
    vm_size_t       size,
    kmem_guard_t    guard)
{
    if (!__kmem_entry_validate_guard(entry, addr, size, KMEM_NONE, guard)) {
        __kmem_entry_validate_panic(map, entry, addr, size, KMEM_NONE, guard);
    }
}

__abortlike
static void
__kmem_entry_validate_object_panic(
    vm_map_t        map,
    vm_map_entry_t  entry,
    kmem_flags_t    flags)
{
    const char *what;
    const char *verb;

    if (entry->is_sub_map) {
        panic("kmem(map=%p) entry %p is a submap", map, entry);
    }

    if (flags & KMEM_KOBJECT) {
        what = "kernel";
        verb = "isn't";
    } else if (flags & KMEM_COMPRESSOR) {
        what = "compressor";
        verb = "isn't";
    } else if (entry->vme_kernel_object) {
        what = "kernel";
        verb = "is unexpectedly";
    } else {
        what = "compressor";
        verb = "is unexpectedly";
    }

    panic("kmem(map=%p, flags=0x%x): entry %p %s for the %s object",
        map, flags, entry, verb, what);
}

static bool
__kmem_entry_validate_object(
    vm_map_entry_t  entry,
    kmem_flags_t    flags)
{
    if (entry->is_sub_map) {
        return false;
    }
    if ((bool)(flags & KMEM_KOBJECT) != entry->vme_kernel_object) {
        return false;
    }

    return (bool)(flags & KMEM_COMPRESSOR) ==
           (VME_OBJECT(entry) == compressor_object);
}

vm_size_t
kmem_size_guard(
    vm_map_t        map,
    vm_offset_t     addr,
    kmem_guard_t    guard)
{
    kmem_flags_t flags = KMEM_GUESS_SIZE;
    vm_map_entry_t entry;
    vm_size_t size;

    vm_map_lock_read(map);

#if KASAN_CLASSIC
    addr -= PAGE_SIZE;
#endif /* KASAN_CLASSIC */
    addr = vm_memtag_canonicalize_address(addr);

    if (!vm_map_lookup_entry(map, addr, &entry)) {
        __kmem_entry_not_found_panic(map, addr);
    }

    if (!__kmem_entry_validate_guard(entry, addr, 0, flags, guard)) {
        __kmem_entry_validate_panic(map, entry, addr, 0, flags, guard);
    }

    size = __kmem_entry_orig_size(entry);

    vm_map_unlock_read(map);

    return size;
}
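
/*
 * Sketch of a hypothetical owner using the guard protocol (for
 * exposition only; the real callers live in kalloc/zalloc):
 *
 *     kmem_guard_t guard = {
 *         .kmg_atomic  = true,
 *         .kmg_tag     = VM_KERN_MEMORY_KALLOC,
 *         .kmg_context = ctx,    // must match the value used at allocation
 *     };
 *     vm_size_t size = kmem_size_guard(kernel_map, addr, guard);
 *     kmem_free_guard(kernel_map, addr, size, KMF_NONE, guard);
 *
 * Any mismatch (atomicity, bounds, context) panics in
 * __kmem_entry_validate_panic() instead of returning.
 */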

static inline uint16_t
kmem_hash_backtrace(
    void            *fp)
{
    uint64_t  bt_count;
    uintptr_t bt[8] = {};

    struct backtrace_control ctl = {
        .btc_frame_addr = (uintptr_t)fp,
    };

    bt_count = backtrace(bt, sizeof(bt) / sizeof(bt[0]), &ctl, NULL);
    return (uint16_t) os_hash_jenkins(bt, bt_count * sizeof(bt[0]));
}

static_assert(KMEM_RANGE_ID_DATA - 1 <= KMEM_RANGE_MASK,
    "Insufficient bits to represent ptr ranges");

kmem_range_id_t
kmem_adjust_range_id(
    uint32_t hash)
{
    return (kmem_range_id_t) (KMEM_RANGE_ID_PTR_0 +
           (hash & KMEM_RANGE_MASK) % kmem_ptr_ranges);
}
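
/*
 * Illustration (the exact mask values and number of pointer ranges
 * are configuration dependent): with four pointer ranges, a hash of
 * 0x2d05 selects
 *
 *     range_id   = KMEM_RANGE_ID_PTR_0 + (0x2d05 & KMEM_RANGE_MASK) % 4
 *     from_right = 0x2d05 & KMEM_DIRECTION_MASK
 *
 * so distinct allocation sites tend to land in different ranges and
 * grow from different ends, which kmem_apply_security_policy()
 * relies on below.
 */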

static bool
kmem_use_sprayqtn(
    kma_flags_t     kma_flags,
    vm_map_size_t   map_size,
    vm_offset_t     mask)
{
    /*
     * Pointer allocations that are above the guard objects threshold,
     * or that have leading guard pages with non-standard alignment
     * requests, are redirected to the sprayqtn range.
     */
#if DEBUG || DEVELOPMENT
    btref_get_flags_t flags = (kma_flags & KMA_NOPAGEWAIT) ?
        BTREF_GET_NOWAIT : 0;

    if ((kma_flags & KMA_SPRAYQTN) == 0) {
        if (map_size > KMEM_GOBJ_THRESHOLD) {
            btlog_record(kmem_outlier_log, (void *)map_size, KMEM_OUTLIER_SIZE,
                btref_get(__builtin_frame_address(0), flags));
        } else if ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK)) {
            btlog_record(kmem_outlier_log, (void *)mask, KMEM_OUTLIER_ALIGN,
                btref_get(__builtin_frame_address(0), flags));
        }
    }
#endif /* DEBUG || DEVELOPMENT */

    return (kma_flags & KMA_SPRAYQTN) ||
           (map_size > KMEM_GOBJ_THRESHOLD) ||
           ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK));
}

static void
kmem_apply_security_policy(
    vm_map_t                map,
    kma_flags_t             kma_flags,
    kmem_guard_t            guard,
    vm_map_size_t           map_size,
    vm_offset_t             mask,
    vm_map_kernel_flags_t   *vmk_flags,
    bool                    assert_dir __unused)
{
    kmem_range_id_t range_id;
    bool from_right;
    uint16_t type_hash = guard.kmg_type_hash;

    if (startup_phase < STARTUP_SUB_KMEM || map != kernel_map) {
        return;
    }

    /*
     * A non-zero type-hash must be passed by krealloc_type.
     */
#if (DEBUG || DEVELOPMENT)
    if (assert_dir && !(kma_flags & KMA_DATA)) {
        assert(type_hash != 0);
    }
#endif

    if (kma_flags & KMA_DATA) {
        range_id = KMEM_RANGE_ID_DATA;
        /*
         * As an optimization in KMA_DATA to avoid fragmentation,
         * allocate static carveouts at the end of the DATA range.
         */
        from_right = (bool)(kma_flags & KMA_PERMANENT);
    } else if (kmem_use_sprayqtn(kma_flags, map_size, mask)) {
        range_id = KMEM_RANGE_ID_SPRAYQTN;
        from_right = (bool)(kma_flags & KMA_PERMANENT);
    } else if (type_hash) {
        range_id = (kmem_range_id_t)(type_hash & KMEM_RANGE_MASK);
        from_right = type_hash & KMEM_DIRECTION_MASK;
    } else {
        /*
         * The range id needs to correspond to one of the PTR ranges.
         */
        type_hash = (uint16_t) kmem_hash_backtrace(__builtin_frame_address(0));
        range_id = kmem_adjust_range_id(type_hash);
        from_right = type_hash & KMEM_DIRECTION_MASK;
    }

    vmk_flags->vmkf_range_id = range_id;
    vmk_flags->vmkf_last_free = from_right;
}
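
/*
 * Placement policy summary (an illustrative restatement of the
 * branches above):
 *
 *     KMA_DATA                      -> DATA range
 *                                      (KMA_PERMANENT from the right)
 *     KMA_SPRAYQTN, "large", or
 *     guard-first with a big mask   -> SPRAYQTN range
 *     non-zero kmg_type_hash        -> pointer range picked by the hash
 *     anything else                 -> pointer range picked by hashing
 *                                      the caller's backtrace
 */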

#pragma mark allocation

static kmem_return_t
kmem_alloc_guard_internal(
    vm_map_t        map,
    vm_size_t       size,
    vm_offset_t     mask,
    kma_flags_t     flags,
    kmem_guard_t    guard,
    kern_return_t (^alloc_pages)(vm_size_t, kma_flags_t, vm_page_t *))
{
    vm_object_t object;
    vm_offset_t delta = 0;
    vm_map_entry_t entry = NULL;
    vm_map_offset_t map_addr, fill_start;
    vm_map_size_t map_size, fill_size;
    vm_page_t guard_left = VM_PAGE_NULL;
    vm_page_t guard_right = VM_PAGE_NULL;
    vm_page_t wired_page_list = VM_PAGE_NULL;
    vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
    bool skip_guards;
    kmem_return_t kmr = { };

    assert(kernel_map && map->pmap == kernel_pmap);

#if DEBUG || DEVELOPMENT
    VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
        size, 0, 0, 0);
#endif

    if (size == 0 ||
        (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) ||
        (size < __kmem_guard_size(ANYF(flags)))) {
        __kmem_invalid_size_panic(map, size, flags);
    }

    /*
     * Limit the size of a single extent of wired memory to try
     * and limit the damage to the system if too many pages get
     * wired down.  The limit is raised to 2GB with a 128GB max
     * physical limit, but scaled by installed memory above that.
     *
     * Note: kmem_alloc_contig_guard() is immune to this check.
     */
    if (__improbable(!(flags & (KMA_VAONLY | KMA_PAGEABLE)) &&
        alloc_pages == NULL &&
        size > MAX(1ULL << 31, sane_size / 64))) {
        kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
        goto out_error;
    }

    /*
     * Guard pages:
     *
     * Guard pages are implemented as fictitious pages.
     *
     * However, some maps, and some objects are known
     * to manage their memory explicitly, and do not need
     * those to be materialized, which saves memory.
     *
     * By placing guard pages on either end of a stack,
     * they can help detect cases where a thread walks
     * off either end of its stack.
     *
     * They are allocated and set up here and attempts
     * to access those pages are trapped in vm_fault_page().
     *
     * The map_size we were passed may include extra space for
     * guard pages. fill_size represents the actual size to populate.
     * Similarly, fill_start indicates where the actual pages
     * will begin in the range.
     */

    map_size = round_page(size);
    fill_start = 0;
    fill_size = map_size - __kmem_guard_size(ANYF(flags));

#if KASAN_CLASSIC
    if (flags & KMA_KASAN_GUARD) {
        assert((flags & (KMA_GUARD_FIRST | KMA_GUARD_LAST)) == 0);
        flags |= KMA_GUARD_FIRST | KMA_GUARD_LAST;
        delta = ptoa(2);
        map_size += delta;
    }
#else
    (void)delta;
#endif /* KASAN_CLASSIC */

    skip_guards = (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) ||
        map->never_faults;

    if (flags & KMA_GUARD_FIRST) {
        vmk_flags.vmkf_guard_before = true;
        fill_start += PAGE_SIZE;
    }
    if ((flags & KMA_GUARD_FIRST) && !skip_guards) {
        guard_left = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
        if (__improbable(guard_left == VM_PAGE_NULL)) {
            kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
            goto out_error;
        }
    }
    if ((flags & KMA_GUARD_LAST) && !skip_guards) {
        guard_right = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
        if (__improbable(guard_right == VM_PAGE_NULL)) {
            kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
            goto out_error;
        }
    }

    if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
        if (alloc_pages) {
            kmr.kmr_return = alloc_pages(fill_size, flags,
                &wired_page_list);
        } else {
            kmr.kmr_return = vm_page_alloc_list(atop(fill_size), flags,
                &wired_page_list);
        }
        if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
            goto out_error;
        }
    }

    /*
     * Allocate a new object (if necessary).  We must do this before
     * locking the map, or risk deadlock with the default pager.
     */
    if (flags & KMA_KOBJECT) {
        object = kernel_object_default;
        vm_object_reference(object);
    } else if (flags & KMA_COMPRESSOR) {
        object = compressor_object;
        vm_object_reference(object);
    } else {
        object = vm_object_allocate(map_size);
        vm_object_set_size(object, map_size, size);
        /* stabilize the object to prevent shadowing */
        object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
        object->true_share = TRUE;
    }

    if (flags & KMA_LAST_FREE) {
        vmk_flags.vmkf_last_free = true;
    }
    if (flags & KMA_PERMANENT) {
        vmk_flags.vmf_permanent = true;
    }
    kmem_apply_security_policy(map, flags, guard, map_size, mask, &vmk_flags,
        false);

    kmr.kmr_return = vm_map_find_space(map, 0, map_size, mask,
        vmk_flags, &entry);
    if (__improbable(KERN_SUCCESS != kmr.kmr_return)) {
        vm_object_deallocate(object);
        goto out_error;
    }

    map_addr = entry->vme_start;
    VME_OBJECT_SET(entry, object, guard.kmg_atomic, guard.kmg_context);
    VME_ALIAS_SET(entry, guard.kmg_tag);
    if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
        VME_OFFSET_SET(entry, map_addr);
    }

#if KASAN
    if ((flags & KMA_KOBJECT) && guard.kmg_atomic) {
        entry->vme_object_or_delta = (-size & PAGE_MASK) + delta;
    }
#endif /* KASAN */

    if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE))) {
        entry->wired_count = 1;
        vme_btref_consider_and_set(entry, __builtin_frame_address(0));
    }

    if (guard_left || guard_right || wired_page_list) {
        vm_object_offset_t offset = 0ull;

        vm_object_lock(object);
        vm_map_unlock(map);

        if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
            offset = map_addr;
        }

        if (guard_left) {
            vm_page_insert(guard_left, object, offset);
            guard_left->vmp_busy = FALSE;
            guard_left = VM_PAGE_NULL;
        }

        if (guard_right) {
            vm_page_insert(guard_right, object,
                offset + fill_start + fill_size);
            guard_right->vmp_busy = FALSE;
            guard_right = VM_PAGE_NULL;
        }

        if (wired_page_list) {
            kernel_memory_populate_object_and_unlock(object,
                map_addr + fill_start, offset + fill_start, fill_size,
                wired_page_list, flags, guard.kmg_tag, VM_PROT_DEFAULT,
                __kmem_mapping_type(ANYF(flags)));
        } else {
            vm_object_unlock(object);
        }
    } else {
        vm_map_unlock(map);
    }

    /*
     * now that the pages are wired, we no longer have to fear coalescing
     */
    if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
        vm_map_simplify(map, map_addr);
    }

#if DEBUG || DEVELOPMENT
    VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
        atop(fill_size), 0, 0, 0);
#endif /* DEBUG || DEVELOPMENT */
    kmr.kmr_address = CAST_DOWN(vm_offset_t, map_addr);

#if KASAN
    if (flags & (KMA_KASAN_GUARD | KMA_PAGEABLE)) {
        /*
         * We need to allow the range for pageable memory,
         * or faulting will not be allowed.
         */
        kasan_notify_address(map_addr, map_size);
    }
#endif /* KASAN */
#if KASAN_CLASSIC
    if (flags & KMA_KASAN_GUARD) {
        kmr.kmr_address += PAGE_SIZE;
        kasan_alloc_large(kmr.kmr_address, size);
    }
#endif /* KASAN_CLASSIC */
#if CONFIG_KERNEL_TAGGING
    if (!(flags & KMA_VAONLY) && (flags & KMA_TAG)) {
        kmr.kmr_address = vm_memtag_assign_tag(kmr.kmr_address, size);
        vm_memtag_set_tag((vm_offset_t)kmr.kmr_address, size);
#if KASAN_TBI
        kasan_tbi_retag_unused_space((vm_offset_t)kmr.kmr_address, map_size, size);
#endif /* KASAN_TBI */
    }
#endif /* CONFIG_KERNEL_TAGGING */
    return kmr;

out_error:
    if (flags & KMA_NOFAIL) {
        __kmem_failed_panic(map, size, flags, kmr.kmr_return, "alloc");
    }
    if (guard_left) {
        guard_left->vmp_snext = wired_page_list;
        wired_page_list = guard_left;
    }
    if (guard_right) {
        guard_right->vmp_snext = wired_page_list;
        wired_page_list = guard_right;
    }
    if (wired_page_list) {
        vm_page_free_list(wired_page_list, FALSE);
    }

#if DEBUG || DEVELOPMENT
    VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
        0, 0, 0, 0);
#endif /* DEBUG || DEVELOPMENT */

    return kmr;
}

kmem_return_t
kmem_alloc_guard(
    vm_map_t        map,
    vm_size_t       size,
    vm_offset_t     mask,
    kma_flags_t     flags,
    kmem_guard_t    guard)
{
    return kmem_alloc_guard_internal(map, size, mask, flags, guard, NULL);
}
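
/*
 * Hypothetical caller sketch (exposition only; note that `size` must
 * cover the requested guard pages, see the size check in
 * kmem_alloc_guard_internal()):
 *
 *     kmem_return_t kmr;
 *
 *     kmr = kmem_alloc_guard(kernel_map, 4 * PAGE_SIZE, 0,
 *         KMA_GUARD_FIRST | KMA_GUARD_LAST, KMEM_GUARD_NONE);
 *     if (kmr.kmr_return == KERN_SUCCESS) {
 *         // two usable wired pages, bracketed by fictitious guard
 *         // pages; touching a guard traps in vm_fault_page()
 *     }
 */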

kmem_return_t
kmem_alloc_contig_guard(
    vm_map_t        map,
    vm_size_t       size,
    vm_offset_t     mask,
    ppnum_t         max_pnum,
    ppnum_t         pnum_mask,
    kma_flags_t     flags,
    kmem_guard_t    guard)
{
    __auto_type alloc_pages = ^(vm_size_t fill_size, kma_flags_t kma_flags, vm_page_t *pages) {
        return cpm_allocate(fill_size, pages, max_pnum, pnum_mask, FALSE, kma_flags);
    };

    return kmem_alloc_guard_internal(map, size, mask, flags, guard, alloc_pages);
}

kmem_return_t
kmem_suballoc(
    vm_map_t                parent,
    mach_vm_offset_t        *addr,
    vm_size_t               size,
    vm_map_create_options_t vmc_options,
    int                     vm_flags,
    kms_flags_t             flags,
    vm_tag_t                tag)
{
    vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
    vm_map_offset_t map_addr = 0;
    kmem_return_t kmr = { };
    vm_map_t map;

    assert(page_aligned(size));
    assert(parent->pmap == kernel_pmap);

    vm_map_kernel_flags_set_vmflags(&vmk_flags, vm_flags, tag);

    if (parent == kernel_map) {
        assert(vmk_flags.vmf_overwrite || (flags & KMS_DATA));
    }

    if (vmk_flags.vmf_fixed) {
        map_addr = trunc_page(*addr);
    }

    pmap_reference(vm_map_pmap(parent));
    map = vm_map_create_options(vm_map_pmap(parent), 0, size, vmc_options);

    /*
     * 1. vm_map_enter() will consume one ref on success.
     *
     * 2. make the entry atomic as kernel submaps should never be split.
     *
     * 3. instruct vm_map_enter() that it is a fresh submap
     *    that needs to be taught its bounds as it is inserted.
     */
    vm_map_reference(map);

    vmk_flags.vmkf_submap = true;
    if ((flags & KMS_DATA) == 0) {
        /* FIXME: IOKit submaps get fragmented and can't be atomic */
        vmk_flags.vmkf_submap_atomic = true;
    }
    vmk_flags.vmkf_submap_adjust = true;
    if (flags & KMS_LAST_FREE) {
        vmk_flags.vmkf_last_free = true;
    }
    if (flags & KMS_PERMANENT) {
        vmk_flags.vmf_permanent = true;
    }
    if (flags & KMS_DATA) {
        vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
    }

    kmr.kmr_return = vm_map_enter(parent, &map_addr, size, 0,
        vmk_flags, (vm_object_t)map, 0, FALSE,
        VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);

    if (kmr.kmr_return != KERN_SUCCESS) {
        if (flags & KMS_NOFAIL) {
            panic("kmem_suballoc(map=%p, size=%zd) failed with %d",
                parent, (size_t)size, kmr.kmr_return);
        }
        assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
        vm_map_deallocate(map);
        vm_map_deallocate(map); /* also removes ref to pmap */
        return kmr;
    }

    /*
     * For kmem_suballocs that register a claim and are assigned a range,
     * ensure that the exact same range is returned.
     */
    if (*addr != 0 && parent == kernel_map &&
        startup_phase > STARTUP_SUB_KMEM) {
        assert(CAST_DOWN(vm_offset_t, map_addr) == *addr);
    } else {
        *addr = map_addr;
    }

    kmr.kmr_submap = map;
    return kmr;
}
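
/*
 * Hypothetical caller sketch (exposition only):
 *
 *     mach_vm_offset_t addr = 0;
 *     kmem_return_t kmr;
 *
 *     kmr = kmem_suballoc(kernel_map, &addr, 64 << 20,
 *         VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
 *         KMS_PERMANENT | KMS_DATA, VM_KERN_MEMORY_IOKIT);
 *     if (kmr.kmr_return == KERN_SUCCESS) {
 *         // kmr.kmr_submap is the new submap, entered in
 *         // kernel_map at addr
 *     }
 */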

/*
 * kmem_alloc:
 *
 * Allocate wired-down memory in the kernel's address map
 * or a submap.  The memory is not zero-filled.
 */

__exported kern_return_t
kmem_alloc_external(
    vm_map_t        map,
    vm_offset_t     *addrp,
    vm_size_t       size);
kern_return_t
kmem_alloc_external(
    vm_map_t        map,
    vm_offset_t     *addrp,
    vm_size_t       size)
{
    if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
        return kmem_alloc(map, addrp, size, KMA_NONE, vm_tag_bt());
    }
    /* Maintain ABI compatibility: invalid sizes used to be allowed */
    return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
}


/*
 * kmem_alloc_kobject:
 *
 * Allocate wired-down memory in the kernel's address map
 * or a submap.  The memory is not zero-filled.
 *
 * The memory is allocated in the kernel_object.
 * It may not be copied with vm_map_copy, and
 * it may not be reallocated with kmem_realloc.
 */

__exported kern_return_t
kmem_alloc_kobject_external(
    vm_map_t        map,
    vm_offset_t     *addrp,
    vm_size_t       size);
kern_return_t
kmem_alloc_kobject_external(
    vm_map_t        map,
    vm_offset_t     *addrp,
    vm_size_t       size)
{
    if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
        return kmem_alloc(map, addrp, size, KMA_KOBJECT, vm_tag_bt());
    }
    /* Maintain ABI compatibility: invalid sizes used to be allowed */
    return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
}

/*
 * kmem_alloc_pageable:
 *
 * Allocate pageable memory in the kernel's address map.
 */

__exported kern_return_t
kmem_alloc_pageable_external(
    vm_map_t        map,
    vm_offset_t     *addrp,
    vm_size_t       size);
kern_return_t
kmem_alloc_pageable_external(
    vm_map_t        map,
    vm_offset_t     *addrp,
    vm_size_t       size)
{
    if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
        return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA, vm_tag_bt());
    }
    /* Maintain ABI compatibility: invalid sizes used to be allowed */
    return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
}


#pragma mark population

static void
kernel_memory_populate_pmap_enter(
    vm_object_t             object,
    vm_address_t            addr,
    vm_object_offset_t      offset,
    vm_page_t               mem,
    vm_prot_t               prot,
    int                     pe_flags,
    pmap_mapping_type_t     mapping_type)
{
    kern_return_t pe_result;
    int pe_options;

    if (VMP_ERROR_GET(mem)) {
        panic("VM page %p should not have an error", mem);
    }

    pe_options = PMAP_OPTIONS_NOWAIT;
    if (object->internal) {
        pe_options |= PMAP_OPTIONS_INTERNAL;
    }
    if (mem->vmp_reusable || object->all_reusable) {
        pe_options |= PMAP_OPTIONS_REUSABLE;
    }

    pe_result = pmap_enter_options(kernel_pmap, addr + offset,
        VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
        pe_flags, /* wired */ TRUE, pe_options, NULL, mapping_type);

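    /*
     * If the pmap could not satisfy the request under
     * PMAP_OPTIONS_NOWAIT, retry in blocking mode.  This drops the
     * object lock, which is why callers without a stable reference
     * take one first (see kernel_memory_populate_object_and_unlock()).
     */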
    if (pe_result == KERN_RESOURCE_SHORTAGE) {
        vm_object_unlock(object);

        pe_options &= ~PMAP_OPTIONS_NOWAIT;

        pe_result = pmap_enter_options(kernel_pmap, addr + offset,
            VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
            pe_flags, /* wired */ TRUE, pe_options, NULL, mapping_type);

        vm_object_lock(object);
    }

    assert(pe_result == KERN_SUCCESS);
}

void
kernel_memory_populate_object_and_unlock(
    vm_object_t             object, /* must be locked */
    vm_address_t            addr,
    vm_offset_t             offset,
    vm_size_t               size,
    vm_page_t               page_list,
    kma_flags_t             flags,
    vm_tag_t                tag,
    vm_prot_t               prot,
    pmap_mapping_type_t     mapping_type)
{
    vm_page_t mem;
    int pe_flags;
    bool gobbled_list = page_list && page_list->vmp_gobbled;

    assert(((flags & KMA_KOBJECT) != 0) == (is_kernel_object(object) != 0));
    assert3u((bool)(flags & KMA_COMPRESSOR), ==, object == compressor_object);
    if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
        assert3u(offset, ==, addr);
    } else {
        /*
         * kernel_memory_populate_pmap_enter() might drop the object
         * lock, and the caller might not own a reference anymore
         * and rely on holding the vm object lock for liveness.
         */
        vm_object_reference_locked(object);
    }

    if (flags & KMA_KSTACK) {
        pe_flags = VM_MEM_STACK;
    } else {
        pe_flags = 0;
    }


    for (vm_object_offset_t pg_offset = 0;
        pg_offset < size;
        pg_offset += PAGE_SIZE_64) {
        if (page_list == NULL) {
            panic("%s: page_list too short", __func__);
        }

        mem = page_list;
        page_list = mem->vmp_snext;
        mem->vmp_snext = NULL;

        assert(mem->vmp_wire_count == 0);
        assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
        assert(!mem->vmp_fictitious && !mem->vmp_private);

        if (flags & KMA_COMPRESSOR) {
            mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
            /*
             * Background processes doing I/O accounting can call
             * into the NVMe driver to do some work which results in
             * an allocation here, so we want to make sure
             * that the pages used by the compressor, regardless of
             * process context, are never on the special Q.
             */
            mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;

            vm_page_insert(mem, object, offset + pg_offset);
        } else {
            mem->vmp_q_state = VM_PAGE_IS_WIRED;
            mem->vmp_wire_count = 1;

            vm_page_insert_wired(mem, object, offset + pg_offset, tag);
        }

        mem->vmp_gobbled = false;
        mem->vmp_busy = false;
        mem->vmp_pmapped = true;
        mem->vmp_wpmapped = true;

        /*
         * Manual PMAP_ENTER_OPTIONS() with shortcuts
         * for the kernel and compressor objects.
         */
        kernel_memory_populate_pmap_enter(object, addr, pg_offset,
            mem, prot, pe_flags, mapping_type);

        if (flags & KMA_NOENCRYPT) {
            pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
        }
    }

    if (page_list) {
        panic("%s: page_list too long", __func__);
    }

    vm_object_unlock(object);
    if ((flags & (KMA_KOBJECT | KMA_COMPRESSOR)) == 0) {
        vm_object_deallocate(object);
    }

    /*
     * Update the accounting:
     * - the compressor "wired" pages don't really count as wired
     * - kmem_alloc_contig_guard() gives gobbled pages,
     *   which already count as wired but need to be ungobbled.
     */
    if (gobbled_list) {
        vm_page_lockspin_queues();
        if (flags & KMA_COMPRESSOR) {
            vm_page_wire_count -= atop(size);
        }
        vm_page_gobble_count -= atop(size);
        vm_page_unlock_queues();
    } else if ((flags & KMA_COMPRESSOR) == 0) {
        vm_page_lockspin_queues();
        vm_page_wire_count += atop(size);
        vm_page_unlock_queues();
    }

    if (flags & KMA_KOBJECT) {
        /* vm_page_insert_wired() handles regular objects already */
        vm_tag_update_size(tag, size, NULL);
    }

#if KASAN
    if (flags & KMA_COMPRESSOR) {
        kasan_notify_address_nopoison(addr, size);
    } else {
        kasan_notify_address(addr, size);
    }
#endif /* KASAN */
}


kern_return_t
kernel_memory_populate(
    vm_offset_t     addr,
    vm_size_t       size,
    kma_flags_t     flags,
    vm_tag_t        tag)
{
    kern_return_t kr = KERN_SUCCESS;
    vm_page_t page_list = NULL;
    vm_size_t page_count = atop_64(size);
    vm_object_t object = __kmem_object(ANYF(flags));

#if DEBUG || DEVELOPMENT
    VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
        size, 0, 0, 0);
#endif /* DEBUG || DEVELOPMENT */

    kr = vm_page_alloc_list(page_count, flags, &page_list);
    if (kr == KERN_SUCCESS) {
        vm_object_lock(object);
        kernel_memory_populate_object_and_unlock(object, addr,
            addr, size, page_list, flags, tag, VM_PROT_DEFAULT,
            __kmem_mapping_type(ANYF(flags)));
    }

#if DEBUG || DEVELOPMENT
    VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
        page_count, 0, 0, 0);
#endif /* DEBUG || DEVELOPMENT */
    return kr;
}
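
/*
 * Sketch of the expected populate/depopulate pairing for the
 * compressor (exposition only; the real callers live in the
 * compressor code):
 *
 *     kr = kernel_memory_populate(seg_addr, seg_size,
 *         KMA_COMPRESSOR | KMA_NOPAGEWAIT, VM_KERN_MEMORY_COMPRESSOR);
 *     ...
 *     kernel_memory_depopulate(seg_addr, seg_size,
 *         KMA_COMPRESSOR, VM_KERN_MEMORY_COMPRESSOR);
 *
 * Both calls must agree on KMA_COMPRESSOR vs. KMA_KOBJECT so that
 * __kmem_object() resolves to the same backing object.
 */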

void
kernel_memory_depopulate(
    vm_offset_t     addr,
    vm_size_t       size,
    kma_flags_t     flags,
    vm_tag_t        tag)
{
    vm_object_t object = __kmem_object(ANYF(flags));
    vm_object_offset_t offset = addr;
    vm_page_t mem;
    vm_page_t local_freeq = NULL;
    unsigned int pages_unwired = 0;

    vm_object_lock(object);

    pmap_protect(kernel_pmap, offset, offset + size, VM_PROT_NONE);

    for (vm_object_offset_t pg_offset = 0;
        pg_offset < size;
        pg_offset += PAGE_SIZE_64) {
        mem = vm_page_lookup(object, offset + pg_offset);

        assert(mem);

        if (flags & KMA_COMPRESSOR) {
            assert(mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
        } else {
            assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
            pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
            pages_unwired++;
        }

        mem->vmp_busy = TRUE;

        assert(mem->vmp_tabled);
        vm_page_remove(mem, TRUE);
        assert(mem->vmp_busy);

        assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);

        mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
        mem->vmp_snext = local_freeq;
        local_freeq = mem;
    }

    vm_object_unlock(object);

    vm_page_free_list(local_freeq, TRUE);

    if (!(flags & KMA_COMPRESSOR)) {
        vm_page_lockspin_queues();
        vm_page_wire_count -= pages_unwired;
        vm_page_unlock_queues();
    }

    if (flags & KMA_KOBJECT) {
        /* vm_page_remove() handles regular objects already */
        vm_tag_update_size(tag, -ptoa_64(pages_unwired), NULL);
    }
}

#pragma mark reallocation

__abortlike
static void
__kmem_realloc_invalid_object_size_panic(
    vm_map_t        map,
    vm_address_t    address,
    vm_size_t       size,
    vm_map_entry_t  entry)
{
    vm_object_t object = VME_OBJECT(entry);
    vm_size_t objsize = __kmem_entry_orig_size(entry);

    panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
        "object %p has unexpected size %ld",
        map, (void *)address, (size_t)size, entry, object, objsize);
}

__abortlike
static void
__kmem_realloc_invalid_pager_panic(
    vm_map_t        map,
    vm_address_t    address,
    vm_size_t       size,
    vm_map_entry_t  entry)
{
    vm_object_t object = VME_OBJECT(entry);
    memory_object_t pager = object->pager;
    bool pager_created = object->pager_created;
    bool pager_initialized = object->pager_initialized;
    bool pager_ready = object->pager_ready;

    panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
        "object %p has unexpected pager %p (%d,%d,%d)",
        map, (void *)address, (size_t)size, entry, object,
        pager, pager_created, pager_initialized, pager_ready);
}

static kmem_return_t
kmem_realloc_shrink_guard(
    vm_map_t        map,
    vm_offset_t     req_oldaddr,
    vm_size_t       req_oldsize,
    vm_size_t       req_newsize,
    kmr_flags_t     flags,
    kmem_guard_t    guard,
    vm_map_entry_t  entry)
{
    vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
    vm_object_t object;
    vm_offset_t delta = 0;
    kmem_return_t kmr;
    bool was_atomic;
    vm_size_t oldsize = round_page(req_oldsize);
    vm_size_t newsize = round_page(req_newsize);
    vm_address_t oldaddr = req_oldaddr;

#if KASAN_CLASSIC
    if (flags & KMR_KASAN_GUARD) {
        assert((flags & (KMR_GUARD_FIRST | KMR_GUARD_LAST)) == 0);
        flags |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
        oldaddr -= PAGE_SIZE;
        delta = ptoa(2);
        oldsize += delta;
        newsize += delta;
    }
#endif /* KASAN_CLASSIC */

    if (flags & KMR_TAG) {
        oldaddr = vm_memtag_canonicalize_address(req_oldaddr);
    }

    vm_map_lock_assert_exclusive(map);

    if ((flags & KMR_KOBJECT) == 0) {
        object = VME_OBJECT(entry);
        vm_object_reference(object);
    }

    /*
     * Shrinking an atomic entry starts with splitting it,
     * and removing the second half.
     */
    was_atomic = entry->vme_atomic;
    entry->vme_atomic = false;
    vm_map_clip_end(map, entry, entry->vme_start + newsize);
    entry->vme_atomic = was_atomic;

#if KASAN
    if (entry->vme_kernel_object && was_atomic) {
        entry->vme_object_or_delta = (-req_newsize & PAGE_MASK) + delta;
    }
#if KASAN_CLASSIC
    if (flags & KMR_KASAN_GUARD) {
        kasan_poison_range(oldaddr + newsize, oldsize - newsize,
            ASAN_VALID);
    }
#endif
#if KASAN_TBI
    if (flags & KMR_TAG) {
        kasan_tbi_mark_free_space(req_oldaddr + newsize, oldsize - newsize);
    }
#endif /* KASAN_TBI */
#endif /* KASAN */
    (void)vm_map_remove_and_unlock(map,
        oldaddr + newsize, oldaddr + oldsize,
        vmr_flags, KMEM_GUARD_NONE);


    /*
     * Lastly, if there are guard pages, deal with them.
     *
     * The kernel object just needs to depopulate,
     * regular objects require freeing the last page
     * and replacing it with a guard.
     */
    if (flags & KMR_KOBJECT) {
        if (flags & KMR_GUARD_LAST) {
            kernel_memory_depopulate(oldaddr + newsize - PAGE_SIZE,
                PAGE_SIZE, KMA_KOBJECT, guard.kmg_tag);
        }
    } else {
        vm_page_t guard_right = VM_PAGE_NULL;
        vm_offset_t remove_start = newsize;

        if (flags & KMR_GUARD_LAST) {
            if (!map->never_faults) {
                guard_right = vm_page_grab_guard(true);
            }
            remove_start -= PAGE_SIZE;
        }

        vm_object_lock(object);

        if (object->vo_size != oldsize) {
            __kmem_realloc_invalid_object_size_panic(map,
                req_oldaddr, req_oldsize + delta, entry);
        }
        vm_object_set_size(object, newsize, req_newsize);

        vm_object_page_remove(object, remove_start, oldsize);

        if (guard_right) {
            vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
            guard_right->vmp_busy = false;
        }
        vm_object_unlock(object);
        vm_object_deallocate(object);
    }

    kmr.kmr_address = req_oldaddr;
    kmr.kmr_return = 0;
#if KASAN_CLASSIC
    if (flags & KMR_KASAN_GUARD) {
        kasan_alloc_large(kmr.kmr_address, req_newsize);
    }
#endif /* KASAN_CLASSIC */
#if KASAN_TBI
    if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
        kmr.kmr_address = vm_memtag_assign_tag(kmr.kmr_address, req_newsize);
        vm_memtag_set_tag(kmr.kmr_address, req_newsize);
        kasan_tbi_retag_unused_space(kmr.kmr_address, newsize, req_newsize);
    }
#endif /* KASAN_TBI */

    return kmr;
}

kmem_return_t
kmem_realloc_guard(
    vm_map_t        map,
    vm_offset_t     req_oldaddr,
    vm_size_t       req_oldsize,
    vm_size_t       req_newsize,
    kmr_flags_t     flags,
    kmem_guard_t    guard)
{
    vm_object_t object;
    vm_size_t oldsize;
    vm_size_t newsize;
    vm_offset_t delta = 0;
    vm_map_offset_t oldaddr;
    vm_map_offset_t newaddr;
    vm_object_offset_t newoffs;
    vm_map_entry_t oldentry;
    vm_map_entry_t newentry;
    vm_page_t page_list = NULL;
    bool needs_wakeup = false;
    kmem_return_t kmr = { };
    unsigned int last_timestamp;
    vm_map_kernel_flags_t vmk_flags = {
        .vmkf_last_free = (bool)(flags & KMR_LAST_FREE),
    };

    assert(KMEM_REALLOC_FLAGS_VALID(flags));
    if (!guard.kmg_atomic && (flags & (KMR_DATA | KMR_KOBJECT)) != KMR_DATA) {
        __kmem_invalid_arguments_panic("realloc", map, req_oldaddr,
            req_oldsize, flags);
    }

    if (req_oldaddr == 0ul) {
        return kmem_alloc_guard(map, req_newsize, 0, (kma_flags_t)flags, guard);
    }

    if (req_newsize == 0ul) {
        kmem_free_guard(map, req_oldaddr, req_oldsize,
            (kmf_flags_t)flags, guard);
        return kmr;
    }

    if (req_newsize >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) {
        __kmem_invalid_size_panic(map, req_newsize, flags);
    }
    if (req_newsize < __kmem_guard_size(ANYF(flags))) {
        __kmem_invalid_size_panic(map, req_newsize, flags);
    }

    oldsize = round_page(req_oldsize);
    newsize = round_page(req_newsize);
    oldaddr = req_oldaddr;
#if KASAN_CLASSIC
    if (flags & KMR_KASAN_GUARD) {
        flags |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
        oldaddr -= PAGE_SIZE;
        delta = ptoa(2);
        oldsize += delta;
        newsize += delta;
    }
#endif /* KASAN_CLASSIC */
#if CONFIG_KERNEL_TAGGING
    if (flags & KMR_TAG) {
        vm_memtag_verify_tag(req_oldaddr);
        oldaddr = vm_memtag_canonicalize_address(req_oldaddr);
    }
#endif /* CONFIG_KERNEL_TAGGING */

#if !KASAN
    /*
     * If not on a KASAN variant and there is no difference in the
     * requested size, just return.
     *
     * Otherwise we want to validate the size and re-tag for KASAN_TBI.
     */
    if (oldsize == newsize) {
        kmr.kmr_address = req_oldaddr;
        return kmr;
    }
#endif /* !KASAN */

    /*
     * If we're growing the allocation,
     * then reserve the pages we'll need,
     * and find a spot for its new place.
     */
    if (oldsize < newsize) {
#if DEBUG || DEVELOPMENT
        VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
            VM_KERN_REQUEST, DBG_FUNC_START,
            newsize - oldsize, 0, 0, 0);
#endif /* DEBUG || DEVELOPMENT */
        kmr.kmr_return = vm_page_alloc_list(atop(newsize - oldsize),
            (kma_flags_t)flags, &page_list);
        if (kmr.kmr_return == KERN_SUCCESS) {
            kmem_apply_security_policy(map, (kma_flags_t)flags, guard,
                newsize, 0, &vmk_flags, true);
            kmr.kmr_return = vm_map_find_space(map, 0, newsize, 0,
                vmk_flags, &newentry);
        }
        if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
            if (flags & KMR_REALLOCF) {
                kmem_free_guard(map, req_oldaddr, req_oldsize,
                    KMF_NONE, guard);
            }
            if (page_list) {
                vm_page_free_list(page_list, FALSE);
            }
#if DEBUG || DEVELOPMENT
            VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
                VM_KERN_REQUEST, DBG_FUNC_END,
                0, 0, 0, 0);
#endif /* DEBUG || DEVELOPMENT */
            return kmr;
        }

        /* map is locked */
    } else {
        vm_map_lock(map);
    }


    /*
     * Locate the entry:
     * - wait for it to quiesce,
     * - validate its guard,
     * - learn its correct tag.
     */
again:
    if (!vm_map_lookup_entry(map, oldaddr, &oldentry)) {
        __kmem_entry_not_found_panic(map, req_oldaddr);
    }
    if ((flags & KMR_KOBJECT) && oldentry->in_transition) {
        oldentry->needs_wakeup = true;
        vm_map_entry_wait(map, THREAD_UNINT);
        goto again;
    }
    kmem_entry_validate_guard(map, oldentry, oldaddr, oldsize, guard);
    if (!__kmem_entry_validate_object(oldentry, ANYF(flags))) {
        __kmem_entry_validate_object_panic(map, oldentry, ANYF(flags));
    }
    /*
     * TODO: We should validate for non atomic entries that the range
     *       we are acting on is what we expect here.
     */
#if KASAN
    if (__kmem_entry_orig_size(oldentry) != req_oldsize) {
        __kmem_realloc_invalid_object_size_panic(map,
            req_oldaddr, req_oldsize + delta, oldentry);
    }

    if (oldsize == newsize) {
        kmr.kmr_address = req_oldaddr;
        if (oldentry->vme_kernel_object) {
            oldentry->vme_object_or_delta = delta +
                (-req_newsize & PAGE_MASK);
        } else {
            object = VME_OBJECT(oldentry);
            vm_object_lock(object);
            vm_object_set_size(object, newsize, req_newsize);
            vm_object_unlock(object);
        }
        vm_map_unlock(map);

#if KASAN_CLASSIC
        if (flags & KMR_KASAN_GUARD) {
            kasan_alloc_large(kmr.kmr_address, req_newsize);
        }
#endif /* KASAN_CLASSIC */
#if KASAN_TBI
        if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
            kmr.kmr_address = vm_memtag_assign_tag(kmr.kmr_address, req_newsize);
            vm_memtag_set_tag(kmr.kmr_address, req_newsize);
            kasan_tbi_retag_unused_space(kmr.kmr_address, newsize, req_newsize);
        }
#endif /* KASAN_TBI */
        return kmr;
    }
#endif /* KASAN */

    guard.kmg_tag = VME_ALIAS(oldentry);

    if (newsize < oldsize) {
        return kmem_realloc_shrink_guard(map, req_oldaddr,
            req_oldsize, req_newsize, flags, guard, oldentry);
    }


    /*
     * We are growing the entry.
     *
     * For regular objects we use the object `vo_size` updates
     * as a guarantee that no two kmem_realloc() calls can happen
     * concurrently (by doing it before the map is unlocked).
     *
     * For the kernel object, prevent the entry from being
     * reallocated or changed by marking it "in_transition".
     */

    object = VME_OBJECT(oldentry);
    vm_object_lock(object);
    vm_object_reference_locked(object);

    newaddr = newentry->vme_start;
    newoffs = oldsize;

    VME_OBJECT_SET(newentry, object, guard.kmg_atomic, guard.kmg_context);
    VME_ALIAS_SET(newentry, guard.kmg_tag);
    if (flags & KMR_KOBJECT) {
        oldentry->in_transition = true;
        VME_OFFSET_SET(newentry, newaddr);
        newentry->wired_count = 1;
        vme_btref_consider_and_set(newentry, __builtin_frame_address(0));
        newoffs = newaddr + oldsize;
    } else {
        if (object->pager_created || object->pager) {
            /*
             * We can't "realloc/grow" the pager, so pageable
             * allocations should not go through this path.
             */
            __kmem_realloc_invalid_pager_panic(map,
                req_oldaddr, req_oldsize + delta, oldentry);
        }
        if (object->vo_size != oldsize) {
            __kmem_realloc_invalid_object_size_panic(map,
                req_oldaddr, req_oldsize + delta, oldentry);
        }
        vm_object_set_size(object, newsize, req_newsize);
    }

    last_timestamp = map->timestamp;
    vm_map_unlock(map);


    /*
     * Now proceed with the population of pages.
     *
     * Kernel objects can use the kmem population helpers.
     *
     * Regular objects will insert pages manually,
     * then wire the memory into the new range.
     */

    vm_size_t guard_right_size = __kmem_guard_right(ANYF(flags));

    if (flags & KMR_KOBJECT) {
        pmap_mapping_type_t mapping_type = __kmem_mapping_type(ANYF(flags));

        pmap_protect(kernel_pmap,
            oldaddr, oldaddr + oldsize - guard_right_size,
            VM_PROT_NONE);

        for (vm_object_offset_t offset = 0;
            offset < oldsize - guard_right_size;
            offset += PAGE_SIZE_64) {
            vm_page_t mem;

            mem = vm_page_lookup(object, oldaddr + offset);
            if (mem == VM_PAGE_NULL) {
                continue;
            }

            pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));

            mem->vmp_busy = true;
            vm_page_remove(mem, true);
            vm_page_insert_wired(mem, object, newaddr + offset,
                guard.kmg_tag);
            mem->vmp_busy = false;

            kernel_memory_populate_pmap_enter(object, newaddr,
                offset, mem, VM_PROT_DEFAULT, 0, mapping_type);
        }

        kernel_memory_populate_object_and_unlock(object,
            newaddr + oldsize - guard_right_size,
            newoffs - guard_right_size,
            newsize - oldsize,
            page_list, (kma_flags_t)flags,
            guard.kmg_tag, VM_PROT_DEFAULT, mapping_type);
    } else {
        vm_page_t guard_right = VM_PAGE_NULL;

        /*
         * Note: we are borrowing the new entry reference
         * on the object for the duration of this code,
         * which works because we keep the object locked
         * throughout.
         */
        if ((flags & KMR_GUARD_LAST) && !map->never_faults) {
            guard_right = vm_page_lookup(object, oldsize - PAGE_SIZE);
            assert(guard_right->vmp_fictitious);
            guard_right->vmp_busy = true;
            vm_page_remove(guard_right, true);
        }

        if (flags & KMR_FREEOLD) {
            /*
             * Freeing the old mapping will make
             * the old pages become pageable until
             * the new mapping makes them wired again.
             * Let's take an extra "wire_count" to
             * prevent any accidental "page out".
             * We'll have to undo that after wiring
             * the new mapping.
             */
            vm_object_reference_locked(object); /* keep object alive */
            for (vm_object_offset_t offset = 0;
                offset < oldsize - guard_right_size;
                offset += PAGE_SIZE_64) {
1971 vm_page_t mem;
1972
1973 mem = vm_page_lookup(object, offset);
1974 assert(mem != VM_PAGE_NULL);
1975 assert(VM_PAGE_WIRED(mem));
1976 assert(mem->vmp_wire_count >= 1);
1977 mem->vmp_wire_count++;
1978 }
1979 }
1980
1981 for (vm_object_offset_t offset = oldsize - guard_right_size;
1982 offset < newsize - guard_right_size;
1983 offset += PAGE_SIZE_64) {
1984 vm_page_t mem = page_list;
1985
1986 page_list = mem->vmp_snext;
1987 mem->vmp_snext = VM_PAGE_NULL;
1988 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
1989 assert(!VM_PAGE_PAGEABLE(mem));
1990
1991 vm_page_insert(mem, object, offset);
1992 mem->vmp_busy = false;
1993 }
1994
1995 if (guard_right) {
1996 vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
1997 guard_right->vmp_busy = false;
1998 }
1999
2000 vm_object_unlock(object);
2001 }
2002
2003 /*
2004 * Mark the entry as idle again,
2005 * and honor KMR_FREEOLD if needed.
2006 */
2007
2008 vm_map_lock(map);
2009 if (last_timestamp + 1 != map->timestamp &&
2010 !vm_map_lookup_entry(map, oldaddr, &oldentry)) {
2011 __kmem_entry_not_found_panic(map, req_oldaddr);
2012 }
2013
2014 if (flags & KMR_KOBJECT) {
2015 assert(oldentry->in_transition);
2016 oldentry->in_transition = false;
2017 if (oldentry->needs_wakeup) {
2018 needs_wakeup = true;
2019 oldentry->needs_wakeup = false;
2020 }
2021 }
2022
2023 if (flags & KMR_FREEOLD) {
2024 vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2025
2026 #if KASAN_CLASSIC
2027 if (flags & KMR_KASAN_GUARD) {
2028 kasan_poison_range(oldaddr, oldsize, ASAN_VALID);
2029 }
2030 #endif /* KASAN_CLASSIC */
2031 #if KASAN_TBI
2032 if (flags & KMR_TAG) {
2033 kasan_tbi_mark_free_space(req_oldaddr, oldsize);
2034 }
2035 #endif /* KASAN_TBI */
2036 if (flags & KMR_GUARD_LAST) {
2037 vmr_flags |= VM_MAP_REMOVE_NOKUNWIRE_LAST;
2038 }
2039 (void)vm_map_remove_and_unlock(map,
2040 oldaddr, oldaddr + oldsize,
2041 vmr_flags, guard);
2042 } else {
2043 vm_map_unlock(map);
2044 }
2045
2046 if ((flags & KMR_KOBJECT) == 0) {
2047 kern_return_t kr;
2048 /*
2049 * This must happen _after_ we do the KMR_FREEOLD,
2050 * because wiring the pages will call into the pmap,
2051 * and if the pages are typed XNU_KERNEL_RESTRICTED,
2052 * this would cause a second mapping of the page and panic.
2053 */
2054 kr = vm_map_wire_kernel(map, newaddr, newaddr + newsize,
2055 VM_PROT_DEFAULT, guard.kmg_tag, FALSE);
2056 assert(kr == KERN_SUCCESS);
2057
2058 if (flags & KMR_FREEOLD) {
2059 /*
2060 * Undo the extra "wiring" we made above
2061 * and release the extra reference we took
2062 * on the object.
2063 */
2064 vm_object_lock(object);
2065 for (vm_object_offset_t offset = 0;
2066 offset < oldsize - guard_right_size;
2067 offset += PAGE_SIZE_64) {
2068 vm_page_t mem;
2069
2070 mem = vm_page_lookup(object, offset);
2071 assert(mem != VM_PAGE_NULL);
2072 assert(VM_PAGE_WIRED(mem));
2073 assert(mem->vmp_wire_count >= 2);
2074 mem->vmp_wire_count--;
2075 assert(VM_PAGE_WIRED(mem));
2076 assert(mem->vmp_wire_count >= 1);
2077 }
2078 vm_object_unlock(object);
2079 vm_object_deallocate(object); /* release extra ref */
2080 }
2081 }
2082
2083 if (needs_wakeup) {
2084 vm_map_entry_wakeup(map);
2085 }
2086
2087 #if DEBUG || DEVELOPMENT
2088 VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
2089 atop(newsize - oldsize), 0, 0, 0);
2090 #endif /* DEBUG || DEVELOPMENT */
2091 kmr.kmr_address = newaddr;
2092
2093 #if KASAN
2094 kasan_notify_address(kmr.kmr_address, newsize);
2095 #endif /* KASAN */
2096 #if KASAN_CLASSIC
2097 if (flags & KMR_KASAN_GUARD) {
2098 kmr.kmr_address += PAGE_SIZE;
2099 kasan_alloc_large(kmr.kmr_address, req_newsize);
2100 }
2101 #endif /* KASAN_CLASSIC */
2102 #if KASAN_TBI
2103 if (flags & KMR_TAG) {
2104 kmr.kmr_address = vm_memtag_assign_tag(kmr.kmr_address, req_newsize);
2105 vm_memtag_set_tag(kmr.kmr_address, req_newsize);
2106 kasan_tbi_retag_unused_space(kmr.kmr_address, newsize, req_newsize);
2107 }
2108 #endif /* KASAN_TBI */
2109
2110 return kmr;
2111 }
2112
2113
2114 #pragma mark free
2115
2116 #if KASAN
2117
2118 __abortlike
2119 static void
2120 __kmem_free_invalid_object_size_panic(
2121 vm_map_t map,
2122 vm_address_t address,
2123 vm_size_t size,
2124 vm_map_entry_t entry)
2125 {
2126 vm_object_t object = VME_OBJECT(entry);
2127 vm_size_t objsize = __kmem_entry_orig_size(entry);
2128
2129 panic("kmem_free(map=%p, addr=%p, size=%zd, entry=%p): "
2130 "object %p has unexpected size %ld",
2131 map, (void *)address, (size_t)size, entry, object, objsize);
2132 }
2133
2134 #endif /* KASAN */
2135
2136 vm_size_t
2137 kmem_free_guard(
2138 vm_map_t map,
2139 vm_offset_t req_addr,
2140 vm_size_t req_size,
2141 kmf_flags_t flags,
2142 kmem_guard_t guard)
2143 {
2144 vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2145 vm_address_t addr = req_addr;
2146 vm_offset_t delta = 0;
2147 vm_size_t size;
2148 #if KASAN
2149 vm_map_entry_t entry;
2150 #endif /* KASAN */
2151
2152 assert(map->pmap == kernel_pmap);
2153
2154 #if KASAN_CLASSIC
2155 if (flags & KMF_KASAN_GUARD) {
2156 addr -= PAGE_SIZE;
2157 delta = ptoa(2);
2158 }
2159 #endif /* KASAN_CLASSIC */
2160 #if CONFIG_KERNEL_TAGGING
2161 if (flags & KMF_TAG) {
2162 vm_memtag_verify_tag(req_addr);
2163 addr = vm_memtag_canonicalize_address(req_addr);
2164 }
2165 #endif /* CONFIG_KERNEL_TAGGING */
2166
2167 if (flags & KMF_GUESS_SIZE) {
2168 vmr_flags |= VM_MAP_REMOVE_GUESS_SIZE;
2169 size = PAGE_SIZE;
2170 } else if (req_size == 0) {
2171 __kmem_invalid_size_panic(map, req_size, flags);
2172 } else {
2173 size = round_page(req_size) + delta;
2174 }
2175
2176 vm_map_lock(map);
2177
2178 #if KASAN
2179 if (!vm_map_lookup_entry(map, addr, &entry)) {
2180 __kmem_entry_not_found_panic(map, req_addr);
2181 }
2182 if (flags & KMF_GUESS_SIZE) {
2183 vmr_flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
2184 req_size = __kmem_entry_orig_size(entry);
2185 size = round_page(req_size + delta);
2186 } else if (guard.kmg_atomic && entry->vme_kernel_object &&
2187 __kmem_entry_orig_size(entry) != req_size) {
2188 /*
2189 * We can't make a strict check for regular
2190 * VM objects because it could be:
2191 *
2192 * - the kmem_free_guard() of a kmem_realloc_guard() without
2193 * KMR_FREEOLD, and in that case the object size won't match.
2194 *
2195 * - a submap, in which case there is no "orig size".
2196 */
2197 __kmem_free_invalid_object_size_panic(map,
2198 req_addr, req_size + delta, entry);
2199 }
2200 #endif /* KASAN */
2201 #if KASAN_CLASSIC
2202 if (flags & KMF_KASAN_GUARD) {
2203 kasan_poison_range(addr, size, ASAN_VALID);
2204 }
2205 #endif /* KASAN_CLASSIC */
2206 #if KASAN_TBI
2207 if (flags & KMF_TAG) {
2208 kasan_tbi_mark_free_space(req_addr, size);
2209 }
2210 #endif /* KASAN_TBI */
2211
2212 /*
2213 * vm_map_remove_and_unlock is called with VM_MAP_REMOVE_KUNWIRE, which
2214 * unwires the kernel mapping. The page won't be mapped any longer so
2215 * there is no extra step that is required for memory tagging to "clear"
2216 * it -- the page will be later laundered when reused.
2217 */
2218 return vm_map_remove_and_unlock(map, addr, addr + size,
2219 vmr_flags, guard).kmr_size - delta;
2220 }
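
/*
* Usage sketch (values are illustrative, not prescriptive):
*
*     kmem_guard_t guard = {
*         .kmg_tag = VM_KERN_MEMORY_OSFMK,
*     };
*     vm_size_t freed = kmem_free_guard(kernel_map, addr, size,
*         KMF_NONE, guard);
*
* The return value is the size actually removed from the map, which
* can differ from the request when KMF_GUESS_SIZE is passed.
*/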
2221
2222 __exported void
2223 kmem_free_external(
2224 vm_map_t map,
2225 vm_offset_t addr,
2226 vm_size_t size);
2227 void
2228 kmem_free_external(
2229 vm_map_t map,
2230 vm_offset_t addr,
2231 vm_size_t size)
2232 {
2233 if (size) {
2234 kmem_free(map, trunc_page(addr), size);
2235 #if MACH_ASSERT
2236 } else {
2237 printf("kmem_free(map=%p, addr=%p) called with size=0, lr: %p\n",
2238 map, (void *)addr, __builtin_return_address(0));
2239 #endif
2240 }
2241 }
2242
2243 #pragma mark kmem metadata
2244
2245 /*
2246 * Guard objects for kmem pointer allocation:
2247 *
2248 * Guard objects introduce size slabs to kmem pointer allocations, which are
2249 * allocated in chunks of n * sizeclass. When an allocation of a specific
2250 * sizeclass is requested, a random slot from [0, n) is returned.
2251 * Allocations are handed out from that chunk until only m slots are left.
2252 * The remaining m slots are referred to as guard objects: they never get
2253 * allocated, and the chunk is then considered full. When an allocation is
2254 * freed back to the chunk, one of the now m + 1 free slots becomes
2255 * available for the next allocation of that sizeclass.
2256 *
2257 * Guard objects are intended to make exploitation of use-after-frees harder,
2258 * as freed allocations can no longer be reliably reallocated.
2259 * They also make exploitation of OOBs harder, as overflowing out of an
2260 * allocation is no longer safe even with sufficient spraying.
2261 */
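
/*
* Illustrative example (the numbers are made up, not the tuned values):
* with n = 16 slots per chunk and m = KMEM_NUM_GUARDS = 2, a fresh chunk
* hands out slots in random order until only 2 remain; those guard slots
* are never handed out and the chunk is treated as full. Freeing one
* slot back yields m + 1 = 3 free slots, and the next allocation of that
* sizeclass picks one of those 3 at random.
*/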
2262
2263 #define KMEM_META_PRIMARY UINT8_MAX
2264 #define KMEM_META_START (UINT8_MAX - 1)
2265 #define KMEM_META_FREE (UINT8_MAX - 2)
2266 #if __ARM_16K_PG__
2267 #define KMEM_MIN_SIZE PAGE_SIZE
2268 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 16)
2269 #else /* __ARM_16K_PG__ */
2270 /*
2271 * PAGE_SIZE isn't a compile-time constant on some arm64 devices. Those
2272 * devices use a 4k page size when their RAM is <= 1GB and 16k otherwise.
2273 * Therefore populate sizeclasses starting at 4k for those devices.
2274 */
2275 #define KMEM_MIN_SIZE (4 * 1024)
2276 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 32)
2277 #endif /* __ARM_16K_PG__ */
2278 #define KMEM_MAX_SIZE (32ULL << 20)
2279 #define KMEM_START_IDX (kmem_log2down(KMEM_MIN_SIZE))
2280 #define KMEM_LAST_IDX (kmem_log2down(KMEM_MAX_SIZE))
2281 #define KMEM_NUM_SIZECLASS (KMEM_LAST_IDX - KMEM_START_IDX + 1)
2282 #define KMEM_FRONTS (KMEM_RANGE_ID_NUM_PTR * 2)
2283 #define KMEM_NUM_GUARDS 2
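
/*
* Worked example of the constants above, assuming the 4k case:
* KMEM_START_IDX = log2(4k) = 12 and KMEM_LAST_IDX = log2(32MB) = 25,
* so KMEM_NUM_SIZECLASS = 25 - 12 + 1 = 14 power-of-two sizeclasses
* (4k, 8k, ..., 32MB); with 16k pages it is 12 (16k .. 32MB).
*/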
2284
2285 typedef struct kmem_page_meta {
2286 union {
2287 /*
2288 * On primary allocated chunk with KMEM_META_PRIMARY marker
2289 */
2290 uint32_t km_bitmap;
2291 /*
2292 * On start and end of free chunk with KMEM_META_FREE marker
2293 */
2294 uint32_t km_free_chunks;
2295 };
2296 /*
2297 * KMEM_META_PRIMARY: Start meta of allocated chunk
2298 * KMEM_META_FREE : Start and end meta of free chunk
2299 * KMEM_META_START : Meta region start and end
2300 */
2301 uint8_t km_page_marker;
2302 uint8_t km_sizeclass;
2303 union {
2304 /*
2305 * On primary allocated chunk with KMEM_META_PRIMARY marker
2306 */
2307 uint16_t km_chunk_len;
2308 /*
2309 * On secondary allocated chunks
2310 */
2311 uint16_t km_page_idx;
2312 };
2313 LIST_ENTRY(kmem_page_meta) km_link;
2314 } kmem_page_meta_t;
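
/*
* Illustrative layout: a 3-chunk allocated run is described by 3
* consecutive metadata entries. The first is marked KMEM_META_PRIMARY
* with km_chunk_len = 3 and carries the live km_bitmap; the two
* secondary entries carry km_page_idx = 1 and 2, which lets
* kmem_addr_to_meta_start() walk back to the primary.
*/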
2315
2316 typedef LIST_HEAD(kmem_list_head, kmem_page_meta) kmem_list_head_t;
2317 struct kmem_sizeclass {
2318 vm_map_size_t ks_size;
2319 uint32_t ks_num_chunk;
2320 uint32_t ks_num_elem;
2321 crypto_random_ctx_t __zpercpu ks_rng_ctx;
2322 kmem_list_head_t ks_allfree_head[KMEM_FRONTS];
2323 kmem_list_head_t ks_partial_head[KMEM_FRONTS];
2324 kmem_list_head_t ks_full_head[KMEM_FRONTS];
2325 };
2326
2327 static struct kmem_sizeclass kmem_size_array[KMEM_NUM_SIZECLASS];
2328
2329 /*
2330 * Locks to synchronize metadata population
2331 */
2332 static LCK_GRP_DECLARE(kmem_locks_grp, "kmem_locks");
2333 static LCK_MTX_DECLARE(kmem_meta_region_lck, &kmem_locks_grp);
2334 #define kmem_meta_lock() lck_mtx_lock(&kmem_meta_region_lck)
2335 #define kmem_meta_unlock() lck_mtx_unlock(&kmem_meta_region_lck)
2336
2337 static SECURITY_READ_ONLY_LATE(struct mach_vm_range)
2338 kmem_meta_range[KMEM_RANGE_ID_NUM_PTR + 1];
2339 static SECURITY_READ_ONLY_LATE(struct kmem_page_meta *)
2340 kmem_meta_base[KMEM_RANGE_ID_NUM_PTR + 1];
2341 /*
2342 * Keeps track of metadata high water mark for each front
2343 */
2344 static struct kmem_page_meta *kmem_meta_hwm[KMEM_FRONTS];
2345 static SECURITY_READ_ONLY_LATE(vm_map_t)
2346 kmem_meta_map[KMEM_RANGE_ID_NUM_PTR + 1];
2347 static vm_map_size_t kmem_meta_size;
2348
2349 static uint32_t
2350 kmem_get_front(
2351 kmem_range_id_t range_id,
2352 bool from_right)
2353 {
2354 assert((range_id >= KMEM_RANGE_ID_FIRST) &&
2355 (range_id <= KMEM_RANGE_ID_NUM_PTR));
2356 return (range_id - KMEM_RANGE_ID_FIRST) * 2 + from_right;
2357 }
2358
2359 static inline uint32_t
2360 kmem_slot_idx_to_bit(
2361 uint32_t slot_idx,
2362 uint32_t size_idx __unused)
2363 {
2364 assert(slot_idx < kmem_size_array[size_idx].ks_num_elem);
2365 return 1ull << slot_idx;
2366 }
2367
2368 static uint32_t
2369 kmem_get_idx_from_size(vm_map_size_t size)
2370 {
2371 assert(size >= KMEM_MIN_SIZE && size <= KMEM_MAX_SIZE);
2372 return kmem_log2down(size - 1) - KMEM_START_IDX + 1;
2373 }
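
/*
* E.g. with KMEM_MIN_SIZE = 4k (KMEM_START_IDX = 12):
* kmem_get_idx_from_size(4096) == 0 and kmem_get_idx_from_size(8192) == 1,
* while an intermediate size such as 5000 also maps to 1, i.e. sizes
* round up to the next power-of-two sizeclass.
*/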
2374
2375 __abortlike
2376 static void
2377 kmem_invalid_size_idx(uint32_t idx)
2378 {
2379 panic("Invalid sizeclass idx %u", idx);
2380 }
2381
2382 static vm_map_size_t
2383 kmem_get_size_from_idx(uint32_t idx)
2384 {
2385 if (__improbable(idx >= KMEM_NUM_SIZECLASS)) {
2386 kmem_invalid_size_idx(idx);
2387 }
2388 return 1ul << (idx + KMEM_START_IDX);
2389 }
2390
2391 static inline uint16_t
2392 kmem_get_page_idx(struct kmem_page_meta *meta)
2393 {
2394 uint8_t page_marker = meta->km_page_marker;
2395
2396 return (page_marker == KMEM_META_PRIMARY) ? 0 : meta->km_page_idx;
2397 }
2398
2399 __abortlike
2400 static void
2401 kmem_invalid_chunk_len(struct kmem_page_meta *meta)
2402 {
2403 panic("Reading free chunks for meta %p where marker != KMEM_META_PRIMARY",
2404 meta);
2405 }
2406
2407 static inline uint16_t
2408 kmem_get_chunk_len(struct kmem_page_meta *meta)
2409 {
2410 if (__improbable(meta->km_page_marker != KMEM_META_PRIMARY)) {
2411 kmem_invalid_chunk_len(meta);
2412 }
2413
2414 return meta->km_chunk_len;
2415 }
2416
2417 __abortlike
2418 static void
2419 kmem_invalid_free_chunk_len(struct kmem_page_meta *meta)
2420 {
2421 panic("Reading free chunks for meta %p where marker != KMEM_META_FREE",
2422 meta);
2423 }
2424
2425 static inline uint32_t
2426 kmem_get_free_chunk_len(struct kmem_page_meta *meta)
2427 {
2428 if (__improbable(meta->km_page_marker != KMEM_META_FREE)) {
2429 kmem_invalid_free_chunk_len(meta);
2430 }
2431
2432 return meta->km_free_chunks;
2433 }
2434
2435 /*
2436 * Return the metadata corresponding to the specified address
2437 */
2438 static struct kmem_page_meta *
2439 kmem_addr_to_meta(
2440 vm_map_offset_t addr,
2441 vm_map_range_id_t range_id,
2442 vm_map_offset_t *range_start,
2443 uint64_t *meta_idx)
2444 {
2445 struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
2446
2447 *range_start = kmem_ranges[range_id].min_address;
2448 *meta_idx = (addr - *range_start) / KMEM_CHUNK_SIZE_MIN;
2449 return &meta_base[*meta_idx];
2450 }
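
/*
* E.g. an address KMEM_CHUNK_SIZE_MIN * 5 + 0x200 bytes into the range
* yields *meta_idx = 5, i.e. the 6th entry of kmem_meta_base[range_id];
* the offset within the chunk does not change the index, which is what
* kmem_addr_to_meta_start() below relies on.
*/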
2451
2452 /*
2453 * Return the metadata start of the chunk that the address belongs to
2454 */
2455 static struct kmem_page_meta *
2456 kmem_addr_to_meta_start(
2457 vm_address_t addr,
2458 vm_map_range_id_t range_id,
2459 vm_map_offset_t *chunk_start)
2460 {
2461 vm_map_offset_t range_start;
2462 uint64_t meta_idx;
2463 struct kmem_page_meta *meta;
2464
2465 meta = kmem_addr_to_meta(addr, range_id, &range_start, &meta_idx);
2466 meta_idx -= kmem_get_page_idx(meta);
2467 meta -= kmem_get_page_idx(meta);
2468 assert(meta->km_page_marker == KMEM_META_PRIMARY);
2469 *chunk_start = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN);
2470 return meta;
2471 }
2472
2473 __startup_func
2474 static void
2475 kmem_init_meta_front(
2476 struct kmem_page_meta *meta,
2477 kmem_range_id_t range_id,
2478 bool from_right)
2479 {
2480 kernel_memory_populate(trunc_page((vm_map_offset_t) meta), PAGE_SIZE,
2481 KMA_KOBJECT | KMA_ZERO | KMA_NOFAIL, VM_KERN_MEMORY_OSFMK);
2482 meta->km_page_marker = KMEM_META_START;
2483 if (!from_right) {
2484 meta++;
2485 kmem_meta_base[range_id] = meta;
2486 }
2487 kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta;
2488 }
2489
2490 __startup_func
2491 static void
2492 kmem_metadata_init(void)
2493 {
2494 for (kmem_range_id_t i = KMEM_RANGE_ID_FIRST; i <= kmem_ptr_ranges; i++) {
2495 vm_map_offset_t addr = kmem_meta_range[i].min_address;
2496 struct kmem_page_meta *meta;
2497 uint64_t meta_idx;
2498
2499 vm_map_will_allocate_early_map(&kmem_meta_map[i]);
2500 kmem_meta_map[i] = kmem_suballoc(kernel_map, &addr, kmem_meta_size,
2501 VM_MAP_CREATE_NEVER_FAULTS | VM_MAP_CREATE_DISABLE_HOLELIST,
2502 VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_PERMANENT | KMS_NOFAIL,
2503 VM_KERN_MEMORY_OSFMK).kmr_submap;
2504
2505 kmem_meta_range[i].min_address = addr;
2506 kmem_meta_range[i].max_address = addr + kmem_meta_size;
2507
2508 meta = (struct kmem_page_meta *) kmem_meta_range[i].min_address;
2509 kmem_init_meta_front(meta, i, 0);
2510
2511 meta = kmem_addr_to_meta(kmem_ranges[i].max_address, i, &addr,
2512 &meta_idx);
2513 kmem_init_meta_front(meta, i, 1);
2514 }
2515 }
2516
2517 __startup_func
2518 static void
2519 kmem_init_front_head(
2520 struct kmem_sizeclass *ks,
2521 uint32_t front)
2522 {
2523 LIST_INIT(&ks->ks_allfree_head[front]);
2524 LIST_INIT(&ks->ks_partial_head[front]);
2525 LIST_INIT(&ks->ks_full_head[front]);
2526 }
2527
2528 __startup_func
2529 static void
2530 kmem_sizeclass_init(void)
2531 {
2532 for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2533 struct kmem_sizeclass *ks = &kmem_size_array[i];
2534 kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
2535
2536 ks->ks_size = kmem_get_size_from_idx(i);
2537 ks->ks_num_chunk = roundup(8 * ks->ks_size, KMEM_CHUNK_SIZE_MIN) /
2538 KMEM_CHUNK_SIZE_MIN;
2539 ks->ks_num_elem = (ks->ks_num_chunk * KMEM_CHUNK_SIZE_MIN) / ks->ks_size;
2540 assert(ks->ks_num_elem <=
2541 (sizeof(((struct kmem_page_meta *)0)->km_bitmap) * 8));
2542 for (; range_id <= KMEM_RANGE_ID_NUM_PTR; range_id++) {
2543 kmem_init_front_head(ks, kmem_get_front(range_id, 0));
2544 kmem_init_front_head(ks, kmem_get_front(range_id, 1));
2545 }
2546 }
2547 }
2548
2549 /*
2550 * This is done during EARLY_BOOT as it needs the corecrypto module to be
2551 * set up.
2552 */
2553 __startup_func
2554 static void
2555 kmem_crypto_init(void)
2556 {
2557 vm_size_t ctx_size = crypto_random_kmem_ctx_size();
2558
2559 for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2560 struct kmem_sizeclass *ks = &kmem_size_array[i];
2561
2562 ks->ks_rng_ctx = zalloc_percpu_permanent(ctx_size, ZALIGN_PTR);
2563 zpercpu_foreach(ctx, ks->ks_rng_ctx) {
2564 crypto_random_kmem_init(ctx);
2565 }
2566 }
2567 }
2568 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, kmem_crypto_init);
2569
2570 __abortlike
2571 static void
2572 kmem_validate_slot_panic(
2573 vm_map_offset_t addr,
2574 struct kmem_page_meta *meta,
2575 uint32_t slot_idx,
2576 uint32_t size_idx)
2577 {
2578 if (meta->km_page_marker != KMEM_META_PRIMARY) {
2579 panic("Metadata (%p) for addr (%p) not primary", meta, (void *)addr);
2580 }
2581 if (meta->km_sizeclass != size_idx) {
2582 panic("Metadata's (%p) sizeclass (%u != %u) changed during deletion",
2583 meta, meta->km_sizeclass, size_idx);
2584 }
2585 panic("Double free detected: Slot (%u) in meta (%p) for addr %p marked free",
2586 slot_idx, meta, (void *)addr);
2587 }
2588
2589 __abortlike
2590 static void
2591 kmem_invalid_slot_for_addr(
2592 mach_vm_range_t slot,
2593 vm_map_offset_t start,
2594 vm_map_offset_t end)
2595 {
2596 panic("Invalid kmem ptr slot [%p:%p] for allocation [%p:%p]",
2597 (void *)slot->min_address, (void *)slot->max_address,
2598 (void *)start, (void *)end);
2599 }
2600
2601 void
2602 kmem_validate_slot(
2603 vm_map_offset_t addr,
2604 struct kmem_page_meta *meta,
2605 uint32_t size_idx,
2606 uint32_t slot_idx)
2607 {
2608 if ((meta->km_page_marker != KMEM_META_PRIMARY) ||
2609 (meta->km_sizeclass != size_idx) ||
2610 ((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) != 0)) {
2611 kmem_validate_slot_panic(addr, meta, slot_idx, size_idx);
2612 }
2613 }
2614
2615 static void
2616 kmem_validate_slot_initial(
2617 mach_vm_range_t slot,
2618 vm_map_offset_t start,
2619 vm_map_offset_t end,
2620 struct kmem_page_meta *meta,
2621 uint32_t size_idx,
2622 uint32_t slot_idx)
2623 {
2624 if ((slot->min_address == 0) || (slot->max_address == 0) ||
2625 (start < slot->min_address) || (start >= slot->max_address) ||
2626 (end > slot->max_address)) {
2627 kmem_invalid_slot_for_addr(slot, start, end);
2628 }
2629
2630 kmem_validate_slot(start, meta, size_idx, slot_idx);
2631 }
2632
2633 uint32_t
2634 kmem_addr_get_slot_idx(
2635 vm_map_offset_t start,
2636 vm_map_offset_t end,
2637 vm_map_range_id_t range_id,
2638 struct kmem_page_meta **meta,
2639 uint32_t *size_idx,
2640 mach_vm_range_t slot)
2641 {
2642 vm_map_offset_t chunk_start;
2643 vm_map_size_t slot_size;
2644 uint32_t slot_idx;
2645
2646 *meta = kmem_addr_to_meta_start(start, range_id, &chunk_start);
2647 *size_idx = (*meta)->km_sizeclass;
2648 slot_size = kmem_get_size_from_idx(*size_idx);
2649 slot_idx = (start - chunk_start) / slot_size;
2650 slot->min_address = chunk_start + slot_idx * slot_size;
2651 slot->max_address = slot->min_address + slot_size;
2652
2653 kmem_validate_slot_initial(slot, start, end, *meta, *size_idx, slot_idx);
2654
2655 return slot_idx;
2656 }
2657
2658 static bool
2659 kmem_populate_needed(vm_offset_t from, vm_offset_t to)
2660 {
2661 #if KASAN
2662 #pragma unused(from, to)
2663 return true;
2664 #else
2665 vm_offset_t page_addr = trunc_page(from);
2666
2667 for (; page_addr < to; page_addr += PAGE_SIZE) {
2668 /*
2669 * This can race with another thread doing a populate on the same metadata
2670 * page, where we see an updated pmap but unmapped KASan shadow, causing a
2671 * fault in the shadow when we first access the metadata page. Avoid this
2672 * by always synchronizing on the kmem_meta_lock with KASan.
2673 */
2674 if (!pmap_find_phys(kernel_pmap, page_addr)) {
2675 return true;
2676 }
2677 }
2678
2679 return false;
2680 #endif /* !KASAN */
2681 }
2682
2683 static void
2684 kmem_populate_meta_locked(vm_offset_t from, vm_offset_t to)
2685 {
2686 vm_offset_t page_addr = trunc_page(from);
2687
2688 vm_map_unlock(kernel_map);
2689
2690 for (; page_addr < to; page_addr += PAGE_SIZE) {
2691 for (;;) {
2692 kern_return_t ret = KERN_SUCCESS;
2693
2694 /*
2695 * All updates to kmem metadata are done under the kmem_meta_lock
2696 */
2697 kmem_meta_lock();
2698 if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
2699 ret = kernel_memory_populate(page_addr,
2700 PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
2701 VM_KERN_MEMORY_OSFMK);
2702 }
2703 kmem_meta_unlock();
2704
2705 if (ret == KERN_SUCCESS) {
2706 break;
2707 }
2708
2709 /*
2710 * We can't block for pages under a global lock as it leads
2711 * to bad system deadlocks, so we pass KMA_NOPAGEWAIT and, if
2712 * the allocation failed, do the VM_PAGE_WAIT() outside of the lock.
2713 */
2714 VM_PAGE_WAIT();
2715 }
2716 }
2717
2718 vm_map_lock(kernel_map);
2719 }
2720
2721 __abortlike
2722 static void
2723 kmem_invalid_meta_panic(
2724 struct kmem_page_meta *meta,
2725 uint32_t slot_idx,
2726 struct kmem_sizeclass sizeclass)
2727 {
2728 uint32_t size_idx = kmem_get_idx_from_size(sizeclass.ks_size);
2729
2730 if (slot_idx >= sizeclass.ks_num_elem) {
2731 panic("Invalid slot idx %u [0:%u] for meta %p", slot_idx,
2732 sizeclass.ks_num_elem, meta);
2733 }
2734 if (meta->km_sizeclass != size_idx) {
2735 panic("Invalid size_idx (%u != %u) in meta %p", size_idx,
2736 meta->km_sizeclass, meta);
2737 }
2738 panic("page_marker %u not primary in meta %p", meta->km_page_marker, meta);
2739 }
2740
2741 __abortlike
2742 static void
2743 kmem_slot_has_entry_panic(
2744 vm_map_entry_t entry,
2745 vm_map_offset_t addr)
2746 {
2747 panic("Entry (%p) already exists for addr (%p) being returned",
2748 entry, (void *)addr);
2749 }
2750
2751 __abortlike
2752 static void
2753 kmem_slot_not_found(
2754 struct kmem_page_meta *meta,
2755 uint32_t slot_idx)
2756 {
2757 panic("%uth free slot not found for meta %p bitmap %u", slot_idx, meta,
2758 meta->km_bitmap);
2759 }
2760
2761 /*
2762 * Returns a 16bit random number between 0 and
2763 * upper_limit (inclusive)
2764 */
2765 __startup_func
2766 uint16_t
2767 kmem_get_random16(
2768 uint16_t upper_limit)
2769 {
2770 static uint64_t random_entropy;
2771 assert(upper_limit < UINT16_MAX);
2772 if (random_entropy == 0) {
2773 random_entropy = early_random();
2774 }
2775 uint32_t result = random_entropy & UINT32_MAX;
2776 random_entropy >>= 32;
2777 return (uint16_t)(result % (upper_limit + 1));
2778 }
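
/*
* Note: each call consumes 32 bits of the cached 64-bit early_random()
* value, and the modulo introduces a small bias; both are acceptable
* here since this only runs during startup.
*/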
2779
2780 static uint32_t
2781 kmem_get_nth_free_slot(
2782 struct kmem_page_meta *meta,
2783 uint32_t n,
2784 uint32_t bitmap)
2785 {
2786 uint32_t zeros_seen = 0, ones_seen = 0;
2787
2788 while (bitmap) {
2789 uint32_t count = __builtin_ctz(bitmap);
2790
2791 zeros_seen += count;
2792 bitmap >>= count;
2793 if (__probable(~bitmap)) {
2794 count = __builtin_ctz(~bitmap);
2795 } else {
2796 count = 32;
2797 }
2798 if (count + ones_seen > n) {
2799 return zeros_seen + n;
2800 }
2801 ones_seen += count;
2802 bitmap >>= count;
2803 }
2804
2805 kmem_slot_not_found(meta, n);
2806 }
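
/*
* Worked example: bitmap = 0b11001100 (set bits are free slots), n = 2.
* The first iteration skips 2 zeros (zeros_seen = 2) and sees a run of
* 2 ones; 2 + 0 > 2 is false, so ones_seen becomes 2. The second
* iteration skips 2 more zeros (zeros_seen = 4) and, since 2 + 2 > 2,
* returns zeros_seen + n = 6: the index of the 3rd set bit.
*/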
2807
2808
2809 static uint32_t
2810 kmem_get_next_slot(
2811 struct kmem_page_meta *meta,
2812 struct kmem_sizeclass sizeclass,
2813 uint32_t bitmap)
2814 {
2815 uint32_t num_slots = __builtin_popcount(bitmap);
2816 uint64_t slot_idx = 0;
2817
2818 assert(num_slots > 0);
2819 if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
2820 /*
2821 * Use early random prior to early boot, as the ks_rng_ctx requires
2822 * the corecrypto module to be set up before it is initialized and
2823 * used.
2824 *
2825 * num_slots can't be 0 as we take this path when we have more than
2826 * one slot left.
2827 */
2828 slot_idx = kmem_get_random16((uint16_t)num_slots - 1);
2829 } else {
2830 crypto_random_uniform(zpercpu_get(sizeclass.ks_rng_ctx), num_slots,
2831 &slot_idx);
2832 }
2833
2834 return kmem_get_nth_free_slot(meta, slot_idx, bitmap);
2835 }
2836
2837 /*
2838 * Returns an unallocated slot from the given metadata
2839 */
2840 static vm_map_offset_t
2841 kmem_get_addr_from_meta(
2842 struct kmem_page_meta *meta,
2843 vm_map_range_id_t range_id,
2844 struct kmem_sizeclass sizeclass,
2845 vm_map_entry_t *entry)
2846 {
2847 vm_map_offset_t addr;
2848 vm_map_size_t size = sizeclass.ks_size;
2849 uint32_t size_idx = kmem_get_idx_from_size(size);
2850 uint64_t meta_idx = meta - kmem_meta_base[range_id];
2851 mach_vm_offset_t range_start = kmem_ranges[range_id].min_address;
2852 uint32_t slot_bit;
2853 uint32_t slot_idx = kmem_get_next_slot(meta, sizeclass, meta->km_bitmap);
2854
2855 if ((slot_idx >= sizeclass.ks_num_elem) ||
2856 (meta->km_sizeclass != size_idx) ||
2857 (meta->km_page_marker != KMEM_META_PRIMARY)) {
2858 kmem_invalid_meta_panic(meta, slot_idx, sizeclass);
2859 }
2860
2861 slot_bit = kmem_slot_idx_to_bit(slot_idx, size_idx);
2862 meta->km_bitmap &= ~slot_bit;
2863
2864 addr = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN) + (slot_idx * size);
2865 assert(kmem_range_contains_fully(range_id, addr, size));
2866 if (vm_map_lookup_entry(kernel_map, addr, entry)) {
2867 kmem_slot_has_entry_panic(*entry, addr);
2868 }
2869 if ((*entry != vm_map_to_entry(kernel_map)) &&
2870 ((*entry)->vme_next != vm_map_to_entry(kernel_map)) &&
2871 ((*entry)->vme_next->vme_start < (addr + size))) {
2872 kmem_slot_has_entry_panic(*entry, addr);
2873 }
2874 return addr;
2875 }
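
/*
* The returned address decomposes as: range base, plus the chunk's byte
* offset (meta_idx * KMEM_CHUNK_SIZE_MIN), plus the slot's byte offset
* within the chunk (slot_idx * sizeclass size). E.g. slot_idx = 3 of an
* 8k sizeclass in the chunk at meta_idx = 5 sits at
* range_start + 5 * KMEM_CHUNK_SIZE_MIN + 24k.
*/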
2876
2877 __abortlike
2878 static void
2879 kmem_range_out_of_va(
2880 kmem_range_id_t range_id,
2881 uint32_t num_chunks)
2882 {
2883 panic("No more VA to allocate %u chunks in range %u", num_chunks, range_id);
2884 }
2885
2886 static void
2887 kmem_init_allocated_chunk(
2888 struct kmem_page_meta *meta,
2889 struct kmem_sizeclass sizeclass,
2890 uint32_t size_idx)
2891 {
2892 uint32_t meta_num = sizeclass.ks_num_chunk;
2893 uint32_t num_elem = sizeclass.ks_num_elem;
2894
2895 meta->km_bitmap = (1ull << num_elem) - 1;
2896 meta->km_chunk_len = (uint16_t)meta_num;
2897 assert(LIST_NEXT(meta, km_link) == NULL);
2898 assert(meta->km_link.le_prev == NULL);
2899 meta->km_sizeclass = (uint8_t)size_idx;
2900 meta->km_page_marker = KMEM_META_PRIMARY;
2901 meta++;
2902 for (uint32_t i = 1; i < meta_num; i++) {
2903 meta->km_page_idx = (uint16_t)i;
2904 meta->km_sizeclass = (uint8_t)size_idx;
2905 meta->km_page_marker = 0;
2906 meta->km_bitmap = 0;
2907 meta++;
2908 }
2909 }
2910
2911 static uint32_t
2912 kmem_get_additional_meta(
2913 struct kmem_page_meta *meta,
2914 uint32_t meta_req,
2915 bool from_right,
2916 struct kmem_page_meta **adj_free_meta)
2917 {
2918 struct kmem_page_meta *meta_prev = from_right ? meta : (meta - 1);
2919
2920 if (meta_prev->km_page_marker == KMEM_META_FREE) {
2921 uint32_t chunk_len = kmem_get_free_chunk_len(meta_prev);
2922
2923 *adj_free_meta = from_right ? meta_prev : (meta_prev - chunk_len + 1);
2924 meta_req -= chunk_len;
2925 } else {
2926 *adj_free_meta = NULL;
2927 }
2928
2929 return meta_req;
2930 }
2931
2932
2933 static struct kmem_page_meta *
2934 kmem_get_new_chunk(
2935 vm_map_range_id_t range_id,
2936 bool from_right,
2937 uint32_t size_idx)
2938 {
2939 struct kmem_sizeclass sizeclass = kmem_size_array[size_idx];
2940 struct kmem_page_meta *start, *end, *meta_update;
2941 struct kmem_page_meta *adj_free_meta = NULL;
2942 uint32_t meta_req = sizeclass.ks_num_chunk;
2943
2944 for (;;) {
2945 struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
2946 struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
2947 struct kmem_page_meta *meta;
2948 vm_offset_t start_addr, end_addr;
2949 uint32_t meta_num;
2950
2951 meta = from_right ? metab : metaf;
2952 meta_num = kmem_get_additional_meta(meta, meta_req, from_right,
2953 &adj_free_meta);
2954
2955 if (metaf + meta_num >= metab) {
2956 kmem_range_out_of_va(range_id, meta_num);
2957 }
2958
2959 start = from_right ? (metab - meta_num) : metaf;
2960 end = from_right ? metab : (metaf + meta_num);
2961
2962 start_addr = (vm_offset_t)start;
2963 end_addr = (vm_offset_t)end;
2964
2965 /*
2966 * If the new high watermark stays on the same page,
2967 * no need to populate and drop the lock.
2968 */
2969 if (!page_aligned(from_right ? end_addr : start_addr) &&
2970 trunc_page(start_addr) == trunc_page(end_addr - 1)) {
2971 break;
2972 }
2973 if (!kmem_populate_needed(start_addr, end_addr)) {
2974 break;
2975 }
2976
2977 kmem_populate_meta_locked(start_addr, end_addr);
2978
2979 /*
2980 * Since we dropped the lock, reassess whether the conditions still hold:
2981 * - the HWM we are changing must not have moved
2982 * - the other HWM must not intersect with ours
2983 * - in case of coalescing, the adjacent free meta must still
2984 * be free and of the same size.
2985 *
2986 * If we failed to grow, reevaluate whether freelists have
2987 * entries now by returning NULL.
2988 */
2989 metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
2990 metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
2991 if (meta != (from_right ? metab : metaf)) {
2992 return NULL;
2993 }
2994 if (metaf + meta_num >= metab) {
2995 kmem_range_out_of_va(range_id, meta_num);
2996 }
2997 if (adj_free_meta) {
2998 if (adj_free_meta->km_page_marker != KMEM_META_FREE ||
2999 kmem_get_free_chunk_len(adj_free_meta) !=
3000 meta_req - meta_num) {
3001 return NULL;
3002 }
3003 }
3004
3005 break;
3006 }
3007
3008 /*
3009 * If there is an adjacent free chunk, remove it from its free list
3010 */
3011 if (adj_free_meta) {
3012 LIST_REMOVE(adj_free_meta, km_link);
3013 LIST_NEXT(adj_free_meta, km_link) = NULL;
3014 adj_free_meta->km_link.le_prev = NULL;
3015 }
3016
3017 /*
3018 * Update hwm
3019 */
3020 meta_update = from_right ? start : end;
3021 kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta_update;
3022
3023 /*
3024 * Initialize metadata
3025 */
3026 start = from_right ? start : (end - meta_req);
3027 kmem_init_allocated_chunk(start, sizeclass, size_idx);
3028
3029 return start;
3030 }
3031
3032 static void
3033 kmem_requeue_meta(
3034 struct kmem_page_meta *meta,
3035 struct kmem_list_head *head)
3036 {
3037 LIST_REMOVE(meta, km_link);
3038 LIST_INSERT_HEAD(head, meta, km_link);
3039 }
3040
3041 /*
3042 * Return the corresponding sizeclass to stash free chunks in
3043 */
3044 __abortlike
3045 static void
3046 kmem_invalid_chunk_num(uint32_t chunks)
3047 {
3048 panic("Invalid number of chunks %u\n", chunks);
3049 }
3050
3051 static uint32_t
3052 kmem_get_size_idx_for_chunks(uint32_t chunks)
3053 {
3054 for (uint32_t i = KMEM_NUM_SIZECLASS - 1; i > 0; i--) {
3055 if (chunks >= kmem_size_array[i].ks_num_chunk) {
3056 return i;
3057 }
3058 }
3059 kmem_invalid_chunk_num(chunks);
3060 }
3061
3062 static void
3063 kmem_clear_meta_range(struct kmem_page_meta *meta, uint32_t count)
3064 {
3065 bzero(meta, count * sizeof(struct kmem_page_meta));
3066 }
3067
3068 static void
3069 kmem_check_meta_range_is_clear(struct kmem_page_meta *meta, uint32_t count)
3070 {
3071 #if MACH_ASSERT
3072 size_t size = count * sizeof(struct kmem_page_meta);
3073
3074 assert(memcmp_zero_ptr_aligned(meta, size) == 0);
3075 #else
3076 #pragma unused(meta, count)
3077 #endif
3078 }
3079
3080 /*!
3081 * @function kmem_init_free_chunk()
3082 *
3083 * @discussion
3084 * This function prepares a range of chunks to be put on a free list.
3085 * The first and last metadata might be dirty, but the "inner" ones
3086 * must be zero filled by the caller prior to calling this function.
3087 */
3088 static void
3089 kmem_init_free_chunk(
3090 struct kmem_page_meta *meta,
3091 uint32_t num_chunks,
3092 uint32_t front)
3093 {
3094 struct kmem_sizeclass *sizeclass;
3095 uint32_t size_idx = kmem_get_size_idx_for_chunks(num_chunks);
3096
3097 if (num_chunks > 2) {
3098 kmem_check_meta_range_is_clear(meta + 1, num_chunks - 2);
3099 }
3100
3101 meta[0] = (struct kmem_page_meta){
3102 .km_free_chunks = num_chunks,
3103 .km_page_marker = KMEM_META_FREE,
3104 .km_sizeclass = (uint8_t)size_idx,
3105 };
3106 if (num_chunks > 1) {
3107 meta[num_chunks - 1] = (struct kmem_page_meta){
3108 .km_free_chunks = num_chunks,
3109 .km_page_marker = KMEM_META_FREE,
3110 .km_sizeclass = (uint8_t)size_idx,
3111 };
3112 }
3113
3114 sizeclass = &kmem_size_array[size_idx];
3115 LIST_INSERT_HEAD(&sizeclass->ks_allfree_head[front], meta, km_link);
3116 }
3117
3118 static struct kmem_page_meta *
3119 kmem_get_free_chunk_from_list(
3120 struct kmem_sizeclass *org_sizeclass,
3121 uint32_t size_idx,
3122 uint32_t front)
3123 {
3124 struct kmem_sizeclass *sizeclass;
3125 uint32_t num_chunks = org_sizeclass->ks_num_chunk;
3126 struct kmem_page_meta *meta;
3127 uint32_t idx = size_idx;
3128
3129 while (idx < KMEM_NUM_SIZECLASS) {
3130 sizeclass = &kmem_size_array[idx];
3131 meta = LIST_FIRST(&sizeclass->ks_allfree_head[front]);
3132 if (meta) {
3133 break;
3134 }
3135 idx++;
3136 }
3137
3138 /*
3139 * Trim if larger in size
3140 */
3141 if (meta) {
3142 uint32_t num_chunks_free = kmem_get_free_chunk_len(meta);
3143
3144 assert(meta->km_page_marker == KMEM_META_FREE);
3145 LIST_REMOVE(meta, km_link);
3146 LIST_NEXT(meta, km_link) = NULL;
3147 meta->km_link.le_prev = NULL;
3148 if (num_chunks_free > num_chunks) {
3149 num_chunks_free -= num_chunks;
3150 kmem_init_free_chunk(meta + num_chunks, num_chunks_free, front);
3151 }
3152
3153 kmem_init_allocated_chunk(meta, *org_sizeclass, size_idx);
3154 }
3155
3156 return meta;
3157 }
3158
3159 kern_return_t
3160 kmem_locate_space(
3161 vm_map_size_t size,
3162 vm_map_range_id_t range_id,
3163 bool from_right,
3164 vm_map_offset_t *start_inout,
3165 vm_map_entry_t *entry_out)
3166 {
3167 vm_map_entry_t entry;
3168 uint32_t size_idx = kmem_get_idx_from_size(size);
3169 uint32_t front = kmem_get_front(range_id, from_right);
3170 struct kmem_sizeclass *sizeclass = &kmem_size_array[size_idx];
3171 struct kmem_page_meta *meta;
3172
3173 assert(size <= sizeclass->ks_size);
3174 again:
3175 if ((meta = LIST_FIRST(&sizeclass->ks_partial_head[front])) != NULL) {
3176 *start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3177 /*
3178 * Requeue to full if necessary
3179 */
3180 assert(meta->km_page_marker == KMEM_META_PRIMARY);
3181 if (__builtin_popcount(meta->km_bitmap) == KMEM_NUM_GUARDS) {
3182 kmem_requeue_meta(meta, &sizeclass->ks_full_head[front]);
3183 }
3184 } else if ((meta = kmem_get_free_chunk_from_list(sizeclass, size_idx,
3185 front)) != NULL) {
3186 *start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3187 /*
3188 * Queue to partial
3189 */
3190 assert(meta->km_page_marker == KMEM_META_PRIMARY);
3191 assert(__builtin_popcount(meta->km_bitmap) > KMEM_NUM_GUARDS);
3192 LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link);
3193 } else {
3194 meta = kmem_get_new_chunk(range_id, from_right, size_idx);
3195 if (meta == NULL) {
3196 goto again;
3197 }
3198 *start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3199 assert(meta->km_page_marker == KMEM_META_PRIMARY);
3200 LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link);
3201 }
3202
3203 if (entry_out) {
3204 *entry_out = entry;
3205 }
3206
3207 return KERN_SUCCESS;
3208 }
3209
3210 /*
3211 * Determine whether the given metadata was allocated from the right
3212 */
3213 static bool
3214 kmem_meta_is_from_right(
3215 kmem_range_id_t range_id,
3216 struct kmem_page_meta *meta)
3217 {
3218 struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3219 #if DEBUG || DEVELOPMENT
3220 struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3221 #endif /* DEBUG || DEVELOPMENT */
3222 struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
3223 struct kmem_page_meta *meta_end;
3224
3225 meta_end = (struct kmem_page_meta *)kmem_meta_range[range_id].max_address;
3226
3227 if ((meta >= meta_base) && (meta < metaf)) {
3228 return false;
3229 }
3230
3231 assert(meta >= metab && meta < meta_end);
3232 return true;
3233 }
3234
3235 static void
3236 kmem_free_chunk(
3237 kmem_range_id_t range_id,
3238 struct kmem_page_meta *meta,
3239 bool from_right)
3240 {
3241 struct kmem_page_meta *meta_coalesce = meta - 1;
3242 struct kmem_page_meta *meta_start = meta;
3243 uint32_t num_chunks = kmem_get_chunk_len(meta);
3244 uint32_t add_chunks;
3245 struct kmem_page_meta *meta_end = meta + num_chunks;
3246 struct kmem_page_meta *meta_hwm_l, *meta_hwm_r;
3247 uint32_t front = kmem_get_front(range_id, from_right);
3248
3249 meta_hwm_l = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3250 meta_hwm_r = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3251
3252 LIST_REMOVE(meta, km_link);
3253 kmem_clear_meta_range(meta, num_chunks);
3254
3255 /*
3256 * Coalesce left
3257 */
3258 if (((from_right && (meta_coalesce >= meta_hwm_r)) || !from_right) &&
3259 (meta_coalesce->km_page_marker == KMEM_META_FREE)) {
3260 meta_start = meta_coalesce - kmem_get_free_chunk_len(meta_coalesce) + 1;
3261 add_chunks = kmem_get_free_chunk_len(meta_start);
3262 num_chunks += add_chunks;
3263 LIST_REMOVE(meta_start, km_link);
3264 kmem_clear_meta_range(meta_start + add_chunks - 1, 1);
3265 }
3266
3267 /*
3268 * Coalesce right
3269 */
3270 if (((!from_right && (meta_end < meta_hwm_l)) || from_right) &&
3271 (meta_end->km_page_marker == KMEM_META_FREE)) {
3272 add_chunks = kmem_get_free_chunk_len(meta_end);
3273 LIST_REMOVE(meta_end, km_link);
3274 kmem_clear_meta_range(meta_end, 1);
3275 meta_end = meta_end + add_chunks;
3276 num_chunks += add_chunks;
3277 }
3278
3279 kmem_init_free_chunk(meta_start, num_chunks, front);
3280 }
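
/*
* Coalescing sketch: freeing a 2-chunk run whose left neighbor is a
* 3-chunk free run and whose right neighbor is a 1-chunk free run
* unlinks both neighbors and re-inserts a single 6-chunk
* KMEM_META_FREE run, with the chunk count recorded in its first and
* last metadata entries (see kmem_init_free_chunk() above), subject to
* the high watermark bounds checked in the two conditions.
*/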
3281
3282 static void
3283 kmem_free_slot(
3284 kmem_range_id_t range_id,
3285 mach_vm_range_t slot)
3286 {
3287 struct kmem_page_meta *meta;
3288 vm_map_offset_t chunk_start;
3289 uint32_t size_idx, chunk_elem, slot_idx, num_elem;
3290 struct kmem_sizeclass *sizeclass;
3291 vm_map_size_t slot_size;
3292
3293 meta = kmem_addr_to_meta_start(slot->min_address, range_id, &chunk_start);
3294 size_idx = meta->km_sizeclass;
3295 slot_size = kmem_get_size_from_idx(size_idx);
3296 slot_idx = (slot->min_address - chunk_start) / slot_size;
3297 assert((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) == 0);
3298 meta->km_bitmap |= kmem_slot_idx_to_bit(slot_idx, size_idx);
3299
3300 sizeclass = &kmem_size_array[size_idx];
3301 chunk_elem = sizeclass->ks_num_elem;
3302 num_elem = __builtin_popcount(meta->km_bitmap);
3303
3304 if (num_elem == chunk_elem) {
3305 /*
3306 * If the entire chunk is empty, add it to the all-free list
3307 */
3308 bool from_right = kmem_meta_is_from_right(range_id, meta);
3309
3310 kmem_free_chunk(range_id, meta, from_right);
3311 } else if (num_elem == KMEM_NUM_GUARDS + 1) {
3312 /*
3313 * If we freed a slot in a full chunk, move it to the partial list
3314 */
3315 uint32_t front = kmem_get_front(range_id,
3316 kmem_meta_is_from_right(range_id, meta));
3317
3318 kmem_requeue_meta(meta, &sizeclass->ks_partial_head[front]);
3319 }
3320 }
3321
3322 void
3323 kmem_free_space(
3324 vm_map_offset_t start,
3325 vm_map_offset_t end,
3326 vm_map_range_id_t range_id,
3327 mach_vm_range_t slot)
3328 {
3329 bool entry_present = false;
3330 vm_map_entry_t prev_entry;
3331 vm_map_entry_t next_entry;
3332
3333 if ((slot->min_address == start) && (slot->max_address == end)) {
3334 /*
3335 * Entire slot is being freed at once
3336 */
3337 return kmem_free_slot(range_id, slot);
3338 }
3339
3340 entry_present = vm_map_lookup_entry(kernel_map, start, &prev_entry);
3341 assert(!entry_present);
3342 next_entry = prev_entry->vme_next;
3343
3344 if (((prev_entry == vm_map_to_entry(kernel_map) ||
3345 prev_entry->vme_end <= slot->min_address)) &&
3346 (next_entry == vm_map_to_entry(kernel_map) ||
3347 (next_entry->vme_start >= slot->max_address))) {
3348 /*
3349 * Free entire slot
3350 */
3351 kmem_free_slot(range_id, slot);
3352 }
3353 }
3354
3355 #pragma mark kmem init
3356
3357 /*
3358 * The default percentage of memory that can be mlocked is scaled based on the total
3359 * amount of memory in the system. These percentages are calculated
3360 * offline and stored in this table. We index this table by
3361 * log2(max_mem) - VM_USER_WIREABLE_MIN_CONFIG. We clamp this index in the range
3362 * [0, sizeof(wire_limit_percents) / sizeof(vm_map_size_t))
3363 *
3364 * Note that these values were picked for the Mac.
3365 * If we ever have very large memory config arm devices, we may want to revisit
3366 * since the kernel overhead is smaller there due to the larger page size.
3367 */
3368
3369 /* Start scaling iff we're managing > 2^32 = 4GB of RAM. */
3370 #define VM_USER_WIREABLE_MIN_CONFIG 32
3371 #if CONFIG_JETSAM
3372 /* Systems with jetsam can wire a bit more because the system can relieve wired
3373 * pressure.
3374 */
3375 static vm_map_size_t wire_limit_percents[] =
3376 { 80, 80, 80, 80, 82, 85, 88, 91, 94, 97};
3377 #else
3378 static vm_map_size_t wire_limit_percents[] =
3379 { 70, 73, 76, 79, 82, 85, 88, 91, 94, 97};
3380 #endif /* CONFIG_JETSAM */
3381
3382 /*
3383 * Sets the default global user wire limit which limits the amount of
3384 * memory that can be locked via mlock() based on the above algorithm.
3385 * This can be overridden via a sysctl.
3386 */
3387 static void
3388 kmem_set_user_wire_limits(void)
3389 {
3390 uint64_t available_mem_log;
3391 uint64_t max_wire_percent;
3392 size_t wire_limit_percents_length = sizeof(wire_limit_percents) /
3393 sizeof(vm_map_size_t);
3394 vm_map_size_t limit;
3395 uint64_t config_memsize = max_mem;
3396 #if defined(XNU_TARGET_OS_OSX)
3397 config_memsize = max_mem_actual;
3398 #endif /* defined(XNU_TARGET_OS_OSX) */
3399
3400 available_mem_log = bit_floor(config_memsize);
3401
3402 if (available_mem_log < VM_USER_WIREABLE_MIN_CONFIG) {
3403 available_mem_log = 0;
3404 } else {
3405 available_mem_log -= VM_USER_WIREABLE_MIN_CONFIG;
3406 }
3407 if (available_mem_log >= wire_limit_percents_length) {
3408 available_mem_log = wire_limit_percents_length - 1;
3409 }
3410 max_wire_percent = wire_limit_percents[available_mem_log];
3411
3412 limit = config_memsize * max_wire_percent / 100;
3413 /* Cap the number of non-lockable bytes at VM_NOT_USER_WIREABLE_MAX */
3414 if (config_memsize - limit > VM_NOT_USER_WIREABLE_MAX) {
3415 limit = config_memsize - VM_NOT_USER_WIREABLE_MAX;
3416 }
3417
3418 vm_global_user_wire_limit = limit;
3419 /* the default per task limit is the same as the global limit */
3420 vm_per_task_user_wire_limit = limit;
3421 vm_add_wire_count_over_global_limit = 0;
3422 vm_add_wire_count_over_user_limit = 0;
3423 }
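
/*
* Worked example (assuming bit_floor() yields floor(log2(x)), as the
* table comment above implies): on a 16GB (2^34) non-jetsam config the
* index is 34 - VM_USER_WIREABLE_MIN_CONFIG = 2, selecting 76%, so
* limit ~= 12.16GB; the limit is then raised if the remaining ~3.84GB
* would exceed VM_NOT_USER_WIREABLE_MAX.
*/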
3424
3425 #define KMEM_MAX_CLAIMS 50
3426 __startup_data
3427 struct kmem_range_startup_spec kmem_claims[KMEM_MAX_CLAIMS] = {};
3428 __startup_data
3429 uint32_t kmem_claim_count = 0;
3430
3431 __startup_func
3432 void
3433 kmem_range_startup_init(
3434 struct kmem_range_startup_spec *sp)
3435 {
3436 assert(kmem_claim_count < KMEM_MAX_CLAIMS - KMEM_RANGE_COUNT);
3437 if (sp->kc_calculate_sz) {
3438 sp->kc_size = (sp->kc_calculate_sz)();
3439 }
3440 if (sp->kc_size) {
3441 kmem_claims[kmem_claim_count] = *sp;
3442 kmem_claim_count++;
3443 }
3444 }
3445
3446 static vm_offset_t
3447 kmem_fuzz_start(void)
3448 {
3449 vm_offset_t kmapoff_kaddr = 0;
3450 uint32_t kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */
3451 vm_map_size_t kmapoff_size = ptoa(kmapoff_pgcnt);
3452
3453 kmem_alloc(kernel_map, &kmapoff_kaddr, kmapoff_size,
3454 KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY,
3455 VM_KERN_MEMORY_OSFMK);
3456 return kmapoff_kaddr + kmapoff_size;
3457 }
3458
3459 /*
3460 * Generate a randomly shuffled array of indices from 0 to count - 1
3461 */
3462 __startup_func
3463 void
3464 kmem_shuffle(
3465 uint16_t *shuffle_buf,
3466 uint16_t count)
3467 {
3468 for (uint16_t i = 0; i < count; i++) {
3469 uint16_t j = kmem_get_random16(i);
3470 if (j != i) {
3471 shuffle_buf[i] = shuffle_buf[j];
3472 }
3473 shuffle_buf[j] = i;
3474 }
3475 }
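
/*
* This is the "inside-out" Fisher-Yates shuffle: e.g. for count = 3,
* i = 0 writes shuffle_buf[0] = 0; i = 1 picks j in [0, 1], moving
* shuffle_buf[j] into slot 1 if j != 1 before writing shuffle_buf[j] = 1;
* i = 2 does the same over [0, 2]. Each permutation of {0, 1, 2} is
* equally likely, assuming kmem_get_random16() is uniform.
*/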
3476
3477 __startup_func
3478 static void
3479 kmem_shuffle_claims(void)
3480 {
3481 uint16_t shuffle_buf[KMEM_MAX_CLAIMS] = {};
3482 uint16_t limit = (uint16_t)kmem_claim_count;
3483
3484 kmem_shuffle(&shuffle_buf[0], limit);
3485 for (uint16_t i = 0; i < limit; i++) {
3486 struct kmem_range_startup_spec tmp = kmem_claims[i];
3487 kmem_claims[i] = kmem_claims[shuffle_buf[i]];
3488 kmem_claims[shuffle_buf[i]] = tmp;
3489 }
3490 }
3491
3492 __startup_func
3493 static void
3494 kmem_readjust_ranges(
3495 uint32_t cur_idx)
3496 {
3497 assert(cur_idx != 0);
3498 uint32_t j = cur_idx - 1, random;
3499 struct kmem_range_startup_spec sp = kmem_claims[cur_idx];
3500 struct mach_vm_range *sp_range = sp.kc_range;
3501
3502 /*
3503 * Find max index where restriction is met
3504 */
3505 for (; j > 0; j--) {
3506 struct kmem_range_startup_spec spj = kmem_claims[j];
3507 vm_map_offset_t max_start = spj.kc_range->min_address;
3508 if (spj.kc_flags & KC_NO_MOVE) {
3509 panic("kmem_range_init: Can't scramble with multiple constraints");
3510 }
3511 if (max_start <= sp_range->min_address) {
3512 break;
3513 }
3514 }
3515
3516 /*
3517 * Pick a random index from 0 to max index and shift claims to the right
3518 * to make room for the restricted claim
3519 */
3520 random = kmem_get_random16((uint16_t)j);
3521 assert(random <= j);
3522
3523 sp_range->min_address = kmem_claims[random].kc_range->min_address;
3524 sp_range->max_address = sp_range->min_address + sp.kc_size;
3525
3526 for (j = cur_idx - 1; j >= random && j != UINT32_MAX; j--) {
3527 struct kmem_range_startup_spec spj = kmem_claims[j];
3528 struct mach_vm_range *range = spj.kc_range;
3529 range->min_address += sp.kc_size;
3530 range->max_address += sp.kc_size;
3531 kmem_claims[j + 1] = spj;
3532 }
3533
3534 sp.kc_flags = KC_NO_MOVE;
3535 kmem_claims[random] = sp;
3536 }
3537
3538 __startup_func
3539 static vm_map_size_t
3540 kmem_add_ptr_claims(void)
3541 {
3542 uint64_t kmem_meta_num, kmem_ptr_chunks;
3543 vm_map_size_t org_ptr_range_size = ptr_range_size;
3544
3545 ptr_range_size -= PAGE_SIZE;
3546 ptr_range_size *= KMEM_CHUNK_SIZE_MIN;
3547 ptr_range_size /= (KMEM_CHUNK_SIZE_MIN + sizeof(struct kmem_page_meta));
3548
3549 kmem_ptr_chunks = ptr_range_size / KMEM_CHUNK_SIZE_MIN;
3550 ptr_range_size = kmem_ptr_chunks * KMEM_CHUNK_SIZE_MIN;
3551
3552 kmem_meta_num = kmem_ptr_chunks + 2;
3553 kmem_meta_size = round_page(kmem_meta_num * sizeof(struct kmem_page_meta));
3554
3555 assert(kmem_meta_size + ptr_range_size <= org_ptr_range_size);
3556 /*
3557 * Add claims for kmem's ranges
3558 */
3559 for (uint32_t i = 0; i < kmem_ptr_ranges; i++) {
3560 struct kmem_range_startup_spec kmem_spec = {
3561 .kc_name = "kmem_ptr_range",
3562 .kc_range = &kmem_ranges[KMEM_RANGE_ID_PTR_0 + i],
3563 .kc_size = ptr_range_size,
3564 .kc_flags = KC_NO_ENTRY,
3565 };
3566 kmem_claims[kmem_claim_count++] = kmem_spec;
3567
3568 struct kmem_range_startup_spec kmem_meta_spec = {
3569 .kc_name = "kmem_ptr_range_meta",
3570 .kc_range = &kmem_meta_range[KMEM_RANGE_ID_PTR_0 + i],
3571 .kc_size = kmem_meta_size,
3572 .kc_flags = KC_NONE,
3573 };
3574 kmem_claims[kmem_claim_count++] = kmem_meta_spec;
3575 }
3576 return (org_ptr_range_size - ptr_range_size - kmem_meta_size) *
3577 kmem_ptr_ranges;
3578 }
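
/*
* The arithmetic in kmem_add_ptr_claims() solves, per range:
*   chunks * (KMEM_CHUNK_SIZE_MIN + sizeof(struct kmem_page_meta))
*       <= org_ptr_range_size - PAGE_SIZE
* i.e. the original claim is split between usable chunks and their
* per-chunk metadata, with a page of slack; the "+ 2" accounts for the
* KMEM_META_START sentinels installed at both ends by
* kmem_init_meta_front(). Leftover VA is credited back to the data range.
*/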
3579
3580 __startup_func
3581 static void
3582 kmem_add_extra_claims(void)
3583 {
3584 vm_map_size_t largest_free_size = 0, total_claims = 0;
3585
3586 vm_map_sizes(kernel_map, NULL, NULL, &largest_free_size);
3587 largest_free_size = trunc_page(largest_free_size);
3588
3589 /*
3590 * KASan and configs without *TRR need to have just one ptr range due to
3591 * resource constraints.
3592 */
3593 #if !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
3594 kmem_ptr_ranges = 1;
3595 #endif /* !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT) */
3596 /*
3597 * Determine size of data and pointer kmem_ranges
3598 */
3599 for (uint32_t i = 0; i < kmem_claim_count; i++) {
3600 total_claims += kmem_claims[i].kc_size;
3601 }
3602 assert((total_claims & PAGE_MASK) == 0);
3603 largest_free_size -= total_claims;
3604
3605 /*
3606 * Use half the total available VA for all pointer allocations (this
3607 * includes the kmem_sprayqtn range). Given that we have 4 total
3608 * ranges divide the available VA by 8.
3609 */
3610 ptr_range_size = largest_free_size / ((kmem_ptr_ranges + 1) * 2);
3611 sprayqtn_range_size = ptr_range_size;
3612
3613 if (sprayqtn_range_size > (sane_size / 2)) {
3614 sprayqtn_range_size = sane_size / 2;
3615 }
3616
3617 ptr_range_size = round_page(ptr_range_size);
3618 sprayqtn_range_size = round_page(sprayqtn_range_size);
3619
3620
3621 data_range_size = largest_free_size
3622 - (ptr_range_size * kmem_ptr_ranges)
3623 - sprayqtn_range_size;
3624
3625 /*
3626 * Add claims for kmem's ranges
3627 */
3628 data_range_size += kmem_add_ptr_claims();
3629 assert(data_range_size + sprayqtn_range_size +
3630 ((ptr_range_size + kmem_meta_size) * kmem_ptr_ranges) <=
3631 largest_free_size);
3632
3633 struct kmem_range_startup_spec kmem_spec_sprayqtn = {
3634 .kc_name = "kmem_sprayqtn_range",
3635 .kc_range = &kmem_ranges[KMEM_RANGE_ID_SPRAYQTN],
3636 .kc_size = sprayqtn_range_size,
3637 .kc_flags = KC_NO_ENTRY,
3638 };
3639 kmem_claims[kmem_claim_count++] = kmem_spec_sprayqtn;
3640
3641 struct kmem_range_startup_spec kmem_spec_data = {
3642 .kc_name = "kmem_data_range",
3643 .kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA],
3644 .kc_size = data_range_size,
3645 .kc_flags = KC_NO_ENTRY,
3646 };
3647 kmem_claims[kmem_claim_count++] = kmem_spec_data;
3648 }
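
/*
 * Worked example for the split above (numbers assumed for illustration):
 * with kmem_ptr_ranges == 3 and 400G of free VA left after the fixed
 * claims,
 *
 *	ptr_range_size      = 400G / ((3 + 1) * 2) = 50G per range
 *	sprayqtn_range_size = min(50G, sane_size / 2)
 *	data_range_size     = 400G - 3 * 50G - sprayqtn_range_size
 *
 * i.e. half of the free VA goes to the pointer ranges plus the spray
 * quarantine, and the data range absorbs the other half plus whatever
 * kmem_add_ptr_claims() returned unused.
 */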

__startup_func
static void
kmem_scramble_ranges(void)
{
	vm_map_offset_t start = 0;

	/*
	 * Initialize the KMEM_RANGE_ID_NONE range to use the entire map so
	 * that the vm can find the requested ranges.
	 */
	kmem_ranges[KMEM_RANGE_ID_NONE].min_address = MAX(kernel_map->min_offset,
	    VM_MAP_PAGE_SIZE(kernel_map));
	kmem_ranges[KMEM_RANGE_ID_NONE].max_address = kernel_map->max_offset;

	/*
	 * Allocate the g_kext_map prior to randomizing the remaining submaps,
	 * as this map is 2G in size and starts at the end of kernel_text on
	 * x86. It could otherwise overflow into the heap.
	 */
	kext_alloc_init();

	/*
	 * Eat a random amount of kernel_map to fuzz subsequent heap, zone and
	 * stack addresses. (With a 4K page and 9 bits of randomness, this
	 * eats about 2M of VA from the map.)
	 *
	 * Note that we always need to slide by at least one page because the
	 * VM pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a
	 * base do not admit this address to be part of any zone submap.
	 */
	start = kmem_fuzz_start();

	/*
	 * Add claims for ptr and data kmem_ranges
	 */
	kmem_add_extra_claims();

	/*
	 * Shuffle registered claims
	 */
	assert(kmem_claim_count < UINT16_MAX);
	kmem_shuffle_claims();

	/*
	 * Apply restrictions and determine the range for each claim
	 */
	for (uint32_t i = 0; i < kmem_claim_count; i++) {
		vm_map_offset_t end = 0;
		struct kmem_range_startup_spec sp = kmem_claims[i];
		struct mach_vm_range *sp_range = sp.kc_range;
		if (vm_map_locate_space(kernel_map, sp.kc_size, 0,
		    VM_MAP_KERNEL_FLAGS_ANYWHERE(), &start, NULL) != KERN_SUCCESS) {
			panic("kmem_range_init: vm_map_locate_space failing for claim %s",
			    sp.kc_name);
		}

		end = start + sp.kc_size;
		/*
		 * Re-adjust ranges if the restriction is not met
		 */
		if (sp_range->min_address && start > sp_range->min_address) {
			kmem_readjust_ranges(i);
		} else {
			sp_range->min_address = start;
			sp_range->max_address = end;
		}
		start = end;
	}

	/*
	 * We have settled on the ranges, now create temporary entries for the
	 * claims
	 */
	for (uint32_t i = 0; i < kmem_claim_count; i++) {
		struct kmem_range_startup_spec sp = kmem_claims[i];
		vm_map_entry_t entry = NULL;
		if (sp.kc_flags & KC_NO_ENTRY) {
			continue;
		}
		if (vm_map_find_space(kernel_map, sp.kc_range->min_address, sp.kc_size, 0,
		    VM_MAP_KERNEL_FLAGS_ANYWHERE(), &entry) != KERN_SUCCESS) {
			panic("kmem_range_init: vm_map_find_space failing for claim %s",
			    sp.kc_name);
		}
		vm_object_reference(kernel_object_default);
		VME_OBJECT_SET(entry, kernel_object_default, false, 0);
		VME_OFFSET_SET(entry, entry->vme_start);
		vm_map_unlock(kernel_map);
	}
	/*
	 * Now that we are done assigning all the ranges, reset
	 * kmem_ranges[KMEM_RANGE_ID_NONE]
	 */
	kmem_ranges[KMEM_RANGE_ID_NONE] = (struct mach_vm_range) {};

#if DEBUG || DEVELOPMENT
	for (uint32_t i = 0; i < kmem_claim_count; i++) {
		struct kmem_range_startup_spec sp = kmem_claims[i];

		printf("%-24s: %p - %p (%u%c)\n", sp.kc_name,
		    (void *)sp.kc_range->min_address,
		    (void *)sp.kc_range->max_address,
		    mach_vm_size_pretty(sp.kc_size),
		    mach_vm_size_unit(sp.kc_size));
	}
#endif /* DEBUG || DEVELOPMENT */
}

__startup_func
static void
kmem_range_init(void)
{
	vm_size_t range_adjustment;

	kmem_scramble_ranges();

	range_adjustment = sprayqtn_range_size >> 3;
	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address =
	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address + range_adjustment;
	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address =
	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address;

	range_adjustment = data_range_size >> 3;
	kmem_large_ranges[KMEM_RANGE_ID_DATA].min_address =
	    kmem_ranges[KMEM_RANGE_ID_DATA].min_address + range_adjustment;
	kmem_large_ranges[KMEM_RANGE_ID_DATA].max_address =
	    kmem_ranges[KMEM_RANGE_ID_DATA].max_address;

	pmap_init();
	kmem_metadata_init();
	kmem_sizeclass_init();

#if DEBUG || DEVELOPMENT
	for (kmem_range_id_t i = 1; i < KMEM_RANGE_COUNT; i++) {
		vm_size_t range_size = mach_vm_range_size(&kmem_large_ranges[i]);
		printf("kmem_large_ranges[%d] : %p - %p (%u%c)\n", i,
		    (void *)kmem_large_ranges[i].min_address,
		    (void *)kmem_large_ranges[i].max_address,
		    mach_vm_size_pretty(range_size),
		    mach_vm_size_unit(range_size));
	}
#endif
}
STARTUP(KMEM, STARTUP_RANK_THIRD, kmem_range_init);
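
/*
 * The "large" ranges computed in kmem_range_init() above are the back
 * 7/8 of the corresponding full ranges. For example (sizes assumed for
 * illustration), an 8G sprayqtn range at [S, S + 8G) yields
 * range_adjustment = 8G >> 3 = 1G and a large range of [S + 1G, S + 8G),
 * which keeps small allocations packed toward the front of each range.
 */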

#if DEBUG || DEVELOPMENT
__startup_func
static void
kmem_log_init(void)
{
	/*
	 * The log can only be created after the kmem subsystem is
	 * initialized, as btlog creation uses kmem
	 */
	kmem_outlier_log = btlog_create(BTLOG_LOG, KMEM_OUTLIER_LOG_SIZE, 0);
}
STARTUP(ZALLOC, STARTUP_RANK_FIRST, kmem_log_init);

kmem_gobj_stats
kmem_get_gobj_stats(void)
{
	kmem_gobj_stats stats = {};

	vm_map_lock(kernel_map);
	for (uint8_t i = 0; i < kmem_ptr_ranges; i++) {
		kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST + i;
		struct mach_vm_range range = kmem_ranges[range_id];
		struct kmem_page_meta *meta = kmem_meta_hwm[kmem_get_front(range_id, 0)];
		struct kmem_page_meta *meta_end;
		uint64_t meta_idx = meta - kmem_meta_base[range_id];
		vm_map_size_t used = 0, va = 0, meta_sz = 0, pte_sz = 0;
		vm_map_offset_t addr;
		vm_map_entry_t entry;

		/*
		 * Left front
		 */
		va = (meta_idx * KMEM_CHUNK_SIZE_MIN);
		meta_sz = round_page(meta_idx * sizeof(struct kmem_page_meta));

		/*
		 * Right front
		 */
		meta = kmem_meta_hwm[kmem_get_front(range_id, 1)];
		meta_end = kmem_addr_to_meta(range.max_address, range_id, &addr,
		    &meta_idx);
		meta_idx = meta_end - meta;
		meta_sz += round_page(meta_idx * sizeof(struct kmem_page_meta));
		va += (meta_idx * KMEM_CHUNK_SIZE_MIN);

		/*
		 * Compute the VA allocated in the entire range
		 */
		if (vm_map_lookup_entry(kernel_map, range.min_address, &entry) == false) {
			entry = entry->vme_next;
		}
		while (entry != vm_map_to_entry(kernel_map) &&
		    entry->vme_start < range.max_address) {
			used += (entry->vme_end - entry->vme_start);
			entry = entry->vme_next;
		}

		pte_sz = round_page(atop(va - used) * 8);

		stats.total_used += used;
		stats.total_va += va;
		stats.pte_sz += pte_sz;
		stats.meta_sz += meta_sz;
	}
	vm_map_unlock(kernel_map);

	return stats;
}
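
/*
 * Note on the pte_sz estimate above: atop(va - used) counts the pages
 * that have VA reserved in the range but no live allocation, and the
 * factor of 8 assumes one 64-bit PTE per page. It is a reporting
 * estimate, not an exact accounting of the pmap's actual table pages.
 */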

#endif /* DEBUG || DEVELOPMENT */

/*
 * kmem_init:
 *
 * Initialize the kernel's virtual memory map, taking
 * into account all memory allocated up to this time.
 */
__startup_func
void
kmem_init(
	vm_offset_t start,
	vm_offset_t end)
{
	vm_map_offset_t map_start;
	vm_map_offset_t map_end;

	map_start = vm_map_trunc_page(start,
	    VM_MAP_PAGE_MASK(kernel_map));
	map_end = vm_map_round_page(end,
	    VM_MAP_PAGE_MASK(kernel_map));

	vm_map_will_allocate_early_map(&kernel_map);
#if defined(__arm64__)
	kernel_map = vm_map_create_options(pmap_kernel(),
	    VM_MIN_KERNEL_AND_KEXT_ADDRESS,
	    VM_MAX_KERNEL_ADDRESS,
	    VM_MAP_CREATE_DEFAULT);
	/*
	 * Reserve virtual memory allocated up to this time.
	 */
	{
		unsigned int region_select = 0;
		vm_map_offset_t region_start;
		vm_map_size_t region_size;
		vm_map_offset_t map_addr;
		kern_return_t kr;

		while (pmap_virtual_region(region_select, &region_start, &region_size)) {
			map_addr = region_start;
			kr = vm_map_enter(kernel_map, &map_addr,
			    vm_map_round_page(region_size,
			    VM_MAP_PAGE_MASK(kernel_map)),
			    (vm_map_offset_t) 0,
			    VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(.vmkf_no_pmap_check = true),
			    VM_OBJECT_NULL,
			    (vm_object_offset_t) 0, FALSE, VM_PROT_NONE, VM_PROT_NONE,
			    VM_INHERIT_DEFAULT);

			if (kr != KERN_SUCCESS) {
				panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
				    (uint64_t) start, (uint64_t) end, (uint64_t) region_start,
				    (uint64_t) region_size, kr);
			}

			region_select++;
		}
	}
#else
	kernel_map = vm_map_create_options(pmap_kernel(),
	    VM_MIN_KERNEL_AND_KEXT_ADDRESS, map_end,
	    VM_MAP_CREATE_DEFAULT);
	/*
	 * Reserve virtual memory allocated up to this time.
	 */
	if (start != VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
		vm_map_offset_t map_addr;
		kern_return_t kr;

		map_addr = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
		kr = vm_map_enter(kernel_map,
		    &map_addr,
		    (vm_map_size_t)(map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
		    (vm_map_offset_t) 0,
		    VM_MAP_KERNEL_FLAGS_FIXED(.vmkf_no_pmap_check = true),
		    VM_OBJECT_NULL,
		    (vm_object_offset_t) 0, FALSE,
		    VM_PROT_NONE, VM_PROT_NONE,
		    VM_INHERIT_DEFAULT);

		if (kr != KERN_SUCCESS) {
			panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
			    (uint64_t) start, (uint64_t) end,
			    (uint64_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS,
			    (uint64_t) (map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
			    kr);
		}
	}
#endif

	kmem_set_user_wire_limits();
}


#pragma mark map copyio

/*
 * Routine:	copyinmap
 * Purpose:
 *	Like copyin, except that fromaddr is an address
 *	in the specified VM map. This implementation
 *	is incomplete; it handles the current user map
 *	and the kernel map/submaps.
 */
kern_return_t
copyinmap(
	vm_map_t map,
	vm_map_offset_t fromaddr,
	void *todata,
	vm_size_t length)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_t oldmap;

	if (vm_map_pmap(map) == pmap_kernel()) {
		/* assume a correct copy */
		memcpy(todata, CAST_DOWN(void *, fromaddr), length);
	} else if (current_map() == map) {
		if (copyin(fromaddr, todata, length) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_reference(map);
		oldmap = vm_map_switch(map);
		if (copyin(fromaddr, todata, length) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
		vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}
	return kr;
}
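
/*
 * Minimal usage sketch (names are illustrative, not from this file):
 * read a 64-bit value from another task's map, assuming `task_map`
 * holds a reference and `uaddr` is a user address within it:
 *
 *	uint64_t value;
 *	if (copyinmap(task_map, uaddr, &value, sizeof(value)) != KERN_SUCCESS) {
 *		// uaddr was not readable in task_map
 *	}
 */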

/*
 * Routine:	copyoutmap
 * Purpose:
 *	Like copyout, except that toaddr is an address
 *	in the specified VM map.
 */
kern_return_t
copyoutmap(
	vm_map_t map,
	void *fromdata,
	vm_map_address_t toaddr,
	vm_size_t length)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_t oldmap;

	if (vm_map_pmap(map) == pmap_kernel()) {
		/* assume a correct copy */
		memcpy(CAST_DOWN(void *, toaddr), fromdata, length);
	} else if (current_map() == map) {
		if (copyout(fromdata, toaddr, length) != 0) {
			ktriage_record(thread_tid(current_thread()),
			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
			    KDBG_TRIAGE_RESERVED,
			    KDBG_TRIAGE_VM_COPYOUTMAP_SAMEMAP_ERROR),
			    KERN_INVALID_ADDRESS /* arg */);
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_reference(map);
		oldmap = vm_map_switch(map);
		if (copyout(fromdata, toaddr, length) != 0) {
			ktriage_record(thread_tid(current_thread()),
			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
			    KDBG_TRIAGE_RESERVED,
			    KDBG_TRIAGE_VM_COPYOUTMAP_DIFFERENTMAP_ERROR),
			    KERN_INVALID_ADDRESS /* arg */);
			kr = KERN_INVALID_ADDRESS;
		}
		vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}
	return kr;
}

/*
 * Routine:	copyoutmap_atomic{32, 64}
 * Purpose:
 *	Like copyoutmap, except that the operation is atomic.
 *	Takes a value rather than a *fromdata pointer.
 */
kern_return_t
copyoutmap_atomic32(
	vm_map_t map,
	uint32_t value,
	vm_map_address_t toaddr)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_t oldmap;

	if (vm_map_pmap(map) == pmap_kernel()) {
		/* assume a correct toaddr */
		*(uint32_t *)toaddr = value;
	} else if (current_map() == map) {
		if (copyout_atomic32(value, toaddr) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_reference(map);
		oldmap = vm_map_switch(map);
		if (copyout_atomic32(value, toaddr) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
		vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}
	return kr;
}

kern_return_t
copyoutmap_atomic64(
	vm_map_t map,
	uint64_t value,
	vm_map_address_t toaddr)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_t oldmap;

	if (vm_map_pmap(map) == pmap_kernel()) {
		/* assume a correct toaddr */
		*(uint64_t *)toaddr = value;
	} else if (current_map() == map) {
		if (copyout_atomic64(value, toaddr) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
	} else {
		vm_map_reference(map);
		oldmap = vm_map_switch(map);
		if (copyout_atomic64(value, toaddr) != 0) {
			kr = KERN_INVALID_ADDRESS;
		}
		vm_map_switch(oldmap);
		vm_map_deallocate(map);
	}
	return kr;
}
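
/*
 * Usage sketch (illustrative names): publish a 32-bit flag into another
 * task's map without tearing:
 *
 *	if (copyoutmap_atomic32(task_map, 1u, flag_uaddr) != KERN_SUCCESS) {
 *		// flag_uaddr was not writable in task_map
 *	}
 *
 * The atomicity guarantee covers only the single 32- or 64-bit store;
 * toaddr is expected to be naturally aligned for the access width.
 */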


#pragma mark pointer obfuscation / packing

/*
 * The following two functions are to be used when exposing kernel
 * addresses to userspace via any of the various debug or info
 * facilities that exist. These are basically the same as
 * VM_KERNEL_ADDRPERM() and VM_KERNEL_UNSLIDE_OR_PERM() except they use
 * a different random seed and are exported to KEXTs.
 *
 * NOTE: USE THE MACRO VERSIONS OF THESE FUNCTIONS (in vm_param.h) FROM
 * WITHIN THE KERNEL
 */

vm_offset_t
vm_kernel_addrhash_internal(vm_offset_t addr, uint64_t salt)
{
	assert(salt != 0);

	if (addr == 0) {
		return 0ul;
	}

	if (VM_KERNEL_IS_SLID(addr)) {
		return VM_KERNEL_UNSLIDE(addr);
	}

	vm_offset_t sha_digest[SHA256_DIGEST_LENGTH / sizeof(vm_offset_t)];
	SHA256_CTX sha_ctx;

	SHA256_Init(&sha_ctx);
	SHA256_Update(&sha_ctx, &salt, sizeof(salt));
	SHA256_Update(&sha_ctx, &addr, sizeof(addr));
	SHA256_Final(sha_digest, &sha_ctx);

	return sha_digest[0];
}
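
/*
 * Behavior sketch: slid text/data pointers are unslid (stable across
 * boots), while other kernel addresses are replaced by the first
 * sizeof(vm_offset_t) bytes of SHA-256(salt || addr), so equal pointers
 * hash equal within a boot but reveal nothing about the underlying
 * address:
 *
 *	vm_kernel_addrhash_internal(0, salt)        == 0
 *	vm_kernel_addrhash_internal(heap_ptr, salt) == <opaque 64-bit hash>
 */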

__exported vm_offset_t
vm_kernel_addrhash_external(vm_offset_t addr);
vm_offset_t
vm_kernel_addrhash_external(vm_offset_t addr)
{
	return vm_kernel_addrhash_internal(addr, vm_kernel_addrhash_salt_ext);
}

void
vm_kernel_addrhide(
	vm_offset_t addr,
	vm_offset_t *hide_addr)
{
	*hide_addr = VM_KERNEL_ADDRHIDE(addr);
}

/*
 * vm_kernel_addrperm_external:
 * vm_kernel_unslide_or_perm_external:
 *
 * Use these macros when exposing an address to userspace that could come
 * from either kernel text/data *or* the heap.
 */
void
vm_kernel_addrperm_external(
	vm_offset_t addr,
	vm_offset_t *perm_addr)
{
	if (VM_KERNEL_IS_SLID(addr)) {
		*perm_addr = VM_KERNEL_UNSLIDE(addr);
	} else if (VM_KERNEL_ADDRESS(addr)) {
		*perm_addr = addr + vm_kernel_addrperm_ext;
	} else {
		*perm_addr = addr;
	}
}

void
vm_kernel_unslide_or_perm_external(
	vm_offset_t addr,
	vm_offset_t *up_addr)
{
	vm_kernel_addrperm_external(addr, up_addr);
}

void
vm_packing_pointer_invalid(vm_offset_t ptr, vm_packing_params_t params)
{
	if (ptr & ((1ul << params.vmpp_shift) - 1)) {
		panic("pointer %p can't be packed: low %d bits aren't 0",
		    (void *)ptr, params.vmpp_shift);
	} else if (ptr <= params.vmpp_base) {
		panic("pointer %p can't be packed: below base %p",
		    (void *)ptr, (void *)params.vmpp_base);
	} else {
		panic("pointer %p can't be packed: maximum encodable pointer is %p",
		    (void *)ptr, (void *)vm_packing_max_packable(params));
	}
}

void
vm_packing_verify_range(
	const char *subsystem,
	vm_offset_t min_address,
	vm_offset_t max_address,
	vm_packing_params_t params)
{
	if (min_address > max_address) {
		panic("%s: %s range invalid min:%p > max:%p",
		    __func__, subsystem, (void *)min_address, (void *)max_address);
	}

	if (!params.vmpp_base_relative) {
		return;
	}

	if (min_address <= params.vmpp_base) {
		panic("%s: %s range invalid min:%p <= base:%p",
		    __func__, subsystem, (void *)min_address, (void *)params.vmpp_base);
	}

	if (max_address > vm_packing_max_packable(params)) {
		panic("%s: %s range invalid max:%p > max packable:%p",
		    __func__, subsystem, (void *)max_address,
		    (void *)vm_packing_max_packable(params));
	}
}
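
/*
 * Worked example (parameters invented for illustration): with a
 * base-relative scheme, vmpp_base = 0xffffff8000000000 and
 * vmpp_shift = 6, a packable pointer is stored roughly as
 * (ptr - base) >> 6, so it must be 64-byte aligned, strictly above
 * vmpp_base, and no greater than vm_packing_max_packable(params);
 * these are exactly the three failure cases that
 * vm_packing_pointer_invalid() distinguishes above.
 */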

#pragma mark tests
#if DEBUG || DEVELOPMENT
#include <sys/errno.h>

static void
kmem_test_for_entry(
	vm_map_t map,
	vm_offset_t addr,
	void (^block)(vm_map_entry_t))
{
	vm_map_entry_t entry;

	vm_map_lock(map);
	block(vm_map_lookup_entry(map, addr, &entry) ? entry : NULL);
	vm_map_unlock(map);
}

#define kmem_test_assert_map(map, pg, entries) ({ \
	assert3u((map)->size, ==, ptoa(pg)); \
	assert3u((map)->hdr.nentries, ==, entries); \
})

static bool
can_write_at(vm_offset_t offs, uint32_t page)
{
	static const int zero;

	return verify_write(&zero, (void *)(offs + ptoa(page) + 128), 1) == 0;
}

#define assert_writeable(offs, page) \
	assertf(can_write_at(offs, page), \
	    "can write at %p + ptoa(%d)", (void *)offs, page)

#define assert_faults(offs, page) \
	assertf(!can_write_at(offs, page), \
	    "can write at %p + ptoa(%d)", (void *)offs, page)

#define peek(offs, page) \
	(*(uint32_t *)((offs) + ptoa(page)))

#define poke(offs, page, v) \
	(*(uint32_t *)((offs) + ptoa(page)) = (v))

__attribute__((noinline))
static void
kmem_alloc_basic_test(vm_map_t map)
{
	kmem_guard_t guard = {
		.kmg_tag = VM_KERN_MEMORY_DIAG,
	};
	vm_offset_t addr;

	/*
	 * Test wired basics:
	 * - KMA_KOBJECT
	 * - KMA_GUARD_FIRST, KMA_GUARD_LAST
	 * - allocation alignment
	 */
	addr = kmem_alloc_guard(map, ptoa(10), ptoa(2) - 1,
	    KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard).kmr_address;
	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | GF | GL)", map);
	assert3u((addr + PAGE_SIZE) % ptoa(2), ==, 0);
	kmem_test_assert_map(map, 10, 1);

	kmem_test_for_entry(map, addr, ^(vm_map_entry_t e){
		assertf(e, "unable to find address %p in map %p", (void *)addr, map);
		assert(e->vme_kernel_object);
		assert(!e->vme_atomic);
		assert3u(e->vme_start, <=, addr);
		assert3u(addr + ptoa(10), <=, e->vme_end);
	});

	assert_faults(addr, 0);
	for (int i = 1; i < 9; i++) {
		assert_writeable(addr, i);
	}
	assert_faults(addr, 9);

	kmem_free(map, addr, ptoa(10));
	kmem_test_assert_map(map, 0, 0);

	/*
	 * Test pageable basics.
	 */
	addr = kmem_alloc_guard(map, ptoa(10), 0,
	    KMA_PAGEABLE, guard).kmr_address;
	assertf(addr != 0ull, "kma(%p, 10p, 0, PG)", map);
	kmem_test_assert_map(map, 10, 1);

	for (int i = 0; i < 9; i++) {
		assert_faults(addr, i);
		poke(addr, i, 42);
		assert_writeable(addr, i);
	}

	kmem_free(map, addr, ptoa(10));
	kmem_test_assert_map(map, 0, 0);
}

__attribute__((noinline))
static void
kmem_realloc_basic_test(vm_map_t map, kmr_flags_t kind)
{
	kmem_guard_t guard = {
		.kmg_atomic = !(kind & KMR_DATA),
		.kmg_tag = VM_KERN_MEMORY_DIAG,
		.kmg_context = 0xefface,
	};
	vm_offset_t addr, newaddr;
	const int N = 10;

	/*
	 * This isn't something kmem_realloc_guard() _needs_ to do,
	 * we could conceive an implementation where it grows in place
	 * if there's space after it.
	 *
	 * However, this is what the implementation does today.
	 */
	bool realloc_growth_changes_address = true;
	bool GL = (kind & KMR_GUARD_LAST);

	/*
	 * Initial N page allocation
	 */
	addr = kmem_alloc_guard(map, ptoa(N), 0,
	    (kind & (KMA_KOBJECT | KMA_GUARD_LAST | KMA_DATA)) | KMA_ZERO,
	    guard).kmr_address;
	assert3u(addr, !=, 0);
	kmem_test_assert_map(map, N, 1);
	for (int pg = 0; pg < N - GL; pg++) {
		poke(addr, pg, 42 + pg);
	}
	for (int pg = N - GL; pg < N; pg++) {
		assert_faults(addr, pg);
	}

	/*
	 * Grow to N + 3 pages
	 */
	newaddr = kmem_realloc_guard(map, addr, ptoa(N), ptoa(N + 3),
	    kind | KMR_ZERO, guard).kmr_address;
	assert3u(newaddr, !=, 0);
	if (realloc_growth_changes_address) {
		assert3u(addr, !=, newaddr);
	}
	if ((kind & KMR_FREEOLD) || (addr == newaddr)) {
		kmem_test_assert_map(map, N + 3, 1);
	} else {
		kmem_test_assert_map(map, 2 * N + 3, 2);
	}
	for (int pg = 0; pg < N - GL; pg++) {
		assert3u(peek(newaddr, pg), ==, 42 + pg);
	}
	if ((kind & KMR_FREEOLD) == 0) {
		for (int pg = 0; pg < N - GL; pg++) {
			assert3u(peek(addr, pg), ==, 42 + pg);
		}
		/* check for true sharing between the old and new mappings */
		poke(addr + 16, 0, 1234);
		assert3u(peek(newaddr + 16, 0), ==, 1234);
		kmem_free_guard(map, addr, ptoa(N), KMF_NONE, guard);
		kmem_test_assert_map(map, N + 3, 1);
	}
	if (addr != newaddr) {
		for (int pg = 0; pg < N - GL; pg++) {
			assert_faults(addr, pg);
		}
	}
	for (int pg = N - GL; pg < N + 3 - GL; pg++) {
		assert3u(peek(newaddr, pg), ==, 0);
	}
	for (int pg = N + 3 - GL; pg < N + 3; pg++) {
		assert_faults(newaddr, pg);
	}
	addr = newaddr;

	/*
	 * Shrink to N - 2 pages
	 */
	newaddr = kmem_realloc_guard(map, addr, ptoa(N + 3), ptoa(N - 2),
	    kind | KMR_ZERO, guard).kmr_address;
	assert3u(map->size, ==, ptoa(N - 2));
	assert3u(newaddr, ==, addr);
	kmem_test_assert_map(map, N - 2, 1);

	for (int pg = 0; pg < N - 2 - GL; pg++) {
		assert3u(peek(addr, pg), ==, 42 + pg);
	}
	for (int pg = N - 2 - GL; pg < N + 3; pg++) {
		assert_faults(addr, pg);
	}

	kmem_free_guard(map, addr, ptoa(N - 2), KMF_NONE, guard);
	kmem_test_assert_map(map, 0, 0);
}

static int
kmem_basic_test(__unused int64_t in, int64_t *out)
{
	mach_vm_offset_t addr;
	vm_map_t map;

	printf("%s: test running\n", __func__);

	map = kmem_suballoc(kernel_map, &addr, 64U << 20,
	    VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
	    KMS_NOFAIL | KMS_DATA, VM_KERN_MEMORY_DIAG).kmr_submap;

	printf("%s: kmem_alloc ...\n", __func__);
	kmem_alloc_basic_test(map);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_FREEOLD) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_FREEOLD);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_FIRST);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_LAST);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
	printf("%s: PASS\n", __func__);

	/* using KMR_DATA signals to test the non-atomic realloc path */
	printf("%s: kmem_realloc (KMR_DATA | KMR_FREEOLD) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_DATA | KMR_FREEOLD);
	printf("%s: PASS\n", __func__);

	printf("%s: kmem_realloc (KMR_DATA) ...\n", __func__);
	kmem_realloc_basic_test(map, KMR_DATA);
	printf("%s: PASS\n", __func__);

	kmem_free_guard(kernel_map, addr, 64U << 20, KMF_NONE, KMEM_GUARD_SUBMAP);
	vm_map_deallocate(map);

	printf("%s: test passed\n", __func__);
	*out = 1;
	return 0;
}
SYSCTL_TEST_REGISTER(kmem_basic, kmem_basic_test);

static void
kmem_test_get_size_idx_for_chunks(uint32_t chunks)
{
	uint32_t idx = kmem_get_size_idx_for_chunks(chunks);

	assert(chunks >= kmem_size_array[idx].ks_num_chunk);
}

__attribute__((noinline))
static void
kmem_test_get_size_idx_for_all_chunks(void)
{
	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
		uint32_t chunks = kmem_size_array[i].ks_num_chunk;

		if (chunks != 1) {
			kmem_test_get_size_idx_for_chunks(chunks - 1);
		}
		kmem_test_get_size_idx_for_chunks(chunks);
		kmem_test_get_size_idx_for_chunks(chunks + 1);
	}
}

static int
kmem_guard_obj_test(__unused int64_t in, int64_t *out)
{
	printf("%s: test running\n", __func__);

	printf("%s: kmem_get_size_idx_for_chunks\n", __func__);
	kmem_test_get_size_idx_for_all_chunks();
	printf("%s: PASS\n", __func__);

	printf("%s: test passed\n", __func__);
	*out = 1;
	return 0;
}
SYSCTL_TEST_REGISTER(kmem_guard_obj, kmem_guard_obj_test);
#endif /* DEBUG || DEVELOPMENT */