1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: vm/vm_kern.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Kernel memory management.
64 */
65
66 #include <mach/kern_return.h>
67 #include <mach/vm_param.h>
68 #include <kern/assert.h>
69 #include <kern/thread.h>
70 #include <vm/vm_kern_internal.h>
71 #include <vm/vm_map_internal.h>
72 #include <vm/vm_object_internal.h>
73 #include <vm/vm_page_internal.h>
74 #include <vm/vm_compressor_xnu.h>
75 #include <vm/vm_pageout_xnu.h>
76 #include <vm/vm_init_xnu.h>
77 #include <vm/vm_fault.h>
78 #include <vm/vm_memtag.h>
79 #include <vm/vm_far.h>
80 #include <kern/misc_protos.h>
81 #include <vm/cpm_internal.h>
82 #include <kern/ledger.h>
83 #include <kern/bits.h>
84 #include <kern/startup.h>
85 #include <kern/telemetry.h>
86
87 #include <string.h>
88
89 #include <libkern/OSDebug.h>
90 #include <libkern/crypto/sha2.h>
91 #include <libkern/section_keywords.h>
92 #include <sys/kdebug.h>
93 #include <sys/kdebug_triage.h>
94
95 #include <san/kasan.h>
96 #include <kern/kext_alloc.h>
97 #include <kern/backtrace.h>
98 #include <os/hash.h>
99 #include <kern/zalloc_internal.h>
100 #include <libkern/crypto/rand.h>
101
102 /*
103 * Variables exported by this module.
104 */
105
106 SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map;
107 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_ranges[KMEM_RANGE_COUNT];
108 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_large_ranges[KMEM_RANGE_COUNT];
109
110 static TUNABLE(uint32_t, kmem_ptr_ranges, "kmem_ptr_ranges",
111 KMEM_RANGE_ID_NUM_PTR);
112 #define KMEM_GOBJ_THRESHOLD (32ULL << 20)
113 #if DEBUG || DEVELOPMENT
114 #define KMEM_OUTLIER_LOG_SIZE (16ULL << 10)
115 #define KMEM_OUTLIER_SIZE 0
116 #define KMEM_OUTLIER_ALIGN 1
117 btlog_t kmem_outlier_log;
118 #endif /* DEBUG || DEVELOPMENT */
119
120 __startup_data static vm_map_size_t iokit_range_size;
121 __startup_data static vm_map_size_t data_range_size;
122 __startup_data static vm_map_size_t ptr_range_size;
123 __startup_data static vm_map_size_t sprayqtn_range_size;
124
125 #pragma mark helpers
126
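/*
 * ANYF() casts each typed flag set (kma_flags_t, kmr_flags_t, kmf_flags_t)
 * to the generic kmem_flags_t so that the shared helpers below can be
 * written once against kmem_flags_t.
 */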
127 __attribute__((overloadable))
128 __header_always_inline kmem_flags_t
129 ANYF(kma_flags_t flags)
130 {
131 return (kmem_flags_t)flags;
132 }
133
134 __attribute__((overloadable))
135 __header_always_inline kmem_flags_t
136 ANYF(kmr_flags_t flags)
137 {
138 return (kmem_flags_t)flags;
139 }
140
141 __attribute__((overloadable))
142 __header_always_inline kmem_flags_t
143 ANYF(kmf_flags_t flags)
144 {
145 return (kmem_flags_t)flags;
146 }
147
148 __abortlike
149 static void
150 __kmem_invalid_size_panic(
151 vm_map_t map,
152 vm_size_t size,
153 uint32_t flags)
154 {
155 panic("kmem(map=%p, flags=0x%x): invalid size %zd",
156 map, flags, (size_t)size);
157 }
158
159 __abortlike
160 static void
161 __kmem_invalid_arguments_panic(
162 const char *what,
163 vm_map_t map,
164 vm_address_t address,
165 vm_size_t size,
166 uint32_t flags)
167 {
168 panic("kmem_%s(map=%p, addr=%p, size=%zd, flags=0x%x): "
169 "invalid arguments passed",
170 what, map, (void *)address, (size_t)size, flags);
171 }
172
173 __abortlike
174 static void
175 __kmem_failed_panic(
176 vm_map_t map,
177 vm_size_t size,
178 uint32_t flags,
179 kern_return_t kr,
180 const char *what)
181 {
182 panic("kmem_%s(%p, %zd, 0x%x): failed with %d",
183 what, map, (size_t)size, flags, kr);
184 }
185
186 __abortlike
187 static void
188 __kmem_entry_not_found_panic(
189 vm_map_t map,
190 vm_offset_t addr)
191 {
192 panic("kmem(map=%p) no entry found at %p", map, (void *)addr);
193 }
194
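/*
 * Select the VM object backing an allocation: the compressor object for
 * KMEM_COMPRESSOR, otherwise the default kernel object (KMEM_KOBJECT is
 * then required).  The two flags are mutually exclusive.
 */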
195 static inline vm_object_t
196 __kmem_object(kmem_flags_t flags)
197 {
198 if (flags & KMEM_COMPRESSOR) {
199 if (flags & KMEM_KOBJECT) {
200 panic("both KMEM_KOBJECT and KMEM_COMPRESSOR specified");
201 }
202 return compressor_object;
203 }
204 if (!(flags & KMEM_KOBJECT)) {
205 panic("KMEM_KOBJECT or KMEM_COMPRESSOR is required");
206 }
207 return kernel_object_default;
208 }
209
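/*
 * Data, shared-data and compressor allocations use the default pmap
 * mapping type; everything else is mapped with the restricted type.
 */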
210 static inline pmap_mapping_type_t
211 __kmem_mapping_type(kmem_flags_t flags)
212 {
213 if (flags & (KMEM_DATA | KMEM_COMPRESSOR | KMEM_DATA_SHARED)) {
214 return PMAP_MAPPING_TYPE_DEFAULT;
215 } else {
216 return PMAP_MAPPING_TYPE_RESTRICTED;
217 }
218 }
219
220 static inline vm_size_t
221 __kmem_guard_left(kmem_flags_t flags)
222 {
223 return (flags & KMEM_GUARD_FIRST) ? PAGE_SIZE : 0;
224 }
225
226 static inline vm_size_t
227 __kmem_guard_right(kmem_flags_t flags)
228 {
229 return (flags & KMEM_GUARD_LAST) ? PAGE_SIZE : 0;
230 }
231
232 static inline vm_size_t
233 __kmem_guard_size(kmem_flags_t flags)
234 {
235 return __kmem_guard_left(flags) + __kmem_guard_right(flags);
236 }
237
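/*
 * Recover the caller's original (unrounded) allocation size: kernel-object
 * entries stash the rounding delta in vme_object_or_delta, regular objects
 * keep it in the object's vo_size_delta.
 */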
238 __pure2
239 static inline vm_size_t
240 __kmem_entry_orig_size(vm_map_entry_t entry)
241 {
242 vm_object_t object = VME_OBJECT(entry);
243
244 if (entry->vme_kernel_object) {
245 return entry->vme_end - entry->vme_start -
246 entry->vme_object_or_delta;
247 } else {
248 return object->vo_size - object->vo_size_delta;
249 }
250 }
251
252
253 #pragma mark kmem range methods
254
255 #define mach_vm_range_load(r, rmin, rmax) \
256 ({ (rmin) = (r)->min_address; (rmax) = (r)->max_address; })
257
258 __abortlike
259 static void
260 __mach_vm_range_overflow(
261 mach_vm_offset_t addr,
262 mach_vm_offset_t size)
263 {
264 panic("invalid vm range: [0x%llx, 0x%llx + 0x%llx) wraps around",
265 addr, addr, size);
266 }
267
268 __abortlike
269 static void
270 __mach_vm_range_invalid(
271 mach_vm_offset_t min_address,
272 mach_vm_offset_t max_address)
273 {
274 panic("invalid vm range: [0x%llx, 0x%llx) wraps around",
275 min_address, max_address);
276 }
277
278 __header_always_inline mach_vm_size_t
279 mach_vm_range_size(const struct mach_vm_range *r)
280 {
281 mach_vm_offset_t rmin, rmax;
282
283 mach_vm_range_load(r, rmin, rmax);
284 return rmax - rmin;
285 }
286
287 __attribute__((overloadable))
288 __header_always_inline bool
289 mach_vm_range_contains(const struct mach_vm_range *r, mach_vm_offset_t addr)
290 {
291 mach_vm_offset_t rmin, rmax;
292
293 #if CONFIG_KERNEL_TAGGING
294 if (VM_KERNEL_ADDRESS(addr)) {
295 addr = vm_memtag_canonicalize_kernel(addr);
296 }
297 #endif /* CONFIG_KERNEL_TAGGING */
298
299 /*
300 * The `&` is not a typo: we really expect the check to pass,
301 * so encourage the compiler to eagerly load and test without branches
302 */
303 mach_vm_range_load(r, rmin, rmax);
304 return (addr >= rmin) & (addr < rmax);
305 }
306
307 __attribute__((overloadable))
308 __header_always_inline bool
309 mach_vm_range_contains(
310 const struct mach_vm_range *r,
311 mach_vm_offset_t addr,
312 mach_vm_offset_t size)
313 {
314 mach_vm_offset_t rmin, rmax;
315
316 #if CONFIG_KERNEL_TAGGING
317 if (VM_KERNEL_ADDRESS(addr)) {
318 addr = vm_memtag_canonicalize_kernel(addr);
319 }
320 #endif /* CONFIG_KERNEL_TAGGING */
321
322 mach_vm_offset_t end;
323 if (__improbable(os_add_overflow(addr, size, &end))) {
324 return false;
325 }
326
327 /*
328 * The `&` is not a typo: we really expect the check to pass,
329 * so encourage the compiler to eagerly load and test without branches
330 */
331 mach_vm_range_load(r, rmin, rmax);
332 return (addr >= rmin) & (end >= rmin) & (end <= rmax);
333 }
334
335 __attribute__((overloadable))
336 __header_always_inline bool
337 mach_vm_range_intersects(
338 const struct mach_vm_range *r1,
339 const struct mach_vm_range *r2)
340 {
341 mach_vm_offset_t r1_min, r1_max;
342 mach_vm_offset_t r2_min, r2_max;
343
344 mach_vm_range_load(r1, r1_min, r1_max);
345 r2_min = r2->min_address;
346 r2_max = r2->max_address;
347
348 if (r1_min > r1_max) {
349 __mach_vm_range_invalid(r1_min, r1_max);
350 }
351
352 if (r2_min > r2_max) {
353 __mach_vm_range_invalid(r2_min, r2_max);
354 }
355
356 return r1_max > r2_min && r1_min < r2_max;
357 }
358
359 __attribute__((overloadable))
360 __header_always_inline bool
361 mach_vm_range_intersects(
362 const struct mach_vm_range *r1,
363 mach_vm_offset_t addr,
364 mach_vm_offset_t size)
365 {
366 struct mach_vm_range r2;
367
368 #if CONFIG_KERNEL_TAGGING
369 addr = VM_KERNEL_STRIP_UPTR(addr);
370 #endif /* CONFIG_KERNEL_TAGGING */
371
372 r2.min_address = addr;
373 if (os_add_overflow(addr, size, &r2.max_address)) {
374 __mach_vm_range_overflow(addr, size);
375 }
376
377 return mach_vm_range_intersects(r1, &r2);
378 }
379
380 bool
381 kmem_range_id_contains(
382 kmem_range_id_t range_id,
383 vm_map_offset_t addr,
384 vm_map_size_t size)
385 {
386 return mach_vm_range_contains(&kmem_ranges[range_id], addr, size);
387 }
388
389 __abortlike
390 static void
391 kmem_range_invalid_panic(
392 kmem_range_id_t range_id,
393 vm_map_offset_t addr,
394 vm_map_size_t size)
395 {
396 const struct mach_vm_range *r = &kmem_ranges[range_id];
397 mach_vm_offset_t rmin, rmax;
398
399 mach_vm_range_load(r, rmin, rmax);
400 if (addr + size < rmin) {
401 panic("addr %p + size %llu overflows %p", (void *)addr, size,
402 (void *)(addr + size));
403 }
404 panic("addr %p + size %llu doesnt fit in one range (id: %u min: %p max: %p)",
405 (void *)addr, size, range_id, (void *)rmin, (void *)rmax);
406 }
407
408 /*
409 * Return whether the entire allocation is contained in the given range
410 */
411 static bool
412 kmem_range_contains_fully(
413 kmem_range_id_t range_id,
414 vm_map_offset_t addr,
415 vm_map_size_t size)
416 {
417 const struct mach_vm_range *r = &kmem_ranges[range_id];
418 mach_vm_offset_t rmin, rmax;
419 bool result = false;
420
421 if (VM_KERNEL_ADDRESS(addr)) {
422 addr = vm_memtag_canonicalize_kernel(addr);
423 }
424
425 /*
426 * The `&` is not a typo: we really expect the check to pass,
427 * so encourage the compiler to eagerly load and test without branches
428 */
429 mach_vm_range_load(r, rmin, rmax);
430 result = (addr >= rmin) & (addr < rmax);
431 if (__improbable(result
432 && ((addr + size < rmin) || (addr + size > rmax)))) {
433 kmem_range_invalid_panic(range_id, addr, size);
434 }
435 return result;
436 }
437
438 vm_map_size_t
439 kmem_range_id_size(kmem_range_id_t range_id)
440 {
441 return mach_vm_range_size(&kmem_ranges[range_id]);
442 }
443
444 kmem_range_id_t
445 kmem_addr_get_range(vm_map_offset_t addr, vm_map_size_t size)
446 {
447 kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
448
449 for (; range_id < KMEM_RANGE_COUNT; range_id++) {
450 if (kmem_range_contains_fully(range_id, addr, size)) {
451 return range_id;
452 }
453 }
454 return KMEM_RANGE_ID_NONE;
455 }
456
457 bool
458 kmem_is_ptr_range(vm_map_range_id_t range_id)
459 {
460 return (range_id >= KMEM_RANGE_ID_FIRST) &&
461 (range_id <= KMEM_RANGE_ID_NUM_PTR);
462 }
463
464 __abortlike
465 static void
466 kmem_range_invalid_for_overwrite(vm_map_offset_t addr)
467 {
468 panic("Can't overwrite mappings (addr: %p) in kmem ptr ranges",
469 (void *)addr);
470 }
471
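/*
 * Return the kmem range containing [addr, addr + size), panicking if it is
 * one of the pointer ranges, whose mappings must never be overwritten.
 */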
472 mach_vm_range_t
473 kmem_validate_range_for_overwrite(
474 vm_map_offset_t addr,
475 vm_map_size_t size)
476 {
477 vm_map_range_id_t range_id = kmem_addr_get_range(addr, size);
478
479 if (kmem_is_ptr_range(range_id)) {
480 kmem_range_invalid_for_overwrite(addr);
481 }
482
483 return &kmem_ranges[range_id];
484 }
485
486
487 #pragma mark entry parameters
488
489
490 __abortlike
491 static void
492 __kmem_entry_validate_panic(
493 vm_map_t map,
494 vm_map_entry_t entry,
495 vm_offset_t addr,
496 vm_size_t size,
497 uint32_t flags,
498 kmem_guard_t guard)
499 {
500 const char *what = "???";
501
502 if (entry->vme_atomic != guard.kmg_atomic) {
503 what = "atomicity";
504 } else if (entry->is_sub_map != guard.kmg_submap) {
505 what = "objectness";
506 } else if (addr != entry->vme_start) {
507 what = "left bound";
508 } else if ((flags & KMF_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
509 what = "right bound";
510 } else if (guard.kmg_context != entry->vme_context) {
511 what = "guard";
512 }
513
514 panic("kmem(map=%p, addr=%p, size=%zd, flags=0x%x): "
515 "entry:%p %s mismatch guard(0x%08x)",
516 map, (void *)addr, size, flags, entry,
517 what, guard.kmg_context);
518 }
519
520 static bool
521 __kmem_entry_validate_guard(
522 vm_map_entry_t entry,
523 vm_offset_t addr,
524 vm_size_t size,
525 kmem_flags_t flags,
526 kmem_guard_t guard)
527 {
528 if (entry->vme_atomic != guard.kmg_atomic) {
529 return false;
530 }
531
532 if (!guard.kmg_atomic) {
533 return true;
534 }
535
536 if (entry->is_sub_map != guard.kmg_submap) {
537 return false;
538 }
539
540 if (addr != entry->vme_start) {
541 return false;
542 }
543
544 if ((flags & KMEM_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
545 return false;
546 }
547
548 if (!guard.kmg_submap && guard.kmg_context != entry->vme_context) {
549 return false;
550 }
551
552 return true;
553 }
554
555 void
556 kmem_entry_validate_guard(
557 vm_map_t map,
558 vm_map_entry_t entry,
559 vm_offset_t addr,
560 vm_size_t size,
561 kmem_guard_t guard)
562 {
563 if (!__kmem_entry_validate_guard(entry, addr, size, KMEM_NONE, guard)) {
564 __kmem_entry_validate_panic(map, entry, addr, size, KMEM_NONE, guard);
565 }
566 }
567
568 __abortlike
569 static void
570 __kmem_entry_validate_object_panic(
571 vm_map_t map,
572 vm_map_entry_t entry,
573 kmem_flags_t flags)
574 {
575 const char *what;
576 const char *verb;
577
578 if (entry->is_sub_map) {
579 panic("kmem(map=%p) entry %p is a submap", map, entry);
580 }
581
582 if (flags & KMEM_KOBJECT) {
583 what = "kernel";
584 verb = "isn't";
585 } else if (flags & KMEM_COMPRESSOR) {
586 what = "compressor";
587 verb = "isn't";
588 } else if (entry->vme_kernel_object) {
589 what = "kernel";
590 verb = "is unexpectedly";
591 } else {
592 what = "compressor";
593 verb = "is unexpectedly";
594 }
595
596 panic("kmem(map=%p, flags=0x%x): entry %p %s for the %s object",
597 map, flags, entry, verb, what);
598 }
599
600 static bool
601 __kmem_entry_validate_object(
602 vm_map_entry_t entry,
603 kmem_flags_t flags)
604 {
605 if (entry->is_sub_map) {
606 return false;
607 }
608 if ((bool)(flags & KMEM_KOBJECT) != entry->vme_kernel_object) {
609 return false;
610 }
611
612 return (bool)(flags & KMEM_COMPRESSOR) ==
613 (VME_OBJECT(entry) == compressor_object);
614 }
615
616 vm_size_t
617 kmem_size_guard(
618 vm_map_t map,
619 vm_offset_t addr,
620 kmem_guard_t guard)
621 {
622 kmem_flags_t flags = KMEM_GUESS_SIZE;
623 vm_map_entry_t entry;
624 vm_size_t size;
625
626 vm_map_lock_read(map);
627
628 #if KASAN_CLASSIC
629 addr -= PAGE_SIZE;
630 #endif /* KASAN_CLASSIC */
631 addr = vm_memtag_canonicalize_kernel(addr);
632
633 if (!vm_map_lookup_entry(map, addr, &entry)) {
634 __kmem_entry_not_found_panic(map, addr);
635 }
636
637 if (!__kmem_entry_validate_guard(entry, addr, 0, flags, guard)) {
638 __kmem_entry_validate_panic(map, entry, addr, 0, flags, guard);
639 }
640
641 size = __kmem_entry_orig_size(entry);
642
643 vm_map_unlock_read(map);
644
645 return size;
646 }
647
648 static inline uint16_t
649 kmem_hash_backtrace(
650 void *fp)
651 {
652 uint64_t bt_count;
653 uintptr_t bt[8] = {};
654
655 struct backtrace_control ctl = {
656 .btc_frame_addr = (uintptr_t)fp,
657 };
658
659 bt_count = backtrace(bt, sizeof(bt) / sizeof(bt[0]), &ctl, NULL);
660 return (uint16_t) os_hash_jenkins(bt, bt_count * sizeof(bt[0]));
661 }
662
663 static_assert(KMEM_RANGE_ID_DATA - 1 <= KMEM_RANGE_MASK,
664 "Insufficient bits to represent ptr ranges");
665
666 kmem_range_id_t
667 kmem_adjust_range_id(
668 uint32_t hash)
669 {
670 return (kmem_range_id_t) (KMEM_RANGE_ID_PTR_0 +
671 (hash & KMEM_RANGE_MASK) % kmem_ptr_ranges);
672 }
673
674 static bool
675 kmem_use_sprayqtn(
676 kma_flags_t kma_flags,
677 vm_map_size_t map_size,
678 vm_offset_t mask)
679 {
680 /*
681 * Pointer allocations that are above the guard objects threshold or have
682 * leading guard pages with non standard alignment requests are redirected
683 * to the sprayqtn range.
684 */
685 #if DEBUG || DEVELOPMENT
686 btref_get_flags_t flags = (kma_flags & KMA_NOPAGEWAIT) ?
687 BTREF_GET_NOWAIT : 0;
688
689 if ((kma_flags & KMA_SPRAYQTN) == 0) {
690 if (map_size > KMEM_GOBJ_THRESHOLD) {
691 btlog_record(kmem_outlier_log, (void *)map_size, KMEM_OUTLIER_SIZE,
692 btref_get(__builtin_frame_address(0), flags));
693 } else if ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK)) {
694 btlog_record(kmem_outlier_log, (void *)mask, KMEM_OUTLIER_ALIGN,
695 btref_get(__builtin_frame_address(0), flags));
696 }
697 }
698 #endif /* DEBUG || DEVELOPMENT */
699
700 return (kma_flags & KMA_SPRAYQTN) ||
701 (map_size > KMEM_GOBJ_THRESHOLD) ||
702 ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK));
703 }
704
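/*
 * Pick the kmem range (data, spray-quarantine, or one of the pointer
 * ranges) for a kernel_map allocation, and whether it is carved from the
 * left or the right end of that range.  Only applies once the kmem ranges
 * are set up (startup_phase >= STARTUP_SUB_KMEM).
 */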
705 static void
706 kmem_apply_security_policy(
707 vm_map_t map,
708 kma_flags_t kma_flags,
709 kmem_guard_t guard,
710 vm_map_size_t map_size,
711 vm_offset_t mask,
712 vm_map_kernel_flags_t *vmk_flags,
713 bool assert_dir __unused)
714 {
715 kmem_range_id_t range_id;
716 bool from_right;
717 uint16_t type_hash = guard.kmg_type_hash;
718
719 if (startup_phase < STARTUP_SUB_KMEM || map != kernel_map) {
720 return;
721 }
722
723 /*
724 * A non-zero type-hash must be passed by krealloc_type
725 */
726 #if (DEBUG || DEVELOPMENT)
727 if (assert_dir && !(kma_flags & (KMA_DATA | KMA_DATA_SHARED))) {
728 assert(type_hash != 0);
729 }
730 #endif
731
732 if (kma_flags & (KMA_DATA | KMA_DATA_SHARED)) {
733 range_id = KMEM_RANGE_ID_DATA;
734 /*
735 * As an optimization in KMA_DATA to avoid fragmentation,
736 * allocate static carveouts at the end of the DATA range.
737 */
738 from_right = (bool)(kma_flags & KMA_PERMANENT);
739 } else if (kmem_use_sprayqtn(kma_flags, map_size, mask)) {
740 range_id = KMEM_RANGE_ID_SPRAYQTN;
741 from_right = (bool)(kma_flags & KMA_PERMANENT);
742 } else if (type_hash) {
743 range_id = (kmem_range_id_t)(type_hash & KMEM_RANGE_MASK);
744 from_right = type_hash & KMEM_DIRECTION_MASK;
745 } else {
746 /*
747 * Range id needs to correspond to one of the PTR ranges
748 */
749 type_hash = (uint16_t) kmem_hash_backtrace(__builtin_frame_address(0));
750 range_id = kmem_adjust_range_id(type_hash);
751 from_right = type_hash & KMEM_DIRECTION_MASK;
752 }
753
754 vmk_flags->vmkf_range_id = range_id;
755 vmk_flags->vmkf_last_free = from_right;
756 }
757
758 #pragma mark allocation
759
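/*
 * Common allocation path behind kmem_alloc_guard() and
 * kmem_alloc_contig_guard().  When the alloc_pages block is non-NULL it
 * overrides how the backing pages are obtained (used for physically
 * contiguous allocations); otherwise vm_page_alloc_list() is used.
 */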
760 static kmem_return_t
761 kmem_alloc_guard_internal(
762 vm_map_t map,
763 vm_size_t size,
764 vm_offset_t mask,
765 kma_flags_t flags,
766 kmem_guard_t guard,
767 kern_return_t (^alloc_pages)(vm_size_t, kma_flags_t, vm_page_t *))
768 {
769 vm_object_t object;
770 vm_offset_t delta = 0;
771 vm_map_entry_t entry = NULL;
772 vm_map_offset_t map_addr, fill_start;
773 vm_map_size_t map_size, fill_size;
774 vm_page_t guard_left = VM_PAGE_NULL;
775 vm_page_t guard_right = VM_PAGE_NULL;
776 vm_page_t wired_page_list = VM_PAGE_NULL;
777 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
778 bool skip_guards;
779 kmem_return_t kmr = { };
780
781 assert(kernel_map && map->pmap == kernel_pmap);
782
783 #if DEBUG || DEVELOPMENT
784 VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_START,
785 size, 0, 0, 0);
786 #endif
787
788
789 if (size == 0 ||
790 (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) ||
791 (size < __kmem_guard_size(ANYF(flags)))) {
792 __kmem_invalid_size_panic(map, size, flags);
793 }
794
795 /*
796 * limit the size of a single extent of wired memory
797 * to try and limit the damage to the system if
798 * too many pages get wired down
799 * limit raised to 2GB with 128GB max physical limit,
800 * but scaled by installed memory above this
801 *
802 * Note: kmem_alloc_contig_guard() is immune to this check.
803 */
804 if (__improbable(!(flags & (KMA_VAONLY | KMA_PAGEABLE)) &&
805 alloc_pages == NULL &&
806 size > MAX(1ULL << 31, sane_size / 64))) {
807 kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
808 goto out_error;
809 }
810
811 #if 136275805
812 /*
813 * XXX: Redundantly check the mapping size here so that failure stack traces
814 * are more useful. This has no functional value but is helpful because
815 * telemetry traps can currently only capture the last five calls and
816 * so we want to trap as shallow as possible in a select few cases
817 * where we anticipate issues.
818 *
819 * When telemetry collection is complete, this will be removed.
820 */
821 if (__improbable(!vm_map_is_map_size_valid(
822 kernel_map, size, flags & KMA_NOSOFTLIMIT))) {
823 kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
824 goto out_error;
825 }
826 #endif /* 136275805 */
827
828 /*
829 * Guard pages:
830 *
831 * Guard pages are implemented as fictitious pages.
832 *
833 * However, some maps, and some objects are known
834 * to manage their memory explicitly, and do not need
835 * those to be materialized, which saves memory.
836 *
837 * By placing guard pages on either end of a stack,
838 * they can help detect cases where a thread walks
839 * off either end of its stack.
840 *
841 * They are allocated and set up here and attempts
842 * to access those pages are trapped in vm_fault_page().
843 *
844 * The map_size we were passed may include extra space for
845 * guard pages. fill_size represents the actual size to populate.
846 * Similarly, fill_start indicates where the actual pages
847 * will begin in the range.
848 */
849
850 map_size = round_page(size);
851 fill_start = 0;
852 fill_size = map_size - __kmem_guard_size(ANYF(flags));
853
854 #if KASAN_CLASSIC
855 if (flags & KMA_KASAN_GUARD) {
856 assert((flags & (KMA_GUARD_FIRST | KMA_GUARD_LAST)) == 0);
857 flags |= KMA_GUARD_FIRST | KMA_GUARD_LAST;
858 delta = ptoa(2);
859 map_size += delta;
860 }
861 #else
862 (void)delta;
863 #endif /* KASAN_CLASSIC */
864
865 skip_guards = (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) ||
866 map->never_faults;
867
868 if (flags & KMA_GUARD_FIRST) {
869 vmk_flags.vmkf_guard_before = true;
870 fill_start += PAGE_SIZE;
871 }
872 if (flags & KMA_NOSOFTLIMIT) {
873 vmk_flags.vmkf_no_soft_limit = true;
874 }
875 if ((flags & KMA_GUARD_FIRST) && !skip_guards) {
876 guard_left = vm_page_create_guard((flags & KMA_NOPAGEWAIT) == 0);
877 if (__improbable(guard_left == VM_PAGE_NULL)) {
878 kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
879 goto out_error;
880 }
881 }
882 if ((flags & KMA_GUARD_LAST) && !skip_guards) {
883 guard_right = vm_page_create_guard((flags & KMA_NOPAGEWAIT) == 0);
884 if (__improbable(guard_right == VM_PAGE_NULL)) {
885 kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
886 goto out_error;
887 }
888 }
889
890 if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
891 if (alloc_pages) {
892 kmr.kmr_return = alloc_pages(fill_size, flags,
893 &wired_page_list);
894 } else {
895 kmr.kmr_return = vm_page_alloc_list(atop(fill_size), flags,
896 &wired_page_list);
897 }
898 if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
899 goto out_error;
900 }
901 }
902
903 /*
904 * Allocate a new object (if necessary). We must do this before
905 * locking the map, or risk deadlock with the default pager.
906 */
907 if (flags & KMA_KOBJECT) {
908 {
909 object = kernel_object_default;
910 }
911 vm_object_reference(object);
912 } else if (flags & KMA_COMPRESSOR) {
913 object = compressor_object;
914 vm_object_reference(object);
915 } else {
916 object = vm_object_allocate(map_size);
917 vm_object_lock(object);
918 vm_object_set_size(object, map_size, size);
919 /* stabilize the object to prevent shadowing */
920 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
921 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
922 vm_object_unlock(object);
923 }
924
925 if (flags & KMA_LAST_FREE) {
926 vmk_flags.vmkf_last_free = true;
927 }
928 if (flags & KMA_PERMANENT) {
929 vmk_flags.vmf_permanent = true;
930 }
931 kmem_apply_security_policy(map, flags, guard, map_size, mask, &vmk_flags,
932 false);
933
934 kmr.kmr_return = vm_map_find_space(map, 0, map_size, mask,
935 vmk_flags, &entry);
936 if (__improbable(KERN_SUCCESS != kmr.kmr_return)) {
937 vm_object_deallocate(object);
938 goto out_error;
939 }
940
941 map_addr = entry->vme_start;
942 VME_OBJECT_SET(entry, object, guard.kmg_atomic, guard.kmg_context);
943 VME_ALIAS_SET(entry, guard.kmg_tag);
944 if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
945 VME_OFFSET_SET(entry, map_addr);
946 }
947
948 #if KASAN
949 if ((flags & KMA_KOBJECT) && guard.kmg_atomic) {
950 entry->vme_object_or_delta = (-size & PAGE_MASK) + delta;
951 }
952 #endif /* KASAN */
953
954 if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE))) {
955 entry->wired_count = 1;
956 vme_btref_consider_and_set(entry, __builtin_frame_address(0));
957 }
958
959 if (guard_left || guard_right || wired_page_list) {
960 vm_object_offset_t offset = 0ull;
961
962 vm_object_lock(object);
963 vm_map_unlock(map);
964
965 if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
966 offset = map_addr;
967 }
968
969 if (guard_left) {
970 vm_page_insert(guard_left, object, offset);
971 guard_left->vmp_busy = FALSE;
972 guard_left = VM_PAGE_NULL;
973 }
974
975 if (guard_right) {
976 vm_page_insert(guard_right, object,
977 offset + fill_start + fill_size);
978 guard_right->vmp_busy = FALSE;
979 guard_right = VM_PAGE_NULL;
980 }
981
982 if (wired_page_list) {
983 kernel_memory_populate_object_and_unlock(object,
984 map_addr + fill_start, offset + fill_start, fill_size,
985 wired_page_list, flags, guard.kmg_tag, VM_PROT_DEFAULT,
986 __kmem_mapping_type(ANYF(flags)));
987 } else {
988 vm_object_unlock(object);
989 }
990 } else {
991 vm_map_unlock(map);
992 }
993
994 /*
995 * now that the pages are wired, we no longer have to fear coalesce
996 */
997 if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
998 vm_map_simplify(map, map_addr);
999 }
1000
1001 #if DEBUG || DEVELOPMENT
1002 VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
1003 atop(fill_size), 0, 0, 0);
1004 #endif /* DEBUG || DEVELOPMENT */
1005 kmr.kmr_address = CAST_DOWN(vm_offset_t, map_addr);
1006
1007 #if KASAN
1008 if (flags & (KMA_KASAN_GUARD | KMA_PAGEABLE)) {
1009 /*
1010 * We need to allow the range for pageable memory,
1011 * or faulting will not be allowed.
1012 */
1013 kasan_notify_address(map_addr, map_size);
1014 }
1015 #endif /* KASAN */
1016 #if KASAN_CLASSIC
1017 if (flags & KMA_KASAN_GUARD) {
1018 kmr.kmr_address += PAGE_SIZE;
1019 kasan_alloc_large(kmr.kmr_address, size);
1020 }
1021 #endif /* KASAN_CLASSIC */
1022 #if CONFIG_KERNEL_TAGGING
1023 if (!(flags & KMA_VAONLY) && (flags & KMA_TAG)) {
1024 kmr.kmr_ptr = vm_memtag_generate_and_store_tag((caddr_t)kmr.kmr_address + fill_start, fill_size);
1025 kmr.kmr_ptr = (caddr_t)kmr.kmr_ptr - fill_start;
1026 #if KASAN_TBI
1027 kasan_tbi_retag_unused_space(kmr.kmr_ptr, map_size, size);
1028 #endif /* KASAN_TBI */
1029 }
1030 #endif /* CONFIG_KERNEL_TAGGING */
1031 return kmr;
1032
1033 out_error:
1034 if (flags & KMA_NOFAIL) {
1035 __kmem_failed_panic(map, size, flags, kmr.kmr_return, "alloc");
1036 }
1037 if (guard_left) {
1038 guard_left->vmp_snext = wired_page_list;
1039 wired_page_list = guard_left;
1040 }
1041 if (guard_right) {
1042 guard_right->vmp_snext = wired_page_list;
1043 wired_page_list = guard_right;
1044 }
1045 if (wired_page_list) {
1046 vm_page_free_list(wired_page_list, FALSE);
1047 }
1048
1049 #if DEBUG || DEVELOPMENT
1050 VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
1051 0, 0, 0, 0);
1052 #endif /* DEBUG || DEVELOPMENT */
1053
1054 return kmr;
1055 }
1056
1057 kmem_return_t
1058 kmem_alloc_guard(
1059 vm_map_t map,
1060 vm_size_t size,
1061 vm_offset_t mask,
1062 kma_flags_t flags,
1063 kmem_guard_t guard)
1064 {
1065 return kmem_alloc_guard_internal(map, size, mask, flags, guard, NULL);
1066 }
1067
1068 kmem_return_t
1069 kmem_alloc_contig_guard(
1070 vm_map_t map,
1071 vm_size_t size,
1072 vm_offset_t mask,
1073 ppnum_t max_pnum,
1074 ppnum_t pnum_mask,
1075 kma_flags_t flags,
1076 kmem_guard_t guard)
1077 {
1078 __auto_type alloc_pages = ^(vm_size_t fill_size, kma_flags_t kma_flags, vm_page_t *pages) {
1079 return cpm_allocate(fill_size, pages, max_pnum, pnum_mask, FALSE, kma_flags);
1080 };
1081
1082 return kmem_alloc_guard_internal(map, size, mask, flags, guard, alloc_pages);
1083 }
1084
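/*
 * Create a kernel submap of the given size backed by the parent's pmap and
 * enter it into the parent map.  Non-KMS_DATA submaps are entered as atomic
 * entries so that they can never be split.
 */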
1085 kmem_return_t
1086 kmem_suballoc(
1087 vm_map_t parent,
1088 mach_vm_offset_t *addr,
1089 vm_size_t size,
1090 vm_map_create_options_t vmc_options,
1091 int vm_flags,
1092 kms_flags_t flags,
1093 vm_tag_t tag)
1094 {
1095 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1096 vm_map_offset_t map_addr = 0;
1097 kmem_return_t kmr = { };
1098 vm_map_t map;
1099
1100 assert(page_aligned(size));
1101 assert(parent->pmap == kernel_pmap);
1102
1103 vm_map_kernel_flags_set_vmflags(&vmk_flags, vm_flags, tag);
1104
1105 if (parent == kernel_map) {
1106 assert(vmk_flags.vmf_overwrite || (flags & KMS_DATA));
1107 }
1108
1109 if (vmk_flags.vmf_fixed) {
1110 map_addr = trunc_page(*addr);
1111 }
1112
1113 pmap_reference(vm_map_pmap(parent));
1114 map = vm_map_create_options(vm_map_pmap(parent), 0, size, vmc_options);
1115
1116 /*
1117 * 1. vm_map_enter() will consume one ref on success.
1118 *
1119 * 2. make the entry atomic as kernel submaps should never be split.
1120 *
1121 * 3. instruct vm_map_enter() that it is a fresh submap
1122 * that needs to be taught its bounds as it inserted.
1123 */
1124 vm_map_reference(map);
1125
1126 vmk_flags.vmkf_submap = true;
1127 if ((flags & KMS_DATA) == 0) {
1128 /* FIXME: IOKit submaps get fragmented and can't be atomic */
1129 vmk_flags.vmkf_submap_atomic = true;
1130 }
1131 vmk_flags.vmkf_submap_adjust = true;
1132 if (flags & KMS_LAST_FREE) {
1133 vmk_flags.vmkf_last_free = true;
1134 }
1135 if (flags & KMS_PERMANENT) {
1136 vmk_flags.vmf_permanent = true;
1137 }
1138 if (flags & KMS_DATA) {
1139 vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
1140 }
1141 if (flags & KMS_NOSOFTLIMIT) {
1142 vmk_flags.vmkf_no_soft_limit = true;
1143 }
1144
1145 kmr.kmr_return = vm_map_enter(parent, &map_addr, size, 0,
1146 vmk_flags, (vm_object_t)map, 0, FALSE,
1147 VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
1148
1149 if (kmr.kmr_return != KERN_SUCCESS) {
1150 if (flags & KMS_NOFAIL) {
1151 panic("kmem_suballoc(map=%p, size=%zd) failed with %d",
1152 parent, size, kmr.kmr_return);
1153 }
1154 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1155 vm_map_deallocate(map);
1156 vm_map_deallocate(map); /* also removes ref to pmap */
1157 return kmr;
1158 }
1159
1160 /*
1161 * For kmem_suballocs that register a claim and are assigned a range, ensure
1162 * that the exact same range is returned.
1163 */
1164 if (*addr != 0 && parent == kernel_map &&
1165 startup_phase > STARTUP_SUB_KMEM) {
1166 assert(CAST_DOWN(vm_offset_t, map_addr) == *addr);
1167 } else {
1168 *addr = map_addr;
1169 }
1170
1171 kmr.kmr_submap = map;
1172 return kmr;
1173 }
1174
1175 /*
1176 * kmem_alloc:
1177 *
1178 * Allocate wired-down memory in the kernel's address map
1179 * or a submap. The memory is not zero-filled.
1180 */
1181
1182 __exported kern_return_t
1183 kmem_alloc_external(
1184 vm_map_t map,
1185 vm_offset_t *addrp,
1186 vm_size_t size);
1187 kern_return_t
1188 kmem_alloc_external(
1189 vm_map_t map,
1190 vm_offset_t *addrp,
1191 vm_size_t size)
1192 {
1193 if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1194 return kmem_alloc(map, addrp, size, KMA_NONE, vm_tag_bt());
1195 }
1196 /* Maintain ABI compatibility: invalid sizes used to be allowed */
1197 return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
1198 }
1199
1200
1201 /*
1202 * kmem_alloc_kobject:
1203 *
1204 * Allocate wired-down memory in the kernel's address map
1205 * or a submap. The memory is not zero-filled.
1206 *
1207 * The memory is allocated in the kernel_object.
1208 * It may not be copied with vm_map_copy, and
1209 * it may not be reallocated with kmem_realloc.
1210 */
1211
1212 __exported kern_return_t
1213 kmem_alloc_kobject_external(
1214 vm_map_t map,
1215 vm_offset_t *addrp,
1216 vm_size_t size);
1217 kern_return_t
1218 kmem_alloc_kobject_external(
1219 vm_map_t map,
1220 vm_offset_t *addrp,
1221 vm_size_t size)
1222 {
1223 if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1224 return kmem_alloc(map, addrp, size, KMA_KOBJECT, vm_tag_bt());
1225 }
1226 /* Maintain ABI compatibility: invalid sizes used to be allowed */
1227 return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
1228 }
1229
1230 /*
1231 * kmem_alloc_pageable:
1232 *
1233 * Allocate pageable memory in the kernel's address map.
1234 */
1235
1236 __exported kern_return_t
1237 kmem_alloc_pageable_external(
1238 vm_map_t map,
1239 vm_offset_t *addrp,
1240 vm_size_t size);
1241 kern_return_t
1242 kmem_alloc_pageable_external(
1243 vm_map_t map,
1244 vm_offset_t *addrp,
1245 vm_size_t size)
1246 {
1247 if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1248 return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA, vm_tag_bt());
1249 }
1250 /* Maintain ABI compatibility: invalid sizes used to be allowed */
1251 return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
1252 }
1253
1254 static __attribute__((always_inline, warn_unused_result))
1255 kern_return_t
1256 mach_vm_allocate_kernel_sanitize(
1257 vm_map_t map,
1258 mach_vm_offset_ut addr_u,
1259 mach_vm_size_ut size_u,
1260 vm_map_kernel_flags_t vmk_flags,
1261 vm_map_offset_t *map_addr,
1262 vm_map_size_t *map_size)
1263 {
1264 kern_return_t result;
1265 vm_map_offset_t map_end;
1266
1267 if (vmk_flags.vmf_fixed) {
1268 result = vm_sanitize_addr_size(addr_u, size_u,
1269 VM_SANITIZE_CALLER_VM_ALLOCATE_FIXED,
1270 map,
1271 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS | VM_SANITIZE_FLAGS_REALIGN_START,
1272 map_addr, &map_end, map_size);
1273 if (__improbable(result != KERN_SUCCESS)) {
1274 return result;
1275 }
1276 } else {
1277 *map_addr = 0;
1278 result = vm_sanitize_size(0, size_u,
1279 VM_SANITIZE_CALLER_VM_ALLOCATE_ANYWHERE, map,
1280 VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS,
1281 map_size);
1282 if (__improbable(result != KERN_SUCCESS)) {
1283 return result;
1284 }
1285 }
1286
1287 return KERN_SUCCESS;
1288 }
1289
1290 kern_return_t
1291 mach_vm_allocate_kernel(
1292 vm_map_t map,
1293 mach_vm_offset_ut *addr_u,
1294 mach_vm_size_ut size_u,
1295 vm_map_kernel_flags_t vmk_flags)
1296 {
1297 vm_map_offset_t map_addr;
1298 vm_map_size_t map_size;
1299 kern_return_t result;
1300
1301 if (map == VM_MAP_NULL) {
1302 ktriage_record(thread_tid(current_thread()),
1303 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
1304 KDBG_TRIAGE_RESERVED,
1305 KDBG_TRIAGE_VM_ALLOCATE_KERNEL_BADMAP_ERROR),
1306 KERN_INVALID_ARGUMENT /* arg */);
1307 return KERN_INVALID_ARGUMENT;
1308 }
1309
1310 if (!vm_map_kernel_flags_check_vm_and_kflags(vmk_flags,
1311 VM_FLAGS_USER_ALLOCATE)) {
1312 return KERN_INVALID_ARGUMENT;
1313 }
1314
1315 result = mach_vm_allocate_kernel_sanitize(map,
1316 *addr_u,
1317 size_u,
1318 vmk_flags,
1319 &map_addr,
1320 &map_size);
1321 if (__improbable(result != KERN_SUCCESS)) {
1322 result = vm_sanitize_get_kr(result);
1323 if (result == KERN_SUCCESS) {
1324 *addr_u = vm_sanitize_wrap_addr(0);
1325 } else {
1326 ktriage_record(thread_tid(current_thread()),
1327 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
1328 KDBG_TRIAGE_RESERVED,
1329 KDBG_TRIAGE_VM_ALLOCATE_KERNEL_BADSIZE_ERROR),
1330 KERN_INVALID_ARGUMENT /* arg */);
1331 }
1332 return result;
1333 }
1334
1335 vm_map_kernel_flags_update_range_id(&vmk_flags, map, map_size);
1336
1337 result = vm_map_enter(
1338 map,
1339 &map_addr,
1340 map_size,
1341 (vm_map_offset_t)0,
1342 vmk_flags,
1343 VM_OBJECT_NULL,
1344 (vm_object_offset_t)0,
1345 FALSE,
1346 VM_PROT_DEFAULT,
1347 VM_PROT_ALL,
1348 VM_INHERIT_DEFAULT);
1349
1350 if (result == KERN_SUCCESS) {
1351 #if KASAN
1352 if (map->pmap == kernel_pmap) {
1353 kasan_notify_address(map_addr, map_size);
1354 }
1355 #endif
1356 *addr_u = vm_sanitize_wrap_addr(map_addr);
1357 } else {
1358 ktriage_record(thread_tid(current_thread()),
1359 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
1360 KDBG_TRIAGE_RESERVED,
1361 KDBG_TRIAGE_VM_ALLOCATE_KERNEL_VMMAPENTER_ERROR),
1362 result /* arg */);
1363 }
1364 return result;
1365 }
1366
1367 #pragma mark population
1368
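/*
 * Enter a single wired page into the kernel pmap.  The first attempt uses
 * PMAP_OPTIONS_NOWAIT; on KERN_RESOURCE_SHORTAGE the object lock is
 * dropped and the mapping is retried in a blocking fashion.
 */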
1369 static void
1370 kernel_memory_populate_pmap_enter(
1371 vm_object_t object,
1372 vm_address_t addr,
1373 vm_object_offset_t offset,
1374 vm_page_t mem,
1375 vm_prot_t prot,
1376 int pe_flags,
1377 pmap_mapping_type_t mapping_type)
1378 {
1379 kern_return_t pe_result;
1380 int pe_options;
1381
1382 if (VMP_ERROR_GET(mem)) {
1383 panic("VM page %p should not have an error", mem);
1384 }
1385
1386 pe_options = PMAP_OPTIONS_NOWAIT;
1387 if (object->internal) {
1388 pe_options |= PMAP_OPTIONS_INTERNAL;
1389 }
1390 if (mem->vmp_reusable || object->all_reusable) {
1391 pe_options |= PMAP_OPTIONS_REUSABLE;
1392 }
1393
1394 pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1395 VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1396 pe_flags, /* wired */ TRUE, pe_options, NULL, mapping_type);
1397
1398 if (pe_result == KERN_RESOURCE_SHORTAGE) {
1399 vm_object_unlock(object);
1400
1401 pe_options &= ~PMAP_OPTIONS_NOWAIT;
1402
1403 pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1404 VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1405 pe_flags, /* wired */ TRUE, pe_options, NULL, mapping_type);
1406
1407 vm_object_lock(object);
1408 }
1409
1410 assert(pe_result == KERN_SUCCESS);
1411 }
1412
1413 void
1414 kernel_memory_populate_object_and_unlock(
1415 vm_object_t object, /* must be locked */
1416 vm_address_t addr,
1417 vm_offset_t offset,
1418 vm_size_t size,
1419 vm_page_t page_list,
1420 kma_flags_t flags,
1421 vm_tag_t tag,
1422 vm_prot_t prot,
1423 pmap_mapping_type_t mapping_type)
1424 {
1425 vm_page_t mem;
1426 int pe_flags;
1427 bool gobbled_list = page_list && page_list->vmp_gobbled;
1428
1429 assert(((flags & KMA_KOBJECT) != 0) == (is_kernel_object(object) != 0));
1430 assert3u((bool)(flags & KMA_COMPRESSOR), ==, object == compressor_object);
1431
1432
1433 if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
1434 assert3u(offset, ==, addr);
1435 } else {
1436 /*
1437 * kernel_memory_populate_pmap_enter() might drop the object
1438 * lock, and the caller might not own a reference anymore
1439 * and rely on holding the vm object lock for liveness.
1440 */
1441 vm_object_reference_locked(object);
1442 }
1443
1444 if (flags & KMA_KSTACK) {
1445 pe_flags = VM_MEM_STACK;
1446 } else {
1447 pe_flags = 0;
1448 }
1449
1450
1451 for (vm_object_offset_t pg_offset = 0;
1452 pg_offset < size;
1453 pg_offset += PAGE_SIZE_64) {
1454 if (page_list == NULL) {
1455 panic("%s: page_list too short", __func__);
1456 }
1457
1458 mem = page_list;
1459 page_list = mem->vmp_snext;
1460 mem->vmp_snext = NULL;
1461
1462 assert(mem->vmp_wire_count == 0);
1463 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
1464 assert(vm_page_is_canonical(mem));
1465
1466 if (flags & KMA_COMPRESSOR) {
1467 mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
1468 /*
1469 * Background processes doing I/O accounting can call
1470 * into NVME driver to do some work which results in
1471 * an allocation here and so we want to make sure
1472 * that the pages used by compressor, regardless of
1473 * process context, are never on the special Q.
1474 */
1475 mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
1476
1477 vm_page_insert(mem, object, offset + pg_offset);
1478 } else {
1479 mem->vmp_q_state = VM_PAGE_IS_WIRED;
1480 mem->vmp_wire_count = 1;
1481
1482
1483 vm_page_insert_wired(mem, object, offset + pg_offset, tag);
1484 }
1485
1486 mem->vmp_gobbled = false;
1487 mem->vmp_busy = false;
1488 mem->vmp_pmapped = true;
1489 mem->vmp_wpmapped = true;
1490
1491 /*
1492 * Manual PMAP_ENTER_OPTIONS() with shortcuts
1493 * for the kernel and compressor objects.
1494 */
1495 kernel_memory_populate_pmap_enter(object, addr, pg_offset,
1496 mem, prot, pe_flags, mapping_type);
1497
1498 if (flags & KMA_NOENCRYPT) {
1499 pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
1500 }
1501 }
1502
1503 if (page_list) {
1504 panic("%s: page_list too long", __func__);
1505 }
1506
1507 vm_object_unlock(object);
1508 if ((flags & (KMA_KOBJECT | KMA_COMPRESSOR)) == 0) {
1509 vm_object_deallocate(object);
1510 }
1511
1512 /*
1513 * Update the accounting:
1514 * - the compressor "wired" pages don't really count as wired
1515 * - kmem_alloc_contig_guard() gives gobbled pages,
1516 * which already count as wired but need to be ungobbled.
1517 */
1518 if (gobbled_list) {
1519 vm_page_lockspin_queues();
1520 if (flags & KMA_COMPRESSOR) {
1521 vm_page_wire_count -= atop(size);
1522 }
1523 vm_page_gobble_count -= atop(size);
1524 vm_page_unlock_queues();
1525 } else if ((flags & KMA_COMPRESSOR) == 0) {
1526 vm_page_lockspin_queues();
1527 vm_page_wire_count += atop(size);
1528 vm_page_unlock_queues();
1529 }
1530
1531 if (flags & KMA_KOBJECT) {
1532 /* vm_page_insert_wired() handles regular objects already */
1533 vm_tag_update_size(tag, size, NULL);
1534 }
1535
1536 #if KASAN
1537 if (flags & KMA_COMPRESSOR) {
1538 kasan_notify_address_nopoison(addr, size);
1539 } else {
1540 kasan_notify_address(addr, size);
1541 }
1542 #endif /* KASAN */
1543 }
1544
1545
1546 kern_return_t
1547 kernel_memory_populate(
1548 vm_offset_t addr,
1549 vm_size_t size,
1550 kma_flags_t flags,
1551 vm_tag_t tag)
1552 {
1553 kern_return_t kr = KERN_SUCCESS;
1554 vm_page_t page_list = NULL;
1555 vm_size_t page_count = atop_64(size);
1556 vm_object_t object = __kmem_object(ANYF(flags));
1557
1558 #if DEBUG || DEVELOPMENT
1559 VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_START,
1560 size, 0, 0, 0);
1561 #endif /* DEBUG || DEVELOPMENT */
1562
1563
1564 kr = vm_page_alloc_list(page_count, flags, &page_list);
1565 if (kr == KERN_SUCCESS) {
1566 vm_object_lock(object);
1567 kernel_memory_populate_object_and_unlock(object, addr,
1568 addr, size, page_list, flags, tag, VM_PROT_DEFAULT,
1569 __kmem_mapping_type(ANYF(flags)));
1570 }
1571
1572 #if DEBUG || DEVELOPMENT
1573 VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
1574 page_count, 0, 0, 0);
1575 #endif /* DEBUG || DEVELOPMENT */
1576 return kr;
1577 }
1578
1579 void
1580 kernel_memory_depopulate(
1581 vm_offset_t addr,
1582 vm_size_t size,
1583 kma_flags_t flags,
1584 vm_tag_t tag)
1585 {
1586 vm_object_t object = __kmem_object(ANYF(flags));
1587 vm_object_offset_t offset = addr;
1588 vm_page_t mem;
1589 vm_page_t local_freeq = NULL;
1590 unsigned int pages_unwired = 0;
1591
1592 vm_object_lock(object);
1593
1594 pmap_protect(kernel_pmap, offset, offset + size, VM_PROT_NONE);
1595
1596 for (vm_object_offset_t pg_offset = 0;
1597 pg_offset < size;
1598 pg_offset += PAGE_SIZE_64) {
1599 mem = vm_page_lookup(object, offset + pg_offset);
1600
1601 assert(mem);
1602
1603 if (flags & KMA_COMPRESSOR) {
1604 assert(mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
1605 } else {
1606 assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
1607 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
1608 pages_unwired++;
1609 }
1610
1611 mem->vmp_busy = TRUE;
1612
1613 assert(mem->vmp_tabled);
1614 vm_page_remove(mem, TRUE);
1615 assert(mem->vmp_busy);
1616
1617 assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
1618
1619 mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
1620 mem->vmp_snext = local_freeq;
1621 local_freeq = mem;
1622 }
1623
1624 vm_object_unlock(object);
1625
1626 vm_page_free_list(local_freeq, TRUE);
1627
1628 if (!(flags & KMA_COMPRESSOR)) {
1629 vm_page_lockspin_queues();
1630 vm_page_wire_count -= pages_unwired;
1631 vm_page_unlock_queues();
1632 }
1633
1634 if (flags & KMA_KOBJECT) {
1635 /* vm_page_remove() handles regular objects already */
1636 vm_tag_update_size(tag, -ptoa_64(pages_unwired), NULL);
1637 }
1638 }
1639
1640 #pragma mark reallocation
1641
1642 __abortlike
1643 static void
1644 __kmem_realloc_invalid_object_size_panic(
1645 vm_map_t map,
1646 vm_address_t address,
1647 vm_size_t size,
1648 vm_map_entry_t entry)
1649 {
1650 vm_object_t object = VME_OBJECT(entry);
1651 vm_size_t objsize = __kmem_entry_orig_size(entry);
1652
1653 panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
1654 "object %p has unexpected size %ld",
1655 map, (void *)address, (size_t)size, entry, object, objsize);
1656 }
1657
1658 __abortlike
1659 static void
1660 __kmem_realloc_invalid_pager_panic(
1661 vm_map_t map,
1662 vm_address_t address,
1663 vm_size_t size,
1664 vm_map_entry_t entry)
1665 {
1666 vm_object_t object = VME_OBJECT(entry);
1667 memory_object_t pager = object->pager;
1668 bool pager_created = object->pager_created;
1669 bool pager_initialized = object->pager_initialized;
1670 bool pager_ready = object->pager_ready;
1671
1672 panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
1673 "object %p has unexpected pager %p (%d,%d,%d)",
1674 map, (void *)address, (size_t)size, entry, object,
1675 pager, pager_created, pager_initialized, pager_ready);
1676 }
1677
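/*
 * Shrink path of kmem_realloc_guard(): the allocation is truncated in
 * place by clipping the entry at the new size, removing the tail mapping,
 * and re-establishing the trailing guard page when one was requested.
 */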
1678 static kmem_return_t
1679 kmem_realloc_shrink_guard(
1680 vm_map_t map,
1681 vm_offset_t req_oldaddr,
1682 vm_size_t req_oldsize,
1683 vm_size_t req_newsize,
1684 kmr_flags_t flags,
1685 kmem_guard_t guard,
1686 vm_map_entry_t entry)
1687 {
1688 vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
1689 vm_object_t object;
1690 vm_offset_t delta = 0;
1691 kmem_return_t kmr;
1692 bool was_atomic;
1693 vm_size_t oldsize = round_page(req_oldsize);
1694 vm_size_t newsize = round_page(req_newsize);
1695 vm_address_t oldaddr = req_oldaddr;
1696
1697 #if KASAN_CLASSIC
1698 if (flags & KMR_KASAN_GUARD) {
1699 assert((flags & (KMR_GUARD_FIRST | KMR_GUARD_LAST)) == 0);
1700 flags |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1701 oldaddr -= PAGE_SIZE;
1702 delta = ptoa(2);
1703 oldsize += delta;
1704 newsize += delta;
1705 }
1706 #endif /* KASAN_CLASSIC */
1707
1708 if (flags & KMR_TAG) {
1709 oldaddr = vm_memtag_canonicalize_kernel(req_oldaddr);
1710 }
1711
1712 vm_map_lock_assert_exclusive(map);
1713
1714 if ((flags & KMR_KOBJECT) == 0) {
1715 object = VME_OBJECT(entry);
1716 vm_object_reference(object);
1717 }
1718
1719 /*
1720 * Shrinking an atomic entry starts with splitting it,
1721 * and removing the second half.
1722 */
1723 was_atomic = entry->vme_atomic;
1724 entry->vme_atomic = false;
1725 vm_map_clip_end(map, entry, entry->vme_start + newsize);
1726 entry->vme_atomic = was_atomic;
1727
1728 #if KASAN
1729 if (entry->vme_kernel_object && was_atomic) {
1730 entry->vme_object_or_delta = (-req_newsize & PAGE_MASK) + delta;
1731 }
1732 #if KASAN_CLASSIC
1733 if (flags & KMR_KASAN_GUARD) {
1734 kasan_poison_range(oldaddr + newsize, oldsize - newsize,
1735 ASAN_VALID);
1736 }
1737 #endif
1738 #if KASAN_TBI
1739 if (flags & KMR_TAG) {
1740 kasan_tbi_mark_free_space((caddr_t)req_oldaddr + newsize, oldsize - newsize);
1741 }
1742 #endif /* KASAN_TBI */
1743 #endif /* KASAN */
1744 (void)vm_map_remove_and_unlock(map,
1745 oldaddr + newsize, oldaddr + oldsize,
1746 vmr_flags, KMEM_GUARD_NONE);
1747
1748
1749 /*
1750 * Lastly, if there are guard pages, deal with them.
1751 *
1752 * The kernel object just needs to depopulate,
1753 * regular objects require freeing the last page
1754 * and replacing it with a guard.
1755 */
1756 if (flags & KMR_KOBJECT) {
1757 if (flags & KMR_GUARD_LAST) {
1758 kma_flags_t dflags = KMA_KOBJECT;
1759 kernel_memory_depopulate(oldaddr + newsize - PAGE_SIZE,
1760 PAGE_SIZE, dflags, guard.kmg_tag);
1761 }
1762 } else {
1763 vm_page_t guard_right = VM_PAGE_NULL;
1764 vm_offset_t remove_start = newsize;
1765
1766 if (flags & KMR_GUARD_LAST) {
1767 if (!map->never_faults) {
1768 guard_right = vm_page_create_guard(true);
1769 }
1770 remove_start -= PAGE_SIZE;
1771 }
1772
1773 vm_object_lock(object);
1774
1775 if (object->vo_size != oldsize) {
1776 __kmem_realloc_invalid_object_size_panic(map,
1777 req_oldaddr, req_oldsize + delta, entry);
1778 }
1779 vm_object_set_size(object, newsize, req_newsize);
1780
1781 vm_object_page_remove(object, remove_start, oldsize);
1782
1783 if (guard_right) {
1784 vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
1785 guard_right->vmp_busy = false;
1786 }
1787 vm_object_unlock(object);
1788 vm_object_deallocate(object);
1789 }
1790
1791 kmr.kmr_address = req_oldaddr;
1792 kmr.kmr_return = 0;
1793 #if KASAN_CLASSIC
1794 if (flags & KMA_KASAN_GUARD) {
1795 kasan_alloc_large(kmr.kmr_address, req_newsize);
1796 }
1797 #endif /* KASAN_CLASSIC */
1798 #if KASAN_TBI
1799 if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
1800 kmr.kmr_ptr = vm_memtag_generate_and_store_tag(kmr.kmr_ptr, req_newsize);
1801 kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize);
1802 }
1803 #endif /* KASAN_TBI */
1804
1805 return kmr;
1806 }
1807
1808 kmem_return_t
1809 kmem_realloc_guard(
1810 vm_map_t map,
1811 vm_offset_t req_oldaddr,
1812 vm_size_t req_oldsize,
1813 vm_size_t req_newsize,
1814 kmr_flags_t flags,
1815 kmem_guard_t guard)
1816 {
1817 vm_object_t object;
1818 vm_size_t oldsize;
1819 vm_size_t newsize;
1820 vm_offset_t delta = 0;
1821 vm_map_offset_t oldaddr;
1822 vm_map_offset_t newaddr;
1823 vm_object_offset_t newoffs;
1824 vm_map_entry_t oldentry;
1825 vm_map_entry_t newentry;
1826 vm_page_t page_list = NULL;
1827 bool needs_wakeup = false;
1828 kmem_return_t kmr = { };
1829 unsigned int last_timestamp;
1830 vm_map_kernel_flags_t vmk_flags = {
1831 .vmkf_last_free = (bool)(flags & KMR_LAST_FREE),
1832 };
1833
1834 assert(KMEM_REALLOC_FLAGS_VALID(flags));
1835
1836 if (!guard.kmg_atomic) {
1837 if (!(flags & (KMR_DATA | KMR_DATA_SHARED))) {
1838 __kmem_invalid_arguments_panic("realloc", map, req_oldaddr,
1839 req_oldsize, flags);
1840 }
1841
1842 if (flags & KMR_KOBJECT) {
1843 __kmem_invalid_arguments_panic("realloc", map, req_oldaddr,
1844 req_oldsize, flags);
1845 }
1846 }
1847
1848 if (req_oldaddr == 0ul) {
1849 return kmem_alloc_guard(map, req_newsize, 0, (kma_flags_t)flags, guard);
1850 }
1851
1852 if (req_newsize == 0ul) {
1853 kmem_free_guard(map, req_oldaddr, req_oldsize,
1854 (kmf_flags_t)flags, guard);
1855 return kmr;
1856 }
1857
1858 if (req_newsize >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) {
1859 __kmem_invalid_size_panic(map, req_newsize, flags);
1860 }
1861 if (req_newsize < __kmem_guard_size(ANYF(flags))) {
1862 __kmem_invalid_size_panic(map, req_newsize, flags);
1863 }
1864
1865 oldsize = round_page(req_oldsize);
1866 newsize = round_page(req_newsize);
1867 oldaddr = req_oldaddr;
1868 #if KASAN_CLASSIC
1869 if (flags & KMR_KASAN_GUARD) {
1870 flags |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1871 oldaddr -= PAGE_SIZE;
1872 delta = ptoa(2);
1873 oldsize += delta;
1874 newsize += delta;
1875 }
1876 #endif /* KASAN_CLASSIC */
1877 #if CONFIG_KERNEL_TAGGING
1878 if (flags & KMR_TAG) {
1879 vm_memtag_verify_tag(req_oldaddr + __kmem_guard_left(ANYF(flags)));
1880 oldaddr = vm_memtag_canonicalize_kernel(req_oldaddr);
1881 }
1882 #endif /* CONFIG_KERNEL_TAGGING */
1883
1884 #if !KASAN
1885 /*
1886 * If not on a KASAN variant and no difference in requested size,
1887 * just return.
1888 *
1889 * Otherwise we want to validate the size and re-tag for KASAN_TBI.
1890 */
1891 if (oldsize == newsize) {
1892 kmr.kmr_address = req_oldaddr;
1893 return kmr;
1894 }
1895 #endif /* !KASAN */
1896
1897 /*
1898 * If we're growing the allocation,
1899 * then reserve the pages we'll need,
1900 * and find a spot for its new place.
1901 */
1902 if (oldsize < newsize) {
1903 #if DEBUG || DEVELOPMENT
1904 VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1905 DBG_VM_KERN_REQUEST, DBG_FUNC_START,
1906 newsize - oldsize, 0, 0, 0);
1907 #endif /* DEBUG || DEVELOPMENT */
1908 kmr.kmr_return = vm_page_alloc_list(atop(newsize - oldsize),
1909 (kma_flags_t)flags, &page_list);
1910 if (kmr.kmr_return == KERN_SUCCESS) {
1911 kmem_apply_security_policy(map, (kma_flags_t)flags, guard,
1912 newsize, 0, &vmk_flags, true);
1913 kmr.kmr_return = vm_map_find_space(map, 0, newsize, 0,
1914 vmk_flags, &newentry);
1915 }
1916 if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
1917 if (flags & KMR_REALLOCF) {
1918 kmem_free_guard(map, req_oldaddr, req_oldsize,
1919 flags & (KMF_TAG | KMF_GUARD_FIRST |
1920 KMF_GUARD_LAST | KMF_KASAN_GUARD), guard);
1921 }
1922 if (page_list) {
1923 vm_page_free_list(page_list, FALSE);
1924 }
1925 #if DEBUG || DEVELOPMENT
1926 VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1927 DBG_VM_KERN_REQUEST, DBG_FUNC_END,
1928 0, 0, 0, 0);
1929 #endif /* DEBUG || DEVELOPMENT */
1930 return kmr;
1931 }
1932
1933 /* map is locked */
1934 } else {
1935 vm_map_lock(map);
1936 }
1937
1938
1939 /*
1940 * Locate the entry:
1941 * - wait for it to quiesce,
1942 * - validate its guard,
1943 * - learn its correct tag.
1944 */
1945 again:
1946 if (!vm_map_lookup_entry(map, oldaddr, &oldentry)) {
1947 __kmem_entry_not_found_panic(map, req_oldaddr);
1948 }
1949 if ((flags & KMR_KOBJECT) && oldentry->in_transition) {
1950 oldentry->needs_wakeup = true;
1951 vm_map_entry_wait(map, THREAD_UNINT);
1952 goto again;
1953 }
1954 kmem_entry_validate_guard(map, oldentry, oldaddr, oldsize, guard);
1955 if (!__kmem_entry_validate_object(oldentry, ANYF(flags))) {
1956 __kmem_entry_validate_object_panic(map, oldentry, ANYF(flags));
1957 }
1958 /*
1959 * TODO: We should validate for non-atomic entries that the range
1960 * we are acting on is what we expect here.
1961 */
1962 #if KASAN
1963 if (__kmem_entry_orig_size(oldentry) != req_oldsize) {
1964 __kmem_realloc_invalid_object_size_panic(map,
1965 req_oldaddr, req_oldsize + delta, oldentry);
1966 }
1967
1968 if (oldsize == newsize) {
1969 kmr.kmr_address = req_oldaddr;
1970 if (oldentry->vme_kernel_object) {
1971 oldentry->vme_object_or_delta = delta +
1972 (-req_newsize & PAGE_MASK);
1973 } else {
1974 object = VME_OBJECT(oldentry);
1975 vm_object_lock(object);
1976 vm_object_set_size(object, newsize, req_newsize);
1977 vm_object_unlock(object);
1978 }
1979 vm_map_unlock(map);
1980
1981 #if KASAN_CLASSIC
1982 if (flags & KMA_KASAN_GUARD) {
1983 kasan_alloc_large(kmr.kmr_address, req_newsize);
1984 }
1985 #endif /* KASAN_CLASSIC */
1986 #if KASAN_TBI
1987 if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
1988 kmr.kmr_ptr = vm_memtag_generate_and_store_tag(kmr.kmr_ptr, req_newsize);
1989 kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize);
1990 }
1991 #endif /* KASAN_TBI */
1992 return kmr;
1993 }
1994 #endif /* KASAN */
1995
1996 guard.kmg_tag = VME_ALIAS(oldentry);
1997
1998 if (newsize < oldsize) {
1999 return kmem_realloc_shrink_guard(map, req_oldaddr,
2000 req_oldsize, req_newsize, flags, guard, oldentry);
2001 }
2002
2003
2004 /*
2005 * We are growing the entry
2006 *
2007 * For regular objects we use the object `vo_size` updates
2008 * as a guarantee that no two kmem_realloc() calls can happen
2009 * concurrently (by doing it before the map is unlocked).
2010 *
2011 * For the kernel object, prevent the entry from being
2012 * reallocated or changed by marking it "in_transition".
2013 */
2014
2015 object = VME_OBJECT(oldentry);
2016 vm_object_lock(object);
2017 vm_object_reference_locked(object);
2018
2019 newaddr = newentry->vme_start;
2020 newoffs = oldsize;
2021
2022 VME_OBJECT_SET(newentry, object, guard.kmg_atomic, guard.kmg_context);
2023 VME_ALIAS_SET(newentry, guard.kmg_tag);
2024 if (flags & KMR_KOBJECT) {
2025 oldentry->in_transition = true;
2026 VME_OFFSET_SET(newentry, newaddr);
2027 newentry->wired_count = 1;
2028 vme_btref_consider_and_set(newentry, __builtin_frame_address(0));
2029 newoffs = newaddr + oldsize;
2030 #if KASAN
2031 newentry->vme_object_or_delta = delta +
2032 (-req_newsize & PAGE_MASK);
2033 #endif /* KASAN */
2034 } else {
2035 if (object->pager_created || object->pager) {
2036 /*
2037 * We can't "realloc/grow" the pager, so pageable
2038 * allocations should not go through this path.
2039 */
2040 __kmem_realloc_invalid_pager_panic(map,
2041 req_oldaddr, req_oldsize + delta, oldentry);
2042 }
2043 if (object->vo_size != oldsize) {
2044 __kmem_realloc_invalid_object_size_panic(map,
2045 req_oldaddr, req_oldsize + delta, oldentry);
2046 }
2047 vm_object_set_size(object, newsize, req_newsize);
2048 }
2049
2050 last_timestamp = map->timestamp;
2051 vm_map_unlock(map);
2052
2053
2054 /*
2055 * Now proceed with the population of pages.
2056 *
2057 * Kernel objects can use the kmem population helpers.
2058 *
2059 * Regular objects will insert pages manually,
2060 * then wire the memory into the new range.
2061 */
2062
2063 vm_size_t guard_right_size = __kmem_guard_right(ANYF(flags));
2064
2065 if (flags & KMR_KOBJECT) {
2066 pmap_mapping_type_t mapping_type = __kmem_mapping_type(ANYF(flags));
2067
2068 pmap_protect(kernel_pmap,
2069 oldaddr, oldaddr + oldsize - guard_right_size,
2070 VM_PROT_NONE);
2071
2072 for (vm_object_offset_t offset = 0;
2073 offset < oldsize - guard_right_size;
2074 offset += PAGE_SIZE_64) {
2075 vm_page_t mem;
2076
2077 mem = vm_page_lookup(object, oldaddr + offset);
2078 if (mem == VM_PAGE_NULL) {
2079 continue;
2080 }
2081
2082 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
2083
2084 mem->vmp_busy = true;
2085 vm_page_remove(mem, true);
2086 vm_page_insert_wired(mem, object, newaddr + offset,
2087 guard.kmg_tag);
2088 mem->vmp_busy = false;
2089
2090 kernel_memory_populate_pmap_enter(object, newaddr,
2091 offset, mem, VM_PROT_DEFAULT, 0, mapping_type);
2092 }
2093
2094 kernel_memory_populate_object_and_unlock(object,
2095 newaddr + oldsize - guard_right_size,
2096 newoffs - guard_right_size,
2097 newsize - oldsize,
2098 page_list, (kma_flags_t)flags,
2099 guard.kmg_tag, VM_PROT_DEFAULT, mapping_type);
2100 } else {
2101 vm_page_t guard_right = VM_PAGE_NULL;
2102
2103 /*
2104 * Note: we are borrowing the new entry reference
2105 * on the object for the duration of this code,
2106 * which works because we keep the object locked
2107 * throughout.
2108 */
2109 if ((flags & KMR_GUARD_LAST) && !map->never_faults) {
2110 guard_right = vm_page_lookup(object, oldsize - PAGE_SIZE);
2111 assert(vm_page_is_guard(guard_right));
2112 guard_right->vmp_busy = true;
2113 vm_page_remove(guard_right, true);
2114 }
2115
2116 if (flags & KMR_FREEOLD) {
2117 /*
2118 * Freeing the old mapping will make
2119 * the old pages pageable until
2120 * the new mapping makes them wired again.
2121 * Let's take an extra "wire_count" to
2122 * prevent any accidental "page out".
2123 * We'll have to undo that after wiring
2124 * the new mapping.
2125 */
2126 vm_object_reference_locked(object); /* keep object alive */
2127 for (vm_object_offset_t offset = 0;
2128 offset < oldsize - guard_right_size;
2129 offset += PAGE_SIZE_64) {
2130 vm_page_t mem;
2131
2132 mem = vm_page_lookup(object, offset);
2133 assert(mem != VM_PAGE_NULL);
2134 assertf(!VM_PAGE_PAGEABLE(mem),
2135 "mem %p qstate %d",
2136 mem, mem->vmp_q_state);
2137 if (vm_page_is_guard(mem)) {
2138 /* guard pages are not wired */
2139 } else {
2140 assertf(VM_PAGE_WIRED(mem),
2141 "mem %p qstate %d wirecount %d",
2142 mem,
2143 mem->vmp_q_state,
2144 mem->vmp_wire_count);
2145 assertf(mem->vmp_wire_count >= 1,
2146 "mem %p wirecount %d",
2147 mem, mem->vmp_wire_count);
2148 mem->vmp_wire_count++;
2149 }
2150 }
2151 }
2152
2153 for (vm_object_offset_t offset = oldsize - guard_right_size;
2154 offset < newsize - guard_right_size;
2155 offset += PAGE_SIZE_64) {
2156 vm_page_t mem = page_list;
2157
2158 page_list = mem->vmp_snext;
2159 mem->vmp_snext = VM_PAGE_NULL;
2160 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
2161 assert(!VM_PAGE_PAGEABLE(mem));
2162
2163 vm_page_insert(mem, object, offset);
2164 mem->vmp_busy = false;
2165 }
2166
2167 if (guard_right) {
2168 vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
2169 guard_right->vmp_busy = false;
2170 }
2171
2172 vm_object_unlock(object);
2173 }
2174
2175 /*
2176 * Mark the entry as idle again,
2177 * and honor KMR_FREEOLD if needed.
2178 */
2179
2180 vm_map_lock(map);
2181 if (last_timestamp + 1 != map->timestamp &&
2182 !vm_map_lookup_entry(map, oldaddr, &oldentry)) {
2183 __kmem_entry_not_found_panic(map, req_oldaddr);
2184 }
2185
2186 if (flags & KMR_KOBJECT) {
2187 assert(oldentry->in_transition);
2188 oldentry->in_transition = false;
2189 if (oldentry->needs_wakeup) {
2190 needs_wakeup = true;
2191 oldentry->needs_wakeup = false;
2192 }
2193 }
2194
2195 if (flags & KMR_FREEOLD) {
2196 vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2197
2198 #if KASAN_CLASSIC
2199 if (flags & KMR_KASAN_GUARD) {
2200 kasan_poison_range(oldaddr, oldsize, ASAN_VALID);
2201 }
2202 #endif
2203 #if KASAN_TBI
2204 if (flags & KMR_TAG) {
2205 kasan_tbi_mark_free_space((caddr_t)req_oldaddr, oldsize);
2206 }
2207 #endif /* KASAN_TBI */
2208 if (flags & KMR_GUARD_LAST) {
2209 vmr_flags |= VM_MAP_REMOVE_NOKUNWIRE_LAST;
2210 }
2211 (void)vm_map_remove_and_unlock(map,
2212 oldaddr, oldaddr + oldsize,
2213 vmr_flags, guard);
2214 } else {
2215 vm_map_unlock(map);
2216 }
2217
2218 if ((flags & KMR_KOBJECT) == 0) {
2219 kern_return_t kr;
2220 /*
2221 * This must happen _after_ we do the KMR_FREEOLD,
2222 * because wiring the pages will call into the pmap,
2223 * and if the pages are typed XNU_KERNEL_RESTRICTED,
2224 * this would cause a second mapping of the page and panic.
2225 */
2226 kr = vm_map_wire_kernel(map,
2227 vm_sanitize_wrap_addr(newaddr),
2228 vm_sanitize_wrap_addr(newaddr + newsize),
2229 vm_sanitize_wrap_prot(VM_PROT_DEFAULT),
2230 guard.kmg_tag, FALSE);
2231 assert(kr == KERN_SUCCESS);
2232
2233 if (flags & KMR_FREEOLD) {
2234 /*
2235 * Undo the extra "wiring" we made above
2236 * and release the extra reference we took
2237 * on the object.
2238 */
2239 vm_object_lock(object);
2240 for (vm_object_offset_t offset = 0;
2241 offset < oldsize - guard_right_size;
2242 offset += PAGE_SIZE_64) {
2243 vm_page_t mem;
2244
2245 mem = vm_page_lookup(object, offset);
2246 assert(mem != VM_PAGE_NULL);
2247 assertf(!VM_PAGE_PAGEABLE(mem),
2248 "mem %p qstate %d",
2249 mem, mem->vmp_q_state);
2250 if (vm_page_is_guard(mem)) {
2251 /* guard pages are not wired */
2252 } else {
2253 assertf(VM_PAGE_WIRED(mem),
2254 "mem %p qstate %d wirecount %d",
2255 mem,
2256 mem->vmp_q_state,
2257 mem->vmp_wire_count);
2258 assertf(mem->vmp_wire_count >= 2,
2259 "mem %p wirecount %d",
2260 mem, mem->vmp_wire_count);
2261 mem->vmp_wire_count--;
2262 assert(VM_PAGE_WIRED(mem));
2263 assert(mem->vmp_wire_count >= 1);
2264 }
2265 }
2266 vm_object_unlock(object);
2267 vm_object_deallocate(object); /* release extra ref */
2268 }
2269 }
2270
2271 if (needs_wakeup) {
2272 vm_map_entry_wakeup(map);
2273 }
2274
2275 #if DEBUG || DEVELOPMENT
2276 VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
2277 atop(newsize - oldsize), 0, 0, 0);
2278 #endif /* DEBUG || DEVELOPMENT */
2279 kmr.kmr_address = newaddr;
2280
2281 #if KASAN
2282 kasan_notify_address(kmr.kmr_address, newsize);
2283 #endif /* KASAN */
2284 #if KASAN_CLASSIC
2285 if (flags & KMR_KASAN_GUARD) {
2286 kmr.kmr_address += PAGE_SIZE;
2287 kasan_alloc_large(kmr.kmr_address, req_newsize);
2288 }
2289 #endif /* KASAN_CLASSIC */
2290 #if CONFIG_KERNEL_TAGGING
2291 if (flags & KMR_TAG) {
2292 #if KASAN_TBI
2293 /*
2294 * Validate the current buffer, then generate a new tag:
2295 * even if the address is stable, it's a "new" allocation.
2296 */
2297 __asan_loadN((vm_offset_t)kmr.kmr_address, oldsize);
2298 kmr.kmr_ptr = vm_memtag_generate_and_store_tag(kmr.kmr_ptr, req_newsize);
2299 kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize);
2300 #endif /* KASAN_TBI */
2301 }
2302 #endif /* CONFIG_KERNEL_TAGGING */
2303
2304 return kmr;
2305 }
2306
2307 #pragma mark map/remap/wire
2308
2309 kern_return_t
2310 mach_vm_map_kernel(
2311 vm_map_t target_map,
2312 mach_vm_offset_ut *address,
2313 mach_vm_size_ut initial_size,
2314 mach_vm_offset_ut mask,
2315 vm_map_kernel_flags_t vmk_flags,
2316 ipc_port_t port,
2317 memory_object_offset_ut offset,
2318 boolean_t copy,
2319 vm_prot_ut cur_protection,
2320 vm_prot_ut max_protection,
2321 vm_inherit_ut inheritance)
2322 {
2323 /* range_id is set by vm_map_enter_mem_object */
2324 return vm_map_enter_mem_object(target_map,
2325 address,
2326 initial_size,
2327 mask,
2328 vmk_flags,
2329 port,
2330 offset,
2331 copy,
2332 cur_protection,
2333 max_protection,
2334 inheritance,
2335 NULL,
2336 0);
2337 }
2338
2339 kern_return_t
2340 mach_vm_remap_new_kernel(
2341 vm_map_t target_map,
2342 mach_vm_offset_ut *address,
2343 mach_vm_size_ut size,
2344 mach_vm_offset_ut mask,
2345 vm_map_kernel_flags_t vmk_flags,
2346 vm_map_t src_map,
2347 mach_vm_offset_ut memory_address,
2348 boolean_t copy,
2349 vm_prot_ut *cur_protection, /* IN/OUT */
2350 vm_prot_ut *max_protection, /* IN/OUT */
2351 vm_inherit_ut inheritance)
2352 {
2353 if (!vm_map_kernel_flags_check_vm_and_kflags(vmk_flags,
2354 VM_FLAGS_USER_REMAP)) {
2355 return KERN_INVALID_ARGUMENT;
2356 }
2357
2358
2359 vmk_flags.vmf_return_data_addr = true;
2360
2361 /* range_id is set by vm_map_remap */
2362 return vm_map_remap(target_map,
2363 address,
2364 size,
2365 mask,
2366 vmk_flags,
2367 src_map,
2368 memory_address,
2369 copy,
2370 cur_protection,
2371 max_protection,
2372 inheritance);
2373 }
2374
2375 #pragma mark free
2376
2377 #if KASAN
2378
2379 __abortlike
2380 static void
2381 __kmem_free_invalid_object_size_panic(
2382 vm_map_t map,
2383 vm_address_t address,
2384 vm_size_t size,
2385 vm_map_entry_t entry)
2386 {
2387 vm_object_t object = VME_OBJECT(entry);
2388 vm_size_t objsize = __kmem_entry_orig_size(entry);
2389
2390 panic("kmem_free(map=%p, addr=%p, size=%zd, entry=%p): "
2391 "object %p has unexpected size %ld",
2392 map, (void *)address, (size_t)size, entry, object, objsize);
2393 }
2394
2395 #endif /* KASAN */
2396
2397 vm_size_t
2398 kmem_free_guard(
2399 vm_map_t map,
2400 vm_offset_t req_addr,
2401 vm_size_t req_size,
2402 kmf_flags_t flags,
2403 kmem_guard_t guard)
2404 {
2405 vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2406 vm_address_t addr = req_addr;
2407 vm_offset_t delta = 0;
2408 vm_size_t size;
2409 #if KASAN
2410 vm_map_entry_t entry;
2411 #endif /* KASAN */
2412
2413 assert(map->pmap == kernel_pmap);
2414
2415 #if KASAN_CLASSIC
2416 if (flags & KMF_KASAN_GUARD) {
2417 addr -= PAGE_SIZE;
2418 delta = ptoa(2);
2419 }
2420 #endif /* KASAN_CLASSIC */
2421 #if CONFIG_KERNEL_TAGGING
2422 if (flags & KMF_TAG) {
2423 vm_memtag_verify_tag(req_addr + __kmem_guard_left(ANYF(flags)));
2424 addr = vm_memtag_canonicalize_kernel(req_addr);
2425 }
2426 #endif /* CONFIG_KERNEL_TAGGING */
2427
2428 if (flags & KMF_GUESS_SIZE) {
2429 vmr_flags |= VM_MAP_REMOVE_GUESS_SIZE;
2430 size = PAGE_SIZE;
2431 } else if (req_size == 0) {
2432 __kmem_invalid_size_panic(map, req_size, flags);
2433 } else {
2434 size = round_page(req_size) + delta;
2435 }
2436
2437 vm_map_lock(map);
2438
2439 #if KASAN
2440 if (!vm_map_lookup_entry(map, addr, &entry)) {
2441 __kmem_entry_not_found_panic(map, req_addr);
2442 }
2443 if (flags & KMF_GUESS_SIZE) {
2444 vmr_flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
2445 req_size = __kmem_entry_orig_size(entry);
2446 size = round_page(req_size + delta);
2447 } else if (guard.kmg_atomic && entry->vme_kernel_object &&
2448 __kmem_entry_orig_size(entry) != req_size) {
2449 /*
2450 * We can't make a strict check for regular
2451 * VM objects because it could be:
2452 *
2453 * - the kmem_free_guard() of a kmem_realloc_guard() without
2454 * KMR_FREEOLD, and in that case the object size won't match.
2455 *
2456 * - a submap, in which case there is no "orig size".
2457 */
2458 __kmem_free_invalid_object_size_panic(map,
2459 req_addr, req_size + delta, entry);
2460 }
2461 #endif /* KASAN */
2462 #if KASAN_CLASSIC
2463 if (flags & KMR_KASAN_GUARD) {
2464 kasan_poison_range(addr, size, ASAN_VALID);
2465 }
2466 #endif
2467 #if KASAN_TBI
2468 if (flags & KMF_TAG) {
2469 kasan_tbi_mark_free_space((caddr_t)req_addr, size);
2470 }
2471 #endif /* KASAN_TBI */
2472
2473 /*
2474 * vm_map_remove_and_unlock is called with VM_MAP_REMOVE_KUNWIRE, which
2475 * unwires the kernel mapping. The page won't be mapped any longer, so
2476 * no extra step is required for memory tagging to "clear" it -- the
2477 * page will be laundered later when reused.
2478 */
2479 return vm_map_remove_and_unlock(map, addr, addr + size,
2480 vmr_flags, guard).kmr_size - delta;
2481 }
2482
2483 __exported void
2484 kmem_free_external(
2485 vm_map_t map,
2486 vm_offset_t addr,
2487 vm_size_t size);
2488 void
2489 kmem_free_external(
2490 vm_map_t map,
2491 vm_offset_t addr,
2492 vm_size_t size)
2493 {
2494 if (size) {
2495 kmem_free(map, trunc_page(addr), size);
2496 #if MACH_ASSERT
2497 } else {
2498 printf("kmem_free(map=%p, addr=%p) called with size=0, lr: %p\n",
2499 map, (void *)addr, __builtin_return_address(0));
2500 #endif
2501 }
2502 }
2503
2504 #pragma mark kmem metadata
2505
2506 /*
2507 * Guard objects for kmem pointer allocation:
2508 *
2509 * Guard objects introduce size slabs to kmem pointer allocations that are
2510 * allocated in chunks of n * sizeclass. When an allocation of a specific
2511 * sizeclass is requested, a random slot from [0, n) is returned.
2512 * Allocations are returned from that chunk until only m slots are left.
2513 * The remaining m slots are referred to as guard objects. They don't get
2514 * allocated and the chunk is then considered full. When an allocation is
2515 * freed back to the chunk, one of the (now m + 1) free slots becomes
2516 * available for the next allocation of that sizeclass.
2517 *
2518 * Guard objects are intended to make exploitation of use-after-frees harder,
2519 * as allocations that are freed can no longer be reliably reallocated.
2520 * They also make exploitation of OOBs harder, as overflowing out of an
2521 * allocation is no longer safe even with sufficient spraying.
2522 */
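/*
 * A possible walkthrough of the scheme above, based on the helpers defined
 * later in this file (the concrete slot count n = 8 is only an assumption
 * for illustration):
 *
 *   - Suppose a chunk of some sizeclass holds n = 8 slots and
 *     m = KMEM_NUM_GUARDS = 2 of them are reserved as guards.
 *   - kmem_locate_space() hands out a random free slot from the chunk's
 *     bitmap; once only KMEM_NUM_GUARDS free slots remain, the chunk is
 *     requeued from the partial list to the full list and stops serving
 *     new allocations.
 *   - kmem_free_slot() later returns a slot: with KMEM_NUM_GUARDS + 1 free
 *     slots the chunk moves back to the partial list, and once every slot
 *     is free the whole chunk is released via kmem_free_chunk().
 */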
2523
2524 #define KMEM_META_PRIMARY UINT8_MAX
2525 #define KMEM_META_START (UINT8_MAX - 1)
2526 #define KMEM_META_FREE (UINT8_MAX - 2)
2527 #if __ARM_16K_PG__
2528 #define KMEM_MIN_SIZE PAGE_SIZE
2529 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 16)
2530 #else /* __ARM_16K_PG__ */
2531 /*
2532 * PAGE_SIZE isn't a compile time constant on some arm64 devices. Those
2533 * devices use 4k page size when their RAM is <= 1GB and 16k otherwise.
2534 * Therefore populate sizeclasses from 4k for those devices.
2535 */
2536 #define KMEM_MIN_SIZE (4 * 1024)
2537 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 32)
2538 #endif /* __ARM_16K_PG__ */
2539 #define KMEM_MAX_SIZE (32ULL << 20)
2540 #define KMEM_START_IDX (kmem_log2down(KMEM_MIN_SIZE))
2541 #define KMEM_LAST_IDX (kmem_log2down(KMEM_MAX_SIZE))
2542 #define KMEM_NUM_SIZECLASS (KMEM_LAST_IDX - KMEM_START_IDX + 1)
2543 #define KMEM_FRONTS (KMEM_RANGE_ID_NUM_PTR * 2)
2544 #define KMEM_NUM_GUARDS 2
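/*
 * For a rough sense of scale (a sketch, assuming the 4 KiB configuration
 * where KMEM_MIN_SIZE is 4 KiB and KMEM_CHUNK_SIZE_MIN is 128 KiB, and
 * that kmem_log2down() is floor(log2)): KMEM_START_IDX = 12 and
 * KMEM_LAST_IDX = 25, so KMEM_NUM_SIZECLASS = 14 sizeclasses cover powers
 * of two from 4 KiB up to KMEM_MAX_SIZE = 32 MiB. KMEM_FRONTS is two
 * fronts (left and right) per kmem pointer range.
 */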
2545
2546 struct kmem_page_meta {
2547 union {
2548 /*
2549 * On primary allocated chunk with KMEM_META_PRIMARY marker
2550 */
2551 uint32_t km_bitmap;
2552 /*
2553 * On start and end of free chunk with KMEM_META_FREE marker
2554 */
2555 uint32_t km_free_chunks;
2556 };
2557 /*
2558 * KMEM_META_PRIMARY: Start meta of allocated chunk
2559 * KMEM_META_FREE : Start and end meta of free chunk
2560 * KMEM_META_START : Meta region start and end
2561 */
2562 uint8_t km_page_marker;
2563 uint8_t km_sizeclass;
2564 union {
2565 /*
2566 * On primary allocated chunk with KMEM_META_PRIMARY marker
2567 */
2568 uint16_t km_chunk_len;
2569 /*
2570 * On secondary allocated chunks
2571 */
2572 uint16_t km_page_idx;
2573 };
2574 LIST_ENTRY(kmem_page_meta) km_link;
2575 } kmem_page_meta_t;
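/*
 * A sketch of how these fields are used by kmem_init_allocated_chunk() and
 * kmem_init_free_chunk() below: an allocated run of, say, 4 chunks is
 * described by a primary metadata entry with km_page_marker ==
 * KMEM_META_PRIMARY, km_chunk_len == 4 and the free-slot bitmap, followed
 * by 3 secondary entries whose km_page_idx is 1, 2 and 3. A free run
 * instead stores km_free_chunks and the KMEM_META_FREE marker in its first
 * and last metadata entries.
 */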
2576
2577 typedef LIST_HEAD(kmem_list_head, kmem_page_meta) kmem_list_head_t;
2578 struct kmem_sizeclass {
2579 vm_map_size_t ks_size;
2580 uint32_t ks_num_chunk;
2581 uint32_t ks_num_elem;
2582 crypto_random_ctx_t __zpercpu ks_rng_ctx;
2583 kmem_list_head_t ks_allfree_head[KMEM_FRONTS];
2584 kmem_list_head_t ks_partial_head[KMEM_FRONTS];
2585 kmem_list_head_t ks_full_head[KMEM_FRONTS];
2586 };
2587
2588 static struct kmem_sizeclass kmem_size_array[KMEM_NUM_SIZECLASS];
2589
2590 /*
2591 * Locks to synchronize metadata population
2592 */
2593 static LCK_GRP_DECLARE(kmem_locks_grp, "kmem_locks");
2594 static LCK_MTX_DECLARE(kmem_meta_region_lck, &kmem_locks_grp);
2595 #define kmem_meta_lock() lck_mtx_lock(&kmem_meta_region_lck)
2596 #define kmem_meta_unlock() lck_mtx_unlock(&kmem_meta_region_lck)
2597
2598 static SECURITY_READ_ONLY_LATE(struct mach_vm_range)
2599 kmem_meta_range[KMEM_RANGE_ID_NUM_PTR + 1];
2600 static SECURITY_READ_ONLY_LATE(struct kmem_page_meta *)
2601 kmem_meta_base[KMEM_RANGE_ID_NUM_PTR + 1];
2602 /*
2603 * Keeps track of metadata high water mark for each front
2604 */
2605 static struct kmem_page_meta *kmem_meta_hwm[KMEM_FRONTS];
2606 static SECURITY_READ_ONLY_LATE(vm_map_t)
2607 kmem_meta_map[KMEM_RANGE_ID_NUM_PTR + 1];
2608 static vm_map_size_t kmem_meta_size;
2609
2610 static uint32_t
2611 kmem_get_front(
2612 kmem_range_id_t range_id,
2613 bool from_right)
2614 {
2615 assert((range_id >= KMEM_RANGE_ID_FIRST) &&
2616 (range_id <= KMEM_RANGE_ID_NUM_PTR));
2617 return (range_id - KMEM_RANGE_ID_FIRST) * 2 + from_right;
2618 }
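/*
 * A quick reading of the formula above: the first pointer range maps to
 * fronts 0 (left) and 1 (right), the second to fronts 2 and 3, and so on,
 * which is how the per-front arrays in struct kmem_sizeclass and
 * kmem_meta_hwm[] are indexed.
 */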
2619
2620 static inline uint32_t
2621 kmem_slot_idx_to_bit(
2622 uint32_t slot_idx,
2623 uint32_t size_idx __unused)
2624 {
2625 assert(slot_idx < kmem_size_array[size_idx].ks_num_elem);
2626 return 1ull << slot_idx;
2627 }
2628
2629 static uint32_t
2630 kmem_get_idx_from_size(vm_map_size_t size)
2631 {
2632 assert(size >= KMEM_MIN_SIZE && size <= KMEM_MAX_SIZE);
2633 return kmem_log2down(size - 1) - KMEM_START_IDX + 1;
2634 }
2635
2636 __abortlike
2637 static void
2638 kmem_invalid_size_idx(uint32_t idx)
2639 {
2640 panic("Invalid sizeclass idx %u", idx);
2641 }
2642
2643 static vm_map_size_t
2644 kmem_get_size_from_idx(uint32_t idx)
2645 {
2646 if (__improbable(idx >= KMEM_NUM_SIZECLASS)) {
2647 kmem_invalid_size_idx(idx);
2648 }
2649 return 1ul << (idx + KMEM_START_IDX);
2650 }
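/*
 * A minimal sketch of the size <-> sizeclass mapping implemented by the two
 * helpers above, assuming the 4 KiB configuration (KMEM_START_IDX == 12).
 * Not compiled, purely illustrative:
 */
#if 0
	assert(kmem_get_idx_from_size(4 * 1024) == 0);      /* 4 KiB  -> idx 0  */
	assert(kmem_get_idx_from_size(4 * 1024 + 1) == 1);  /* rounds up        */
	assert(kmem_get_size_from_idx(1) == 8 * 1024);      /* idx 1  -> 8 KiB  */
	assert(kmem_get_idx_from_size(32ULL << 20) == 13);  /* 32 MiB -> idx 13 */
#endif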
2651
2652 static inline uint16_t
2653 kmem_get_page_idx(struct kmem_page_meta *meta)
2654 {
2655 uint8_t page_marker = meta->km_page_marker;
2656
2657 return (page_marker == KMEM_META_PRIMARY) ? 0 : meta->km_page_idx;
2658 }
2659
2660 __abortlike
2661 static void
2662 kmem_invalid_chunk_len(struct kmem_page_meta *meta)
2663 {
2664 panic("Reading free chunks for meta %p where marker != KMEM_META_PRIMARY",
2665 meta);
2666 }
2667
2668 static inline uint16_t
2669 kmem_get_chunk_len(struct kmem_page_meta *meta)
2670 {
2671 if (__improbable(meta->km_page_marker != KMEM_META_PRIMARY)) {
2672 kmem_invalid_chunk_len(meta);
2673 }
2674
2675 return meta->km_chunk_len;
2676 }
2677
2678 __abortlike
2679 static void
2680 kmem_invalid_free_chunk_len(struct kmem_page_meta *meta)
2681 {
2682 panic("Reading free chunks for meta %p where marker != KMEM_META_FREE",
2683 meta);
2684 }
2685
2686 static inline uint32_t
2687 kmem_get_free_chunk_len(struct kmem_page_meta *meta)
2688 {
2689 if (__improbable(meta->km_page_marker != KMEM_META_FREE)) {
2690 kmem_invalid_free_chunk_len(meta);
2691 }
2692
2693 return meta->km_free_chunks;
2694 }
2695
2696 /*
2697 * Return the metadata corresponding to the specified address
2698 */
2699 static struct kmem_page_meta *
2700 kmem_addr_to_meta(
2701 vm_map_offset_t addr,
2702 vm_map_range_id_t range_id,
2703 vm_map_offset_t *range_start,
2704 uint64_t *meta_idx)
2705 {
2706 struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
2707
2708 *range_start = kmem_ranges[range_id].min_address;
2709 *meta_idx = (addr - *range_start) / KMEM_CHUNK_SIZE_MIN;
2710 return VM_FAR_ADD_PTR_UNBOUNDED(meta_base, *meta_idx);
2711 }
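/*
 * For instance (a sketch, again assuming KMEM_CHUNK_SIZE_MIN == 128 KiB):
 * an address 1 MiB past the start of its range lands at
 * meta_idx = 1 MiB / 128 KiB = 8, i.e. the 9th metadata entry for that
 * range; kmem_addr_to_meta_start() below then walks back by km_page_idx
 * to the chunk's primary metadata entry.
 */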
2712
2713 /*
2714 * Return the metadata start of the chunk that the address belongs to
2715 */
2716 static struct kmem_page_meta *
2717 kmem_addr_to_meta_start(
2718 vm_address_t addr,
2719 vm_map_range_id_t range_id,
2720 vm_map_offset_t *chunk_start)
2721 {
2722 vm_map_offset_t range_start;
2723 uint64_t meta_idx;
2724 struct kmem_page_meta *meta;
2725
2726 meta = kmem_addr_to_meta(addr, range_id, &range_start, &meta_idx);
2727 meta_idx -= kmem_get_page_idx(meta);
2728 meta = VM_FAR_ADD_PTR_UNBOUNDED(meta, -(ptrdiff_t)kmem_get_page_idx(meta));
2729 assert(meta->km_page_marker == KMEM_META_PRIMARY);
2730 *chunk_start = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN);
2731 return meta;
2732 }
2733
2734 __startup_func
2735 static void
2736 kmem_init_meta_front(
2737 struct kmem_page_meta *meta,
2738 kmem_range_id_t range_id,
2739 bool from_right)
2740 {
2741 kernel_memory_populate(trunc_page((vm_map_offset_t) meta), PAGE_SIZE,
2742 KMA_KOBJECT | KMA_ZERO | KMA_NOFAIL, VM_KERN_MEMORY_OSFMK);
2743 meta->km_page_marker = KMEM_META_START;
2744 if (!from_right) {
2745 meta++;
2746 kmem_meta_base[range_id] = meta;
2747 }
2748 kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta;
2749 }
2750
2751 __startup_func
2752 static void
2753 kmem_metadata_init(void)
2754 {
2755 for (kmem_range_id_t i = KMEM_RANGE_ID_FIRST; i <= kmem_ptr_ranges; i++) {
2756 vm_map_offset_t addr = kmem_meta_range[i].min_address;
2757 struct kmem_page_meta *meta;
2758 uint64_t meta_idx;
2759
2760 vm_map_will_allocate_early_map(&kmem_meta_map[i]);
2761 kmem_meta_map[i] = kmem_suballoc(kernel_map, &addr, kmem_meta_size,
2762 VM_MAP_CREATE_NEVER_FAULTS | VM_MAP_CREATE_DISABLE_HOLELIST,
2763 VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
2764 KMS_PERMANENT | KMS_NOFAIL | KMS_NOSOFTLIMIT,
2765 VM_KERN_MEMORY_OSFMK).kmr_submap;
2766
2767 kmem_meta_range[i].min_address = addr;
2768 kmem_meta_range[i].max_address = addr + kmem_meta_size;
2769
2770 meta = (struct kmem_page_meta *) kmem_meta_range[i].min_address;
2771 kmem_init_meta_front(meta, i, 0);
2772
2773 meta = kmem_addr_to_meta(kmem_ranges[i].max_address, i, &addr,
2774 &meta_idx);
2775 kmem_init_meta_front(meta, i, 1);
2776 }
2777 }
2778
2779 __startup_func
2780 static void
2781 kmem_init_front_head(
2782 struct kmem_sizeclass *ks,
2783 uint32_t front)
2784 {
2785 LIST_INIT(&ks->ks_allfree_head[front]);
2786 LIST_INIT(&ks->ks_partial_head[front]);
2787 LIST_INIT(&ks->ks_full_head[front]);
2788 }
2789
2790 __startup_func
2791 static void
2792 kmem_sizeclass_init(void)
2793 {
2794 for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2795 struct kmem_sizeclass *ks = &kmem_size_array[i];
2796 kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
2797
2798 ks->ks_size = kmem_get_size_from_idx(i);
2799 ks->ks_num_chunk = roundup(8 * ks->ks_size, KMEM_CHUNK_SIZE_MIN) /
2800 KMEM_CHUNK_SIZE_MIN;
2801 ks->ks_num_elem = (ks->ks_num_chunk * KMEM_CHUNK_SIZE_MIN) / ks->ks_size;
2802 assert(ks->ks_num_elem <=
2803 (sizeof(((struct kmem_page_meta *)0)->km_bitmap) * 8));
2804 for (; range_id <= KMEM_RANGE_ID_NUM_PTR; range_id++) {
2805 kmem_init_front_head(ks, kmem_get_front(range_id, 0));
2806 kmem_init_front_head(ks, kmem_get_front(range_id, 1));
2807 }
2808 }
2809 }
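/*
 * Worked example for the ks_num_chunk / ks_num_elem computation above
 * (a sketch assuming KMEM_CHUNK_SIZE_MIN == 128 KiB):
 *
 *   ks_size = 4 KiB:  8 * 4 KiB  = 32 KiB, rounded up to one chunk,
 *                     so ks_num_chunk = 1 and ks_num_elem = 128K / 4K = 32.
 *   ks_size = 64 KiB: 8 * 64 KiB = 512 KiB = 4 chunks,
 *                     so ks_num_elem = (4 * 128K) / 64K = 8.
 *
 * In both cases ks_num_elem fits within the 32 bits of km_bitmap, which is
 * what the assert above checks.
 */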
2810
2811 /*
2812 * This is done during EARLY_BOOT as it needs the corecrypto module to be
2813 * set up.
2814 */
2815 __startup_func
2816 static void
2817 kmem_crypto_init(void)
2818 {
2819 vm_size_t ctx_size = crypto_random_kmem_ctx_size();
2820
2821 for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2822 struct kmem_sizeclass *ks = &kmem_size_array[i];
2823
2824 ks->ks_rng_ctx = zalloc_percpu_permanent(ctx_size, ZALIGN_PTR);
2825 zpercpu_foreach(ctx, ks->ks_rng_ctx) {
2826 crypto_random_kmem_init(ctx);
2827 }
2828 }
2829 }
2830 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, kmem_crypto_init);
2831
2832 __abortlike
2833 static void
2834 kmem_validate_slot_panic(
2835 vm_map_offset_t addr,
2836 struct kmem_page_meta *meta,
2837 uint32_t slot_idx,
2838 uint32_t size_idx)
2839 {
2840 if (meta->km_page_marker != KMEM_META_PRIMARY) {
2841 panic("Metadata (%p) for addr (%p) not primary", meta, (void *)addr);
2842 }
2843 if (meta->km_sizeclass != size_idx) {
2844 panic("Metadata's (%p) sizeclass (%u != %u) changed during deletion",
2845 meta, meta->km_sizeclass, size_idx);
2846 }
2847 panic("Double free detected: Slot (%u) in meta (%p) for addr %p marked free",
2848 slot_idx, meta, (void *)addr);
2849 }
2850
2851 __abortlike
2852 static void
2853 kmem_invalid_slot_for_addr(
2854 mach_vm_range_t slot,
2855 vm_map_offset_t start,
2856 vm_map_offset_t end)
2857 {
2858 panic("Invalid kmem ptr slot [%p:%p] for allocation [%p:%p]",
2859 (void *)slot->min_address, (void *)slot->max_address,
2860 (void *)start, (void *)end);
2861 }
2862
2863 void
2864 kmem_validate_slot(
2865 vm_map_offset_t addr,
2866 struct kmem_page_meta *meta,
2867 uint32_t size_idx,
2868 uint32_t slot_idx)
2869 {
2870 if ((meta->km_page_marker != KMEM_META_PRIMARY) ||
2871 (meta->km_sizeclass != size_idx) ||
2872 ((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) != 0)) {
2873 kmem_validate_slot_panic(addr, meta, slot_idx, size_idx);
2874 }
2875 }
2876
2877 static void
2878 kmem_validate_slot_initial(
2879 mach_vm_range_t slot,
2880 vm_map_offset_t start,
2881 vm_map_offset_t end,
2882 struct kmem_page_meta *meta,
2883 uint32_t size_idx,
2884 uint32_t slot_idx)
2885 {
2886 if ((slot->min_address == 0) || (slot->max_address == 0) ||
2887 (start < slot->min_address) || (start >= slot->max_address) ||
2888 (end > slot->max_address)) {
2889 kmem_invalid_slot_for_addr(slot, start, end);
2890 }
2891
2892 kmem_validate_slot(start, meta, size_idx, slot_idx);
2893 }
2894
2895 uint32_t
2896 kmem_addr_get_slot_idx(
2897 vm_map_offset_t start,
2898 vm_map_offset_t end,
2899 vm_map_range_id_t range_id,
2900 struct kmem_page_meta **meta,
2901 uint32_t *size_idx,
2902 mach_vm_range_t slot)
2903 {
2904 vm_map_offset_t chunk_start;
2905 vm_map_size_t slot_size;
2906 uint32_t slot_idx;
2907
2908 *meta = kmem_addr_to_meta_start(start, range_id, &chunk_start);
2909 *size_idx = (*meta)->km_sizeclass;
2910 slot_size = kmem_get_size_from_idx(*size_idx);
2911 slot_idx = (start - chunk_start) / slot_size;
2912 slot->min_address = chunk_start + slot_idx * slot_size;
2913 slot->max_address = slot->min_address + slot_size;
2914
2915 kmem_validate_slot_initial(slot, start, end, *meta, *size_idx, slot_idx);
2916
2917 return slot_idx;
2918 }
2919
2920 static bool
2921 kmem_populate_needed(vm_offset_t from, vm_offset_t to)
2922 {
2923 #if KASAN
2924 #pragma unused(from, to)
2925 return true;
2926 #else
2927 vm_offset_t page_addr = trunc_page(from);
2928
2929 for (; page_addr < to; page_addr += PAGE_SIZE) {
2930 /*
2931 * This can race with another thread doing a populate on the same metadata
2932 * page, where we see an updated pmap but unmapped KASan shadow, causing a
2933 * fault in the shadow when we first access the metadata page. Avoid this
2934 * by always synchronizing on the kmem_meta_lock with KASan.
2935 */
2936 if (!pmap_find_phys(kernel_pmap, page_addr)) {
2937 return true;
2938 }
2939 }
2940
2941 return false;
2942 #endif /* !KASAN */
2943 }
2944
2945 static void
2946 kmem_populate_meta_locked(vm_offset_t from, vm_offset_t to)
2947 {
2948 vm_offset_t page_addr = trunc_page(from);
2949
2950 vm_map_unlock(kernel_map);
2951
2952 for (; page_addr < to; page_addr += PAGE_SIZE) {
2953 for (;;) {
2954 kern_return_t ret = KERN_SUCCESS;
2955
2956 /*
2957 * All updates to kmem metadata are done under the kmem_meta_lock
2958 */
2959 kmem_meta_lock();
2960 if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
2961 ret = kernel_memory_populate(page_addr,
2962 PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
2963 VM_KERN_MEMORY_OSFMK);
2964 }
2965 kmem_meta_unlock();
2966
2967 if (ret == KERN_SUCCESS) {
2968 break;
2969 }
2970
2971 /*
2972 * We can't block waiting for pages under a global lock as that leads
2973 * to bad system deadlocks (hence KMA_NOPAGEWAIT), so if the allocation
2974 * failed, we need to do the VM_PAGE_WAIT() outside of the lock.
2975 */
2976 VM_PAGE_WAIT();
2977 }
2978 }
2979
2980 vm_map_lock(kernel_map);
2981 }
2982
2983 __abortlike
2984 static void
2985 kmem_invalid_meta_panic(
2986 struct kmem_page_meta *meta,
2987 uint32_t slot_idx,
2988 struct kmem_sizeclass sizeclass)
2989 {
2990 uint32_t size_idx = kmem_get_idx_from_size(sizeclass.ks_size);
2991
2992 if (slot_idx >= sizeclass.ks_num_elem) {
2993 panic("Invalid slot idx %u [0:%u] for meta %p", slot_idx,
2994 sizeclass.ks_num_elem, meta);
2995 }
2996 if (meta->km_sizeclass != size_idx) {
2997 panic("Invalid size_idx (%u != %u) in meta %p", size_idx,
2998 meta->km_sizeclass, meta);
2999 }
3000 panic("page_marker %u not primary in meta %p", meta->km_page_marker, meta);
3001 }
3002
3003 __abortlike
3004 static void
3005 kmem_slot_has_entry_panic(
3006 vm_map_entry_t entry,
3007 vm_map_offset_t addr)
3008 {
3009 panic("Entry (%p) already exists for addr (%p) being returned",
3010 entry, (void *)addr);
3011 }
3012
3013 __abortlike
3014 static void
3015 kmem_slot_not_found(
3016 struct kmem_page_meta *meta,
3017 uint32_t slot_idx)
3018 {
3019 panic("%uth free slot not found for meta %p bitmap %u", slot_idx, meta,
3020 meta->km_bitmap);
3021 }
3022
3023 /*
3024 * Returns a 16bit random number between 0 and
3025 * upper_limit (inclusive)
3026 */
3027 __startup_func
3028 uint16_t
3029 kmem_get_random16(
3030 uint16_t upper_limit)
3031 {
3032 static uint64_t random_entropy;
3033 assert(upper_limit < UINT16_MAX);
3034 if (random_entropy == 0) {
3035 random_entropy = early_random();
3036 }
3037 uint32_t result = random_entropy & UINT32_MAX;
3038 random_entropy >>= 32;
3039 return (uint16_t)(result % (upper_limit + 1));
3040 }
3041
3042 static uint32_t
3043 kmem_get_nth_free_slot(
3044 struct kmem_page_meta *meta,
3045 uint32_t n,
3046 uint32_t bitmap)
3047 {
3048 uint32_t zeros_seen = 0, ones_seen = 0;
3049
3050 while (bitmap) {
3051 uint32_t count = __builtin_ctz(bitmap);
3052
3053 zeros_seen += count;
3054 bitmap >>= count;
3055 if (__probable(~bitmap)) {
3056 count = __builtin_ctz(~bitmap);
3057 } else {
3058 count = 32;
3059 }
3060 if (count + ones_seen > n) {
3061 return zeros_seen + n;
3062 }
3063 ones_seen += count;
3064 bitmap >>= count;
3065 }
3066
3067 kmem_slot_not_found(meta, n);
3068 }
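/*
 * A small trace of the scan above (purely illustrative): with
 * bitmap = 0b101100 (free slots at bit positions 2, 3 and 5) and n = 1,
 * the first iteration skips two zero bits (zeros_seen = 2), then sees a
 * run of two one bits; since 2 + ones_seen > n, the answer is
 * zeros_seen + n = 3, i.e. the second free slot. With n = 2 the scan
 * continues past that run and returns position 5.
 */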
3069
3070
3071 static uint32_t
3072 kmem_get_next_slot(
3073 struct kmem_page_meta *meta,
3074 struct kmem_sizeclass sizeclass,
3075 uint32_t bitmap)
3076 {
3077 uint32_t num_slots = __builtin_popcount(bitmap);
3078 uint64_t slot_idx = 0;
3079
3080 assert(num_slots > 0);
3081 if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
3082 /*
3083 * Use early random prior to early boot as the ks_rng_ctx requires
3084 * the corecrypto module to be setup before it is initialized and
3085 * used.
3086 *
3087 * num_slots can't be 0 as we take this path when we have more than
3088 * one slot left.
3089 */
3090 slot_idx = kmem_get_random16((uint16_t)num_slots - 1);
3091 } else {
3092 crypto_random_uniform(zpercpu_get(sizeclass.ks_rng_ctx), num_slots,
3093 &slot_idx);
3094 }
3095
3096 return kmem_get_nth_free_slot(meta, slot_idx, bitmap);
3097 }
3098
3099 /*
3100 * Returns an unallocated slot from the given metadata
3101 */
3102 static vm_map_offset_t
3103 kmem_get_addr_from_meta(
3104 struct kmem_page_meta *meta,
3105 vm_map_range_id_t range_id,
3106 struct kmem_sizeclass sizeclass,
3107 vm_map_entry_t *entry)
3108 {
3109 vm_map_offset_t addr;
3110 vm_map_size_t size = sizeclass.ks_size;
3111 uint32_t size_idx = kmem_get_idx_from_size(size);
3112 uint64_t meta_idx = meta - kmem_meta_base[range_id];
3113 mach_vm_offset_t range_start = kmem_ranges[range_id].min_address;
3114 uint32_t slot_bit;
3115 uint32_t slot_idx = kmem_get_next_slot(meta, sizeclass, meta->km_bitmap);
3116
3117 if ((slot_idx >= sizeclass.ks_num_elem) ||
3118 (meta->km_sizeclass != size_idx) ||
3119 (meta->km_page_marker != KMEM_META_PRIMARY)) {
3120 kmem_invalid_meta_panic(meta, slot_idx, sizeclass);
3121 }
3122
3123 slot_bit = kmem_slot_idx_to_bit(slot_idx, size_idx);
3124 meta->km_bitmap &= ~slot_bit;
3125
3126 addr = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN) + (slot_idx * size);
3127 assert(kmem_range_contains_fully(range_id, addr, size));
3128 if (vm_map_lookup_entry(kernel_map, addr, entry)) {
3129 kmem_slot_has_entry_panic(*entry, addr);
3130 }
3131 if ((*entry != vm_map_to_entry(kernel_map)) &&
3132 ((*entry)->vme_next != vm_map_to_entry(kernel_map)) &&
3133 ((*entry)->vme_next->vme_start < (addr + size))) {
3134 kmem_slot_has_entry_panic(*entry, addr);
3135 }
3136 return addr;
3137 }
3138
3139 __abortlike
3140 static void
3141 kmem_range_out_of_va(
3142 kmem_range_id_t range_id,
3143 uint32_t num_chunks)
3144 {
3145 panic("No more VA to allocate %u chunks in range %u", num_chunks, range_id);
3146 }
3147
3148 static void
3149 kmem_init_allocated_chunk(
3150 struct kmem_page_meta *meta,
3151 struct kmem_sizeclass sizeclass,
3152 uint32_t size_idx)
3153 {
3154 uint32_t meta_num = sizeclass.ks_num_chunk;
3155 uint32_t num_elem = sizeclass.ks_num_elem;
3156
3157 meta->km_bitmap = (1ull << num_elem) - 1;
3158 meta->km_chunk_len = (uint16_t)meta_num;
3159 assert(LIST_NEXT(meta, km_link) == NULL);
3160 assert(meta->km_link.le_prev == NULL);
3161 meta->km_sizeclass = (uint8_t)size_idx;
3162 meta->km_page_marker = KMEM_META_PRIMARY;
3163 meta++;
3164 for (uint32_t i = 1; i < meta_num; i++) {
3165 meta->km_page_idx = (uint16_t)i;
3166 meta->km_sizeclass = (uint8_t)size_idx;
3167 meta->km_page_marker = 0;
3168 meta->km_bitmap = 0;
3169 meta++;
3170 }
3171 }
3172
3173 static uint32_t
3174 kmem_get_additional_meta(
3175 struct kmem_page_meta *meta,
3176 uint32_t meta_req,
3177 bool from_right,
3178 struct kmem_page_meta **adj_free_meta)
3179 {
3180 struct kmem_page_meta *meta_prev = from_right ? meta : (meta - 1);
3181
3182 if (meta_prev->km_page_marker == KMEM_META_FREE) {
3183 uint32_t chunk_len = kmem_get_free_chunk_len(meta_prev);
3184
3185 *adj_free_meta = from_right ? meta_prev : (meta_prev - chunk_len + 1);
3186 meta_req -= chunk_len;
3187 } else {
3188 *adj_free_meta = NULL;
3189 }
3190
3191 return meta_req;
3192 }
3193
3194
3195 static struct kmem_page_meta *
3196 kmem_get_new_chunk(
3197 vm_map_range_id_t range_id,
3198 bool from_right,
3199 uint32_t size_idx)
3200 {
3201 struct kmem_sizeclass sizeclass = kmem_size_array[size_idx];
3202 struct kmem_page_meta *start, *end, *meta_update;
3203 struct kmem_page_meta *adj_free_meta = NULL;
3204 uint32_t meta_req = sizeclass.ks_num_chunk;
3205
3206 for (;;) {
3207 struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3208 struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3209 struct kmem_page_meta *meta;
3210 vm_offset_t start_addr, end_addr;
3211 uint32_t meta_num;
3212
3213 meta = from_right ? metab : metaf;
3214 meta_num = kmem_get_additional_meta(meta, meta_req, from_right,
3215 &adj_free_meta);
3216
3217 if (metaf + meta_num >= metab) {
3218 kmem_range_out_of_va(range_id, meta_num);
3219 }
3220
3221 start = from_right ? (metab - meta_num) : metaf;
3222 end = from_right ? metab : (metaf + meta_num);
3223
3224 start_addr = (vm_offset_t)start;
3225 end_addr = (vm_offset_t)end;
3226
3227 /*
3228 * If the new high watermark stays on the same page,
3229 * no need to populate and drop the lock.
3230 */
3231 if (!page_aligned(from_right ? end_addr : start_addr) &&
3232 trunc_page(start_addr) == trunc_page(end_addr - 1)) {
3233 break;
3234 }
3235 if (!kmem_populate_needed(start_addr, end_addr)) {
3236 break;
3237 }
3238
3239 kmem_populate_meta_locked(start_addr, end_addr);
3240
3241 /*
3242 * Since we dropped the lock, reassess whether the conditions still hold:
3243 * - the HWM we are changing must not have moved
3244 * - the other HWM must not intersect with ours
3245 * - in case of coalescing, the adjacent free meta must still
3246 * be free and of the same size.
3247 *
3248 * If we failed to grow, reevaluate whether freelists have
3249 * entries now by returning NULL.
3250 */
3251 metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3252 metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3253 if (meta != (from_right ? metab : metaf)) {
3254 return NULL;
3255 }
3256 if (metaf + meta_num >= metab) {
3257 kmem_range_out_of_va(range_id, meta_num);
3258 }
3259 if (adj_free_meta) {
3260 if (adj_free_meta->km_page_marker != KMEM_META_FREE ||
3261 kmem_get_free_chunk_len(adj_free_meta) !=
3262 meta_req - meta_num) {
3263 return NULL;
3264 }
3265 }
3266
3267 break;
3268 }
3269
3270 /*
3271 * If there is an adjacent free chunk, remove it from the free list
3272 */
3273 if (adj_free_meta) {
3274 LIST_REMOVE(adj_free_meta, km_link);
3275 LIST_NEXT(adj_free_meta, km_link) = NULL;
3276 adj_free_meta->km_link.le_prev = NULL;
3277 }
3278
3279 /*
3280 * Update hwm
3281 */
3282 meta_update = from_right ? start : end;
3283 kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta_update;
3284
3285 /*
3286 * Initialize metadata
3287 */
3288 start = from_right ? start : (end - meta_req);
3289 kmem_init_allocated_chunk(start, sizeclass, size_idx);
3290
3291 return start;
3292 }
3293
3294 static void
3295 kmem_requeue_meta(
3296 struct kmem_page_meta *meta,
3297 struct kmem_list_head *head)
3298 {
3299 LIST_REMOVE(meta, km_link);
3300 LIST_INSERT_HEAD(head, meta, km_link);
3301 }
3302
3303 /*
3304 * Return corresponding sizeclass to stash free chunks in
3305 */
3306 __abortlike
3307 static void
3308 kmem_invalid_chunk_num(uint32_t chunks)
3309 {
3310 panic("Invalid number of chunks %u\n", chunks);
3311 }
3312
3313 static uint32_t
3314 kmem_get_size_idx_for_chunks(uint32_t chunks)
3315 {
3316 for (uint32_t i = KMEM_NUM_SIZECLASS - 1; i > 0; i--) {
3317 if (chunks >= kmem_size_array[i].ks_num_chunk) {
3318 return i;
3319 }
3320 }
3321 kmem_invalid_chunk_num(chunks);
3322 }
3323
3324 static void
3325 kmem_clear_meta_range(struct kmem_page_meta *meta, uint32_t count)
3326 {
3327 bzero(meta, count * sizeof(struct kmem_page_meta));
3328 }
3329
3330 static void
3331 kmem_check_meta_range_is_clear(struct kmem_page_meta *meta, uint32_t count)
3332 {
3333 #if MACH_ASSERT
3334 size_t size = count * sizeof(struct kmem_page_meta);
3335
3336 assert(memcmp_zero_ptr_aligned(meta, size) == 0);
3337 #else
3338 #pragma unused(meta, count)
3339 #endif
3340 }
3341
3342 /*!
3343 * @function kmem_init_free_chunk()
3344 *
3345 * @discussion
3346 * This function prepares a range of chunks to be put on a free list.
3347 * The first and last metadata might be dirty, but the "inner" ones
3348 * must be zero filled by the caller prior to calling this function.
3349 */
3350 static void
3351 kmem_init_free_chunk(
3352 struct kmem_page_meta *meta,
3353 uint32_t num_chunks,
3354 uint32_t front)
3355 {
3356 struct kmem_sizeclass *sizeclass;
3357 uint32_t size_idx = kmem_get_size_idx_for_chunks(num_chunks);
3358
3359 if (num_chunks > 2) {
3360 kmem_check_meta_range_is_clear(meta + 1, num_chunks - 2);
3361 }
3362
3363 meta[0] = (struct kmem_page_meta){
3364 .km_free_chunks = num_chunks,
3365 .km_page_marker = KMEM_META_FREE,
3366 .km_sizeclass = (uint8_t)size_idx,
3367 };
3368 if (num_chunks > 1) {
3369 meta[num_chunks - 1] = (struct kmem_page_meta){
3370 .km_free_chunks = num_chunks,
3371 .km_page_marker = KMEM_META_FREE,
3372 .km_sizeclass = (uint8_t)size_idx,
3373 };
3374 }
3375
3376 sizeclass = &kmem_size_array[size_idx];
3377 LIST_INSERT_HEAD(&sizeclass->ks_allfree_head[front], meta, km_link);
3378 }
3379
3380 static struct kmem_page_meta *
3381 kmem_get_free_chunk_from_list(
3382 struct kmem_sizeclass *org_sizeclass,
3383 uint32_t size_idx,
3384 uint32_t front)
3385 {
3386 struct kmem_sizeclass *sizeclass;
3387 uint32_t num_chunks = org_sizeclass->ks_num_chunk;
3388 struct kmem_page_meta *meta;
3389 uint32_t idx = size_idx;
3390
3391 while (idx < KMEM_NUM_SIZECLASS) {
3392 sizeclass = &kmem_size_array[idx];
3393 meta = LIST_FIRST(&sizeclass->ks_allfree_head[front]);
3394 if (meta) {
3395 break;
3396 }
3397 idx++;
3398 }
3399
3400 /*
3401 * Trim if larger in size
3402 */
3403 if (meta) {
3404 uint32_t num_chunks_free = kmem_get_free_chunk_len(meta);
3405
3406 assert(meta->km_page_marker == KMEM_META_FREE);
3407 LIST_REMOVE(meta, km_link);
3408 LIST_NEXT(meta, km_link) = NULL;
3409 meta->km_link.le_prev = NULL;
3410 if (num_chunks_free > num_chunks) {
3411 num_chunks_free -= num_chunks;
3412 kmem_init_free_chunk(meta + num_chunks, num_chunks_free, front);
3413 }
3414
3415 kmem_init_allocated_chunk(meta, *org_sizeclass, size_idx);
3416 }
3417
3418 return meta;
3419 }
3420
3421 kern_return_t
3422 kmem_locate_space(
3423 vm_map_size_t size,
3424 vm_map_range_id_t range_id,
3425 bool from_right,
3426 vm_map_offset_t *start_inout,
3427 vm_map_entry_t *entry_out)
3428 {
3429 vm_map_entry_t entry;
3430 uint32_t size_idx = kmem_get_idx_from_size(size);
3431 uint32_t front = kmem_get_front(range_id, from_right);
3432 struct kmem_sizeclass *sizeclass = &kmem_size_array[size_idx];
3433 struct kmem_page_meta *meta;
3434
3435 assert(size <= sizeclass->ks_size);
3436 again:
3437 if ((meta = LIST_FIRST(&sizeclass->ks_partial_head[front])) != NULL) {
3438 *start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3439 /*
3440 * Requeue to full if necessary
3441 */
3442 assert(meta->km_page_marker == KMEM_META_PRIMARY);
3443 if (__builtin_popcount(meta->km_bitmap) == KMEM_NUM_GUARDS) {
3444 kmem_requeue_meta(meta, &sizeclass->ks_full_head[front]);
3445 }
3446 } else if ((meta = kmem_get_free_chunk_from_list(sizeclass, size_idx,
3447 front)) != NULL) {
3448 *start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3449 /*
3450 * Queue to partial
3451 */
3452 assert(meta->km_page_marker == KMEM_META_PRIMARY);
3453 assert(__builtin_popcount(meta->km_bitmap) > KMEM_NUM_GUARDS);
3454 LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link);
3455 } else {
3456 meta = kmem_get_new_chunk(range_id, from_right, size_idx);
3457 if (meta == NULL) {
3458 goto again;
3459 }
3460 *start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3461 assert(meta->km_page_marker == KMEM_META_PRIMARY);
3462 LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link);
3463 }
3464
3465 if (entry_out) {
3466 *entry_out = entry;
3467 }
3468
3469 return KERN_SUCCESS;
3470 }
3471
3472 /*
3473 * Determine whether the given metadata was allocated from the right
3474 */
3475 static bool
3476 kmem_meta_is_from_right(
3477 kmem_range_id_t range_id,
3478 struct kmem_page_meta *meta)
3479 {
3480 struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3481 __assert_only struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3482 struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
3483 struct kmem_page_meta *meta_end;
3484
3485 meta_end = (struct kmem_page_meta *)kmem_meta_range[range_id].max_address;
3486
3487 if ((meta >= meta_base) && (meta < metaf)) {
3488 return false;
3489 }
3490
3491 assert(meta >= metab && meta < meta_end);
3492 return true;
3493 }
3494
3495 static void
3496 kmem_free_chunk(
3497 kmem_range_id_t range_id,
3498 struct kmem_page_meta *meta,
3499 bool from_right)
3500 {
3501 struct kmem_page_meta *meta_coalesce = meta - 1;
3502 struct kmem_page_meta *meta_start = meta;
3503 uint32_t num_chunks = kmem_get_chunk_len(meta);
3504 uint32_t add_chunks;
3505 struct kmem_page_meta *meta_end = meta + num_chunks;
3506 struct kmem_page_meta *meta_hwm_l, *meta_hwm_r;
3507 uint32_t front = kmem_get_front(range_id, from_right);
3508
3509 meta_hwm_l = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3510 meta_hwm_r = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3511
3512 LIST_REMOVE(meta, km_link);
3513 kmem_clear_meta_range(meta, num_chunks);
3514
3515 /*
3516 * Coalesce left
3517 */
3518 if (((from_right && (meta_coalesce >= meta_hwm_r)) || !from_right) &&
3519 (meta_coalesce->km_page_marker == KMEM_META_FREE)) {
3520 meta_start = meta_coalesce - kmem_get_free_chunk_len(meta_coalesce) + 1;
3521 add_chunks = kmem_get_free_chunk_len(meta_start);
3522 num_chunks += add_chunks;
3523 LIST_REMOVE(meta_start, km_link);
3524 kmem_clear_meta_range(meta_start + add_chunks - 1, 1);
3525 }
3526
3527 /*
3528 * Coalesce right
3529 */
3530 if (((!from_right && (meta_end < meta_hwm_l)) || from_right) &&
3531 (meta_end->km_page_marker == KMEM_META_FREE)) {
3532 add_chunks = kmem_get_free_chunk_len(meta_end);
3533 LIST_REMOVE(meta_end, km_link);
3534 kmem_clear_meta_range(meta_end, 1);
3535 meta_end = meta_end + add_chunks;
3536 num_chunks += add_chunks;
3537 }
3538
3539 kmem_init_free_chunk(meta_start, num_chunks, front);
3540 }
3541
3542 static void
3543 kmem_free_slot(
3544 kmem_range_id_t range_id,
3545 mach_vm_range_t slot)
3546 {
3547 struct kmem_page_meta *meta;
3548 vm_map_offset_t chunk_start;
3549 uint32_t size_idx, chunk_elem, slot_idx, num_elem;
3550 struct kmem_sizeclass *sizeclass;
3551 vm_map_size_t slot_size;
3552
3553 meta = kmem_addr_to_meta_start(slot->min_address, range_id, &chunk_start);
3554 size_idx = meta->km_sizeclass;
3555 slot_size = kmem_get_size_from_idx(size_idx);
3556 slot_idx = (slot->min_address - chunk_start) / slot_size;
3557 assert((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) == 0);
3558 meta->km_bitmap |= kmem_slot_idx_to_bit(slot_idx, size_idx);
3559
3560 sizeclass = &kmem_size_array[size_idx];
3561 chunk_elem = sizeclass->ks_num_elem;
3562 num_elem = __builtin_popcount(meta->km_bitmap);
3563
3564 if (num_elem == chunk_elem) {
3565 /*
3566 * If the entire chunk is empty, add it to the all-free list
3567 */
3568 bool from_right = kmem_meta_is_from_right(range_id, meta);
3569
3570 kmem_free_chunk(range_id, meta, from_right);
3571 } else if (num_elem == KMEM_NUM_GUARDS + 1) {
3572 /*
3573 * If we freed a slot in a full chunk, move it to the partial list
3574 */
3575 uint32_t front = kmem_get_front(range_id,
3576 kmem_meta_is_from_right(range_id, meta));
3577
3578 kmem_requeue_meta(meta, &sizeclass->ks_partial_head[front]);
3579 }
3580 }
3581
3582 void
3583 kmem_free_space(
3584 vm_map_offset_t start,
3585 vm_map_offset_t end,
3586 vm_map_range_id_t range_id,
3587 mach_vm_range_t slot)
3588 {
3589 bool entry_present = false;
3590 vm_map_entry_t prev_entry;
3591 vm_map_entry_t next_entry;
3592
3593 if ((slot->min_address == start) && (slot->max_address == end)) {
3594 /*
3595 * Entire slot is being freed at once
3596 */
3597 return kmem_free_slot(range_id, slot);
3598 }
3599
3600 entry_present = vm_map_lookup_entry(kernel_map, start, &prev_entry);
3601 assert(!entry_present);
3602 next_entry = prev_entry->vme_next;
3603
3604 if (((prev_entry == vm_map_to_entry(kernel_map) ||
3605 prev_entry->vme_end <= slot->min_address)) &&
3606 (next_entry == vm_map_to_entry(kernel_map) ||
3607 (next_entry->vme_start >= slot->max_address))) {
3608 /*
3609 * Free entire slot
3610 */
3611 kmem_free_slot(range_id, slot);
3612 }
3613 }
3614
3615 #pragma mark kmem init
3616
3617 /*
3618 * The default percentage of memory that can be mlocked is scaled based on the total
3619 * amount of memory in the system. These percentages are calculated
3620 * offline and stored in this table. We index this table by
3621 * log2(max_mem) - VM_USER_WIREABLE_MIN_CONFIG. We clamp this index in the range
3622 * [0, sizeof(wire_limit_percents) / sizeof(vm_map_size_t))
3623 *
3624 * Note that these values were picked for mac.
3625 * If we ever have very large memory config arm devices, we may want to revisit
3626 * since the kernel overhead is smaller there due to the larger page size.
3627 */
3628
3629 /* Start scaling iff we're managing > 2^32 = 4GB of RAM. */
3630 #define VM_USER_WIREABLE_MIN_CONFIG 32
3631 #if CONFIG_JETSAM
3632 /* Systems with jetsam can wire a bit more because the system can relieve wired
3633 * pressure.
3634 */
3635 static vm_map_size_t wire_limit_percents[] =
3636 { 80, 80, 80, 80, 82, 85, 88, 91, 94, 97};
3637 #else
3638 static vm_map_size_t wire_limit_percents[] =
3639 { 70, 73, 76, 79, 82, 85, 88, 91, 94, 97};
3640 #endif /* CONFIG_JETSAM */
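/*
 * Illustrative example (assuming the log2 indexing described above): on a
 * 64GB non-jetsam configuration, log2(max_mem) == 36, so the table index is
 * 36 - VM_USER_WIREABLE_MIN_CONFIG == 4 and wire_limit_percents[4] == 82,
 * i.e. roughly 52GB may be mlocked by default (before the
 * VM_NOT_USER_WIREABLE_MAX cap applied below is taken into account).
 */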
3641
3642 /* Set limit to 95% of DRAM if serverperfmode=1 */
3643 #define VM_USER_SERVERPERF_WIRE_LIMIT_PERCENT 95
3644 /* Use special serverperfmode behavior iff DRAM > 2^35 = 32GiB of RAM. */
3645 #define VM_USER_SERVERPERF_WIREABLE_MIN_CONFIG 35
3646
3647 /*
3648 * Sets the default global user wire limit which limits the amount of
3649 * memory that can be locked via mlock(), based on the above algorithm.
3650 * This can be overridden via a sysctl.
3651 */
3652 static void
3653 kmem_set_user_wire_limits(void)
3654 {
3655 uint64_t available_mem_log;
3656 uint64_t max_wire_percent;
3657 size_t wire_limit_percents_length = sizeof(wire_limit_percents) /
3658 sizeof(vm_map_size_t);
3659 vm_map_size_t limit;
3660 uint64_t config_memsize = max_mem;
3661 #if defined(XNU_TARGET_OS_OSX)
3662 config_memsize = max_mem_actual;
3663 #endif /* defined(XNU_TARGET_OS_OSX) */
3664
3665 available_mem_log = bit_floor(config_memsize);
3666
3667 if (serverperfmode &&
3668 (available_mem_log >= VM_USER_SERVERPERF_WIREABLE_MIN_CONFIG)) {
3669 max_wire_percent = VM_USER_SERVERPERF_WIRE_LIMIT_PERCENT;
3670 } else {
3671 if (available_mem_log < VM_USER_WIREABLE_MIN_CONFIG) {
3672 available_mem_log = 0;
3673 } else {
3674 available_mem_log -= VM_USER_WIREABLE_MIN_CONFIG;
3675 }
3676 if (available_mem_log >= wire_limit_percents_length) {
3677 available_mem_log = wire_limit_percents_length - 1;
3678 }
3679 max_wire_percent = wire_limit_percents[available_mem_log];
3680 }
3681
3682 limit = config_memsize * max_wire_percent / 100;
3683 /* Cap the number of non lockable bytes at VM_NOT_USER_WIREABLE_MAX */
3684 if (config_memsize - limit > VM_NOT_USER_WIREABLE_MAX) {
3685 limit = config_memsize - VM_NOT_USER_WIREABLE_MAX;
3686 }
3687
3688 vm_global_user_wire_limit = limit;
3689 /* the default per task limit is the same as the global limit */
3690 vm_per_task_user_wire_limit = limit;
3691 vm_add_wire_count_over_global_limit = 0;
3692 vm_add_wire_count_over_user_limit = 0;
3693 }
3694
3695 #define KMEM_MAX_CLAIMS 50
3696 __startup_data
3697 struct kmem_range_startup_spec kmem_claims[KMEM_MAX_CLAIMS] = {};
3698
3699 #if !MACH_ASSERT
3700 __startup_data
3701 #endif /* !MACH_ASSERT */
3702 uint32_t kmem_claim_count = 0;
3703
3704 #if MACH_ASSERT
3705 /**
3706 * Save off some minimal information about the ranges for consumption by
3707 * post-lockdown tests.
3708 */
3709 static struct mach_vm_range kmem_test_saved_ranges[KMEM_MAX_CLAIMS];
3710 #endif /* MACH_ASSERT */
3711
3712 /**
3713 * For a requested claim size (i.e. kc_size), get the number of bytes which
3714 * should actually be allocated for a region in order to be able to properly
3715 * provide the requested size (the allocation size).
3716 *
3717 * This allocation size is always greater than or equal to the claim size. It can,
3718 * for example, include additional space as required by the kernel memory
3719 * configuration.
3720 *
3721 * @param known_last Is the claim in question known to be the last region after
3722 * all placing has completed? The size for a known_last allocation is always
3723 * less than or equal to a non-known_last allocation of the same size.
3724 */
3725 __startup_func
3726 static vm_map_size_t
3727 kmem_claim_to_allocation_size(vm_map_size_t claim_size, bool known_last)
3728 {
3729 (void)known_last;
3730 /*
3731 * Allocation size and claim size are identical.
3732 */
3733 return claim_size;
3734 }
3735
3736 /**
3737 * Compute the largest claim which can be made from a given allocation size.
3738 */
3739 static vm_map_size_t
3740 kmem_allocation_to_claim_size(vm_map_size_t allocation_size)
3741 {
3742 /*
3743 * Allocation size and claim size are identical.
3744 */
3745 return allocation_size;
3746 }
3747
3748 __startup_func
3749 void
3750 kmem_range_startup_init(
3751 struct kmem_range_startup_spec *sp)
3752 {
3753 assert(kmem_claim_count < KMEM_MAX_CLAIMS - KMEM_RANGE_COUNT);
3754 if (sp->kc_calculate_sz) {
3755 sp->kc_size = (sp->kc_calculate_sz)();
3756 }
3757 if (sp->kc_size) {
3758 kmem_claims[kmem_claim_count] = *sp;
3759 kmem_claim_count++;
3760 }
3761 }
3762
3763 static vm_offset_t
3764 kmem_fuzz_start(void)
3765 {
3766 vm_offset_t kmapoff_kaddr = 0;
3767 uint32_t kmapoff_pgcnt;
3768
3769 kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */
3770
3771 vm_map_size_t kmapoff_size = ptoa(kmapoff_pgcnt);
3772
3773 kmem_alloc(kernel_map, &kmapoff_kaddr, kmapoff_size,
3774 KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY,
3775 VM_KERN_MEMORY_OSFMK);
3776
3777
3778 return kmapoff_kaddr + kmapoff_size;
3779 }
3780
3781 /*
3782 * Generate a randomly shuffled array of indices from 0 to count - 1
3783 */
3784 __startup_func
3785 void
3786 kmem_shuffle(
3787 uint16_t *shuffle_buf,
3788 uint16_t count)
3789 {
3790 for (uint16_t i = 0; i < count; i++) {
3791 uint16_t j = kmem_get_random16(i);
3792 if (j != i) {
3793 shuffle_buf[i] = shuffle_buf[j];
3794 }
3795 shuffle_buf[j] = i;
3796 }
3797 }
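/*
 * This is the "inside-out" variant of the Fisher-Yates shuffle (assuming
 * kmem_get_random16(i) returns a uniformly random value in [0, i]): element
 * i is placed at a random position j, after the previous occupant of j has
 * been moved to i. For example, with count == 3 and random draws j = 0, 0, 1
 * the buffer evolves as {0}, {1, 0}, {1, 2, 0}.
 */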
3798
3799 __startup_func
3800 static void
3801 kmem_shuffle_claims(void)
3802 {
3803 uint16_t shuffle_buf[KMEM_MAX_CLAIMS] = {};
3804 uint16_t limit = (uint16_t)kmem_claim_count;
3805
3806 kmem_shuffle(&shuffle_buf[0], limit);
3807 for (uint16_t i = 0; i < limit; i++) {
3808 struct kmem_range_startup_spec tmp = kmem_claims[i];
3809 kmem_claims[i] = kmem_claims[shuffle_buf[i]];
3810 kmem_claims[shuffle_buf[i]] = tmp;
3811 }
3812 }
3813
3814 __startup_func
3815 static void
3816 kmem_readjust_ranges(
3817 uint32_t cur_idx)
3818 {
3819 assert(cur_idx != 0);
3820 uint32_t j = cur_idx - 1, random;
3821 struct kmem_range_startup_spec sp = kmem_claims[cur_idx];
3822 struct mach_vm_range *sp_range = sp.kc_range;
3823 /*
3824 * Even if sp is currently last, it will never be last after it is moved.
3825 * As such, we want to bump other claims over it and include any necessary
3826 * padding for a non-last claim.
3827 *
3828 * While changing which claim is last can impact the total VA usage, since a
3829 * known_last allocation size is guaranteed to always be less-than-or-equal
3830 * to a non-known_last allocation (which is used for pre-placement sizing),
3831 * we will always have enough space so long as the pre-placement sizing had
3832 * enough space.
3833 */
3834 vm_map_offset_t sp_allocation_size =
3835 kmem_claim_to_allocation_size(sp.kc_size, /* known_last */ false);
3836
3837 /*
3838 * Find max index where restriction is met
3839 */
3840 for (; j > 0; j--) {
3841 struct kmem_range_startup_spec spj = kmem_claims[j];
3842 vm_map_offset_t max_start = spj.kc_range->min_address;
3843 if (spj.kc_flags & KC_NO_MOVE) {
3844 panic("kmem_range_init: Can't scramble with multiple constraints");
3845 }
3846 if (max_start <= sp_range->min_address) {
3847 break;
3848 }
3849 }
3850
3851 /*
3852 * Pick a random index from 0 to max index and shift claims to the right
3853 * to make room for the restricted claim
3854 */
3855 random = kmem_get_random16((uint16_t)j);
3856 assert(random <= j);
3857
3858 sp_range->min_address = kmem_claims[random].kc_range->min_address;
3859 sp_range->max_address = sp_range->min_address + sp.kc_size;
3860
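	/*
	 * Shift every claim between the chosen slot and cur_idx one position to
	 * the right (bumping its addresses by the allocation size) to open up
	 * room at index `random`. The `j != UINT32_MAX` test guards against
	 * unsigned wrap-around when random == 0 and j decrements past zero.
	 */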
3861 for (j = cur_idx - 1; j >= random && j != UINT32_MAX; j--) {
3862 struct kmem_range_startup_spec spj = kmem_claims[j];
3863 struct mach_vm_range *range = spj.kc_range;
3864 range->min_address += sp_allocation_size;
3865 range->max_address += sp_allocation_size;
3866 kmem_claims[j + 1] = spj;
3867 }
3868
3869 sp.kc_flags |= KC_NO_MOVE;
3870 kmem_claims[random] = sp;
3871 }
3872
3873 __startup_func
3874 static void
3875 kmem_add_ptr_claims(void)
3876 {
3877 uint64_t kmem_meta_num, kmem_ptr_chunks;
3878 vm_map_size_t org_ptr_range_size __assert_only;
3879
3880 org_ptr_range_size = ptr_range_size;
3881
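	/*
	 * Split the per-range VA budget between usable chunks and their
	 * metadata: every KMEM_CHUNK_SIZE_MIN bytes of pointer VA needs
	 * sizeof(struct kmem_page_meta) bytes of metadata, so the budget is
	 * scaled by CHUNK / (CHUNK + meta), rounded down to whole chunks, and
	 * the metadata claim (plus two extra entries) is sized accordingly.
	 * The assert below checks that chunks plus metadata still fit within
	 * the original budget.
	 */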
3882 ptr_range_size -= PAGE_SIZE;
3883 ptr_range_size *= KMEM_CHUNK_SIZE_MIN;
3884 ptr_range_size /= (KMEM_CHUNK_SIZE_MIN + sizeof(struct kmem_page_meta));
3885
3886 kmem_ptr_chunks = ptr_range_size / KMEM_CHUNK_SIZE_MIN;
3887 ptr_range_size = kmem_ptr_chunks * KMEM_CHUNK_SIZE_MIN;
3888
3889 kmem_meta_num = kmem_ptr_chunks + 2;
3890 kmem_meta_size = round_page(kmem_meta_num * sizeof(struct kmem_page_meta));
3891
3892 assert(kmem_meta_size + ptr_range_size <= org_ptr_range_size);
3893 /*
3894 * Add claims for kmem's ranges
3895 */
3896 for (uint32_t i = 0; i < kmem_ptr_ranges; i++) {
3897 struct kmem_range_startup_spec kmem_spec = {
3898 .kc_name = "kmem_ptr_range",
3899 .kc_range = &kmem_ranges[KMEM_RANGE_ID_PTR_0 + i],
3900 .kc_size = ptr_range_size,
3901 .kc_flags = KC_NO_ENTRY,
3902 };
3903 kmem_claims[kmem_claim_count++] = kmem_spec;
3904
3905 struct kmem_range_startup_spec kmem_meta_spec = {
3906 .kc_name = "kmem_ptr_range_meta",
3907 .kc_range = &kmem_meta_range[KMEM_RANGE_ID_PTR_0 + i],
3908 .kc_size = kmem_meta_size,
3909 .kc_flags = KC_NONE,
3910 };
3911 kmem_claims[kmem_claim_count++] = kmem_meta_spec;
3912 }
3913 }
3914
3915 __startup_func
3916 static void
3917 kmem_add_extra_claims(void)
3918 {
3919 vm_map_size_t largest_free_size = 0, total_claims = 0;
3920 vm_map_size_t sane_sprayqtn_size = 0, sprayqtn_allocation_size = 0;
3921 vm_map_size_t ptr_total_allocation_size = 0;
3922
3923 vm_map_sizes(kernel_map, NULL, NULL, &largest_free_size);
3924 largest_free_size = trunc_page(largest_free_size);
3925
3926 /*
3927 * kasan and configs without *TRR need to have just one ptr range due to
3928 * resource constraints.
3929 */
3930 #if !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
3931 kmem_ptr_ranges = 1;
3932 #endif
3933 /*
3934 * Determine size of data and pointer kmem_ranges
3935 */
3936 for (uint32_t i = 0; i < kmem_claim_count; i++) {
3937 struct kmem_range_startup_spec sp_i = kmem_claims[i];
3938
3939 total_claims += kmem_claim_to_allocation_size(
3940 sp_i.kc_size, /* known_last */ false);
3941 }
3942 assert((total_claims & PAGE_MASK) == 0);
3943
3944
3945 largest_free_size -= total_claims;
3946
3947 /*
3948 * Use half the total available VA for all pointer allocations (this
3949 * includes the kmem_sprayqtn range). Given that we have 4 total
3950 * ranges, divide the available VA by 8.
3951 */
3952 ptr_range_size = largest_free_size / ((kmem_ptr_ranges + 1) * 2);
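	/*
	 * Illustrative example: with kmem_ptr_ranges == 3 and 512GB of free VA,
	 * each of the four pointer-style ranges (3 pointer ranges plus the
	 * spray quarantine) starts out with 512GB / 8 == 64GB, before the
	 * quarantine is trimmed below.
	 */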
3953
3954 sprayqtn_range_size = ptr_range_size;
3955 sane_sprayqtn_size = kmem_claim_to_allocation_size(
3956 /* claim_size */ sane_size / 2, /* known_last */ false);
3957 if (sprayqtn_range_size > sane_sprayqtn_size) {
3958 vm_map_size_t sprayqtn_extra;
3959
3960 /*
3961 * Spray quarantine doesn't need that much space.
3962 * Shrink it to something reasonable and equally share the leftover VA
3963 * with the other pointer ranges.
3964 */
3965 sprayqtn_extra = sprayqtn_range_size - sane_sprayqtn_size;
3966 sprayqtn_range_size -= sprayqtn_extra;
3967 ptr_range_size += sprayqtn_extra / kmem_ptr_ranges;
3968 }
3969
3970 ptr_range_size = round_page(ptr_range_size);
3971 sprayqtn_range_size = round_page(sprayqtn_range_size);
3972
3973 iokit_range_size = 0;
3974
3975 /* Less any necessary allocation padding... */
3976 ptr_range_size = kmem_allocation_to_claim_size(ptr_range_size);
3977 sprayqtn_range_size = kmem_allocation_to_claim_size(sprayqtn_range_size);
3978
3979 /*
3980 * Add the pointer and metadata claims
3981 * Note: this call modifies ptr_range_size and may, depending on the padding
3982 * requirements, slightly increase or decrease the overall allocation size
3983 * of the pointer+metadata region.
3984 */
3985 kmem_add_ptr_claims();
3986
3987 sprayqtn_allocation_size = kmem_claim_to_allocation_size(
3988 sprayqtn_range_size, /* known_last */ false);
3989 ptr_total_allocation_size =
3990 (kmem_claim_to_allocation_size(ptr_range_size, /* known_last */ false) +
3991 kmem_claim_to_allocation_size(kmem_meta_size, /* known_last */ false)) *
3992 kmem_ptr_ranges;
3993
3994 /*
3995 * Check: spray and ptr_range are minimally valid.
3996 * This is a useful assert as it should catch us if we were to end up with a
3997 * "negative" (or extremely large) data_range_size.
3998 */
3999 assert(sprayqtn_allocation_size + ptr_total_allocation_size < largest_free_size);
4000
4001 /*
4002 * Finally, give any remaining allocable space to the data region.
4003 */
4004 data_range_size = largest_free_size - sprayqtn_allocation_size -
4005 ptr_total_allocation_size;
4006
4007 #if defined(ARM_LARGE_MEMORY)
4008 /*
4009 * Reserve space for our dedicated IOKit carveout.
4010 * Currently, we carve off a quarter of the data region.
4011 */
4012 iokit_range_size = round_page(data_range_size / 4);
4013 data_range_size -= kmem_claim_to_allocation_size(
4014 iokit_range_size, /* known_last */ false);
4015 #endif /* defined(ARM_LARGE_MEMORY) */
4016
4017 /* Less any necessary allocation padding... */
4018 data_range_size = kmem_allocation_to_claim_size(data_range_size);
4019
4020 /* Check: our allocations should all still fit in the free space */
4021 assert(sprayqtn_allocation_size + ptr_total_allocation_size +
4022 kmem_claim_to_allocation_size(iokit_range_size, /* known_last */ false) +
4023 kmem_claim_to_allocation_size(data_range_size, /* known_last */ false) <=
4024 largest_free_size);
4025
4026 struct kmem_range_startup_spec kmem_spec_sprayqtn = {
4027 .kc_name = "kmem_sprayqtn_range",
4028 .kc_range = &kmem_ranges[KMEM_RANGE_ID_SPRAYQTN],
4029 .kc_size = sprayqtn_range_size,
4030 .kc_flags = KC_NO_ENTRY,
4031 };
4032 kmem_claims[kmem_claim_count++] = kmem_spec_sprayqtn;
4033
4034 /*
4035 * If !defined(ARM_LARGE_MEMORY), KMEM_RANGE_ID_IOKIT is coalesced into the data range.
4036 * This is to minimize wasted translation tables in constrained environments.
4037 * The coalescing happens during kmem_scramble_ranges.
4038 */
4039 #if defined(ARM_LARGE_MEMORY)
4040 struct kmem_range_startup_spec kmem_spec_iokit = {
4041 .kc_name = "kmem_iokit_range",
4042 .kc_range = &kmem_ranges[KMEM_RANGE_ID_IOKIT],
4043 .kc_size = iokit_range_size,
4044 .kc_flags = KC_NO_ENTRY,
4045 };
4046 kmem_claims[kmem_claim_count++] = kmem_spec_iokit;
4047 #endif /* defined(ARM_LARGE_MEMORY) */
4048
4049 struct kmem_range_startup_spec kmem_spec_data = {
4050 .kc_name = "kmem_data_range",
4051 .kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA],
4052 .kc_size = data_range_size,
4053 .kc_flags = KC_NO_ENTRY,
4054 };
4055 kmem_claims[kmem_claim_count++] = kmem_spec_data;
4056 }
4057
4058 __startup_func
4059 static void
4060 kmem_scramble_ranges(void)
4061 {
4062 vm_map_offset_t va_alloc_head = 0;
4063
4064 /*
4065 * Initialize the KMEM_RANGE_ID_NONE range to use the entire map so that
4066 * the vm can find the requested ranges.
4067 */
4068 kmem_ranges[KMEM_RANGE_ID_NONE].min_address = MAX(kernel_map->min_offset,
4069 VM_MAP_PAGE_SIZE(kernel_map));
4070 kmem_ranges[KMEM_RANGE_ID_NONE].max_address = kernel_map->max_offset;
4071
4072 /*
4073 * Allocate the g_kext_map prior to randomizing the remaining submaps, as
4074 * this map is 2G in size and starts at the end of kernel_text on x86; it
4075 * could otherwise overflow into the heap.
4076 */
4077 kext_alloc_init();
4078
4079 /*
4080 * Eat a random amount of kernel_map to fuzz subsequent heap, zone and
4081 * stack addresses. (With a 4K page and 9 bits of randomness, this
4082 * eats about 2M of VA from the map)
4083 *
4084 * Note that we always need to slide by at least one page because the VM
4085 * pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a base
4086 * do not admit this address to be part of any zone submap.
4087 */
4088 va_alloc_head = kmem_fuzz_start();
4089
4090 /*
4091 * Add claims for ptr and data kmem_ranges
4092 */
4093 kmem_add_extra_claims();
4094
4095 /*
4096 * Minimally verify that our placer will be able to resolve the constraints
4097 * of all claims
4098 */
4099 bool has_min_address = false;
4100 for (uint32_t i = 0; i < kmem_claim_count; i++) {
4101 struct kmem_range_startup_spec sp_i = kmem_claims[i];
4102
4103 /* Verify that we have only one claim with a min address constraint */
4104 if (sp_i.kc_range->min_address) {
4105 if (has_min_address) {
4106 panic("Cannot place with multiple min_address constraints");
4107 } else {
4108 has_min_address = true;
4109 }
4110 }
4111
4112 if (sp_i.kc_range->max_address) {
4113 panic("Cannot place with a max_address constraint");
4114 }
4115 }
4116
4117
4118 /*
4119 * Shuffle registered claims
4120 */
4121 assert(kmem_claim_count < UINT16_MAX);
4122 kmem_shuffle_claims();
4123
4124 /*
4125 * Apply restrictions and determine range for each claim
4126 */
4127 for (uint32_t i = 0; i < kmem_claim_count; i++) {
4128 struct kmem_range_startup_spec sp = kmem_claims[i];
4129 struct mach_vm_range *sp_range = sp.kc_range;
4130
4131 /*
4132 * Find space using the allocation size (rather than the claim size) in
4133 * order to ensure we provide any applicable padding.
4134 */
4135 bool is_last = (i == kmem_claim_count - 1);
4136 vm_map_offset_t sp_allocation_size =
4137 kmem_claim_to_allocation_size(sp.kc_size, is_last);
4138
4139 if (vm_map_locate_space_anywhere(kernel_map, sp_allocation_size, 0,
4140 VM_MAP_KERNEL_FLAGS_ANYWHERE(.vmkf_no_soft_limit = true),
4141 &va_alloc_head, NULL) != KERN_SUCCESS) {
4142 panic("kmem_range_init: vm_map_locate_space failing for claim %s, "
4143 "size 0x%llx",
4144 sp.kc_name, sp_allocation_size);
4145 }
4146
4147 /*
4148 * Re-adjust ranges if restriction not met
4149 */
4150 if (sp_range->min_address && va_alloc_head > sp_range->min_address) {
4151 kmem_readjust_ranges(i);
4152 } else {
4153 /*
4154 * Though the actual allocated space may be larger, provide only the
4155 * size requested by the original claim.
4156 */
4157 sp_range->min_address = va_alloc_head;
4158 sp_range->max_address = va_alloc_head + sp.kc_size;
4159 }
4160
4161 va_alloc_head += sp_allocation_size;
4162 }
4163
4164 /*
4165 * We have settled on the ranges, now create temporary entries for the
4166 * claims
4167 */
4168 for (uint32_t i = 0; i < kmem_claim_count; i++) {
4169 struct kmem_range_startup_spec sp = kmem_claims[i];
4170 bool is_last = (i == kmem_claim_count - 1);
4171 vm_map_offset_t sp_allocation_size =
4172 kmem_claim_to_allocation_size(sp.kc_size, is_last);
4173 vm_map_entry_t entry = NULL;
4174 if (sp.kc_flags & KC_NO_ENTRY) {
4175 continue;
4176 }
4177
4178
4179 /*
4180 * We reserve the full allocation size (rather than the claim size) so
4181 * that nothing ends up placed in the padding space (if applicable).
4182 */
4183 if (vm_map_find_space(kernel_map, sp.kc_range->min_address,
4184 sp_allocation_size, 0,
4185 VM_MAP_KERNEL_FLAGS_ANYWHERE(.vmkf_no_soft_limit = true),
4186 &entry) != KERN_SUCCESS) {
4187 panic("kmem_range_init: vm_map_find_space failing for claim %s",
4188 sp.kc_name);
4189 }
4190 vm_object_reference(kernel_object_default);
4191 VME_OBJECT_SET(entry, kernel_object_default, false, 0);
4192 VME_OFFSET_SET(entry, entry->vme_start);
4193 vm_map_unlock(kernel_map);
4194 }
4195
4196 /*
4197 * If we're not on a large memory system KMEM_RANGE_ID_IOKIT acts as a synonym for KMEM_RANGE_ID_DATA.
4198 * On large memory systems KMEM_RANGE_ID_IOKIT is a dedicated carveout.
4199 */
4200 #if !defined(ARM_LARGE_MEMORY)
4201 kmem_ranges[KMEM_RANGE_ID_IOKIT] = kmem_ranges[KMEM_RANGE_ID_DATA];
4202 #endif /* !defined(ARM_LARGE_MEMORY) */
4203
4204 /*
4205 * Now that we are done assigning all the ranges, reset
4206 * kmem_ranges[KMEM_RANGE_ID_NONE]
4207 */
4208 kmem_ranges[KMEM_RANGE_ID_NONE] = (struct mach_vm_range) {};
4209
4210 #if DEBUG || DEVELOPMENT
4211 for (uint32_t i = 0; i < kmem_claim_count; i++) {
4212 struct kmem_range_startup_spec sp = kmem_claims[i];
4213
4214 printf("%-24s: %p - %p (%u%c)\n", sp.kc_name,
4215 (void *)sp.kc_range->min_address,
4216 (void *)sp.kc_range->max_address,
4217 mach_vm_size_pretty(sp.kc_size),
4218 mach_vm_size_unit(sp.kc_size));
4219 }
4220 #endif /* DEBUG || DEVELOPMENT */
4221
4222 #if MACH_ASSERT
4223 /*
4224 * Since many parts of the claim infrastructure are marked as startup data
4225 * (and are thus unavailable post-lockdown), save off information our tests
4226 * need now.
4227 */
4228 for (uint32_t i = 0; i < kmem_claim_count; i++) {
4229 kmem_test_saved_ranges[i] = *(kmem_claims[i].kc_range);
4230 }
4231 #endif /* MACH_ASSERT */
4232 }
4233
4234 __startup_func
4235 static void
4236 kmem_range_init(void)
4237 {
4238 vm_size_t range_adjustment;
4239
4240 kmem_scramble_ranges();
4241
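	/*
	 * Derive the "large" sub-range of each scrambled range: skip the first
	 * eighth of the range when computing kmem_large_ranges, so allocations
	 * placed via the large ranges land in the upper seven eighths of the
	 * corresponding range.
	 */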
4242 range_adjustment = sprayqtn_range_size >> 3;
4243 kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address =
4244 kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address + range_adjustment;
4245 kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address =
4246 kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address;
4247
4248 range_adjustment = iokit_range_size >> 3;
4249 kmem_large_ranges[KMEM_RANGE_ID_IOKIT].min_address =
4250 kmem_ranges[KMEM_RANGE_ID_IOKIT].min_address + range_adjustment;
4251 kmem_large_ranges[KMEM_RANGE_ID_IOKIT].max_address =
4252 kmem_ranges[KMEM_RANGE_ID_IOKIT].max_address;
4253
4254 range_adjustment = data_range_size >> 3;
4255 kmem_large_ranges[KMEM_RANGE_ID_DATA].min_address =
4256 kmem_ranges[KMEM_RANGE_ID_DATA].min_address + range_adjustment;
4257 kmem_large_ranges[KMEM_RANGE_ID_DATA].max_address =
4258 kmem_ranges[KMEM_RANGE_ID_DATA].max_address;
4259
4260 pmap_init();
4261 kmem_metadata_init();
4262 kmem_sizeclass_init();
4263
4264 #if DEBUG || DEVELOPMENT
4265 for (kmem_range_id_t i = 1; i < KMEM_RANGE_COUNT; i++) {
4266 vm_size_t range_size = mach_vm_range_size(&kmem_large_ranges[i]);
4267 printf("kmem_large_ranges[%d] : %p - %p (%u%c)\n", i,
4268 (void *)kmem_large_ranges[i].min_address,
4269 (void *)kmem_large_ranges[i].max_address,
4270 mach_vm_size_pretty(range_size),
4271 mach_vm_size_unit(range_size));
4272 }
4273 #endif
4274 }
4275 STARTUP(KMEM, STARTUP_RANK_THIRD, kmem_range_init);
4276
4277 #if DEBUG || DEVELOPMENT
4278 __startup_func
4279 static void
4280 kmem_log_init(void)
4281 {
4282 /*
4283 * The log can only be created after the kmem subsystem is initialized, as
4284 * btlog creation uses kmem
4285 */
4286 kmem_outlier_log = btlog_create(BTLOG_LOG, KMEM_OUTLIER_LOG_SIZE, 0);
4287 }
4288 STARTUP(ZALLOC, STARTUP_RANK_FIRST, kmem_log_init);
4289
4290 kmem_gobj_stats
4291 kmem_get_gobj_stats(void)
4292 {
4293 kmem_gobj_stats stats = {};
4294
4295 vm_map_lock(kernel_map);
4296 for (uint8_t i = 0; i < kmem_ptr_ranges; i++) {
4297 kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST + i;
4298 struct mach_vm_range range = kmem_ranges[range_id];
4299 struct kmem_page_meta *meta = kmem_meta_hwm[kmem_get_front(range_id, 0)];
4300 struct kmem_page_meta *meta_end;
4301 uint64_t meta_idx = meta - kmem_meta_base[range_id];
4302 vm_map_size_t used = 0, va = 0, meta_sz = 0, pte_sz = 0;
4303 vm_map_offset_t addr;
4304 vm_map_entry_t entry;
4305
4306 /*
4307 * Left front
4308 */
4309 va = (meta_idx * KMEM_CHUNK_SIZE_MIN);
4310 meta_sz = round_page(meta_idx * sizeof(struct kmem_page_meta));
4311
4312 /*
4313 * Right front
4314 */
4315 meta = kmem_meta_hwm[kmem_get_front(range_id, 1)];
4316 meta_end = kmem_addr_to_meta(range.max_address, range_id, &addr,
4317 &meta_idx);
4318 meta_idx = meta_end - meta;
4319 meta_sz += round_page(meta_idx * sizeof(struct kmem_page_meta));
4320 va += (meta_idx * KMEM_CHUNK_SIZE_MIN);
4321
4322 /*
4323 * Compute VA allocated in entire range
4324 */
4325 if (vm_map_lookup_entry(kernel_map, range.min_address, &entry) == false) {
4326 entry = entry->vme_next;
4327 }
4328 while (entry != vm_map_to_entry(kernel_map) &&
4329 entry->vme_start < range.max_address) {
4330 used += (entry->vme_end - entry->vme_start);
4331 entry = entry->vme_next;
4332 }
4333
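		/*
		 * Rough page-table cost of the VA that is reserved but not
		 * currently backed by map entries, assuming 8 bytes per PTE.
		 */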
4334 pte_sz = round_page(atop(va - used) * 8);
4335
4336 stats.total_used += used;
4337 stats.total_va += va;
4338 stats.pte_sz += pte_sz;
4339 stats.meta_sz += meta_sz;
4340 }
4341 vm_map_unlock(kernel_map);
4342
4343 return stats;
4344 }
4345
4346 #endif /* DEBUG || DEVELOPMENT */
4347
4348 /*
4349 * kmem_init:
4350 *
4351 * Initialize the kernel's virtual memory map, taking
4352 * into account all memory allocated up to this time.
4353 */
4354 __startup_func
4355 void
4356 kmem_init(
4357 vm_offset_t start,
4358 vm_offset_t end)
4359 {
4360 vm_map_offset_t map_start;
4361 vm_map_offset_t map_end;
4362
4363 map_start = vm_map_trunc_page(start,
4364 VM_MAP_PAGE_MASK(kernel_map));
4365 map_end = vm_map_round_page(end,
4366 VM_MAP_PAGE_MASK(kernel_map));
4367
4368 vm_map_will_allocate_early_map(&kernel_map);
4369 #if defined(__arm64__)
4370 kernel_map = vm_map_create_options(pmap_kernel(),
4371 VM_MIN_KERNEL_AND_KEXT_ADDRESS,
4372 VM_MAX_KERNEL_ADDRESS,
4373 VM_MAP_CREATE_DEFAULT);
4374 /*
4375 * Reserve virtual memory allocated up to this time.
4376 */
4377 {
4378 unsigned int region_select = 0;
4379 vm_map_offset_t region_start;
4380 vm_map_size_t region_size;
4381 vm_map_offset_t map_addr;
4382 kern_return_t kr;
4383
4384 while (pmap_virtual_region(region_select, &region_start, &region_size)) {
4385 map_addr = region_start;
4386 kr = vm_map_enter(kernel_map, &map_addr,
4387 vm_map_round_page(region_size,
4388 VM_MAP_PAGE_MASK(kernel_map)),
4389 (vm_map_offset_t) 0,
4390 VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(
4391 .vmkf_no_pmap_check = true,
4392 .vmkf_no_soft_limit = true),
4393 VM_OBJECT_NULL,
4394 (vm_object_offset_t) 0, FALSE, VM_PROT_NONE, VM_PROT_NONE,
4395 VM_INHERIT_DEFAULT);
4396
4397 if (kr != KERN_SUCCESS) {
4398 panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
4399 (uint64_t) start, (uint64_t) end, (uint64_t) region_start,
4400 (uint64_t) region_size, kr);
4401 }
4402
4403 region_select++;
4404 }
4405 }
4406 #else
4407 kernel_map = vm_map_create_options(pmap_kernel(),
4408 VM_MIN_KERNEL_AND_KEXT_ADDRESS, map_end,
4409 VM_MAP_CREATE_DEFAULT);
4410 /*
4411 * Reserve virtual memory allocated up to this time.
4412 */
4413 if (start != VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
4414 vm_map_offset_t map_addr;
4415 kern_return_t kr;
4416
4417 map_addr = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
4418 kr = vm_map_enter(kernel_map,
4419 &map_addr,
4420 (vm_map_size_t)(map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
4421 (vm_map_offset_t) 0,
4422 VM_MAP_KERNEL_FLAGS_FIXED(.vmkf_no_pmap_check = true),
4423 VM_OBJECT_NULL,
4424 (vm_object_offset_t) 0, FALSE,
4425 VM_PROT_NONE, VM_PROT_NONE,
4426 VM_INHERIT_DEFAULT);
4427
4428 if (kr != KERN_SUCCESS) {
4429 panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
4430 (uint64_t) start, (uint64_t) end,
4431 (uint64_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS,
4432 (uint64_t) (map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
4433 kr);
4434 }
4435 }
4436 #endif
4437
4438 kmem_set_user_wire_limits();
4439 }
4440
4441
4442 #pragma mark map copyio
4443 static inline void
4444 current_thread_set_sec_override(bool val)
4445 {
4446 #pragma unused(val)
4447 }
4448
4449 /*
4450 * Note: semantic types aren't used as `copyio` already validates.
4451 */
4452
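/*
 * Each of the copyio wrappers below follows the same pattern: if the target
 * map uses the kernel pmap, do a plain memcpy; if it is the current map, use
 * copyin/copyout directly; otherwise take a reference on the map, switch to
 * it around the copy, then switch back and drop the reference.
 */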
4453 kern_return_t
4454 copyinmap(
4455 vm_map_t map,
4456 vm_map_offset_t fromaddr,
4457 void *todata,
4458 vm_size_t length)
4459 {
4460 kern_return_t kr = KERN_SUCCESS;
4461 vm_map_switch_context_t switch_ctx;
4462
4463 if (vm_map_pmap(map) == pmap_kernel()) {
4464 /* assume a correct copy */
4465 memcpy(todata, CAST_DOWN(void *, fromaddr), length);
4466 } else if (current_map() == map) {
4467 if (copyin(fromaddr, todata, length) != 0) {
4468 kr = KERN_INVALID_ADDRESS;
4469 }
4470 } else {
4471 vm_map_reference(map);
4472 current_thread_set_sec_override(true);
4473 switch_ctx = vm_map_switch_to(map);
4474 if (copyin(fromaddr, todata, length) != 0) {
4475 kr = KERN_INVALID_ADDRESS;
4476 }
4477 current_thread_set_sec_override(false);
4478 vm_map_switch_back(switch_ctx);
4479 vm_map_deallocate(map);
4480 }
4481 return kr;
4482 }
4483
4484 kern_return_t
4485 copyoutmap(
4486 vm_map_t map,
4487 void *fromdata,
4488 vm_map_address_t toaddr,
4489 vm_size_t length)
4490 {
4491 kern_return_t kr = KERN_SUCCESS;
4492 vm_map_switch_context_t switch_ctx;
4493
4494 if (vm_map_pmap(map) == pmap_kernel()) {
4495 /* assume a correct copy */
4496 memcpy(CAST_DOWN(void *, toaddr), fromdata, length);
4497 } else if (current_map() == map) {
4498 if (copyout(fromdata, toaddr, length) != 0) {
4499 ktriage_record(thread_tid(current_thread()),
4500 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
4501 KDBG_TRIAGE_RESERVED,
4502 KDBG_TRIAGE_VM_COPYOUTMAP_SAMEMAP_ERROR),
4503 KERN_INVALID_ADDRESS /* arg */);
4504 kr = KERN_INVALID_ADDRESS;
4505 }
4506 } else {
4507 vm_map_reference(map);
4508 current_thread_set_sec_override(true);
4509 switch_ctx = vm_map_switch_to(map);
4510 if (copyout(fromdata, toaddr, length) != 0) {
4511 ktriage_record(thread_tid(current_thread()),
4512 KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
4513 KDBG_TRIAGE_RESERVED,
4514 KDBG_TRIAGE_VM_COPYOUTMAP_DIFFERENTMAP_ERROR),
4515 KERN_INVALID_ADDRESS /* arg */);
4516 kr = KERN_INVALID_ADDRESS;
4517 }
4518 current_thread_set_sec_override(false);
4519 vm_map_switch_back(switch_ctx);
4520 vm_map_deallocate(map);
4521 }
4522 return kr;
4523 }
4524
4525 kern_return_t
4526 copyoutmap_atomic32(
4527 vm_map_t map,
4528 uint32_t value,
4529 vm_map_address_t toaddr)
4530 {
4531 kern_return_t kr = KERN_SUCCESS;
4532 vm_map_switch_context_t switch_ctx;
4533
4534 if (vm_map_pmap(map) == pmap_kernel()) {
4535 /* assume a correct toaddr */
4536 *(uint32_t *)toaddr = value;
4537 } else if (current_map() == map) {
4538 if (copyout_atomic32(value, toaddr) != 0) {
4539 kr = KERN_INVALID_ADDRESS;
4540 }
4541 } else {
4542 vm_map_reference(map);
4543 current_thread_set_sec_override(true);
4544 switch_ctx = vm_map_switch_to(map);
4545 if (copyout_atomic32(value, toaddr) != 0) {
4546 kr = KERN_INVALID_ADDRESS;
4547 }
4548 current_thread_set_sec_override(false);
4549 vm_map_switch_back(switch_ctx);
4550 vm_map_deallocate(map);
4551 }
4552 return kr;
4553 }
4554
4555 kern_return_t
4556 copyoutmap_atomic64(
4557 vm_map_t map,
4558 uint64_t value,
4559 vm_map_address_t toaddr)
4560 {
4561 kern_return_t kr = KERN_SUCCESS;
4562 vm_map_switch_context_t switch_ctx;
4563
4564 if (vm_map_pmap(map) == pmap_kernel()) {
4565 /* assume a correct toaddr */
4566 *(uint64_t *)toaddr = value;
4567 } else if (current_map() == map) {
4568 if (copyout_atomic64(value, toaddr) != 0) {
4569 kr = KERN_INVALID_ADDRESS;
4570 }
4571 } else {
4572 vm_map_reference(map);
4573 current_thread_set_sec_override(true);
4574 switch_ctx = vm_map_switch_to(map);
4575 if (copyout_atomic64(value, toaddr) != 0) {
4576 kr = KERN_INVALID_ADDRESS;
4577 }
4578 current_thread_set_sec_override(false);
4579 vm_map_switch_back(switch_ctx);
4580 vm_map_deallocate(map);
4581 }
4582 return kr;
4583 }
4584
4585
4586 #pragma mark pointer obfuscation / packing
4587
4588 /*
4589 *
4590 * The following two functions are to be used when exposing kernel
4591 * addresses to userspace via any of the various debug or info
4592 * facilities that exist. These are basically the same as VM_KERNEL_ADDRPERM()
4593 * and VM_KERNEL_UNSLIDE_OR_PERM() except they use a different random seed and
4594 * are exported to KEXTs.
4595 *
4596 * NOTE: USE THE MACRO VERSIONS OF THESE FUNCTIONS (in vm_param.h) FROM WITHIN THE KERNEL
4597 */
4598
4599 vm_offset_t
4600 vm_kernel_addrhash_internal(vm_offset_t addr, uint64_t salt)
4601 {
4602 assert(salt != 0);
4603
4604 if (addr == 0) {
4605 return 0ul;
4606 }
4607
4608 if (VM_KERNEL_IS_SLID(addr)) {
4609 return VM_KERNEL_UNSLIDE(addr);
4610 }
4611
4612 addr = VM_KERNEL_STRIP_UPTR(addr);
4613
4614 vm_offset_t sha_digest[SHA256_DIGEST_LENGTH / sizeof(vm_offset_t)];
4615 SHA256_CTX sha_ctx;
4616
4617 SHA256_Init(&sha_ctx);
4618 SHA256_Update(&sha_ctx, &salt, sizeof(salt));
4619 SHA256_Update(&sha_ctx, &addr, sizeof(addr));
4620 SHA256_Final(sha_digest, &sha_ctx);
4621
4622 return sha_digest[0];
4623 }
4624
4625 __exported vm_offset_t
4626 vm_kernel_addrhash_external(vm_offset_t addr);
4627 vm_offset_t
4628 vm_kernel_addrhash_external(vm_offset_t addr)
4629 {
4630 return vm_kernel_addrhash_internal(addr, vm_kernel_addrhash_salt_ext);
4631 }
4632
4633 void
4634 vm_kernel_addrhide(
4635 vm_offset_t addr,
4636 vm_offset_t *hide_addr)
4637 {
4638 *hide_addr = VM_KERNEL_ADDRHIDE(addr);
4639 }
4640
4641 void
4642 vm_kernel_addrperm_external(
4643 vm_offset_t addr,
4644 vm_offset_t *perm_addr)
4645 {
4646 if (VM_KERNEL_IS_SLID(addr)) {
4647 *perm_addr = VM_KERNEL_UNSLIDE(addr);
4648 } else if (VM_KERNEL_ADDRESS(addr)) {
4649 *perm_addr = addr + vm_kernel_addrperm_ext;
4650 } else {
4651 *perm_addr = addr;
4652 }
4653 }
4654
4655 void
4656 vm_kernel_unslide_or_perm_external(
4657 vm_offset_t addr,
4658 vm_offset_t *up_addr)
4659 {
4660 vm_kernel_addrperm_external(addr, up_addr);
4661 }
4662
4663 void
4664 vm_packing_pointer_invalid(vm_offset_t ptr, vm_packing_params_t params)
4665 {
4666 if (ptr & ((1ul << params.vmpp_shift) - 1)) {
4667 panic("pointer %p can't be packed: low %d bits aren't 0",
4668 (void *)ptr, params.vmpp_shift);
4669 } else if (ptr <= params.vmpp_base) {
4670 panic("pointer %p can't be packed: below base %p",
4671 (void *)ptr, (void *)params.vmpp_base);
4672 } else {
4673 panic("pointer %p can't be packed: maximum encodable pointer is %p",
4674 (void *)ptr, (void *)vm_packing_max_packable(params));
4675 }
4676 }
4677
4678 void
4679 vm_packing_verify_range(
4680 const char *subsystem,
4681 vm_offset_t min_address,
4682 vm_offset_t max_address,
4683 vm_packing_params_t params)
4684 {
4685 if (min_address > max_address) {
4686 panic("%s: %s range invalid min:%p > max:%p",
4687 __func__, subsystem, (void *)min_address, (void *)max_address);
4688 }
4689
4690 if (!params.vmpp_base_relative) {
4691 return;
4692 }
4693
4694 if (min_address <= params.vmpp_base) {
4695 panic("%s: %s range invalid min:%p <= base:%p",
4696 __func__, subsystem, (void *)min_address, (void *)params.vmpp_base);
4697 }
4698
4699 if (max_address > vm_packing_max_packable(params)) {
4700 panic("%s: %s range invalid max:%p >= max packable:%p",
4701 __func__, subsystem, (void *)max_address,
4702 (void *)vm_packing_max_packable(params));
4703 }
4704 }
4705
4706 #pragma mark tests
4707 #if MACH_ASSERT
4708 #include <sys/errno.h>
4709
4710 static void
4711 kmem_test_for_entry(
4712 vm_map_t map,
4713 vm_offset_t addr,
4714 void (^block)(vm_map_entry_t))
4715 {
4716 vm_map_entry_t entry;
4717
4718 vm_map_lock(map);
4719 block(vm_map_lookup_entry(map, addr, &entry) ? entry : NULL);
4720 vm_map_unlock(map);
4721 }
4722
4723 #define kmem_test_assert_map(map, pg, entries) ({ \
4724 assert3u((map)->size, ==, ptoa(pg)); \
4725 assert3u((map)->hdr.nentries, ==, entries); \
4726 })
4727
4728 static bool
4729 can_write_at(vm_offset_t offs, uint32_t page)
4730 {
4731 static const int zero;
4732
4733 return verify_write(&zero, (void *)(offs + ptoa(page) + 128), 1) == 0;
4734 }
4735 #define assert_writeable(offs, page) \
4736 assertf(can_write_at(offs, page), \
4737 "can write at %p + ptoa(%d)", (void *)offs, page)
4738
4739 #define assert_faults(offs, page) \
4740 assertf(!can_write_at(offs, page), \
4741 "can write at %p + ptoa(%d)", (void *)offs, page)
4742
4743 #define peek(offs, page) \
4744 (*(uint32_t *)((offs) + ptoa(page)))
4745
4746 #define poke(offs, page, v) \
4747 (*(uint32_t *)((offs) + ptoa(page)) = (v))
4748
4749 #if CONFIG_SPTM
4750 __attribute__((noinline))
4751 static void
4752 kmem_test_verify_type_policy(vm_offset_t addr, kmem_flags_t flags)
4753 {
4754 extern bool use_xnu_restricted;
4755 pmap_mapping_type_t expected_type = PMAP_MAPPING_TYPE_RESTRICTED;
4756
4757 /* Explicitly state the expected policy */
4758 if (flags & (KMEM_DATA | KMEM_COMPRESSOR | KMEM_DATA_SHARED)) {
4759 expected_type = PMAP_MAPPING_TYPE_DEFAULT;
4760 }
4761
4762 /* If X_K_R is disabled, DEFAULT is the only possible mapping */
4763 if (!use_xnu_restricted) {
4764 expected_type = PMAP_MAPPING_TYPE_DEFAULT;
4765 }
4766
4767 /* Verify if derived correctly */
4768 assert3u(expected_type, ==, __kmem_mapping_type(flags));
4769
4770 pmap_paddr_t pa = kvtophys(addr);
4771 if (pa == 0) {
4772 return;
4773 }
4774
4775 /* Verify if the mapped address actually got the expected type */
4776 assert3u(expected_type, ==, sptm_get_frame_type(pa));
4777 }
4778 #endif /* CONFIG_SPTM */
4779
4780 __attribute__((noinline))
4781 static void
4782 kmem_alloc_basic_test(vm_map_t map)
4783 {
4784 kmem_guard_t guard = {
4785 .kmg_tag = VM_KERN_MEMORY_DIAG,
4786 };
4787 vm_offset_t addr;
4788
4789 /*
4790 * Test wired basics:
4791 * - KMA_KOBJECT
4792 * - KMA_GUARD_FIRST, KMA_GUARD_LAST
4793 * - allocation alignment
4794 */
4795 addr = kmem_alloc_guard(map, ptoa(10), ptoa(2) - 1,
4796 KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard).kmr_address;
4797 assertf(addr != 0ull, "kma(%p, 10p, 0, KO | GF | GL)", map);
4798 assert3u((addr + PAGE_SIZE) % ptoa(2), ==, 0);
4799 kmem_test_assert_map(map, 10, 1);
4800
4801 kmem_test_for_entry(map, addr, ^(__assert_only vm_map_entry_t e){
4802 assertf(e, "unable to find address %p in map %p", (void *)addr, map);
4803 assert(e->vme_kernel_object);
4804 assert(!e->vme_atomic);
4805 assert3u(e->vme_start, <=, addr);
4806 assert3u(addr + ptoa(10), <=, e->vme_end);
4807 });
4808
4809 assert_faults(addr, 0);
4810 for (int i = 1; i < 9; i++) {
4811 assert_writeable(addr, i);
4812 }
4813 assert_faults(addr, 9);
4814
4815 kmem_free(map, addr, ptoa(10));
4816 kmem_test_assert_map(map, 0, 0);
4817
4818 /*
4819 * Test pageable basics.
4820 */
4821 addr = kmem_alloc_guard(map, ptoa(10), 0,
4822 KMA_PAGEABLE, guard).kmr_address;
4823 assertf(addr != 0ull, "kma(%p, 10p, 0, KO | PG)", map);
4824 kmem_test_assert_map(map, 10, 1);
4825
4826 for (int i = 0; i < 9; i++) {
4827 assert_faults(addr, i);
4828 poke(addr, i, 42);
4829 assert_writeable(addr, i);
4830 }
4831
4832 kmem_free_guard(map, addr, ptoa(10),
4833 KMF_GUARD_FIRST | KMF_GUARD_LAST, guard);
4834 kmem_test_assert_map(map, 0, 0);
4835 }
4836
4837 __attribute__((noinline))
4838 static void
4839 kmem_realloc_basic_test(vm_map_t map, kmr_flags_t kind)
4840 {
4841 kmem_guard_t guard = {
4842 .kmg_atomic = !(kind & KMR_DATA),
4843 .kmg_tag = VM_KERN_MEMORY_DIAG,
4844 .kmg_context = 0xefface,
4845 };
4846 vm_offset_t addr, newaddr;
4847 const int N = 10;
4848
4849 /*
4850 * This isn't something kmem_realloc_guard() _needs_ to do,
4851 * we could conceive an implementation where it grows in place
4852 * if there's space after it.
4853 *
4854 * However, this is what the implementation does today.
4855 */
4856 bool realloc_growth_changes_address = true;
4857 bool GF = (kind & KMR_GUARD_FIRST);
4858 bool GL = (kind & KMR_GUARD_LAST);
4859
4860 /*
4861 * Initial N page allocation
4862 */
4863 addr = kmem_alloc_guard(map, ptoa(N), 0,
4864 (kind & ~KMEM_FREEOLD) | KMA_ZERO, guard).kmr_address;
4865 assert3u(addr, !=, 0);
4866
4867 kmem_test_assert_map(map, N, 1);
4868 for (int pg = GF; pg < N - GL; pg++) {
4869 poke(addr, pg, 42 + pg);
4870 }
4871 for (int pg = N - GL; pg < N; pg++) {
4872 assert_faults(addr, pg);
4873 }
4874
4875 #if CONFIG_SPTM
4876 kmem_test_verify_type_policy(addr, ANYF(kind));
4877 #endif /* CONFIG_SPTM */
4878 /*
4879 * Grow to N + 3 pages
4880 */
4881 newaddr = kmem_realloc_guard(map, addr, ptoa(N), ptoa(N + 3),
4882 kind | KMR_ZERO, guard).kmr_address;
4883 assert3u(newaddr, !=, 0);
4884 if (realloc_growth_changes_address) {
4885 assert3u(addr, !=, newaddr);
4886 }
4887 if ((kind & KMR_FREEOLD) || (addr == newaddr)) {
4888 kmem_test_assert_map(map, N + 3, 1);
4889 } else {
4890 kmem_test_assert_map(map, 2 * N + 3, 2);
4891 }
4892 for (int pg = GF; pg < N - GL; pg++) {
4893 assert3u(peek(newaddr, pg), ==, 42 + pg);
4894 }
4895 if ((kind & KMR_FREEOLD) == 0) {
4896 for (int pg = GF; pg < N - GL; pg++) {
4897 assert3u(peek(addr, pg), ==, 42 + pg);
4898 }
4899 /* check that the old and new allocations truly share the same pages */
4900 poke(addr + 16, 0, 1234);
4901 assert3u(peek(newaddr + 16, 0), ==, 1234);
4902 kmem_free_guard(map, addr, ptoa(N),
4903 kind & (KMF_TAG | KMF_GUARD_FIRST | KMF_GUARD_LAST), guard);
4904 kmem_test_assert_map(map, N + 3, 1);
4905 }
4906 if (addr != newaddr) {
4907 for (int pg = GF; pg < N - GL; pg++) {
4908 assert_faults(addr, pg);
4909 }
4910 }
4911 for (int pg = N - GL; pg < N + 3 - GL; pg++) {
4912 assert3u(peek(newaddr, pg), ==, 0);
4913 }
4914 for (int pg = N + 3 - GL; pg < N + 3; pg++) {
4915 assert_faults(newaddr, pg);
4916 }
4917 addr = newaddr;
4918
4919
4920 /*
4921 * Shrink to N - 2 pages
4922 */
4923 newaddr = kmem_realloc_guard(map, addr, ptoa(N + 3), ptoa(N - 2),
4924 kind | KMR_ZERO, guard).kmr_address;
4925 assert3u(map->size, ==, ptoa(N - 2));
4926 assert3u(newaddr, ==, addr);
4927 kmem_test_assert_map(map, N - 2, 1);
4928
4929 for (int pg = GF; pg < N - 2 - GL; pg++) {
4930 assert3u(peek(addr, pg), ==, 42 + pg);
4931 }
4932 for (int pg = N - 2 - GL; pg < N + 3; pg++) {
4933 assert_faults(addr, pg);
4934 }
4935
4936 kmem_free_guard(map, addr, ptoa(N - 2),
4937 kind & (KMF_TAG | KMF_GUARD_FIRST | KMF_GUARD_LAST), guard);
4938 kmem_test_assert_map(map, 0, 0);
4939 }
4940
4941 static int
4942 kmem_basic_test(__unused int64_t in, int64_t *out)
4943 {
4944 mach_vm_offset_t addr;
4945 vm_map_t map;
4946
4947 printf("%s: test running\n", __func__);
4948
4949 map = kmem_suballoc(kernel_map, &addr, 64U << 20,
4950 VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
4951 KMS_NOFAIL | KMS_DATA, VM_KERN_MEMORY_DIAG).kmr_submap;
4952
4953 printf("%s: kmem_alloc ...\n", __func__);
4954 kmem_alloc_basic_test(map);
4955 printf("%s: PASS\n", __func__);
4956
4957 printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
4958 kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD);
4959 printf("%s: PASS\n", __func__);
4960
4961 printf("%s: kmem_realloc (KMR_FREEOLD) ...\n", __func__);
4962 kmem_realloc_basic_test(map, KMR_FREEOLD);
4963 printf("%s: PASS\n", __func__);
4964
4965 printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
4966 kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST);
4967 printf("%s: PASS\n", __func__);
4968
4969 printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
4970 kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
4971 printf("%s: PASS\n", __func__);
4972
4973 printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
4974 kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
4975 printf("%s: PASS\n", __func__);
4976
4977 printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
4978 kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_FIRST);
4979 printf("%s: PASS\n", __func__);
4980
4981 printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
4982 kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_LAST);
4983 printf("%s: PASS\n", __func__);
4984
4985 printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
4986 kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
4987 printf("%s: PASS\n", __func__);
4988
4989
4990 /* using KMR_DATA signals to test the non-atomic realloc path */
4991 printf("%s: kmem_realloc (KMR_DATA | KMR_FREEOLD) ...\n", __func__);
4992 kmem_realloc_basic_test(map, KMR_DATA | KMR_FREEOLD);
4993 printf("%s: PASS\n", __func__);
4994
4995 printf("%s: kmem_realloc (KMR_DATA) ...\n", __func__);
4996 kmem_realloc_basic_test(map, KMR_DATA);
4997 printf("%s: PASS\n", __func__);
4998
4999 /* test KMR_DATA_SHARED for the new shared kheap */
5000 printf("%s: kmem_realloc (KMR_DATA_SHARED) ...\n", __func__);
5001 kmem_realloc_basic_test(map, KMR_DATA_SHARED);
5002 printf("%s: PASS\n", __func__);
5003
5004 kmem_free_guard(kernel_map, addr, 64U << 20, KMF_NONE, KMEM_GUARD_SUBMAP);
5005 vm_map_deallocate(map);
5006
5007 printf("%s: test passed\n", __func__);
5008 *out = 1;
5009 return 0;
5010 }
5011 SYSCTL_TEST_REGISTER(kmem_basic, kmem_basic_test);
5012
5013 static void
5014 kmem_test_get_size_idx_for_chunks(uint32_t chunks)
5015 {
5016 __assert_only uint32_t idx = kmem_get_size_idx_for_chunks(chunks);
5017
5018 assert(chunks >= kmem_size_array[idx].ks_num_chunk);
5019 }
5020
5021 __attribute__((noinline))
5022 static void
5023 kmem_test_get_size_idx_for_all_chunks()
5024 {
5025 for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
5026 uint32_t chunks = kmem_size_array[i].ks_num_chunk;
5027
5028 if (chunks != 1) {
5029 kmem_test_get_size_idx_for_chunks(chunks - 1);
5030 }
5031 kmem_test_get_size_idx_for_chunks(chunks);
5032 kmem_test_get_size_idx_for_chunks(chunks + 1);
5033 }
5034 }
5035
5036 static int
5037 kmem_guard_obj_test(__unused int64_t in, int64_t *out)
5038 {
5039 printf("%s: test running\n", __func__);
5040
5041 printf("%s: kmem_get_size_idx_for_chunks\n", __func__);
5042 kmem_test_get_size_idx_for_all_chunks();
5043 printf("%s: PASS\n", __func__);
5044
5045 printf("%s: test passed\n", __func__);
5046 *out = 1;
5047 return 0;
5048 }
5049 SYSCTL_TEST_REGISTER(kmem_guard_obj, kmem_guard_obj_test);
5050
5051
5052 #endif /* MACH_ASSERT */
5053