/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	vm/vm_kern.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *	Date:	1985
 *
 *	Kernel memory management.
 */

#include <mach/kern_return.h>
#include <mach/vm_param.h>
#include <kern/assert.h>
#include <kern/thread.h>
#include <vm/vm_kern_internal.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_object_internal.h>
#include <vm/vm_page_internal.h>
#include <vm/vm_compressor_xnu.h>
#include <vm/vm_pageout_xnu.h>
#include <vm/vm_init_xnu.h>
#include <vm/vm_fault.h>
#include <vm/vm_memtag.h>
#if HAS_MTE
#include <vm/vm_mteinfo_internal.h>
#endif /* HAS_MTE */
#include <vm/vm_far.h>
#include <kern/misc_protos.h>
#include <vm/cpm_internal.h>
#include <kern/ledger.h>
#include <kern/bits.h>
#include <kern/startup.h>
#include <kern/telemetry.h>

#include <string.h>

#include <libkern/OSDebug.h>
#include <libkern/crypto/sha2.h>
#include <libkern/section_keywords.h>
#include <sys/kdebug.h>
#include <sys/kdebug_triage.h>

#include <san/kasan.h>
#include <kern/kext_alloc.h>
#include <kern/backtrace.h>
#include <os/hash.h>
#include <kern/zalloc_internal.h>
#include <libkern/crypto/rand.h>

/*
 * Variables exported by this module.
 */

SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map;
SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_ranges[KMEM_RANGE_COUNT];
SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_large_ranges[KMEM_RANGE_COUNT];

static TUNABLE(uint32_t, kmem_ptr_ranges, "kmem_ptr_ranges",
    KMEM_RANGE_ID_NUM_PTR);
#define KMEM_GOBJ_THRESHOLD     (32ULL << 20)
#if DEBUG || DEVELOPMENT
#define KMEM_OUTLIER_LOG_SIZE   (16ULL << 10)
#define KMEM_OUTLIER_SIZE       0
#define KMEM_OUTLIER_ALIGN      1
btlog_t kmem_outlier_log;
#endif /* DEBUG || DEVELOPMENT */

__startup_data static vm_map_size_t data_range_size;
__startup_data static vm_map_size_t shared_data_range_size;
__startup_data static vm_map_size_t ptr_range_size;
__startup_data static vm_map_size_t sprayqtn_range_size;

#pragma mark helpers

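/*
 * The ANYF() overloads below erase the specific flag family
 * (kma_flags_t for alloc, kmr_flags_t for realloc, kmf_flags_t for free)
 * into the generic kmem_flags_t, so that the __kmem_* helpers can be
 * shared across all three entry points.
 */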
__attribute__((overloadable))
__header_always_inline kmem_flags_t
ANYF(kma_flags_t flags)
{
	return (kmem_flags_t)flags;
}

__attribute__((overloadable))
__header_always_inline kmem_flags_t
ANYF(kmr_flags_t flags)
{
	return (kmem_flags_t)flags;
}

__attribute__((overloadable))
__header_always_inline kmem_flags_t
ANYF(kmf_flags_t flags)
{
	return (kmem_flags_t)flags;
}

__abortlike
static void
__kmem_invalid_size_panic(
	vm_map_t map,
	vm_size_t size,
	uint32_t flags)
{
	panic("kmem(map=%p, flags=0x%x): invalid size %zd",
	    map, flags, (size_t)size);
}

__abortlike
static void
__kmem_invalid_arguments_panic(
	const char *what,
	vm_map_t map,
	vm_address_t address,
	vm_size_t size,
	uint32_t flags)
{
	panic("kmem_%s(map=%p, addr=%p, size=%zd, flags=0x%x): "
	    "invalid arguments passed",
	    what, map, (void *)address, (size_t)size, flags);
}

__abortlike
static void
__kmem_failed_panic(
	vm_map_t map,
	vm_size_t size,
	uint32_t flags,
	kern_return_t kr,
	const char *what)
{
	panic("kmem_%s(%p, %zd, 0x%x): failed with %d",
	    what, map, (size_t)size, flags, kr);
}

__abortlike
static void
__kmem_entry_not_found_panic(
	vm_map_t map,
	vm_offset_t addr)
{
	panic("kmem(map=%p) no entry found at %p", map, (void *)addr);
}

static inline vm_object_t
__kmem_object(kmem_flags_t flags)
{
	if (flags & KMEM_COMPRESSOR) {
		if (flags & KMEM_KOBJECT) {
			panic("both KMEM_KOBJECT and KMEM_COMPRESSOR specified");
		}
		return compressor_object;
	}
	if (!(flags & KMEM_KOBJECT)) {
		panic("KMEM_KOBJECT or KMEM_COMPRESSOR is required");
	}
#if HAS_MTE
	if (flags & KMEM_TAG) {
		return kernel_object_tagged;
	}
#endif /* HAS_MTE */
	return kernel_object_default;
}

static inline pmap_mapping_type_t
__kmem_mapping_type(kmem_flags_t flags)
{
	if (flags & (KMEM_COMPRESSOR | KMEM_DATA_SHARED)) {
		return PMAP_MAPPING_TYPE_DEFAULT;
	} else if (flags & KMEM_DATA) {
		return kalloc_is_restricted_data_mode_enforced() ?
		    PMAP_MAPPING_TYPE_RESTRICTED : PMAP_MAPPING_TYPE_DEFAULT;
	} else {
		return PMAP_MAPPING_TYPE_RESTRICTED;
	}
}

static inline vm_size_t
__kmem_guard_left(kmem_flags_t flags)
{
	return (flags & KMEM_GUARD_FIRST) ? PAGE_SIZE : 0;
}

static inline vm_size_t
__kmem_guard_right(kmem_flags_t flags)
{
	return (flags & KMEM_GUARD_LAST) ? PAGE_SIZE : 0;
}

static inline vm_size_t
__kmem_guard_size(kmem_flags_t flags)
{
	return __kmem_guard_left(flags) + __kmem_guard_right(flags);
}

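/*
 * Recover the size the caller originally requested, before page rounding:
 * for kernel-object entries the rounding delta is stashed in
 * vme_object_or_delta (see the KASAN path in kmem_alloc_guard_internal()
 * below), while regular objects keep it in vo_size_delta.
 */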
__pure2
static inline vm_size_t
__kmem_entry_orig_size(vm_map_entry_t entry)
{
	vm_object_t object = VME_OBJECT(entry);

	if (entry->vme_kernel_object) {
		return entry->vme_end - entry->vme_start -
		    entry->vme_object_or_delta;
	} else {
		return object->vo_size - object->vo_size_delta;
	}
}


#pragma mark kmem range methods

#define mach_vm_range_load(r, rmin, rmax) \
	({ (rmin) = (r)->min_address; (rmax) = (r)->max_address; })

__abortlike
static void
__mach_vm_range_overflow(
	mach_vm_offset_t addr,
	mach_vm_offset_t size)
{
	panic("invalid vm range: [0x%llx, 0x%llx + 0x%llx) wraps around",
	    addr, addr, size);
}

__abortlike
static void
__mach_vm_range_invalid(
	mach_vm_offset_t min_address,
	mach_vm_offset_t max_address)
{
	panic("invalid vm range: [0x%llx, 0x%llx) wraps around",
	    min_address, max_address);
}

__header_always_inline mach_vm_size_t
mach_vm_range_size(const struct mach_vm_range *r)
{
	mach_vm_offset_t rmin, rmax;

	mach_vm_range_load(r, rmin, rmax);
	return rmax - rmin;
}

__attribute__((overloadable))
__header_always_inline bool
mach_vm_range_contains(const struct mach_vm_range *r, mach_vm_offset_t addr)
{
	mach_vm_offset_t rmin, rmax;
	/*
	 * The `&` is not a typo: we really expect the check to pass,
	 * so encourage the compiler to eagerly load and test without branches
	 */
	mach_vm_range_load(r, rmin, rmax);
	return (addr >= rmin) & (addr < rmax);
}

__attribute__((overloadable))
__header_always_inline bool
mach_vm_range_contains(
	const struct mach_vm_range *r,
	mach_vm_offset_t addr,
	mach_vm_offset_t size)
{
	mach_vm_offset_t rmin, rmax;
	mach_vm_offset_t end;

	if (__improbable(os_add_overflow(addr, size, &end))) {
		return false;
	}

	/*
	 * The `&` is not a typo: we really expect the check to pass,
	 * so encourage the compiler to eagerly load and test without branches
	 */
	mach_vm_range_load(r, rmin, rmax);
	return (addr >= rmin) & (end >= rmin) & (end <= rmax);
}

__attribute__((overloadable))
__header_always_inline bool
mach_vm_range_intersects(
	const struct mach_vm_range *r1,
	const struct mach_vm_range *r2)
{
	mach_vm_offset_t r1_min, r1_max;
	mach_vm_offset_t r2_min, r2_max;

	mach_vm_range_load(r1, r1_min, r1_max);
	r2_min = r2->min_address;
	r2_max = r2->max_address;

	if (r1_min > r1_max) {
		__mach_vm_range_invalid(r1_min, r1_max);
	}

	if (r2_min > r2_max) {
		__mach_vm_range_invalid(r2_min, r2_max);
	}

	return r1_max > r2_min && r1_min < r2_max;
}

__attribute__((overloadable))
__header_always_inline bool
mach_vm_range_intersects(
	const struct mach_vm_range *r1,
	mach_vm_offset_t addr,
	mach_vm_offset_t size)
{
	struct mach_vm_range r2;

	r2.min_address = addr;
	if (os_add_overflow(addr, size, &r2.max_address)) {
		__mach_vm_range_overflow(addr, size);
	}

	return mach_vm_range_intersects(r1, &r2);
}

bool
kmem_range_id_contains(
	kmem_range_id_t range_id,
	vm_map_offset_t addr,
	vm_map_size_t size)
{
	return mach_vm_range_contains(&kmem_ranges[range_id],
	    vm_memtag_canonicalize_kernel(addr), size);
}

__abortlike
static void
kmem_range_invalid_panic(
	kmem_range_id_t range_id,
	vm_map_offset_t addr,
	vm_map_size_t size)
{
	const struct mach_vm_range *r = &kmem_ranges[range_id];
	mach_vm_offset_t rmin, rmax;

	mach_vm_range_load(r, rmin, rmax);
	if (addr + size < rmin) {
		panic("addr %p + size %llu overflows %p", (void *)addr, size,
		    (void *)(addr + size));
	}
	panic("addr %p + size %llu doesn't fit in one range (id: %u min: %p max: %p)",
	    (void *)addr, size, range_id, (void *)rmin, (void *)rmax);
}

/*
 * Return whether the entire allocation is contained in the given range
 */
static bool
kmem_range_contains_fully(
	kmem_range_id_t range_id,
	vm_map_offset_t addr,
	vm_map_size_t size)
{
	const struct mach_vm_range *r = &kmem_ranges[range_id];
	mach_vm_offset_t rmin, rmax;
	bool result = false;

	if (VM_KERNEL_ADDRESS(addr)) {
		addr = vm_memtag_canonicalize_kernel(addr);
	}

	/*
	 * The `&` is not a typo: we really expect the check to pass,
	 * so encourage the compiler to eagerly load and test without branches
	 */
	mach_vm_range_load(r, rmin, rmax);
	result = (addr >= rmin) & (addr < rmax);
	if (__improbable(result
	    && ((addr + size < rmin) || (addr + size > rmax)))) {
		kmem_range_invalid_panic(range_id, addr, size);
	}
	return result;
}

vm_map_size_t
kmem_range_id_size(kmem_range_id_t range_id)
{
	return mach_vm_range_size(&kmem_ranges[range_id]);
}

kmem_range_id_t
kmem_addr_get_range(vm_map_offset_t addr, vm_map_size_t size)
{
	kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;

	for (; range_id < KMEM_RANGE_COUNT; range_id++) {
		if (kmem_range_contains_fully(range_id, addr, size)) {
			return range_id;
		}
	}
	return KMEM_RANGE_ID_NONE;
}

bool
kmem_is_ptr_range(vm_map_range_id_t range_id)
{
	return (range_id >= KMEM_RANGE_ID_FIRST) &&
	    (range_id <= KMEM_RANGE_ID_NUM_PTR);
}

__abortlike
static void
kmem_range_invalid_for_overwrite(vm_map_offset_t addr)
{
	panic("Can't overwrite mappings (addr: %p) in kmem ptr ranges",
	    (void *)addr);
}

mach_vm_range_t
kmem_validate_range_for_overwrite(
	vm_map_offset_t addr,
	vm_map_size_t size)
{
	vm_map_range_id_t range_id = kmem_addr_get_range(addr, size);

	if (kmem_is_ptr_range(range_id)) {
		kmem_range_invalid_for_overwrite(addr);
	}

	return &kmem_ranges[range_id];
}


#pragma mark entry parameters


__abortlike
static void
__kmem_entry_validate_panic(
	vm_map_t map,
	vm_map_entry_t entry,
	vm_offset_t addr,
	vm_size_t size,
	uint32_t flags,
	kmem_guard_t guard)
{
	const char *what = "???";

	if (entry->vme_atomic != guard.kmg_atomic) {
		what = "atomicity";
	} else if (entry->is_sub_map != guard.kmg_submap) {
		what = "objectness";
	} else if (addr != entry->vme_start) {
		what = "left bound";
	} else if ((flags & KMF_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
		what = "right bound";
	} else if (guard.kmg_context != entry->vme_context) {
		what = "guard";
	}

	panic("kmem(map=%p, addr=%p, size=%zd, flags=0x%x): "
	    "entry:%p %s mismatch guard(0x%08x)",
	    map, (void *)addr, size, flags, entry,
	    what, guard.kmg_context);
}

static bool
__kmem_entry_validate_guard(
	vm_map_entry_t entry,
	vm_offset_t addr,
	vm_size_t size,
	kmem_flags_t flags,
	kmem_guard_t guard)
{
	if (entry->vme_atomic != guard.kmg_atomic) {
		return false;
	}

	if (!guard.kmg_atomic) {
		return true;
	}

	if (entry->is_sub_map != guard.kmg_submap) {
		return false;
	}

	if (addr != entry->vme_start) {
		return false;
	}

	if ((flags & KMEM_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
		return false;
	}

	if (!guard.kmg_submap && guard.kmg_context != entry->vme_context) {
		return false;
	}

	return true;
}

void
kmem_entry_validate_guard(
	vm_map_t map,
	vm_map_entry_t entry,
	vm_offset_t addr,
	vm_size_t size,
	kmem_guard_t guard)
{
	if (!__kmem_entry_validate_guard(entry, addr, size, KMEM_NONE, guard)) {
		__kmem_entry_validate_panic(map, entry, addr, size, KMEM_NONE, guard);
	}
}

__abortlike
static void
__kmem_entry_validate_object_panic(
	vm_map_t map,
	vm_map_entry_t entry,
	kmem_flags_t flags)
{
	const char *what;
	const char *verb;

	if (entry->is_sub_map) {
		panic("kmem(map=%p) entry %p is a submap", map, entry);
	}

	if (flags & KMEM_KOBJECT) {
		what = "kernel";
		verb = "isn't";
	} else if (flags & KMEM_COMPRESSOR) {
		what = "compressor";
		verb = "isn't";
	} else if (entry->vme_kernel_object) {
		what = "kernel";
		verb = "is unexpectedly";
	} else {
		what = "compressor";
		verb = "is unexpectedly";
	}

	panic("kmem(map=%p, flags=0x%x): entry %p %s for the %s object",
	    map, flags, entry, verb, what);
}

static bool
__kmem_entry_validate_object(
	vm_map_entry_t entry,
	kmem_flags_t flags)
{
	if (entry->is_sub_map) {
		return false;
	}
	if ((bool)(flags & KMEM_KOBJECT) != entry->vme_kernel_object) {
		return false;
	}

	return (bool)(flags & KMEM_COMPRESSOR) ==
	    (VME_OBJECT(entry) == compressor_object);
}

vm_size_t
kmem_size_guard(
	vm_map_t map,
	vm_offset_t addr,
	kmem_guard_t guard)
{
	kmem_flags_t flags = KMEM_GUESS_SIZE;
	vm_map_entry_t entry;
	vm_size_t size;

	vmlp_api_start(KMEM_SIZE_GUARD);

	vm_map_lock_read(map);

#if KASAN_CLASSIC
	addr -= PAGE_SIZE;
#endif /* KASAN_CLASSIC */
	addr = vm_memtag_canonicalize_kernel(addr);

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		__kmem_entry_not_found_panic(map, addr);
	}

	vmlp_range_event_entry(map, entry);

	if (!__kmem_entry_validate_guard(entry, addr, 0, flags, guard)) {
		__kmem_entry_validate_panic(map, entry, addr, 0, flags, guard);
	}

	size = __kmem_entry_orig_size(entry);

	vm_map_unlock_read(map);

	vmlp_api_end(KMEM_SIZE_GUARD, 0);
	return size;
}

static inline uint16_t
kmem_hash_backtrace(
	void *fp)
{
	uint64_t bt_count;
	uintptr_t bt[8] = {};

	struct backtrace_control ctl = {
		.btc_frame_addr = (uintptr_t)fp,
	};

	bt_count = backtrace(bt, sizeof(bt) / sizeof(bt[0]), &ctl, NULL);
	return (uint16_t) os_hash_jenkins(bt, bt_count * sizeof(bt[0]));
}

static_assert(KMEM_RANGE_ID_DATA_SHARED - 1 <= KMEM_RANGE_MASK,
    "Insufficient bits to represent ptr ranges");

kmem_range_id_t
kmem_adjust_range_id(
	uint32_t hash)
{
	return (kmem_range_id_t) (KMEM_RANGE_ID_PTR_0 +
	    (hash & KMEM_RANGE_MASK) % kmem_ptr_ranges);
}
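
/*
 * A type hash encodes both a pointer range (its low bits, folded modulo
 * kmem_ptr_ranges above) and an allocation direction (KMEM_DIRECTION_MASK);
 * see how both halves are consumed in kmem_apply_security_policy() below.
 */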

static bool
kmem_use_sprayqtn(
	kma_flags_t kma_flags,
	vm_map_size_t map_size,
	vm_offset_t mask)
{
	/*
	 * Pointer allocations that are above the guard object threshold, or
	 * that have leading guard pages with non-standard alignment requests,
	 * are redirected to the sprayqtn range.
	 */
#if DEBUG || DEVELOPMENT
	btref_get_flags_t flags = (kma_flags & KMA_NOPAGEWAIT) ?
	    BTREF_GET_NOWAIT : 0;

	if ((kma_flags & KMA_SPRAYQTN) == 0) {
		if (map_size > KMEM_GOBJ_THRESHOLD) {
			btlog_record(kmem_outlier_log, (void *)map_size, KMEM_OUTLIER_SIZE,
			    btref_get(__builtin_frame_address(0), flags));
		} else if ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK)) {
			btlog_record(kmem_outlier_log, (void *)mask, KMEM_OUTLIER_ALIGN,
			    btref_get(__builtin_frame_address(0), flags));
		}
	}
#endif /* DEBUG || DEVELOPMENT */

	return (kma_flags & KMA_SPRAYQTN) ||
	    (map_size > KMEM_GOBJ_THRESHOLD) ||
	    ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK));
}

static void
kmem_apply_security_policy(
	vm_map_t map,
	kma_flags_t kma_flags,
	kmem_guard_t guard,
	vm_map_size_t map_size,
	vm_offset_t mask,
	vm_map_kernel_flags_t *vmk_flags,
	bool assert_dir __unused)
{
	kmem_range_id_t range_id;
	bool from_right;
	uint16_t type_hash = guard.kmg_type_hash;

	if (startup_phase < STARTUP_SUB_KMEM || map != kernel_map) {
		return;
	}

	/*
	 * A non-zero type-hash must be passed by krealloc_type
	 */
#if (DEBUG || DEVELOPMENT)
	if (assert_dir && !(kma_flags & (KMA_DATA | KMA_DATA_SHARED))) {
		assert(type_hash != 0);
	}
#endif

	if (kma_flags & (KMA_DATA | KMA_DATA_SHARED)) {
		/*
		 * Choose which specific data range to use.
		 */
		if (kma_flags & KMA_DATA) {
			range_id = KMEM_RANGE_ID_DATA;
		} else {
			range_id = kmem_needs_data_share_range() ?
			    KMEM_RANGE_ID_DATA_SHARED : KMEM_RANGE_ID_DATA;
		}

		/*
		 * As an optimization in KMA_DATA to avoid fragmentation,
		 * allocate static carveouts at the end of the DATA range.
		 */
		from_right = (bool)(kma_flags & KMA_PERMANENT);
	} else if (kmem_use_sprayqtn(kma_flags, map_size, mask)) {
		range_id = KMEM_RANGE_ID_SPRAYQTN;
		from_right = (bool)(kma_flags & KMA_PERMANENT);
	} else if (type_hash) {
		range_id = (kmem_range_id_t)(type_hash & KMEM_RANGE_MASK);
		from_right = type_hash & KMEM_DIRECTION_MASK;
	} else {
		/*
		 * Range id needs to correspond to one of the PTR ranges
		 */
		type_hash = (uint16_t) kmem_hash_backtrace(__builtin_frame_address(0));
		range_id = kmem_adjust_range_id(type_hash);
		from_right = type_hash & KMEM_DIRECTION_MASK;
	}

	vmk_flags->vmkf_range_id = range_id;
	vmk_flags->vmkf_last_free = from_right;
}

#pragma mark allocation

static kmem_return_t
kmem_alloc_guard_internal(
	vm_map_t map,
	vm_size_t size,
	vm_offset_t mask,
	kma_flags_t flags,
	kmem_guard_t guard,
	kern_return_t (^alloc_pages)(vm_size_t, kma_flags_t, vm_page_t *))
{
	vm_object_t object;
	vm_offset_t delta = 0;
	vm_map_entry_t entry = NULL;
	vm_map_offset_t map_addr, fill_start;
	vm_map_size_t map_size, fill_size;
	vm_page_t guard_left = VM_PAGE_NULL;
	vm_page_t guard_right = VM_PAGE_NULL;
	vm_page_t wired_page_list = VM_PAGE_NULL;
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
	bool skip_guards;
	kmem_return_t kmr = { };

	vmlp_api_start(KMEM_ALLOC_GUARD_INTERNAL);

	assert(kernel_map && map->pmap == kernel_pmap);

	/* DATA and DATA_SHARED are mutually exclusive */
	assert((flags & (KMA_DATA | KMA_DATA_SHARED)) != (KMA_DATA | KMA_DATA_SHARED));

#if defined(__arm64__)
	/*
	 * Pageable allocations should be marked as shared.
	 *
	 * Only assert this on arm64 architectures, since we do not
	 * adopt the shared heap on older ones.
	 */
	assert((flags & (KMA_PAGEABLE | KMA_DATA)) != (KMA_PAGEABLE | KMA_DATA));
#endif /* defined(__arm64__) */

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_START,
	    size, 0, 0, 0);
#endif

#if HAS_MTE
	if (__improbable(!is_mte_enabled)) {
		flags &= ~KMA_TAG;
	}
#endif /* HAS_MTE */

	if (size == 0 ||
	    (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) ||
	    (size < __kmem_guard_size(ANYF(flags)))) {
		__kmem_invalid_size_panic(map, size, flags);
	}

	/*
	 * Limit the size of a single extent of wired memory, to try to
	 * limit the damage to the system if too many pages get wired down.
	 * The limit is raised to 2GB on systems with a 128GB max physical
	 * limit, and scaled by installed memory above that.
	 *
	 * Note: kmem_alloc_contig_guard() is immune to this check.
	 */
	if (__improbable(!(flags & (KMA_VAONLY | KMA_PAGEABLE)) &&
	    alloc_pages == NULL &&
	    size > MAX(1ULL << 31, sane_size / 64))) {
		kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
		goto out_error;
	}

	/*
	 * Guard pages:
	 *
	 * Guard pages are implemented as fictitious pages.
	 *
	 * However, some maps, and some objects are known
	 * to manage their memory explicitly, and do not need
	 * those to be materialized, which saves memory.
	 *
	 * By placing guard pages on either end of a stack,
	 * they can help detect cases where a thread walks
	 * off either end of its stack.
	 *
	 * They are allocated and set up here and attempts
	 * to access those pages are trapped in vm_fault_page().
	 *
	 * The map_size we were passed may include extra space for
	 * guard pages. fill_size represents the actual size to populate.
	 * Similarly, fill_start indicates where the actual pages
	 * will begin in the range.
	 */
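	/*
	 * Illustrative layout (a sketch, not from the original source) with
	 * both KMA_GUARD_FIRST and KMA_GUARD_LAST set:
	 *
	 *	map_addr                               map_addr + map_size
	 *	v                                      v
	 *	+-------+------------------------------+-------+
	 *	| guard |     fill_size (populated)    | guard |
	 *	+-------+------------------------------+-------+
	 *	        ^
	 *	        fill_start (PAGE_SIZE here; 0 without a left guard)
	 */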

	map_size = round_page(size);
	fill_start = 0;
	fill_size = map_size - __kmem_guard_size(ANYF(flags));

#if KASAN_CLASSIC
	if (flags & KMA_KASAN_GUARD) {
		assert((flags & (KMA_GUARD_FIRST | KMA_GUARD_LAST)) == 0);
		flags |= KMA_GUARD_FIRST | KMA_GUARD_LAST;
		delta = ptoa(2);
		map_size += delta;
	}
#else
	(void)delta;
#endif /* KASAN_CLASSIC */

	skip_guards = (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) ||
	    map->never_faults;

	if (flags & KMA_GUARD_FIRST) {
		vmk_flags.vmkf_guard_before = true;
		fill_start += PAGE_SIZE;
	}
	if (flags & KMA_NOSOFTLIMIT) {
		vmk_flags.vmkf_no_soft_limit = true;
	}
	if ((flags & KMA_GUARD_FIRST) && !skip_guards) {
		guard_left = vm_page_create_guard((flags & KMA_NOPAGEWAIT) == 0);
		if (__improbable(guard_left == VM_PAGE_NULL)) {
			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
			goto out_error;
		}
	}
	if ((flags & KMA_GUARD_LAST) && !skip_guards) {
		guard_right = vm_page_create_guard((flags & KMA_NOPAGEWAIT) == 0);
		if (__improbable(guard_right == VM_PAGE_NULL)) {
			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
			goto out_error;
		}
	}

	if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
		if (alloc_pages) {
			kmr.kmr_return = alloc_pages(fill_size, flags,
			    &wired_page_list);
		} else {
			kmr.kmr_return = vm_page_alloc_list(atop(fill_size), flags,
			    &wired_page_list);
		}
		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
			goto out_error;
		}
	}

	/*
	 * Allocate a new object (if necessary). We must do this before
	 * locking the map, or risk deadlock with the default pager.
	 */
	if (flags & KMA_KOBJECT) {
#if HAS_MTE
		if (flags & KMA_TAG) {
			object = kernel_object_tagged;
			vmk_flags.vmf_mte = true;
		} else
#endif /* HAS_MTE */
		{
			object = kernel_object_default;
		}
		vm_object_reference(object);
	} else if (flags & KMA_COMPRESSOR) {
		object = compressor_object;
		vm_object_reference(object);
	} else {
		object = vm_object_allocate(map_size, map->serial_id);
		vm_object_lock(object);
		vm_object_set_size(object, map_size, size);
		/* stabilize the object to prevent shadowing */
		object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
		VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
#if HAS_MTE
		if (flags & KMA_TAG) {
			object->wimg_bits = VM_WIMG_MTE;
			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
			VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
		}
#endif /* HAS_MTE */
		vm_object_unlock(object);
	}

	if (flags & KMA_LAST_FREE) {
		vmk_flags.vmkf_last_free = true;
	}
	if (flags & KMA_PERMANENT) {
		vmk_flags.vmf_permanent = true;
	}
	kmem_apply_security_policy(map, flags, guard, map_size, mask, &vmk_flags,
	    false);

	kmr.kmr_return = vm_map_find_space(map, 0, map_size, mask,
	    vmk_flags, &entry);
	if (__improbable(KERN_SUCCESS != kmr.kmr_return)) {
		vm_object_deallocate(object);
		goto out_error;
	}

	vmlp_range_event_entry(map, entry);

	map_addr = entry->vme_start;
	VME_OBJECT_SET(entry, object, guard.kmg_atomic, guard.kmg_context);
	VME_ALIAS_SET(entry, guard.kmg_tag);
	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
		VME_OFFSET_SET(entry, map_addr);
	}

#if KASAN
	if ((flags & KMA_KOBJECT) && guard.kmg_atomic) {
		entry->vme_object_or_delta = (-size & PAGE_MASK) + delta;
	}
#endif /* KASAN */

	if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE))) {
		entry->wired_count = 1;
		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
	}

	if (guard_left || guard_right || wired_page_list) {
		vm_object_offset_t offset = 0ull;

		vm_object_lock(object);
		vm_map_unlock(map);

		if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
			offset = map_addr;
		}

		if (guard_left) {
			vm_page_insert(guard_left, object, offset);
			guard_left->vmp_busy = FALSE;
			guard_left = VM_PAGE_NULL;
		}

		if (guard_right) {
			vm_page_insert(guard_right, object,
			    offset + fill_start + fill_size);
			guard_right->vmp_busy = FALSE;
			guard_right = VM_PAGE_NULL;
		}

		if (wired_page_list) {
			kernel_memory_populate_object_and_unlock(object,
			    map_addr + fill_start, offset + fill_start, fill_size,
			    wired_page_list, flags, guard.kmg_tag, VM_PROT_DEFAULT,
			    __kmem_mapping_type(ANYF(flags)));
		} else {
			vm_object_unlock(object);
		}
	} else {
		vm_map_unlock(map);
	}

	/*
	 * Now that the pages are wired, we no longer have to fear coalescing.
	 */
	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
		vm_map_simplify(map, map_addr);
	}

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
	    atop(fill_size), 0, 0, 0);
#endif /* DEBUG || DEVELOPMENT */
	kmr.kmr_address = CAST_DOWN(vm_offset_t, map_addr);

#if KASAN
	if (flags & (KMA_KASAN_GUARD | KMA_PAGEABLE)) {
		/*
		 * We need to allow the range for pageable memory,
		 * or faulting will not be allowed.
		 */
		kasan_notify_address(map_addr, map_size);
	}
#endif /* KASAN */
#if KASAN_CLASSIC
	if (flags & KMA_KASAN_GUARD) {
		kmr.kmr_address += PAGE_SIZE;
		kasan_alloc_large(kmr.kmr_address, size);
	}
#endif /* KASAN_CLASSIC */
#if CONFIG_KERNEL_TAGGING
	if (!(flags & KMA_VAONLY) && (flags & KMA_TAG)) {
		kmr.kmr_ptr = vm_memtag_generate_and_store_tag((caddr_t)kmr.kmr_address + fill_start, fill_size);
		kmr.kmr_ptr = (caddr_t)kmr.kmr_ptr - fill_start;
#if KASAN_TBI
		kasan_tbi_retag_unused_space(kmr.kmr_ptr, map_size, size);
#endif /* KASAN_TBI */
	}
#endif /* CONFIG_KERNEL_TAGGING */
	vmlp_api_end(KMEM_ALLOC_GUARD_INTERNAL, kmr.kmr_return);
	return kmr;

out_error:
	if (flags & KMA_NOFAIL) {
		__kmem_failed_panic(map, size, flags, kmr.kmr_return, "alloc");
	}
	if (guard_left) {
		guard_left->vmp_snext = wired_page_list;
		wired_page_list = guard_left;
	}
	if (guard_right) {
		guard_right->vmp_snext = wired_page_list;
		wired_page_list = guard_right;
	}
	if (wired_page_list) {
		vm_page_free_list(wired_page_list, FALSE);
	}

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
	    0, 0, 0, 0);
#endif /* DEBUG || DEVELOPMENT */

	vmlp_api_end(KMEM_ALLOC_GUARD_INTERNAL, kmr.kmr_return);
	return kmr;
}

__mockable kmem_return_t
kmem_alloc_guard(
	vm_map_t map,
	vm_size_t size,
	vm_offset_t mask,
	kma_flags_t flags,
	kmem_guard_t guard)
{
	return kmem_alloc_guard_internal(map, size, mask, flags, guard, NULL);
}

kmem_return_t
kmem_alloc_contig_guard(
	vm_map_t map,
	vm_size_t size,
	vm_offset_t mask,
	ppnum_t max_pnum,
	ppnum_t pnum_mask,
	kma_flags_t flags,
	kmem_guard_t guard)
{
	__auto_type alloc_pages = ^(vm_size_t fill_size, kma_flags_t kma_flags, vm_page_t *pages) {
		return cpm_allocate(fill_size, pages, max_pnum, pnum_mask, FALSE, kma_flags);
	};

	return kmem_alloc_guard_internal(map, size, mask, flags, guard, alloc_pages);
}

kmem_return_t
kmem_suballoc(
	vm_map_t parent,
	mach_vm_offset_t *addr,
	vm_size_t size,
	vm_map_create_options_t vmc_options,
	int vm_flags,
	kms_flags_t flags,
	vm_tag_t tag)
{
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vm_map_offset_t map_addr = 0;
	kmem_return_t kmr = { };
	vm_map_t map;

	assert(page_aligned(size));
	assert(parent->pmap == kernel_pmap);

	vm_map_kernel_flags_set_vmflags(&vmk_flags, vm_flags, tag);

	if (parent == kernel_map) {
		assert(vmk_flags.vmf_overwrite || (flags & (KMS_DATA | KMS_DATA_SHARED)));
	}

	if (vmk_flags.vmf_fixed) {
		map_addr = trunc_page(*addr);
	}

	pmap_reference(vm_map_pmap(parent));
	map = vm_map_create_options(vm_map_pmap(parent), 0, size, vmc_options);

	/*
	 * 1. vm_map_enter() will consume one ref on success.
	 *
	 * 2. make the entry atomic as kernel submaps should never be split.
	 *
	 * 3. instruct vm_map_enter() that it is a fresh submap
	 *    that needs to be taught its bounds as it is inserted.
	 */
	vm_map_reference(map);

	vmk_flags.vmkf_submap = true;
	if ((flags & (KMS_DATA | KMS_DATA_SHARED)) == 0) {
		/* FIXME: IOKit submaps get fragmented and can't be atomic */
		vmk_flags.vmkf_submap_atomic = true;
	}
	vmk_flags.vmkf_submap_adjust = true;
	if (flags & KMS_LAST_FREE) {
		vmk_flags.vmkf_last_free = true;
	}
	if (flags & KMS_PERMANENT) {
		vmk_flags.vmf_permanent = true;
	}
	if (flags & (KMS_DATA | KMS_DATA_SHARED)) {
		if (flags & KMS_DATA) {
			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
		} else {
			vmk_flags.vmkf_range_id = kmem_needs_data_share_range() ?
			    KMEM_RANGE_ID_DATA_SHARED : KMEM_RANGE_ID_DATA;
		}
	}
	if (flags & KMS_NOSOFTLIMIT) {
		vmk_flags.vmkf_no_soft_limit = true;
	}

	kmr.kmr_return = vm_map_enter(parent, &map_addr, size, 0,
	    vmk_flags, (vm_object_t)map, 0, FALSE,
	    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);

	if (kmr.kmr_return != KERN_SUCCESS) {
		if (flags & KMS_NOFAIL) {
			panic("kmem_suballoc(map=%p, size=%zd) failed with %d",
			    parent, (size_t)size, kmr.kmr_return);
		}
		assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
		vm_map_deallocate(map);
		vm_map_deallocate(map); /* also removes ref to pmap */
		return kmr;
	}

	/*
	 * For kmem_suballocs that register a claim and are assigned a range, ensure
	 * that the exact same range is returned.
	 */
	if (*addr != 0 && parent == kernel_map &&
	    startup_phase > STARTUP_SUB_KMEM) {
		assert(CAST_DOWN(vm_offset_t, map_addr) == *addr);
	} else {
		*addr = map_addr;
	}

	kmr.kmr_submap = map;
	return kmr;
}

/*
 * kmem_alloc:
 *
 *	Allocate wired-down memory in the kernel's address map
 *	or a submap.  The memory is not zero-filled.
 */
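/*
 * Illustrative use (a sketch, not part of the original file): a typical
 * caller allocates and later frees through the same map, e.g.:
 *
 *	vm_offset_t addr;
 *
 *	if (kmem_alloc(kernel_map, &addr, PAGE_SIZE,
 *	    KMA_NONE, vm_tag_bt()) == KERN_SUCCESS) {
 *		// ... use the wired page ...
 *		kmem_free(kernel_map, addr, PAGE_SIZE);
 *	}
 */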

__exported kern_return_t
kmem_alloc_external(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size);
kern_return_t
kmem_alloc_external(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size)
{
	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
		return kmem_alloc(map, addrp, size, KMA_NONE, vm_tag_bt());
	}
	/* Maintain ABI compatibility: invalid sizes used to be allowed */
	return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
}


/*
 * kmem_alloc_kobject:
 *
 *	Allocate wired-down memory in the kernel's address map
 *	or a submap.  The memory is not zero-filled.
 *
 *	The memory is allocated in the kernel_object.
 *	It may not be copied with vm_map_copy, and
 *	it may not be reallocated with kmem_realloc.
 */

__exported kern_return_t
kmem_alloc_kobject_external(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size);
kern_return_t
kmem_alloc_kobject_external(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size)
{
	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
		return kmem_alloc(map, addrp, size, KMA_KOBJECT, vm_tag_bt());
	}
	/* Maintain ABI compatibility: invalid sizes used to be allowed */
	return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
}

/*
 * kmem_alloc_pageable:
 *
 *	Allocate pageable memory in the kernel's address map.
 */

__exported kern_return_t
kmem_alloc_pageable_external(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size);
kern_return_t
kmem_alloc_pageable_external(
	vm_map_t map,
	vm_offset_t *addrp,
	vm_size_t size)
{
	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
		return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA_SHARED, vm_tag_bt());
	}
	/* Maintain ABI compatibility: invalid sizes used to be allowed */
	return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
}

static __attribute__((always_inline, warn_unused_result))
kern_return_t
mach_vm_allocate_kernel_sanitize(
	vm_map_t map,
	mach_vm_offset_ut addr_u,
	mach_vm_size_ut size_u,
	vm_map_kernel_flags_t vmk_flags,
	vm_map_offset_t *map_addr,
	vm_map_size_t *map_size)
{
	kern_return_t result;
	vm_map_offset_t map_end;

	if (vmk_flags.vmf_fixed) {
		result = vm_sanitize_addr_size(addr_u, size_u,
		    VM_SANITIZE_CALLER_VM_ALLOCATE_FIXED,
		    map,
		    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS | VM_SANITIZE_FLAGS_REALIGN_START,
		    map_addr, &map_end, map_size);
		if (__improbable(result != KERN_SUCCESS)) {
			return result;
		}
	} else {
		*map_addr = 0;
		result = vm_sanitize_size(0, size_u,
		    VM_SANITIZE_CALLER_VM_ALLOCATE_ANYWHERE, map,
		    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS,
		    map_size);
		if (__improbable(result != KERN_SUCCESS)) {
			return result;
		}
	}

	return KERN_SUCCESS;
}

kern_return_t
mach_vm_allocate_kernel(
	vm_map_t map,
	mach_vm_offset_ut *addr_u,
	mach_vm_size_ut size_u,
	vm_map_kernel_flags_t vmk_flags)
{
	vm_map_offset_t map_addr;
	vm_map_size_t map_size;
	kern_return_t result;

	if (map == VM_MAP_NULL) {
		ktriage_record(thread_tid(current_thread()),
		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
		    KDBG_TRIAGE_RESERVED,
		    KDBG_TRIAGE_VM_ALLOCATE_KERNEL_BADMAP_ERROR),
		    KERN_INVALID_ARGUMENT /* arg */);
		return KERN_INVALID_ARGUMENT;
	}

	if (!vm_map_kernel_flags_check_vm_and_kflags(vmk_flags,
	    VM_FLAGS_USER_ALLOCATE)) {
		return KERN_INVALID_ARGUMENT;
	}

	result = mach_vm_allocate_kernel_sanitize(map,
	    *addr_u,
	    size_u,
	    vmk_flags,
	    &map_addr,
	    &map_size);
	if (__improbable(result != KERN_SUCCESS)) {
		result = vm_sanitize_get_kr(result);
		if (result == KERN_SUCCESS) {
			*addr_u = vm_sanitize_wrap_addr(0);
		} else {
			ktriage_record(thread_tid(current_thread()),
			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
			    KDBG_TRIAGE_RESERVED,
			    KDBG_TRIAGE_VM_ALLOCATE_KERNEL_BADSIZE_ERROR),
			    KERN_INVALID_ARGUMENT /* arg */);
		}
		return result;
	}

	vm_map_kernel_flags_update_range_id(&vmk_flags, map, map_size);

	result = vm_map_enter(
		map,
		&map_addr,
		map_size,
		(vm_map_offset_t)0,
		vmk_flags,
		VM_OBJECT_NULL,
		(vm_object_offset_t)0,
		FALSE,
		VM_PROT_DEFAULT,
		VM_PROT_ALL,
		VM_INHERIT_DEFAULT);

	if (result == KERN_SUCCESS) {
#if KASAN
		if (map->pmap == kernel_pmap) {
			kasan_notify_address(map_addr, map_size);
		}
#endif
		*addr_u = vm_sanitize_wrap_addr(map_addr);
	} else {
		ktriage_record(thread_tid(current_thread()),
		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
		    KDBG_TRIAGE_RESERVED,
		    KDBG_TRIAGE_VM_ALLOCATE_KERNEL_VMMAPENTER_ERROR),
		    result /* arg */);
	}
	return result;
}

#pragma mark population

static void
kernel_memory_populate_pmap_enter(
	vm_object_t object,
	vm_address_t addr,
	vm_object_offset_t offset,
	vm_page_t mem,
	vm_prot_t prot,
	int pe_flags,
	pmap_mapping_type_t mapping_type)
{
	kern_return_t pe_result;
	int pe_options;

	if (VMP_ERROR_GET(mem)) {
		panic("VM page %p should not have an error", mem);
	}

	pe_options = PMAP_OPTIONS_NOWAIT;
	if (object->internal) {
		pe_options |= PMAP_OPTIONS_INTERNAL;
	}
	if (mem->vmp_reusable || object->all_reusable) {
		pe_options |= PMAP_OPTIONS_REUSABLE;
	}

	pe_result = pmap_enter_options(kernel_pmap, addr + offset,
	    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
	    pe_flags, /* wired */ TRUE, pe_options, NULL, mapping_type);

	if (pe_result == KERN_RESOURCE_SHORTAGE) {
		vm_object_unlock(object);

		pe_options &= ~PMAP_OPTIONS_NOWAIT;

		pe_result = pmap_enter_options(kernel_pmap, addr + offset,
		    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
		    pe_flags, /* wired */ TRUE, pe_options, NULL, mapping_type);

		vm_object_lock(object);
	}

	assert(pe_result == KERN_SUCCESS);
}

void
kernel_memory_populate_object_and_unlock(
	vm_object_t object, /* must be locked */
	vm_address_t addr,
	vm_offset_t offset,
	vm_size_t size,
	vm_page_t page_list,
	kma_flags_t flags,
	vm_tag_t tag,
	vm_prot_t prot,
	pmap_mapping_type_t mapping_type)
{
	vm_page_t mem;
	int pe_flags;
	bool gobbled_list = page_list && page_list->vmp_gobbled;

	assert(((flags & KMA_KOBJECT) != 0) == (is_kernel_object(object) != 0));
	assert3u((bool)(flags & KMA_COMPRESSOR), ==, object == compressor_object);

#if HAS_MTE
	if (!is_mte_enabled) {
		assert(!(flags & KMA_TAG));
	}
#endif /* HAS_MTE */

	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
		assert3u(offset, ==, addr);
	} else {
		/*
		 * kernel_memory_populate_pmap_enter() might drop the object
		 * lock, and the caller might not own a reference anymore
		 * and rely on holding the vm object lock for liveness.
		 */
		vm_object_reference_locked(object);
	}

	if (flags & KMA_KSTACK) {
		pe_flags = VM_MEM_STACK;
	} else {
		pe_flags = 0;
	}

#if HAS_MTE
	/* Inform the PMAP layer that we want an MTE backed page. */
	if (flags & KMA_TAG) {
		pe_flags |= VM_MEM_MAP_MTE;
		assert((object->wimg_bits & VM_WIMG_MTE) != 0);
	} else {
		assert((object->wimg_bits & VM_WIMG_MTE) == 0);
	}
#endif /* HAS_MTE */

	for (vm_object_offset_t pg_offset = 0;
	    pg_offset < size;
	    pg_offset += PAGE_SIZE_64) {
		if (page_list == NULL) {
			panic("%s: page_list too short", __func__);
		}

		mem = page_list;
		page_list = mem->vmp_snext;
		mem->vmp_snext = NULL;

		assert(mem->vmp_wire_count == 0);
		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
		assert(vm_page_is_canonical(mem));

		if (flags & KMA_COMPRESSOR) {
			mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
			/*
			 * Background processes doing I/O accounting can call
			 * into NVME driver to do some work which results in
			 * an allocation here and so we want to make sure
			 * that the pages used by compressor, regardless of
			 * process context, are never on the special Q.
			 */
			mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;

			vm_page_insert(mem, object, offset + pg_offset);
		} else {
			mem->vmp_q_state = VM_PAGE_IS_WIRED;
			mem->vmp_wire_count = 1;

#if HAS_MTE
			mteinfo_increment_wire_count(mem);
#endif /* HAS_MTE */

			vm_page_insert_wired(mem, object, offset + pg_offset, tag);
		}

		mem->vmp_gobbled = false;
		mem->vmp_busy = false;
		mem->vmp_pmapped = true;
		mem->vmp_wpmapped = true;

		/*
		 * Manual PMAP_ENTER_OPTIONS() with shortcuts
		 * for the kernel and compressor objects.
		 */
		kernel_memory_populate_pmap_enter(object, addr, pg_offset,
		    mem, prot, pe_flags, mapping_type);

		if (flags & KMA_NOENCRYPT) {
			pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
		}
	}

	if (page_list) {
		panic("%s: page_list too long", __func__);
	}

	vm_object_unlock(object);
	if ((flags & (KMA_KOBJECT | KMA_COMPRESSOR)) == 0) {
		vm_object_deallocate(object);
	}

	/*
	 * Update the accounting:
	 * - the compressor "wired" pages don't really count as wired
	 * - kmem_alloc_contig_guard() gives gobbled pages,
	 *   which already count as wired but need to be ungobbled.
	 */
	if (gobbled_list) {
		vm_page_lockspin_queues();
		if (flags & KMA_COMPRESSOR) {
			vm_page_wire_count -= atop(size);
		}
		vm_page_gobble_count -= atop(size);
		vm_page_unlock_queues();
	} else if ((flags & KMA_COMPRESSOR) == 0) {
		vm_page_lockspin_queues();
		vm_page_wire_count += atop(size);
		vm_page_unlock_queues();
	}

	if (flags & KMA_KOBJECT) {
		/* vm_page_insert_wired() handles regular objects already */
		vm_tag_update_size(tag, size, NULL);
	}

#if KASAN
	if (flags & KMA_COMPRESSOR) {
		kasan_notify_address_nopoison(addr, size);
	} else {
		kasan_notify_address(addr, size);
	}
#endif /* KASAN */
}


kern_return_t
kernel_memory_populate(
	vm_offset_t addr,
	vm_size_t size,
	kma_flags_t flags,
	vm_tag_t tag)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_page_t page_list = NULL;
	vm_size_t page_count = atop_64(size);
	vm_object_t object = __kmem_object(ANYF(flags));

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_START,
	    size, 0, 0, 0);
#endif /* DEBUG || DEVELOPMENT */

#if HAS_MTE
	if (!is_mte_enabled) {
		assert(!(flags & KMA_TAG));
	}
#endif /* HAS_MTE */

	kr = vm_page_alloc_list(page_count, flags, &page_list);
	if (kr == KERN_SUCCESS) {
		vm_object_lock(object);
		kernel_memory_populate_object_and_unlock(object, addr,
		    addr, size, page_list, flags, tag, VM_PROT_DEFAULT,
		    __kmem_mapping_type(ANYF(flags)));
	}

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
	    page_count, 0, 0, 0);
#endif /* DEBUG || DEVELOPMENT */
	return kr;
}

void
kernel_memory_depopulate(
	vm_offset_t addr,
	vm_size_t size,
	kma_flags_t flags,
	vm_tag_t tag)
{
	vm_object_t object = __kmem_object(ANYF(flags));
	vm_object_offset_t offset = addr;
	vm_page_t mem;
	vm_page_t local_freeq = NULL;
	unsigned int pages_unwired = 0;

	vm_object_lock(object);

	pmap_protect(kernel_pmap, offset, offset + size, VM_PROT_NONE);

	for (vm_object_offset_t pg_offset = 0;
	    pg_offset < size;
	    pg_offset += PAGE_SIZE_64) {
		mem = vm_page_lookup(object, offset + pg_offset);

		assert(mem);

		if (flags & KMA_COMPRESSOR) {
			assert(mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
		} else {
			assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
			pages_unwired++;
		}

		mem->vmp_busy = TRUE;

		assert(mem->vmp_tabled);
		vm_page_remove(mem, TRUE);
		assert(mem->vmp_busy);

		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);

		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
		mem->vmp_snext = local_freeq;
		local_freeq = mem;
	}

	vm_object_unlock(object);

	vm_page_free_list(local_freeq, TRUE);

	if (!(flags & KMA_COMPRESSOR)) {
		vm_page_lockspin_queues();
		vm_page_wire_count -= pages_unwired;
		vm_page_unlock_queues();
	}

	if (flags & KMA_KOBJECT) {
		/* vm_page_remove() handles regular objects already */
		vm_tag_update_size(tag, -ptoa_64(pages_unwired), NULL);
	}
}

#pragma mark reallocation

__abortlike
static void
__kmem_realloc_invalid_object_size_panic(
	vm_map_t map,
	vm_address_t address,
	vm_size_t size,
	vm_map_entry_t entry)
{
	vm_object_t object = VME_OBJECT(entry);
	vm_size_t objsize = __kmem_entry_orig_size(entry);

	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
	    "object %p has unexpected size %ld",
	    map, (void *)address, (size_t)size, entry, object, objsize);
}

__abortlike
static void
__kmem_realloc_invalid_pager_panic(
	vm_map_t map,
	vm_address_t address,
	vm_size_t size,
	vm_map_entry_t entry)
{
	vm_object_t object = VME_OBJECT(entry);
	memory_object_t pager = object->pager;
	bool pager_created = object->pager_created;
	bool pager_initialized = object->pager_initialized;
	bool pager_ready = object->pager_ready;

	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
	    "object %p has unexpected pager %p (%d,%d,%d)",
	    map, (void *)address, (size_t)size, entry, object,
	    pager, pager_created, pager_initialized, pager_ready);
}

static kmem_return_t
kmem_realloc_shrink_guard(
	vm_map_t map,
	vm_offset_t req_oldaddr,
	vm_size_t req_oldsize,
	vm_size_t req_newsize,
	kmr_flags_t flags,
	kmem_guard_t guard,
	vm_map_entry_t entry)
{
	vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
	vm_object_t object;
	vm_offset_t delta = 0;
	kmem_return_t kmr;
	bool was_atomic;
	vm_size_t oldsize = round_page(req_oldsize);
	vm_size_t newsize = round_page(req_newsize);
	vm_address_t oldaddr = req_oldaddr;

#if KASAN_CLASSIC
	if (flags & KMR_KASAN_GUARD) {
		assert((flags & (KMR_GUARD_FIRST | KMR_GUARD_LAST)) == 0);
		flags |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
		oldaddr -= PAGE_SIZE;
		delta = ptoa(2);
		oldsize += delta;
		newsize += delta;
	}
#endif /* KASAN_CLASSIC */

	if (flags & KMR_TAG) {
		oldaddr = vm_memtag_canonicalize_kernel(req_oldaddr);
	}

	vm_map_lock_assert_exclusive(map);

	if ((flags & KMR_KOBJECT) == 0) {
		object = VME_OBJECT(entry);
		vm_object_reference(object);
	}

	/*
	 * Shrinking an atomic entry starts with splitting it,
	 * and removing the second half.
	 */
	was_atomic = entry->vme_atomic;
	entry->vme_atomic = false;
	vm_map_clip_end(map, entry, entry->vme_start + newsize);
	entry->vme_atomic = was_atomic;

#if KASAN
	if (entry->vme_kernel_object && was_atomic) {
		entry->vme_object_or_delta = (-req_newsize & PAGE_MASK) + delta;
	}
#if KASAN_CLASSIC
	if (flags & KMR_KASAN_GUARD) {
		kasan_poison_range(oldaddr + newsize, oldsize - newsize,
		    ASAN_VALID);
	}
#endif
#if KASAN_TBI
	if (flags & KMR_TAG) {
		kasan_tbi_mark_free_space((caddr_t)req_oldaddr + newsize, oldsize - newsize);
	}
#endif /* KASAN_TBI */
#endif /* KASAN */
	(void)vm_map_remove_and_unlock(map,
	    oldaddr + newsize, oldaddr + oldsize,
	    vmr_flags, KMEM_GUARD_NONE);


	/*
	 * Lastly, if there are guard pages, deal with them.
	 *
	 * The kernel object just needs to depopulate,
	 * regular objects require freeing the last page
	 * and replacing it with a guard.
	 */
	if (flags & KMR_KOBJECT) {
		if (flags & KMR_GUARD_LAST) {
			kma_flags_t dflags = KMA_KOBJECT;
#if HAS_MTE
			dflags |= (ANYF(flags) & KMEM_TAG);
#endif
			kernel_memory_depopulate(oldaddr + newsize - PAGE_SIZE,
			    PAGE_SIZE, dflags, guard.kmg_tag);
		}
	} else {
		vm_page_t guard_right = VM_PAGE_NULL;
		vm_offset_t remove_start = newsize;

		if (flags & KMR_GUARD_LAST) {
			if (!map->never_faults) {
				guard_right = vm_page_create_guard(true);
			}
			remove_start -= PAGE_SIZE;
		}

		vm_object_lock(object);

		if (object->vo_size != oldsize) {
			__kmem_realloc_invalid_object_size_panic(map,
			    req_oldaddr, req_oldsize + delta, entry);
		}
		vm_object_set_size(object, newsize, req_newsize);

		vm_object_page_remove(object, remove_start, oldsize);

		if (guard_right) {
			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
			guard_right->vmp_busy = false;
		}
		vm_object_unlock(object);
		vm_object_deallocate(object);
	}

	kmr.kmr_address = req_oldaddr;
	kmr.kmr_return = 0;
#if KASAN_CLASSIC
	if (flags & KMR_KASAN_GUARD) {
		kasan_alloc_large(kmr.kmr_address, req_newsize);
	}
1855 #endif /* KASAN_CLASSIC */
1856 #if KASAN_TBI
1857 if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
1858 kmr.kmr_ptr = vm_memtag_generate_and_store_tag(kmr.kmr_ptr, req_newsize);
1859 kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize);
1860 }
1861 #endif /* KASAN_TBI */
1862
1863 return kmr;
1864 }
1865
1866 kmem_return_t
1867 kmem_realloc_guard(
1868 vm_map_t map,
1869 vm_offset_t req_oldaddr,
1870 vm_size_t req_oldsize,
1871 vm_size_t req_newsize,
1872 kmr_flags_t flags,
1873 kmem_guard_t guard)
1874 {
1875 vm_object_t object;
1876 vm_size_t oldsize;
1877 vm_size_t newsize;
1878 vm_offset_t delta = 0;
1879 vm_map_offset_t oldaddr;
1880 vm_map_offset_t newaddr;
1881 vm_object_offset_t newoffs;
1882 vm_map_entry_t oldentry;
1883 vm_map_entry_t newentry;
1884 vm_page_t page_list = NULL;
1885 bool needs_wakeup = false;
1886 kmem_return_t kmr = { };
1887 unsigned int last_timestamp;
1888 vm_map_kernel_flags_t vmk_flags = {
1889 .vmkf_last_free = (bool)(flags & KMR_LAST_FREE),
1890 };
1891
1892 vmlp_api_start(KMEM_REALLOC_GUARD);
1893
1894 assert(KMEM_REALLOC_FLAGS_VALID(flags));
1895
1896 if (!guard.kmg_atomic) {
1897 if (!(flags & (KMR_DATA | KMR_DATA_SHARED))) {
1898 __kmem_invalid_arguments_panic("realloc", map, req_oldaddr,
1899 req_oldsize, flags);
1900 }
1901
1902 if (flags & KMR_KOBJECT) {
1903 __kmem_invalid_arguments_panic("realloc", map, req_oldaddr,
1904 req_oldsize, flags);
1905 }
1906 }
1907
1908 if (req_oldaddr == 0ul) {
1909 kmem_return_t ret = kmem_alloc_guard(map, req_newsize, 0, (kma_flags_t)flags, guard);
1910 vmlp_api_end(KMEM_REALLOC_GUARD, ret.kmr_return);
1911 return ret;
1912 }
1913
1914 if (req_newsize == 0ul) {
1915 kmem_free_guard(map, req_oldaddr, req_oldsize,
1916 (kmf_flags_t)flags, guard);
1917 vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return);
1918 return kmr;
1919 }
1920
1921 if (req_newsize >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) {
1922 __kmem_invalid_size_panic(map, req_newsize, flags);
1923 }
1924 if (req_newsize < __kmem_guard_size(ANYF(flags))) {
1925 __kmem_invalid_size_panic(map, req_newsize, flags);
1926 }
1927
1928 oldsize = round_page(req_oldsize);
1929 newsize = round_page(req_newsize);
1930 oldaddr = req_oldaddr;
1931 #if KASAN_CLASSIC
1932 if (flags & KMR_KASAN_GUARD) {
1933 flags |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1934 oldaddr -= PAGE_SIZE;
1935 delta = ptoa(2);
1936 oldsize += delta;
1937 newsize += delta;
1938 }
1939 #endif /* KASAN_CLASSIC */
1940 #if CONFIG_KERNEL_TAGGING
1941 if (flags & KMR_TAG) {
1942 vm_memtag_verify_tag(req_oldaddr + __kmem_guard_left(ANYF(flags)));
1943 oldaddr = vm_memtag_canonicalize_kernel(req_oldaddr);
1944 #if HAS_MTE
1945 vmk_flags.vmf_mte = true;
1946 #endif /* HAS_MTE */
1947 }
1948 #endif /* CONFIG_KERNEL_TAGGING */
1949
1950 #if !KASAN
1951 /*
1952 * If not on a KASAN variant and no difference in requested size,
1953 * just return.
1954 *
1955 * Otherwise we want to validate the size and re-tag for KASAN_TBI.
1956 */
1957 if (oldsize == newsize) {
1958 kmr.kmr_address = req_oldaddr;
1959 vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return);
1960 return kmr;
1961 }
1962 #endif /* !KASAN */
1963
1964 /*
1965 * If we're growing the allocation,
1966 * then reserve the pages we'll need,
1967 * and find a spot for its new place.
1968 */
1969 if (oldsize < newsize) {
1970 #if DEBUG || DEVELOPMENT
1971 VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1972 DBG_VM_KERN_REQUEST, DBG_FUNC_START,
1973 newsize - oldsize, 0, 0, 0);
1974 #endif /* DEBUG || DEVELOPMENT */
1975 kmr.kmr_return = vm_page_alloc_list(atop(newsize - oldsize),
1976 (kma_flags_t)flags, &page_list);
1977 if (kmr.kmr_return == KERN_SUCCESS) {
1978 kmem_apply_security_policy(map, (kma_flags_t)flags, guard,
1979 newsize, 0, &vmk_flags, true);
1980 kmr.kmr_return = vm_map_find_space(map, 0, newsize, 0,
1981 vmk_flags, &newentry);
1982 }
1983 if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
1984 if (flags & KMR_REALLOCF) {
1985 kmem_free_guard(map, req_oldaddr, req_oldsize,
1986 flags & (KMF_TAG | KMF_GUARD_FIRST |
1987 KMF_GUARD_LAST | KMF_KASAN_GUARD), guard);
1988 }
1989 if (page_list) {
1990 vm_page_free_list(page_list, FALSE);
1991 }
1992 #if DEBUG || DEVELOPMENT
1993 VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1994 DBG_VM_KERN_REQUEST, DBG_FUNC_END,
1995 0, 0, 0, 0);
1996 #endif /* DEBUG || DEVELOPMENT */
1997 vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return);
1998 return kmr;
1999 }
2000
2001 /* map is locked */
2002 } else {
2003 vm_map_lock(map);
2004 }
2005
2006
2007 /*
2008 * Locate the entry:
2009 * - wait for it to quiesce,
2010 * - validate its guard,
2011 * - learn its correct tag.
2012 */
2013 again:
2014 if (!vm_map_lookup_entry(map, oldaddr, &oldentry)) {
2015 __kmem_entry_not_found_panic(map, req_oldaddr);
2016 }
2017
2018 vmlp_range_event_entry(map, oldentry);
2019
2020 if ((flags & KMR_KOBJECT) && oldentry->in_transition) {
2021 oldentry->needs_wakeup = true;
2022 vm_map_entry_wait(map, THREAD_UNINT);
2023 goto again;
2024 }
2025 kmem_entry_validate_guard(map, oldentry, oldaddr, oldsize, guard);
2026 if (!__kmem_entry_validate_object(oldentry, ANYF(flags))) {
2027 __kmem_entry_validate_object_panic(map, oldentry, ANYF(flags));
2028 }
2029 /*
2030 * TODO: We should validate for non-atomic entries that the range
2031 * we are acting on is what we expect here.
2032 */
2033 #if KASAN
2034 if (__kmem_entry_orig_size(oldentry) != req_oldsize) {
2035 __kmem_realloc_invalid_object_size_panic(map,
2036 req_oldaddr, req_oldsize + delta, oldentry);
2037 }
2038
2039 if (oldsize == newsize) {
2040 kmr.kmr_address = req_oldaddr;
2041 if (oldentry->vme_kernel_object) {
2042 oldentry->vme_object_or_delta = delta +
2043 (-req_newsize & PAGE_MASK);
2044 } else {
2045 object = VME_OBJECT(oldentry);
2046 vm_object_lock(object);
2047 vm_object_set_size(object, newsize, req_newsize);
2048 vm_object_unlock(object);
2049 }
2050 vm_map_unlock(map);
2051
2052 #if KASAN_CLASSIC
2053 if (flags & KMR_KASAN_GUARD) {
2054 kasan_alloc_large(kmr.kmr_address, req_newsize);
2055 }
2056 #endif /* KASAN_CLASSIC */
2057 #if KASAN_TBI
2058 if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
2059 kmr.kmr_ptr = vm_memtag_generate_and_store_tag(kmr.kmr_ptr, req_newsize);
2060 kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize);
2061 }
2062 #endif /* KASAN_TBI */
2063 vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return);
2064 return kmr;
2065 }
2066 #endif /* KASAN */
2067
2068 guard.kmg_tag = VME_ALIAS(oldentry);
2069
2070 if (newsize < oldsize) {
2071 kmem_return_t ret = kmem_realloc_shrink_guard(map, req_oldaddr,
2072 req_oldsize, req_newsize, flags, guard, oldentry);
2073 vmlp_api_end(KMEM_REALLOC_GUARD, ret.kmr_return);
2074 return ret;
2075 }
2076
2077
2078 /*
2079 * We are growing the entry
2080 *
2081 * For regular objects we use the object `vo_size` updates
2082 * as a guarantee that no two kmem_realloc() calls can happen
2083 * concurrently (by doing it before the map is unlocked).
2084 *
2085 * For the kernel object, prevent the entry from being
2086 * reallocated or changed by marking it "in_transition".
2087 */
2088
2089 object = VME_OBJECT(oldentry);
2090 vm_object_lock(object);
2091 vm_object_reference_locked(object);
2092
2093 newaddr = newentry->vme_start;
2094 newoffs = oldsize;
2095
2096 vmlp_range_event_entry(map, newentry);
2097
2098 VME_OBJECT_SET(newentry, object, guard.kmg_atomic, guard.kmg_context);
2099 VME_ALIAS_SET(newentry, guard.kmg_tag);
2100 if (flags & KMR_KOBJECT) {
2101 oldentry->in_transition = true;
2102 VME_OFFSET_SET(newentry, newaddr);
2103 newentry->wired_count = 1;
2104 vme_btref_consider_and_set(newentry, __builtin_frame_address(0));
2105 newoffs = newaddr + oldsize;
2106 #if KASAN
2107 newentry->vme_object_or_delta = delta +
2108 (-req_newsize & PAGE_MASK);
2109 #endif /* KASAN */
2110 } else {
2111 if (object->pager_created || object->pager) {
2112 /*
2113 * We can't "realloc/grow" the pager, so pageable
2114 * allocations should not go through this path.
2115 */
2116 __kmem_realloc_invalid_pager_panic(map,
2117 req_oldaddr, req_oldsize + delta, oldentry);
2118 }
2119 if (object->vo_size != oldsize) {
2120 __kmem_realloc_invalid_object_size_panic(map,
2121 req_oldaddr, req_oldsize + delta, oldentry);
2122 }
2123 vm_object_set_size(object, newsize, req_newsize);
2124 }
2125
2126 last_timestamp = map->timestamp;
2127 vm_map_unlock(map);
2128
2129
2130 /*
2131 * Now proceed with the population of pages.
2132 *
2133 * Kernel objects can use the kmem population helpers.
2134 *
2135 * Regular objects will insert pages manually,
2136 * then wire the memory into the new range.
2137 */
2138
2139 vm_size_t guard_right_size = __kmem_guard_right(ANYF(flags));
2140
2141 if (flags & KMR_KOBJECT) {
2142 pmap_mapping_type_t mapping_type = __kmem_mapping_type(ANYF(flags));
2143
2144 pmap_protect(kernel_pmap,
2145 oldaddr, oldaddr + oldsize - guard_right_size,
2146 VM_PROT_NONE);
2147
2148 for (vm_object_offset_t offset = 0;
2149 offset < oldsize - guard_right_size;
2150 offset += PAGE_SIZE_64) {
2151 vm_page_t mem;
2152
2153 mem = vm_page_lookup(object, oldaddr + offset);
2154 if (mem == VM_PAGE_NULL) {
2155 continue;
2156 }
2157
2158 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
2159
2160 mem->vmp_busy = true;
2161 vm_page_remove(mem, true);
2162 vm_page_insert_wired(mem, object, newaddr + offset,
2163 guard.kmg_tag);
2164 mem->vmp_busy = false;
2165
2166 kernel_memory_populate_pmap_enter(object, newaddr,
2167 offset, mem, VM_PROT_DEFAULT, 0, mapping_type);
2168 }
2169
2170 kernel_memory_populate_object_and_unlock(object,
2171 newaddr + oldsize - guard_right_size,
2172 newoffs - guard_right_size,
2173 newsize - oldsize,
2174 page_list, (kma_flags_t)flags,
2175 guard.kmg_tag, VM_PROT_DEFAULT, mapping_type);
2176 } else {
2177 vm_page_t guard_right = VM_PAGE_NULL;
2178
2179 /*
2180 * Note: we are borrowing the new entry reference
2181 * on the object for the duration of this code,
2182 * which works because we keep the object locked
2183 * throughout.
2184 */
2185 if ((flags & KMR_GUARD_LAST) && !map->never_faults) {
2186 guard_right = vm_page_lookup(object, oldsize - PAGE_SIZE);
2187 assert(vm_page_is_guard(guard_right));
2188 guard_right->vmp_busy = true;
2189 vm_page_remove(guard_right, true);
2190 }
2191
2192 if (flags & KMR_FREEOLD) {
2193 /*
2194 * Freeing the old mapping will make
2195 * the old pages become pageable until
2196 * the new mapping makes them wired again.
2197 * Let's take an extra "wire_count" to
2198 * prevent any accidental "page out".
2199 * We'll have to undo that after wiring
2200 * the new mapping.
2201 */
2202 vm_object_reference_locked(object); /* keep object alive */
2203 for (vm_object_offset_t offset = 0;
2204 offset < oldsize - guard_right_size;
2205 offset += PAGE_SIZE_64) {
2206 vm_page_t mem;
2207
2208 mem = vm_page_lookup(object, offset);
2209 assert(mem != VM_PAGE_NULL);
2210 assertf(!VM_PAGE_PAGEABLE(mem),
2211 "mem %p qstate %d",
2212 mem, mem->vmp_q_state);
2213 if (vm_page_is_guard(mem)) {
2214 /* guard pages are not wired */
2215 } else {
2216 assertf(VM_PAGE_WIRED(mem),
2217 "mem %p qstate %d wirecount %d",
2218 mem,
2219 mem->vmp_q_state,
2220 mem->vmp_wire_count);
2221 assertf(mem->vmp_wire_count >= 1,
2222 "mem %p wirecount %d",
2223 mem, mem->vmp_wire_count);
2224 mem->vmp_wire_count++;
2225 }
2226 }
2227 }
2228
2229 for (vm_object_offset_t offset = oldsize - guard_right_size;
2230 offset < newsize - guard_right_size;
2231 offset += PAGE_SIZE_64) {
2232 vm_page_t mem = page_list;
2233
2234 page_list = mem->vmp_snext;
2235 mem->vmp_snext = VM_PAGE_NULL;
2236 assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
2237 assert(!VM_PAGE_PAGEABLE(mem));
2238
2239 vm_page_insert(mem, object, offset);
2240 mem->vmp_busy = false;
2241 }
2242
2243 if (guard_right) {
2244 vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
2245 guard_right->vmp_busy = false;
2246 }
2247
2248 vm_object_unlock(object);
2249 }
2250
2251 /*
2252 * Mark the entry as idle again,
2253 * and honor KMR_FREEOLD if needed.
2254 */
2255
2256 vm_map_lock(map);
2257 if (last_timestamp + 1 != map->timestamp &&
2258 !vm_map_lookup_entry(map, oldaddr, &oldentry)) {
2259 __kmem_entry_not_found_panic(map, req_oldaddr);
2260 }
2261
2262 if (flags & KMR_KOBJECT) {
2263 assert(oldentry->in_transition);
2264 oldentry->in_transition = false;
2265 if (oldentry->needs_wakeup) {
2266 needs_wakeup = true;
2267 oldentry->needs_wakeup = false;
2268 }
2269 }
2270
2271 if (flags & KMR_FREEOLD) {
2272 vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2273
2274 #if KASAN_CLASSIC
2275 if (flags & KMR_KASAN_GUARD) {
2276 kasan_poison_range(oldaddr, oldsize, ASAN_VALID);
2277 }
2278 #endif
2279 #if KASAN_TBI
2280 if (flags & KMR_TAG) {
2281 kasan_tbi_mark_free_space((caddr_t)req_oldaddr, oldsize);
2282 }
2283 #endif /* KASAN_TBI */
2284 if (flags & KMR_GUARD_LAST) {
2285 vmr_flags |= VM_MAP_REMOVE_NOKUNWIRE_LAST;
2286 }
2287 (void)vm_map_remove_and_unlock(map,
2288 oldaddr, oldaddr + oldsize,
2289 vmr_flags, guard);
2290 } else {
2291 vm_map_unlock(map);
2292 }
2293
2294 if ((flags & KMR_KOBJECT) == 0) {
2295 kern_return_t kr;
2296 /*
2297 * This must happen _after_ we do the KMR_FREEOLD,
2298 * because wiring the pages will call into the pmap,
2299 * and if the pages are typed XNU_KERNEL_RESTRICTED,
2300 * this would cause a second mapping of the page and panic.
2301 */
2302 kr = vm_map_wire_kernel(map,
2303 vm_sanitize_wrap_addr(newaddr),
2304 vm_sanitize_wrap_addr(newaddr + newsize),
2305 vm_sanitize_wrap_prot(VM_PROT_DEFAULT),
2306 guard.kmg_tag, FALSE);
2307 assert(kr == KERN_SUCCESS);
2308
2309 if (flags & KMR_FREEOLD) {
2310 /*
2311 * Undo the extra "wiring" we made above
2312 * and release the extra reference we took
2313 * on the object.
2314 */
2315 vm_object_lock(object);
2316 for (vm_object_offset_t offset = 0;
2317 offset < oldsize - guard_right_size;
2318 offset += PAGE_SIZE_64) {
2319 vm_page_t mem;
2320
2321 mem = vm_page_lookup(object, offset);
2322 assert(mem != VM_PAGE_NULL);
2323 assertf(!VM_PAGE_PAGEABLE(mem),
2324 "mem %p qstate %d",
2325 mem, mem->vmp_q_state);
2326 if (vm_page_is_guard(mem)) {
2327 /* guard pages are not wired */
2328 } else {
2329 assertf(VM_PAGE_WIRED(mem),
2330 "mem %p qstate %d wirecount %d",
2331 mem,
2332 mem->vmp_q_state,
2333 mem->vmp_wire_count);
2334 assertf(mem->vmp_wire_count >= 2,
2335 "mem %p wirecount %d",
2336 mem, mem->vmp_wire_count);
2337 mem->vmp_wire_count--;
2338 assert(VM_PAGE_WIRED(mem));
2339 assert(mem->vmp_wire_count >= 1);
2340 }
2341 }
2342 vm_object_unlock(object);
2343 vm_object_deallocate(object); /* release extra ref */
2344 }
2345 }
2346
2347 if (needs_wakeup) {
2348 vm_map_entry_wakeup(map);
2349 }
2350
2351 #if DEBUG || DEVELOPMENT
2352 VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
2353 atop(newsize - oldsize), 0, 0, 0);
2354 #endif /* DEBUG || DEVELOPMENT */
2355 kmr.kmr_address = newaddr;
2356
2357 #if KASAN
2358 kasan_notify_address(kmr.kmr_address, newsize);
2359 #endif /* KASAN */
2360 #if KASAN_CLASSIC
2361 if (flags & KMR_KASAN_GUARD) {
2362 kmr.kmr_address += PAGE_SIZE;
2363 kasan_alloc_large(kmr.kmr_address, req_newsize);
2364 }
2365 #endif /* KASAN_CLASSIC */
2366 #if CONFIG_KERNEL_TAGGING
2367 if (flags & KMR_TAG) {
2368 #if HAS_MTE
2369 kmr.kmr_address = vm_memtag_insert_tag(kmr.kmr_address,
2370 vm_memtag_extract_tag(req_oldaddr));
2371 vm_memtag_store_tag((caddr_t)kmr.kmr_ptr + oldsize - guard_right_size,
2372 newsize - oldsize);
2373 #elif KASAN_TBI
2374 /*
2375 * Validate the current buffer, then generate a new tag;
2376 * even if the address is stable, it's a "new" allocation.
2377 */
2378 __asan_loadN((vm_offset_t)kmr.kmr_address, oldsize);
2379 kmr.kmr_ptr = vm_memtag_generate_and_store_tag(kmr.kmr_ptr, req_newsize);
2380 kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize);
2381 #endif /* KASAN_TBI */
2382 }
2383 #endif /* CONFIG_KERNEL_TAGGING */
2384
2385 vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return);
2386 return kmr;
2387 }
2388
2389 #pragma mark map/remap/wire
2390
2391 kern_return_t
2392 mach_vm_map_kernel(
2393 vm_map_t target_map,
2394 mach_vm_offset_ut *address,
2395 mach_vm_size_ut initial_size,
2396 mach_vm_offset_ut mask,
2397 vm_map_kernel_flags_t vmk_flags,
2398 ipc_port_t port,
2399 memory_object_offset_ut offset,
2400 boolean_t copy,
2401 vm_prot_ut cur_protection,
2402 vm_prot_ut max_protection,
2403 vm_inherit_ut inheritance)
2404 {
2405 /* range_id is set by vm_map_enter_mem_object */
2406 return vm_map_enter_mem_object(target_map,
2407 address,
2408 initial_size,
2409 mask,
2410 vmk_flags,
2411 port,
2412 offset,
2413 copy,
2414 cur_protection,
2415 max_protection,
2416 inheritance,
2417 NULL,
2418 0);
2419 }
2420
2421 kern_return_t
2422 mach_vm_remap_new_kernel(
2423 vm_map_t target_map,
2424 mach_vm_offset_ut *address,
2425 mach_vm_size_ut size,
2426 mach_vm_offset_ut mask,
2427 vm_map_kernel_flags_t vmk_flags,
2428 vm_map_t src_map,
2429 mach_vm_offset_ut memory_address,
2430 boolean_t copy,
2431 vm_prot_ut *cur_protection, /* IN/OUT */
2432 vm_prot_ut *max_protection, /* IN/OUT */
2433 vm_inherit_ut inheritance)
2434 {
2435 if (!vm_map_kernel_flags_check_vm_and_kflags(vmk_flags,
2436 VM_FLAGS_USER_REMAP)) {
2437 return KERN_INVALID_ARGUMENT;
2438 }
2439
2440
2441 vmk_flags.vmf_return_data_addr = true;
2442
2443 /* range_id is set by vm_map_remap */
2444 return vm_map_remap(target_map,
2445 address,
2446 size,
2447 mask,
2448 vmk_flags,
2449 src_map,
2450 memory_address,
2451 copy,
2452 cur_protection,
2453 max_protection,
2454 inheritance);
2455 }
2456
2457 #pragma mark free
2458
2459 #if KASAN
2460
2461 __abortlike
2462 static void
2463 __kmem_free_invalid_object_size_panic(
2464 vm_map_t map,
2465 vm_address_t address,
2466 vm_size_t size,
2467 vm_map_entry_t entry)
2468 {
2469 vm_object_t object = VME_OBJECT(entry);
2470 vm_size_t objsize = __kmem_entry_orig_size(entry);
2471
2472 panic("kmem_free(map=%p, addr=%p, size=%zd, entry=%p): "
2473 "object %p has unexpected size %ld",
2474 map, (void *)address, (size_t)size, entry, object, objsize);
2475 }
2476
2477 #endif /* KASAN */
2478
2479 __mockable vm_size_t
2480 kmem_free_guard(
2481 vm_map_t map,
2482 vm_offset_t req_addr,
2483 vm_size_t req_size,
2484 kmf_flags_t flags,
2485 kmem_guard_t guard)
2486 {
2487 vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2488 vm_address_t addr = req_addr;
2489 vm_offset_t delta = 0;
2490 vm_size_t size;
2491 #if KASAN
2492 vm_map_entry_t entry;
2493 #endif /* KASAN */
2494
2495 vmlp_api_start(KMEM_FREE_GUARD);
2496
2497 assert(map->pmap == kernel_pmap);
2498
2499 #if KASAN_CLASSIC
2500 if (flags & KMF_KASAN_GUARD) {
2501 addr -= PAGE_SIZE;
2502 delta = ptoa(2);
2503 }
2504 #endif /* KASAN_CLASSIC */
2505 #if CONFIG_KERNEL_TAGGING
2506 if (flags & KMF_TAG) {
2507 vm_memtag_verify_tag(req_addr + __kmem_guard_left(ANYF(flags)));
2508 addr = vm_memtag_canonicalize_kernel(req_addr);
2509 }
2510 #endif /* CONFIG_KERNEL_TAGGING */
2511
2512 if (flags & KMF_GUESS_SIZE) {
2513 vmr_flags |= VM_MAP_REMOVE_GUESS_SIZE;
2514 size = PAGE_SIZE;
2515 } else if (req_size == 0) {
2516 __kmem_invalid_size_panic(map, req_size, flags);
2517 } else {
2518 size = round_page(req_size) + delta;
2519 }
2520
2521 vm_map_lock(map);
2522
2523 #if KASAN
2524 if (!vm_map_lookup_entry(map, addr, &entry)) {
2525 __kmem_entry_not_found_panic(map, req_addr);
2526 }
2527 if (flags & KMF_GUESS_SIZE) {
2528 vmr_flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
2529 req_size = __kmem_entry_orig_size(entry);
2530 size = round_page(req_size + delta);
2531 } else if (guard.kmg_atomic && entry->vme_kernel_object &&
2532 __kmem_entry_orig_size(entry) != req_size) {
2533 /*
2534 * We can't make a strict check for regular
2535 * VM objects because it could be:
2536 *
2537 * - the kmem_free_guard() of a kmem_realloc_guard() without
2538 * KMR_FREEOLD, and in that case the object size won't match.
2539 *
2540 * - a submap, in which case there is no "orig size".
2541 */
2542 __kmem_free_invalid_object_size_panic(map,
2543 req_addr, req_size + delta, entry);
2544 }
2545 #endif /* KASAN */
2546 #if KASAN_CLASSIC
2547 if (flags & KMF_KASAN_GUARD) {
2548 kasan_poison_range(addr, size, ASAN_VALID);
2549 }
2550 #endif
2551 #if KASAN_TBI
2552 if (flags & KMF_TAG) {
2553 kasan_tbi_mark_free_space((caddr_t)req_addr, size);
2554 }
2555 #endif /* KASAN_TBI */
2556
2557 /*
2558 * vm_map_remove_and_unlock is called with VM_MAP_REMOVE_KUNWIRE, which
2559 * unwires the kernel mapping. The page won't be mapped any longer, so
2560 * no extra step is required for memory tagging to "clear"
2561 * it -- the page will be laundered later when it is reused.
2562 */
2563 vmlp_range_event(map, addr, size);
2564 vmlp_api_end(KMEM_FREE_GUARD, 0);
2565 return vm_map_remove_and_unlock(map, addr, addr + size,
2566 vmr_flags, guard).kmr_size - delta;
2567 }
2568
2569 __exported void
2570 kmem_free_external(
2571 vm_map_t map,
2572 vm_offset_t addr,
2573 vm_size_t size);
2574 void
2575 kmem_free_external(
2576 vm_map_t map,
2577 vm_offset_t addr,
2578 vm_size_t size)
2579 {
2580 if (size) {
2581 kmem_free(map, trunc_page(addr), size);
2582 #if MACH_ASSERT
2583 } else {
2584 printf("kmem_free(map=%p, addr=%p) called with size=0, lr: %p\n",
2585 map, (void *)addr, __builtin_return_address(0));
2586 #endif
2587 }
2588 }
2589
2590 #pragma mark kmem metadata
2591
2592 /*
2593 * Guard objects for kmem pointer allocation:
2594 *
2595 * Guard objects introduce size slabs to kmem pointer allocations, which
2596 * are allocated in chunks of n * sizeclass. When an allocation of a specific
2597 * sizeclass is requested, a random slot from [0, n) is returned.
2598 * Allocations are served from that chunk until only m slots are left. The
2599 * remaining m slots are referred to as guard objects: they never get
2600 * allocated, and the chunk is then considered full. When an allocation is
2601 * freed back to the chunk, one slot becomes available again and the next
2602 * allocation of that sizeclass is drawn at random from the m + 1
 * unallocated slots.
2603 *
2604 * Guard objects are intended to make exploitation of use after frees harder
2605 * as allocations that are freed can no longer be reliably reallocated.
2606 * They also make exploitation of OOBs harder as overflowing out of an
2607 * allocation can no longer be safe even with sufficient spraying.
2608 */
2609
2610 #define KMEM_META_PRIMARY 0xf
2611 #define KMEM_META_START 0xe
2612 #define KMEM_META_FREE 0xd
2613 #if __ARM_16K_PG__
2614 #define KMEM_MIN_SIZE PAGE_SIZE
2615 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 16)
2616 #else /* __ARM_16K_PG__ */
2617 /*
2618 * PAGE_SIZE isn't a compile-time constant on some arm64 devices. Those
2619 * devices use 4k page size when their RAM is <= 1GB and 16k otherwise.
2620 * Therefore populate sizeclasses from 4k for those devices.
2621 */
2622 #define KMEM_MIN_SIZE (4 * 1024)
2623 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 32)
2624 #endif /* __ARM_16K_PG__ */
2625 #define KMEM_MAX_SIZE (32ULL << 20)
2626 #define KMEM_START_IDX (kmem_log2down(KMEM_MIN_SIZE))
2627 #define KMEM_LAST_IDX (kmem_log2down(KMEM_MAX_SIZE))
2628 #define KMEM_NUM_SIZECLASS (KMEM_LAST_IDX - KMEM_START_IDX + 1)
2629 #define KMEM_FRONTS (KMEM_RANGE_ID_NUM_PTR * 2)
2630 #define KMEM_NUM_SLOTS 8
2631 #define KMEM_NUM_GUARDS 2
2632 #define KMEM_NUM_QUARANTINE 2
2633
2634 #define KMEM_PAGEMARKER_BITS 4
2635 #define KMEM_SIZECLASS_BITS 4
2636 #define KMEM_QUARANTINE_BITS 3
2637 #define KMEM_AVAIL_BITS 5
2638
2639 static_assert(KMEM_NUM_SIZECLASS <= (1u << KMEM_SIZECLASS_BITS));
2640
2641 struct kmem_page_meta {
2642 union {
2643 /*
2644 * On primary allocated chunk with KMEM_META_PRIMARY marker
2645 */
2646 uint32_t km_bitmap;
2647 /*
2648 * On start and end of free chunk with KMEM_META_FREE marker
2649 */
2650 uint32_t km_free_chunks;
2651 };
2652
2653 /*
2654 * KMEM_META_PRIMARY: Start meta of allocated chunk
2655 * KMEM_META_FREE : Start and end meta of free chunk
2656 * KMEM_META_START : Meta region start and end
2657 */
2658 uint8_t km_page_marker : KMEM_PAGEMARKER_BITS;
2659 uint8_t km_sizeclass : KMEM_SIZECLASS_BITS;
2660 uint8_t km_quarantined : KMEM_QUARANTINE_BITS;
2661 uint8_t km_avail_count : KMEM_AVAIL_BITS;
2662
2663 union {
2664 /*
2665 * On primary allocated chunk with KMEM_META_PRIMARY marker
2666 */
2667 uint16_t km_chunk_len;
2668 /*
2669 * On secondary allocated chunks
2670 */
2671 uint16_t km_page_idx;
2672 };
2673 LIST_ENTRY(kmem_page_meta) km_link;
2674 };
2675
2676 typedef LIST_HEAD(kmem_list_head, kmem_page_meta) kmem_list_head_t;
2677 struct kmem_sizeclass {
2678 vm_map_size_t ks_size;
2679 uint32_t ks_num_chunk;
2680 uint32_t ks_num_elem;
2681 crypto_random_ctx_t __zpercpu ks_rng_ctx;
2682 kmem_list_head_t ks_allfree_head[KMEM_FRONTS];
2683 kmem_list_head_t ks_partial_head[KMEM_FRONTS];
2684 kmem_list_head_t ks_full_head[KMEM_FRONTS];
2685 };
2686
2687 static struct kmem_sizeclass kmem_size_array[KMEM_NUM_SIZECLASS];
2688
2689 /*
2690 * Locks to synchronize metadata population
2691 */
2692 static LCK_GRP_DECLARE(kmem_locks_grp, "kmem_locks");
2693 static LCK_MTX_DECLARE(kmem_meta_region_lck, &kmem_locks_grp);
2694 #define kmem_meta_lock() lck_mtx_lock(&kmem_meta_region_lck)
2695 #define kmem_meta_unlock() lck_mtx_unlock(&kmem_meta_region_lck)
2696
2697 static SECURITY_READ_ONLY_LATE(struct mach_vm_range)
2698 kmem_meta_range[KMEM_RANGE_ID_NUM_PTR + 1];
2699 static SECURITY_READ_ONLY_LATE(struct kmem_page_meta *)
2700 kmem_meta_base[KMEM_RANGE_ID_NUM_PTR + 1];
2701 /*
2702 * Keeps track of metadata high water mark for each front
2703 */
2704 static struct kmem_page_meta *kmem_meta_hwm[KMEM_FRONTS];
2705 static SECURITY_READ_ONLY_LATE(vm_map_t)
2706 kmem_meta_map[KMEM_RANGE_ID_NUM_PTR + 1];
2707 static vm_map_size_t kmem_meta_size;
2708
2709 static uint32_t
2710 kmem_guard_count(struct kmem_sizeclass *kmem)
2711 {
2712 return kmem->ks_num_elem * KMEM_NUM_GUARDS / KMEM_NUM_SLOTS;
2713 }
2714
2715 static uint32_t
2716 kmem_quarantine_count(struct kmem_sizeclass *kmem)
2717 {
2718 return kmem->ks_num_elem * KMEM_NUM_QUARANTINE /
2719 KMEM_NUM_SLOTS;
2720 }
2721
2722 static uint32_t
2723 kmem_get_front(
2724 kmem_range_id_t range_id,
2725 bool from_right)
2726 {
2727 assert((range_id >= KMEM_RANGE_ID_FIRST) &&
2728 (range_id <= KMEM_RANGE_ID_NUM_PTR));
2729 return (range_id - KMEM_RANGE_ID_FIRST) * 2 + from_right;
2730 }
2731
2732 static inline uint32_t
2733 kmem_slot_idx_to_bit(
2734 uint32_t slot_idx,
2735 uint32_t size_idx __unused)
2736 {
2737 assert(slot_idx < kmem_size_array[size_idx].ks_num_elem);
2738 return 1ull << slot_idx;
2739 }
2740
2741 static uint32_t
2742 kmem_get_idx_from_size(vm_map_size_t size)
2743 {
2744 assert(size >= KMEM_MIN_SIZE && size <= KMEM_MAX_SIZE);
2745 return kmem_log2down(size - 1) - KMEM_START_IDX + 1;
2746 }
2747
2748 __abortlike
2749 static void
2750 kmem_invalid_size_idx(uint32_t idx)
2751 {
2752 panic("Invalid sizeclass idx %u", idx);
2753 }
2754
2755 static vm_map_size_t
2756 kmem_get_size_from_idx(uint32_t idx)
2757 {
2758 if (__improbable(idx >= KMEM_NUM_SIZECLASS)) {
2759 kmem_invalid_size_idx(idx);
2760 }
2761 return 1ul << (idx + KMEM_START_IDX);
2762 }
2763
2764 static inline uint16_t
2765 kmem_get_page_idx(struct kmem_page_meta *meta)
2766 {
2767 uint8_t page_marker = meta->km_page_marker;
2768
2769 return (page_marker == KMEM_META_PRIMARY) ? 0 : meta->km_page_idx;
2770 }
2771
2772 __abortlike
2773 static void
2774 kmem_invalid_chunk_len(struct kmem_page_meta *meta)
2775 {
2776 panic("Reading free chunks for meta %p where marker != KMEM_META_PRIMARY",
2777 meta);
2778 }
2779
2780 static inline uint16_t
2781 kmem_get_chunk_len(struct kmem_page_meta *meta)
2782 {
2783 if (__improbable(meta->km_page_marker != KMEM_META_PRIMARY)) {
2784 kmem_invalid_chunk_len(meta);
2785 }
2786
2787 return meta->km_chunk_len;
2788 }
2789
2790 __abortlike
2791 static void
2792 kmem_invalid_free_chunk_len(struct kmem_page_meta *meta)
2793 {
2794 panic("Reading free chunks for meta %p where marker != KMEM_META_FREE",
2795 meta);
2796 }
2797
2798 static inline uint32_t
2799 kmem_get_free_chunk_len(struct kmem_page_meta *meta)
2800 {
2801 if (__improbable(meta->km_page_marker != KMEM_META_FREE)) {
2802 kmem_invalid_free_chunk_len(meta);
2803 }
2804
2805 return meta->km_free_chunks;
2806 }
2807
2808 /*
2809 * Return the metadata corresponding to the specified address
2810 */
2811 static struct kmem_page_meta *
2812 kmem_addr_to_meta(
2813 vm_map_offset_t addr,
2814 vm_map_range_id_t range_id,
2815 vm_map_offset_t *range_start,
2816 uint64_t *meta_idx)
2817 {
2818 struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
2819
2820 *range_start = kmem_ranges[range_id].min_address;
2821 *meta_idx = (addr - *range_start) / KMEM_CHUNK_SIZE_MIN;
2822 return VM_FAR_ADD_PTR_UNBOUNDED(meta_base, *meta_idx);
2823 }
2824
2825 /*
2826 * Return the metadata start of the chunk that the address belongs to
2827 */
2828 static struct kmem_page_meta *
2829 kmem_addr_to_meta_start(
2830 vm_address_t addr,
2831 vm_map_range_id_t range_id,
2832 vm_map_offset_t *chunk_start)
2833 {
2834 vm_map_offset_t range_start;
2835 uint64_t meta_idx;
2836 struct kmem_page_meta *meta;
2837
2838 meta = kmem_addr_to_meta(addr, range_id, &range_start, &meta_idx);
2839 meta_idx -= kmem_get_page_idx(meta);
2840 meta = VM_FAR_ADD_PTR_UNBOUNDED(meta, -(ptrdiff_t)kmem_get_page_idx(meta));
2841 assert(meta->km_page_marker == KMEM_META_PRIMARY);
2842 *chunk_start = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN);
2843 return meta;
2844 }
2845
2846 __startup_func
2847 static void
2848 kmem_init_meta_front(
2849 struct kmem_page_meta *meta,
2850 kmem_range_id_t range_id,
2851 bool from_right)
2852 {
2853 kernel_memory_populate(trunc_page((vm_map_offset_t) meta), PAGE_SIZE,
2854 KMA_KOBJECT | KMA_ZERO | KMA_NOFAIL, VM_KERN_MEMORY_OSFMK);
2855 meta->km_page_marker = KMEM_META_START;
2856 if (!from_right) {
2857 meta++;
2858 kmem_meta_base[range_id] = meta;
2859 }
2860 kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta;
2861 }
2862
2863 __startup_func
2864 static void
2865 kmem_metadata_init(void)
2866 {
2867 for (kmem_range_id_t i = KMEM_RANGE_ID_FIRST; i <= kmem_ptr_ranges; i++) {
2868 vm_map_offset_t addr = kmem_meta_range[i].min_address;
2869 struct kmem_page_meta *meta;
2870 uint64_t meta_idx;
2871
2872 vm_map_will_allocate_early_map(&kmem_meta_map[i]);
2873 kmem_meta_map[i] = kmem_suballoc(kernel_map, &addr, kmem_meta_size,
2874 VM_MAP_CREATE_NEVER_FAULTS | VM_MAP_CREATE_DISABLE_HOLELIST,
2875 VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
2876 KMS_PERMANENT | KMS_NOFAIL | KMS_NOSOFTLIMIT,
2877 VM_KERN_MEMORY_OSFMK).kmr_submap;
2878
2879 kmem_meta_range[i].min_address = addr;
2880 kmem_meta_range[i].max_address = addr + kmem_meta_size;
2881
2882 meta = (struct kmem_page_meta *) kmem_meta_range[i].min_address;
2883 kmem_init_meta_front(meta, i, 0);
2884
2885 meta = kmem_addr_to_meta(kmem_ranges[i].max_address, i, &addr,
2886 &meta_idx);
2887 kmem_init_meta_front(meta, i, 1);
2888 }
2889 }
2890
2891 __startup_func
2892 static void
2893 kmem_init_front_head(
2894 struct kmem_sizeclass *ks,
2895 uint32_t front)
2896 {
2897 LIST_INIT(&ks->ks_allfree_head[front]);
2898 LIST_INIT(&ks->ks_partial_head[front]);
2899 LIST_INIT(&ks->ks_full_head[front]);
2900 }
2901
2902 __startup_func
2903 static void
2904 kmem_sizeclass_init(void)
2905 {
2906 for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2907 struct kmem_sizeclass *ks = &kmem_size_array[i];
2908 kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
2909
2910 ks->ks_size = kmem_get_size_from_idx(i);
2911 ks->ks_num_chunk = roundup(KMEM_NUM_SLOTS * ks->ks_size,
2912 KMEM_CHUNK_SIZE_MIN) / KMEM_CHUNK_SIZE_MIN;
2913 ks->ks_num_elem = (ks->ks_num_chunk * KMEM_CHUNK_SIZE_MIN) / ks->ks_size;
2914
2915 /*
2916 * Check that everything fits in the metadata.
2917 */
2918 assert(ks->ks_num_elem <=
2919 (sizeof(((struct kmem_page_meta *)0)->km_bitmap) * 8));
2920 assert(kmem_quarantine_count(ks) - 1 <
2921 (1u << KMEM_QUARANTINE_BITS));
2922 assert(ks->ks_num_elem - kmem_guard_count(ks) <
2923 (1u << KMEM_AVAIL_BITS));
2924
2925 for (; range_id <= KMEM_RANGE_ID_NUM_PTR; range_id++) {
2926 kmem_init_front_head(ks, kmem_get_front(range_id, 0));
2927 kmem_init_front_head(ks, kmem_get_front(range_id, 1));
2928 }
2929 }
2930 }
2931
2932 /*
2933 * This is done during EARLY_BOOT as it needs the corecrypto module to be
2934 * set up.
2935 */
2936 __startup_func
2937 static void
2938 kmem_crypto_init(void)
2939 {
2940 vm_size_t ctx_size = crypto_random_kmem_ctx_size();
2941
2942 for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2943 struct kmem_sizeclass *ks = &kmem_size_array[i];
2944
2945 ks->ks_rng_ctx = zalloc_percpu_permanent(ctx_size, ZALIGN_PTR);
2946 zpercpu_foreach(ctx, ks->ks_rng_ctx) {
2947 crypto_random_kmem_init(ctx);
2948 }
2949 }
2950 }
2951 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, kmem_crypto_init);
2952
2953 __abortlike
2954 static void
2955 kmem_validate_slot_panic(
2956 vm_map_offset_t addr,
2957 struct kmem_page_meta *meta,
2958 uint32_t slot_idx,
2959 uint32_t size_idx)
2960 {
2961 if (meta->km_page_marker != KMEM_META_PRIMARY) {
2962 panic("Metadata (%p) for addr (%p) not primary", meta, (void *)addr);
2963 }
2964 if (meta->km_sizeclass != size_idx) {
2965 panic("Metadata's (%p) sizeclass (%u != %u) changed during deletion",
2966 meta, meta->km_sizeclass, size_idx);
2967 }
2968 panic("Double free detected: Slot (%u) in meta (%p) for addr %p marked free",
2969 slot_idx, meta, (void *)addr);
2970 }
2971
2972 __abortlike
2973 static void
2974 kmem_invalid_slot_for_addr(
2975 mach_vm_range_t slot,
2976 vm_map_offset_t start,
2977 vm_map_offset_t end)
2978 {
2979 panic("Invalid kmem ptr slot [%p:%p] for allocation [%p:%p]",
2980 (void *)slot->min_address, (void *)slot->max_address,
2981 (void *)start, (void *)end);
2982 }
2983
2984 void
2985 kmem_validate_slot(
2986 vm_map_offset_t addr,
2987 struct kmem_page_meta *meta,
2988 uint32_t size_idx,
2989 uint32_t slot_idx)
2990 {
2991 if ((meta->km_page_marker != KMEM_META_PRIMARY) ||
2992 (meta->km_sizeclass != size_idx) ||
2993 ((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) != 0)) {
2994 kmem_validate_slot_panic(addr, meta, slot_idx, size_idx);
2995 }
2996 }
2997
2998 static void
2999 kmem_validate_slot_initial(
3000 mach_vm_range_t slot,
3001 vm_map_offset_t start,
3002 vm_map_offset_t end,
3003 struct kmem_page_meta *meta,
3004 uint32_t size_idx,
3005 uint32_t slot_idx)
3006 {
3007 if ((slot->min_address == 0) || (slot->max_address == 0) ||
3008 (start < slot->min_address) || (start >= slot->max_address) ||
3009 (end > slot->max_address)) {
3010 kmem_invalid_slot_for_addr(slot, start, end);
3011 }
3012
3013 kmem_validate_slot(start, meta, size_idx, slot_idx);
3014 }
3015
3016 uint32_t
3017 kmem_addr_get_slot_idx(
3018 vm_map_offset_t start,
3019 vm_map_offset_t end,
3020 vm_map_range_id_t range_id,
3021 struct kmem_page_meta **meta,
3022 uint32_t *size_idx,
3023 mach_vm_range_t slot)
3024 {
3025 vm_map_offset_t chunk_start;
3026 vm_map_size_t slot_size;
3027 uint32_t slot_idx;
3028
3029 *meta = kmem_addr_to_meta_start(start, range_id, &chunk_start);
3030 *size_idx = (*meta)->km_sizeclass;
3031 slot_size = kmem_get_size_from_idx(*size_idx);
3032 slot_idx = (start - chunk_start) / slot_size;
3033 slot->min_address = chunk_start + slot_idx * slot_size;
3034 slot->max_address = slot->min_address + slot_size;
3035
3036 kmem_validate_slot_initial(slot, start, end, *meta, *size_idx, slot_idx);
3037
3038 return slot_idx;
3039 }
3040
3041 static bool
3042 kmem_populate_needed(vm_offset_t from, vm_offset_t to)
3043 {
3044 #if KASAN
3045 #pragma unused(from, to)
3046 return true;
3047 #else
3048 vm_offset_t page_addr = trunc_page(from);
3049
3050 for (; page_addr < to; page_addr += PAGE_SIZE) {
3051 /*
3052 * This can race with another thread doing a populate on the same metadata
3053 * page, where we see an updated pmap but unmapped KASan shadow, causing a
3054 * fault in the shadow when we first access the metadata page. Avoid this
3055 * by always synchronizing on the kmem_meta_lock with KASan.
3056 */
3057 if (!pmap_find_phys(kernel_pmap, page_addr)) {
3058 return true;
3059 }
3060 }
3061
3062 return false;
3063 #endif /* !KASAN */
3064 }
3065
3066 static void
3067 kmem_populate_meta_locked(vm_offset_t from, vm_offset_t to)
3068 {
3069 vm_offset_t page_addr = trunc_page(from);
3070
3071 vmlp_api_start(KMEM_POPULATE_META_LOCKED);
3072
3073 vm_map_unlock(kernel_map);
3074
3075 vmlp_range_event(kernel_map, from, to - from);
3076
3077 for (; page_addr < to; page_addr += PAGE_SIZE) {
3078 for (;;) {
3079 kern_return_t ret = KERN_SUCCESS;
3080
3081 /*
3082 * All updates to kmem metadata are done under the kmem_meta_lock
3083 */
3084 kmem_meta_lock();
3085 if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
3086 ret = kernel_memory_populate(page_addr,
3087 PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
3088 VM_KERN_MEMORY_OSFMK);
3089 }
3090 kmem_meta_unlock();
3091
3092 if (ret == KERN_SUCCESS) {
3093 break;
3094 }
3095
3096 /*
3097 * We can't pass KMA_NOPAGEWAIT under a global lock as it leads
3098 * to bad system deadlocks, so if the allocation failed,
3099 * we need to do the VM_PAGE_WAIT() outside of the lock.
3100 */
3101 VM_PAGE_WAIT();
3102 }
3103 }
3104
3105 vm_map_lock(kernel_map);
3106 vmlp_api_end(KMEM_POPULATE_META_LOCKED, 0);
3107 }
3108
3109 __abortlike
3110 static void
3111 kmem_invalid_meta_panic(
3112 struct kmem_page_meta *meta,
3113 uint32_t slot_idx,
3114 struct kmem_sizeclass *sizeclass)
3115 {
3116 uint32_t size_idx = kmem_get_idx_from_size(sizeclass->ks_size);
3117
3118 if (slot_idx >= sizeclass->ks_num_elem) {
3119 panic("Invalid slot idx %u [0:%u] for meta %p", slot_idx,
3120 sizeclass->ks_num_elem, meta);
3121 }
3122 if (meta->km_sizeclass != size_idx) {
3123 panic("Invalid size_idx (%u != %u) in meta %p", size_idx,
3124 meta->km_sizeclass, meta);
3125 }
3126 panic("page_marker %u not primary in meta %p", meta->km_page_marker, meta);
3127 }
3128
3129 __abortlike
3130 static void
3131 kmem_slot_has_entry_panic(
3132 vm_map_entry_t entry,
3133 vm_map_offset_t addr)
3134 {
3135 panic("Entry (%p) already exists for addr (%p) being returned",
3136 entry, (void *)addr);
3137 }
3138
3139 __abortlike
3140 static void
3141 kmem_slot_not_found(
3142 struct kmem_page_meta *meta,
3143 uint32_t slot_idx)
3144 {
3145 panic("%uth free slot not found for meta %p bitmap %u", slot_idx, meta,
3146 meta->km_bitmap);
3147 }
3148
3149 /*
3150 * Returns a 16-bit random number between 0 and
3151 * upper_limit (inclusive)
3152 */
3153 __startup_func
3154 uint16_t
3155 kmem_get_random16(
3156 uint16_t upper_limit)
3157 {
3158 static uint64_t random_entropy;
3159 assert(upper_limit < UINT16_MAX);
3160 if (random_entropy == 0) {
3161 random_entropy = early_random();
3162 }
3163 uint32_t result = random_entropy & UINT32_MAX;
3164 random_entropy >>= 32;
3165 return (uint16_t)(result % (upper_limit + 1));
3166 }
3167
3168 static uint32_t
3169 kmem_get_nth_free_slot(
3170 struct kmem_page_meta *meta,
3171 uint32_t n,
3172 uint32_t bitmap)
3173 {
3174 uint32_t zeros_seen = 0, ones_seen = 0;
3175
3176 while (bitmap) {
3177 uint32_t count = __builtin_ctz(bitmap);
3178
3179 zeros_seen += count;
3180 bitmap >>= count;
3181 if (__probable(~bitmap)) {
3182 count = __builtin_ctz(~bitmap);
3183 } else {
3184 count = 32;
3185 }
3186 if (count + ones_seen > n) {
3187 meta->km_avail_count -= 1;
3188 return zeros_seen + n;
3189 }
3190 ones_seen += count;
3191 bitmap >>= count;
3192 }
3193
3194 kmem_slot_not_found(meta, n);
3195 }
3196
3197
3198 static uint32_t
3199 kmem_get_next_slot(
3200 struct kmem_page_meta *meta,
3201 struct kmem_sizeclass *sizeclass,
3202 uint32_t bitmap)
3203 {
3204 uint32_t num_slots = meta->km_avail_count + meta->km_quarantined +
3205 kmem_guard_count(sizeclass);
3206 uint64_t slot_idx = 0;
3207
3208 assert(meta->km_avail_count > 0 &&
3209 num_slots == __builtin_popcount(meta->km_bitmap));
3210
3211 if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
3212 /*
3213 * Use early random prior to early boot as the ks_rng_ctx requires
3214 * the corecrypto module to be setup before it is initialized and
3215 * used.
3216 *
3217 * num_slots can't be 0 as we take this path when we have more than
3218 * one slot left.
3219 */
3220 slot_idx = kmem_get_random16((uint16_t)num_slots - 1);
3221 } else {
3222 crypto_random_uniform(zpercpu_get(sizeclass->ks_rng_ctx),
3223 num_slots, &slot_idx);
3224 }
3225
3226 return kmem_get_nth_free_slot(meta, slot_idx, bitmap);
3227 }
3228
3229 /*
3230 * Returns an unallocated slot from the given metadata
3231 */
3232 static vm_map_offset_t
3233 kmem_get_addr_from_meta(
3234 struct kmem_page_meta *meta,
3235 vm_map_range_id_t range_id,
3236 struct kmem_sizeclass *sizeclass,
3237 vm_map_entry_t *entry)
3238 {
3239 vm_map_offset_t addr;
3240 vm_map_size_t size = sizeclass->ks_size;
3241 uint32_t size_idx = kmem_get_idx_from_size(size);
3242 uint64_t meta_idx = meta - kmem_meta_base[range_id];
3243 mach_vm_offset_t range_start = kmem_ranges[range_id].min_address;
3244 uint32_t slot_bit;
3245 uint32_t slot_idx = kmem_get_next_slot(meta, sizeclass, meta->km_bitmap);
3246
3247 if ((slot_idx >= sizeclass->ks_num_elem) ||
3248 (meta->km_sizeclass != size_idx) ||
3249 (meta->km_page_marker != KMEM_META_PRIMARY)) {
3250 kmem_invalid_meta_panic(meta, slot_idx, sizeclass);
3251 }
3252
3253 slot_bit = kmem_slot_idx_to_bit(slot_idx, size_idx);
3254 meta->km_bitmap &= ~slot_bit;
3255
3256 addr = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN) + (slot_idx * size);
3257 assert(kmem_range_contains_fully(range_id, addr, size));
3258 if (vm_map_lookup_entry(kernel_map, addr, entry)) {
3259 kmem_slot_has_entry_panic(*entry, addr);
3260 }
3261 if ((*entry != vm_map_to_entry(kernel_map)) &&
3262 ((*entry)->vme_next != vm_map_to_entry(kernel_map)) &&
3263 ((*entry)->vme_next->vme_start < (addr + size))) {
3264 kmem_slot_has_entry_panic(*entry, addr);
3265 }
3266 return addr;
3267 }
3268
3269 __abortlike
3270 static void
3271 kmem_range_out_of_va(
3272 kmem_range_id_t range_id,
3273 uint32_t num_chunks)
3274 {
3275 panic("No more VA to allocate %u chunks in range %u", num_chunks, range_id);
3276 }
3277
3278 static void
3279 kmem_init_allocated_chunk(
3280 struct kmem_page_meta *meta,
3281 struct kmem_sizeclass *sizeclass,
3282 uint32_t size_idx)
3283 {
3284 uint32_t meta_num = sizeclass->ks_num_chunk;
3285 uint32_t num_elem = sizeclass->ks_num_elem;
3286
3287 meta->km_bitmap = (1ull << num_elem) - 1;
3288 meta->km_chunk_len = (uint16_t)meta_num;
3289 meta->km_avail_count = (uint8_t)(num_elem - kmem_guard_count(sizeclass));
3290 meta->km_quarantined = 0;
3291 assert(LIST_NEXT(meta, km_link) == NULL);
3292 assert(meta->km_link.le_prev == NULL);
3293 meta->km_sizeclass = (uint8_t)size_idx;
3294 meta->km_page_marker = KMEM_META_PRIMARY;
3295 meta++;
3296 for (uint32_t i = 1; i < meta_num; i++) {
3297 meta->km_page_idx = (uint16_t)i;
3298 meta->km_avail_count = 0;
3299 meta->km_quarantined = 0;
3300 meta->km_sizeclass = (uint8_t)size_idx;
3301 meta->km_page_marker = 0;
3302 meta->km_bitmap = 0;
3303 meta++;
3304 }
3305 }
3306
3307 static uint32_t
3308 kmem_get_additional_meta(
3309 struct kmem_page_meta *meta,
3310 uint32_t meta_req,
3311 bool from_right,
3312 struct kmem_page_meta **adj_free_meta)
3313 {
3314 struct kmem_page_meta *meta_prev = from_right ? meta : (meta - 1);
3315
3316 if (meta_prev->km_page_marker == KMEM_META_FREE) {
3317 uint32_t chunk_len = kmem_get_free_chunk_len(meta_prev);
3318
3319 *adj_free_meta = from_right ? meta_prev : (meta_prev - chunk_len + 1);
3320 meta_req -= chunk_len;
3321 } else {
3322 *adj_free_meta = NULL;
3323 }
3324
3325 return meta_req;
3326 }
3327
3328
3329 static struct kmem_page_meta *
3330 kmem_get_new_chunk(
3331 vm_map_range_id_t range_id,
3332 bool from_right,
3333 uint32_t size_idx,
3334 uint32_t front)
3335 {
3336 struct kmem_sizeclass *sizeclass = &kmem_size_array[size_idx];
3337 struct kmem_page_meta *start, *end, *meta_update;
3338 struct kmem_page_meta *adj_free_meta = NULL;
3339 uint32_t meta_req = sizeclass->ks_num_chunk;
3340
3341 for (;;) {
3342 struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3343 struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3344 struct kmem_page_meta *meta;
3345 vm_offset_t start_addr, end_addr;
3346 uint32_t meta_num;
3347
3348 meta = from_right ? metab : metaf;
3349 meta_num = kmem_get_additional_meta(meta, meta_req, from_right,
3350 &adj_free_meta);
3351
3352 if (metaf + meta_num >= metab) {
3353 kmem_range_out_of_va(range_id, meta_num);
3354 }
3355
3356 start = from_right ? (metab - meta_num) : metaf;
3357 end = from_right ? metab : (metaf + meta_num);
3358
3359 start_addr = (vm_offset_t)start;
3360 end_addr = (vm_offset_t)end;
3361
3362 /*
3363 * If the new high watermark stays on the same page,
3364 * no need to populate and drop the lock.
3365 */
3366 if (!page_aligned(from_right ? end_addr : start_addr) &&
3367 trunc_page(start_addr) == trunc_page(end_addr - 1)) {
3368 break;
3369 }
3370 if (!kmem_populate_needed(start_addr, end_addr)) {
3371 break;
3372 }
3373
3374 kmem_populate_meta_locked(start_addr, end_addr);
3375
3376 /*
3377 * Since we dropped the lock, reassess whether the conditions still hold:
3378 * - the HWM we are changing must not have moved
3379 * - the other HWM must not intersect with ours
3380 * - in case of coalescing, the adjacent free meta must still
3381 * be free and of the same size.
3382 *
3383 * If we failed to grow, reevaluate whether freelists have
3384 * entries now by returning NULL.
3385 */
3386 metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3387 metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3388 if (meta != (from_right ? metab : metaf)) {
3389 return NULL;
3390 }
3391 if (metaf + meta_num >= metab) {
3392 kmem_range_out_of_va(range_id, meta_num);
3393 }
3394 if (adj_free_meta) {
3395 if (adj_free_meta->km_page_marker != KMEM_META_FREE ||
3396 kmem_get_free_chunk_len(adj_free_meta) !=
3397 meta_req - meta_num) {
3398 return NULL;
3399 }
3400 }
3401
3402 break;
3403 }
3404
3405 /*
3406 * If there is an adjacent free chunk, remove it from the free list
3407 */
3408 if (adj_free_meta) {
3409 LIST_REMOVE(adj_free_meta, km_link);
3410 LIST_NEXT(adj_free_meta, km_link) = NULL;
3411 adj_free_meta->km_link.le_prev = NULL;
3412 }
3413
3414 /*
3415 * Update hwm
3416 */
3417 meta_update = from_right ? start : end;
3418 kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta_update;
3419
3420 /*
3421 * Initialize metadata
3422 */
3423 start = from_right ? start : (end - meta_req);
3424 kmem_init_allocated_chunk(start, sizeclass, size_idx);
3425 LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], start, km_link);
3426
3427 return start;
3428 }
3429
3430 static void
3431 kmem_requeue_meta(
3432 struct kmem_page_meta *meta,
3433 struct kmem_list_head *head)
3434 {
3435 LIST_REMOVE(meta, km_link);
3436 LIST_INSERT_HEAD(head, meta, km_link);
3437 }
3438
3439 /*
3440 * Return the corresponding sizeclass to stash free chunks in
3441 */
3442 __abortlike
3443 static void
3444 kmem_invalid_chunk_num(uint32_t chunks)
3445 {
3446 panic("Invalid number of chunks %u\n", chunks);
3447 }
3448
3449 static uint32_t
3450 kmem_get_size_idx_for_chunks(uint32_t chunks)
3451 {
3452 for (uint32_t i = KMEM_NUM_SIZECLASS - 1; i > 0; i--) {
3453 if (chunks >= kmem_size_array[i].ks_num_chunk) {
3454 return i;
3455 }
3456 }
3457 kmem_invalid_chunk_num(chunks);
3458 }
3459
3460 static void
3461 kmem_clear_meta_range(struct kmem_page_meta *meta, uint32_t count)
3462 {
3463 bzero(meta, count * sizeof(struct kmem_page_meta));
3464 }
3465
3466 static void
3467 kmem_check_meta_range_is_clear(struct kmem_page_meta *meta, uint32_t count)
3468 {
3469 #if MACH_ASSERT
3470 size_t size = count * sizeof(struct kmem_page_meta);
3471
3472 assert(memcmp_zero_ptr_aligned(meta, size) == 0);
3473 #else
3474 #pragma unused(meta, count)
3475 #endif
3476 }
3477
3478 /*!
3479 * @function kmem_init_free_chunk()
3480 *
3481 * @discussion
3482 * This function prepares a range of chunks to be put on a free list.
3483 * The first and last metadata might be dirty, but the "inner" ones
3484 * must be zero filled by the caller prior to calling this function.
3485 */
3486 static void
3487 kmem_init_free_chunk(
3488 struct kmem_page_meta *meta,
3489 uint32_t num_chunks,
3490 uint32_t front)
3491 {
3492 struct kmem_sizeclass *sizeclass;
3493 uint32_t size_idx = kmem_get_size_idx_for_chunks(num_chunks);
3494
3495 if (num_chunks > 2) {
3496 kmem_check_meta_range_is_clear(meta + 1, num_chunks - 2);
3497 }
3498
3499 meta[0] = (struct kmem_page_meta){
3500 .km_free_chunks = num_chunks,
3501 .km_page_marker = KMEM_META_FREE,
3502 .km_sizeclass = (uint8_t)size_idx,
3503 };
3504 if (num_chunks > 1) {
3505 meta[num_chunks - 1] = (struct kmem_page_meta){
3506 .km_free_chunks = num_chunks,
3507 .km_page_marker = KMEM_META_FREE,
3508 .km_sizeclass = (uint8_t)size_idx,
3509 };
3510 }
3511
3512 sizeclass = &kmem_size_array[size_idx];
3513 LIST_INSERT_HEAD(&sizeclass->ks_allfree_head[front], meta, km_link);
3514 }
3515
3516 static struct kmem_page_meta *
3517 kmem_get_free_chunk_from_list(
3518 struct kmem_sizeclass *org_sizeclass,
3519 uint32_t size_idx,
3520 uint32_t front)
3521 {
3522 struct kmem_sizeclass *sizeclass;
3523 uint32_t num_chunks = org_sizeclass->ks_num_chunk;
3524 struct kmem_page_meta *meta;
3525 uint32_t idx = size_idx;
3526
3527 while (idx < KMEM_NUM_SIZECLASS) {
3528 sizeclass = &kmem_size_array[idx];
3529 meta = LIST_FIRST(&sizeclass->ks_allfree_head[front]);
3530 if (meta) {
3531 break;
3532 }
3533 idx++;
3534 }
3535
3536 /*
3537 * Trim if larger in size
3538 */
3539 if (meta) {
3540 uint32_t num_chunks_free = kmem_get_free_chunk_len(meta);
3541
3542 assert(meta->km_page_marker == KMEM_META_FREE);
3543 LIST_REMOVE(meta, km_link);
3544 LIST_NEXT(meta, km_link) = NULL;
3545 meta->km_link.le_prev = NULL;
3546 if (num_chunks_free > num_chunks) {
3547 num_chunks_free -= num_chunks;
3548 kmem_init_free_chunk(meta + num_chunks, num_chunks_free, front);
3549 }
3550
3551 kmem_init_allocated_chunk(meta, org_sizeclass, size_idx);
3552 LIST_INSERT_HEAD(&org_sizeclass->ks_partial_head[front], meta, km_link);
3553 }
3554
3555 return meta;
3556 }
3557
3558 kern_return_t
3559 kmem_locate_space(
3560 vm_map_size_t size,
3561 vm_map_range_id_t range_id,
3562 bool from_right,
3563 vm_map_offset_t *start_inout,
3564 vm_map_entry_t *entry_out)
3565 {
3566 vm_map_entry_t entry;
3567 uint32_t size_idx = kmem_get_idx_from_size(size);
3568 uint32_t front = kmem_get_front(range_id, from_right);
3569 struct kmem_sizeclass *sizeclass = &kmem_size_array[size_idx];
3570 struct kmem_page_meta *meta;
3571
3572 assert(size <= sizeclass->ks_size);
3573
3574 do {
3575 /*
3576 * Attempt to find space trying:
3577 * 1. partial heads;
3578 * 2. free chunks in the segregated free-lists;
3579 * 3. extending the metadata range.
3580 */
3581 meta = LIST_FIRST(&sizeclass->ks_partial_head[front]) ?:
3582 kmem_get_free_chunk_from_list(sizeclass, size_idx, front) ?:
3583 kmem_get_new_chunk(range_id, from_right, size_idx, front);
3584 } while (meta == NULL);
3585
3586 *start_inout = kmem_get_addr_from_meta(meta, range_id, sizeclass, &entry);
3587
3588 if (meta->km_avail_count == 0) {
3589 kmem_requeue_meta(meta, &sizeclass->ks_full_head[front]);
3590 }
3591 if (entry_out) {
3592 *entry_out = entry;
3593 }
3594
3595 return KERN_SUCCESS;
3596 }
3597
3598 /*
3599 * Determine whether the given metadata was allocated from the right
3600 */
3601 static bool
3602 kmem_meta_is_from_right(
3603 kmem_range_id_t range_id,
3604 struct kmem_page_meta *meta)
3605 {
3606 struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3607 __assert_only struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3608 struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
3609 struct kmem_page_meta *meta_end;
3610
3611 meta_end = (struct kmem_page_meta *)kmem_meta_range[range_id].max_address;
3612
3613 if ((meta >= meta_base) && (meta < metaf)) {
3614 return false;
3615 }
3616
3617 assert(meta >= metab && meta < meta_end);
3618 return true;
3619 }
3620
3621 static void
3622 kmem_free_chunk(
3623 kmem_range_id_t range_id,
3624 struct kmem_page_meta *meta,
3625 bool from_right)
3626 {
3627 struct kmem_page_meta *meta_coalesce = meta - 1;
3628 struct kmem_page_meta *meta_start = meta;
3629 uint32_t num_chunks = kmem_get_chunk_len(meta);
3630 uint32_t add_chunks;
3631 struct kmem_page_meta *meta_end = meta + num_chunks;
3632 struct kmem_page_meta *meta_hwm_l, *meta_hwm_r;
3633 uint32_t front = kmem_get_front(range_id, from_right);
3634
3635 meta_hwm_l = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3636 meta_hwm_r = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3637
3638 LIST_REMOVE(meta, km_link);
3639 kmem_clear_meta_range(meta, num_chunks);
3640
3641 /*
3642 * Coalesce left
3643 */
3644 if (((from_right && (meta_coalesce >= meta_hwm_r)) || !from_right) &&
3645 (meta_coalesce->km_page_marker == KMEM_META_FREE)) {
3646 meta_start = meta_coalesce - kmem_get_free_chunk_len(meta_coalesce) + 1;
3647 add_chunks = kmem_get_free_chunk_len(meta_start);
3648 num_chunks += add_chunks;
3649 LIST_REMOVE(meta_start, km_link);
3650 kmem_clear_meta_range(meta_start + add_chunks - 1, 1);
3651 }
3652
3653 /*
3654 * Coalesce right
3655 */
3656 if (((!from_right && (meta_end < meta_hwm_l)) || from_right) &&
3657 (meta_end->km_page_marker == KMEM_META_FREE)) {
3658 add_chunks = kmem_get_free_chunk_len(meta_end);
3659 LIST_REMOVE(meta_end, km_link);
3660 kmem_clear_meta_range(meta_end, 1);
3661 meta_end = meta_end + add_chunks;
3662 num_chunks += add_chunks;
3663 }
3664
3665 kmem_init_free_chunk(meta_start, num_chunks, front);
3666 }
3667
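/*
 * Free a single slot back into its chunk: set the slot's bit in the
 * chunk bitmap, then either release the whole chunk (everything is
 * free), park the slot in the per-chunk quarantine, or make it
 * available again, requeueing a previously-full chunk onto the
 * partial list.
 */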
static void
kmem_free_slot(
    kmem_range_id_t range_id,
    mach_vm_range_t slot)
{
    struct kmem_page_meta *meta;
    vm_map_offset_t chunk_start;
    uint32_t size_idx, slot_idx;
    struct kmem_sizeclass *sizeclass;
    vm_map_size_t slot_size;

    meta = kmem_addr_to_meta_start(slot->min_address, range_id, &chunk_start);
    size_idx = meta->km_sizeclass;

    sizeclass = &kmem_size_array[size_idx];
    slot_size = kmem_get_size_from_idx(size_idx);
    slot_idx = (slot->min_address - chunk_start) / slot_size;
    assert((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) == 0);
    meta->km_bitmap |= kmem_slot_idx_to_bit(slot_idx, size_idx);

    if (meta->km_bitmap == ((1u << sizeclass->ks_num_elem) - 1)) {
        /*
         * If the entire chunk is empty, add it to the empty list
         */
        bool from_right = kmem_meta_is_from_right(range_id, meta);

        kmem_free_chunk(range_id, meta, from_right);
    } else if (meta->km_avail_count + meta->km_quarantined + 1 <
        kmem_quarantine_count(sizeclass)) {
        /*
         * If we're below quarantine levels, quarantine the slot
         * and move on.
         */
        meta->km_quarantined += 1;
    } else {
        /*
         * If we freed into a full chunk, move it to the partial list
         */
        if (meta->km_avail_count == 0) {
            uint32_t front = kmem_get_front(range_id,
                kmem_meta_is_from_right(range_id, meta));

            kmem_requeue_meta(meta, &sizeclass->ks_partial_head[front]);
        }

        meta->km_avail_count += meta->km_quarantined + 1;
        meta->km_quarantined = 0;
    }
}

void
kmem_free_space(
    vm_map_offset_t start,
    vm_map_offset_t end,
    vm_map_range_id_t range_id,
    mach_vm_range_t slot)
{
    bool entry_present = false;
    vm_map_entry_t prev_entry;
    vm_map_entry_t next_entry;

    if ((slot->min_address == start) && (slot->max_address == end)) {
        /*
         * Entire slot is being freed at once
         */
        return kmem_free_slot(range_id, slot);
    }

    entry_present = vm_map_lookup_entry(kernel_map, start, &prev_entry);
    assert(!entry_present);
    next_entry = prev_entry->vme_next;

    if (((prev_entry == vm_map_to_entry(kernel_map) ||
        prev_entry->vme_end <= slot->min_address)) &&
        (next_entry == vm_map_to_entry(kernel_map) ||
        (next_entry->vme_start >= slot->max_address))) {
        /*
         * Free entire slot
         */
        kmem_free_slot(range_id, slot);
    }
}

#pragma mark kmem init

/*
 * The default percentage of memory that can be mlocked is scaled based on the total
 * amount of memory in the system. These percentages are calculated
 * offline and stored in this table. We index this table by
 * log2(max_mem) - VM_USER_WIREABLE_MIN_CONFIG. We clamp this index in the range
 * [0, sizeof(wire_limit_percents) / sizeof(vm_map_size_t))
 *
 * Note that these values were picked for mac.
 * If we ever have very large memory config arm devices, we may want to revisit
 * since the kernel overhead is smaller there due to the larger page size.
 */

/* Start scaling iff we're managing > 2^32 = 4GB of RAM. */
#define VM_USER_WIREABLE_MIN_CONFIG 32
#if CONFIG_JETSAM
/* Systems with jetsam can wire a bit more b/c the system can relieve wired
 * pressure.
 */
static vm_map_size_t wire_limit_percents[] =
{ 80, 80, 80, 80, 82, 85, 88, 91, 94, 97};
#else
static vm_map_size_t wire_limit_percents[] =
{ 70, 73, 76, 79, 82, 85, 88, 91, 94, 97};
#endif /* CONFIG_JETSAM */

/* Set limit to 95% of DRAM if serverperfmode=1 */
#define VM_USER_SERVERPERF_WIRE_LIMIT_PERCENT 95
/* Use special serverperfmode behavior iff DRAM > 2^35 = 32GiB of RAM. */
#define VM_USER_SERVERPERF_WIREABLE_MIN_CONFIG 35

/*
 * Sets the default global user wire limit which limits the amount of
 * memory that can be locked via mlock() based on the above algorithm.
 * This can be overridden via a sysctl.
 */
static void
kmem_set_user_wire_limits(void)
{
    uint64_t available_mem_log;
    uint64_t max_wire_percent;
    size_t wire_limit_percents_length = sizeof(wire_limit_percents) /
        sizeof(vm_map_size_t);
    vm_map_size_t limit;
    uint64_t config_memsize = max_mem;
#if defined(XNU_TARGET_OS_OSX)
    config_memsize = max_mem_actual;
#endif /* defined(XNU_TARGET_OS_OSX) */

    available_mem_log = bit_floor(config_memsize);

    if (serverperfmode &&
        (available_mem_log >= VM_USER_SERVERPERF_WIREABLE_MIN_CONFIG)) {
        max_wire_percent = VM_USER_SERVERPERF_WIRE_LIMIT_PERCENT;
    } else {
        if (available_mem_log < VM_USER_WIREABLE_MIN_CONFIG) {
            available_mem_log = 0;
        } else {
            available_mem_log -= VM_USER_WIREABLE_MIN_CONFIG;
        }
        if (available_mem_log >= wire_limit_percents_length) {
            available_mem_log = wire_limit_percents_length - 1;
        }
        max_wire_percent = wire_limit_percents[available_mem_log];
    }

    limit = config_memsize * max_wire_percent / 100;
    /* Cap the number of non-lockable bytes at VM_NOT_USER_WIREABLE_MAX */
    if (config_memsize - limit > VM_NOT_USER_WIREABLE_MAX) {
        limit = config_memsize - VM_NOT_USER_WIREABLE_MAX;
    }

    vm_global_user_wire_limit = limit;
    /* the default per task limit is the same as the global limit */
    vm_per_task_user_wire_limit = limit;
    vm_add_wire_count_over_global_limit = 0;
    vm_add_wire_count_over_user_limit = 0;
}
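
/*
 * Worked example (hypothetical config): a 16GB (2^34 byte) system
 * without jetsam yields available_mem_log == 34, index 34 - 32 == 2,
 * and wire_limit_percents[2] == 76, i.e. a limit of ~12.16GB; the
 * VM_NOT_USER_WIREABLE_MAX clamp only kicks in on configs large enough
 * that the remaining non-wireable share would exceed that cap.
 */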

#define KMEM_MAX_CLAIMS 50
__startup_data
struct kmem_range_startup_spec kmem_claims[KMEM_MAX_CLAIMS] = {};

#if !MACH_ASSERT
__startup_data
#endif /* !MACH_ASSERT */
uint32_t kmem_claim_count = 0;

#if MACH_ASSERT
/**
 * Save off some minimal information about the ranges for consumption by
 * post-lockdown tests.
 */
static struct mach_vm_range kmem_test_saved_ranges[KMEM_MAX_CLAIMS];
#endif /* MACH_ASSERT */

/**
 * For a requested claim size (i.e. kc_size), get the number of bytes which
 * should actually be allocated for a region in order to be able to properly
 * provide the requested size (the allocation size).
 *
 * This allocation size is always greater or equal to the claim size. It can,
 * for example, include additional space as required by the kernel memory
 * configuration.
 *
 * @param known_last Is the claim in question known to be the last region after
 * all placing has completed? The size for a known_last allocation is always
 * less than or equal to a non-known_last allocation of the same size.
 */
__startup_func
static vm_map_size_t
kmem_claim_to_allocation_size(vm_map_size_t claim_size, bool known_last)
{
    (void)known_last;
    /*
     * Allocation size and claim size are identical.
     */
    return claim_size;
}

/**
 * Compute the largest claim which can be made from a given allocation size.
 */
static vm_map_size_t
kmem_allocation_to_claim_size(vm_map_size_t allocation_size)
{
    /*
     * Allocation size and claim size are identical.
     */
    return allocation_size;
}

__startup_func
void
kmem_range_startup_init(
    struct kmem_range_startup_spec *sp)
{
    assert(kmem_claim_count < KMEM_MAX_CLAIMS - KMEM_RANGE_COUNT);
    if (sp->kc_calculate_sz) {
        sp->kc_size = (sp->kc_calculate_sz)();
    }
    if (sp->kc_size) {
        kmem_claims[kmem_claim_count] = *sp;
        kmem_claim_count++;
    }
}

static vm_offset_t
kmem_fuzz_start(void)
{
    vm_offset_t kmapoff_kaddr = 0;
    uint32_t kmapoff_pgcnt;

    kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */

    vm_map_size_t kmapoff_size = ptoa(kmapoff_pgcnt);

    kmem_alloc(kernel_map, &kmapoff_kaddr, kmapoff_size,
        KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY,
        VM_KERN_MEMORY_OSFMK);

    return kmapoff_kaddr + kmapoff_size;
}

/*
 * Generate a randomly shuffled array of indices from 0 to count - 1
 */
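/*
 * This is the "inside-out" Fisher-Yates construction: each index i is
 * placed at a uniformly random position j in [0, i], displacing the
 * previous occupant to slot i. Example trace for count == 3 with draws
 * j = 0, 0, 1: [0] -> [1, 0] -> [1, 2, 0].
 */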
__startup_func
void
kmem_shuffle(
    uint16_t *shuffle_buf,
    uint16_t count)
{
    for (uint16_t i = 0; i < count; i++) {
        uint16_t j = kmem_get_random16(i);
        if (j != i) {
            shuffle_buf[i] = shuffle_buf[j];
        }
        shuffle_buf[j] = i;
    }
}

__startup_func
static void
kmem_shuffle_claims(void)
{
    uint16_t shuffle_buf[KMEM_MAX_CLAIMS] = {};
    uint16_t limit = (uint16_t)kmem_claim_count;

    kmem_shuffle(&shuffle_buf[0], limit);
    for (uint16_t i = 0; i < limit; i++) {
        struct kmem_range_startup_spec tmp = kmem_claims[i];
        kmem_claims[i] = kmem_claims[shuffle_buf[i]];
        kmem_claims[shuffle_buf[i]] = tmp;
    }
}

__startup_func
static void
kmem_readjust_ranges(
    uint32_t cur_idx)
{
    assert(cur_idx != 0);
    uint32_t j = cur_idx - 1, random;
    struct kmem_range_startup_spec sp = kmem_claims[cur_idx];
    struct mach_vm_range *sp_range = sp.kc_range;
    /*
     * Even if sp is currently last, it will never be last after it is moved.
     * As such, we want to bump other claims over it and include any necessary
     * padding for a non-last claim.
     *
     * While changing which claim is last can impact the total VA usage, since a
     * known_last allocation size is guaranteed to always be less-than-or-equal
     * to a non-known_last allocation (which is used for pre-placement sizing),
     * we will always have enough space so long as the pre-placement sizing had
     * enough space.
     */
    vm_map_offset_t sp_allocation_size =
        kmem_claim_to_allocation_size(sp.kc_size, /* known_last */ false);

    /*
     * Find max index where restriction is met
     */
    for (; j > 0; j--) {
        struct kmem_range_startup_spec spj = kmem_claims[j];
        vm_map_offset_t max_start = spj.kc_range->min_address;
        if (spj.kc_flags & KC_NO_MOVE) {
            panic("kmem_range_init: Can't scramble with multiple constraints");
        }
        if (max_start <= sp_range->min_address) {
            break;
        }
    }

    /*
     * Pick a random index from 0 to max index and shift claims to the right
     * to make room for restricted claim
     */
    random = kmem_get_random16((uint16_t)j);
    assert(random <= j);

    sp_range->min_address = kmem_claims[random].kc_range->min_address;
    sp_range->max_address = sp_range->min_address + sp.kc_size;

    for (j = cur_idx - 1; j >= random && j != UINT32_MAX; j--) {
        struct kmem_range_startup_spec spj = kmem_claims[j];
        struct mach_vm_range *range = spj.kc_range;
        range->min_address += sp_allocation_size;
        range->max_address += sp_allocation_size;
        kmem_claims[j + 1] = spj;
    }

    sp.kc_flags |= KC_NO_MOVE;
    kmem_claims[random] = sp;
}

__startup_func
static void
kmem_add_ptr_claims(void)
{
    uint64_t kmem_meta_num, kmem_ptr_chunks;
    vm_map_size_t org_ptr_range_size __assert_only;

    org_ptr_range_size = ptr_range_size;

    ptr_range_size -= PAGE_SIZE;
    ptr_range_size *= KMEM_CHUNK_SIZE_MIN;
    ptr_range_size /= (KMEM_CHUNK_SIZE_MIN + sizeof(struct kmem_page_meta));
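
    /*
     * The three statements above solve for the usable VA: with R the
     * original ptr_range_size, C = KMEM_CHUNK_SIZE_MIN and m the size of
     * one struct kmem_page_meta, we need D + (D / C) * m <= R - PAGE_SIZE,
     * i.e. D = (R - PAGE_SIZE) * C / (C + m), before D is rounded down
     * to a whole number of chunks below.
     */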

    kmem_ptr_chunks = ptr_range_size / KMEM_CHUNK_SIZE_MIN;
    ptr_range_size = kmem_ptr_chunks * KMEM_CHUNK_SIZE_MIN;

    kmem_meta_num = kmem_ptr_chunks + 2;
    kmem_meta_size = round_page(kmem_meta_num * sizeof(struct kmem_page_meta));

    assert(kmem_meta_size + ptr_range_size <= org_ptr_range_size);
    /*
     * Add claims for kmem's ranges
     */
    for (uint32_t i = 0; i < kmem_ptr_ranges; i++) {
        struct kmem_range_startup_spec kmem_spec = {
            .kc_name = "kmem_ptr_range",
            .kc_range = &kmem_ranges[KMEM_RANGE_ID_PTR_0 + i],
            .kc_size = ptr_range_size,
            .kc_flags = KC_NO_ENTRY,
        };
        kmem_claims[kmem_claim_count++] = kmem_spec;

        struct kmem_range_startup_spec kmem_meta_spec = {
            .kc_name = "kmem_ptr_range_meta",
            .kc_range = &kmem_meta_range[KMEM_RANGE_ID_PTR_0 + i],
            .kc_size = kmem_meta_size,
            .kc_flags = KC_NONE,
        };
        kmem_claims[kmem_claim_count++] = kmem_meta_spec;
    }
}

__startup_func
static void
kmem_add_extra_claims(void)
{
    vm_map_size_t largest_free_size = 0, total_claims = 0;
    vm_map_size_t sane_sprayqtn_size = 0, sprayqtn_allocation_size = 0;
    vm_map_size_t ptr_total_allocation_size = 0;

    vm_map_sizes(kernel_map, NULL, NULL, &largest_free_size);
    largest_free_size = trunc_page(largest_free_size);

    /*
     * kasan and configs w/o *TRR need to have just one ptr range due to
     * resource constraints.
     */
#if !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
    kmem_ptr_ranges = 1;
#endif
    /*
     * Determine size of data and pointer kmem_ranges
     */
    for (uint32_t i = 0; i < kmem_claim_count; i++) {
        struct kmem_range_startup_spec sp_i = kmem_claims[i];

        total_claims += kmem_claim_to_allocation_size(
            sp_i.kc_size, /* known_last */ false);
    }
    assert((total_claims & PAGE_MASK) == 0);

    largest_free_size -= total_claims;

    /*
     * Use half the total available VA for all pointer allocations (this
     * includes the kmem_sprayqtn range). Given that we have 4 total
     * ranges divide the available VA by 8.
     */
    ptr_range_size = largest_free_size / ((kmem_ptr_ranges + 1) * 2);

    sprayqtn_range_size = ptr_range_size;
    sane_sprayqtn_size = kmem_claim_to_allocation_size(
        /* claim_size */ sane_size / 2, /* known_last */ false);
    if (sprayqtn_range_size > sane_sprayqtn_size) {
        vm_map_size_t sprayqtn_extra;

        /*
         * Spray quarantine doesn't need that much space.
         * Shrink it to something reasonable and equally share the leftover VA
         * with the other pointer ranges.
         */
        sprayqtn_extra = sprayqtn_range_size - sane_sprayqtn_size;
        sprayqtn_range_size -= sprayqtn_extra;
        ptr_range_size += sprayqtn_extra / kmem_ptr_ranges;
    }

    ptr_range_size = round_page(ptr_range_size);
    sprayqtn_range_size = round_page(sprayqtn_range_size);

    /* Less any necessary allocation padding... */
    ptr_range_size = kmem_allocation_to_claim_size(ptr_range_size);
    sprayqtn_range_size = kmem_allocation_to_claim_size(sprayqtn_range_size);

    /*
     * Add the pointer and metadata claims.
     * Note: this call modifies ptr_range_size and may, depending on the padding
     * requirements, slightly increase or decrease the overall allocation size
     * of the pointer+metadata region.
     */
    kmem_add_ptr_claims();

    sprayqtn_allocation_size = kmem_claim_to_allocation_size(
        sprayqtn_range_size, /* known_last */ false);
    ptr_total_allocation_size =
        (kmem_claim_to_allocation_size(ptr_range_size, /* known_last */ false) +
        kmem_claim_to_allocation_size(kmem_meta_size, /* known_last */ false)) *
        kmem_ptr_ranges;

    /*
     * Check: spray and ptr_range are minimally valid.
     * This is a useful assert as it should catch us if we were to end up with a
     * "negative" (or extremely large) data_range_size.
     */
    assert(sprayqtn_allocation_size + ptr_total_allocation_size < largest_free_size);

    /*
     * Finally, give any remaining allocable space to the data region.
     */
    data_range_size = largest_free_size - sprayqtn_allocation_size -
        ptr_total_allocation_size;

    /*
     * If we need the data shared range, divide the size
     * for the data ranges between BUFFERS and SHARED.
     *
     * If not, all data allocations go into KMEM_RANGE_ID_DATA.
     */
    if (kmem_needs_data_share_range()) {
        /*
         * Round down the size, because our kmem ranges logic rounds
         * these sizes to page size, and we need to make sure we never
         * exceed the remaining allocable space we divided.
         */
        shared_data_range_size = data_range_size =
            trunc_page(data_range_size / 2);
    } else {
        shared_data_range_size = 0;
    }

    /* Less any necessary allocation padding... */
    data_range_size = kmem_allocation_to_claim_size(data_range_size);
    shared_data_range_size = shared_data_range_size ?
        kmem_allocation_to_claim_size(shared_data_range_size) : 0;

    /* Check: our allocations should all still fit in the free space */
    assert(sprayqtn_allocation_size + ptr_total_allocation_size +
        kmem_claim_to_allocation_size(data_range_size, /* known_last */ false) +
        kmem_claim_to_allocation_size(shared_data_range_size, /* known_last */ false) <=
        largest_free_size);

    struct kmem_range_startup_spec kmem_spec_sprayqtn = {
        .kc_name = "kmem_sprayqtn_range",
        .kc_range = &kmem_ranges[KMEM_RANGE_ID_SPRAYQTN],
        .kc_size = sprayqtn_range_size,
        .kc_flags = KC_NO_ENTRY,
    };
    kmem_claims[kmem_claim_count++] = kmem_spec_sprayqtn;

    struct kmem_range_startup_spec kmem_spec_data_buffers = {
        .kc_name = "kmem_data_buffers_range",
        .kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA],
        .kc_size = data_range_size,
        .kc_flags = KC_NO_ENTRY,
    };
    kmem_claims[kmem_claim_count++] = kmem_spec_data_buffers;

    if (kmem_needs_data_share_range()) {
        struct kmem_range_startup_spec kmem_spec_data_shared = {
            .kc_name = "kmem_data_shared_range",
            .kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA_SHARED],
            .kc_size = shared_data_range_size,
            .kc_flags = KC_NO_ENTRY,
        };
        kmem_claims[kmem_claim_count++] = kmem_spec_data_shared;
    }
}

__startup_func
static void
kmem_scramble_ranges(void)
{
    vm_map_offset_t va_alloc_head = 0;

    /*
     * Initialize KMEM_RANGE_ID_NONE range to use the entire map so that
     * the vm can find the requested ranges.
     */
    kmem_ranges[KMEM_RANGE_ID_NONE].min_address = MAX(kernel_map->min_offset,
        VM_MAP_PAGE_SIZE(kernel_map));
    kmem_ranges[KMEM_RANGE_ID_NONE].max_address = kernel_map->max_offset;

    /*
     * Allocate the g_kext_map prior to randomizing the remaining submaps,
     * as this map is 2G in size and starts at the end of kernel_text on
     * x86; it could otherwise overflow into the heap.
     */
    kext_alloc_init();

    /*
     * Eat a random amount of kernel_map to fuzz subsequent heap, zone and
     * stack addresses. (With a 4K page and 9 bits of randomness, this
     * eats about 2M of VA from the map)
     *
     * Note that we always need to slide by at least one page because the VM
     * pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a base
     * do not admit this address to be part of any zone submap.
     */
    va_alloc_head = kmem_fuzz_start();

    /*
     * Add claims for ptr and data kmem_ranges
     */
    kmem_add_extra_claims();

    /*
     * Minimally verify that our placer will be able to resolve the constraints
     * of all claims
     */
    bool has_min_address = false;
    for (uint32_t i = 0; i < kmem_claim_count; i++) {
        struct kmem_range_startup_spec sp_i = kmem_claims[i];

        /* Verify that we have only one claim with a min address constraint */
        if (sp_i.kc_range->min_address) {
            if (has_min_address) {
                panic("Cannot place with multiple min_address constraints");
            } else {
                has_min_address = true;
            }
        }

        if (sp_i.kc_range->max_address) {
            panic("Cannot place with a max_address constraint");
        }
    }

    /*
     * Shuffle registered claims
     */
    assert(kmem_claim_count < UINT16_MAX);
    kmem_shuffle_claims();

    /*
     * Apply restrictions and determine range for each claim
     */
    for (uint32_t i = 0; i < kmem_claim_count; i++) {
        struct kmem_range_startup_spec sp = kmem_claims[i];
        struct mach_vm_range *sp_range = sp.kc_range;

        /*
         * Find space using the allocation size (rather than the claim size) in
         * order to ensure we provide any applicable padding.
         */
        bool is_last = (i == kmem_claim_count - 1);
        vm_map_offset_t sp_allocation_size =
            kmem_claim_to_allocation_size(sp.kc_size, is_last);

        if (vm_map_locate_space_anywhere(kernel_map, sp_allocation_size, 0,
            VM_MAP_KERNEL_FLAGS_ANYWHERE(.vmkf_no_soft_limit = true),
            &va_alloc_head, NULL) != KERN_SUCCESS) {
            panic("kmem_range_init: vm_map_locate_space failing for claim %s, "
                "size 0x%llx",
                sp.kc_name, sp_allocation_size);
        }

        /*
         * Re-adjust ranges if restriction not met
         */
        if (sp_range->min_address && va_alloc_head > sp_range->min_address) {
            kmem_readjust_ranges(i);
        } else {
            /*
             * Though the actual allocated space may be larger, provide only the
             * size requested by the original claim.
             */
            sp_range->min_address = va_alloc_head;
            sp_range->max_address = va_alloc_head + sp.kc_size;
        }

        va_alloc_head += sp_allocation_size;
    }

    /*
     * We have settled on the ranges, now create temporary entries for the
     * claims
     */
    for (uint32_t i = 0; i < kmem_claim_count; i++) {
        struct kmem_range_startup_spec sp = kmem_claims[i];
        bool is_last = (i == kmem_claim_count - 1);
        vm_map_offset_t sp_allocation_size =
            kmem_claim_to_allocation_size(sp.kc_size, is_last);
        vm_map_entry_t entry = NULL;
        if (sp.kc_flags & KC_NO_ENTRY) {
            continue;
        }

        /*
         * We reserve the full allocation size (rather than the claim size) so
         * that nothing ends up placed in the padding space (if applicable).
         */
        if (vm_map_find_space(kernel_map, sp.kc_range->min_address,
            sp_allocation_size, 0,
            VM_MAP_KERNEL_FLAGS_ANYWHERE(.vmkf_no_soft_limit = true),
            &entry) != KERN_SUCCESS) {
            panic("kmem_range_init: vm_map_find_space failing for claim %s",
                sp.kc_name);
        }
        vm_object_reference(kernel_object_default);
        VME_OBJECT_SET(entry, kernel_object_default, false, 0);
        VME_OFFSET_SET(entry, entry->vme_start);
        vm_map_unlock(kernel_map);
    }

    /*
     * Now that we are done assigning all the ranges, reset
     * kmem_ranges[KMEM_RANGE_ID_NONE]
     */
    kmem_ranges[KMEM_RANGE_ID_NONE] = (struct mach_vm_range) {};

#if DEBUG || DEVELOPMENT
    for (uint32_t i = 0; i < kmem_claim_count; i++) {
        struct kmem_range_startup_spec sp = kmem_claims[i];

        printf("%-24s: %p - %p (%u%c)\n", sp.kc_name,
            (void *)sp.kc_range->min_address,
            (void *)sp.kc_range->max_address,
            mach_vm_size_pretty(sp.kc_size),
            mach_vm_size_unit(sp.kc_size));
    }
#endif /* DEBUG || DEVELOPMENT */

#if MACH_ASSERT
    /*
     * Since many parts of the claim infrastructure are marked as startup data
     * (and are thus unavailable post-lockdown), save off information our tests
     * need now.
     */
    for (uint32_t i = 0; i < kmem_claim_count; i++) {
        kmem_test_saved_ranges[i] = *(kmem_claims[i].kc_range);
    }
#endif /* MACH_ASSERT */
}

__startup_func
static void
kmem_range_init(void)
{
    vm_size_t range_adjustment;

    kmem_scramble_ranges();

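    /*
     * Carve the "large" sub-ranges out of the top 7/8 of each range
     * (range_adjustment is one eighth of the range size), presumably so
     * that large allocations cannot crowd out the bottom of the range.
     */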
    range_adjustment = sprayqtn_range_size >> 3;
    kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address =
        kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address + range_adjustment;
    kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address =
        kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address;

    range_adjustment = data_range_size >> 3;
    kmem_large_ranges[KMEM_RANGE_ID_DATA].min_address =
        kmem_ranges[KMEM_RANGE_ID_DATA].min_address + range_adjustment;
    kmem_large_ranges[KMEM_RANGE_ID_DATA].max_address =
        kmem_ranges[KMEM_RANGE_ID_DATA].max_address;

    if (kmem_needs_data_share_range()) {
        range_adjustment = shared_data_range_size >> 3;
        kmem_large_ranges[KMEM_RANGE_ID_DATA_SHARED].min_address =
            kmem_ranges[KMEM_RANGE_ID_DATA_SHARED].min_address + range_adjustment;
        kmem_large_ranges[KMEM_RANGE_ID_DATA_SHARED].max_address =
            kmem_ranges[KMEM_RANGE_ID_DATA_SHARED].max_address;
    }

    pmap_init();
    kmem_metadata_init();
    kmem_sizeclass_init();

#if DEBUG || DEVELOPMENT
    for (kmem_range_id_t i = 1; i < KMEM_RANGE_COUNT; i++) {
        vm_size_t range_size = mach_vm_range_size(&kmem_large_ranges[i]);
        printf("kmem_large_ranges[%d] : %p - %p (%u%c)\n", i,
            (void *)kmem_large_ranges[i].min_address,
            (void *)kmem_large_ranges[i].max_address,
            mach_vm_size_pretty(range_size),
            mach_vm_size_unit(range_size));
    }
#endif
}
#ifndef __BUILDING_XNU_LIB_UNITTEST__ /* kernel map is not maintained in unit-test */
STARTUP(KMEM, STARTUP_RANK_THIRD, kmem_range_init);
#endif /* __BUILDING_XNU_LIB_UNITTEST__ */

#if DEBUG || DEVELOPMENT
__startup_func
static void
kmem_log_init(void)
{
    /*
     * Log can only be created after the kmem subsystem is initialized, as
     * btlog creation uses kmem
     */
    kmem_outlier_log = btlog_create(BTLOG_LOG, KMEM_OUTLIER_LOG_SIZE, 0);
}
STARTUP(ZALLOC, STARTUP_RANK_FIRST, kmem_log_init);

kmem_gobj_stats
kmem_get_gobj_stats(void)
{
    vmlp_api_start(KMEM_GET_GOBJ_STATS);
    kmem_gobj_stats stats = {};

    vm_map_lock(kernel_map);
    for (uint8_t i = 0; i < kmem_ptr_ranges; i++) {
        kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST + i;
        struct mach_vm_range range = kmem_ranges[range_id];
        struct kmem_page_meta *meta = kmem_meta_hwm[kmem_get_front(range_id, 0)];
        struct kmem_page_meta *meta_end;
        uint64_t meta_idx = meta - kmem_meta_base[range_id];
        vm_map_size_t used = 0, va = 0, meta_sz = 0, pte_sz = 0;
        vm_map_offset_t addr;
        vm_map_entry_t entry;

        /*
         * Left front
         */
        va = (meta_idx * KMEM_CHUNK_SIZE_MIN);
        meta_sz = round_page(meta_idx * sizeof(struct kmem_page_meta));

        /*
         * Right front
         */
        meta = kmem_meta_hwm[kmem_get_front(range_id, 1)];
        meta_end = kmem_addr_to_meta(range.max_address, range_id, &addr,
            &meta_idx);
        meta_idx = meta_end - meta;
        meta_sz += round_page(meta_idx * sizeof(struct kmem_page_meta));
        va += (meta_idx * KMEM_CHUNK_SIZE_MIN);

        /*
         * Compute VA allocated in entire range
         */
        if (vm_map_lookup_entry(kernel_map, range.min_address, &entry) == false) {
            entry = entry->vme_next;
        }

        vmlp_range_event_entry(kernel_map, entry);

        while (entry != vm_map_to_entry(kernel_map) &&
            entry->vme_start < range.max_address) {
            used += (entry->vme_end - entry->vme_start);
            entry = entry->vme_next;
        }

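        /* Estimate page-table overhead for the unallocated VA, assuming 8-byte PTEs */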
        pte_sz = round_page(atop(va - used) * 8);

        stats.total_used += used;
        stats.total_va += va;
        stats.pte_sz += pte_sz;
        stats.meta_sz += meta_sz;
    }
    vm_map_unlock(kernel_map);

    vmlp_api_end(KMEM_GET_GOBJ_STATS, 0);
    return stats;
}

#endif /* DEBUG || DEVELOPMENT */

/*
 * kmem_init:
 *
 * Initialize the kernel's virtual memory map, taking
 * into account all memory allocated up to this time.
 */
__startup_func
void
kmem_init(
    vm_offset_t start,
    vm_offset_t end)
{
    vm_map_offset_t map_start;
    vm_map_offset_t map_end;

    map_start = vm_map_trunc_page(start,
        VM_MAP_PAGE_MASK(kernel_map));
    map_end = vm_map_round_page(end,
        VM_MAP_PAGE_MASK(kernel_map));

    vm_map_will_allocate_early_map(&kernel_map);
#if defined(__arm64__)
    kernel_map = vm_map_create_options(pmap_kernel(),
        VM_MIN_KERNEL_AND_KEXT_ADDRESS,
        VM_MAX_KERNEL_ADDRESS,
        VM_MAP_CREATE_DEFAULT);
    /*
     * Reserve virtual memory allocated up to this time.
     */
    {
        unsigned int region_select = 0;
        vm_map_offset_t region_start;
        vm_map_size_t region_size;
        vm_map_offset_t map_addr;
        kern_return_t kr;

        while (pmap_virtual_region(region_select, &region_start, &region_size)) {
            map_addr = region_start;
            kr = vm_map_enter(kernel_map, &map_addr,
                vm_map_round_page(region_size,
                VM_MAP_PAGE_MASK(kernel_map)),
                (vm_map_offset_t) 0,
                VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(
                    .vmkf_no_pmap_check = true,
                    .vmkf_no_soft_limit = true),
                VM_OBJECT_NULL,
                (vm_object_offset_t) 0, FALSE, VM_PROT_NONE, VM_PROT_NONE,
                VM_INHERIT_DEFAULT);

            if (kr != KERN_SUCCESS) {
                panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
                    (uint64_t) start, (uint64_t) end, (uint64_t) region_start,
                    (uint64_t) region_size, kr);
            }

            region_select++;
        }
    }
#else
    kernel_map = vm_map_create_options(pmap_kernel(),
        VM_MIN_KERNEL_AND_KEXT_ADDRESS, map_end,
        VM_MAP_CREATE_DEFAULT);
    /*
     * Reserve virtual memory allocated up to this time.
     */
    if (start != VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
        vm_map_offset_t map_addr;
        kern_return_t kr;

        map_addr = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
        kr = vm_map_enter(kernel_map,
            &map_addr,
            (vm_map_size_t)(map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
            (vm_map_offset_t) 0,
            VM_MAP_KERNEL_FLAGS_FIXED(.vmkf_no_pmap_check = true),
            VM_OBJECT_NULL,
            (vm_object_offset_t) 0, FALSE,
            VM_PROT_NONE, VM_PROT_NONE,
            VM_INHERIT_DEFAULT);

        if (kr != KERN_SUCCESS) {
            panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
                (uint64_t) start, (uint64_t) end,
                (uint64_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS,
                (uint64_t) (map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
                kr);
        }
    }
#endif

    kmem_set_user_wire_limits();
}

#pragma mark map copyio

/*
 * Note: semantic types aren't used as `copyio` already validates.
 */
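/*
 * Each helper below distinguishes three cases: a kernel-pmap target
 * (plain memcpy, address assumed valid), the current map (direct
 * copyin/copyout), and a foreign map (take a reference, switch address
 * spaces with a security override, copy, then switch back).
 */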

kern_return_t
copyinmap(
    vm_map_t map,
    vm_map_offset_t fromaddr,
    void *todata,
    vm_size_t length)
{
    kern_return_t kr = KERN_SUCCESS;
    vm_map_switch_context_t switch_ctx;

    if (vm_map_pmap(map) == pmap_kernel()) {
        /* assume a correct copy */
        memcpy(todata, CAST_DOWN(void *, fromaddr), length);
    } else if (current_map() == map) {
        if (copyin(fromaddr, todata, length) != 0) {
            kr = KERN_INVALID_ADDRESS;
        }
    } else {
        vm_map_reference(map);
        switch_ctx = vm_map_switch_with_sec_override(map, TRUE);
        if (copyin(fromaddr, todata, length) != 0) {
            kr = KERN_INVALID_ADDRESS;
        }
        vm_map_switch_back(switch_ctx);
        vm_map_deallocate(map);
    }
    return kr;
}

kern_return_t
copyoutmap(
    vm_map_t map,
    void *fromdata,
    vm_map_address_t toaddr,
    vm_size_t length)
{
    kern_return_t kr = KERN_SUCCESS;
    vm_map_switch_context_t switch_ctx;

    if (vm_map_pmap(map) == pmap_kernel()) {
        /* assume a correct copy */
        memcpy(CAST_DOWN(void *, toaddr), fromdata, length);
    } else if (current_map() == map) {
        if (copyout(fromdata, toaddr, length) != 0) {
            ktriage_record(thread_tid(current_thread()),
                KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
                KDBG_TRIAGE_RESERVED,
                KDBG_TRIAGE_VM_COPYOUTMAP_SAMEMAP_ERROR),
                KERN_INVALID_ADDRESS /* arg */);
            kr = KERN_INVALID_ADDRESS;
        }
    } else {
        vm_map_reference(map);
        switch_ctx = vm_map_switch_with_sec_override(map, TRUE);
        if (copyout(fromdata, toaddr, length) != 0) {
            ktriage_record(thread_tid(current_thread()),
                KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
                KDBG_TRIAGE_RESERVED,
                KDBG_TRIAGE_VM_COPYOUTMAP_DIFFERENTMAP_ERROR),
                KERN_INVALID_ADDRESS /* arg */);
            kr = KERN_INVALID_ADDRESS;
        }
        vm_map_switch_back(switch_ctx);
        vm_map_deallocate(map);
    }
    return kr;
}

kern_return_t
copyoutmap_atomic32(
    vm_map_t map,
    uint32_t value,
    vm_map_address_t toaddr)
{
    kern_return_t kr = KERN_SUCCESS;
    vm_map_switch_context_t switch_ctx;

    if (vm_map_pmap(map) == pmap_kernel()) {
        /* assume a correct toaddr */
        *(uint32_t *)toaddr = value;
    } else if (current_map() == map) {
        if (copyout_atomic32(value, toaddr) != 0) {
            kr = KERN_INVALID_ADDRESS;
        }
    } else {
        vm_map_reference(map);
        switch_ctx = vm_map_switch_with_sec_override(map, TRUE);
        if (copyout_atomic32(value, toaddr) != 0) {
            kr = KERN_INVALID_ADDRESS;
        }
        vm_map_switch_back(switch_ctx);
        vm_map_deallocate(map);
    }
    return kr;
}

kern_return_t
copyoutmap_atomic64(
    vm_map_t map,
    uint64_t value,
    vm_map_address_t toaddr)
{
    kern_return_t kr = KERN_SUCCESS;
    vm_map_switch_context_t switch_ctx;

    if (vm_map_pmap(map) == pmap_kernel()) {
        /* assume a correct toaddr */
        *(uint64_t *)toaddr = value;
    } else if (current_map() == map) {
        if (copyout_atomic64(value, toaddr) != 0) {
            kr = KERN_INVALID_ADDRESS;
        }
    } else {
        vm_map_reference(map);
        switch_ctx = vm_map_switch_with_sec_override(map, TRUE);
        if (copyout_atomic64(value, toaddr) != 0) {
            kr = KERN_INVALID_ADDRESS;
        }
        vm_map_switch_back(switch_ctx);
        vm_map_deallocate(map);
    }
    return kr;
}

#pragma mark pointer obfuscation / packing

/*
 * The following two functions are to be used when exposing kernel
 * addresses to userspace via any of the various debug or info
 * facilities that exist. These are basically the same as VM_KERNEL_ADDRPERM()
 * and VM_KERNEL_UNSLIDE_OR_PERM() except they use a different random seed and
 * are exported to KEXTs.
 *
 * NOTE: USE THE MACRO VERSIONS OF THESE FUNCTIONS (in vm_param.h) FROM WITHIN THE KERNEL
 */
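/*
 * Sketch of external (kext) usage, which returns the unslid address for
 * slid kernel-text pointers and a salted SHA-256 digest otherwise:
 *
 *     vm_offset_t hashed = vm_kernel_addrhash_external(addr);
 */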

vm_offset_t
vm_kernel_addrhash_internal(vm_offset_t addr, uint64_t salt)
{
    assert(salt != 0);

    if (addr == 0) {
        return 0ul;
    }

    if (VM_KERNEL_IS_SLID(addr)) {
        return VM_KERNEL_UNSLIDE(addr);
    }

#if HAS_MTE
    /*
     * Remove traces of MTE tags or PAC signatures, to prevent observers from seeing
     * identical repeated values.
     */
#endif /* HAS_MTE */
    addr = VM_KERNEL_STRIP_PTR(addr);

    vm_offset_t sha_digest[SHA256_DIGEST_LENGTH / sizeof(vm_offset_t)];
    SHA256_CTX sha_ctx;

    SHA256_Init(&sha_ctx);
    SHA256_Update(&sha_ctx, &salt, sizeof(salt));
    SHA256_Update(&sha_ctx, &addr, sizeof(addr));
    SHA256_Final(sha_digest, &sha_ctx);

    return sha_digest[0];
}

__exported vm_offset_t
vm_kernel_addrhash_external(vm_offset_t addr);
vm_offset_t
vm_kernel_addrhash_external(vm_offset_t addr)
{
    return vm_kernel_addrhash_internal(addr, vm_kernel_addrhash_salt_ext);
}

void
vm_kernel_addrhide(
    vm_offset_t addr,
    vm_offset_t *hide_addr)
{
    *hide_addr = VM_KERNEL_ADDRHIDE(addr);
}

void
vm_kernel_addrperm_external(
    vm_offset_t addr,
    vm_offset_t *perm_addr)
{
    addr = VM_KERNEL_STRIP_UPTR(addr);

    if (VM_KERNEL_IS_SLID(addr)) {
        *perm_addr = VM_KERNEL_UNSLIDE(addr);
    } else if (VM_KERNEL_ADDRESS(addr)) {
        *perm_addr = ML_ADDRPERM(addr, vm_kernel_addrperm_ext);
    } else {
        *perm_addr = addr;
    }
}

void
vm_kernel_unslide_or_perm_external(
    vm_offset_t addr,
    vm_offset_t *up_addr)
{
    vm_kernel_addrperm_external(addr, up_addr);
}

void
vm_packing_pointer_invalid(vm_offset_t ptr, vm_packing_params_t params)
{
    if (ptr & ((1ul << params.vmpp_shift) - 1)) {
        panic("pointer %p can't be packed: low %d bits aren't 0",
            (void *)ptr, params.vmpp_shift);
    } else if (ptr <= params.vmpp_base) {
        panic("pointer %p can't be packed: below base %p",
            (void *)ptr, (void *)params.vmpp_base);
    } else {
        panic("pointer %p can't be packed: maximum encodable pointer is %p",
            (void *)ptr, (void *)vm_packing_max_packable(params));
    }
}

void
vm_packing_verify_range(
    const char *subsystem,
    vm_offset_t min_address,
    vm_offset_t max_address,
    vm_packing_params_t params)
{
    if (min_address > max_address) {
        panic("%s: %s range invalid min:%p > max:%p",
            __func__, subsystem, (void *)min_address, (void *)max_address);
    }

    if (!params.vmpp_base_relative) {
        return;
    }

    if (min_address <= params.vmpp_base) {
        panic("%s: %s range invalid min:%p <= base:%p",
            __func__, subsystem, (void *)min_address, (void *)params.vmpp_base);
    }

    if (max_address > vm_packing_max_packable(params)) {
        panic("%s: %s range invalid max:%p >= max packable:%p",
            __func__, subsystem, (void *)max_address,
            (void *)vm_packing_max_packable(params));
    }
}

#pragma mark tests
#if MACH_ASSERT
#include <sys/errno.h>

static void
kmem_test_for_entry(
    vm_map_t map,
    vm_offset_t addr,
    void (^block)(vm_map_entry_t))
{
    vm_map_entry_t entry;

    vm_map_lock(map);
    block(vm_map_lookup_entry(map, addr, &entry) ? entry : NULL);
    vm_map_unlock(map);
}

#define kmem_test_assert_map(map, pg, entries) ({ \
    assert3u((map)->size, ==, ptoa(pg)); \
    assert3u((map)->hdr.nentries, ==, entries); \
})

static bool
can_write_at(vm_offset_t offs, uint32_t page)
{
    static const int zero;

    return verify_write(&zero, (void *)(offs + ptoa(page) + 128), 1) == 0;
}
#define assert_writeable(offs, page) \
    assertf(can_write_at(offs, page), \
        "can write at %p + ptoa(%d)", (void *)offs, page)

#define assert_faults(offs, page) \
    assertf(!can_write_at(offs, page), \
        "can write at %p + ptoa(%d)", (void *)offs, page)

#define peek(offs, page) \
    (*(uint32_t *)((offs) + ptoa(page)))

#define poke(offs, page, v) \
    (*(uint32_t *)((offs) + ptoa(page)) = (v))

#if CONFIG_SPTM
__attribute__((noinline))
static void
kmem_test_verify_type_policy(vm_offset_t addr, kmem_flags_t flags)
{
    extern bool use_xnu_restricted;
    pmap_mapping_type_t expected_type = PMAP_MAPPING_TYPE_RESTRICTED;

    /* Explicitly state the expected policy */
    if (flags & (KMEM_COMPRESSOR | KMEM_DATA_SHARED)) {
        expected_type = PMAP_MAPPING_TYPE_DEFAULT;
    } else if ((flags & KMEM_DATA) &&
        !kalloc_is_restricted_data_mode_enforced()) {
        expected_type = PMAP_MAPPING_TYPE_DEFAULT;
    }

    /* If X_K_R is disabled, DEFAULT is the only possible mapping */
    if (!use_xnu_restricted) {
        expected_type = PMAP_MAPPING_TYPE_DEFAULT;
    }

    /* Verify if derived correctly */
    assert3u(expected_type, ==, __kmem_mapping_type(flags));

    pmap_paddr_t pa = kvtophys(addr);
    if (pa == 0) {
        return;
    }

    /* Verify if the mapped address actually got the expected type */
    assert3u(expected_type, ==, sptm_get_frame_type(pa));
}
#endif /* CONFIG_SPTM */

__attribute__((noinline))
static void
kmem_alloc_basic_test(vm_map_t map)
{
    kmem_guard_t guard = {
        .kmg_tag = VM_KERN_MEMORY_DIAG,
    };
    vm_offset_t addr;

    /*
     * Test wired basics:
     * - KMA_KOBJECT
     * - KMA_GUARD_FIRST, KMA_GUARD_LAST
     * - allocation alignment
     */
    addr = kmem_alloc_guard(map, ptoa(10), ptoa(2) - 1,
        KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard).kmr_address;
    assertf(addr != 0ull, "kma(%p, 10p, 0, KO | GF | GL)", map);
    assert3u((addr + PAGE_SIZE) % ptoa(2), ==, 0);
    kmem_test_assert_map(map, 10, 1);

    kmem_test_for_entry(map, addr, ^(__assert_only vm_map_entry_t e){
        assertf(e, "unable to find address %p in map %p", (void *)addr, map);
        assert(e->vme_kernel_object);
        assert(!e->vme_atomic);
        assert3u(e->vme_start, <=, addr);
        assert3u(addr + ptoa(10), <=, e->vme_end);
    });

    assert_faults(addr, 0);
    for (int i = 1; i < 9; i++) {
        assert_writeable(addr, i);
    }
    assert_faults(addr, 9);

    kmem_free(map, addr, ptoa(10));
    kmem_test_assert_map(map, 0, 0);

    /*
     * Test pageable basics.
     */
    addr = kmem_alloc_guard(map, ptoa(10), 0,
        KMA_PAGEABLE, guard).kmr_address;
    assertf(addr != 0ull, "kma(%p, 10p, 0, KO | PG)", map);
    kmem_test_assert_map(map, 10, 1);

    for (int i = 0; i < 9; i++) {
        assert_faults(addr, i);
        poke(addr, i, 42);
        assert_writeable(addr, i);
    }

    kmem_free_guard(map, addr, ptoa(10),
        KMF_GUARD_FIRST | KMF_GUARD_LAST, guard);
    kmem_test_assert_map(map, 0, 0);
}

__attribute__((noinline))
static void
kmem_realloc_basic_test(vm_map_t map, kmr_flags_t kind)
{
    kmem_guard_t guard = {
        .kmg_atomic = !(kind & (KMR_DATA | KMR_DATA_SHARED)),
        .kmg_tag = VM_KERN_MEMORY_DIAG,
        .kmg_context = 0xefface,
    };
    vm_offset_t addr, newaddr;
    const int N = 10;

    /*
     * This isn't something kmem_realloc_guard() _needs_ to do,
     * we could conceive an implementation where it grows in place
     * if there's space after it.
     *
     * However, this is what the implementation does today.
     */
    bool realloc_growth_changes_address = true;
    bool GF = (kind & KMR_GUARD_FIRST);
    bool GL = (kind & KMR_GUARD_LAST);

    /*
     * Initial N page allocation
     */
    addr = kmem_alloc_guard(map, ptoa(N), 0,
        (kind & ~KMEM_FREEOLD) | KMA_ZERO, guard).kmr_address;
    assert3u(addr, !=, 0);

    kmem_test_assert_map(map, N, 1);
    for (int pg = GF; pg < N - GL; pg++) {
        poke(addr, pg, 42 + pg);
    }
    for (int pg = N - GL; pg < N; pg++) {
        assert_faults(addr, pg);
    }

#if CONFIG_SPTM
    kmem_test_verify_type_policy(addr, ANYF(kind));
#endif /* CONFIG_SPTM */
    /*
     * Grow to N + 3 pages
     */
    newaddr = kmem_realloc_guard(map, addr, ptoa(N), ptoa(N + 3),
        kind | KMR_ZERO, guard).kmr_address;
    assert3u(newaddr, !=, 0);
    if (realloc_growth_changes_address) {
        assert3u(addr, !=, newaddr);
    }
    if ((kind & KMR_FREEOLD) || (addr == newaddr)) {
        kmem_test_assert_map(map, N + 3, 1);
    } else {
        kmem_test_assert_map(map, 2 * N + 3, 2);
    }
    for (int pg = GF; pg < N - GL; pg++) {
        assert3u(peek(newaddr, pg), ==, 42 + pg);
    }
    if ((kind & KMR_FREEOLD) == 0) {
        for (int pg = GF; pg < N - GL; pg++) {
            assert3u(peek(addr, pg), ==, 42 + pg);
        }
        /* check for true sharing between the old and new mapping */
        poke(addr + 16, 0, 1234);
        assert3u(peek(newaddr + 16, 0), ==, 1234);
        kmem_free_guard(map, addr, ptoa(N),
            kind & (KMF_TAG | KMF_GUARD_FIRST | KMF_GUARD_LAST), guard);
        kmem_test_assert_map(map, N + 3, 1);
    }
    if (addr != newaddr) {
        for (int pg = GF; pg < N - GL; pg++) {
            assert_faults(addr, pg);
        }
    }
    for (int pg = N - GL; pg < N + 3 - GL; pg++) {
        assert3u(peek(newaddr, pg), ==, 0);
    }
    for (int pg = N + 3 - GL; pg < N + 3; pg++) {
        assert_faults(newaddr, pg);
    }
    addr = newaddr;

    /*
     * Shrink to N - 2 pages
     */
    newaddr = kmem_realloc_guard(map, addr, ptoa(N + 3), ptoa(N - 2),
        kind | KMR_ZERO, guard).kmr_address;
    assert3u(map->size, ==, ptoa(N - 2));
    assert3u(newaddr, ==, addr);
    kmem_test_assert_map(map, N - 2, 1);

    for (int pg = GF; pg < N - 2 - GL; pg++) {
        assert3u(peek(addr, pg), ==, 42 + pg);
    }
    for (int pg = N - 2 - GL; pg < N + 3; pg++) {
        assert_faults(addr, pg);
    }

    kmem_free_guard(map, addr, ptoa(N - 2),
        kind & (KMF_TAG | KMF_GUARD_FIRST | KMF_GUARD_LAST), guard);
    kmem_test_assert_map(map, 0, 0);
}

static int
kmem_basic_test(__unused int64_t in, int64_t *out)
{
    mach_vm_offset_t addr;
    vm_map_t map;

    printf("%s: test running\n", __func__);

    map = kmem_suballoc(kernel_map, &addr, 64U << 20,
        VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
        KMS_NOFAIL | KMS_DATA_SHARED, VM_KERN_MEMORY_DIAG).kmr_submap;

    printf("%s: kmem_alloc ...\n", __func__);
    kmem_alloc_basic_test(map);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_FREEOLD) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_FREEOLD);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_FIRST);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_LAST);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
    printf("%s: PASS\n", __func__);

#if HAS_MTE
    printf("%s: kmem_realloc (KMR_TAG | KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_TAG | KMR_KOBJECT | KMR_FREEOLD);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_TAG | KMR_FREEOLD) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_TAG | KMR_FREEOLD);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_TAG | KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_TAG | KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_TAG | KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_TAG | KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_TAG | KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_TAG | KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_TAG | KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_TAG | KMR_FREEOLD | KMR_GUARD_FIRST);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_TAG | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_TAG | KMR_FREEOLD | KMR_GUARD_LAST);
    printf("%s: PASS\n", __func__);

    printf("%s: kmem_realloc (KMR_TAG | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_TAG | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
    printf("%s: PASS\n", __func__);
#endif /* HAS_MTE */

    /* using KMR_DATA signals to test the non-atomic realloc path */
    printf("%s: kmem_realloc (KMR_DATA | KMR_FREEOLD) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_DATA | KMR_FREEOLD);
    printf("%s: PASS\n", __func__);

    /*
     * Using KMR_DATA without KMR_FREEOLD violates the
     * single-mappability of RESTRICTED pages.
     */

    /* test KMR_DATA_SHARED for the new shared kheap */
    printf("%s: kmem_realloc (KMR_DATA_SHARED | KMR_FREEOLD) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_DATA_SHARED | KMR_FREEOLD);
    printf("%s: PASS\n", __func__);

    /* test KMR_DATA_SHARED for the new shared kheap */
    printf("%s: kmem_realloc (KMR_DATA_SHARED) ...\n", __func__);
    kmem_realloc_basic_test(map, KMR_DATA_SHARED);
    printf("%s: PASS\n", __func__);

    kmem_free_guard(kernel_map, addr, 64U << 20, KMF_NONE, KMEM_GUARD_SUBMAP);
    vm_map_deallocate(map);

    printf("%s: test passed\n", __func__);
    *out = 1;
    return 0;
}
SYSCTL_TEST_REGISTER(kmem_basic, kmem_basic_test);

static void
kmem_test_get_size_idx_for_chunks(uint32_t chunks)
{
    __assert_only uint32_t idx = kmem_get_size_idx_for_chunks(chunks);

    assert(chunks >= kmem_size_array[idx].ks_num_chunk);
}

__attribute__((noinline))
static void
kmem_test_get_size_idx_for_all_chunks()
{
    for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
        uint32_t chunks = kmem_size_array[i].ks_num_chunk;

        if (chunks != 1) {
            kmem_test_get_size_idx_for_chunks(chunks - 1);
        }
        kmem_test_get_size_idx_for_chunks(chunks);
        kmem_test_get_size_idx_for_chunks(chunks + 1);
    }
}

static int
kmem_guard_obj_test(__unused int64_t in, int64_t *out)
{
    printf("%s: test running\n", __func__);

    printf("%s: kmem_get_size_idx_for_chunks\n", __func__);
    kmem_test_get_size_idx_for_all_chunks();
    printf("%s: PASS\n", __func__);

    printf("%s: test passed\n", __func__);
    *out = 1;
    return 0;
}
SYSCTL_TEST_REGISTER(kmem_guard_obj, kmem_guard_obj_test);

#endif /* MACH_ASSERT */