xref: /xnu-10063.121.3/osfmk/vm/vm_kern.c (revision 2c2f96dc2b9a4408a43d3150ae9c105355ca3daa)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_kern.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Kernel memory management.
64  */
65 
66 #include <mach/kern_return.h>
67 #include <mach/vm_param.h>
68 #include <kern/assert.h>
69 #include <kern/thread.h>
70 #include <vm/vm_kern.h>
71 #include <vm/vm_map_internal.h>
72 #include <vm/vm_object.h>
73 #include <vm/vm_page.h>
74 #include <vm/vm_compressor.h>
75 #include <vm/vm_pageout.h>
76 #include <vm/vm_init.h>
77 #include <vm/vm_fault.h>
78 #include <vm/vm_memtag.h>
79 #include <kern/misc_protos.h>
80 #include <vm/cpm.h>
81 #include <kern/ledger.h>
82 #include <kern/bits.h>
83 #include <kern/startup.h>
84 
85 #include <string.h>
86 
87 #include <libkern/OSDebug.h>
88 #include <libkern/crypto/sha2.h>
89 #include <libkern/section_keywords.h>
90 #include <sys/kdebug.h>
91 #include <sys/kdebug_triage.h>
92 
93 #include <san/kasan.h>
94 #include <kern/kext_alloc.h>
95 #include <kern/backtrace.h>
96 #include <os/hash.h>
97 #include <kern/zalloc_internal.h>
98 #include <libkern/crypto/rand.h>
99 
100 /*
101  *	Variables exported by this module.
102  */
103 
104 SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map;
105 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_ranges[KMEM_RANGE_COUNT];
106 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_large_ranges[KMEM_RANGE_COUNT];
107 
108 static TUNABLE(uint32_t, kmem_ptr_ranges, "kmem_ptr_ranges",
109     KMEM_RANGE_ID_NUM_PTR);
110 #define KMEM_GOBJ_THRESHOLD   (32ULL << 20)
111 #if DEBUG || DEVELOPMENT
112 #define KMEM_OUTLIER_LOG_SIZE (16ULL << 10)
113 #define KMEM_OUTLIER_SIZE      0
114 #define KMEM_OUTLIER_ALIGN     1
115 btlog_t kmem_outlier_log;
116 #endif /* DEBUG || DEVELOPMENT */
117 
118 __startup_data static vm_map_size_t data_range_size;
119 __startup_data static vm_map_size_t ptr_range_size;
120 __startup_data static vm_map_size_t sprayqtn_range_size;
121 
122 #pragma mark helpers
123 
124 __attribute__((overloadable))
125 __header_always_inline kmem_flags_t
126 ANYF(kma_flags_t flags)
127 {
128 	return (kmem_flags_t)flags;
129 }
130 
131 __attribute__((overloadable))
132 __header_always_inline kmem_flags_t
133 ANYF(kmr_flags_t flags)
134 {
135 	return (kmem_flags_t)flags;
136 }
137 
138 __attribute__((overloadable))
139 __header_always_inline kmem_flags_t
140 ANYF(kmf_flags_t flags)
141 {
142 	return (kmem_flags_t)flags;
143 }
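
/*
 * The three ANYF() overloads above let call sites normalize any of the
 * kma/kmr/kmf flag families into the common kmem_flags_t, so a helper
 * such as __kmem_guard_size(ANYF(flags)) works no matter which flavor
 * of flags the caller holds (see kmem_alloc_guard_internal() and
 * kmem_realloc_guard() below).
 */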
144 
145 __abortlike
146 static void
147 __kmem_invalid_size_panic(
148 	vm_map_t        map,
149 	vm_size_t       size,
150 	uint32_t        flags)
151 {
152 	panic("kmem(map=%p, flags=0x%x): invalid size %zd",
153 	    map, flags, (size_t)size);
154 }
155 
156 __abortlike
157 static void
158 __kmem_invalid_arguments_panic(
159 	const char     *what,
160 	vm_map_t        map,
161 	vm_address_t    address,
162 	vm_size_t       size,
163 	uint32_t        flags)
164 {
165 	panic("kmem_%s(map=%p, addr=%p, size=%zd, flags=0x%x): "
166 	    "invalid arguments passed",
167 	    what, map, (void *)address, (size_t)size, flags);
168 }
169 
170 __abortlike
171 static void
172 __kmem_failed_panic(
173 	vm_map_t        map,
174 	vm_size_t       size,
175 	uint32_t        flags,
176 	kern_return_t   kr,
177 	const char     *what)
178 {
179 	panic("kmem_%s(%p, %zd, 0x%x): failed with %d",
180 	    what, map, (size_t)size, flags, kr);
181 }
182 
183 __abortlike
184 static void
185 __kmem_entry_not_found_panic(
186 	vm_map_t        map,
187 	vm_offset_t     addr)
188 {
189 	panic("kmem(map=%p) no entry found at %p", map, (void *)addr);
190 }
191 
192 static inline vm_object_t
193 __kmem_object(kmem_flags_t flags)
194 {
195 	if (flags & KMEM_COMPRESSOR) {
196 		if (flags & KMEM_KOBJECT) {
197 			panic("both KMEM_KOBJECT and KMEM_COMPRESSOR specified");
198 		}
199 		return compressor_object;
200 	}
201 	if (!(flags & KMEM_KOBJECT)) {
202 		panic("KMEM_KOBJECT or KMEM_COMPRESSOR is required");
203 	}
204 	return kernel_object_default;
205 }
206 
207 static inline pmap_mapping_type_t
208 __kmem_mapping_type(kmem_flags_t flags)
209 {
210 	if (flags & (KMEM_DATA | KMEM_COMPRESSOR)) {
211 		return PMAP_MAPPING_TYPE_DEFAULT;
212 	} else {
213 		return PMAP_MAPPING_TYPE_RESTRICTED;
214 	}
215 }
216 
217 static inline vm_size_t
218 __kmem_guard_left(kmem_flags_t flags)
219 {
220 	return (flags & KMEM_GUARD_FIRST) ? PAGE_SIZE : 0;
221 }
222 
223 static inline vm_size_t
224 __kmem_guard_right(kmem_flags_t flags)
225 {
226 	return (flags & KMEM_GUARD_LAST) ? PAGE_SIZE : 0;
227 }
228 
229 static inline vm_size_t
230 __kmem_guard_size(kmem_flags_t flags)
231 {
232 	return __kmem_guard_left(flags) + __kmem_guard_right(flags);
233 }
234 
235 __pure2
236 static inline vm_size_t
237 __kmem_entry_orig_size(vm_map_entry_t entry)
238 {
239 	vm_object_t object = VME_OBJECT(entry);
240 
241 	if (entry->vme_kernel_object) {
242 		return entry->vme_end - entry->vme_start -
243 		       entry->vme_object_or_delta;
244 	} else {
245 		return object->vo_size - object->vo_size_delta;
246 	}
247 }
248 
249 
250 #pragma mark kmem range methods
251 
252 #if __arm64__
253 // <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
254 #define mach_vm_range_load(r, r_min, r_max) \
255 	asm("ldp %[rmin], %[rmax], [%[range]]" \
256 	    : [rmin] "=r"(r_min), [rmax] "=r"(r_max) \
257 	    : [range] "r"(r), "m"((r)->min_address), "m"((r)->max_address))
258 #else
259 #define mach_vm_range_load(r, rmin, rmax) \
260 	({ rmin = (r)->min_address; rmax = (r)->max_address; })
261 #endif
262 
263 __abortlike
264 static void
265 __mach_vm_range_overflow(
266 	mach_vm_offset_t        addr,
267 	mach_vm_offset_t        size)
268 {
269 	panic("invalid vm range: [0x%llx, 0x%llx + 0x%llx) wraps around",
270 	    addr, addr, size);
271 }
272 
273 __abortlike
274 static void
275 __mach_vm_range_invalid(
276 	mach_vm_offset_t        min_address,
277 	mach_vm_offset_t        max_address)
278 {
279 	panic("invalid vm range: [0x%llx, 0x%llx) wraps around",
280 	    min_address, max_address);
281 }
282 
283 __header_always_inline mach_vm_size_t
284 mach_vm_range_size(const struct mach_vm_range *r)
285 {
286 	mach_vm_offset_t rmin, rmax;
287 
288 	mach_vm_range_load(r, rmin, rmax);
289 	return rmax - rmin;
290 }
291 
292 __attribute__((overloadable))
293 __header_always_inline bool
294 mach_vm_range_contains(const struct mach_vm_range *r, mach_vm_offset_t addr)
295 {
296 	mach_vm_offset_t rmin, rmax;
297 
298 #if CONFIG_KERNEL_TAGGING
299 	if (VM_KERNEL_ADDRESS(addr)) {
300 		addr = vm_memtag_canonicalize_address(addr);
301 	}
302 #endif /* CONFIG_KERNEL_TAGGING */
303 
304 	/*
305 	 * The `&` is not a typo: we really expect the check to pass,
306 	 * so encourage the compiler to eagerly load and test without branches
307 	 */
308 	mach_vm_range_load(r, rmin, rmax);
309 	return (addr >= rmin) & (addr < rmax);
310 }
311 
312 __attribute__((overloadable))
313 __header_always_inline bool
314 mach_vm_range_contains(
315 	const struct mach_vm_range *r,
316 	mach_vm_offset_t        addr,
317 	mach_vm_offset_t        size)
318 {
319 	mach_vm_offset_t rmin, rmax;
320 
321 #if CONFIG_KERNEL_TAGGING
322 	if (VM_KERNEL_ADDRESS(addr)) {
323 		addr = vm_memtag_canonicalize_address(addr);
324 	}
325 #endif /* CONFIG_KERNEL_TAGGING */
326 
327 	/*
328 	 * The `&` is not a typo: we really expect the check to pass,
329 	 * so encourage the compiler to eagerly load and test without branches
330 	 */
331 	mach_vm_range_load(r, rmin, rmax);
332 	return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
333 }
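
/*
 * Worked intuition for the sized variant above: for a well-formed
 * allocation all three comparisons are true, and the single `&` chain
 * lets the compiler fold them into one branchless test. The middle
 * (addr + size >= rmin) term doubles as an overflow guard: kmem range
 * bounds sit high in the kernel address space, so an addr + size that
 * wraps past zero falls below rmin and is rejected.
 */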
334 
335 __attribute__((overloadable))
336 __header_always_inline bool
337 mach_vm_range_intersects(
338 	const struct mach_vm_range *r1,
339 	const struct mach_vm_range *r2)
340 {
341 	mach_vm_offset_t r1_min, r1_max;
342 	mach_vm_offset_t r2_min, r2_max;
343 
344 	mach_vm_range_load(r1, r1_min, r1_max);
345 	r2_min = r2->min_address;
346 	r2_max = r2->max_address;
347 
348 	if (r1_min > r1_max) {
349 		__mach_vm_range_invalid(r1_min, r1_max);
350 	}
351 
352 	if (r2_min > r2_max) {
353 		__mach_vm_range_invalid(r2_min, r2_max);
354 	}
355 
356 	return r1_max > r2_min && r1_min < r2_max;
357 }
358 
359 __attribute__((overloadable))
360 __header_always_inline bool
361 mach_vm_range_intersects(
362 	const struct mach_vm_range *r1,
363 	mach_vm_offset_t        addr,
364 	mach_vm_offset_t        size)
365 {
366 	struct mach_vm_range r2;
367 
368 	addr = VM_KERNEL_STRIP_UPTR(addr);
369 	r2.min_address = addr;
370 	if (os_add_overflow(addr, size, &r2.max_address)) {
371 		__mach_vm_range_overflow(addr, size);
372 	}
373 
374 	return mach_vm_range_intersects(r1, &r2);
375 }
376 
377 bool
378 kmem_range_id_contains(
379 	kmem_range_id_t         range_id,
380 	vm_map_offset_t         addr,
381 	vm_map_size_t           size)
382 {
383 	return mach_vm_range_contains(&kmem_ranges[range_id], addr, size);
384 }
385 
386 __abortlike
387 static void
388 kmem_range_invalid_panic(
389 	kmem_range_id_t         range_id,
390 	vm_map_offset_t         addr,
391 	vm_map_size_t           size)
392 {
393 	const struct mach_vm_range *r = &kmem_ranges[range_id];
394 	mach_vm_offset_t rmin, rmax;
395 
396 	mach_vm_range_load(r, rmin, rmax);
397 	if (addr + size < rmin) {
398 		panic("addr %p + size %llu overflows %p", (void *)addr, size,
399 		    (void *)(addr + size));
400 	}
401 	panic("addr %p + size %llu doesn't fit in one range (id: %u min: %p max: %p)",
402 	    (void *)addr, size, range_id, (void *)rmin, (void *)rmax);
403 }
404 
405 /*
406  * Return whether the entire allocation is contained in the given range
407  */
408 static bool
409 kmem_range_contains_fully(
410 	kmem_range_id_t         range_id,
411 	vm_map_offset_t         addr,
412 	vm_map_size_t           size)
413 {
414 	const struct mach_vm_range *r = &kmem_ranges[range_id];
415 	mach_vm_offset_t rmin, rmax;
416 	bool result = false;
417 
418 	if (VM_KERNEL_ADDRESS(addr)) {
419 		addr = vm_memtag_canonicalize_address(addr);
420 	}
421 
422 	/*
423 	 * The `&` is not a typo: we really expect the check to pass,
424 	 * so encourage the compiler to eagerly load and test without branches
425 	 */
426 	mach_vm_range_load(r, rmin, rmax);
427 	result = (addr >= rmin) & (addr < rmax);
428 	if (__improbable(result
429 	    && ((addr + size < rmin) || (addr + size > rmax)))) {
430 		kmem_range_invalid_panic(range_id, addr, size);
431 	}
432 	return result;
433 }
434 
435 vm_map_size_t
436 kmem_range_id_size(kmem_range_id_t range_id)
437 {
438 	return mach_vm_range_size(&kmem_ranges[range_id]);
439 }
440 
441 kmem_range_id_t
442 kmem_addr_get_range(vm_map_offset_t addr, vm_map_size_t size)
443 {
444 	kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
445 
446 	for (; range_id < KMEM_RANGE_COUNT; range_id++) {
447 		if (kmem_range_contains_fully(range_id, addr, size)) {
448 			return range_id;
449 		}
450 	}
451 	return KMEM_RANGE_ID_NONE;
452 }
453 
454 bool
455 kmem_is_ptr_range(vm_map_range_id_t range_id)
456 {
457 	return (range_id >= KMEM_RANGE_ID_FIRST) &&
458 	       (range_id <= KMEM_RANGE_ID_NUM_PTR);
459 }
460 
461 __abortlike
462 static void
463 kmem_range_invalid_for_overwrite(vm_map_offset_t addr)
464 {
465 	panic("Can't overwrite mappings (addr: %p) in kmem ptr ranges",
466 	    (void *)addr);
467 }
468 
469 mach_vm_range_t
470 kmem_validate_range_for_overwrite(
471 	vm_map_offset_t         addr,
472 	vm_map_size_t           size)
473 {
474 	vm_map_range_id_t range_id = kmem_addr_get_range(addr, size);
475 
476 	if (kmem_is_ptr_range(range_id)) {
477 		kmem_range_invalid_for_overwrite(addr);
478 	}
479 
480 	return &kmem_ranges[range_id];
481 }
482 
483 
484 #pragma mark entry parameters
485 
486 
487 __abortlike
488 static void
489 __kmem_entry_validate_panic(
490 	vm_map_t        map,
491 	vm_map_entry_t  entry,
492 	vm_offset_t     addr,
493 	vm_size_t       size,
494 	uint32_t        flags,
495 	kmem_guard_t    guard)
496 {
497 	const char *what = "???";
498 
499 	if (entry->vme_atomic != guard.kmg_atomic) {
500 		what = "atomicity";
501 	} else if (entry->is_sub_map != guard.kmg_submap) {
502 		what = "objectness";
503 	} else if (addr != entry->vme_start) {
504 		what = "left bound";
505 	} else if ((flags & KMF_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
506 		what = "right bound";
507 	} else if (guard.kmg_context != entry->vme_context) {
508 		what = "guard";
509 	}
510 
511 	panic("kmem(map=%p, addr=%p, size=%zd, flags=0x%x): "
512 	    "entry:%p %s mismatch guard(0x%08x)",
513 	    map, (void *)addr, size, flags, entry,
514 	    what, guard.kmg_context);
515 }
516 
517 static bool
518 __kmem_entry_validate_guard(
519 	vm_map_entry_t  entry,
520 	vm_offset_t     addr,
521 	vm_size_t       size,
522 	kmem_flags_t    flags,
523 	kmem_guard_t    guard)
524 {
525 	if (entry->vme_atomic != guard.kmg_atomic) {
526 		return false;
527 	}
528 
529 	if (!guard.kmg_atomic) {
530 		return true;
531 	}
532 
533 	if (entry->is_sub_map != guard.kmg_submap) {
534 		return false;
535 	}
536 
537 	if (addr != entry->vme_start) {
538 		return false;
539 	}
540 
541 	if ((flags & KMEM_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
542 		return false;
543 	}
544 
545 	if (!guard.kmg_submap && guard.kmg_context != entry->vme_context) {
546 		return false;
547 	}
548 
549 	return true;
550 }
551 
552 void
553 kmem_entry_validate_guard(
554 	vm_map_t        map,
555 	vm_map_entry_t  entry,
556 	vm_offset_t     addr,
557 	vm_size_t       size,
558 	kmem_guard_t    guard)
559 {
560 	if (!__kmem_entry_validate_guard(entry, addr, size, KMEM_NONE, guard)) {
561 		__kmem_entry_validate_panic(map, entry, addr, size, KMEM_NONE, guard);
562 	}
563 }
564 
565 __abortlike
566 static void
567 __kmem_entry_validate_object_panic(
568 	vm_map_t        map,
569 	vm_map_entry_t  entry,
570 	kmem_flags_t    flags)
571 {
572 	const char *what;
573 	const char *verb;
574 
575 	if (entry->is_sub_map) {
576 		panic("kmem(map=%p) entry %p is a submap", map, entry);
577 	}
578 
579 	if (flags & KMEM_KOBJECT) {
580 		what = "kernel";
581 		verb = "isn't";
582 	} else if (flags & KMEM_COMPRESSOR) {
583 		what = "compressor";
584 		verb = "isn't";
585 	} else if (entry->vme_kernel_object) {
586 		what = "kernel";
587 		verb = "is unexpectedly";
588 	} else {
589 		what = "compressor";
590 		verb = "is unexpectedly";
591 	}
592 
593 	panic("kmem(map=%p, flags=0x%x): entry %p %s for the %s object",
594 	    map, flags, entry, verb, what);
595 }
596 
597 static bool
598 __kmem_entry_validate_object(
599 	vm_map_entry_t  entry,
600 	kmem_flags_t    flags)
601 {
602 	if (entry->is_sub_map) {
603 		return false;
604 	}
605 	if ((bool)(flags & KMEM_KOBJECT) != entry->vme_kernel_object) {
606 		return false;
607 	}
608 
609 	return (bool)(flags & KMEM_COMPRESSOR) ==
610 	       (VME_OBJECT(entry) == compressor_object);
611 }
612 
613 vm_size_t
614 kmem_size_guard(
615 	vm_map_t        map,
616 	vm_offset_t     addr,
617 	kmem_guard_t    guard)
618 {
619 	kmem_flags_t flags = KMEM_GUESS_SIZE;
620 	vm_map_entry_t entry;
621 	vm_size_t size;
622 
623 	vm_map_lock_read(map);
624 
625 #if KASAN_CLASSIC
626 	addr -= PAGE_SIZE;
627 #endif /* KASAN_CLASSIC */
628 	addr = vm_memtag_canonicalize_address(addr);
629 
630 	if (!vm_map_lookup_entry(map, addr, &entry)) {
631 		__kmem_entry_not_found_panic(map, addr);
632 	}
633 
634 	if (!__kmem_entry_validate_guard(entry, addr, 0, flags, guard)) {
635 		__kmem_entry_validate_panic(map, entry, addr, 0, flags, guard);
636 	}
637 
638 	size = __kmem_entry_orig_size(entry);
639 
640 	vm_map_unlock_read(map);
641 
642 	return size;
643 }
644 
645 static inline uint16_t
646 kmem_hash_backtrace(
647 	void                     *fp)
648 {
649 	uint64_t  bt_count;
650 	uintptr_t bt[8] = {};
651 
652 	struct backtrace_control ctl = {
653 		.btc_frame_addr = (uintptr_t)fp,
654 	};
655 
656 	bt_count = backtrace(bt, sizeof(bt) / sizeof(bt[0]), &ctl, NULL);
657 	return (uint16_t) os_hash_jenkins(bt, bt_count * sizeof(bt[0]));
658 }
659 
660 static_assert(KMEM_RANGE_ID_DATA - 1 <= KMEM_RANGE_MASK,
661     "Insufficient bits to represent ptr ranges");
662 
663 kmem_range_id_t
664 kmem_adjust_range_id(
665 	uint32_t                  hash)
666 {
667 	return (kmem_range_id_t) (KMEM_RANGE_ID_PTR_0 +
668 	       (hash & KMEM_RANGE_MASK) % kmem_ptr_ranges);
669 }
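
/*
 * The low bits of the hash (KMEM_RANGE_MASK) pick one of the
 * kmem_ptr_ranges pointer ranges starting at KMEM_RANGE_ID_PTR_0;
 * kmem_apply_security_policy() below also takes KMEM_DIRECTION_MASK
 * from the same hash to decide whether the range is filled from the
 * left or from the right (vmkf_last_free).
 */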
670 
671 static bool
672 kmem_use_sprayqtn(
673 	kma_flags_t               kma_flags,
674 	vm_map_size_t             map_size,
675 	vm_offset_t               mask)
676 {
677 	/*
678 	 * Pointer allocations that are above the guard object threshold, or
679 	 * that have leading guard pages with non-standard alignment requests,
680 	 * are redirected to the sprayqtn range.
681 	 */
682 #if DEBUG || DEVELOPMENT
683 	btref_get_flags_t flags = (kma_flags & KMA_NOPAGEWAIT) ?
684 	    BTREF_GET_NOWAIT : 0;
685 
686 	if ((kma_flags & KMA_SPRAYQTN) == 0) {
687 		if (map_size > KMEM_GOBJ_THRESHOLD) {
688 			btlog_record(kmem_outlier_log, (void *)map_size, KMEM_OUTLIER_SIZE,
689 			    btref_get(__builtin_frame_address(0), flags));
690 		} else if ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK)) {
691 			btlog_record(kmem_outlier_log, (void *)mask, KMEM_OUTLIER_ALIGN,
692 			    btref_get(__builtin_frame_address(0), flags));
693 		}
694 	}
695 #endif /* DEBUG || DEVELOPMENT */
696 
697 	return (kma_flags & KMA_SPRAYQTN) ||
698 	       (map_size > KMEM_GOBJ_THRESHOLD) ||
699 	       ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK));
700 }
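
/*
 * Concretely: a 64MB pointer allocation exceeds KMEM_GOBJ_THRESHOLD
 * (32MB, defined above) and is redirected to KMEM_RANGE_ID_SPRAYQTN,
 * as is a KMA_GUARD_FIRST allocation with a stricter-than-page
 * alignment mask, or any allocation explicitly flagged KMA_SPRAYQTN.
 */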
701 
702 static void
703 kmem_apply_security_policy(
704 	vm_map_t                  map,
705 	kma_flags_t               kma_flags,
706 	kmem_guard_t              guard,
707 	vm_map_size_t             map_size,
708 	vm_offset_t               mask,
709 	vm_map_kernel_flags_t    *vmk_flags,
710 	bool                      assert_dir __unused)
711 {
712 	kmem_range_id_t range_id;
713 	bool from_right;
714 	uint16_t type_hash = guard.kmg_type_hash;
715 
716 	if (startup_phase < STARTUP_SUB_KMEM || map != kernel_map) {
717 		return;
718 	}
719 
720 	/*
721 	 * A non-zero type-hash must be passed by krealloc_type
722 	 */
723 #if (DEBUG || DEVELOPMENT)
724 	if (assert_dir && !(kma_flags & KMA_DATA)) {
725 		assert(type_hash != 0);
726 	}
727 #endif
728 
729 	if (kma_flags & KMA_DATA) {
730 		range_id  = KMEM_RANGE_ID_DATA;
731 		/*
732 		 * As an optimization in KMA_DATA to avoid fragmentation,
733 		 * allocate static carveouts at the end of the DATA range.
734 		 */
735 		from_right = (bool)(kma_flags & KMA_PERMANENT);
736 	} else if (kmem_use_sprayqtn(kma_flags, map_size, mask)) {
737 		range_id = KMEM_RANGE_ID_SPRAYQTN;
738 		from_right = (bool)(kma_flags & KMA_PERMANENT);
739 	} else if (type_hash) {
740 		range_id  = (kmem_range_id_t)(type_hash & KMEM_RANGE_MASK);
741 		from_right = type_hash & KMEM_DIRECTION_MASK;
742 	} else {
743 		/*
744 		 * Range id needs to correspond to one of the PTR ranges
745 		 */
746 		type_hash = (uint16_t) kmem_hash_backtrace(__builtin_frame_address(0));
747 		range_id  = kmem_adjust_range_id(type_hash);
748 		from_right = type_hash & KMEM_DIRECTION_MASK;
749 	}
750 
751 	vmk_flags->vmkf_range_id = range_id;
752 	vmk_flags->vmkf_last_free = from_right;
753 }
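
/*
 * Placement summary: KMA_DATA allocations go to the data range (with
 * KMA_PERMANENT carveouts packed from the far end); oversized or
 * overaligned pointer allocations go to the spray quarantine; callers
 * that supply a type hash (e.g. krealloc_type) use it directly; all
 * other allocations hash their own backtrace to choose one of the
 * pointer ranges and a fill direction.
 */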
754 
755 #pragma mark allocation
756 
757 static kmem_return_t
758 kmem_alloc_guard_internal(
759 	vm_map_t                map,
760 	vm_size_t               size,
761 	vm_offset_t             mask,
762 	kma_flags_t             flags,
763 	kmem_guard_t            guard,
764 	kern_return_t         (^alloc_pages)(vm_size_t, kma_flags_t, vm_page_t *))
765 {
766 	vm_object_t             object;
767 	vm_offset_t             delta = 0;
768 	vm_map_entry_t          entry = NULL;
769 	vm_map_offset_t         map_addr, fill_start;
770 	vm_map_size_t           map_size, fill_size;
771 	vm_page_t               guard_left = VM_PAGE_NULL;
772 	vm_page_t               guard_right = VM_PAGE_NULL;
773 	vm_page_t               wired_page_list = VM_PAGE_NULL;
774 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
775 	bool                    skip_guards;
776 	kmem_return_t           kmr = { };
777 
778 	assert(kernel_map && map->pmap == kernel_pmap);
779 
780 #if DEBUG || DEVELOPMENT
781 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
782 	    size, 0, 0, 0);
783 #endif
784 
785 	if (size == 0 ||
786 	    (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) ||
787 	    (size < __kmem_guard_size(ANYF(flags)))) {
788 		__kmem_invalid_size_panic(map, size, flags);
789 	}
790 
791 	/*
792 	 * Limit the size of a single extent of wired memory, to try and
793 	 * limit the damage to the system if too many pages get wired down.
794 	 * The limit is 2GB up to the 128GB maximum physical memory limit,
795 	 * and scales with installed memory above that: with sane_size of
796 	 * 256GB, MAX(1ULL << 31, sane_size / 64) comes to 4GB.
797 	 *
798 	 * Note: kmem_alloc_contig_guard() is immune to this check.
799 	 */
800 	if (__improbable(!(flags & (KMA_VAONLY | KMA_PAGEABLE)) &&
801 	    alloc_pages == NULL &&
802 	    size > MAX(1ULL << 31, sane_size / 64))) {
803 		kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
804 		goto out_error;
805 	}
806 
807 	/*
808 	 * Guard pages:
809 	 *
810 	 * Guard pages are implemented as fictitious pages.
811 	 *
812 	 * However, some maps, and some objects are known
813 	 * to manage their memory explicitly, and do not need
814 	 * those to be materialized, which saves memory.
815 	 *
816 	 * By placing guard pages on either end of a stack,
817 	 * they can help detect cases where a thread walks
818 	 * off either end of its stack.
819 	 *
820 	 * They are allocated and set up here and attempts
821 	 * to access those pages are trapped in vm_fault_page().
822 	 *
823 	 * The map_size we were passed may include extra space for
824 	 * guard pages. fill_size represents the actual size to populate.
825 	 * Similarly, fill_start indicates where the actual pages
826 	 * will begin in the range.
827 	 */
828 
829 	map_size   = round_page(size);
830 	fill_start = 0;
831 	fill_size  = map_size - __kmem_guard_size(ANYF(flags));
832 
833 #if KASAN_CLASSIC
834 	if (flags & KMA_KASAN_GUARD) {
835 		assert((flags & (KMA_GUARD_FIRST | KMA_GUARD_LAST)) == 0);
836 		flags |= KMA_GUARD_FIRST | KMA_GUARD_LAST;
837 		delta     = ptoa(2);
838 		map_size += delta;
839 	}
840 #else
841 	(void)delta;
842 #endif /* KASAN_CLASSIC */
843 
844 	skip_guards = (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) ||
845 	    map->never_faults;
846 
847 	if (flags & KMA_GUARD_FIRST) {
848 		vmk_flags.vmkf_guard_before = true;
849 		fill_start += PAGE_SIZE;
850 	}
851 	if ((flags & KMA_GUARD_FIRST) && !skip_guards) {
852 		guard_left = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
853 		if (__improbable(guard_left == VM_PAGE_NULL)) {
854 			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
855 			goto out_error;
856 		}
857 	}
858 	if ((flags & KMA_GUARD_LAST) && !skip_guards) {
859 		guard_right = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
860 		if (__improbable(guard_right == VM_PAGE_NULL)) {
861 			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
862 			goto out_error;
863 		}
864 	}
865 
866 	if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
867 		if (alloc_pages) {
868 			kmr.kmr_return = alloc_pages(fill_size, flags,
869 			    &wired_page_list);
870 		} else {
871 			kmr.kmr_return = vm_page_alloc_list(atop(fill_size), flags,
872 			    &wired_page_list);
873 		}
874 		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
875 			goto out_error;
876 		}
877 	}
878 
879 	/*
880 	 *	Allocate a new object (if necessary).  We must do this before
881 	 *	locking the map, or risk deadlock with the default pager.
882 	 */
883 	if (flags & KMA_KOBJECT) {
884 		object = kernel_object_default;
885 		vm_object_reference(object);
886 	} else if (flags & KMA_COMPRESSOR) {
887 		object = compressor_object;
888 		vm_object_reference(object);
889 	} else {
890 		object = vm_object_allocate(map_size);
891 		vm_object_lock(object);
892 		vm_object_set_size(object, map_size, size);
893 		/* stabilize the object to prevent shadowing */
894 		object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
895 		VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
896 		vm_object_unlock(object);
897 	}
898 
899 	if (flags & KMA_LAST_FREE) {
900 		vmk_flags.vmkf_last_free = true;
901 	}
902 	if (flags & KMA_PERMANENT) {
903 		vmk_flags.vmf_permanent = true;
904 	}
905 	kmem_apply_security_policy(map, flags, guard, map_size, mask, &vmk_flags,
906 	    false);
907 
908 	kmr.kmr_return = vm_map_find_space(map, 0, map_size, mask,
909 	    vmk_flags, &entry);
910 	if (__improbable(KERN_SUCCESS != kmr.kmr_return)) {
911 		vm_object_deallocate(object);
912 		goto out_error;
913 	}
914 
915 	map_addr = entry->vme_start;
916 	VME_OBJECT_SET(entry, object, guard.kmg_atomic, guard.kmg_context);
917 	VME_ALIAS_SET(entry, guard.kmg_tag);
918 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
919 		VME_OFFSET_SET(entry, map_addr);
920 	}
921 
922 #if KASAN
923 	if ((flags & KMA_KOBJECT) && guard.kmg_atomic) {
924 		entry->vme_object_or_delta = (-size & PAGE_MASK) + delta;
925 	}
926 #endif /* KASAN */
927 
928 	if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE))) {
929 		entry->wired_count = 1;
930 		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
931 	}
932 
933 	if (guard_left || guard_right || wired_page_list) {
934 		vm_object_offset_t offset = 0ull;
935 
936 		vm_object_lock(object);
937 		vm_map_unlock(map);
938 
939 		if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
940 			offset = map_addr;
941 		}
942 
943 		if (guard_left) {
944 			vm_page_insert(guard_left, object, offset);
945 			guard_left->vmp_busy = FALSE;
946 			guard_left = VM_PAGE_NULL;
947 		}
948 
949 		if (guard_right) {
950 			vm_page_insert(guard_right, object,
951 			    offset + fill_start + fill_size);
952 			guard_right->vmp_busy = FALSE;
953 			guard_right = VM_PAGE_NULL;
954 		}
955 
956 		if (wired_page_list) {
957 			kernel_memory_populate_object_and_unlock(object,
958 			    map_addr + fill_start, offset + fill_start, fill_size,
959 			    wired_page_list, flags, guard.kmg_tag, VM_PROT_DEFAULT,
960 			    __kmem_mapping_type(ANYF(flags)));
961 		} else {
962 			vm_object_unlock(object);
963 		}
964 	} else {
965 		vm_map_unlock(map);
966 	}
967 
968 	/*
969 	 * now that the pages are wired, we no longer have to fear coalescing
970 	 */
971 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
972 		vm_map_simplify(map, map_addr);
973 	}
974 
975 #if DEBUG || DEVELOPMENT
976 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
977 	    atop(fill_size), 0, 0, 0);
978 #endif /* DEBUG || DEVELOPMENT */
979 	kmr.kmr_address = CAST_DOWN(vm_offset_t, map_addr);
980 
981 #if KASAN
982 	if (flags & (KMA_KASAN_GUARD | KMA_PAGEABLE)) {
983 		/*
984 		 * We need to allow the range for pageable memory,
985 		 * or faulting will not be allowed.
986 		 */
987 		kasan_notify_address(map_addr, map_size);
988 	}
989 #endif /* KASAN */
990 #if KASAN_CLASSIC
991 	if (flags & KMA_KASAN_GUARD) {
992 		kmr.kmr_address += PAGE_SIZE;
993 		kasan_alloc_large(kmr.kmr_address, size);
994 	}
995 #endif /* KASAN_CLASSIC */
996 #if CONFIG_KERNEL_TAGGING
997 	if (!(flags & KMA_VAONLY) && (flags & KMA_TAG)) {
998 		kmr.kmr_address = vm_memtag_assign_tag(kmr.kmr_address, size);
999 		vm_memtag_set_tag((vm_offset_t)kmr.kmr_address, size);
1000 #if KASAN_TBI
1001 		kasan_tbi_retag_unused_space((vm_offset_t)kmr.kmr_address, map_size, size);
1002 #endif /* KASAN_TBI */
1003 	}
1004 #endif /* CONFIG_KERNEL_TAGGING */
1005 	return kmr;
1006 
1007 out_error:
1008 	if (flags & KMA_NOFAIL) {
1009 		__kmem_failed_panic(map, size, flags, kmr.kmr_return, "alloc");
1010 	}
1011 	if (guard_left) {
1012 		guard_left->vmp_snext = wired_page_list;
1013 		wired_page_list = guard_left;
1014 	}
1015 	if (guard_right) {
1016 		guard_right->vmp_snext = wired_page_list;
1017 		wired_page_list = guard_right;
1018 	}
1019 	if (wired_page_list) {
1020 		vm_page_free_list(wired_page_list, FALSE);
1021 	}
1022 
1023 #if DEBUG || DEVELOPMENT
1024 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
1025 	    0, 0, 0, 0);
1026 #endif /* DEBUG || DEVELOPMENT */
1027 
1028 	return kmr;
1029 }
1030 
1031 kmem_return_t
1032 kmem_alloc_guard(
1033 	vm_map_t        map,
1034 	vm_size_t       size,
1035 	vm_offset_t     mask,
1036 	kma_flags_t     flags,
1037 	kmem_guard_t    guard)
1038 {
1039 	return kmem_alloc_guard_internal(map, size, mask, flags, guard, NULL);
1040 }
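
/*
 * Minimal usage sketch (hypothetical caller, not part of the original
 * file): a wired kernel-object allocation with guard pages on both
 * ends. The same kmem_guard_t would have to be reproduced when the
 * memory is released with kmem_free_guard().
 */
#if 0
static kern_return_t
example_guarded_alloc(vm_size_t size, vm_offset_t *addrp)
{
	kmem_guard_t  guard = { .kmg_tag = VM_KERN_MEMORY_KALLOC };
	kmem_return_t kmr;

	kmr = kmem_alloc_guard(kernel_map, size, /* mask */ 0,
	    KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard);
	if (kmr.kmr_return == KERN_SUCCESS) {
		*addrp = kmr.kmr_address;
	}
	return kmr.kmr_return;
}
#endif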
1041 
1042 kmem_return_t
1043 kmem_alloc_contig_guard(
1044 	vm_map_t                map,
1045 	vm_size_t               size,
1046 	vm_offset_t             mask,
1047 	ppnum_t                 max_pnum,
1048 	ppnum_t                 pnum_mask,
1049 	kma_flags_t             flags,
1050 	kmem_guard_t            guard)
1051 {
1052 	__auto_type alloc_pages = ^(vm_size_t fill_size, kma_flags_t kma_flags, vm_page_t *pages) {
1053 		return cpm_allocate(fill_size, pages, max_pnum, pnum_mask, FALSE, kma_flags);
1054 	};
1055 
1056 	return kmem_alloc_guard_internal(map, size, mask, flags, guard, alloc_pages);
1057 }
1058 
1059 kmem_return_t
1060 kmem_suballoc(
1061 	vm_map_t                parent,
1062 	mach_vm_offset_t       *addr,
1063 	vm_size_t               size,
1064 	vm_map_create_options_t vmc_options,
1065 	int                     vm_flags,
1066 	kms_flags_t             flags,
1067 	vm_tag_t                tag)
1068 {
1069 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1070 	vm_map_offset_t map_addr = 0;
1071 	kmem_return_t kmr = { };
1072 	vm_map_t map;
1073 
1074 	assert(page_aligned(size));
1075 	assert(parent->pmap == kernel_pmap);
1076 
1077 	vm_map_kernel_flags_set_vmflags(&vmk_flags, vm_flags, tag);
1078 
1079 	if (parent == kernel_map) {
1080 		assert(vmk_flags.vmf_overwrite || (flags & KMS_DATA));
1081 	}
1082 
1083 	if (vmk_flags.vmf_fixed) {
1084 		map_addr = trunc_page(*addr);
1085 	}
1086 
1087 	pmap_reference(vm_map_pmap(parent));
1088 	map = vm_map_create_options(vm_map_pmap(parent), 0, size, vmc_options);
1089 
1090 	/*
1091 	 * 1. vm_map_enter() will consume one ref on success.
1092 	 *
1093 	 * 2. make the entry atomic as kernel submaps should never be split.
1094 	 *
1095 	 * 3. instruct vm_map_enter() that it is a fresh submap
1096 	 *    that needs to be taught its bounds as it inserted.
1097 	 */
1098 	vm_map_reference(map);
1099 
1100 	vmk_flags.vmkf_submap = true;
1101 	if ((flags & KMS_DATA) == 0) {
1102 		/* FIXME: IOKit submaps get fragmented and can't be atomic */
1103 		vmk_flags.vmkf_submap_atomic = true;
1104 	}
1105 	vmk_flags.vmkf_submap_adjust = true;
1106 	if (flags & KMS_LAST_FREE) {
1107 		vmk_flags.vmkf_last_free = true;
1108 	}
1109 	if (flags & KMS_PERMANENT) {
1110 		vmk_flags.vmf_permanent = true;
1111 	}
1112 	if (flags & KMS_DATA) {
1113 		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
1114 	}
1115 
1116 	kmr.kmr_return = vm_map_enter(parent, &map_addr, size, 0,
1117 	    vmk_flags, (vm_object_t)map, 0, FALSE,
1118 	    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
1119 
1120 	if (kmr.kmr_return != KERN_SUCCESS) {
1121 		if (flags & KMS_NOFAIL) {
1122 			panic("kmem_suballoc(map=%p, size=%zd) failed with %d",
1123 			    parent, size, kmr.kmr_return);
1124 		}
1125 		assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1126 		vm_map_deallocate(map);
1127 		vm_map_deallocate(map); /* also removes ref to pmap */
1128 		return kmr;
1129 	}
1130 
1131 	/*
1132 	 * For kmem_suballocs that register a claim and are assigned a range, ensure
1133 	 * that the exact same range is returned.
1134 	 */
1135 	if (*addr != 0 && parent == kernel_map &&
1136 	    startup_phase > STARTUP_SUB_KMEM) {
1137 		assert(CAST_DOWN(vm_offset_t, map_addr) == *addr);
1138 	} else {
1139 		*addr = map_addr;
1140 	}
1141 
1142 	kmr.kmr_submap = map;
1143 	return kmr;
1144 }
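
/*
 * Illustrative use: a subsystem carving a pageable submap out of
 * kernel_map would pass KMS_DATA (placing the submap in the data
 * range, per the vmkf_range_id assignment above) and read the new map
 * from kmr_submap and its base address from *addr on return.
 */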
1145 
1146 /*
1147  *	kmem_alloc:
1148  *
1149  *	Allocate wired-down memory in the kernel's address map
1150  *	or a submap.  The memory is not zero-filled.
1151  */
1152 
1153 __exported kern_return_t
1154 kmem_alloc_external(
1155 	vm_map_t        map,
1156 	vm_offset_t     *addrp,
1157 	vm_size_t       size);
1158 kern_return_t
1159 kmem_alloc_external(
1160 	vm_map_t        map,
1161 	vm_offset_t     *addrp,
1162 	vm_size_t       size)
1163 {
1164 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1165 		return kmem_alloc(map, addrp, size, KMA_NONE, vm_tag_bt());
1166 	}
1167 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1168 	return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
1169 }
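
/*
 * The (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) guard used by these
 * wrappers rejects sizes too large to ever fit in the kernel's address
 * space before reaching the allocator; e.g. assuming 39 significant
 * pointer bits, any size of 512GB or more fails fast here (the exact
 * width is platform-dependent).
 */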
1170 
1171 
1172 /*
1173  *	kmem_alloc_kobject:
1174  *
1175  *	Allocate wired-down memory in the kernel's address map
1176  *	or a submap.  The memory is not zero-filled.
1177  *
1178  *	The memory is allocated in the kernel_object.
1179  *	It may not be copied with vm_map_copy, and
1180  *	it may not be reallocated with kmem_realloc.
1181  */
1182 
1183 __exported kern_return_t
1184 kmem_alloc_kobject_external(
1185 	vm_map_t        map,
1186 	vm_offset_t     *addrp,
1187 	vm_size_t       size);
1188 kern_return_t
1189 kmem_alloc_kobject_external(
1190 	vm_map_t        map,
1191 	vm_offset_t     *addrp,
1192 	vm_size_t       size)
1193 {
1194 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1195 		return kmem_alloc(map, addrp, size, KMA_KOBJECT, vm_tag_bt());
1196 	}
1197 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1198 	return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
1199 }
1200 
1201 /*
1202  *	kmem_alloc_pageable:
1203  *
1204  *	Allocate pageable memory in the kernel's address map.
1205  */
1206 
1207 __exported kern_return_t
1208 kmem_alloc_pageable_external(
1209 	vm_map_t        map,
1210 	vm_offset_t     *addrp,
1211 	vm_size_t       size);
1212 kern_return_t
1213 kmem_alloc_pageable_external(
1214 	vm_map_t        map,
1215 	vm_offset_t     *addrp,
1216 	vm_size_t       size)
1217 {
1218 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1219 		return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA, vm_tag_bt());
1220 	}
1221 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1222 	return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
1223 }
1224 
1225 
1226 #pragma mark population
1227 
1228 static void
1229 kernel_memory_populate_pmap_enter(
1230 	vm_object_t             object,
1231 	vm_address_t            addr,
1232 	vm_object_offset_t      offset,
1233 	vm_page_t               mem,
1234 	vm_prot_t               prot,
1235 	int                     pe_flags,
1236 	pmap_mapping_type_t     mapping_type)
1237 {
1238 	kern_return_t   pe_result;
1239 	int             pe_options;
1240 
1241 	if (VMP_ERROR_GET(mem)) {
1242 		panic("VM page %p should not have an error", mem);
1243 	}
1244 
1245 	pe_options = PMAP_OPTIONS_NOWAIT;
1246 	if (object->internal) {
1247 		pe_options |= PMAP_OPTIONS_INTERNAL;
1248 	}
1249 	if (mem->vmp_reusable || object->all_reusable) {
1250 		pe_options |= PMAP_OPTIONS_REUSABLE;
1251 	}
1252 
1253 	pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1254 	    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1255 	    pe_flags, /* wired */ TRUE, pe_options, NULL, mapping_type);
1256 
1257 	if (pe_result == KERN_RESOURCE_SHORTAGE) {
1258 		vm_object_unlock(object);
1259 
1260 		pe_options &= ~PMAP_OPTIONS_NOWAIT;
1261 
1262 		pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1263 		    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1264 		    pe_flags, /* wired */ TRUE, pe_options, NULL, mapping_type);
1265 
1266 		vm_object_lock(object);
1267 	}
1268 
1269 	assert(pe_result == KERN_SUCCESS);
1270 }
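
/*
 * Note the retry pattern above: the first pmap_enter_options() attempt
 * runs with PMAP_OPTIONS_NOWAIT while the object is locked; on
 * KERN_RESOURCE_SHORTAGE the object lock is dropped so the pmap layer
 * may block for resources, the mapping is retried without NOWAIT, and
 * the object lock is retaken before returning.
 */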
1271 
1272 void
1273 kernel_memory_populate_object_and_unlock(
1274 	vm_object_t             object, /* must be locked */
1275 	vm_address_t            addr,
1276 	vm_offset_t             offset,
1277 	vm_size_t               size,
1278 	vm_page_t               page_list,
1279 	kma_flags_t             flags,
1280 	vm_tag_t                tag,
1281 	vm_prot_t               prot,
1282 	pmap_mapping_type_t     mapping_type)
1283 {
1284 	vm_page_t       mem;
1285 	int             pe_flags;
1286 	bool            gobbled_list = page_list && page_list->vmp_gobbled;
1287 
1288 	assert(((flags & KMA_KOBJECT) != 0) == (is_kernel_object(object) != 0));
1289 	assert3u((bool)(flags & KMA_COMPRESSOR), ==, object == compressor_object);
1290 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
1291 		assert3u(offset, ==, addr);
1292 	} else {
1293 		/*
1294 		 * kernel_memory_populate_pmap_enter() might drop the object
1295 		 * lock, and the caller might not own a reference anymore
1296 		 * and rely on holding the vm object lock for liveness.
1297 		 */
1298 		vm_object_reference_locked(object);
1299 	}
1300 
1301 	if (flags & KMA_KSTACK) {
1302 		pe_flags = VM_MEM_STACK;
1303 	} else {
1304 		pe_flags = 0;
1305 	}
1306 
1307 
1308 	for (vm_object_offset_t pg_offset = 0;
1309 	    pg_offset < size;
1310 	    pg_offset += PAGE_SIZE_64) {
1311 		if (page_list == NULL) {
1312 			panic("%s: page_list too short", __func__);
1313 		}
1314 
1315 		mem = page_list;
1316 		page_list = mem->vmp_snext;
1317 		mem->vmp_snext = NULL;
1318 
1319 		assert(mem->vmp_wire_count == 0);
1320 		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
1321 		assert(!mem->vmp_fictitious && !mem->vmp_private);
1322 
1323 		if (flags & KMA_COMPRESSOR) {
1324 			mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
1325 			/*
1326 			 * Background processes doing I/O accounting can call
1327 			 * into NVME driver to do some work which results in
1328 			 * an allocation here and so we want to make sure
1329 			 * that the pages used by compressor, regardless of
1330 			 * process context, are never on the special Q.
1331 			 */
1332 			mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
1333 
1334 			vm_page_insert(mem, object, offset + pg_offset);
1335 		} else {
1336 			mem->vmp_q_state = VM_PAGE_IS_WIRED;
1337 			mem->vmp_wire_count = 1;
1338 
1339 			vm_page_insert_wired(mem, object, offset + pg_offset, tag);
1340 		}
1341 
1342 		mem->vmp_gobbled = false;
1343 		mem->vmp_busy = false;
1344 		mem->vmp_pmapped = true;
1345 		mem->vmp_wpmapped = true;
1346 
1347 		/*
1348 		 * Manual PMAP_ENTER_OPTIONS() with shortcuts
1349 		 * for the kernel and compressor objects.
1350 		 */
1351 		kernel_memory_populate_pmap_enter(object, addr, pg_offset,
1352 		    mem, prot, pe_flags, mapping_type);
1353 
1354 		if (flags & KMA_NOENCRYPT) {
1355 			pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
1356 		}
1357 	}
1358 
1359 	if (page_list) {
1360 		panic("%s: page_list too long", __func__);
1361 	}
1362 
1363 	vm_object_unlock(object);
1364 	if ((flags & (KMA_KOBJECT | KMA_COMPRESSOR)) == 0) {
1365 		vm_object_deallocate(object);
1366 	}
1367 
1368 	/*
1369 	 * Update the accounting:
1370 	 * - the compressor "wired" pages don't really count as wired
1371 	 * - kmem_alloc_contig_guard() gives gobbled pages,
1372 	 *   which already count as wired but need to be ungobbled.
1373 	 */
1374 	if (gobbled_list) {
1375 		vm_page_lockspin_queues();
1376 		if (flags & KMA_COMPRESSOR) {
1377 			vm_page_wire_count -= atop(size);
1378 		}
1379 		vm_page_gobble_count -= atop(size);
1380 		vm_page_unlock_queues();
1381 	} else if ((flags & KMA_COMPRESSOR) == 0) {
1382 		vm_page_lockspin_queues();
1383 		vm_page_wire_count += atop(size);
1384 		vm_page_unlock_queues();
1385 	}
1386 
1387 	if (flags & KMA_KOBJECT) {
1388 		/* vm_page_insert_wired() handles regular objects already */
1389 		vm_tag_update_size(tag, size, NULL);
1390 	}
1391 
1392 #if KASAN
1393 	if (flags & KMA_COMPRESSOR) {
1394 		kasan_notify_address_nopoison(addr, size);
1395 	} else {
1396 		kasan_notify_address(addr, size);
1397 	}
1398 #endif /* KASAN */
1399 }
1400 
1401 
1402 kern_return_t
1403 kernel_memory_populate(
1404 	vm_offset_t     addr,
1405 	vm_size_t       size,
1406 	kma_flags_t     flags,
1407 	vm_tag_t        tag)
1408 {
1409 	kern_return_t   kr = KERN_SUCCESS;
1410 	vm_page_t       page_list = NULL;
1411 	vm_size_t       page_count = atop_64(size);
1412 	vm_object_t     object = __kmem_object(ANYF(flags));
1413 
1414 #if DEBUG || DEVELOPMENT
1415 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
1416 	    size, 0, 0, 0);
1417 #endif /* DEBUG || DEVELOPMENT */
1418 
1419 	kr = vm_page_alloc_list(page_count, flags, &page_list);
1420 	if (kr == KERN_SUCCESS) {
1421 		vm_object_lock(object);
1422 		kernel_memory_populate_object_and_unlock(object, addr,
1423 		    addr, size, page_list, flags, tag, VM_PROT_DEFAULT,
1424 		    __kmem_mapping_type(ANYF(flags)));
1425 	}
1426 
1427 #if DEBUG || DEVELOPMENT
1428 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
1429 	    page_count, 0, 0, 0);
1430 #endif /* DEBUG || DEVELOPMENT */
1431 	return kr;
1432 }
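
/*
 * Sketch of the expected populate/depopulate pairing (an assumption
 * drawn from the flags used in this file, not a verbatim xnu recipe):
 * reserve VA only, then commit and release backing pages on demand:
 *
 *	kmr = kmem_alloc_guard(kernel_map, size, 0,
 *	    KMA_KOBJECT | KMA_VAONLY, guard);
 *	kernel_memory_populate(kmr.kmr_address, size, KMA_KOBJECT, tag);
 *	...
 *	kernel_memory_depopulate(kmr.kmr_address, size, KMA_KOBJECT, tag);
 */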
1433 
1434 void
1435 kernel_memory_depopulate(
1436 	vm_offset_t        addr,
1437 	vm_size_t          size,
1438 	kma_flags_t        flags,
1439 	vm_tag_t           tag)
1440 {
1441 	vm_object_t        object = __kmem_object(ANYF(flags));
1442 	vm_object_offset_t offset = addr;
1443 	vm_page_t          mem;
1444 	vm_page_t          local_freeq = NULL;
1445 	unsigned int       pages_unwired = 0;
1446 
1447 	vm_object_lock(object);
1448 
1449 	pmap_protect(kernel_pmap, offset, offset + size, VM_PROT_NONE);
1450 
1451 	for (vm_object_offset_t pg_offset = 0;
1452 	    pg_offset < size;
1453 	    pg_offset += PAGE_SIZE_64) {
1454 		mem = vm_page_lookup(object, offset + pg_offset);
1455 
1456 		assert(mem);
1457 
1458 		if (flags & KMA_COMPRESSOR) {
1459 			assert(mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
1460 		} else {
1461 			assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
1462 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
1463 			pages_unwired++;
1464 		}
1465 
1466 		mem->vmp_busy = TRUE;
1467 
1468 		assert(mem->vmp_tabled);
1469 		vm_page_remove(mem, TRUE);
1470 		assert(mem->vmp_busy);
1471 
1472 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
1473 
1474 		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
1475 		mem->vmp_snext = local_freeq;
1476 		local_freeq = mem;
1477 	}
1478 
1479 	vm_object_unlock(object);
1480 
1481 	vm_page_free_list(local_freeq, TRUE);
1482 
1483 	if (!(flags & KMA_COMPRESSOR)) {
1484 		vm_page_lockspin_queues();
1485 		vm_page_wire_count -= pages_unwired;
1486 		vm_page_unlock_queues();
1487 	}
1488 
1489 	if (flags & KMA_KOBJECT) {
1490 		/* vm_page_remove() handles regular objects already */
1491 		vm_tag_update_size(tag, -ptoa_64(pages_unwired), NULL);
1492 	}
1493 }
1494 
1495 #pragma mark reallocation
1496 
1497 __abortlike
1498 static void
1499 __kmem_realloc_invalid_object_size_panic(
1500 	vm_map_t                map,
1501 	vm_address_t            address,
1502 	vm_size_t               size,
1503 	vm_map_entry_t          entry)
1504 {
1505 	vm_object_t object  = VME_OBJECT(entry);
1506 	vm_size_t   objsize = __kmem_entry_orig_size(entry);
1507 
1508 	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
1509 	    "object %p has unexpected size %ld",
1510 	    map, (void *)address, (size_t)size, entry, object, objsize);
1511 }
1512 
1513 __abortlike
1514 static void
1515 __kmem_realloc_invalid_pager_panic(
1516 	vm_map_t                map,
1517 	vm_address_t            address,
1518 	vm_size_t               size,
1519 	vm_map_entry_t          entry)
1520 {
1521 	vm_object_t object     = VME_OBJECT(entry);
1522 	memory_object_t pager  = object->pager;
1523 	bool pager_created     = object->pager_created;
1524 	bool pager_initialized = object->pager_initialized;
1525 	bool pager_ready       = object->pager_ready;
1526 
1527 	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
1528 	    "object %p has unexpected pager %p (%d,%d,%d)",
1529 	    map, (void *)address, (size_t)size, entry, object,
1530 	    pager, pager_created, pager_initialized, pager_ready);
1531 }
1532 
1533 static kmem_return_t
1534 kmem_realloc_shrink_guard(
1535 	vm_map_t                map,
1536 	vm_offset_t             req_oldaddr,
1537 	vm_size_t               req_oldsize,
1538 	vm_size_t               req_newsize,
1539 	kmr_flags_t             flags,
1540 	kmem_guard_t            guard,
1541 	vm_map_entry_t          entry)
1542 {
1543 	vmr_flags_t             vmr_flags = VM_MAP_REMOVE_KUNWIRE;
1544 	vm_object_t             object;
1545 	vm_offset_t             delta = 0;
1546 	kmem_return_t           kmr;
1547 	bool                    was_atomic;
1548 	vm_size_t               oldsize = round_page(req_oldsize);
1549 	vm_size_t               newsize = round_page(req_newsize);
1550 	vm_address_t            oldaddr = req_oldaddr;
1551 
1552 #if KASAN_CLASSIC
1553 	if (flags & KMR_KASAN_GUARD) {
1554 		assert((flags & (KMR_GUARD_FIRST | KMR_GUARD_LAST)) == 0);
1555 		flags   |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1556 		oldaddr -= PAGE_SIZE;
1557 		delta    = ptoa(2);
1558 		oldsize += delta;
1559 		newsize += delta;
1560 	}
1561 #endif /* KASAN_CLASSIC */
1562 
1563 	if (flags & KMR_TAG) {
1564 		oldaddr = vm_memtag_canonicalize_address(req_oldaddr);
1565 	}
1566 
1567 	vm_map_lock_assert_exclusive(map);
1568 
1569 	if ((flags & KMR_KOBJECT) == 0) {
1570 		object = VME_OBJECT(entry);
1571 		vm_object_reference(object);
1572 	}
1573 
1574 	/*
1575 	 *	Shrinking an atomic entry starts with splitting it,
1576 	 *	and removing the second half.
1577 	 */
1578 	was_atomic = entry->vme_atomic;
1579 	entry->vme_atomic = false;
1580 	vm_map_clip_end(map, entry, entry->vme_start + newsize);
1581 	entry->vme_atomic = was_atomic;
1582 
1583 #if KASAN
1584 	if (entry->vme_kernel_object && was_atomic) {
1585 		entry->vme_object_or_delta = (-req_newsize & PAGE_MASK) + delta;
1586 	}
1587 #if KASAN_CLASSIC
1588 	if (flags & KMR_KASAN_GUARD) {
1589 		kasan_poison_range(oldaddr + newsize, oldsize - newsize,
1590 		    ASAN_VALID);
1591 	}
1592 #endif
1593 #if KASAN_TBI
1594 	if (flags & KMR_TAG) {
1595 		kasan_tbi_mark_free_space(req_oldaddr + newsize, oldsize - newsize);
1596 	}
1597 #endif /* KASAN_TBI */
1598 #endif /* KASAN */
1599 	(void)vm_map_remove_and_unlock(map,
1600 	    oldaddr + newsize, oldaddr + oldsize,
1601 	    vmr_flags, KMEM_GUARD_NONE);
1602 
1603 
1604 	/*
1605 	 *	Lastly, if there are guard pages, deal with them.
1606 	 *
1607 	 *	The kernel object just needs to depopulate;
1608 	 *	regular objects require freeing the last page
1609 	 *	and replacing it with a guard.
1610 	 */
1611 	if (flags & KMR_KOBJECT) {
1612 		if (flags & KMR_GUARD_LAST) {
1613 			kernel_memory_depopulate(oldaddr + newsize - PAGE_SIZE,
1614 			    PAGE_SIZE, KMA_KOBJECT, guard.kmg_tag);
1615 		}
1616 	} else {
1617 		vm_page_t guard_right = VM_PAGE_NULL;
1618 		vm_offset_t remove_start = newsize;
1619 
1620 		if (flags & KMR_GUARD_LAST) {
1621 			if (!map->never_faults) {
1622 				guard_right = vm_page_grab_guard(true);
1623 			}
1624 			remove_start -= PAGE_SIZE;
1625 		}
1626 
1627 		vm_object_lock(object);
1628 
1629 		if (object->vo_size != oldsize) {
1630 			__kmem_realloc_invalid_object_size_panic(map,
1631 			    req_oldaddr, req_oldsize + delta, entry);
1632 		}
1633 		vm_object_set_size(object, newsize, req_newsize);
1634 
1635 		vm_object_page_remove(object, remove_start, oldsize);
1636 
1637 		if (guard_right) {
1638 			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
1639 			guard_right->vmp_busy = false;
1640 		}
1641 		vm_object_unlock(object);
1642 		vm_object_deallocate(object);
1643 	}
1644 
1645 	kmr.kmr_address = req_oldaddr;
1646 	kmr.kmr_return  = 0;
1647 #if KASAN_CLASSIC
1648 	if (flags & KMA_KASAN_GUARD) {
1649 		kasan_alloc_large(kmr.kmr_address, req_newsize);
1650 	}
1651 #endif /* KASAN_CLASSIC */
1652 #if KASAN_TBI
1653 	if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
1654 		kmr.kmr_address = vm_memtag_assign_tag(kmr.kmr_address, req_newsize);
1655 		vm_memtag_set_tag(kmr.kmr_address, req_newsize);
1656 		kasan_tbi_retag_unused_space(kmr.kmr_address, newsize, req_newsize);
1657 	}
1658 #endif /* KASAN_TBI */
1659 
1660 	return kmr;
1661 }
1662 
1663 kmem_return_t
1664 kmem_realloc_guard(
1665 	vm_map_t                map,
1666 	vm_offset_t             req_oldaddr,
1667 	vm_size_t               req_oldsize,
1668 	vm_size_t               req_newsize,
1669 	kmr_flags_t             flags,
1670 	kmem_guard_t            guard)
1671 {
1672 	vm_object_t             object;
1673 	vm_size_t               oldsize;
1674 	vm_size_t               newsize;
1675 	vm_offset_t             delta = 0;
1676 	vm_map_offset_t         oldaddr;
1677 	vm_map_offset_t         newaddr;
1678 	vm_object_offset_t      newoffs;
1679 	vm_map_entry_t          oldentry;
1680 	vm_map_entry_t          newentry;
1681 	vm_page_t               page_list = NULL;
1682 	bool                    needs_wakeup = false;
1683 	kmem_return_t           kmr = { };
1684 	unsigned int            last_timestamp;
1685 	vm_map_kernel_flags_t   vmk_flags = {
1686 		.vmkf_last_free = (bool)(flags & KMR_LAST_FREE),
1687 	};
1688 
1689 	assert(KMEM_REALLOC_FLAGS_VALID(flags));
1690 	if (!guard.kmg_atomic && (flags & (KMR_DATA | KMR_KOBJECT)) != KMR_DATA) {
1691 		__kmem_invalid_arguments_panic("realloc", map, req_oldaddr,
1692 		    req_oldsize, flags);
1693 	}
1694 
1695 	if (req_oldaddr == 0ul) {
1696 		return kmem_alloc_guard(map, req_newsize, 0, (kma_flags_t)flags, guard);
1697 	}
1698 
1699 	if (req_newsize == 0ul) {
1700 		kmem_free_guard(map, req_oldaddr, req_oldsize,
1701 		    (kmf_flags_t)flags, guard);
1702 		return kmr;
1703 	}
1704 
1705 	if (req_newsize >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) {
1706 		__kmem_invalid_size_panic(map, req_newsize, flags);
1707 	}
1708 	if (req_newsize < __kmem_guard_size(ANYF(flags))) {
1709 		__kmem_invalid_size_panic(map, req_newsize, flags);
1710 	}
1711 
1712 	oldsize = round_page(req_oldsize);
1713 	newsize = round_page(req_newsize);
1714 	oldaddr = req_oldaddr;
1715 #if KASAN_CLASSIC
1716 	if (flags & KMR_KASAN_GUARD) {
1717 		flags   |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1718 		oldaddr -= PAGE_SIZE;
1719 		delta    = ptoa(2);
1720 		oldsize += delta;
1721 		newsize += delta;
1722 	}
1723 #endif /* KASAN_CLASSIC */
1724 #if CONFIG_KERNEL_TAGGING
1725 	if (flags & KMR_TAG) {
1726 		vm_memtag_verify_tag(req_oldaddr);
1727 		oldaddr = vm_memtag_canonicalize_address(req_oldaddr);
1728 	}
1729 #endif /* CONFIG_KERNEL_TAGGING */
1730 
1731 #if !KASAN
1732 	/*
1733 	 *	If not on a KASAN variant and no difference in requested size,
1734 	 *	just return.
1735 	 *
1736 	 *	Otherwise we want to validate the size and re-tag for KASAN_TBI.
1737 	 */
1738 	if (oldsize == newsize) {
1739 		kmr.kmr_address = req_oldaddr;
1740 		return kmr;
1741 	}
1742 #endif /* !KASAN */
1743 
1744 	/*
1745 	 *	If we're growing the allocation,
1746 	 *	then reserve the pages we'll need,
1747 	 *	and find a spot for its new location.
1748 	 */
1749 	if (oldsize < newsize) {
1750 #if DEBUG || DEVELOPMENT
1751 		VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1752 		    VM_KERN_REQUEST, DBG_FUNC_START,
1753 		    newsize - oldsize, 0, 0, 0);
1754 #endif /* DEBUG || DEVELOPMENT */
1755 		kmr.kmr_return = vm_page_alloc_list(atop(newsize - oldsize),
1756 		    (kma_flags_t)flags, &page_list);
1757 		if (kmr.kmr_return == KERN_SUCCESS) {
1758 			kmem_apply_security_policy(map, (kma_flags_t)flags, guard,
1759 			    newsize, 0, &vmk_flags, true);
1760 			kmr.kmr_return = vm_map_find_space(map, 0, newsize, 0,
1761 			    vmk_flags, &newentry);
1762 		}
1763 		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
1764 			if (flags & KMR_REALLOCF) {
1765 				kmem_free_guard(map, req_oldaddr, req_oldsize,
1766 				    KMF_NONE, guard);
1767 			}
1768 			if (page_list) {
1769 				vm_page_free_list(page_list, FALSE);
1770 			}
1771 #if DEBUG || DEVELOPMENT
1772 			VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1773 			    VM_KERN_REQUEST, DBG_FUNC_END,
1774 			    0, 0, 0, 0);
1775 #endif /* DEBUG || DEVELOPMENT */
1776 			return kmr;
1777 		}
1778 
1779 		/* map is locked */
1780 	} else {
1781 		vm_map_lock(map);
1782 	}
1783 
1784 
1785 	/*
1786 	 *	Locate the entry:
1787 	 *	- wait for it to quiesce.
1788 	 *	- wait for it to quiesce,
1789 	 *	- validate its guard,
1790 	 *	- learn its correct tag.
1791 again:
1792 	if (!vm_map_lookup_entry(map, oldaddr, &oldentry)) {
1793 		__kmem_entry_not_found_panic(map, req_oldaddr);
1794 	}
1795 	if ((flags & KMR_KOBJECT) && oldentry->in_transition) {
1796 		oldentry->needs_wakeup = true;
1797 		vm_map_entry_wait(map, THREAD_UNINT);
1798 		goto again;
1799 	}
1800 	kmem_entry_validate_guard(map, oldentry, oldaddr, oldsize, guard);
1801 	if (!__kmem_entry_validate_object(oldentry, ANYF(flags))) {
1802 		__kmem_entry_validate_object_panic(map, oldentry, ANYF(flags));
1803 	}
1804 	/*
1805 	 *	TODO: We should validate for non-atomic entries that the range
1806 	 *	      we are acting on is what we expect here.
1807 	 */
1808 #if KASAN
1809 	if (__kmem_entry_orig_size(oldentry) != req_oldsize) {
1810 		__kmem_realloc_invalid_object_size_panic(map,
1811 		    req_oldaddr, req_oldsize + delta, oldentry);
1812 	}
1813 
1814 	if (oldsize == newsize) {
1815 		kmr.kmr_address = req_oldaddr;
1816 		if (oldentry->vme_kernel_object) {
1817 			oldentry->vme_object_or_delta = delta +
1818 			    (-req_newsize & PAGE_MASK);
1819 		} else {
1820 			object = VME_OBJECT(oldentry);
1821 			vm_object_lock(object);
1822 			vm_object_set_size(object, newsize, req_newsize);
1823 			vm_object_unlock(object);
1824 		}
1825 		vm_map_unlock(map);
1826 
1827 #if KASAN_CLASSIC
1828 		if (flags & KMR_KASAN_GUARD) {
1829 			kasan_alloc_large(kmr.kmr_address, req_newsize);
1830 		}
1831 #endif /* KASAN_CLASSIC */
1832 #if KASAN_TBI
1833 		if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
1834 			kmr.kmr_address = vm_memtag_assign_tag(kmr.kmr_address, req_newsize);
1835 			vm_memtag_set_tag(kmr.kmr_address, req_newsize);
1836 			kasan_tbi_retag_unused_space(kmr.kmr_address, newsize, req_newsize);
1837 		}
1838 #endif /* KASAN_TBI */
1839 		return kmr;
1840 	}
1841 #endif /* KASAN */
1842 
1843 	guard.kmg_tag = VME_ALIAS(oldentry);
1844 
1845 	if (newsize < oldsize) {
1846 		return kmem_realloc_shrink_guard(map, req_oldaddr,
1847 		           req_oldsize, req_newsize, flags, guard, oldentry);
1848 	}
1849 
1850 
1851 	/*
1852 	 *	We are growing the entry
1853 	 *
1854 	 *	For regular objects we use the object `vo_size` updates
1855 	 *	as a guarantee that no two kmem_realloc() calls can happen
1856 	 *	concurrently (by doing it before the map is unlocked).
1857 	 *
1858 	 *	For the kernel object, prevent the entry from being
1859 	 *	reallocated or changed by marking it "in_transition".
1860 	 */
1861 
1862 	object = VME_OBJECT(oldentry);
1863 	vm_object_lock(object);
1864 	vm_object_reference_locked(object);
1865 
1866 	newaddr = newentry->vme_start;
1867 	newoffs = oldsize;
1868 
1869 	VME_OBJECT_SET(newentry, object, guard.kmg_atomic, guard.kmg_context);
1870 	VME_ALIAS_SET(newentry, guard.kmg_tag);
1871 	if (flags & KMR_KOBJECT) {
1872 		oldentry->in_transition = true;
1873 		VME_OFFSET_SET(newentry, newaddr);
1874 		newentry->wired_count = 1;
1875 		vme_btref_consider_and_set(newentry, __builtin_frame_address(0));
1876 		newoffs = newaddr + oldsize;
1877 	} else {
1878 		if (object->pager_created || object->pager) {
1879 			/*
1880 			 * We can't "realloc/grow" the pager, so pageable
1881 			 * allocations should not go through this path.
1882 			 */
1883 			__kmem_realloc_invalid_pager_panic(map,
1884 			    req_oldaddr, req_oldsize + delta, oldentry);
1885 		}
1886 		if (object->vo_size != oldsize) {
1887 			__kmem_realloc_invalid_object_size_panic(map,
1888 			    req_oldaddr, req_oldsize + delta, oldentry);
1889 		}
1890 		vm_object_set_size(object, newsize, req_newsize);
1891 	}
1892 
1893 	last_timestamp = map->timestamp;
1894 	vm_map_unlock(map);
1895 
1896 
1897 	/*
1898 	 *	Now proceed with the population of pages.
1899 	 *
1900 	 *	Kernel objects can use the kmem population helpers.
1901 	 *
1902 	 *	Regular objects will insert pages manually,
1903 	 *	then wire the memory into the new range.
1904 	 */
1905 
1906 	vm_size_t guard_right_size = __kmem_guard_right(ANYF(flags));
1907 
1908 	if (flags & KMR_KOBJECT) {
1909 		pmap_mapping_type_t mapping_type = __kmem_mapping_type(ANYF(flags));
1910 
1911 		pmap_protect(kernel_pmap,
1912 		    oldaddr, oldaddr + oldsize - guard_right_size,
1913 		    VM_PROT_NONE);
1914 
1915 		for (vm_object_offset_t offset = 0;
1916 		    offset < oldsize - guard_right_size;
1917 		    offset += PAGE_SIZE_64) {
1918 			vm_page_t mem;
1919 
1920 			mem = vm_page_lookup(object, oldaddr + offset);
1921 			if (mem == VM_PAGE_NULL) {
1922 				continue;
1923 			}
1924 
1925 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
1926 
1927 			mem->vmp_busy = true;
1928 			vm_page_remove(mem, true);
1929 			vm_page_insert_wired(mem, object, newaddr + offset,
1930 			    guard.kmg_tag);
1931 			mem->vmp_busy = false;
1932 
1933 			kernel_memory_populate_pmap_enter(object, newaddr,
1934 			    offset, mem, VM_PROT_DEFAULT, 0, mapping_type);
1935 		}
1936 
1937 		kernel_memory_populate_object_and_unlock(object,
1938 		    newaddr + oldsize - guard_right_size,
1939 		    newoffs - guard_right_size,
1940 		    newsize - oldsize,
1941 		    page_list, (kma_flags_t)flags,
1942 		    guard.kmg_tag, VM_PROT_DEFAULT, mapping_type);
1943 	} else {
1944 		vm_page_t guard_right = VM_PAGE_NULL;
1945 
1946 		/*
1947 		 *	Note: we are borrowing the new entry reference
1948 		 *	on the object for the duration of this code,
1949 		 *	which works because we keep the object locked
1950 		 *	throughout.
1951 		 */
1952 		if ((flags & KMR_GUARD_LAST) && !map->never_faults) {
1953 			guard_right = vm_page_lookup(object, oldsize - PAGE_SIZE);
1954 			assert(guard_right->vmp_fictitious);
1955 			guard_right->vmp_busy = true;
1956 			vm_page_remove(guard_right, true);
1957 		}
1958 
1959 		if (flags & KMR_FREEOLD) {
1960 			/*
1961 			 * Freeing the old mapping will make
1962 			 * the old pages become pageable until
1963 			 * the new mapping makes them wired again.
1964 			 * Let's take an extra "wire_count" to
1965 			 * prevent any accidental "page out".
1966 			 * We'll have to undo that after wiring
1967 			 * the new mapping.
1968 			 */
1969 			vm_object_reference_locked(object); /* keep object alive */
1970 			for (vm_object_offset_t offset = 0;
1971 			    offset < oldsize - guard_right_size;
1972 			    offset += PAGE_SIZE_64) {
1973 				vm_page_t mem;
1974 
1975 				mem = vm_page_lookup(object, offset);
1976 				assert(mem != VM_PAGE_NULL);
1977 				assertf(!VM_PAGE_PAGEABLE(mem),
1978 				    "mem %p qstate %d",
1979 				    mem, mem->vmp_q_state);
1980 				if (VM_PAGE_GET_PHYS_PAGE(mem) == vm_page_guard_addr) {
1981 					/* guard pages are not wired */
1982 				} else {
1983 					assertf(VM_PAGE_WIRED(mem),
1984 					    "mem %p qstate %d wirecount %d",
1985 					    mem,
1986 					    mem->vmp_q_state,
1987 					    mem->vmp_wire_count);
1988 					assertf(mem->vmp_wire_count >= 1,
1989 					    "mem %p wirecount %d",
1990 					    mem, mem->vmp_wire_count);
1991 					mem->vmp_wire_count++;
1992 				}
1993 			}
1994 		}
1995 
1996 		for (vm_object_offset_t offset = oldsize - guard_right_size;
1997 		    offset < newsize - guard_right_size;
1998 		    offset += PAGE_SIZE_64) {
1999 			vm_page_t mem = page_list;
2000 
2001 			page_list = mem->vmp_snext;
2002 			mem->vmp_snext = VM_PAGE_NULL;
2003 			assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
2004 			assert(!VM_PAGE_PAGEABLE(mem));
2005 
2006 			vm_page_insert(mem, object, offset);
2007 			mem->vmp_busy = false;
2008 		}
2009 
2010 		if (guard_right) {
2011 			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
2012 			guard_right->vmp_busy = false;
2013 		}
2014 
2015 		vm_object_unlock(object);
2016 	}
2017 
2018 	/*
2019 	 *	Mark the entry as idle again,
2020 	 *	and honor KMR_FREEOLD if needed.
2021 	 */
2022 
2023 	vm_map_lock(map);
2024 	if (last_timestamp + 1 != map->timestamp &&
2025 	    !vm_map_lookup_entry(map, oldaddr, &oldentry)) {
2026 		__kmem_entry_not_found_panic(map, req_oldaddr);
2027 	}
2028 
2029 	if (flags & KMR_KOBJECT) {
2030 		assert(oldentry->in_transition);
2031 		oldentry->in_transition = false;
2032 		if (oldentry->needs_wakeup) {
2033 			needs_wakeup = true;
2034 			oldentry->needs_wakeup = false;
2035 		}
2036 	}
2037 
2038 	if (flags & KMR_FREEOLD) {
2039 		vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2040 
2041 #if KASAN_CLASSIC
2042 		if (flags & KMR_KASAN_GUARD) {
2043 			kasan_poison_range(oldaddr, oldsize, ASAN_VALID);
2044 		}
2045 #endif
2046 #if KASAN_TBI
2047 		if (flags & KMR_TAG) {
2048 			kasan_tbi_mark_free_space(req_oldaddr, oldsize);
2049 		}
2050 #endif /* KASAN_TBI */
2051 		if (flags & KMR_GUARD_LAST) {
2052 			vmr_flags |= VM_MAP_REMOVE_NOKUNWIRE_LAST;
2053 		}
2054 		(void)vm_map_remove_and_unlock(map,
2055 		    oldaddr, oldaddr + oldsize,
2056 		    vmr_flags, guard);
2057 	} else {
2058 		vm_map_unlock(map);
2059 	}
2060 
2061 	if ((flags & KMR_KOBJECT) == 0) {
2062 		kern_return_t kr;
2063 		/*
2064 		 * This must happen _after_ we do the KMR_FREEOLD,
2065 		 * because wiring the pages will call into the pmap,
2066 		 * and if the pages are typed XNU_KERNEL_RESTRICTED,
2067 		 * this would cause a second mapping of the page and panic.
2068 		 */
2069 		kr = vm_map_wire_kernel(map, newaddr, newaddr + newsize,
2070 		    VM_PROT_DEFAULT, guard.kmg_tag, FALSE);
2071 		assert(kr == KERN_SUCCESS);
2072 
2073 		if (flags & KMR_FREEOLD) {
2074 			/*
2075 			 * Undo the extra "wiring" we made above
2076 			 * and release the extra reference we took
2077 			 * on the object.
2078 			 */
2079 			vm_object_lock(object);
2080 			for (vm_object_offset_t offset = 0;
2081 			    offset < oldsize - guard_right_size;
2082 			    offset += PAGE_SIZE_64) {
2083 				vm_page_t mem;
2084 
2085 				mem = vm_page_lookup(object, offset);
2086 				assert(mem != VM_PAGE_NULL);
2087 				assertf(!VM_PAGE_PAGEABLE(mem),
2088 				    "mem %p qstate %d",
2089 				    mem, mem->vmp_q_state);
2090 				if (VM_PAGE_GET_PHYS_PAGE(mem) == vm_page_guard_addr) {
2091 					/* guard pages are not wired */
2092 				} else {
2093 					assertf(VM_PAGE_WIRED(mem),
2094 					    "mem %p qstate %d wirecount %d",
2095 					    mem,
2096 					    mem->vmp_q_state,
2097 					    mem->vmp_wire_count);
2098 					assertf(mem->vmp_wire_count >= 2,
2099 					    "mem %p wirecount %d",
2100 					    mem, mem->vmp_wire_count);
2101 					mem->vmp_wire_count--;
2102 					assert(VM_PAGE_WIRED(mem));
2103 					assert(mem->vmp_wire_count >= 1);
2104 				}
2105 			}
2106 			vm_object_unlock(object);
2107 			vm_object_deallocate(object); /* release extra ref */
2108 		}
2109 	}
2110 
2111 	if (needs_wakeup) {
2112 		vm_map_entry_wakeup(map);
2113 	}
2114 
2115 #if DEBUG || DEVELOPMENT
2116 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
2117 	    atop(newsize - oldsize), 0, 0, 0);
2118 #endif /* DEBUG || DEVELOPMENT */
2119 	kmr.kmr_address = newaddr;
2120 
2121 #if KASAN
2122 	kasan_notify_address(kmr.kmr_address, newsize);
2123 #endif /* KASAN */
2124 #if KASAN_CLASSIC
2125 	if (flags & KMR_KASAN_GUARD) {
2126 		kmr.kmr_address += PAGE_SIZE;
2127 		kasan_alloc_large(kmr.kmr_address, req_newsize);
2128 	}
2129 #endif /* KASAN_CLASSIC */
2130 #if KASAN_TBI
2131 	if (flags & KMR_TAG) {
2132 		kmr.kmr_address = vm_memtag_assign_tag(kmr.kmr_address, req_newsize);
2133 		vm_memtag_set_tag(kmr.kmr_address, req_newsize);
2134 		kasan_tbi_retag_unused_space(kmr.kmr_address, newsize, req_newsize);
2135 	}
2136 #endif /* KASAN_TBI */
2137 
2138 	return kmr;
2139 }
2140 
2141 
2142 #pragma mark free
2143 
2144 #if KASAN
2145 
2146 __abortlike
2147 static void
2148 __kmem_free_invalid_object_size_panic(
2149 	vm_map_t                map,
2150 	vm_address_t            address,
2151 	vm_size_t               size,
2152 	vm_map_entry_t          entry)
2153 {
2154 	vm_object_t object  = VME_OBJECT(entry);
2155 	vm_size_t   objsize = __kmem_entry_orig_size(entry);
2156 
2157 	panic("kmem_free(map=%p, addr=%p, size=%zd, entry=%p): "
2158 	    "object %p has unexpected size %ld",
2159 	    map, (void *)address, (size_t)size, entry, object, objsize);
2160 }
2161 
2162 #endif /* KASAN */
2163 
2164 vm_size_t
2165 kmem_free_guard(
2166 	vm_map_t        map,
2167 	vm_offset_t     req_addr,
2168 	vm_size_t       req_size,
2169 	kmf_flags_t     flags,
2170 	kmem_guard_t    guard)
2171 {
2172 	vmr_flags_t     vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2173 	vm_address_t    addr      = req_addr;
2174 	vm_offset_t     delta     = 0;
2175 	vm_size_t       size;
2176 #if KASAN
2177 	vm_map_entry_t  entry;
2178 #endif /* KASAN */
2179 
2180 	assert(map->pmap == kernel_pmap);
2181 
2182 #if KASAN_CLASSIC
2183 	if (flags & KMF_KASAN_GUARD) {
2184 		addr  -= PAGE_SIZE;
2185 		delta  = ptoa(2);
2186 	}
2187 #endif /* KASAN_CLASSIC */
2188 #if CONFIG_KERNEL_TAGGING
2189 	if (flags & KMF_TAG) {
2190 		vm_memtag_verify_tag(req_addr);
2191 		addr = vm_memtag_canonicalize_address(req_addr);
2192 	}
2193 #endif /* CONFIG_KERNEL_TAGGING */
2194 
2195 	if (flags & KMF_GUESS_SIZE) {
2196 		vmr_flags |= VM_MAP_REMOVE_GUESS_SIZE;
2197 		size = PAGE_SIZE;
2198 	} else if (req_size == 0) {
2199 		__kmem_invalid_size_panic(map, req_size, flags);
2200 	} else {
2201 		size = round_page(req_size) + delta;
2202 	}
2203 
2204 	vm_map_lock(map);
2205 
2206 #if KASAN
2207 	if (!vm_map_lookup_entry(map, addr, &entry)) {
2208 		__kmem_entry_not_found_panic(map, req_addr);
2209 	}
2210 	if (flags & KMF_GUESS_SIZE) {
2211 		vmr_flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
2212 		req_size = __kmem_entry_orig_size(entry);
2213 		size = round_page(req_size + delta);
2214 	} else if (guard.kmg_atomic && entry->vme_kernel_object &&
2215 	    __kmem_entry_orig_size(entry) != req_size) {
2216 		/*
2217 		 * We can't make a strict check for regular
2218 		 * VM objects because it could be:
2219 		 *
2220 		 * - the kmem_guard_free() of a kmem_realloc_guard() without
2221 		 *   KMR_FREEOLD, and in that case the object size won't match.
2222 		 *
2223 		 * - a submap, in which case there is no "orig size".
2224 		 */
2225 		__kmem_free_invalid_object_size_panic(map,
2226 		    req_addr, req_size + delta, entry);
2227 	}
2228 #endif /* KASAN */
2229 #if KASAN_CLASSIC
2230 	if (flags & KMF_KASAN_GUARD) {
2231 		kasan_poison_range(addr, size, ASAN_VALID);
2232 	}
2233 #endif
2234 #if KASAN_TBI
2235 	if (flags & KMF_TAG) {
2236 		kasan_tbi_mark_free_space(req_addr, size);
2237 	}
2238 #endif /* KASAN_TBI */
2239 
2240 	/*
2241 	 * vm_map_remove_and_unlock is called with VM_MAP_REMOVE_KUNWIRE, which
2242 	 * unwires the kernel mapping. The page won't be mapped any longer so
2243 	 * there is no extra step required for memory tagging to "clear"
2244 	 * it -- the page will be laundered later when it is reused.
2245 	 */
2246 	return vm_map_remove_and_unlock(map, addr, addr + size,
2247 	           vmr_flags, guard).kmr_size - delta;
2248 }
2249 
2250 __exported void
2251 kmem_free_external(
2252 	vm_map_t        map,
2253 	vm_offset_t     addr,
2254 	vm_size_t       size);
2255 void
2256 kmem_free_external(
2257 	vm_map_t        map,
2258 	vm_offset_t     addr,
2259 	vm_size_t       size)
2260 {
2261 	if (size) {
2262 		kmem_free(map, trunc_page(addr), size);
2263 #if MACH_ASSERT
2264 	} else {
2265 		printf("kmem_free(map=%p, addr=%p) called with size=0, lr: %p\n",
2266 		    map, (void *)addr, __builtin_return_address(0));
2267 #endif
2268 	}
2269 }
2270 
2271 #pragma mark kmem metadata
2272 
2273 /*
2274  * Guard objects for kmem pointer allocation:
2275  *
2276  * Guard objects introduce size slabs to kmem pointer allocations that are
2277  * allocated in chunks of n * sizeclass. When an allocation of a specific
2278  * sizeclass is requested a random slot from [0, n) is returned.
2279  * Allocations are returned from that chunk until m slots are left. The
2280  * remaining m slots are referred to as guard objects. They don't get
2281  * allocated and the chunk is now considered full. When an allocation is
2282  * freed back to the chunk, one of the m + 1 free slots becomes available
2283  * for the next allocation of that sizeclass.
2284  *
2285  * Guard objects are intended to make exploitation of use-after-frees harder,
2286  * as allocations that are freed can no longer be reliably reallocated.
2287  * They also make exploitation of OOBs harder, as overflowing out of an
2288  * allocation can no longer be done safely even with sufficient spraying.
2289  */
2290 
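/*
 * For illustration (hypothetical numbers; the actual n and m come from the
 * sizeclass tables below, with m == KMEM_NUM_GUARDS): suppose a chunk holds
 * n = 8 slots of one sizeclass with m = 2. The first allocation picks a
 * random free slot, say slot 5; the next picks uniformly among the
 * remaining 7, and so on. Once only 2 free slots remain, they are kept as
 * guard objects and the chunk is treated as full. Freeing one allocation
 * brings the free count to m + 1 = 3, and the next allocation of that
 * sizeclass picks uniformly among those 3 slots -- the freed slot comes
 * back with probability 1/3, not deterministically.
 */
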
2291 #define KMEM_META_PRIMARY    UINT8_MAX
2292 #define KMEM_META_START     (UINT8_MAX - 1)
2293 #define KMEM_META_FREE      (UINT8_MAX - 2)
2294 #if __ARM_16K_PG__
2295 #define KMEM_MIN_SIZE        PAGE_SIZE
2296 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 16)
2297 #else /* __ARM_16K_PG__ */
2298 /*
2299  * PAGE_SIZE isn't a compile-time constant on some arm64 devices. Those
2300  * devices use 4k page size when their RAM is <= 1GB and 16k otherwise.
2301  * Therefore populate sizeclasses from 4k for those devices.
2302  */
2303 #define KMEM_MIN_SIZE       (4 * 1024)
2304 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 32)
2305 #endif /* __ARM_16K_PG__ */
2306 #define KMEM_MAX_SIZE       (32ULL << 20)
2307 #define KMEM_START_IDX      (kmem_log2down(KMEM_MIN_SIZE))
2308 #define KMEM_LAST_IDX       (kmem_log2down(KMEM_MAX_SIZE))
2309 #define KMEM_NUM_SIZECLASS  (KMEM_LAST_IDX - KMEM_START_IDX + 1)
2310 #define KMEM_FRONTS         (KMEM_RANGE_ID_NUM_PTR * 2)
2311 #define KMEM_NUM_GUARDS      2
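
/*
 * Worked example of the constants above, assuming the 4k KMEM_MIN_SIZE
 * configuration: KMEM_START_IDX = log2(4096) = 12 and
 * KMEM_LAST_IDX = log2(32 << 20) = 25, so KMEM_NUM_SIZECLASS is
 * 25 - 12 + 1 = 14 sizeclasses (4k, 8k, ..., 32M). On __ARM_16K_PG__
 * targets KMEM_START_IDX is 14 instead, for 12 sizeclasses.
 */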
2312 
2313 struct kmem_page_meta {
2314 	union {
2315 		/*
2316 		 * On primary allocated chunk with KMEM_META_PRIMARY marker
2317 		 */
2318 		uint32_t km_bitmap;
2319 		/*
2320 		 * On start and end of free chunk with KMEM_META_FREE marker
2321 		 */
2322 		uint32_t km_free_chunks;
2323 	};
2324 	/*
2325 	 * KMEM_META_PRIMARY: Start meta of allocated chunk
2326 	 * KMEM_META_FREE   : Start and end meta of free chunk
2327 	 * KMEM_META_START  : Meta region start and end
2328 	 */
2329 	uint8_t  km_page_marker;
2330 	uint8_t  km_sizeclass;
2331 	union {
2332 		/*
2333 		 * On primary allocated chunk with KMEM_META_PRIMARY marker
2334 		 */
2335 		uint16_t km_chunk_len;
2336 		/*
2337 		 * On secondary allocated chunks
2338 		 */
2339 		uint16_t km_page_idx;
2340 	};
2341 	LIST_ENTRY(kmem_page_meta) km_link;
2342 } kmem_page_meta_t;
2343 
2344 typedef LIST_HEAD(kmem_list_head, kmem_page_meta) kmem_list_head_t;
2345 struct kmem_sizeclass {
2346 	vm_map_size_t                   ks_size;
2347 	uint32_t                        ks_num_chunk;
2348 	uint32_t                        ks_num_elem;
2349 	crypto_random_ctx_t __zpercpu   ks_rng_ctx;
2350 	kmem_list_head_t                ks_allfree_head[KMEM_FRONTS];
2351 	kmem_list_head_t                ks_partial_head[KMEM_FRONTS];
2352 	kmem_list_head_t                ks_full_head[KMEM_FRONTS];
2353 };
2354 
2355 static struct kmem_sizeclass kmem_size_array[KMEM_NUM_SIZECLASS];
2356 
2357 /*
2358  * Locks to synchronize metadata population
2359  */
2360 static LCK_GRP_DECLARE(kmem_locks_grp, "kmem_locks");
2361 static LCK_MTX_DECLARE(kmem_meta_region_lck, &kmem_locks_grp);
2362 #define kmem_meta_lock()   lck_mtx_lock(&kmem_meta_region_lck)
2363 #define kmem_meta_unlock() lck_mtx_unlock(&kmem_meta_region_lck)
2364 
2365 static SECURITY_READ_ONLY_LATE(struct mach_vm_range)
2366 kmem_meta_range[KMEM_RANGE_ID_NUM_PTR + 1];
2367 static SECURITY_READ_ONLY_LATE(struct kmem_page_meta *)
2368 kmem_meta_base[KMEM_RANGE_ID_NUM_PTR + 1];
2369 /*
2370  * Keeps track of metadata high water mark for each front
2371  */
2372 static struct kmem_page_meta *kmem_meta_hwm[KMEM_FRONTS];
2373 static SECURITY_READ_ONLY_LATE(vm_map_t)
2374 kmem_meta_map[KMEM_RANGE_ID_NUM_PTR + 1];
2375 static vm_map_size_t kmem_meta_size;
2376 
2377 static uint32_t
2378 kmem_get_front(
2379 	kmem_range_id_t         range_id,
2380 	bool                    from_right)
2381 {
2382 	assert((range_id >= KMEM_RANGE_ID_FIRST) &&
2383 	    (range_id <= KMEM_RANGE_ID_NUM_PTR));
2384 	return (range_id - KMEM_RANGE_ID_FIRST) * 2 + from_right;
2385 }
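
/*
 * E.g. the first pointer range owns fronts 0 (left) and 1 (right), the
 * second owns fronts 2 and 3, and so on -- KMEM_FRONTS fronts in total.
 */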
2386 
2387 static inline uint32_t
2388 kmem_slot_idx_to_bit(
2389 	uint32_t                slot_idx,
2390 	uint32_t                size_idx __unused)
2391 {
2392 	assert(slot_idx < kmem_size_array[size_idx].ks_num_elem);
2393 	return 1ull << slot_idx;
2394 }
2395 
2396 static uint32_t
2397 kmem_get_idx_from_size(vm_map_size_t size)
2398 {
2399 	assert(size >= KMEM_MIN_SIZE && size <= KMEM_MAX_SIZE);
2400 	return kmem_log2down(size - 1) - KMEM_START_IDX + 1;
2401 }
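
/*
 * For example, with KMEM_START_IDX = 12 (the 4k configuration): a size of
 * exactly 4096 maps to kmem_log2down(4095) - 12 + 1 = 0, while sizes in
 * 4097..8192 map to index 1 -- requests round up to the next
 * power-of-two sizeclass.
 */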
2402 
2403 __abortlike
2404 static void
2405 kmem_invalid_size_idx(uint32_t idx)
2406 {
2407 	panic("Invalid sizeclass idx %u", idx);
2408 }
2409 
2410 static vm_map_size_t
2411 kmem_get_size_from_idx(uint32_t idx)
2412 {
2413 	if (__improbable(idx >= KMEM_NUM_SIZECLASS)) {
2414 		kmem_invalid_size_idx(idx);
2415 	}
2416 	return 1ul << (idx + KMEM_START_IDX);
2417 }
2418 
2419 static inline uint16_t
2420 kmem_get_page_idx(struct kmem_page_meta *meta)
2421 {
2422 	uint8_t page_marker = meta->km_page_marker;
2423 
2424 	return (page_marker == KMEM_META_PRIMARY) ? 0 : meta->km_page_idx;
2425 }
2426 
2427 __abortlike
2428 static void
2429 kmem_invalid_chunk_len(struct kmem_page_meta *meta)
2430 {
2431 	panic("Reading free chunks for meta %p where marker != KMEM_META_PRIMARY",
2432 	    meta);
2433 }
2434 
2435 static inline uint16_t
2436 kmem_get_chunk_len(struct kmem_page_meta *meta)
2437 {
2438 	if (__improbable(meta->km_page_marker != KMEM_META_PRIMARY)) {
2439 		kmem_invalid_chunk_len(meta);
2440 	}
2441 
2442 	return meta->km_chunk_len;
2443 }
2444 
2445 __abortlike
2446 static void
2447 kmem_invalid_free_chunk_len(struct kmem_page_meta *meta)
2448 {
2449 	panic("Reading free chunks for meta %p where marker != KMEM_META_FREE",
2450 	    meta);
2451 }
2452 
2453 static inline uint32_t
2454 kmem_get_free_chunk_len(struct kmem_page_meta *meta)
2455 {
2456 	if (__improbable(meta->km_page_marker != KMEM_META_FREE)) {
2457 		kmem_invalid_free_chunk_len(meta);
2458 	}
2459 
2460 	return meta->km_free_chunks;
2461 }
2462 
2463 /*
2464  * Return the metadata corresponding to the specified address
2465  */
2466 static struct kmem_page_meta *
2467 kmem_addr_to_meta(
2468 	vm_map_offset_t         addr,
2469 	vm_map_range_id_t       range_id,
2470 	vm_map_offset_t        *range_start,
2471 	uint64_t               *meta_idx)
2472 {
2473 	struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
2474 
2475 	*range_start = kmem_ranges[range_id].min_address;
2476 	*meta_idx = (addr - *range_start) / KMEM_CHUNK_SIZE_MIN;
2477 	return &meta_base[*meta_idx];
2478 }
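
/*
 * For example, with KMEM_CHUNK_SIZE_MIN = 128k (the 4k configuration), an
 * address 1MB past the start of its range yields *meta_idx = 1MB / 128k = 8.
 */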
2479 
2480 /*
2481  * Return the metadata start of the chunk that the address belongs to
2482  */
2483 static struct kmem_page_meta *
2484 kmem_addr_to_meta_start(
2485 	vm_address_t            addr,
2486 	vm_map_range_id_t       range_id,
2487 	vm_map_offset_t        *chunk_start)
2488 {
2489 	vm_map_offset_t range_start;
2490 	uint64_t meta_idx;
2491 	struct kmem_page_meta *meta;
2492 
2493 	meta = kmem_addr_to_meta(addr, range_id, &range_start, &meta_idx);
2494 	meta_idx -= kmem_get_page_idx(meta);
2495 	meta -= kmem_get_page_idx(meta);
2496 	assert(meta->km_page_marker == KMEM_META_PRIMARY);
2497 	*chunk_start = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN);
2498 	return meta;
2499 }
2500 
2501 __startup_func
2502 static void
2503 kmem_init_meta_front(
2504 	struct kmem_page_meta  *meta,
2505 	kmem_range_id_t         range_id,
2506 	bool                    from_right)
2507 {
2508 	kernel_memory_populate(trunc_page((vm_map_offset_t) meta), PAGE_SIZE,
2509 	    KMA_KOBJECT | KMA_ZERO | KMA_NOFAIL, VM_KERN_MEMORY_OSFMK);
2510 	meta->km_page_marker = KMEM_META_START;
2511 	if (!from_right) {
2512 		meta++;
2513 		kmem_meta_base[range_id] = meta;
2514 	}
2515 	kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta;
2516 }
2517 
2518 __startup_func
2519 static void
2520 kmem_metadata_init(void)
2521 {
2522 	for (kmem_range_id_t i = KMEM_RANGE_ID_FIRST; i <= kmem_ptr_ranges; i++) {
2523 		vm_map_offset_t addr = kmem_meta_range[i].min_address;
2524 		struct kmem_page_meta *meta;
2525 		uint64_t meta_idx;
2526 
2527 		vm_map_will_allocate_early_map(&kmem_meta_map[i]);
2528 		kmem_meta_map[i] = kmem_suballoc(kernel_map, &addr, kmem_meta_size,
2529 		    VM_MAP_CREATE_NEVER_FAULTS | VM_MAP_CREATE_DISABLE_HOLELIST,
2530 		    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_PERMANENT | KMS_NOFAIL,
2531 		    VM_KERN_MEMORY_OSFMK).kmr_submap;
2532 
2533 		kmem_meta_range[i].min_address = addr;
2534 		kmem_meta_range[i].max_address = addr + kmem_meta_size;
2535 
2536 		meta = (struct kmem_page_meta *) kmem_meta_range[i].min_address;
2537 		kmem_init_meta_front(meta, i, 0);
2538 
2539 		meta = kmem_addr_to_meta(kmem_ranges[i].max_address, i, &addr,
2540 		    &meta_idx);
2541 		kmem_init_meta_front(meta, i, 1);
2542 	}
2543 }
2544 
2545 __startup_func
2546 static void
2547 kmem_init_front_head(
2548 	struct kmem_sizeclass  *ks,
2549 	uint32_t                front)
2550 {
2551 	LIST_INIT(&ks->ks_allfree_head[front]);
2552 	LIST_INIT(&ks->ks_partial_head[front]);
2553 	LIST_INIT(&ks->ks_full_head[front]);
2554 }
2555 
2556 __startup_func
2557 static void
2558 kmem_sizeclass_init(void)
2559 {
2560 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2561 		struct kmem_sizeclass *ks = &kmem_size_array[i];
2562 		kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
2563 
2564 		ks->ks_size = kmem_get_size_from_idx(i);
2565 		ks->ks_num_chunk = roundup(8 * ks->ks_size, KMEM_CHUNK_SIZE_MIN) /
2566 		    KMEM_CHUNK_SIZE_MIN;
2567 		ks->ks_num_elem = (ks->ks_num_chunk * KMEM_CHUNK_SIZE_MIN) / ks->ks_size;
2568 		assert(ks->ks_num_elem <=
2569 		    (sizeof(((struct kmem_page_meta *)0)->km_bitmap) * 8));
2570 		for (; range_id <= KMEM_RANGE_ID_NUM_PTR; range_id++) {
2571 			kmem_init_front_head(ks, kmem_get_front(range_id, 0));
2572 			kmem_init_front_head(ks, kmem_get_front(range_id, 1));
2573 		}
2574 	}
2575 }
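
/*
 * A sketch of the resulting geometry, assuming KMEM_CHUNK_SIZE_MIN = 128k
 * (the 4k configuration): the 4k sizeclass gets
 * roundup(8 * 4k, 128k) / 128k = 1 chunk of 128k / 4k = 32 elements, while
 * the 1M sizeclass gets 8M / 128k = 64 chunks of 8M / 1M = 8 elements.
 * Either way ks_num_elem stays within the 32 bits of km_bitmap, which the
 * assert above enforces.
 */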
2576 
2577 /*
2578  * This is done during EARLY_BOOT as it needs the corecrypto module to be
2579  * set up.
2580  */
2581 __startup_func
2582 static void
2583 kmem_crypto_init(void)
2584 {
2585 	vm_size_t ctx_size = crypto_random_kmem_ctx_size();
2586 
2587 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2588 		struct kmem_sizeclass *ks = &kmem_size_array[i];
2589 
2590 		ks->ks_rng_ctx = zalloc_percpu_permanent(ctx_size, ZALIGN_PTR);
2591 		zpercpu_foreach(ctx, ks->ks_rng_ctx) {
2592 			crypto_random_kmem_init(ctx);
2593 		}
2594 	}
2595 }
2596 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, kmem_crypto_init);
2597 
2598 __abortlike
2599 static void
2600 kmem_validate_slot_panic(
2601 	vm_map_offset_t         addr,
2602 	struct kmem_page_meta  *meta,
2603 	uint32_t                slot_idx,
2604 	uint32_t                size_idx)
2605 {
2606 	if (meta->km_page_marker != KMEM_META_PRIMARY) {
2607 		panic("Metadata (%p) for addr (%p) not primary", meta, (void *)addr);
2608 	}
2609 	if (meta->km_sizeclass != size_idx) {
2610 		panic("Metadata's (%p) sizeclass (%u != %u) changed during deletion",
2611 		    meta, meta->km_sizeclass, size_idx);
2612 	}
2613 	panic("Double free detected: Slot (%u) in meta (%p) for addr %p marked free",
2614 	    slot_idx, meta, (void *)addr);
2615 }
2616 
2617 __abortlike
2618 static void
2619 kmem_invalid_slot_for_addr(
2620 	mach_vm_range_t         slot,
2621 	vm_map_offset_t         start,
2622 	vm_map_offset_t         end)
2623 {
2624 	panic("Invalid kmem ptr slot [%p:%p] for allocation [%p:%p]",
2625 	    (void *)slot->min_address, (void *)slot->max_address,
2626 	    (void *)start, (void *)end);
2627 }
2628 
2629 void
2630 kmem_validate_slot(
2631 	vm_map_offset_t         addr,
2632 	struct kmem_page_meta  *meta,
2633 	uint32_t                size_idx,
2634 	uint32_t                slot_idx)
2635 {
2636 	if ((meta->km_page_marker != KMEM_META_PRIMARY) ||
2637 	    (meta->km_sizeclass != size_idx) ||
2638 	    ((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) != 0)) {
2639 		kmem_validate_slot_panic(addr, meta, slot_idx, size_idx);
2640 	}
2641 }
2642 
2643 static void
2644 kmem_validate_slot_initial(
2645 	mach_vm_range_t         slot,
2646 	vm_map_offset_t         start,
2647 	vm_map_offset_t         end,
2648 	struct kmem_page_meta  *meta,
2649 	uint32_t                size_idx,
2650 	uint32_t                slot_idx)
2651 {
2652 	if ((slot->min_address == 0) || (slot->max_address == 0) ||
2653 	    (start < slot->min_address) || (start >= slot->max_address) ||
2654 	    (end > slot->max_address)) {
2655 		kmem_invalid_slot_for_addr(slot, start, end);
2656 	}
2657 
2658 	kmem_validate_slot(start, meta, size_idx, slot_idx);
2659 }
2660 
2661 uint32_t
2662 kmem_addr_get_slot_idx(
2663 	vm_map_offset_t         start,
2664 	vm_map_offset_t         end,
2665 	vm_map_range_id_t       range_id,
2666 	struct kmem_page_meta **meta,
2667 	uint32_t               *size_idx,
2668 	mach_vm_range_t         slot)
2669 {
2670 	vm_map_offset_t chunk_start;
2671 	vm_map_size_t slot_size;
2672 	uint32_t slot_idx;
2673 
2674 	*meta = kmem_addr_to_meta_start(start, range_id, &chunk_start);
2675 	*size_idx = (*meta)->km_sizeclass;
2676 	slot_size = kmem_get_size_from_idx(*size_idx);
2677 	slot_idx = (start - chunk_start) / slot_size;
2678 	slot->min_address = chunk_start + slot_idx * slot_size;
2679 	slot->max_address = slot->min_address + slot_size;
2680 
2681 	kmem_validate_slot_initial(slot, start, end, *meta, *size_idx, slot_idx);
2682 
2683 	return slot_idx;
2684 }
2685 
2686 static bool
2687 kmem_populate_needed(vm_offset_t from, vm_offset_t to)
2688 {
2689 #if KASAN
2690 #pragma unused(from, to)
2691 	return true;
2692 #else
2693 	vm_offset_t page_addr = trunc_page(from);
2694 
2695 	for (; page_addr < to; page_addr += PAGE_SIZE) {
2696 		/*
2697 		 * This can race with another thread doing a populate on the same metadata
2698 		 * page, where we see an updated pmap but unmapped KASan shadow, causing a
2699 		 * fault in the shadow when we first access the metadata page. Avoid this
2700 		 * by always synchronizing on the kmem_meta_lock with KASan.
2701 		 */
2702 		if (!pmap_find_phys(kernel_pmap, page_addr)) {
2703 			return true;
2704 		}
2705 	}
2706 
2707 	return false;
2708 #endif /* !KASAN */
2709 }
2710 
2711 static void
2712 kmem_populate_meta_locked(vm_offset_t from, vm_offset_t to)
2713 {
2714 	vm_offset_t page_addr = trunc_page(from);
2715 
2716 	vm_map_unlock(kernel_map);
2717 
2718 	for (; page_addr < to; page_addr += PAGE_SIZE) {
2719 		for (;;) {
2720 			kern_return_t ret = KERN_SUCCESS;
2721 
2722 			/*
2723 			 * All updates to kmem metadata are done under the kmem_meta_lock
2724 			 */
2725 			kmem_meta_lock();
2726 			if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
2727 				ret = kernel_memory_populate(page_addr,
2728 				    PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
2729 				    VM_KERN_MEMORY_OSFMK);
2730 			}
2731 			kmem_meta_unlock();
2732 
2733 			if (ret == KERN_SUCCESS) {
2734 				break;
2735 			}
2736 
2737 			/*
2738 			 * We can't block waiting for pages under a global lock, as that
2739 			 * leads to bad system deadlocks (hence KMA_NOPAGEWAIT above), so
2740 			 * if the allocation failed, we need to do the VM_PAGE_WAIT()
2741 			 * outside of the lock.
2741 			 */
2742 			VM_PAGE_WAIT();
2743 		}
2744 	}
2745 
2746 	vm_map_lock(kernel_map);
2747 }
2748 
2749 __abortlike
2750 static void
2751 kmem_invalid_meta_panic(
2752 	struct kmem_page_meta  *meta,
2753 	uint32_t                slot_idx,
2754 	struct kmem_sizeclass   sizeclass)
2755 {
2756 	uint32_t size_idx = kmem_get_idx_from_size(sizeclass.ks_size);
2757 
2758 	if (slot_idx >= sizeclass.ks_num_elem) {
2759 		panic("Invalid slot idx %u [0:%u] for meta %p", slot_idx,
2760 		    sizeclass.ks_num_elem, meta);
2761 	}
2762 	if (meta->km_sizeclass != size_idx) {
2763 		panic("Invalid size_idx (%u != %u) in meta %p", size_idx,
2764 		    meta->km_sizeclass, meta);
2765 	}
2766 	panic("page_marker %u not primary in meta %p", meta->km_page_marker, meta);
2767 }
2768 
2769 __abortlike
2770 static void
2771 kmem_slot_has_entry_panic(
2772 	vm_map_entry_t          entry,
2773 	vm_map_offset_t         addr)
2774 {
2775 	panic("Entry (%p) already exists for addr (%p) being returned",
2776 	    entry, (void *)addr);
2777 }
2778 
2779 __abortlike
2780 static void
2781 kmem_slot_not_found(
2782 	struct kmem_page_meta  *meta,
2783 	uint32_t                slot_idx)
2784 {
2785 	panic("%uth free slot not found for meta %p bitmap %u", slot_idx, meta,
2786 	    meta->km_bitmap);
2787 }
2788 
2789 /*
2790  * Returns a 16-bit random number between 0 and
2791  * upper_limit (inclusive)
2792  */
2793 __startup_func
2794 uint16_t
2795 kmem_get_random16(
2796 	uint16_t                upper_limit)
2797 {
2798 	static uint64_t random_entropy;
2799 	assert(upper_limit < UINT16_MAX);
2800 	if (random_entropy == 0) {
2801 		random_entropy = early_random();
2802 	}
2803 	uint32_t result = random_entropy & UINT32_MAX;
2804 	random_entropy >>= 32;
2805 	return (uint16_t)(result % (upper_limit + 1));
2806 }
2807 
2808 static uint32_t
2809 kmem_get_nth_free_slot(
2810 	struct kmem_page_meta  *meta,
2811 	uint32_t                n,
2812 	uint32_t                bitmap)
2813 {
2814 	uint32_t zeros_seen = 0, ones_seen = 0;
2815 
2816 	while (bitmap) {
2817 		uint32_t count = __builtin_ctz(bitmap);
2818 
2819 		zeros_seen += count;
2820 		bitmap >>= count;
2821 		if (__probable(~bitmap)) {
2822 			count = __builtin_ctz(~bitmap);
2823 		} else {
2824 			count = 32;
2825 		}
2826 		if (count + ones_seen > n) {
2827 			return zeros_seen + n;
2828 		}
2829 		ones_seen += count;
2830 		bitmap >>= count;
2831 	}
2832 
2833 	kmem_slot_not_found(meta, n);
2834 }
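
/*
 * Worked example of the scan above: for bitmap 0b1101 (slots 0, 2 and 3
 * free) and n = 1, the first pass skips no zeros and counts the lone set
 * bit at slot 0; the second pass skips the zero at slot 1, finds
 * count + ones_seen = 3 > n, and returns zeros_seen + n = 2, i.e. the
 * second free slot counting from bit 0.
 */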
2835 
2836 
2837 static uint32_t
2838 kmem_get_next_slot(
2839 	struct kmem_page_meta  *meta,
2840 	struct kmem_sizeclass   sizeclass,
2841 	uint32_t                bitmap)
2842 {
2843 	uint32_t num_slots = __builtin_popcount(bitmap);
2844 	uint64_t slot_idx = 0;
2845 
2846 	assert(num_slots > 0);
2847 	if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
2848 		/*
2849 		 * Use early random prior to early boot as the ks_rng_ctx requires
2850 		 * the corecrypto module to be setup before it is initialized and
2851 		 * used.
2852 		 *
2853 		 * num_slots can't be 0 as we only take this path while free
2854 		 * slots remain.
2855 		 */
2856 		slot_idx = kmem_get_random16((uint16_t)num_slots - 1);
2857 	} else {
2858 		crypto_random_uniform(zpercpu_get(sizeclass.ks_rng_ctx), num_slots,
2859 		    &slot_idx);
2860 	}
2861 
2862 	return kmem_get_nth_free_slot(meta, slot_idx, bitmap);
2863 }
2864 
2865 /*
2866  * Returns an unallocated slot from the given metadata
2867  */
2868 static vm_map_offset_t
2869 kmem_get_addr_from_meta(
2870 	struct kmem_page_meta  *meta,
2871 	vm_map_range_id_t       range_id,
2872 	struct kmem_sizeclass   sizeclass,
2873 	vm_map_entry_t         *entry)
2874 {
2875 	vm_map_offset_t addr;
2876 	vm_map_size_t size = sizeclass.ks_size;
2877 	uint32_t size_idx = kmem_get_idx_from_size(size);
2878 	uint64_t meta_idx = meta - kmem_meta_base[range_id];
2879 	mach_vm_offset_t range_start = kmem_ranges[range_id].min_address;
2880 	uint32_t slot_bit;
2881 	uint32_t slot_idx = kmem_get_next_slot(meta, sizeclass, meta->km_bitmap);
2882 
2883 	if ((slot_idx >= sizeclass.ks_num_elem) ||
2884 	    (meta->km_sizeclass != size_idx) ||
2885 	    (meta->km_page_marker != KMEM_META_PRIMARY)) {
2886 		kmem_invalid_meta_panic(meta, slot_idx, sizeclass);
2887 	}
2888 
2889 	slot_bit = kmem_slot_idx_to_bit(slot_idx, size_idx);
2890 	meta->km_bitmap &= ~slot_bit;
2891 
2892 	addr = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN) + (slot_idx * size);
2893 	assert(kmem_range_contains_fully(range_id, addr, size));
2894 	if (vm_map_lookup_entry(kernel_map, addr, entry)) {
2895 		kmem_slot_has_entry_panic(*entry, addr);
2896 	}
2897 	if ((*entry != vm_map_to_entry(kernel_map)) &&
2898 	    ((*entry)->vme_next != vm_map_to_entry(kernel_map)) &&
2899 	    ((*entry)->vme_next->vme_start < (addr + size))) {
2900 		kmem_slot_has_entry_panic(*entry, addr);
2901 	}
2902 	return addr;
2903 }
2904 
2905 __abortlike
2906 static void
2907 kmem_range_out_of_va(
2908 	kmem_range_id_t         range_id,
2909 	uint32_t                num_chunks)
2910 {
2911 	panic("No more VA to allocate %u chunks in range %u", num_chunks, range_id);
2912 }
2913 
2914 static void
2915 kmem_init_allocated_chunk(
2916 	struct kmem_page_meta  *meta,
2917 	struct kmem_sizeclass   sizeclass,
2918 	uint32_t                size_idx)
2919 {
2920 	uint32_t meta_num = sizeclass.ks_num_chunk;
2921 	uint32_t num_elem = sizeclass.ks_num_elem;
2922 
2923 	meta->km_bitmap = (1ull << num_elem) - 1;
2924 	meta->km_chunk_len = (uint16_t)meta_num;
2925 	assert(LIST_NEXT(meta, km_link) == NULL);
2926 	assert(meta->km_link.le_prev == NULL);
2927 	meta->km_sizeclass = (uint8_t)size_idx;
2928 	meta->km_page_marker = KMEM_META_PRIMARY;
2929 	meta++;
2930 	for (uint32_t i = 1; i < meta_num; i++) {
2931 		meta->km_page_idx = (uint16_t)i;
2932 		meta->km_sizeclass = (uint8_t)size_idx;
2933 		meta->km_page_marker = 0;
2934 		meta->km_bitmap = 0;
2935 		meta++;
2936 	}
2937 }
2938 
2939 static uint32_t
2940 kmem_get_additional_meta(
2941 	struct kmem_page_meta  *meta,
2942 	uint32_t                meta_req,
2943 	bool                    from_right,
2944 	struct kmem_page_meta **adj_free_meta)
2945 {
2946 	struct kmem_page_meta *meta_prev = from_right ? meta : (meta - 1);
2947 
2948 	if (meta_prev->km_page_marker == KMEM_META_FREE) {
2949 		uint32_t chunk_len = kmem_get_free_chunk_len(meta_prev);
2950 
2951 		*adj_free_meta = from_right ? meta_prev : (meta_prev - chunk_len + 1);
2952 		meta_req -= chunk_len;
2953 	} else {
2954 		*adj_free_meta = NULL;
2955 	}
2956 
2957 	return meta_req;
2958 }
2959 
2960 
2961 static struct kmem_page_meta *
2962 kmem_get_new_chunk(
2963 	vm_map_range_id_t       range_id,
2964 	bool                    from_right,
2965 	uint32_t                size_idx)
2966 {
2967 	struct kmem_sizeclass sizeclass = kmem_size_array[size_idx];
2968 	struct kmem_page_meta *start, *end, *meta_update;
2969 	struct kmem_page_meta *adj_free_meta = NULL;
2970 	uint32_t meta_req = sizeclass.ks_num_chunk;
2971 
2972 	for (;;) {
2973 		struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
2974 		struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
2975 		struct kmem_page_meta *meta;
2976 		vm_offset_t start_addr, end_addr;
2977 		uint32_t meta_num;
2978 
2979 		meta = from_right ? metab : metaf;
2980 		meta_num = kmem_get_additional_meta(meta, meta_req, from_right,
2981 		    &adj_free_meta);
2982 
2983 		if (metaf + meta_num >= metab) {
2984 			kmem_range_out_of_va(range_id, meta_num);
2985 		}
2986 
2987 		start = from_right ? (metab - meta_num) : metaf;
2988 		end = from_right ? metab : (metaf + meta_num);
2989 
2990 		start_addr = (vm_offset_t)start;
2991 		end_addr   = (vm_offset_t)end;
2992 
2993 		/*
2994 		 * If the new high watermark stays on the same page,
2995 		 * no need to populate and drop the lock.
2996 		 */
2997 		if (!page_aligned(from_right ? end_addr : start_addr) &&
2998 		    trunc_page(start_addr) == trunc_page(end_addr - 1)) {
2999 			break;
3000 		}
3001 		if (!kmem_populate_needed(start_addr, end_addr)) {
3002 			break;
3003 		}
3004 
3005 		kmem_populate_meta_locked(start_addr, end_addr);
3006 
3007 		/*
3008 		 * Since we dropped the lock, reassess whether the conditions still hold:
3009 		 * - the HWM we are changing must not have moved
3010 		 * - the other HWM must not intersect with ours
3011 		 * - in case of coalescing, the adjacent free meta must still
3012 		 *   be free and of the same size.
3013 		 *
3014 		 * If we failed to grow, reevaluate whether freelists have
3015 		 * entries now by returning NULL.
3016 		 */
3017 		metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3018 		metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3019 		if (meta != (from_right ? metab : metaf)) {
3020 			return NULL;
3021 		}
3022 		if (metaf + meta_num >= metab) {
3023 			kmem_range_out_of_va(range_id, meta_num);
3024 		}
3025 		if (adj_free_meta) {
3026 			if (adj_free_meta->km_page_marker != KMEM_META_FREE ||
3027 			    kmem_get_free_chunk_len(adj_free_meta) !=
3028 			    meta_req - meta_num) {
3029 				return NULL;
3030 			}
3031 		}
3032 
3033 		break;
3034 	}
3035 
3036 	/*
3037 	 * If there is an adjacent free chunk, remove it from the free list
3038 	 */
3039 	if (adj_free_meta) {
3040 		LIST_REMOVE(adj_free_meta, km_link);
3041 		LIST_NEXT(adj_free_meta, km_link) = NULL;
3042 		adj_free_meta->km_link.le_prev = NULL;
3043 	}
3044 
3045 	/*
3046 	 * Update hwm
3047 	 */
3048 	meta_update = from_right ? start : end;
3049 	kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta_update;
3050 
3051 	/*
3052 	 * Initialize metadata
3053 	 */
3054 	start = from_right ? start : (end - meta_req);
3055 	kmem_init_allocated_chunk(start, sizeclass, size_idx);
3056 
3057 	return start;
3058 }
3059 
3060 static void
3061 kmem_requeue_meta(
3062 	struct kmem_page_meta  *meta,
3063 	struct kmem_list_head  *head)
3064 {
3065 	LIST_REMOVE(meta, km_link);
3066 	LIST_INSERT_HEAD(head, meta, km_link);
3067 }
3068 
3069 /*
3070  * Return the corresponding sizeclass index to stash free chunks in
3071  */
3072 __abortlike
3073 static void
3074 kmem_invalid_chunk_num(uint32_t chunks)
3075 {
3076 	panic("Invalid number of chunks %u\n", chunks);
3077 }
3078 
3079 static uint32_t
3080 kmem_get_size_idx_for_chunks(uint32_t chunks)
3081 {
3082 	for (uint32_t i = KMEM_NUM_SIZECLASS - 1; i > 0; i--) {
3083 		if (chunks >= kmem_size_array[i].ks_num_chunk) {
3084 			return i;
3085 		}
3086 	}
3087 	kmem_invalid_chunk_num(chunks);
3088 }
3089 
3090 static void
3091 kmem_clear_meta_range(struct kmem_page_meta *meta, uint32_t count)
3092 {
3093 	bzero(meta, count * sizeof(struct kmem_page_meta));
3094 }
3095 
3096 static void
3097 kmem_check_meta_range_is_clear(struct kmem_page_meta *meta, uint32_t count)
3098 {
3099 #if MACH_ASSERT
3100 	size_t size = count * sizeof(struct kmem_page_meta);
3101 
3102 	assert(memcmp_zero_ptr_aligned(meta, size) == 0);
3103 #else
3104 #pragma unused(meta, count)
3105 #endif
3106 }
3107 
3108 /*!
3109  * @function kmem_init_free_chunk()
3110  *
3111  * @discussion
3112  * This function prepares a range of chunks to be put on a free list.
3113  * The first and last metadata might be dirty, but the "inner" ones
3114  * must be zero filled by the caller prior to calling this function.
3115  */
3116 static void
3117 kmem_init_free_chunk(
3118 	struct kmem_page_meta  *meta,
3119 	uint32_t                num_chunks,
3120 	uint32_t                front)
3121 {
3122 	struct kmem_sizeclass *sizeclass;
3123 	uint32_t size_idx = kmem_get_size_idx_for_chunks(num_chunks);
3124 
3125 	if (num_chunks > 2) {
3126 		kmem_check_meta_range_is_clear(meta + 1, num_chunks - 2);
3127 	}
3128 
3129 	meta[0] = (struct kmem_page_meta){
3130 		.km_free_chunks = num_chunks,
3131 		.km_page_marker = KMEM_META_FREE,
3132 		.km_sizeclass   = (uint8_t)size_idx,
3133 	};
3134 	if (num_chunks > 1) {
3135 		meta[num_chunks - 1] = (struct kmem_page_meta){
3136 			.km_free_chunks = num_chunks,
3137 			.km_page_marker = KMEM_META_FREE,
3138 			.km_sizeclass   = (uint8_t)size_idx,
3139 		};
3140 	}
3141 
3142 	sizeclass = &kmem_size_array[size_idx];
3143 	LIST_INSERT_HEAD(&sizeclass->ks_allfree_head[front], meta, km_link);
3144 }
3145 
3146 static struct kmem_page_meta *
3147 kmem_get_free_chunk_from_list(
3148 	struct kmem_sizeclass  *org_sizeclass,
3149 	uint32_t                size_idx,
3150 	uint32_t                front)
3151 {
3152 	struct kmem_sizeclass *sizeclass;
3153 	uint32_t num_chunks = org_sizeclass->ks_num_chunk;
3154 	struct kmem_page_meta *meta;
3155 	uint32_t idx = size_idx;
3156 
3157 	while (idx < KMEM_NUM_SIZECLASS) {
3158 		sizeclass = &kmem_size_array[idx];
3159 		meta = LIST_FIRST(&sizeclass->ks_allfree_head[front]);
3160 		if (meta) {
3161 			break;
3162 		}
3163 		idx++;
3164 	}
3165 
3166 	/*
3167 	 * Trim if larger in size
3168 	 */
3169 	if (meta) {
3170 		uint32_t num_chunks_free = kmem_get_free_chunk_len(meta);
3171 
3172 		assert(meta->km_page_marker == KMEM_META_FREE);
3173 		LIST_REMOVE(meta, km_link);
3174 		LIST_NEXT(meta, km_link) = NULL;
3175 		meta->km_link.le_prev = NULL;
3176 		if (num_chunks_free > num_chunks) {
3177 			num_chunks_free -= num_chunks;
3178 			kmem_init_free_chunk(meta + num_chunks, num_chunks_free, front);
3179 		}
3180 
3181 		kmem_init_allocated_chunk(meta, *org_sizeclass, size_idx);
3182 	}
3183 
3184 	return meta;
3185 }
3186 
3187 kern_return_t
3188 kmem_locate_space(
3189 	vm_map_size_t           size,
3190 	vm_map_range_id_t       range_id,
3191 	bool                    from_right,
3192 	vm_map_offset_t        *start_inout,
3193 	vm_map_entry_t         *entry_out)
3194 {
3195 	vm_map_entry_t entry;
3196 	uint32_t size_idx = kmem_get_idx_from_size(size);
3197 	uint32_t front = kmem_get_front(range_id, from_right);
3198 	struct kmem_sizeclass *sizeclass = &kmem_size_array[size_idx];
3199 	struct kmem_page_meta *meta;
3200 
3201 	assert(size <= sizeclass->ks_size);
3202 again:
3203 	if ((meta = LIST_FIRST(&sizeclass->ks_partial_head[front])) != NULL) {
3204 		*start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3205 		/*
3206 		 * Requeue to full if necessary
3207 		 */
3208 		assert(meta->km_page_marker == KMEM_META_PRIMARY);
3209 		if (__builtin_popcount(meta->km_bitmap) == KMEM_NUM_GUARDS) {
3210 			kmem_requeue_meta(meta, &sizeclass->ks_full_head[front]);
3211 		}
3212 	} else if ((meta = kmem_get_free_chunk_from_list(sizeclass, size_idx,
3213 	    front)) != NULL) {
3214 		*start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3215 		/*
3216 		 * Queue to partial
3217 		 */
3218 		assert(meta->km_page_marker == KMEM_META_PRIMARY);
3219 		assert(__builtin_popcount(meta->km_bitmap) > KMEM_NUM_GUARDS);
3220 		LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link);
3221 	} else {
3222 		meta = kmem_get_new_chunk(range_id, from_right, size_idx);
3223 		if (meta == NULL) {
3224 			goto again;
3225 		}
3226 		*start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3227 		assert(meta->km_page_marker == KMEM_META_PRIMARY);
3228 		LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link);
3229 	}
3230 
3231 	if (entry_out) {
3232 		*entry_out = entry;
3233 	}
3234 
3235 	return KERN_SUCCESS;
3236 }
3237 
3238 /*
3239  * Determine whether the given metadata was allocated from the right
3240  */
3241 static bool
3242 kmem_meta_is_from_right(
3243 	kmem_range_id_t         range_id,
3244 	struct kmem_page_meta  *meta)
3245 {
3246 	struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3247 #if DEBUG || DEVELOPMENT
3248 	struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3249 #endif
3250 	struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
3251 	struct kmem_page_meta *meta_end;
3252 
3253 	meta_end = (struct kmem_page_meta *)kmem_meta_range[range_id].max_address;
3254 
3255 	if ((meta >= meta_base) && (meta < metaf)) {
3256 		return false;
3257 	}
3258 
3259 	assert(meta >= metab && meta < meta_end);
3260 	return true;
3261 }
3262 
3263 static void
3264 kmem_free_chunk(
3265 	kmem_range_id_t         range_id,
3266 	struct kmem_page_meta  *meta,
3267 	bool                    from_right)
3268 {
3269 	struct kmem_page_meta *meta_coalesce = meta - 1;
3270 	struct kmem_page_meta *meta_start = meta;
3271 	uint32_t num_chunks = kmem_get_chunk_len(meta);
3272 	uint32_t add_chunks;
3273 	struct kmem_page_meta *meta_end = meta + num_chunks;
3274 	struct kmem_page_meta *meta_hwm_l, *meta_hwm_r;
3275 	uint32_t front = kmem_get_front(range_id, from_right);
3276 
3277 	meta_hwm_l = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3278 	meta_hwm_r = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3279 
3280 	LIST_REMOVE(meta, km_link);
3281 	kmem_clear_meta_range(meta, num_chunks);
3282 
3283 	/*
3284 	 * Coalesce left
3285 	 */
3286 	if (((from_right && (meta_coalesce >= meta_hwm_r)) || !from_right) &&
3287 	    (meta_coalesce->km_page_marker == KMEM_META_FREE)) {
3288 		meta_start = meta_coalesce - kmem_get_free_chunk_len(meta_coalesce) + 1;
3289 		add_chunks = kmem_get_free_chunk_len(meta_start);
3290 		num_chunks += add_chunks;
3291 		LIST_REMOVE(meta_start, km_link);
3292 		kmem_clear_meta_range(meta_start + add_chunks - 1, 1);
3293 	}
3294 
3295 	/*
3296 	 * Coalesce right
3297 	 */
3298 	if (((!from_right && (meta_end < meta_hwm_l)) || from_right) &&
3299 	    (meta_end->km_page_marker == KMEM_META_FREE)) {
3300 		add_chunks = kmem_get_free_chunk_len(meta_end);
3301 		LIST_REMOVE(meta_end, km_link);
3302 		kmem_clear_meta_range(meta_end, 1);
3303 		meta_end = meta_end + add_chunks;
3304 		num_chunks += add_chunks;
3305 	}
3306 
3307 	kmem_init_free_chunk(meta_start, num_chunks, front);
3308 }
3309 
3310 static void
3311 kmem_free_slot(
3312 	kmem_range_id_t         range_id,
3313 	mach_vm_range_t         slot)
3314 {
3315 	struct kmem_page_meta *meta;
3316 	vm_map_offset_t chunk_start;
3317 	uint32_t size_idx, chunk_elem, slot_idx, num_elem;
3318 	struct kmem_sizeclass *sizeclass;
3319 	vm_map_size_t slot_size;
3320 
3321 	meta = kmem_addr_to_meta_start(slot->min_address, range_id, &chunk_start);
3322 	size_idx = meta->km_sizeclass;
3323 	slot_size = kmem_get_size_from_idx(size_idx);
3324 	slot_idx = (slot->min_address - chunk_start) / slot_size;
3325 	assert((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) == 0);
3326 	meta->km_bitmap |= kmem_slot_idx_to_bit(slot_idx, size_idx);
3327 
3328 	sizeclass = &kmem_size_array[size_idx];
3329 	chunk_elem = sizeclass->ks_num_elem;
3330 	num_elem = __builtin_popcount(meta->km_bitmap);
3331 
3332 	if (num_elem == chunk_elem) {
3333 		/*
3334 		 * If the entire chunk is empty, add it to the empty list
3335 		 */
3336 		bool from_right = kmem_meta_is_from_right(range_id, meta);
3337 
3338 		kmem_free_chunk(range_id, meta, from_right);
3339 	} else if (num_elem == KMEM_NUM_GUARDS + 1) {
3340 		/*
3341 		 * If we freed to a full chunk, move it to the partial list
3342 		 */
3343 		uint32_t front = kmem_get_front(range_id,
3344 		    kmem_meta_is_from_right(range_id, meta));
3345 
3346 		kmem_requeue_meta(meta, &sizeclass->ks_partial_head[front]);
3347 	}
3348 }
3349 
3350 void
3351 kmem_free_space(
3352 	vm_map_offset_t         start,
3353 	vm_map_offset_t         end,
3354 	vm_map_range_id_t       range_id,
3355 	mach_vm_range_t         slot)
3356 {
3357 	bool entry_present = false;
3358 	vm_map_entry_t prev_entry;
3359 	vm_map_entry_t next_entry;
3360 
3361 	if ((slot->min_address == start) && (slot->max_address == end)) {
3362 		/*
3363 		 * Entire slot is being freed at once
3364 		 */
3365 		return kmem_free_slot(range_id, slot);
3366 	}
3367 
3368 	entry_present = vm_map_lookup_entry(kernel_map, start, &prev_entry);
3369 	assert(!entry_present);
3370 	next_entry = prev_entry->vme_next;
3371 
3372 	if (((prev_entry == vm_map_to_entry(kernel_map) ||
3373 	    prev_entry->vme_end <= slot->min_address)) &&
3374 	    (next_entry == vm_map_to_entry(kernel_map) ||
3375 	    (next_entry->vme_start >= slot->max_address))) {
3376 		/*
3377 		 * Free entire slot
3378 		 */
3379 		kmem_free_slot(range_id, slot);
3380 	}
3381 }
3382 
3383 #pragma mark kmem init
3384 
3385 /*
3386  * The default percentage of memory that can be mlocked is scaled based on the total
3387  * amount of memory in the system. These percentages are calculated
3388  * offline and stored in this table. We index this table by
3389  * log2(max_mem) - VM_USER_WIREABLE_MIN_CONFIG. We clamp this index in the range
3390  * [0, sizeof(wire_limit_percents) / sizeof(vm_map_size_t))
3391  *
3392  * Note that these values were picked for Mac.
3393  * If we ever have very large memory config arm devices, we may want to revisit
3394  * since the kernel overhead is smaller there due to the larger page size.
3395  */
3396 
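/*
 * Worked example (hypothetical 16GB jetsam config): log2(16GB) == 34, so the
 * table index is 34 - 32 == 2 and 80% of 16GB (12.8GB) may be wired, subject
 * to the VM_NOT_USER_WIREABLE_MAX cap applied in kmem_set_user_wire_limits().
 */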
3397 /* Start scaling iff we're managing > 2^32 = 4GB of RAM. */
3398 #define VM_USER_WIREABLE_MIN_CONFIG 32
3399 #if CONFIG_JETSAM
3400 /* Systems with jetsam can wire a bit more because the system can relieve wired
3401  * pressure.
3402  */
3403 static vm_map_size_t wire_limit_percents[] =
3404 { 80, 80, 80, 80, 82, 85, 88, 91, 94, 97};
3405 #else
3406 static vm_map_size_t wire_limit_percents[] =
3407 { 70, 73, 76, 79, 82, 85, 88, 91, 94, 97};
3408 #endif /* CONFIG_JETSAM */
3409 
3410 /*
3411  * Sets the default global user wire limit which limits the amount of
3412  * memory that can be locked via mlock() based on the above algorithm.
3413  * This can be overridden via a sysctl.
3414  */
3415 static void
3416 kmem_set_user_wire_limits(void)
3417 {
3418 	uint64_t available_mem_log;
3419 	uint64_t max_wire_percent;
3420 	size_t wire_limit_percents_length = sizeof(wire_limit_percents) /
3421 	    sizeof(vm_map_size_t);
3422 	vm_map_size_t limit;
3423 	uint64_t config_memsize = max_mem;
3424 #if defined(XNU_TARGET_OS_OSX)
3425 	config_memsize = max_mem_actual;
3426 #endif /* defined(XNU_TARGET_OS_OSX) */
3427 
3428 	available_mem_log = bit_floor(config_memsize);
3429 
3430 	if (available_mem_log < VM_USER_WIREABLE_MIN_CONFIG) {
3431 		available_mem_log = 0;
3432 	} else {
3433 		available_mem_log -= VM_USER_WIREABLE_MIN_CONFIG;
3434 	}
3435 	if (available_mem_log >= wire_limit_percents_length) {
3436 		available_mem_log = wire_limit_percents_length - 1;
3437 	}
3438 	max_wire_percent = wire_limit_percents[available_mem_log];
3439 
3440 	limit = config_memsize * max_wire_percent / 100;
3441 	/* Cap the number of non lockable bytes at VM_NOT_USER_WIREABLE_MAX */
3442 	if (config_memsize - limit > VM_NOT_USER_WIREABLE_MAX) {
3443 		limit = config_memsize - VM_NOT_USER_WIREABLE_MAX;
3444 	}
3445 
3446 	vm_global_user_wire_limit = limit;
3447 	/* the default per task limit is the same as the global limit */
3448 	vm_per_task_user_wire_limit = limit;
3449 	vm_add_wire_count_over_global_limit = 0;
3450 	vm_add_wire_count_over_user_limit = 0;
3451 }
3452 
3453 #define KMEM_MAX_CLAIMS 50
3454 __startup_data
3455 struct kmem_range_startup_spec kmem_claims[KMEM_MAX_CLAIMS] = {};
3456 __startup_data
3457 uint32_t kmem_claim_count = 0;
3458 
3459 __startup_func
3460 void
3461 kmem_range_startup_init(
3462 	struct kmem_range_startup_spec *sp)
3463 {
3464 	assert(kmem_claim_count < KMEM_MAX_CLAIMS - KMEM_RANGE_COUNT);
3465 	if (sp->kc_calculate_sz) {
3466 		sp->kc_size = (sp->kc_calculate_sz)();
3467 	}
3468 	if (sp->kc_size) {
3469 		kmem_claims[kmem_claim_count] = *sp;
3470 		kmem_claim_count++;
3471 	}
3472 }
3473 
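/*
 * Burn a random, page-granular amount of VA (1 to 512 pages) at the bottom
 * of kernel_map so that everything placed afterwards is slid by an
 * unpredictable offset; the range is reserved VA-only and never populated.
 */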
3474 static vm_offset_t
3475 kmem_fuzz_start(void)
3476 {
3477 	vm_offset_t kmapoff_kaddr = 0;
3478 	uint32_t kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */
3479 	vm_map_size_t kmapoff_size = ptoa(kmapoff_pgcnt);
3480 
3481 	kmem_alloc(kernel_map, &kmapoff_kaddr, kmapoff_size,
3482 	    KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY,
3483 	    VM_KERN_MEMORY_OSFMK);
3484 	return kmapoff_kaddr + kmapoff_size;
3485 }
3486 
3487 /*
3488  * Generate a randomly shuffled array of indices from 0 to count - 1
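 * using an "inside-out" Fisher-Yates shuffle: at step i, slot i inherits the
 * element at the random index j = kmem_get_random16(i), then slot j receives
 * i, which makes every permutation equally likely (assuming the random
 * source is uniform over [0, i]).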
3489  */
3490 __startup_func
3491 void
3492 kmem_shuffle(
3493 	uint16_t       *shuffle_buf,
3494 	uint16_t        count)
3495 {
3496 	for (uint16_t i = 0; i < count; i++) {
3497 		uint16_t j = kmem_get_random16(i);
3498 		if (j != i) {
3499 			shuffle_buf[i] = shuffle_buf[j];
3500 		}
3501 		shuffle_buf[j] = i;
3502 	}
3503 }
3504 
3505 __startup_func
3506 static void
3507 kmem_shuffle_claims(void)
3508 {
3509 	uint16_t shuffle_buf[KMEM_MAX_CLAIMS] = {};
3510 	uint16_t limit = (uint16_t)kmem_claim_count;
3511 
3512 	kmem_shuffle(&shuffle_buf[0], limit);
3513 	for (uint16_t i = 0; i < limit; i++) {
3514 		struct kmem_range_startup_spec tmp = kmem_claims[i];
3515 		kmem_claims[i] = kmem_claims[shuffle_buf[i]];
3516 		kmem_claims[shuffle_buf[i]] = tmp;
3517 	}
3518 }
3519 
3520 __startup_func
3521 static void
3522 kmem_readjust_ranges(
3523 	uint32_t        cur_idx)
3524 {
3525 	assert(cur_idx != 0);
3526 	uint32_t j = cur_idx - 1, random;
3527 	struct kmem_range_startup_spec sp = kmem_claims[cur_idx];
3528 	struct mach_vm_range *sp_range = sp.kc_range;
3529 
3530 	/*
3531 	 * Find max index where restriction is met
3532 	 */
3533 	for (; j > 0; j--) {
3534 		struct kmem_range_startup_spec spj = kmem_claims[j];
3535 		vm_map_offset_t max_start = spj.kc_range->min_address;
3536 		if (spj.kc_flags & KC_NO_MOVE) {
3537 			panic("kmem_range_init: Can't scramble with multiple constraints");
3538 		}
3539 		if (max_start <= sp_range->min_address) {
3540 			break;
3541 		}
3542 	}
3543 
3544 	/*
3545 	 * Pick a random index from 0 to max index and shift claims to the right
3546 	 * to make room for restricted claim
3547 	 */
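	/*
	 * Example (hypothetical): with claims [A B C D sp] and random == 1,
	 * the result is [A sp B C D], where B, C and D have been slid up by
	 * sp.kc_size to make room for the restricted claim sp.
	 */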
3548 	random = kmem_get_random16((uint16_t)j);
3549 	assert(random <= j);
3550 
3551 	sp_range->min_address = kmem_claims[random].kc_range->min_address;
3552 	sp_range->max_address = sp_range->min_address + sp.kc_size;
3553 
3554 	for (j = cur_idx - 1; j >= random && j != UINT32_MAX; j--) {
3555 		struct kmem_range_startup_spec spj = kmem_claims[j];
3556 		struct mach_vm_range *range = spj.kc_range;
3557 		range->min_address += sp.kc_size;
3558 		range->max_address += sp.kc_size;
3559 		kmem_claims[j + 1] = spj;
3560 	}
3561 
3562 	sp.kc_flags = KC_NO_MOVE;
3563 	kmem_claims[random] = sp;
3564 }
3565 
3566 __startup_func
3567 static vm_map_size_t
3568 kmem_add_ptr_claims(void)
3569 {
3570 	uint64_t kmem_meta_num, kmem_ptr_chunks;
3571 	vm_map_size_t org_ptr_range_size = ptr_range_size;
3572 
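	/*
	 * Split the per-range VA budget between chunks and their metadata in
	 * the ratio KMEM_CHUNK_SIZE_MIN : sizeof(struct kmem_page_meta),
	 * after setting aside a page of slack for rounding, then round down
	 * to a whole number of chunks.
	 */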
3573 	ptr_range_size -= PAGE_SIZE;
3574 	ptr_range_size *= KMEM_CHUNK_SIZE_MIN;
3575 	ptr_range_size /= (KMEM_CHUNK_SIZE_MIN + sizeof(struct kmem_page_meta));
3576 
3577 	kmem_ptr_chunks = ptr_range_size / KMEM_CHUNK_SIZE_MIN;
3578 	ptr_range_size = kmem_ptr_chunks * KMEM_CHUNK_SIZE_MIN;
3579 
3580 	kmem_meta_num = kmem_ptr_chunks + 2;
3581 	kmem_meta_size = round_page(kmem_meta_num * sizeof(struct kmem_page_meta));
3582 
3583 	assert(kmem_meta_size + ptr_range_size <= org_ptr_range_size);
3584 	/*
3585 	 * Add claims for kmem's ranges
3586 	 */
3587 	for (uint32_t i = 0; i < kmem_ptr_ranges; i++) {
3588 		struct kmem_range_startup_spec kmem_spec = {
3589 			.kc_name = "kmem_ptr_range",
3590 			.kc_range = &kmem_ranges[KMEM_RANGE_ID_PTR_0 + i],
3591 			.kc_size = ptr_range_size,
3592 			.kc_flags = KC_NO_ENTRY,
3593 		};
3594 		kmem_claims[kmem_claim_count++] = kmem_spec;
3595 
3596 		struct kmem_range_startup_spec kmem_meta_spec = {
3597 			.kc_name = "kmem_ptr_range_meta",
3598 			.kc_range = &kmem_meta_range[KMEM_RANGE_ID_PTR_0 + i],
3599 			.kc_size = kmem_meta_size,
3600 			.kc_flags = KC_NONE,
3601 		};
3602 		kmem_claims[kmem_claim_count++] = kmem_meta_spec;
3603 	}
3604 	return (org_ptr_range_size - ptr_range_size - kmem_meta_size) *
3605 	       kmem_ptr_ranges;
3606 }
3607 
3608 __startup_func
3609 static void
3610 kmem_add_extra_claims(void)
3611 {
3612 	vm_map_size_t largest_free_size = 0, total_claims = 0;
3613 
3614 	vm_map_sizes(kernel_map, NULL, NULL, &largest_free_size);
3615 	largest_free_size = trunc_page(largest_free_size);
3616 
3617 	/*
3618 	 * kasan and configs w/o *TRR need to have just one ptr range due to
3619 	 * resource constraints.
3620 	 */
3621 #if !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
3622 	kmem_ptr_ranges = 1;
3623 #endif
3624 	/*
3625 	 * Determine size of data and pointer kmem_ranges
3626 	 */
3627 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3628 		total_claims += kmem_claims[i].kc_size;
3629 	}
3630 	assert((total_claims & PAGE_MASK) == 0);
3631 	largest_free_size -= total_claims;
3632 
3633 	/*
3634 	 * Use half the total available VA for all pointer allocations (this
3635 	 * includes the kmem_sprayqtn range). Given that we have 4 total
3636 	 * ranges divide the available VA by 8.
3637 	 */
3638 	ptr_range_size = largest_free_size / ((kmem_ptr_ranges + 1) * 2);
3639 	sprayqtn_range_size = ptr_range_size;
3640 
3641 	if (sprayqtn_range_size > (sane_size / 2)) {
3642 		sprayqtn_range_size = sane_size / 2;
3643 	}
3644 
3645 	ptr_range_size = round_page(ptr_range_size);
3646 	sprayqtn_range_size = round_page(sprayqtn_range_size);
3647 
3648 
3649 	data_range_size = largest_free_size
3650 	    - (ptr_range_size * kmem_ptr_ranges)
3651 	    - sprayqtn_range_size;
3652 
3653 	/*
3654 	 * Add claims for kmem's ranges
3655 	 */
3656 	data_range_size += kmem_add_ptr_claims();
3657 	assert(data_range_size + sprayqtn_range_size +
3658 	    ((ptr_range_size + kmem_meta_size) * kmem_ptr_ranges) <=
3659 	    largest_free_size);
3660 
3661 	struct kmem_range_startup_spec kmem_spec_sprayqtn = {
3662 		.kc_name = "kmem_sprayqtn_range",
3663 		.kc_range = &kmem_ranges[KMEM_RANGE_ID_SPRAYQTN],
3664 		.kc_size = sprayqtn_range_size,
3665 		.kc_flags = KC_NO_ENTRY,
3666 	};
3667 	kmem_claims[kmem_claim_count++] = kmem_spec_sprayqtn;
3668 
3669 	struct kmem_range_startup_spec kmem_spec_data = {
3670 		.kc_name = "kmem_data_range",
3671 		.kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA],
3672 		.kc_size = data_range_size,
3673 		.kc_flags = KC_NO_ENTRY,
3674 	};
3675 	kmem_claims[kmem_claim_count++] = kmem_spec_data;
3676 }
3677 
3678 __startup_func
3679 static void
3680 kmem_scramble_ranges(void)
3681 {
3682 	vm_map_offset_t start = 0;
3683 
3684 	/*
3685 	 * Initialize the KMEM_RANGE_ID_NONE range to use the entire map so that
3686 	 * the vm can find the requested ranges.
3687 	 */
3688 	kmem_ranges[KMEM_RANGE_ID_NONE].min_address = MAX(kernel_map->min_offset,
3689 	    VM_MAP_PAGE_SIZE(kernel_map));
3690 	kmem_ranges[KMEM_RANGE_ID_NONE].max_address = kernel_map->max_offset;
3691 
3692 	/*
3693 	 * Allocate the g_kext_map prior to randomizing the remaining submaps,
3694 	 * as this map is 2G in size and starts at the end of kernel_text on
3695 	 * x86; it could otherwise overflow into the heap.
3696 	 */
3697 	kext_alloc_init();
3698 
3699 	/*
3700 	 * Eat a random amount of kernel_map to fuzz subsequent heap, zone and
3701 	 * stack addresses. (With a 4K page and 9 bits of randomness, this
3702 	 * eats about 2M of VA from the map)
3703 	 *
3704 	 * Note that we always need to slide by at least one page because the VM
3705 	 * pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a base
3706 	 * do not admit this address to be part of any zone submap.
3707 	 */
3708 	start = kmem_fuzz_start();
3709 
3710 	/*
3711 	 * Add claims for ptr and data kmem_ranges
3712 	 */
3713 	kmem_add_extra_claims();
3714 
3715 	/*
3716 	 * Shuffle registered claims
3717 	 */
3718 	assert(kmem_claim_count < UINT16_MAX);
3719 	kmem_shuffle_claims();
3720 
3721 	/*
3722 	 * Apply restrictions and determine range for each claim
3723 	 */
3724 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3725 		vm_map_offset_t end = 0;
3726 		struct kmem_range_startup_spec sp = kmem_claims[i];
3727 		struct mach_vm_range *sp_range = sp.kc_range;
3728 		if (vm_map_locate_space(kernel_map, sp.kc_size, 0,
3729 		    VM_MAP_KERNEL_FLAGS_ANYWHERE(), &start, NULL) != KERN_SUCCESS) {
3730 			panic("kmem_range_init: vm_map_locate_space failing for claim %s",
3731 			    sp.kc_name);
3732 		}
3733 
3734 		end = start + sp.kc_size;
3735 		/*
3736 		 * Re-adjust ranges if restriction not met
3737 		 */
3738 		if (sp_range->min_address && start > sp_range->min_address) {
3739 			kmem_readjust_ranges(i);
3740 		} else {
3741 			sp_range->min_address = start;
3742 			sp_range->max_address = end;
3743 		}
3744 		start = end;
3745 	}
3746 
3747 	/*
3748 	 * We have settled on the ranges, now create temporary entries for the
3749 	 * claims
3750 	 */
3751 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3752 		struct kmem_range_startup_spec sp = kmem_claims[i];
3753 		vm_map_entry_t entry = NULL;
3754 		if (sp.kc_flags & KC_NO_ENTRY) {
3755 			continue;
3756 		}
3757 		if (vm_map_find_space(kernel_map, sp.kc_range->min_address, sp.kc_size, 0,
3758 		    VM_MAP_KERNEL_FLAGS_ANYWHERE(), &entry) != KERN_SUCCESS) {
3759 			panic("kmem_range_init: vm_map_find_space failing for claim %s",
3760 			    sp.kc_name);
3761 		}
3762 		vm_object_reference(kernel_object_default);
3763 		VME_OBJECT_SET(entry, kernel_object_default, false, 0);
3764 		VME_OFFSET_SET(entry, entry->vme_start);
3765 		vm_map_unlock(kernel_map);
3766 	}
3767 	/*
3768 	 * Now that we are done assigning all the ranges, reset
3769 	 * kmem_ranges[KMEM_RANGE_ID_NONE]
3770 	 */
3771 	kmem_ranges[KMEM_RANGE_ID_NONE] = (struct mach_vm_range) {};
3772 
3773 #if DEBUG || DEVELOPMENT
3774 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3775 		struct kmem_range_startup_spec sp = kmem_claims[i];
3776 
3777 		printf("%-24s: %p - %p (%u%c)\n", sp.kc_name,
3778 		    (void *)sp.kc_range->min_address,
3779 		    (void *)sp.kc_range->max_address,
3780 		    mach_vm_size_pretty(sp.kc_size),
3781 		    mach_vm_size_unit(sp.kc_size));
3782 	}
3783 #endif /* DEBUG || DEVELOPMENT */
3784 }
3785 
3786 __startup_func
3787 static void
3788 kmem_range_init(void)
3789 {
3790 	vm_size_t range_adjustment;
3791 
3792 	kmem_scramble_ranges();
3793 
3794 	range_adjustment = sprayqtn_range_size >> 3;
3795 	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address =
3796 	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address + range_adjustment;
3797 	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address =
3798 	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address;
3799 
3800 	range_adjustment = data_range_size >> 3;
3801 	kmem_large_ranges[KMEM_RANGE_ID_DATA].min_address =
3802 	    kmem_ranges[KMEM_RANGE_ID_DATA].min_address + range_adjustment;
3803 	kmem_large_ranges[KMEM_RANGE_ID_DATA].max_address =
3804 	    kmem_ranges[KMEM_RANGE_ID_DATA].max_address;
3805 
3806 	pmap_init();
3807 	kmem_metadata_init();
3808 	kmem_sizeclass_init();
3809 
3810 #if DEBUG || DEVELOPMENT
3811 	for (kmem_range_id_t i = 1; i < KMEM_RANGE_COUNT; i++) {
3812 		vm_size_t range_size = mach_vm_range_size(&kmem_large_ranges[i]);
3813 		printf("kmem_large_ranges[%d]    : %p - %p (%u%c)\n", i,
3814 		    (void *)kmem_large_ranges[i].min_address,
3815 		    (void *)kmem_large_ranges[i].max_address,
3816 		    mach_vm_size_pretty(range_size),
3817 		    mach_vm_size_unit(range_size));
3818 	}
3819 #endif
3820 }
3821 STARTUP(KMEM, STARTUP_RANK_THIRD, kmem_range_init);
3822 
3823 #if DEBUG || DEVELOPMENT
3824 __startup_func
3825 static void
3826 kmem_log_init(void)
3827 {
3828 	/*
3829 	 * The log can only be created after the kmem subsystem is initialized,
3830 	 * as btlog creation uses kmem.
3831 	 */
3832 	kmem_outlier_log = btlog_create(BTLOG_LOG, KMEM_OUTLIER_LOG_SIZE, 0);
3833 }
3834 STARTUP(ZALLOC, STARTUP_RANK_FIRST, kmem_log_init);
3835 
3836 kmem_gobj_stats
3837 kmem_get_gobj_stats(void)
3838 {
3839 	kmem_gobj_stats stats = {};
3840 
3841 	vm_map_lock(kernel_map);
3842 	for (uint8_t i = 0; i < kmem_ptr_ranges; i++) {
3843 		kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST + i;
3844 		struct mach_vm_range range = kmem_ranges[range_id];
3845 		struct kmem_page_meta *meta = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3846 		struct kmem_page_meta *meta_end;
3847 		uint64_t meta_idx = meta - kmem_meta_base[range_id];
3848 		vm_map_size_t used = 0, va = 0, meta_sz = 0, pte_sz = 0;
3849 		vm_map_offset_t addr;
3850 		vm_map_entry_t entry;
3851 
3852 		/*
3853 		 * Left front
3854 		 */
3855 		va = (meta_idx * KMEM_CHUNK_SIZE_MIN);
3856 		meta_sz = round_page(meta_idx * sizeof(struct kmem_page_meta));
3857 
3858 		/*
3859 		 * Right front
3860 		 */
3861 		meta = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3862 		meta_end = kmem_addr_to_meta(range.max_address, range_id, &addr,
3863 		    &meta_idx);
3864 		meta_idx = meta_end - meta;
3865 		meta_sz += round_page(meta_idx * sizeof(struct kmem_page_meta));
3866 		va += (meta_idx * KMEM_CHUNK_SIZE_MIN);
3867 
3868 		/*
3869 		 * Compute VA allocated in entire range
3870 		 */
3871 		if (vm_map_lookup_entry(kernel_map, range.min_address, &entry) == false) {
3872 			entry = entry->vme_next;
3873 		}
3874 		while (entry != vm_map_to_entry(kernel_map) &&
3875 		    entry->vme_start < range.max_address) {
3876 			used += (entry->vme_end - entry->vme_start);
3877 			entry = entry->vme_next;
3878 		}
3879 
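		/*
		 * Estimate page-table overhead for the unused VA: one 8-byte
		 * leaf PTE per unallocated page.
		 */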
3880 		pte_sz = round_page(atop(va - used) * 8);
3881 
3882 		stats.total_used += used;
3883 		stats.total_va += va;
3884 		stats.pte_sz += pte_sz;
3885 		stats.meta_sz += meta_sz;
3886 	}
3887 	vm_map_unlock(kernel_map);
3888 
3889 	return stats;
3890 }
3891 
3892 #endif /* DEBUG || DEVELOPMENT */
3893 
3894 /*
3895  *	kmem_init:
3896  *
3897  *	Initialize the kernel's virtual memory map, taking
3898  *	into account all memory allocated up to this time.
3899  */
3900 __startup_func
3901 void
3902 kmem_init(
3903 	vm_offset_t     start,
3904 	vm_offset_t     end)
3905 {
3906 	vm_map_offset_t map_start;
3907 	vm_map_offset_t map_end;
3908 
3909 	map_start = vm_map_trunc_page(start,
3910 	    VM_MAP_PAGE_MASK(kernel_map));
3911 	map_end = vm_map_round_page(end,
3912 	    VM_MAP_PAGE_MASK(kernel_map));
3913 
3914 	vm_map_will_allocate_early_map(&kernel_map);
3915 #if defined(__arm64__)
3916 	kernel_map = vm_map_create_options(pmap_kernel(),
3917 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS,
3918 	    VM_MAX_KERNEL_ADDRESS,
3919 	    VM_MAP_CREATE_DEFAULT);
3920 	/*
3921 	 *	Reserve virtual memory allocated up to this time.
3922 	 */
3923 	{
3924 		unsigned int    region_select = 0;
3925 		vm_map_offset_t region_start;
3926 		vm_map_size_t   region_size;
3927 		vm_map_offset_t map_addr;
3928 		kern_return_t kr;
3929 
3930 		while (pmap_virtual_region(region_select, &region_start, &region_size)) {
3931 			map_addr = region_start;
3932 			kr = vm_map_enter(kernel_map, &map_addr,
3933 			    vm_map_round_page(region_size,
3934 			    VM_MAP_PAGE_MASK(kernel_map)),
3935 			    (vm_map_offset_t) 0,
3936 			    VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(.vmkf_no_pmap_check = true),
3937 			    VM_OBJECT_NULL,
3938 			    (vm_object_offset_t) 0, FALSE, VM_PROT_NONE, VM_PROT_NONE,
3939 			    VM_INHERIT_DEFAULT);
3940 
3941 			if (kr != KERN_SUCCESS) {
3942 				panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
3943 				    (uint64_t) start, (uint64_t) end, (uint64_t) region_start,
3944 				    (uint64_t) region_size, kr);
3945 			}
3946 
3947 			region_select++;
3948 		}
3949 	}
3950 #else
3951 	kernel_map = vm_map_create_options(pmap_kernel(),
3952 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS, map_end,
3953 	    VM_MAP_CREATE_DEFAULT);
3954 	/*
3955 	 *	Reserve virtual memory allocated up to this time.
3956 	 */
3957 	if (start != VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
3958 		vm_map_offset_t map_addr;
3959 		kern_return_t kr;
3960 
3961 		map_addr = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
3962 		kr = vm_map_enter(kernel_map,
3963 		    &map_addr,
3964 		    (vm_map_size_t)(map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
3965 		    (vm_map_offset_t) 0,
3966 		    VM_MAP_KERNEL_FLAGS_FIXED(.vmkf_no_pmap_check = true),
3967 		    VM_OBJECT_NULL,
3968 		    (vm_object_offset_t) 0, FALSE,
3969 		    VM_PROT_NONE, VM_PROT_NONE,
3970 		    VM_INHERIT_DEFAULT);
3971 
3972 		if (kr != KERN_SUCCESS) {
3973 			panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
3974 			    (uint64_t) start, (uint64_t) end,
3975 			    (uint64_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS,
3976 			    (uint64_t) (map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
3977 			    kr);
3978 		}
3979 	}
3980 #endif
3981 
3982 	kmem_set_user_wire_limits();
3983 }
3984 
3985 
3986 #pragma mark map copyio
3987 
3988 /*
3989  *	Routine:	copyinmap
3990  *	Purpose:
3991  *		Like copyin, except that fromaddr is an address
3992  *		in the specified VM map.  This implementation
3993  *		is incomplete; it handles the current user map
3994  *		and the kernel map/submaps.
3995  */
3996 kern_return_t
3997 copyinmap(
3998 	vm_map_t                map,
3999 	vm_map_offset_t         fromaddr,
4000 	void                    *todata,
4001 	vm_size_t               length)
4002 {
4003 	kern_return_t   kr = KERN_SUCCESS;
4004 	vm_map_t oldmap;
4005 
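	/*
	 * Three cases: a kernel pmap is read directly, the current map can
	 * use plain copyin(), and a foreign map must be switched to (with a
	 * map reference held) around the copyin().
	 */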
4006 	if (vm_map_pmap(map) == pmap_kernel()) {
4007 		/* assume a correct copy */
4008 		memcpy(todata, CAST_DOWN(void *, fromaddr), length);
4009 	} else if (current_map() == map) {
4010 		if (copyin(fromaddr, todata, length) != 0) {
4011 			kr = KERN_INVALID_ADDRESS;
4012 		}
4013 	} else {
4014 		vm_map_reference(map);
4015 		oldmap = vm_map_switch(map);
4016 		if (copyin(fromaddr, todata, length) != 0) {
4017 			kr = KERN_INVALID_ADDRESS;
4018 		}
4019 		vm_map_switch(oldmap);
4020 		vm_map_deallocate(map);
4021 	}
4022 	return kr;
4023 }
4024 
4025 /*
4026  *	Routine:	copyoutmap
4027  *	Purpose:
4028  *		Like copyout, except that toaddr is an address
4029  *		in the specified VM map.
4030  */
4031 kern_return_t
4032 copyoutmap(
4033 	vm_map_t                map,
4034 	void                    *fromdata,
4035 	vm_map_address_t        toaddr,
4036 	vm_size_t               length)
4037 {
4038 	kern_return_t   kr = KERN_SUCCESS;
4039 	vm_map_t        oldmap;
4040 
4041 	if (vm_map_pmap(map) == pmap_kernel()) {
4042 		/* assume a correct copy */
4043 		memcpy(CAST_DOWN(void *, toaddr), fromdata, length);
4044 	} else if (current_map() == map) {
4045 		if (copyout(fromdata, toaddr, length) != 0) {
4046 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUTMAP_SAMEMAP_ERROR), KERN_INVALID_ADDRESS /* arg */);
4047 			kr = KERN_INVALID_ADDRESS;
4048 		}
4049 	} else {
4050 		vm_map_reference(map);
4051 		oldmap = vm_map_switch(map);
4052 		if (copyout(fromdata, toaddr, length) != 0) {
4053 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUTMAP_DIFFERENTMAP_ERROR), KERN_INVALID_ADDRESS /* arg */);
4054 			kr = KERN_INVALID_ADDRESS;
4055 		}
4056 		vm_map_switch(oldmap);
4057 		vm_map_deallocate(map);
4058 	}
4059 	return kr;
4060 }
4061 
4062 /*
4063  *	Routine:	copyoutmap_atomic{32, 64}
4064  *	Purpose:
4065  *		Like copyoutmap, except that the operation is atomic.
4066  *		Takes a value rather than a *fromdata pointer.
4067  */
4068 kern_return_t
4069 copyoutmap_atomic32(
4070 	vm_map_t                map,
4071 	uint32_t                value,
4072 	vm_map_address_t        toaddr)
4073 {
4074 	kern_return_t   kr = KERN_SUCCESS;
4075 	vm_map_t        oldmap;
4076 
4077 	if (vm_map_pmap(map) == pmap_kernel()) {
4078 		/* assume a correct toaddr */
4079 		*(uint32_t *)toaddr = value;
4080 	} else if (current_map() == map) {
4081 		if (copyout_atomic32(value, toaddr) != 0) {
4082 			kr = KERN_INVALID_ADDRESS;
4083 		}
4084 	} else {
4085 		vm_map_reference(map);
4086 		oldmap = vm_map_switch(map);
4087 		if (copyout_atomic32(value, toaddr) != 0) {
4088 			kr = KERN_INVALID_ADDRESS;
4089 		}
4090 		vm_map_switch(oldmap);
4091 		vm_map_deallocate(map);
4092 	}
4093 	return kr;
4094 }
4095 
4096 kern_return_t
4097 copyoutmap_atomic64(
4098 	vm_map_t                map,
4099 	uint64_t                value,
4100 	vm_map_address_t        toaddr)
4101 {
4102 	kern_return_t   kr = KERN_SUCCESS;
4103 	vm_map_t        oldmap;
4104 
4105 	if (vm_map_pmap(map) == pmap_kernel()) {
4106 		/* assume a correct toaddr */
4107 		*(uint64_t *)toaddr = value;
4108 	} else if (current_map() == map) {
4109 		if (copyout_atomic64(value, toaddr) != 0) {
4110 			kr = KERN_INVALID_ADDRESS;
4111 		}
4112 	} else {
4113 		vm_map_reference(map);
4114 		oldmap = vm_map_switch(map);
4115 		if (copyout_atomic64(value, toaddr) != 0) {
4116 			kr = KERN_INVALID_ADDRESS;
4117 		}
4118 		vm_map_switch(oldmap);
4119 		vm_map_deallocate(map);
4120 	}
4121 	return kr;
4122 }
4123 
4124 
4125 #pragma mark pointer obfuscation / packing
4126 
4127 /*
4128  *
4129  *	The following two functions are to be used when exposing kernel
4130  *	addresses to userspace via any of the various debug or info
4131  *	facilities that exist. These are basically the same as VM_KERNEL_ADDRPERM()
4132  *	and VM_KERNEL_UNSLIDE_OR_PERM() except they use a different random seed and
4133  *	are exported to KEXTs.
4134  *
4135  *	NOTE: USE THE MACRO VERSIONS OF THESE FUNCTIONS (in vm_param.h) FROM WITHIN THE KERNEL
4136  */
4137 
4138 vm_offset_t
4139 vm_kernel_addrhash_internal(vm_offset_t addr, uint64_t salt)
4140 {
4141 	assert(salt != 0);
4142 
4143 	if (addr == 0) {
4144 		return 0ul;
4145 	}
4146 
4147 	if (VM_KERNEL_IS_SLID(addr)) {
4148 		return VM_KERNEL_UNSLIDE(addr);
4149 	}
4150 
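	/*
	 * Heap addresses are hashed (rather than offset by a constant) so
	 * the result can be shared without revealing either the address or
	 * the secret salt; only the first vm_offset_t of the digest is
	 * returned.
	 */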
4151 	vm_offset_t sha_digest[SHA256_DIGEST_LENGTH / sizeof(vm_offset_t)];
4152 	SHA256_CTX sha_ctx;
4153 
4154 	SHA256_Init(&sha_ctx);
4155 	SHA256_Update(&sha_ctx, &salt, sizeof(salt));
4156 	SHA256_Update(&sha_ctx, &addr, sizeof(addr));
4157 	SHA256_Final(sha_digest, &sha_ctx);
4158 
4159 	return sha_digest[0];
4160 }
4161 
4162 __exported vm_offset_t
4163 vm_kernel_addrhash_external(vm_offset_t addr);
4164 vm_offset_t
4165 vm_kernel_addrhash_external(vm_offset_t addr)
4166 {
4167 	return vm_kernel_addrhash_internal(addr, vm_kernel_addrhash_salt_ext);
4168 }
4169 
4170 void
4171 vm_kernel_addrhide(
4172 	vm_offset_t addr,
4173 	vm_offset_t *hide_addr)
4174 {
4175 	*hide_addr = VM_KERNEL_ADDRHIDE(addr);
4176 }
4177 
4178 /*
4179  *	vm_kernel_addrperm_external:
4180  *	vm_kernel_unslide_or_perm_external:
4181  *
4182  *	Use these macros when exposing an address to userspace that could come from
4183  *	either kernel text/data *or* the heap.
4184  */
4185 void
4186 vm_kernel_addrperm_external(
4187 	vm_offset_t addr,
4188 	vm_offset_t *perm_addr)
4189 {
4190 	if (VM_KERNEL_IS_SLID(addr)) {
4191 		*perm_addr = VM_KERNEL_UNSLIDE(addr);
4192 	} else if (VM_KERNEL_ADDRESS(addr)) {
4193 		*perm_addr = addr + vm_kernel_addrperm_ext;
4194 	} else {
4195 		*perm_addr = addr;
4196 	}
4197 }
4198 
4199 void
4200 vm_kernel_unslide_or_perm_external(
4201 	vm_offset_t addr,
4202 	vm_offset_t *up_addr)
4203 {
4204 	vm_kernel_addrperm_external(addr, up_addr);
4205 }
4206 
4207 void
4208 vm_packing_pointer_invalid(vm_offset_t ptr, vm_packing_params_t params)
4209 {
4210 	if (ptr & ((1ul << params.vmpp_shift) - 1)) {
4211 		panic("pointer %p can't be packed: low %d bits aren't 0",
4212 		    (void *)ptr, params.vmpp_shift);
4213 	} else if (ptr <= params.vmpp_base) {
4214 		panic("pointer %p can't be packed: below base %p",
4215 		    (void *)ptr, (void *)params.vmpp_base);
4216 	} else {
4217 		panic("pointer %p can't be packed: maximum encodable pointer is %p",
4218 		    (void *)ptr, (void *)vm_packing_max_packable(params));
4219 	}
4220 }
4221 
4222 void
4223 vm_packing_verify_range(
4224 	const char *subsystem,
4225 	vm_offset_t min_address,
4226 	vm_offset_t max_address,
4227 	vm_packing_params_t params)
4228 {
4229 	if (min_address > max_address) {
4230 		panic("%s: %s range invalid min:%p > max:%p",
4231 		    __func__, subsystem, (void *)min_address, (void *)max_address);
4232 	}
4233 
4234 	if (!params.vmpp_base_relative) {
4235 		return;
4236 	}
4237 
4238 	if (min_address <= params.vmpp_base) {
4239 		panic("%s: %s range invalid min:%p <= base:%p",
4240 		    __func__, subsystem, (void *)min_address, (void *)params.vmpp_base);
4241 	}
4242 
4243 	if (max_address > vm_packing_max_packable(params)) {
4244 		panic("%s: %s range invalid max:%p >= max packable:%p",
4245 		    __func__, subsystem, (void *)max_address,
4246 		    (void *)vm_packing_max_packable(params));
4247 	}
4248 }
4249 
4250 #pragma mark tests
4251 #if DEBUG || DEVELOPMENT
4252 #include <sys/errno.h>
4253 
4254 static void
4255 kmem_test_for_entry(
4256 	vm_map_t                map,
4257 	vm_offset_t             addr,
4258 	void                  (^block)(vm_map_entry_t))
4259 {
4260 	vm_map_entry_t entry;
4261 
4262 	vm_map_lock(map);
4263 	block(vm_map_lookup_entry(map, addr, &entry) ? entry : NULL);
4264 	vm_map_unlock(map);
4265 }
4266 
4267 #define kmem_test_assert_map(map, pg, entries) ({ \
4268 	assert3u((map)->size, ==, ptoa(pg)); \
4269 	assert3u((map)->hdr.nentries, ==, entries); \
4270 })
4271 
4272 static bool
4273 can_write_at(vm_offset_t offs, uint32_t page)
4274 {
4275 	static const int zero;
4276 
4277 	return verify_write(&zero, (void *)(offs + ptoa(page) + 128), 1) == 0;
4278 }
4279 #define assert_writeable(offs, page) \
4280 	assertf(can_write_at(offs, page), \
4281 	    "can write at %p + ptoa(%d)", (void *)offs, page)
4282 
4283 #define assert_faults(offs, page) \
4284 	assertf(!can_write_at(offs, page), \
4285 	    "can write at %p + ptoa(%d)", (void *)offs, page)
4286 
4287 #define peek(offs, page) \
4288 	(*(uint32_t *)((offs) + ptoa(page)))
4289 
4290 #define poke(offs, page, v) \
4291 	(*(uint32_t *)((offs) + ptoa(page)) = (v))
4292 
4293 __attribute__((noinline))
4294 static void
4295 kmem_alloc_basic_test(vm_map_t map)
4296 {
4297 	kmem_guard_t guard = {
4298 		.kmg_tag = VM_KERN_MEMORY_DIAG,
4299 	};
4300 	vm_offset_t addr;
4301 
4302 	/*
4303 	 * Test wired basics:
4304 	 * - KMA_KOBJECT
4305 	 * - KMA_GUARD_FIRST, KMA_GUARD_LAST
4306 	 * - allocation alignment
4307 	 */
4308 	addr = kmem_alloc_guard(map, ptoa(10), ptoa(2) - 1,
4309 	    KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard).kmr_address;
4310 	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | GF | GL)", map);
4311 	assert3u((addr + PAGE_SIZE) % ptoa(2), ==, 0);
4312 	kmem_test_assert_map(map, 10, 1);
4313 
4314 	kmem_test_for_entry(map, addr, ^(vm_map_entry_t e){
4315 		assertf(e, "unable to find address %p in map %p", (void *)addr, map);
4316 		assert(e->vme_kernel_object);
4317 		assert(!e->vme_atomic);
4318 		assert3u(e->vme_start, <=, addr);
4319 		assert3u(addr + ptoa(10), <=, e->vme_end);
4320 	});
4321 
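	/* pages 0 and 9 are the guard pages and must fault; 1..8 must not */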
4322 	assert_faults(addr, 0);
4323 	for (int i = 1; i < 9; i++) {
4324 		assert_writeable(addr, i);
4325 	}
4326 	assert_faults(addr, 9);
4327 
4328 	kmem_free(map, addr, ptoa(10));
4329 	kmem_test_assert_map(map, 0, 0);
4330 
4331 	/*
4332 	 * Test pageable basics.
4333 	 */
4334 	addr = kmem_alloc_guard(map, ptoa(10), 0,
4335 	    KMA_PAGEABLE, guard).kmr_address;
4336 	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | PG)", map);
4337 	kmem_test_assert_map(map, 10, 1);
4338 
4339 	for (int i = 0; i < 9; i++) {
4340 		assert_faults(addr, i);
4341 		poke(addr, i, 42);
4342 		assert_writeable(addr, i);
4343 	}
4344 
4345 	kmem_free(map, addr, ptoa(10));
4346 	kmem_test_assert_map(map, 0, 0);
4347 }
4348 
4349 __attribute__((noinline))
4350 static void
4351 kmem_realloc_basic_test(vm_map_t map, kmr_flags_t kind)
4352 {
4353 	kmem_guard_t guard = {
4354 		.kmg_atomic  = !(kind & KMR_DATA),
4355 		.kmg_tag     = VM_KERN_MEMORY_DIAG,
4356 		.kmg_context = 0xefface,
4357 	};
4358 	vm_offset_t addr, newaddr;
4359 	const int N = 10;
4360 
4361 	/*
4362 	 *	This isn't something kmem_realloc_guard() _needs_ to do,
4363 	 *	we could conceive an implementation where it grows in place
4364 	 *	if there's space after it.
4365 	 *
4366 	 *	However, this is what the implementation does today.
4367 	 */
4368 	bool realloc_growth_changes_address = true;
4369 	bool GL = (kind & KMR_GUARD_LAST);
4370 
4371 	/*
4372 	 *	Initial N page allocation
4373 	 */
4374 	addr = kmem_alloc_guard(map, ptoa(N), 0,
4375 	    (kind & (KMA_KOBJECT | KMA_GUARD_LAST | KMA_DATA)) | KMA_ZERO,
4376 	    guard).kmr_address;
4377 	assert3u(addr, !=, 0);
4378 	kmem_test_assert_map(map, N, 1);
4379 	for (int pg = 0; pg < N - GL; pg++) {
4380 		poke(addr, pg, 42 + pg);
4381 	}
4382 	for (int pg = N - GL; pg < N; pg++) {
4383 		assert_faults(addr, pg);
4384 	}
4385 
4386 
4387 	/*
4388 	 *	Grow to N + 3 pages
4389 	 */
4390 	newaddr = kmem_realloc_guard(map, addr, ptoa(N), ptoa(N + 3),
4391 	    kind | KMR_ZERO, guard).kmr_address;
4392 	assert3u(newaddr, !=, 0);
4393 	if (realloc_growth_changes_address) {
4394 		assert3u(addr, !=, newaddr);
4395 	}
4396 	if ((kind & KMR_FREEOLD) || (addr == newaddr)) {
4397 		kmem_test_assert_map(map, N + 3, 1);
4398 	} else {
4399 		kmem_test_assert_map(map, 2 * N + 3, 2);
4400 	}
4401 	for (int pg = 0; pg < N - GL; pg++) {
4402 		assert3u(peek(newaddr, pg), ==, 42 + pg);
4403 	}
4404 	if ((kind & KMR_FREEOLD) == 0) {
4405 		for (int pg = 0; pg < N - GL; pg++) {
4406 			assert3u(peek(addr, pg), ==, 42 + pg);
4407 		}
4408 		/* check for true sharing between the old and new mapping */
4409 		poke(addr + 16, 0, 1234);
4410 		assert3u(peek(newaddr + 16, 0), ==, 1234);
4411 		kmem_free_guard(map, addr, ptoa(N), KMF_NONE, guard);
4412 		kmem_test_assert_map(map, N + 3, 1);
4413 	}
4414 	if (addr != newaddr) {
4415 		for (int pg = 0; pg < N - GL; pg++) {
4416 			assert_faults(addr, pg);
4417 		}
4418 	}
4419 	for (int pg = N - GL; pg < N + 3 - GL; pg++) {
4420 		assert3u(peek(newaddr, pg), ==, 0);
4421 	}
4422 	for (int pg = N + 3 - GL; pg < N + 3; pg++) {
4423 		assert_faults(newaddr, pg);
4424 	}
4425 	addr = newaddr;
4426 
4427 
4428 	/*
4429 	 *	Shrink to N - 2 pages
4430 	 */
4431 	newaddr = kmem_realloc_guard(map, addr, ptoa(N + 3), ptoa(N - 2),
4432 	    kind | KMR_ZERO, guard).kmr_address;
4433 	assert3u(map->size, ==, ptoa(N - 2));
4434 	assert3u(newaddr, ==, addr);
4435 	kmem_test_assert_map(map, N - 2, 1);
4436 
4437 	for (int pg = 0; pg < N - 2 - GL; pg++) {
4438 		assert3u(peek(addr, pg), ==, 42 + pg);
4439 	}
4440 	for (int pg = N - 2 - GL; pg < N + 3; pg++) {
4441 		assert_faults(addr, pg);
4442 	}
4443 
4444 	kmem_free_guard(map, addr, ptoa(N - 2), KMF_NONE, guard);
4445 	kmem_test_assert_map(map, 0, 0);
4446 }
4447 
4448 static int
4449 kmem_basic_test(__unused int64_t in, int64_t *out)
4450 {
4451 	mach_vm_offset_t addr;
4452 	vm_map_t map;
4453 
4454 	printf("%s: test running\n", __func__);
4455 
4456 	map = kmem_suballoc(kernel_map, &addr, 64U << 20,
4457 	        VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
4458 	        KMS_NOFAIL | KMS_DATA, VM_KERN_MEMORY_DIAG).kmr_submap;
4459 
4460 	printf("%s: kmem_alloc ...\n", __func__);
4461 	kmem_alloc_basic_test(map);
4462 	printf("%s:     PASS\n", __func__);
4463 
4464 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
4465 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD);
4466 	printf("%s:     PASS\n", __func__);
4467 
4468 	printf("%s: kmem_realloc (KMR_FREEOLD) ...\n", __func__);
4469 	kmem_realloc_basic_test(map, KMR_FREEOLD);
4470 	printf("%s:     PASS\n", __func__);
4471 
4472 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
4473 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST);
4474 	printf("%s:     PASS\n", __func__);
4475 
4476 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
4477 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
4478 	printf("%s:     PASS\n", __func__);
4479 
4480 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
4481 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
4482 	printf("%s:     PASS\n", __func__);
4483 
4484 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
4485 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_FIRST);
4486 	printf("%s:     PASS\n", __func__);
4487 
4488 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
4489 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_LAST);
4490 	printf("%s:     PASS\n", __func__);
4491 
4492 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
4493 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
4494 	printf("%s:     PASS\n", __func__);
4495 
4496 	/* using KMR_DATA signals to test the non-atomic realloc path */
4497 	printf("%s: kmem_realloc (KMR_DATA | KMR_FREEOLD) ...\n", __func__);
4498 	kmem_realloc_basic_test(map, KMR_DATA | KMR_FREEOLD);
4499 	printf("%s:     PASS\n", __func__);
4500 
4501 	printf("%s: kmem_realloc (KMR_DATA) ...\n", __func__);
4502 	kmem_realloc_basic_test(map, KMR_DATA);
4503 	printf("%s:     PASS\n", __func__);
4504 
4505 	kmem_free_guard(kernel_map, addr, 64U << 20, KMF_NONE, KMEM_GUARD_SUBMAP);
4506 	vm_map_deallocate(map);
4507 
4508 	printf("%s: test passed\n", __func__);
4509 	*out = 1;
4510 	return 0;
4511 }
4512 SYSCTL_TEST_REGISTER(kmem_basic, kmem_basic_test);
4513 
4514 static void
4515 kmem_test_get_size_idx_for_chunks(uint32_t chunks)
4516 {
4517 	uint32_t idx = kmem_get_size_idx_for_chunks(chunks);
4518 
4519 	assert(chunks >= kmem_size_array[idx].ks_num_chunk);
4520 }
4521 
4522 __attribute__((noinline))
4523 static void
4524 kmem_test_get_size_idx_for_all_chunks()
4525 {
4526 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
4527 		uint32_t chunks = kmem_size_array[i].ks_num_chunk;
4528 
4529 		if (chunks != 1) {
4530 			kmem_test_get_size_idx_for_chunks(chunks - 1);
4531 		}
4532 		kmem_test_get_size_idx_for_chunks(chunks);
4533 		kmem_test_get_size_idx_for_chunks(chunks + 1);
4534 	}
4535 }
4536 
4537 static int
4538 kmem_guard_obj_test(__unused int64_t in, int64_t *out)
4539 {
4540 	printf("%s: test running\n", __func__);
4541 
4542 	printf("%s: kmem_get_size_idx_for_chunks\n", __func__);
4543 	kmem_test_get_size_idx_for_all_chunks();
4544 	printf("%s:     PASS\n", __func__);
4545 
4546 	printf("%s: test passed\n", __func__);
4547 	*out = 1;
4548 	return 0;
4549 }
4550 SYSCTL_TEST_REGISTER(kmem_guard_obj, kmem_guard_obj_test);
4551 #endif /* DEBUG || DEVELOPMENT */
4552