xref: /xnu-8796.121.2/osfmk/vm/vm_kern.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_kern.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Kernel memory management.
64  */
65 
66 #include <mach/kern_return.h>
67 #include <mach/vm_param.h>
68 #include <kern/assert.h>
69 #include <kern/thread.h>
70 #include <vm/vm_kern.h>
71 #include <vm/vm_map_internal.h>
72 #include <vm/vm_object.h>
73 #include <vm/vm_page.h>
74 #include <vm/vm_compressor.h>
75 #include <vm/vm_pageout.h>
76 #include <vm/vm_init.h>
77 #include <vm/vm_fault.h>
78 #include <kern/misc_protos.h>
79 #include <vm/cpm.h>
80 #include <kern/ledger.h>
81 #include <kern/bits.h>
82 #include <kern/startup.h>
83 
84 #include <string.h>
85 
86 #include <libkern/OSDebug.h>
87 #include <libkern/crypto/sha2.h>
88 #include <libkern/section_keywords.h>
89 #include <sys/kdebug.h>
90 
91 #include <san/kasan.h>
92 #include <kern/kext_alloc.h>
93 #include <kern/backtrace.h>
94 #include <os/hash.h>
95 #include <kern/zalloc_internal.h>
96 #include <libkern/crypto/rand.h>
97 
98 /*
99  *	Variables exported by this module.
100  */
101 
102 SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map;
103 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_ranges[KMEM_RANGE_COUNT];
104 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_large_ranges[KMEM_RANGE_COUNT];
105 
106 static TUNABLE(uint32_t, kmem_ptr_ranges, "kmem_ptr_ranges",
107     KMEM_RANGE_ID_NUM_PTR);
108 #define KMEM_GOBJ_THRESHOLD   (32ULL << 20)
109 #if DEBUG || DEVELOPMENT
110 #define KMEM_OUTLIER_LOG_SIZE (16ULL << 10)
111 #define KMEM_OUTLIER_SIZE      0
112 #define KMEM_OUTLIER_ALIGN     1
113 btlog_t kmem_outlier_log;
114 #endif /* DEBUG || DEVELOPMENT */
115 
116 __startup_data static vm_map_size_t data_range_size;
117 __startup_data static vm_map_size_t ptr_range_size;
118 __startup_data static vm_map_size_t sprayqtn_range_size;
119 
120 #pragma mark helpers
121 
122 __attribute__((overloadable))
123 __header_always_inline kmem_flags_t
124 ANYF(kma_flags_t flags)
125 {
126 	return (kmem_flags_t)flags;
127 }
128 
129 __attribute__((overloadable))
130 __header_always_inline kmem_flags_t
131 ANYF(kmr_flags_t flags)
132 {
133 	return (kmem_flags_t)flags;
134 }
135 
136 __attribute__((overloadable))
137 __header_always_inline kmem_flags_t
138 ANYF(kmf_flags_t flags)
139 {
140 	return (kmem_flags_t)flags;
141 }
142 
143 __abortlike
144 static void
145 __kmem_invalid_size_panic(
146 	vm_map_t        map,
147 	vm_size_t       size,
148 	uint32_t        flags)
149 {
150 	panic("kmem(map=%p, flags=0x%x): invalid size %zd",
151 	    map, flags, (size_t)size);
152 }
153 
154 __abortlike
155 static void
156 __kmem_invalid_arguments_panic(
157 	const char     *what,
158 	vm_map_t        map,
159 	vm_address_t    address,
160 	vm_size_t       size,
161 	uint32_t        flags)
162 {
163 	panic("kmem_%s(map=%p, addr=%p, size=%zd, flags=0x%x): "
164 	    "invalid arguments passed",
165 	    what, map, (void *)address, (size_t)size, flags);
166 }
167 
168 __abortlike
169 static void
170 __kmem_failed_panic(
171 	vm_map_t        map,
172 	vm_size_t       size,
173 	uint32_t        flags,
174 	kern_return_t   kr,
175 	const char     *what)
176 {
177 	panic("kmem_%s(%p, %zd, 0x%x): failed with %d",
178 	    what, map, (size_t)size, flags, kr);
179 }
180 
181 __abortlike
182 static void
183 __kmem_entry_not_found_panic(
184 	vm_map_t        map,
185 	vm_offset_t     addr)
186 {
187 	panic("kmem(map=%p) no entry found at %p", map, (void *)addr);
188 }
189 
190 __abortlike
191 static void
192 __kmem_invalid_object_panic(uint32_t flags)
193 {
194 	if (flags == 0) {
195 		panic("KMEM_KOBJECT or KMEM_COMPRESSOR is required");
196 	}
197 	panic("more than one of KMEM_KOBJECT or KMEM_COMPRESSOR specified");
198 }
199 
200 static inline vm_object_t
201 __kmem_object(kmem_flags_t flags)
202 {
203 	flags &= (KMEM_KOBJECT | KMEM_COMPRESSOR);
204 	if (flags == 0 || (flags & (flags - 1))) {
205 		__kmem_invalid_object_panic(flags);
206 	}
207 
208 	return (flags & KMEM_KOBJECT) ? kernel_object : compressor_object;
209 }
210 
211 static inline vm_size_t
212 __kmem_guard_left(kmem_flags_t flags)
213 {
214 	return (flags & KMEM_GUARD_FIRST) ? PAGE_SIZE : 0;
215 }
216 
217 static inline vm_size_t
218 __kmem_guard_right(kmem_flags_t flags)
219 {
220 	return (flags & KMEM_GUARD_LAST) ? PAGE_SIZE : 0;
221 }
222 
223 static inline vm_size_t
224 __kmem_guard_size(kmem_flags_t flags)
225 {
226 	return __kmem_guard_left(flags) + __kmem_guard_right(flags);
227 }
228 
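/*
 * Illustrative worked example (not part of the original file): with
 * KMEM_GUARD_FIRST | KMEM_GUARD_LAST set, the helpers above yield
 *
 *	__kmem_guard_left()  == PAGE_SIZE
 *	__kmem_guard_right() == PAGE_SIZE
 *	__kmem_guard_size()  == 2 * PAGE_SIZE
 *
 * which is the extra VA that kmem_alloc_guard_internal() accounts for
 * before computing the usable fill range.
 */
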
229 __pure2
230 static inline vm_size_t
231 __kmem_entry_orig_size(vm_map_entry_t entry)
232 {
233 	vm_object_t object = VME_OBJECT(entry);
234 
235 	if (entry->vme_kernel_object) {
236 		return entry->vme_end - entry->vme_start -
237 		       entry->vme_object_or_delta;
238 	} else {
239 		return object->vo_size - object->vo_size_delta;
240 	}
241 }
242 
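/*
 * Illustrative worked example (not part of the original file): how the
 * unrounded allocation size is recovered.  For atomic kernel-object
 * entries the page-rounding slack is stashed in vme_object_or_delta
 * (see the KASAN block in kmem_alloc_guard_internal() below), so with
 * a hypothetical 10000-byte request on 16K pages:
 *
 *	span  = vme_end - vme_start   = round_page(10000) = 16384
 *	slack = -10000 & PAGE_MASK    = 6384
 *	orig  = span - slack          = 10000
 *
 * Regular objects keep the equivalent slack in vo_size_delta instead.
 */
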
243 
244 #pragma mark kmem range methods
245 
246 #if __arm64__
247 // <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
248 #define mach_vm_range_load(r, r_min, r_max) \
249 	asm("ldp %[rmin], %[rmax], [%[range]]" \
250 	    : [rmin] "=r"(r_min), [rmax] "=r"(r_max) \
251 	    : [range] "r"(r), "m"((r)->min_address), "m"((r)->max_address))
252 #else
253 #define mach_vm_range_load(r, rmin, rmax) \
254 	({ rmin = (r)->min_address; rmax = (r)->max_address; })
255 #endif
256 
257 __abortlike
258 static void
259 __mach_vm_range_overflow(
260 	mach_vm_offset_t        addr,
261 	mach_vm_offset_t        size)
262 {
263 	panic("invalid vm range: [0x%llx, 0x%llx + 0x%llx) wraps around",
264 	    addr, addr, size);
265 }
266 
267 __abortlike
268 static void
269 __mach_vm_range_invalid(
270 	mach_vm_offset_t        min_address,
271 	mach_vm_offset_t        max_address)
272 {
273 	panic("invalid vm range: [0x%llx, 0x%llx) wraps around",
274 	    min_address, max_address);
275 }
276 
277 __header_always_inline mach_vm_size_t
278 mach_vm_range_size(const struct mach_vm_range *r)
279 {
280 	mach_vm_offset_t rmin, rmax;
281 
282 	mach_vm_range_load(r, rmin, rmax);
283 	return rmax - rmin;
284 }
285 
286 __attribute__((overloadable))
287 __header_always_inline bool
288 mach_vm_range_contains(const struct mach_vm_range *r, mach_vm_offset_t addr)
289 {
290 	mach_vm_offset_t rmin, rmax;
291 
292 #if CONFIG_KERNEL_TBI
293 	if (VM_KERNEL_ADDRESS(addr)) {
294 		addr = VM_KERNEL_TBI_FILL(addr);
295 	}
296 #endif /* CONFIG_KERNEL_TBI */
297 
298 	/*
299 	 * The `&` is not a typo: we really expect the check to pass,
300 	 * so encourage the compiler to eagerly load and test without branches
301 	 */
302 	mach_vm_range_load(r, rmin, rmax);
303 	return (addr >= rmin) & (addr < rmax);
304 }
305 
306 __attribute__((overloadable))
307 __header_always_inline bool
308 mach_vm_range_contains(
309 	const struct mach_vm_range *r,
310 	mach_vm_offset_t        addr,
311 	mach_vm_offset_t        size)
312 {
313 	mach_vm_offset_t rmin, rmax;
314 
315 #if CONFIG_KERNEL_TBI
316 	if (VM_KERNEL_ADDRESS(addr)) {
317 		addr = VM_KERNEL_TBI_FILL(addr);
318 	}
319 #endif /* CONFIG_KERNEL_TBI */
320 
321 	/*
322 	 * The `&` is not a typo: we really expect the check to pass,
323 	 * so encourage the compiler to eagerly load and test without branches
324 	 */
325 	mach_vm_range_load(r, rmin, rmax);
326 	return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
327 }
328 
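/*
 * Illustrative sketch (not part of the original file): the two
 * mach_vm_range_contains() overloads above answer "is this address,
 * or this whole [addr, addr + size) span, inside the range?".  A
 * hypothetical helper classifying an allocation against the DATA
 * range could look like this:
 */
static __unused bool
example_span_is_in_data_range(vm_map_offset_t addr, vm_map_size_t size)
{
	const struct mach_vm_range *r = &kmem_ranges[KMEM_RANGE_ID_DATA];

	/* single-address form: only checks the first byte */
	if (!mach_vm_range_contains(r, addr)) {
		return false;
	}
	/* address + size form: the entire span must fit in the range */
	return mach_vm_range_contains(r, addr, size);
}
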
329 __attribute__((overloadable))
330 __header_always_inline bool
331 mach_vm_range_intersects(
332 	const struct mach_vm_range *r1,
333 	const struct mach_vm_range *r2)
334 {
335 	mach_vm_offset_t r1_min, r1_max;
336 	mach_vm_offset_t r2_min, r2_max;
337 
338 	mach_vm_range_load(r1, r1_min, r1_max);
339 	r2_min = r2->min_address;
340 	r2_max = r2->max_address;
341 
342 	if (r1_min > r1_max) {
343 		__mach_vm_range_invalid(r1_min, r1_max);
344 	}
345 
346 	if (r2_min > r2_max) {
347 		__mach_vm_range_invalid(r2_min, r2_max);
348 	}
349 
350 	return r1_max > r2_min && r1_min < r2_max;
351 }
352 
353 __attribute__((overloadable))
354 __header_always_inline bool
355 mach_vm_range_intersects(
356 	const struct mach_vm_range *r1,
357 	mach_vm_offset_t        addr,
358 	mach_vm_offset_t        size)
359 {
360 	struct mach_vm_range r2;
361 
362 #if CONFIG_KERNEL_TBI
363 	addr = VM_KERNEL_STRIP_UPTR(addr);
364 #endif /* CONFIG_KERNEL_TBI */
365 	r2.min_address = addr;
366 	if (os_add_overflow(addr, size, &r2.max_address)) {
367 		__mach_vm_range_overflow(addr, size);
368 	}
369 
370 	return mach_vm_range_intersects(r1, &r2);
371 }
372 
373 bool
374 kmem_range_id_contains(
375 	kmem_range_id_t         range_id,
376 	vm_map_offset_t         addr,
377 	vm_map_size_t           size)
378 {
379 	return mach_vm_range_contains(&kmem_ranges[range_id], addr, size);
380 }
381 
382 __abortlike
383 static void
384 kmem_range_invalid_panic(
385 	kmem_range_id_t         range_id,
386 	vm_map_offset_t         addr,
387 	vm_map_size_t           size)
388 {
389 	const struct mach_vm_range *r = &kmem_ranges[range_id];
390 	mach_vm_offset_t rmin, rmax;
391 
392 	mach_vm_range_load(r, rmin, rmax);
393 	if (addr + size < rmin) {
394 		panic("addr %p + size %llu overflows %p", (void *)addr, size,
395 		    (void *)(addr + size));
396 	}
397 	panic("addr %p + size %llu doesnt fit in one range (id: %u min: %p max: %p)",
398 	    (void *)addr, size, range_id, (void *)rmin, (void *)rmax);
399 }
400 
401 /*
402  * Return whether the entire allocation is contained in the given range
403  */
404 static bool
405 kmem_range_contains_fully(
406 	kmem_range_id_t         range_id,
407 	vm_map_offset_t         addr,
408 	vm_map_size_t           size)
409 {
410 	const struct mach_vm_range *r = &kmem_ranges[range_id];
411 	mach_vm_offset_t rmin, rmax;
412 	bool result = false;
413 
414 #if CONFIG_KERNEL_TBI
415 	if (VM_KERNEL_ADDRESS(addr)) {
416 		addr = VM_KERNEL_TBI_FILL(addr);
417 	}
418 #endif /* CONFIG_KERNEL_TBI */
419 
420 	/*
421 	 * The `&` is not a typo: we really expect the check to pass,
422 	 * so encourage the compiler to eagerly load and test without branches
423 	 */
424 	mach_vm_range_load(r, rmin, rmax);
425 	result = (addr >= rmin) & (addr < rmax);
426 	if (__improbable(result
427 	    && ((addr + size < rmin) || (addr + size > rmax)))) {
428 		kmem_range_invalid_panic(range_id, addr, size);
429 	}
430 	return result;
431 }
432 
433 vm_map_size_t
434 kmem_range_id_size(kmem_range_id_t range_id)
435 {
436 	return mach_vm_range_size(&kmem_ranges[range_id]);
437 }
438 
439 kmem_range_id_t
440 kmem_addr_get_range(vm_map_offset_t addr, vm_map_size_t size)
441 {
442 	kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
443 
444 	for (; range_id < KMEM_RANGE_COUNT; range_id++) {
445 		if (kmem_range_contains_fully(range_id, addr, size)) {
446 			return range_id;
447 		}
448 	}
449 	return KMEM_RANGE_ID_NONE;
450 }
451 
452 bool
453 kmem_is_ptr_range(vm_map_range_id_t range_id)
454 {
455 	return (range_id >= KMEM_RANGE_ID_FIRST) &&
456 	       (range_id <= KMEM_RANGE_ID_NUM_PTR);
457 }
458 
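/*
 * Illustrative sketch (not part of the original file): resolving an
 * allocation back to its range id with kmem_addr_get_range() and then
 * asking whether it lives in one of the pointer ranges.  The helper
 * name is hypothetical.
 */
static __unused bool
example_alloc_is_in_ptr_range(vm_map_offset_t addr, vm_map_size_t size)
{
	kmem_range_id_t range_id = kmem_addr_get_range(addr, size);

	if (range_id == KMEM_RANGE_ID_NONE) {
		/* not fully contained in any registered kmem range */
		return false;
	}
	return kmem_is_ptr_range(range_id);
}
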
459 __abortlike
460 static void
461 kmem_range_invalid_for_overwrite(vm_map_offset_t addr)
462 {
463 	panic("Can't overwrite mappings (addr: %p) in kmem ptr ranges",
464 	    (void *)addr);
465 }
466 
467 mach_vm_range_t
468 kmem_validate_range_for_overwrite(
469 	vm_map_offset_t         addr,
470 	vm_map_size_t           size)
471 {
472 	vm_map_range_id_t range_id = kmem_addr_get_range(addr, size);
473 
474 	if (kmem_is_ptr_range(range_id)) {
475 		kmem_range_invalid_for_overwrite(addr);
476 	}
477 
478 	return &kmem_ranges[range_id];
479 }
480 
481 
482 #pragma mark entry parameters
483 
484 
485 __abortlike
486 static void
487 __kmem_entry_validate_panic(
488 	vm_map_t        map,
489 	vm_map_entry_t  entry,
490 	vm_offset_t     addr,
491 	vm_size_t       size,
492 	uint32_t        flags,
493 	kmem_guard_t    guard)
494 {
495 	const char *what = "???";
496 
497 	if (entry->vme_atomic != guard.kmg_atomic) {
498 		what = "atomicity";
499 	} else if (entry->is_sub_map != guard.kmg_submap) {
500 		what = "objectness";
501 	} else if (addr != entry->vme_start) {
502 		what = "left bound";
503 	} else if ((flags & KMF_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
504 		what = "right bound";
505 	} else if (guard.kmg_context != entry->vme_context) {
506 		what = "guard";
507 	}
508 
509 	panic("kmem(map=%p, addr=%p, size=%zd, flags=0x%x): "
510 	    "entry:%p %s mismatch guard(0x%08x)",
511 	    map, (void *)addr, size, flags, entry,
512 	    what, guard.kmg_context);
513 }
514 
515 static bool
516 __kmem_entry_validate_guard(
517 	vm_map_entry_t  entry,
518 	vm_offset_t     addr,
519 	vm_size_t       size,
520 	kmem_flags_t    flags,
521 	kmem_guard_t    guard)
522 {
523 	if (entry->vme_atomic != guard.kmg_atomic) {
524 		return false;
525 	}
526 
527 	if (!guard.kmg_atomic) {
528 		return true;
529 	}
530 
531 	if (entry->is_sub_map != guard.kmg_submap) {
532 		return false;
533 	}
534 
535 	if (addr != entry->vme_start) {
536 		return false;
537 	}
538 
539 	if ((flags & KMEM_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
540 		return false;
541 	}
542 
543 	if (!guard.kmg_submap && guard.kmg_context != entry->vme_context) {
544 		return false;
545 	}
546 
547 	return true;
548 }
549 
550 void
551 kmem_entry_validate_guard(
552 	vm_map_t        map,
553 	vm_map_entry_t  entry,
554 	vm_offset_t     addr,
555 	vm_size_t       size,
556 	kmem_guard_t    guard)
557 {
558 	if (!__kmem_entry_validate_guard(entry, addr, size, KMEM_NONE, guard)) {
559 		__kmem_entry_validate_panic(map, entry, addr, size, KMEM_NONE, guard);
560 	}
561 }
562 
563 __abortlike
564 static void
565 __kmem_entry_validate_object_panic(
566 	vm_map_t        map,
567 	vm_map_entry_t  entry,
568 	kmem_flags_t    flags)
569 {
570 	const char *what;
571 	const char *verb;
572 
573 	if (entry->is_sub_map) {
574 		panic("kmem(map=%p) entry %p is a submap", map, entry);
575 	}
576 
577 	if (flags & KMEM_KOBJECT) {
578 		what = "kernel";
579 		verb = "isn't";
580 	} else if (flags & KMEM_COMPRESSOR) {
581 		what = "compressor";
582 		verb = "isn't";
583 	} else if (entry->vme_kernel_object) {
584 		what = "kernel";
585 		verb = "is unexpectedly";
586 	} else {
587 		what = "compressor";
588 		verb = "is unexpectedly";
589 	}
590 
591 	panic("kmem(map=%p, flags=0x%x): entry %p %s for the %s object",
592 	    map, flags, entry, verb, what);
593 }
594 
595 static bool
596 __kmem_entry_validate_object(
597 	vm_map_entry_t  entry,
598 	kmem_flags_t    flags)
599 {
600 	if (entry->is_sub_map) {
601 		return false;
602 	}
603 	if ((bool)(flags & KMEM_KOBJECT) != entry->vme_kernel_object) {
604 		return false;
605 	}
606 
607 	return (bool)(flags & KMEM_COMPRESSOR) ==
608 	       (VME_OBJECT(entry) == compressor_object);
609 }
610 
611 vm_size_t
612 kmem_size_guard(
613 	vm_map_t        map,
614 	vm_offset_t     addr,
615 	kmem_guard_t    guard)
616 {
617 	kmem_flags_t flags = KMEM_GUESS_SIZE;
618 	vm_map_entry_t entry;
619 	vm_size_t size;
620 
621 	vm_map_lock_read(map);
622 
623 #if KASAN_CLASSIC
624 	addr -= PAGE_SIZE;
625 #endif /* KASAN_CLASSIC */
626 #if KASAN_TBI
627 	addr = VM_KERNEL_TBI_FILL(addr);
628 #endif /* KASAN_TBI */
629 
630 	if (!vm_map_lookup_entry(map, addr, &entry)) {
631 		__kmem_entry_not_found_panic(map, addr);
632 	}
633 
634 	if (!__kmem_entry_validate_guard(entry, addr, 0, flags, guard)) {
635 		__kmem_entry_validate_panic(map, entry, addr, 0, flags, guard);
636 	}
637 
638 	size = __kmem_entry_orig_size(entry);
639 
640 	vm_map_unlock_read(map);
641 
642 	return size;
643 }
644 
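/*
 * Illustrative sketch (not part of the original file): a caller that
 * made an atomic allocation can recover the original (unrounded) size
 * by passing a guard matching the one used at allocation time.  The
 * context value 0x1234 is a hypothetical caller-chosen cookie.
 */
static __unused vm_size_t
example_lookup_alloc_size(vm_offset_t addr)
{
	kmem_guard_t guard = {
		.kmg_atomic  = true,
		.kmg_context = 0x1234,
	};

	/* panics if the entry at addr does not match the guard */
	return kmem_size_guard(kernel_map, addr, guard);
}
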
645 static inline uint16_t
646 kmem_hash_backtrace(
647 	void                     *fp)
648 {
649 	uint64_t  bt_count;
650 	uintptr_t bt[8] = {};
651 
652 	struct backtrace_control ctl = {
653 		.btc_frame_addr = (uintptr_t)fp,
654 	};
655 
656 	bt_count = backtrace(bt, sizeof(bt) / sizeof(bt[0]), &ctl, NULL);
657 	return (uint16_t) os_hash_jenkins(bt, bt_count * sizeof(bt[0]));
658 }
659 
660 static_assert(KMEM_RANGE_ID_DATA - 1 <= KMEM_RANGE_MASK,
661     "Insufficient bits to represent ptr ranges");
662 
663 kmem_range_id_t
664 kmem_adjust_range_id(
665 	uint32_t                  hash)
666 {
667 	return (kmem_range_id_t) (KMEM_RANGE_ID_PTR_0 +
668 	       (hash & KMEM_RANGE_MASK) % kmem_ptr_ranges);
669 }
670 
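/*
 * Illustrative sketch (not part of the original file): how a 16-bit
 * type hash turns into a placement decision.  The low bits pick one of
 * the pointer ranges, and KMEM_DIRECTION_MASK decides whether the
 * allocation is carved from the left or the right end of that range
 * (see kmem_apply_security_policy() below).
 */
static __unused kmem_range_id_t
example_hash_to_placement(uint16_t type_hash, bool *from_right)
{
	*from_right = (type_hash & KMEM_DIRECTION_MASK) != 0;
	return kmem_adjust_range_id(type_hash);
}
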
671 static bool
672 kmem_use_sprayqtn(
673 	kma_flags_t               kma_flags,
674 	vm_map_size_t             map_size,
675 	vm_offset_t               mask)
676 {
677 	/*
678 	 * Pointer allocations that are above the guard objects threshold or have
679 	 * leading guard pages with non standard alignment requests are redirected
680 	 * to the sprayqtn range.
681 	 */
682 #if DEBUG || DEVELOPMENT
683 	btref_get_flags_t flags = (kma_flags & KMA_NOPAGEWAIT) ?
684 	    BTREF_GET_NOWAIT : 0;
685 
686 	if ((kma_flags & KMA_SPRAYQTN) == 0) {
687 		if (map_size > KMEM_GOBJ_THRESHOLD) {
688 			btlog_record(kmem_outlier_log, (void *)map_size, KMEM_OUTLIER_SIZE,
689 			    btref_get(__builtin_frame_address(0), flags));
690 		} else if ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK)) {
691 			btlog_record(kmem_outlier_log, (void *)mask, KMEM_OUTLIER_ALIGN,
692 			    btref_get(__builtin_frame_address(0), flags));
693 		}
694 	}
695 #endif /* DEBUG || DEVELOPMENT */
696 
697 	return (kma_flags & KMA_SPRAYQTN) ||
698 	       (map_size > KMEM_GOBJ_THRESHOLD) ||
699 	       ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK));
700 }
701 
702 static void
703 kmem_apply_security_policy(
704 	vm_map_t                  map,
705 	kma_flags_t               kma_flags,
706 	kmem_guard_t              guard,
707 	vm_map_size_t             map_size,
708 	vm_offset_t               mask,
709 	vm_map_kernel_flags_t    *vmk_flags,
710 	bool                      assert_dir __unused)
711 {
712 	kmem_range_id_t range_id;
713 	bool from_right;
714 	uint16_t type_hash = guard.kmg_type_hash;
715 
716 	if (startup_phase < STARTUP_SUB_KMEM || map != kernel_map) {
717 		return;
718 	}
719 
720 	/*
721 	 * A non-zero type-hash must be passed by krealloc_type
722 	 */
723 #if (DEBUG || DEVELOPMENT)
724 	if (assert_dir && !(kma_flags & KMA_DATA)) {
725 		assert(type_hash != 0);
726 	}
727 #endif
728 
729 	if (kma_flags & KMA_DATA) {
730 		range_id  = KMEM_RANGE_ID_DATA;
731 		/*
732 		 * As an optimization in KMA_DATA to avoid fragmentation,
733 		 * allocate static carveouts at the end of the DATA range.
734 		 */
735 		from_right = (bool)(kma_flags & KMA_PERMANENT);
736 	} else if (kmem_use_sprayqtn(kma_flags, map_size, mask)) {
737 		range_id = KMEM_RANGE_ID_SPRAYQTN;
738 		from_right = (bool)(kma_flags & KMA_PERMANENT);
739 	} else if (type_hash) {
740 		range_id  = (kmem_range_id_t)(type_hash & KMEM_RANGE_MASK);
741 		from_right = type_hash & KMEM_DIRECTION_MASK;
742 	} else {
743 		/*
744 		 * Range id needs to correspond to one of the PTR ranges
745 		 */
746 		type_hash = (uint16_t) kmem_hash_backtrace(__builtin_frame_address(0));
747 		range_id  = kmem_adjust_range_id(type_hash);
748 		from_right = type_hash & KMEM_DIRECTION_MASK;
749 	}
750 
751 	vmk_flags->vmkf_range_id = range_id;
752 	vmk_flags->vmkf_last_free = from_right;
753 }
754 
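/*
 * Illustrative sketch (not part of the original file): the policy above
 * boils down to a small decision table.  KMA_DATA allocations go to the
 * DATA range (permanent ones from the right end), oversized or unusually
 * aligned pointer allocations go to the spray quarantine, and everything
 * else lands in a pointer range chosen from the type hash (or from a
 * backtrace hash when no type hash was supplied).
 */
static __unused void
example_observe_placement(kma_flags_t flags, vm_map_size_t size)
{
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();

	kmem_apply_security_policy(kernel_map, flags, KMEM_GUARD_NONE,
	    size, /* mask */ 0, &vmk_flags, false);

	/* vmkf_range_id / vmkf_last_free now hold the placement decision */
	(void)vmk_flags;
}
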
755 #pragma mark allocation
756 
757 static kmem_return_t
758 kmem_alloc_guard_internal(
759 	vm_map_t                map,
760 	vm_size_t               size,
761 	vm_offset_t             mask,
762 	kma_flags_t             flags,
763 	kmem_guard_t            guard,
764 	kern_return_t         (^alloc_pages)(vm_size_t, kma_flags_t, vm_page_t *))
765 {
766 	vm_object_t             object;
767 	vm_offset_t             delta = 0;
768 	vm_map_entry_t          entry = NULL;
769 	vm_map_offset_t         map_addr, fill_start;
770 	vm_map_size_t           map_size, fill_size;
771 	vm_page_t               guard_left = VM_PAGE_NULL;
772 	vm_page_t               guard_right = VM_PAGE_NULL;
773 	vm_page_t               wired_page_list = VM_PAGE_NULL;
774 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
775 	bool                    skip_guards;
776 	kmem_return_t           kmr = { };
777 
778 	assert(kernel_map && map->pmap == kernel_pmap);
779 
780 #if DEBUG || DEVELOPMENT
781 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
782 	    size, 0, 0, 0);
783 #endif
784 
785 	if (size == 0 ||
786 	    (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) ||
787 	    (size < __kmem_guard_size(ANYF(flags)))) {
788 		__kmem_invalid_size_panic(map, size, flags);
789 	}
790 
791 	/*
792 	 * limit the size of a single extent of wired memory
793 	 * to try and limit the damage to the system if
794 	 * too many pages get wired down
795 	 * limit raised to 2GB with 128GB max physical limit,
796 	 * but scaled by installed memory above this
797 	 *
798 	 * Note: kmem_alloc_contig_guard() is immune to this check.
799 	 */
800 	if (__improbable(!(flags & (KMA_VAONLY | KMA_PAGEABLE)) &&
801 	    alloc_pages == NULL &&
802 	    size > MAX(1ULL << 31, sane_size / 64))) {
803 		kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
804 		goto out_error;
805 	}
806 
807 	/*
808 	 * Guard pages:
809 	 *
810 	 * Guard pages are implemented as fictitious pages.
811 	 *
812 	 * However, some maps, and some objects are known
813 	 * to manage their memory explicitly, and do not need
814 	 * those to be materialized, which saves memory.
815 	 *
816 	 * By placing guard pages on either end of a stack,
817 	 * they can help detect cases where a thread walks
818 	 * off either end of its stack.
819 	 *
820 	 * They are allocated and set up here and attempts
821 	 * to access those pages are trapped in vm_fault_page().
822 	 *
823 	 * The map_size we were passed may include extra space for
824 	 * guard pages. fill_size represents the actual size to populate.
825 	 * Similarly, fill_start indicates where the actual pages
826 	 * will begin in the range.
827 	 */
828 
829 	map_size   = round_page(size);
830 	fill_start = 0;
831 	fill_size  = map_size - __kmem_guard_size(ANYF(flags));
832 
833 #if KASAN_CLASSIC
834 	if (flags & KMA_KASAN_GUARD) {
835 		assert((flags & (KMA_GUARD_FIRST | KMA_GUARD_LAST)) == 0);
836 		flags |= KMA_GUARD_FIRST | KMA_GUARD_LAST;
837 		delta     = ptoa(2);
838 		map_size += delta;
839 	}
840 #else
841 	(void)delta;
842 #endif /* KASAN_CLASSIC */
843 
844 	skip_guards = (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) ||
845 	    map->never_faults;
846 
847 	if (flags & KMA_GUARD_FIRST) {
848 		vmk_flags.vmkf_guard_before = true;
849 		fill_start += PAGE_SIZE;
850 	}
851 	if ((flags & KMA_GUARD_FIRST) && !skip_guards) {
852 		guard_left = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
853 		if (__improbable(guard_left == VM_PAGE_NULL)) {
854 			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
855 			goto out_error;
856 		}
857 	}
858 	if ((flags & KMA_GUARD_LAST) && !skip_guards) {
859 		guard_right = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
860 		if (__improbable(guard_right == VM_PAGE_NULL)) {
861 			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
862 			goto out_error;
863 		}
864 	}
865 
866 	if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
867 		if (alloc_pages) {
868 			kmr.kmr_return = alloc_pages(fill_size, flags,
869 			    &wired_page_list);
870 		} else {
871 			kmr.kmr_return = vm_page_alloc_list(atop(fill_size), flags,
872 			    &wired_page_list);
873 		}
874 		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
875 			goto out_error;
876 		}
877 	}
878 
879 	/*
880 	 *	Allocate a new object (if necessary).  We must do this before
881 	 *	locking the map, or risk deadlock with the default pager.
882 	 */
883 	if (flags & KMA_KOBJECT) {
884 		object = kernel_object;
885 		vm_object_reference(object);
886 	} else if (flags & KMA_COMPRESSOR) {
887 		object = compressor_object;
888 		vm_object_reference(object);
889 	} else {
890 		object = vm_object_allocate(map_size);
891 		vm_object_set_size(object, map_size, size);
892 		/* stabilize the object to prevent shadowing */
893 		object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
894 		object->true_share = TRUE;
895 	}
896 
897 	if (flags & KMA_LAST_FREE) {
898 		vmk_flags.vmkf_last_free = true;
899 	}
900 	if (flags & KMA_PERMANENT) {
901 		vmk_flags.vmf_permanent = true;
902 	}
903 	kmem_apply_security_policy(map, flags, guard, map_size, mask, &vmk_flags,
904 	    false);
905 
906 	kmr.kmr_return = vm_map_find_space(map, 0, map_size, mask,
907 	    vmk_flags, &entry);
908 	if (__improbable(KERN_SUCCESS != kmr.kmr_return)) {
909 		vm_object_deallocate(object);
910 		goto out_error;
911 	}
912 
913 	map_addr = entry->vme_start;
914 	VME_OBJECT_SET(entry, object, guard.kmg_atomic, guard.kmg_context);
915 	VME_ALIAS_SET(entry, guard.kmg_tag);
916 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
917 		VME_OFFSET_SET(entry, map_addr);
918 	}
919 
920 #if KASAN
921 	if ((flags & KMA_KOBJECT) && guard.kmg_atomic) {
922 		entry->vme_object_or_delta = (-size & PAGE_MASK) + delta;
923 	}
924 #endif /* KASAN */
925 
926 	if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE))) {
927 		entry->wired_count = 1;
928 	}
929 
930 	if (guard_left || guard_right || wired_page_list) {
931 		vm_object_offset_t offset = 0ull;
932 
933 		vm_object_lock(object);
934 		vm_map_unlock(map);
935 
936 		if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
937 			offset = map_addr;
938 		}
939 
940 		if (guard_left) {
941 			vm_page_insert(guard_left, object, offset);
942 			guard_left->vmp_busy = FALSE;
943 			guard_left = VM_PAGE_NULL;
944 		}
945 
946 		if (guard_right) {
947 			vm_page_insert(guard_right, object,
948 			    offset + fill_start + fill_size);
949 			guard_right->vmp_busy = FALSE;
950 			guard_right = VM_PAGE_NULL;
951 		}
952 
953 		if (wired_page_list) {
954 			kernel_memory_populate_object_and_unlock(object,
955 			    map_addr + fill_start, offset + fill_start, fill_size,
956 			    wired_page_list, flags, guard.kmg_tag, VM_PROT_DEFAULT);
957 		} else {
958 			vm_object_unlock(object);
959 		}
960 	} else {
961 		vm_map_unlock(map);
962 	}
963 
964 	/*
965 	 * now that the pages are wired, we no longer have to fear coalescing
966 	 */
967 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
968 		vm_map_simplify(map, map_addr);
969 	}
970 
971 #if DEBUG || DEVELOPMENT
972 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
973 	    atop(fill_size), 0, 0, 0);
974 #endif /* DEBUG || DEVELOPMENT */
975 	kmr.kmr_address = CAST_DOWN(vm_offset_t, map_addr);
976 
977 #if KASAN
978 	if (flags & (KMA_KASAN_GUARD | KMA_PAGEABLE)) {
979 		/*
980 		 * We need to allow the range for pageable memory,
981 		 * or faulting will not be allowed.
982 		 */
983 		kasan_notify_address(map_addr, map_size);
984 	}
985 #endif /* KASAN */
986 #if KASAN_CLASSIC
987 	if (flags & KMA_KASAN_GUARD) {
988 		kmr.kmr_address += PAGE_SIZE;
989 		kasan_alloc_large(kmr.kmr_address, size);
990 	}
991 #endif /* KASAN_CLASSIC */
992 #if KASAN_TBI
993 	if (flags & KMA_TAG) {
994 		kmr.kmr_address = kasan_tbi_tag_large_alloc(kmr.kmr_address,
995 		    map_size, size);
996 	}
997 #endif /* KASAN_TBI */
998 	return kmr;
999 
1000 out_error:
1001 	if (flags & KMA_NOFAIL) {
1002 		__kmem_failed_panic(map, size, flags, kmr.kmr_return, "alloc");
1003 	}
1004 	if (guard_left) {
1005 		guard_left->vmp_snext = wired_page_list;
1006 		wired_page_list = guard_left;
1007 	}
1008 	if (guard_right) {
1009 		guard_right->vmp_snext = wired_page_list;
1010 		wired_page_list = guard_right;
1011 	}
1012 	if (wired_page_list) {
1013 		vm_page_free_list(wired_page_list, FALSE);
1014 	}
1015 
1016 #if DEBUG || DEVELOPMENT
1017 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
1018 	    0, 0, 0, 0);
1019 #endif /* DEBUG || DEVELOPMENT */
1020 
1021 	return kmr;
1022 }
1023 
1024 kmem_return_t
1025 kmem_alloc_guard(
1026 	vm_map_t        map,
1027 	vm_size_t       size,
1028 	vm_offset_t     mask,
1029 	kma_flags_t     flags,
1030 	kmem_guard_t    guard)
1031 {
1032 	return kmem_alloc_guard_internal(map, size, mask, flags, guard, NULL);
1033 }
1034 
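/*
 * Illustrative sketch (not part of the original file): a typical wired
 * allocation in the kernel object with guard pages on both sides,
 * followed by a matching free.  The guard context 0xBEEF is a
 * hypothetical caller-chosen cookie; atomic entries must be freed with
 * the same guard they were allocated with.
 */
static __unused void
example_alloc_then_free(vm_size_t size)
{
	kmem_guard_t guard = {
		.kmg_atomic  = true,
		.kmg_tag     = vm_tag_bt(),
		.kmg_context = 0xBEEF,
	};
	kmem_return_t kmr;

	kmr = kmem_alloc_guard(kernel_map, size, /* mask */ 0,
	    KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard);
	if (kmr.kmr_return != KERN_SUCCESS) {
		return;
	}

	/* ... use [kmr.kmr_address, kmr.kmr_address + size) ... */

	kmem_free_guard(kernel_map, kmr.kmr_address, size, KMF_NONE, guard);
}
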
1035 kmem_return_t
1036 kmem_alloc_contig_guard(
1037 	vm_map_t                map,
1038 	vm_size_t               size,
1039 	vm_offset_t             mask,
1040 	ppnum_t                 max_pnum,
1041 	ppnum_t                 pnum_mask,
1042 	kma_flags_t             flags,
1043 	kmem_guard_t            guard)
1044 {
1045 	__auto_type alloc_pages = ^(vm_size_t fill_size, kma_flags_t kma_flags, vm_page_t *pages) {
1046 		return cpm_allocate(fill_size, pages, max_pnum, pnum_mask, FALSE, kma_flags);
1047 	};
1048 
1049 	return kmem_alloc_guard_internal(map, size, mask, flags, guard, alloc_pages);
1050 }
1051 
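/*
 * Illustrative sketch (not part of the original file): drivers needing
 * physically contiguous, DMA-reachable memory can bound the physical
 * page number.  The 4GB boundary below is a hypothetical device
 * constraint, not something this file mandates.
 */
static __unused kmem_return_t
example_alloc_contig_below_4g(vm_size_t size, kmem_guard_t guard)
{
	ppnum_t max_pnum  = (ppnum_t)atop(1ULL << 32); /* stay below 4GB */
	ppnum_t pnum_mask = 0;                         /* no alignment constraint */

	return kmem_alloc_contig_guard(kernel_map, size, /* mask */ 0,
	           max_pnum, pnum_mask, KMA_KOBJECT, guard);
}
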
1052 kmem_return_t
1053 kmem_suballoc(
1054 	vm_map_t                parent,
1055 	mach_vm_offset_t       *addr,
1056 	vm_size_t               size,
1057 	vm_map_create_options_t vmc_options,
1058 	int                     vm_flags,
1059 	kms_flags_t             flags,
1060 	vm_tag_t                tag)
1061 {
1062 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1063 	vm_map_offset_t map_addr = 0;
1064 	kmem_return_t kmr = { };
1065 	vm_map_t map;
1066 
1067 	assert(page_aligned(size));
1068 	assert(parent->pmap == kernel_pmap);
1069 
1070 	vm_map_kernel_flags_set_vmflags(&vmk_flags, vm_flags, tag);
1071 
1072 	if (parent == kernel_map) {
1073 		assert(vmk_flags.vmf_overwrite || (flags & KMS_DATA));
1074 	}
1075 
1076 	if (vmk_flags.vmf_fixed) {
1077 		map_addr = trunc_page(*addr);
1078 	}
1079 
1080 	pmap_reference(vm_map_pmap(parent));
1081 	map = vm_map_create_options(vm_map_pmap(parent), 0, size, vmc_options);
1082 
1083 	/*
1084 	 * 1. vm_map_enter() will consume one ref on success.
1085 	 *
1086 	 * 2. make the entry atomic as kernel submaps should never be split.
1087 	 *
1088 	 * 3. instruct vm_map_enter() that it is a fresh submap
1089 	 *    that needs to be taught its bounds as it is inserted.
1090 	 */
1091 	vm_map_reference(map);
1092 
1093 	vmk_flags.vmkf_submap = true;
1094 	if ((flags & KMS_DATA) == 0) {
1095 		/* FIXME: IOKit submaps get fragmented and can't be atomic */
1096 		vmk_flags.vmkf_submap_atomic = true;
1097 	}
1098 	vmk_flags.vmkf_submap_adjust = true;
1099 	if (flags & KMS_LAST_FREE) {
1100 		vmk_flags.vmkf_last_free = true;
1101 	}
1102 	if (flags & KMS_PERMANENT) {
1103 		vmk_flags.vmf_permanent = true;
1104 	}
1105 	if (flags & KMS_DATA) {
1106 		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
1107 	}
1108 
1109 	kmr.kmr_return = vm_map_enter(parent, &map_addr, size, 0,
1110 	    vmk_flags, (vm_object_t)map, 0, FALSE,
1111 	    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
1112 
1113 	if (kmr.kmr_return != KERN_SUCCESS) {
1114 		if (flags & KMS_NOFAIL) {
1115 			panic("kmem_suballoc(map=%p, size=%zd) failed with %d",
1116 			    parent, size, kmr.kmr_return);
1117 		}
1118 		assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1119 		vm_map_deallocate(map);
1120 		vm_map_deallocate(map); /* also removes ref to pmap */
1121 		return kmr;
1122 	}
1123 
1124 	/*
1125 	 * For kmem_suballocs that register a claim and are assigned a range, ensure
1126 	 * that the exact same range is returned.
1127 	 */
1128 	if (*addr != 0 && parent == kernel_map &&
1129 	    startup_phase > STARTUP_SUB_KMEM) {
1130 		assert(CAST_DOWN(vm_offset_t, map_addr) == *addr);
1131 	} else {
1132 		*addr = map_addr;
1133 	}
1134 
1135 	kmr.kmr_submap = map;
1136 	return kmr;
1137 }
1138 
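/*
 * Illustrative sketch (not part of the original file): carving a
 * pageable data submap out of kernel_map.  The 32MB size and the
 * choice to let the VM pick the address are hypothetical; KMS_DATA
 * places the submap in the DATA range, as required for kernel_map
 * suballocations that don't overwrite an existing mapping.
 */
static __unused vm_map_t
example_make_data_submap(void)
{
	mach_vm_offset_t addr = 0;      /* let vm_map_enter() choose */
	kmem_return_t    kmr;

	kmr = kmem_suballoc(kernel_map, &addr, (32ULL << 20),
	    VM_MAP_CREATE_PAGEABLE, VM_FLAGS_ANYWHERE,
	    KMS_DATA | KMS_PERMANENT | KMS_NOFAIL, VM_KERN_MEMORY_NONE);

	return kmr.kmr_submap;
}
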
1139 /*
1140  *	kmem_alloc:
1141  *
1142  *	Allocate wired-down memory in the kernel's address map
1143  *	or a submap.  The memory is not zero-filled.
1144  */
1145 
1146 __exported kern_return_t
1147 kmem_alloc_external(
1148 	vm_map_t        map,
1149 	vm_offset_t     *addrp,
1150 	vm_size_t       size);
1151 kern_return_t
1152 kmem_alloc_external(
1153 	vm_map_t        map,
1154 	vm_offset_t     *addrp,
1155 	vm_size_t       size)
1156 {
1157 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1158 		return kmem_alloc(map, addrp, size, KMA_NONE, vm_tag_bt());
1159 	}
1160 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1161 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1162 }
1163 
1164 
1165 /*
1166  *	kmem_alloc_kobject:
1167  *
1168  *	Allocate wired-down memory in the kernel's address map
1169  *	or a submap.  The memory is not zero-filled.
1170  *
1171  *	The memory is allocated in the kernel_object.
1172  *	It may not be copied with vm_map_copy, and
1173  *	it may not be reallocated with kmem_realloc.
1174  */
1175 
1176 __exported kern_return_t
1177 kmem_alloc_kobject_external(
1178 	vm_map_t        map,
1179 	vm_offset_t     *addrp,
1180 	vm_size_t       size);
1181 kern_return_t
1182 kmem_alloc_kobject_external(
1183 	vm_map_t        map,
1184 	vm_offset_t     *addrp,
1185 	vm_size_t       size)
1186 {
1187 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1188 		return kmem_alloc(map, addrp, size, KMA_KOBJECT, vm_tag_bt());
1189 	}
1190 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1191 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1192 }
1193 
1194 /*
1195  *	kmem_alloc_pageable:
1196  *
1197  *	Allocate pageable memory in the kernel's address map.
1198  */
1199 
1200 __exported kern_return_t
1201 kmem_alloc_pageable_external(
1202 	vm_map_t        map,
1203 	vm_offset_t     *addrp,
1204 	vm_size_t       size);
1205 kern_return_t
1206 kmem_alloc_pageable_external(
1207 	vm_map_t        map,
1208 	vm_offset_t     *addrp,
1209 	vm_size_t       size)
1210 {
1211 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1212 		return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA, vm_tag_bt());
1213 	}
1214 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1215 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1216 }
1217 
1218 
1219 #pragma mark population
1220 
1221 static void
1222 kernel_memory_populate_pmap_enter(
1223 	vm_object_t             object,
1224 	vm_address_t            addr,
1225 	vm_object_offset_t      offset,
1226 	vm_page_t               mem,
1227 	vm_prot_t               prot,
1228 	int                     pe_flags)
1229 {
1230 	kern_return_t   pe_result;
1231 	int             pe_options;
1232 
1233 	PMAP_ENTER_CHECK(kernel_pmap, mem);
1234 
1235 	pe_options = PMAP_OPTIONS_NOWAIT;
1236 	if (object->internal) {
1237 		pe_options |= PMAP_OPTIONS_INTERNAL;
1238 	}
1239 	if (mem->vmp_reusable || object->all_reusable) {
1240 		pe_options |= PMAP_OPTIONS_REUSABLE;
1241 	}
1242 
1243 	pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1244 	    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1245 	    pe_flags, /* wired */ TRUE, pe_options, NULL);
1246 
1247 	if (pe_result == KERN_RESOURCE_SHORTAGE) {
1248 		vm_object_unlock(object);
1249 
1250 		pe_options &= ~PMAP_OPTIONS_NOWAIT;
1251 
1252 		pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1253 		    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1254 		    pe_flags, /* wired */ TRUE, pe_options, NULL);
1255 
1256 		vm_object_lock(object);
1257 	}
1258 
1259 	assert(pe_result == KERN_SUCCESS);
1260 }
1261 
1262 void
1263 kernel_memory_populate_object_and_unlock(
1264 	vm_object_t     object, /* must be locked */
1265 	vm_address_t    addr,
1266 	vm_offset_t     offset,
1267 	vm_size_t       size,
1268 	vm_page_t       page_list,
1269 	kma_flags_t     flags,
1270 	vm_tag_t        tag,
1271 	vm_prot_t       prot)
1272 {
1273 	vm_page_t       mem;
1274 	int             pe_flags;
1275 	bool            gobbled_list = page_list && page_list->vmp_gobbled;
1276 
1277 	assert3u((bool)(flags & KMA_KOBJECT), ==, object == kernel_object);
1278 	assert3u((bool)(flags & KMA_COMPRESSOR), ==, object == compressor_object);
1279 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
1280 		assert3u(offset, ==, addr);
1281 	} else {
1282 		/*
1283 		 * kernel_memory_populate_pmap_enter() might drop the object
1284 		 * lock, and the caller might not own a reference anymore
1285 		 * and rely on holding the vm object lock for liveness.
1286 		 */
1287 		vm_object_reference_locked(object);
1288 	}
1289 
1290 	if (flags & KMA_KSTACK) {
1291 		pe_flags = VM_MEM_STACK;
1292 	} else {
1293 		pe_flags = 0;
1294 	}
1295 
1296 	for (vm_object_offset_t pg_offset = 0;
1297 	    pg_offset < size;
1298 	    pg_offset += PAGE_SIZE_64) {
1299 		if (page_list == NULL) {
1300 			panic("%s: page_list too short", __func__);
1301 		}
1302 
1303 		mem = page_list;
1304 		page_list = mem->vmp_snext;
1305 		mem->vmp_snext = NULL;
1306 
1307 		assert(mem->vmp_wire_count == 0);
1308 		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
1309 		assert(!mem->vmp_fictitious && !mem->vmp_private);
1310 
1311 		if (flags & KMA_COMPRESSOR) {
1312 			mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
1313 			/*
1314 			 * Background processes doing I/O accounting can call
1315 			 * into NVME driver to do some work which results in
1316 			 * an allocation here and so we want to make sure
1317 			 * that the pages used by compressor, regardless of
1318 			 * process context, are never on the special Q.
1319 			 */
1320 			mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
1321 
1322 			vm_page_insert(mem, object, offset + pg_offset);
1323 		} else {
1324 			mem->vmp_q_state = VM_PAGE_IS_WIRED;
1325 			mem->vmp_wire_count = 1;
1326 
1327 			vm_page_insert_wired(mem, object, offset + pg_offset, tag);
1328 		}
1329 
1330 		mem->vmp_gobbled = false;
1331 		mem->vmp_busy = false;
1332 		mem->vmp_pmapped = true;
1333 		mem->vmp_wpmapped = true;
1334 
1335 		/*
1336 		 * Manual PMAP_ENTER_OPTIONS() with shortcuts
1337 		 * for the kernel and compressor objects.
1338 		 */
1339 
1340 		kernel_memory_populate_pmap_enter(object, addr, pg_offset,
1341 		    mem, prot, pe_flags);
1342 
1343 		if (flags & KMA_NOENCRYPT) {
1344 			pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
1345 		}
1346 	}
1347 
1348 	if (page_list) {
1349 		panic("%s: page_list too long", __func__);
1350 	}
1351 
1352 	vm_object_unlock(object);
1353 	if ((flags & (KMA_KOBJECT | KMA_COMPRESSOR)) == 0) {
1354 		vm_object_deallocate(object);
1355 	}
1356 
1357 	/*
1358 	 * Update the accounting:
1359 	 * - the compressor "wired" pages don't really count as wired
1360 	 * - kmem_alloc_contig_guard() gives gobbled pages,
1361 	 *   which already count as wired but need to be ungobbled.
1362 	 */
1363 	if (gobbled_list) {
1364 		vm_page_lockspin_queues();
1365 		if (flags & KMA_COMPRESSOR) {
1366 			vm_page_wire_count -= atop(size);
1367 		}
1368 		vm_page_gobble_count -= atop(size);
1369 		vm_page_unlock_queues();
1370 	} else if ((flags & KMA_COMPRESSOR) == 0) {
1371 		vm_page_lockspin_queues();
1372 		vm_page_wire_count += atop(size);
1373 		vm_page_unlock_queues();
1374 	}
1375 
1376 	if (flags & KMA_KOBJECT) {
1377 		/* vm_page_insert_wired() handles regular objects already */
1378 		vm_tag_update_size(tag, size);
1379 	}
1380 
1381 #if KASAN
1382 	if (flags & KMA_COMPRESSOR) {
1383 		kasan_notify_address_nopoison(addr, size);
1384 	} else {
1385 		kasan_notify_address(addr, size);
1386 	}
1387 #endif /* KASAN */
1388 }
1389 
1390 
1391 kern_return_t
1392 kernel_memory_populate(
1393 	vm_offset_t     addr,
1394 	vm_size_t       size,
1395 	kma_flags_t     flags,
1396 	vm_tag_t        tag)
1397 {
1398 	kern_return_t   kr = KERN_SUCCESS;
1399 	vm_page_t       page_list = NULL;
1400 	vm_size_t       page_count = atop_64(size);
1401 	vm_object_t     object = __kmem_object(ANYF(flags));
1402 
1403 #if DEBUG || DEVELOPMENT
1404 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
1405 	    size, 0, 0, 0);
1406 #endif /* DEBUG || DEVELOPMENT */
1407 
1408 	kr = vm_page_alloc_list(page_count, flags, &page_list);
1409 	if (kr == KERN_SUCCESS) {
1410 		vm_object_lock(object);
1411 		kernel_memory_populate_object_and_unlock(object, addr,
1412 		    addr, size, page_list, flags, tag, VM_PROT_DEFAULT);
1413 	}
1414 
1415 #if DEBUG || DEVELOPMENT
1416 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
1417 	    page_count, 0, 0, 0);
1418 #endif /* DEBUG || DEVELOPMENT */
1419 	return kr;
1420 }
1421 
1422 void
1423 kernel_memory_depopulate(
1424 	vm_offset_t        addr,
1425 	vm_size_t          size,
1426 	kma_flags_t        flags,
1427 	vm_tag_t           tag)
1428 {
1429 	vm_object_t        object = __kmem_object(ANYF(flags));
1430 	vm_object_offset_t offset = addr;
1431 	vm_page_t          mem;
1432 	vm_page_t          local_freeq = NULL;
1433 	unsigned int       pages_unwired = 0;
1434 
1435 	vm_object_lock(object);
1436 
1437 	pmap_protect(kernel_pmap, offset, offset + size, VM_PROT_NONE);
1438 
1439 	for (vm_object_offset_t pg_offset = 0;
1440 	    pg_offset < size;
1441 	    pg_offset += PAGE_SIZE_64) {
1442 		mem = vm_page_lookup(object, offset + pg_offset);
1443 
1444 		assert(mem);
1445 
1446 		if (flags & KMA_COMPRESSOR) {
1447 			assert(mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
1448 		} else {
1449 			assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
1450 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
1451 			pages_unwired++;
1452 		}
1453 
1454 		mem->vmp_busy = TRUE;
1455 
1456 		assert(mem->vmp_tabled);
1457 		vm_page_remove(mem, TRUE);
1458 		assert(mem->vmp_busy);
1459 
1460 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
1461 
1462 		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
1463 		mem->vmp_snext = local_freeq;
1464 		local_freeq = mem;
1465 	}
1466 
1467 	vm_object_unlock(object);
1468 
1469 	vm_page_free_list(local_freeq, TRUE);
1470 
1471 	if (!(flags & KMA_COMPRESSOR)) {
1472 		vm_page_lockspin_queues();
1473 		vm_page_wire_count -= pages_unwired;
1474 		vm_page_unlock_queues();
1475 	}
1476 
1477 	if (flags & KMA_KOBJECT) {
1478 		/* vm_page_remove() handles regular objects already */
1479 		vm_tag_update_size(tag, -ptoa_64(pages_unwired));
1480 	}
1481 }
1482 
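/*
 * Illustrative sketch (not part of the original file): pairing
 * kernel_memory_populate() with kernel_memory_depopulate() so that a
 * VA-only kernel-object range is only backed by physical pages while
 * it is actually in use.  The tag is whatever the caller allocated
 * the VA range with.
 */
static __unused void
example_populate_then_depopulate(vm_offset_t addr, vm_size_t size, vm_tag_t tag)
{
	/* addr/size come from a KMA_KOBJECT | KMA_VAONLY allocation */
	if (kernel_memory_populate(addr, size, KMA_KOBJECT, tag) != KERN_SUCCESS) {
		return;
	}

	/* ... use the now-wired pages ... */

	kernel_memory_depopulate(addr, size, KMA_KOBJECT, tag);
}
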
1483 #pragma mark reallocation
1484 
1485 __abortlike
1486 static void
1487 __kmem_realloc_invalid_object_size_panic(
1488 	vm_map_t                map,
1489 	vm_address_t            address,
1490 	vm_size_t               size,
1491 	vm_map_entry_t          entry)
1492 {
1493 	vm_object_t object  = VME_OBJECT(entry);
1494 	vm_size_t   objsize = __kmem_entry_orig_size(entry);
1495 
1496 	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
1497 	    "object %p has unexpected size %ld",
1498 	    map, (void *)address, (size_t)size, entry, object, objsize);
1499 }
1500 
1501 static kmem_return_t
1502 kmem_realloc_shrink_guard(
1503 	vm_map_t                map,
1504 	vm_offset_t             req_oldaddr,
1505 	vm_size_t               req_oldsize,
1506 	vm_size_t               req_newsize,
1507 	kmr_flags_t             flags,
1508 	kmem_guard_t            guard,
1509 	vm_map_entry_t          entry)
1510 {
1511 	vmr_flags_t             vmr_flags = VM_MAP_REMOVE_KUNWIRE;
1512 	vm_object_t             object;
1513 	vm_offset_t             delta = 0;
1514 	kmem_return_t           kmr;
1515 	bool                    was_atomic;
1516 	vm_size_t               oldsize = round_page(req_oldsize);
1517 	vm_size_t               newsize = round_page(req_newsize);
1518 	vm_address_t            oldaddr = req_oldaddr;
1519 
1520 #if KASAN_CLASSIC
1521 	if (flags & KMR_KASAN_GUARD) {
1522 		assert((flags & (KMR_GUARD_FIRST | KMR_GUARD_LAST)) == 0);
1523 		flags   |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1524 		oldaddr -= PAGE_SIZE;
1525 		delta    = ptoa(2);
1526 		oldsize += delta;
1527 		newsize += delta;
1528 	}
1529 #endif /* KASAN_CLASSIC */
1530 #if KASAN_TBI
1531 	if (flags & KMR_TAG) {
1532 		oldaddr = VM_KERNEL_TBI_FILL(req_oldaddr);
1533 	}
1534 #endif /* KASAN_TBI */
1535 
1536 	vm_map_lock_assert_exclusive(map);
1537 
1538 	if ((flags & KMR_KOBJECT) == 0) {
1539 		object = VME_OBJECT(entry);
1540 		vm_object_reference(object);
1541 	}
1542 
1543 	/*
1544 	 *	Shrinking an atomic entry starts with splitting it,
1545 	 *	and removing the second half.
1546 	 */
1547 	was_atomic = entry->vme_atomic;
1548 	entry->vme_atomic = false;
1549 	vm_map_clip_end(map, entry, entry->vme_start + newsize);
1550 	entry->vme_atomic = was_atomic;
1551 
1552 #if KASAN
1553 	if (entry->vme_kernel_object && was_atomic) {
1554 		entry->vme_object_or_delta = (-req_newsize & PAGE_MASK) + delta;
1555 	}
1556 #endif /* KASAN */
1557 #if KASAN_CLASSIC
1558 	if (flags & KMR_KASAN_GUARD) {
1559 		kasan_poison_range(oldaddr + newsize, oldsize - newsize,
1560 		    ASAN_VALID);
1561 	}
1562 #endif
1563 #if KASAN_TBI
1564 	if (flags & KMR_TAG) {
1565 		kasan_tbi_tag_large_free(req_oldaddr + newsize,
1566 		    oldsize - newsize);
1567 	}
1568 #endif /* KASAN_TBI */
1569 	(void)vm_map_remove_and_unlock(map,
1570 	    oldaddr + newsize, oldaddr + oldsize,
1571 	    vmr_flags, KMEM_GUARD_NONE);
1572 
1573 
1574 	/*
1575 	 *	Lastly, if there are guard pages, deal with them.
1576 	 *
1577 	 *	The kernel object just needs to depopulate,
1578 	 *	regular objects require freeing the last page
1579 	 *	and replacing it with a guard.
1580 	 */
1581 	if (flags & KMR_KOBJECT) {
1582 		if (flags & KMR_GUARD_LAST) {
1583 			kernel_memory_depopulate(oldaddr + newsize - PAGE_SIZE,
1584 			    PAGE_SIZE, KMA_KOBJECT, guard.kmg_tag);
1585 		}
1586 	} else {
1587 		vm_page_t guard_right = VM_PAGE_NULL;
1588 		vm_offset_t remove_start = newsize;
1589 
1590 		if (flags & KMR_GUARD_LAST) {
1591 			if (!map->never_faults) {
1592 				guard_right = vm_page_grab_guard(true);
1593 			}
1594 			remove_start -= PAGE_SIZE;
1595 		}
1596 
1597 		vm_object_lock(object);
1598 
1599 		if (object->vo_size != oldsize) {
1600 			__kmem_realloc_invalid_object_size_panic(map,
1601 			    req_oldaddr, req_oldsize + delta, entry);
1602 		}
1603 		vm_object_set_size(object, newsize, req_newsize);
1604 
1605 		vm_object_page_remove(object, remove_start, oldsize);
1606 
1607 		if (guard_right) {
1608 			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
1609 			guard_right->vmp_busy = false;
1610 		}
1611 		vm_object_unlock(object);
1612 		vm_object_deallocate(object);
1613 	}
1614 
1615 	kmr.kmr_address = req_oldaddr;
1616 	kmr.kmr_return  = 0;
1617 #if KASAN_CLASSIC
1618 	if (flags & KMA_KASAN_GUARD) {
1619 		kasan_alloc_large(kmr.kmr_address, req_newsize);
1620 	}
1621 #endif /* KASAN_CLASSIC */
1622 #if KASAN_TBI
1623 	if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
1624 		kmr.kmr_address = kasan_tbi_tag_large_alloc(kmr.kmr_address,
1625 		    newsize, req_newsize);
1626 	}
1627 #endif /* KASAN_TBI */
1628 
1629 	return kmr;
1630 }
1631 
1632 kmem_return_t
1633 kmem_realloc_guard(
1634 	vm_map_t                map,
1635 	vm_offset_t             req_oldaddr,
1636 	vm_size_t               req_oldsize,
1637 	vm_size_t               req_newsize,
1638 	kmr_flags_t             flags,
1639 	kmem_guard_t            guard)
1640 {
1641 	vm_object_t             object;
1642 	vm_size_t               oldsize;
1643 	vm_size_t               newsize;
1644 	vm_offset_t             delta = 0;
1645 	vm_map_offset_t         oldaddr;
1646 	vm_map_offset_t         newaddr;
1647 	vm_object_offset_t      newoffs;
1648 	vm_map_entry_t          oldentry;
1649 	vm_map_entry_t          newentry;
1650 	vm_page_t               page_list = NULL;
1651 	bool                    needs_wakeup = false;
1652 	kmem_return_t           kmr = { };
1653 	unsigned int            last_timestamp;
1654 	vm_map_kernel_flags_t   vmk_flags = {
1655 		.vmkf_last_free = (bool)(flags & KMR_LAST_FREE),
1656 	};
1657 
1658 	assert(KMEM_REALLOC_FLAGS_VALID(flags));
1659 	if (!guard.kmg_atomic && (flags & (KMR_DATA | KMR_KOBJECT)) != KMR_DATA) {
1660 		__kmem_invalid_arguments_panic("realloc", map, req_oldaddr,
1661 		    req_oldsize, flags);
1662 	}
1663 
1664 	if (req_oldaddr == 0ul) {
1665 		return kmem_alloc_guard(map, req_newsize, 0, (kma_flags_t)flags, guard);
1666 	}
1667 
1668 	if (req_newsize == 0ul) {
1669 		kmem_free_guard(map, req_oldaddr, req_oldsize,
1670 		    (kmf_flags_t)flags, guard);
1671 		return kmr;
1672 	}
1673 
1674 	if (req_newsize >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) {
1675 		__kmem_invalid_size_panic(map, req_newsize, flags);
1676 	}
1677 	if (req_newsize < __kmem_guard_size(ANYF(flags))) {
1678 		__kmem_invalid_size_panic(map, req_newsize, flags);
1679 	}
1680 
1681 	oldsize = round_page(req_oldsize);
1682 	newsize = round_page(req_newsize);
1683 	oldaddr = req_oldaddr;
1684 #if KASAN_CLASSIC
1685 	if (flags & KMR_KASAN_GUARD) {
1686 		flags   |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1687 		oldaddr -= PAGE_SIZE;
1688 		delta    = ptoa(2);
1689 		oldsize += delta;
1690 		newsize += delta;
1691 	}
1692 #endif /* KASAN_CLASSIC */
1693 #if KASAN_TBI
1694 	if (flags & KMR_TAG) {
1695 		__asan_load1(req_oldaddr);
1696 		oldaddr = VM_KERNEL_TBI_FILL(req_oldaddr);
1697 	}
1698 #endif /* KASAN_TBI */
1699 
1700 #if !KASAN
1701 	/*
1702 	 *	If not on a KASAN variant, just return.
1703 	 *
1704 	 *	Otherwise we want to validate the size
1705 	 *	and re-tag for KASAN_TBI.
1706 	 */
1707 	if (oldsize == newsize) {
1708 		kmr.kmr_address = req_oldaddr;
1709 		return kmr;
1710 	}
1711 #endif /* !KASAN */
1712 
1713 	/*
1714 	 *	If we're growing the allocation,
1715 	 *	then reserve the pages we'll need,
1716 	 *	and find a spot for its new place.
1717 	 */
1718 	if (oldsize < newsize) {
1719 #if DEBUG || DEVELOPMENT
1720 		VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1721 		    VM_KERN_REQUEST, DBG_FUNC_START,
1722 		    newsize - oldsize, 0, 0, 0);
1723 #endif /* DEBUG || DEVELOPMENT */
1724 		kmr.kmr_return = vm_page_alloc_list(atop(newsize - oldsize),
1725 		    (kma_flags_t)flags, &page_list);
1726 		if (kmr.kmr_return == KERN_SUCCESS) {
1727 			kmem_apply_security_policy(map, (kma_flags_t)flags, guard,
1728 			    newsize, 0, &vmk_flags, true);
1729 			kmr.kmr_return = vm_map_find_space(map, 0, newsize, 0,
1730 			    vmk_flags, &newentry);
1731 		}
1732 		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
1733 			if (flags & KMR_REALLOCF) {
1734 				kmem_free_guard(map, req_oldaddr, req_oldsize,
1735 				    KMF_NONE, guard);
1736 			}
1737 			if (page_list) {
1738 				vm_page_free_list(page_list, FALSE);
1739 			}
1740 #if DEBUG || DEVELOPMENT
1741 			VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1742 			    VM_KERN_REQUEST, DBG_FUNC_END,
1743 			    0, 0, 0, 0);
1744 #endif /* DEBUG || DEVELOPMENT */
1745 			return kmr;
1746 		}
1747 
1748 		/* map is locked */
1749 	} else {
1750 		vm_map_lock(map);
1751 	}
1752 
1753 
1754 	/*
1755 	 *	Locate the entry:
1756 	 *	- wait for it to quiesce,
1757 	 *	- validate its guard,
1758 	 *	- learn its correct tag.
1759 	 */
1760 again:
1761 	if (!vm_map_lookup_entry(map, oldaddr, &oldentry)) {
1762 		__kmem_entry_not_found_panic(map, req_oldaddr);
1763 	}
1764 	if ((flags & KMR_KOBJECT) && oldentry->in_transition) {
1765 		oldentry->needs_wakeup = true;
1766 		vm_map_entry_wait(map, THREAD_UNINT);
1767 		goto again;
1768 	}
1769 	kmem_entry_validate_guard(map, oldentry, oldaddr, oldsize, guard);
1770 	if (!__kmem_entry_validate_object(oldentry, ANYF(flags))) {
1771 		__kmem_entry_validate_object_panic(map, oldentry, ANYF(flags));
1772 	}
1773 	/*
1774 	 *	TODO: We should validate for non-atomic entries that the range
1775 	 *	      we are acting on is what we expect here.
1776 	 */
1777 #if KASAN
1778 	if (__kmem_entry_orig_size(oldentry) != req_oldsize) {
1779 		__kmem_realloc_invalid_object_size_panic(map,
1780 		    req_oldaddr, req_oldsize + delta, oldentry);
1781 	}
1782 
1783 	if (oldsize == newsize) {
1784 		kmr.kmr_address = req_oldaddr;
1785 		if (oldentry->vme_kernel_object) {
1786 			oldentry->vme_object_or_delta = delta +
1787 			    (-req_newsize & PAGE_MASK);
1788 		} else {
1789 			object = VME_OBJECT(oldentry);
1790 			vm_object_lock(object);
1791 			vm_object_set_size(object, newsize, req_newsize);
1792 			vm_object_unlock(object);
1793 		}
1794 		vm_map_unlock(map);
1795 
1796 #if KASAN_CLASSIC
1797 		if (flags & KMA_KASAN_GUARD) {
1798 			kasan_alloc_large(kmr.kmr_address, req_newsize);
1799 		}
1800 #endif /* KASAN_CLASSIC */
1801 #if KASAN_TBI
1802 		if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
1803 			kmr.kmr_address = kasan_tbi_tag_large_alloc(kmr.kmr_address,
1804 			    newsize, req_newsize);
1805 		}
1806 #endif /* KASAN_TBI */
1807 		return kmr;
1808 	}
1809 #endif /* KASAN */
1810 
1811 	guard.kmg_tag = VME_ALIAS(oldentry);
1812 
1813 	if (newsize < oldsize) {
1814 		return kmem_realloc_shrink_guard(map, req_oldaddr,
1815 		           req_oldsize, req_newsize, flags, guard, oldentry);
1816 	}
1817 
1818 
1819 	/*
1820 	 *	We are growing the entry
1821 	 *
1822 	 *	For regular objects we use the object `vo_size` updates
1823 	 *	as a guarantee that no two kmem_realloc() calls can happen
1824 	 *	concurrently (by doing it before the map is unlocked).
1825 	 *
1826 	 *	For the kernel object, prevent the entry from being
1827 	 *	reallocated or changed by marking it "in_transition".
1828 	 */
1829 
1830 	object = VME_OBJECT(oldentry);
1831 	vm_object_lock(object);
1832 	vm_object_reference_locked(object);
1833 
1834 	newaddr = newentry->vme_start;
1835 	newoffs = oldsize;
1836 
1837 	VME_OBJECT_SET(newentry, object, guard.kmg_atomic, guard.kmg_context);
1838 	VME_ALIAS_SET(newentry, guard.kmg_tag);
1839 	if (flags & KMR_KOBJECT) {
1840 		oldentry->in_transition = true;
1841 		VME_OFFSET_SET(newentry, newaddr);
1842 		newentry->wired_count = 1;
1843 		newoffs = newaddr + oldsize;
1844 	} else {
1845 		if (object->vo_size != oldsize) {
1846 			__kmem_realloc_invalid_object_size_panic(map,
1847 			    req_oldaddr, req_oldsize + delta, oldentry);
1848 		}
1849 		vm_object_set_size(object, newsize, req_newsize);
1850 	}
1851 
1852 	last_timestamp = map->timestamp;
1853 	vm_map_unlock(map);
1854 
1855 
1856 	/*
1857 	 *	Now proceed with the population of pages.
1858 	 *
1859 	 *	Kernel objects can use the kmem population helpers.
1860 	 *
1861 	 *	Regular objects will insert pages manually,
1862 	 *	then wire the memory into the new range.
1863 	 */
1864 
1865 	vm_size_t guard_right_size = __kmem_guard_right(ANYF(flags));
1866 
1867 	if (flags & KMR_KOBJECT) {
1868 		pmap_protect(kernel_pmap,
1869 		    oldaddr, oldaddr + oldsize - guard_right_size,
1870 		    VM_PROT_NONE);
1871 
1872 		for (vm_object_offset_t offset = 0;
1873 		    offset < oldsize - guard_right_size;
1874 		    offset += PAGE_SIZE_64) {
1875 			vm_page_t mem;
1876 
1877 			mem = vm_page_lookup(object, oldaddr + offset);
1878 			if (mem == VM_PAGE_NULL) {
1879 				continue;
1880 			}
1881 
1882 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
1883 
1884 			mem->vmp_busy = true;
1885 			vm_page_remove(mem, true);
1886 			vm_page_insert_wired(mem, object, newaddr + offset,
1887 			    guard.kmg_tag);
1888 			mem->vmp_busy = false;
1889 
1890 			kernel_memory_populate_pmap_enter(object, newaddr,
1891 			    offset, mem, VM_PROT_DEFAULT, 0);
1892 		}
1893 
1894 		kernel_memory_populate_object_and_unlock(object,
1895 		    newaddr + oldsize - guard_right_size,
1896 		    newoffs - guard_right_size,
1897 		    newsize - oldsize,
1898 		    page_list, (kma_flags_t)flags,
1899 		    guard.kmg_tag, VM_PROT_DEFAULT);
1900 	} else {
1901 		vm_page_t guard_right = VM_PAGE_NULL;
1902 		kern_return_t kr;
1903 
1904 		/*
1905 		 *	Note: we are borrowing the new entry reference
1906 		 *	on the object for the duration of this code,
1907 		 *	which works because we keep the object locked
1908 		 *	throughout.
1909 		 */
1910 		if ((flags & KMR_GUARD_LAST) && !map->never_faults) {
1911 			guard_right = vm_page_lookup(object, oldsize - PAGE_SIZE);
1912 			assert(guard_right->vmp_fictitious);
1913 			guard_right->vmp_busy = true;
1914 			vm_page_remove(guard_right, true);
1915 		}
1916 
1917 		for (vm_object_offset_t offset = oldsize - guard_right_size;
1918 		    offset < newsize - guard_right_size;
1919 		    offset += PAGE_SIZE_64) {
1920 			vm_page_t mem = page_list;
1921 
1922 			page_list = mem->vmp_snext;
1923 			mem->vmp_snext = VM_PAGE_NULL;
1924 
1925 			vm_page_insert(mem, object, offset);
1926 			mem->vmp_busy = false;
1927 		}
1928 
1929 		if (guard_right) {
1930 			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
1931 			guard_right->vmp_busy = false;
1932 		}
1933 
1934 		vm_object_unlock(object);
1935 
1936 		kr = vm_map_wire_kernel(map, newaddr, newaddr + newsize,
1937 		    VM_PROT_DEFAULT, guard.kmg_tag, FALSE);
1938 		assert(kr == KERN_SUCCESS);
1939 	}
1940 
1941 	/*
1942 	 *	Mark the entry as idle again,
1943 	 *	and honor KMR_FREEOLD if needed.
1944 	 */
1945 
1946 	vm_map_lock(map);
1947 	if (last_timestamp + 1 != map->timestamp &&
1948 	    !vm_map_lookup_entry(map, oldaddr, &oldentry)) {
1949 		__kmem_entry_not_found_panic(map, req_oldaddr);
1950 	}
1951 
1952 	if (flags & KMR_KOBJECT) {
1953 		assert(oldentry->in_transition);
1954 		oldentry->in_transition = false;
1955 		if (oldentry->needs_wakeup) {
1956 			needs_wakeup = true;
1957 			oldentry->needs_wakeup = false;
1958 		}
1959 	}
1960 
1961 	if (flags & KMR_FREEOLD) {
1962 		vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
1963 
1964 #if KASAN_CLASSIC
1965 		if (flags & KMR_KASAN_GUARD) {
1966 			kasan_poison_range(oldaddr, oldsize, ASAN_VALID);
1967 		}
1968 #endif
1969 #if KASAN_TBI
1970 		if (flags & KMR_TAG) {
1971 			kasan_tbi_tag_large_free(req_oldaddr, oldsize);
1972 		}
1973 #endif /* KASAN_TBI */
1974 		if (flags & KMR_GUARD_LAST) {
1975 			vmr_flags |= VM_MAP_REMOVE_NOKUNWIRE_LAST;
1976 		}
1977 		(void)vm_map_remove_and_unlock(map,
1978 		    oldaddr, oldaddr + oldsize,
1979 		    vmr_flags, guard);
1980 	} else {
1981 		vm_map_unlock(map);
1982 	}
1983 
1984 	if (needs_wakeup) {
1985 		vm_map_entry_wakeup(map);
1986 	}
1987 
1988 #if DEBUG || DEVELOPMENT
1989 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
1990 	    atop(newsize - oldsize), 0, 0, 0);
1991 #endif /* DEBUG || DEVELOPMENT */
1992 	kmr.kmr_address = newaddr;
1993 
1994 #if KASAN
1995 	kasan_notify_address(kmr.kmr_address, newsize);
1996 #endif /* KASAN */
1997 #if KASAN_CLASSIC
1998 	if (flags & KMR_KASAN_GUARD) {
1999 		kmr.kmr_address += PAGE_SIZE;
2000 		kasan_alloc_large(kmr.kmr_address, req_newsize);
2001 	}
2002 #endif /* KASAN_CLASSIC */
2003 #if KASAN_TBI
2004 	if (flags & KMR_TAG) {
2005 		kmr.kmr_address = kasan_tbi_tag_large_alloc(kmr.kmr_address,
2006 		    newsize, req_newsize);
2007 	}
2008 #endif /* KASAN_TBI */
2009 
2010 	return kmr;
2011 }
2012 
2013 
2014 #pragma mark free
2015 
2016 #if KASAN
2017 
2018 __abortlike
2019 static void
2020 __kmem_free_invalid_object_size_panic(
2021 	vm_map_t                map,
2022 	vm_address_t            address,
2023 	vm_size_t               size,
2024 	vm_map_entry_t          entry)
2025 {
2026 	vm_object_t object  = VME_OBJECT(entry);
2027 	vm_size_t   objsize = __kmem_entry_orig_size(entry);
2028 
2029 	panic("kmem_free(map=%p, addr=%p, size=%zd, entry=%p): "
2030 	    "object %p has unexpected size %ld",
2031 	    map, (void *)address, (size_t)size, entry, object, objsize);
2032 }
2033 
2034 #endif /* KASAN */
2035 
2036 vm_size_t
2037 kmem_free_guard(
2038 	vm_map_t        map,
2039 	vm_offset_t     req_addr,
2040 	vm_size_t       req_size,
2041 	kmf_flags_t     flags,
2042 	kmem_guard_t    guard)
2043 {
2044 	vmr_flags_t     vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2045 	vm_address_t    addr      = req_addr;
2046 	vm_offset_t     delta     = 0;
2047 	vm_size_t       size;
2048 #if KASAN
2049 	vm_map_entry_t  entry;
2050 #endif /* KASAN */
2051 
2052 	assert(map->pmap == kernel_pmap);
2053 
2054 #if KASAN_CLASSIC
2055 	if (flags & KMF_KASAN_GUARD) {
2056 		addr  -= PAGE_SIZE;
2057 		delta  = ptoa(2);
2058 	}
2059 #endif /* KASAN_CLASSIC */
2060 #if KASAN_TBI
2061 	if (flags & KMF_TAG) {
2062 		__asan_load1(req_addr);
2063 		addr = VM_KERNEL_TBI_FILL(req_addr);
2064 	}
2065 #endif /* KASAN_TBI */
2066 
2067 	if (flags & KMF_GUESS_SIZE) {
2068 		vmr_flags |= VM_MAP_REMOVE_GUESS_SIZE;
2069 		size = PAGE_SIZE;
2070 	} else if (req_size == 0) {
2071 		__kmem_invalid_size_panic(map, req_size, flags);
2072 	} else {
2073 		size = round_page(req_size) + delta;
2074 	}
2075 
2076 	vm_map_lock(map);
2077 
2078 #if KASAN
2079 	if (!vm_map_lookup_entry(map, addr, &entry)) {
2080 		__kmem_entry_not_found_panic(map, req_addr);
2081 	}
2082 	if (flags & KMF_GUESS_SIZE) {
2083 		vmr_flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
2084 		req_size = __kmem_entry_orig_size(entry);
2085 		size = round_page(req_size + delta);
2086 	} else if (guard.kmg_atomic && entry->vme_kernel_object &&
2087 	    __kmem_entry_orig_size(entry) != req_size) {
2088 		/*
2089 		 * We can't make a strict check for regular
2090 		 * VM objects because it could be:
2091 		 *
2092 		 * - the kmem_guard_free() of a kmem_realloc_guard() without
2093 		 *   KMR_FREEOLD, and in that case the object size won't match.
2094 		 *
2095 		 * - a submap, in which case there is no "orig size".
2096 		 */
2097 		__kmem_free_invalid_object_size_panic(map,
2098 		    req_addr, req_size + delta, entry);
2099 	}
2100 #endif /* KASAN */
2101 #if KASAN_CLASSIC
2102 	if (flags & KMR_KASAN_GUARD) {
2103 		kasan_poison_range(addr, size, ASAN_VALID);
2104 	}
2105 #endif
2106 #if KASAN_TBI
2107 	if (flags & KMF_TAG) {
2108 		kasan_tbi_tag_large_free(req_addr, size);
2109 	}
2110 #endif /* KASAN_TBI */
2111 	return vm_map_remove_and_unlock(map, addr, addr + size,
2112 	           vmr_flags, guard).kmr_size - delta;
2113 }
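/*
 * Illustrative usage (a sketch, not part of the original source; it only
 * uses guard fields that appear elsewhere in this file and assumes the
 * allocation was made atomically with the same guard):
 *
 *	kmem_guard_t guard = {
 *		.kmg_atomic  = true,
 *		.kmg_tag     = VM_KERN_MEMORY_OSFMK,
 *		.kmg_context = ctx,            // hypothetical context value
 *	};
 *	kmem_free_guard(kernel_map, addr, size, KMF_NONE, guard);
 *
 * Passing KMF_GUESS_SIZE instead of an exact size lets the recorded entry
 * size be used, as handled above.
 */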
2114 
2115 __exported void
2116 kmem_free_external(
2117 	vm_map_t        map,
2118 	vm_offset_t     addr,
2119 	vm_size_t       size);
2120 void
2121 kmem_free_external(
2122 	vm_map_t        map,
2123 	vm_offset_t     addr,
2124 	vm_size_t       size)
2125 {
2126 	if (size) {
2127 		kmem_free(map, trunc_page(addr), size);
2128 #if MACH_ASSERT
2129 	} else {
2130 		printf("kmem_free(map=%p, addr=%p) called with size=0, lr: %p\n",
2131 		    map, (void *)addr, __builtin_return_address(0));
2132 #endif
2133 	}
2134 }
2135 
2136 #pragma mark kmem metadata
2137 
2138 /*
2139  * Guard objects for kmem pointer allocation:
2140  *
2141  * Guard objects introduce size slabs to kmem pointer allocations that are
2142  * allocated in chunks of n * sizeclass. When an allocation of a specific
2143  * sizeclass is requested, a random slot from [0, n) is returned.
2144  * Allocations are returned from that chunk until only m slots are left. The
2145  * remaining m slots are referred to as guard objects: they don't get
2146  * allocated, and the chunk is then considered full. When an allocation is
2147  * freed back to the chunk, one of the m + 1 free slots becomes available
2148  * for the next allocation of that sizeclass.
2149  *
2150  * Guard objects are intended to make exploitation of use-after-frees harder,
2151  * as allocations that are freed can no longer be reliably reallocated.
2152  * They also make exploitation of OOBs harder as overflowing out of an
2153  * allocation can no longer be safe even with sufficient spraying.
2154  */
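/*
 * Worked example (a sketch, assuming a 16K-page configuration where
 * KMEM_CHUNK_SIZE_MIN below is 256 KB): the 16 KB sizeclass uses chunks of
 * n = 16 slots, KMEM_NUM_GUARDS = 2 of which are kept back as guards.
 * In terms of the per-chunk bitmap kept in the metadata:
 *
 *	uint32_t bm = (1u << 16) - 1;   // fresh chunk: all 16 slots free
 *	bm &= ~(1u << slot);            // an allocation takes a random free slot
 *	                                // once only the 2 guard slots remain
 *	                                // free, the chunk is treated as full
 *	bm |= 1u << slot;               // a free returns the slot; with 3 free
 *	                                // slots the chunk is partial again
 */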
2155 
2156 #define KMEM_META_PRIMARY    UINT8_MAX
2157 #define KMEM_META_START     (UINT8_MAX - 1)
2158 #define KMEM_META_FREE      (UINT8_MAX - 2)
2159 #if __ARM_16K_PG__
2160 #define KMEM_MIN_SIZE        PAGE_SIZE
2161 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 16)
2162 #else /* __ARM_16K_PG__ */
2163 /*
2164  * PAGE_SIZE isn't a compile time constant on some arm64 devices. Those
2165  * devices use 4k page size when their RAM is <= 1GB and 16k otherwise.
2166  * Therefore populate sizeclasses from 4k for those devices.
2167  */
2168 #define KMEM_MIN_SIZE       (4 * 1024)
2169 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 32)
2170 #endif /* __ARM_16K_PG__ */
2171 #define KMEM_MAX_SIZE       (32ULL << 20)
2172 #define KMEM_START_IDX      (kmem_log2down(KMEM_MIN_SIZE))
2173 #define KMEM_LAST_IDX       (kmem_log2down(KMEM_MAX_SIZE))
2174 #define KMEM_NUM_SIZECLASS  (KMEM_LAST_IDX - KMEM_START_IDX + 1)
2175 #define KMEM_FRONTS         (KMEM_RANGE_ID_NUM_PTR * 2)
2176 #define KMEM_NUM_GUARDS      2
2177 
2178 struct kmem_page_meta {
2179 	union {
2180 		/*
2181 		 * On primary allocated chunk with KMEM_META_PRIMARY marker
2182 		 */
2183 		uint32_t km_bitmap;
2184 		/*
2185 		 * On start and end of free chunk with KMEM_META_FREE marker
2186 		 */
2187 		uint32_t km_free_chunks;
2188 	};
2189 	/*
2190 	 * KMEM_META_PRIMARY: Start meta of allocated chunk
2191 	 * KMEM_META_FREE   : Start and end meta of free chunk
2192 	 * KMEM_META_START  : Meta region start and end
2193 	 */
2194 	uint8_t  km_page_marker;
2195 	uint8_t  km_sizeclass;
2196 	union {
2197 		/*
2198 		 * On primary allocated chunk with KMEM_META_PRIMARY marker
2199 		 */
2200 		uint16_t km_chunk_len;
2201 		/*
2202 		 * On secondary allocated chunks
2203 		 */
2204 		uint16_t km_page_idx;
2205 	};
2206 	LIST_ENTRY(kmem_page_meta) km_link;
2207 } kmem_page_meta_t;
2208 
2209 typedef LIST_HEAD(kmem_list_head, kmem_page_meta) kmem_list_head_t;
2210 struct kmem_sizeclass {
2211 	vm_map_size_t                   ks_size;
2212 	uint32_t                        ks_num_chunk;
2213 	uint32_t                        ks_num_elem;
2214 	crypto_random_ctx_t __zpercpu   ks_rng_ctx;
2215 	kmem_list_head_t                ks_allfree_head[KMEM_FRONTS];
2216 	kmem_list_head_t                ks_partial_head[KMEM_FRONTS];
2217 	kmem_list_head_t                ks_full_head[KMEM_FRONTS];
2218 };
2219 
2220 static struct kmem_sizeclass kmem_size_array[KMEM_NUM_SIZECLASS];
2221 
2222 /*
2223  * Locks to synchronize metadata population
2224  */
2225 static LCK_GRP_DECLARE(kmem_locks_grp, "kmem_locks");
2226 static LCK_MTX_DECLARE(kmem_meta_region_lck, &kmem_locks_grp);
2227 #define kmem_meta_lock()   lck_mtx_lock(&kmem_meta_region_lck)
2228 #define kmem_meta_unlock() lck_mtx_unlock(&kmem_meta_region_lck)
2229 
2230 static SECURITY_READ_ONLY_LATE(struct mach_vm_range)
2231 kmem_meta_range[KMEM_RANGE_ID_NUM_PTR + 1];
2232 static SECURITY_READ_ONLY_LATE(struct kmem_page_meta *)
2233 kmem_meta_base[KMEM_RANGE_ID_NUM_PTR + 1];
2234 /*
2235  * Keeps track of metadata high water mark for each front
2236  */
2237 static struct kmem_page_meta *kmem_meta_hwm[KMEM_FRONTS];
2238 static SECURITY_READ_ONLY_LATE(vm_map_t)
2239 kmem_meta_map[KMEM_RANGE_ID_NUM_PTR + 1];
2240 static vm_map_size_t kmem_meta_size;
2241 
2242 static uint32_t
2243 kmem_get_front(
2244 	kmem_range_id_t         range_id,
2245 	bool                    from_right)
2246 {
2247 	assert((range_id >= KMEM_RANGE_ID_FIRST) &&
2248 	    (range_id <= KMEM_RANGE_ID_NUM_PTR));
2249 	return (range_id - KMEM_RANGE_ID_FIRST) * 2 + from_right;
2250 }
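/*
 * For example, the first pointer range maps to front 0 (allocating from the
 * left) and front 1 (from the right), the next range to fronts 2 and 3, and
 * so on; KMEM_FRONTS == KMEM_RANGE_ID_NUM_PTR * 2 covers them all.
 */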
2251 
2252 static inline uint32_t
2253 kmem_slot_idx_to_bit(
2254 	uint32_t                slot_idx,
2255 	uint32_t                size_idx __unused)
2256 {
2257 	assert(slot_idx < kmem_size_array[size_idx].ks_num_elem);
2258 	return 1ull << slot_idx;
2259 }
2260 
2261 static uint32_t
2262 kmem_get_idx_from_size(vm_map_size_t size)
2263 {
2264 	assert(size >= KMEM_MIN_SIZE && size <= KMEM_MAX_SIZE);
2265 	return kmem_log2down(size - 1) - KMEM_START_IDX + 1;
2266 }
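/*
 * A worked mapping (assuming a 16K-page configuration, so KMEM_START_IDX
 * is 14): sizes round up to the next power-of-two sizeclass, e.g.
 *
 *	kmem_get_idx_from_size(16 * 1024) == 0    // 16 KB sizeclass
 *	kmem_get_idx_from_size(20 * 1024) == 1    // rounds up to 32 KB
 *	kmem_get_idx_from_size(32 * 1024) == 1
 *
 * and kmem_get_size_from_idx() below is the inverse for exact powers of two.
 */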
2267 
2268 __abortlike
2269 static void
2270 kmem_invalid_size_idx(uint32_t idx)
2271 {
2272 	panic("Invalid sizeclass idx %u", idx);
2273 }
2274 
2275 static vm_map_size_t
2276 kmem_get_size_from_idx(uint32_t idx)
2277 {
2278 	if (__improbable(idx >= KMEM_NUM_SIZECLASS)) {
2279 		kmem_invalid_size_idx(idx);
2280 	}
2281 	return 1ul << (idx + KMEM_START_IDX);
2282 }
2283 
2284 static inline uint16_t
2285 kmem_get_page_idx(struct kmem_page_meta *meta)
2286 {
2287 	uint8_t page_marker = meta->km_page_marker;
2288 
2289 	return (page_marker == KMEM_META_PRIMARY) ? 0 : meta->km_page_idx;
2290 }
2291 
2292 __abortlike
2293 static void
2294 kmem_invalid_chunk_len(struct kmem_page_meta *meta)
2295 {
2296 	panic("Reading free chunks for meta %p where marker != KMEM_META_PRIMARY",
2297 	    meta);
2298 }
2299 
2300 static inline uint16_t
2301 kmem_get_chunk_len(struct kmem_page_meta *meta)
2302 {
2303 	if (__improbable(meta->km_page_marker != KMEM_META_PRIMARY)) {
2304 		kmem_invalid_chunk_len(meta);
2305 	}
2306 
2307 	return meta->km_chunk_len;
2308 }
2309 
2310 __abortlike
2311 static void
2312 kmem_invalid_free_chunk_len(struct kmem_page_meta *meta)
2313 {
2314 	panic("Reading free chunks for meta %p where marker != KMEM_META_FREE",
2315 	    meta);
2316 }
2317 
2318 static inline uint32_t
2319 kmem_get_free_chunk_len(struct kmem_page_meta *meta)
2320 {
2321 	if (__improbable(meta->km_page_marker != KMEM_META_FREE)) {
2322 		kmem_invalid_free_chunk_len(meta);
2323 	}
2324 
2325 	return meta->km_free_chunks;
2326 }
2327 
2328 /*
2329  * Return the metadata corresponding to the specified address
2330  */
2331 static struct kmem_page_meta *
2332 kmem_addr_to_meta(
2333 	vm_map_offset_t         addr,
2334 	vm_map_range_id_t       range_id,
2335 	vm_map_offset_t        *range_start,
2336 	uint64_t               *meta_idx)
2337 {
2338 	struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
2339 
2340 	*range_start = kmem_ranges[range_id].min_address;
2341 	*meta_idx = (addr - *range_start) / KMEM_CHUNK_SIZE_MIN;
2342 	return &meta_base[*meta_idx];
2343 }
2344 
2345 /*
2346  * Return the metadata start of the chunk that the address belongs to
2347  */
2348 static struct kmem_page_meta *
2349 kmem_addr_to_meta_start(
2350 	vm_address_t            addr,
2351 	vm_map_range_id_t       range_id,
2352 	vm_map_offset_t        *chunk_start)
2353 {
2354 	vm_map_offset_t range_start;
2355 	uint64_t meta_idx;
2356 	struct kmem_page_meta *meta;
2357 
2358 	meta = kmem_addr_to_meta(addr, range_id, &range_start, &meta_idx);
2359 	meta_idx -= kmem_get_page_idx(meta);
2360 	meta -= kmem_get_page_idx(meta);
2361 	assert(meta->km_page_marker == KMEM_META_PRIMARY);
2362 	*chunk_start = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN);
2363 	return meta;
2364 }
2365 
2366 __startup_func
2367 static void
2368 kmem_init_meta_front(
2369 	struct kmem_page_meta  *meta,
2370 	kmem_range_id_t         range_id,
2371 	bool                    from_right)
2372 {
2373 	kernel_memory_populate(trunc_page((vm_map_offset_t) meta), PAGE_SIZE,
2374 	    KMA_KOBJECT | KMA_ZERO | KMA_NOFAIL, VM_KERN_MEMORY_OSFMK);
2375 	meta->km_page_marker = KMEM_META_START;
2376 	if (!from_right) {
2377 		meta++;
2378 		kmem_meta_base[range_id] = meta;
2379 	}
2380 	kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta;
2381 }
2382 
2383 __startup_func
2384 static void
2385 kmem_metadata_init(void)
2386 {
2387 	for (kmem_range_id_t i = KMEM_RANGE_ID_FIRST; i <= kmem_ptr_ranges; i++) {
2388 		vm_map_offset_t addr = kmem_meta_range[i].min_address;
2389 		struct kmem_page_meta *meta;
2390 		uint64_t meta_idx;
2391 
2392 		vm_map_will_allocate_early_map(&kmem_meta_map[i]);
2393 		kmem_meta_map[i] = kmem_suballoc(kernel_map, &addr, kmem_meta_size,
2394 		    VM_MAP_CREATE_NEVER_FAULTS | VM_MAP_CREATE_DISABLE_HOLELIST,
2395 		    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_PERMANENT | KMS_NOFAIL,
2396 		    VM_KERN_MEMORY_OSFMK).kmr_submap;
2397 
2398 		kmem_meta_range[i].min_address = addr;
2399 		kmem_meta_range[i].max_address = addr + kmem_meta_size;
2400 
2401 		meta = (struct kmem_page_meta *) kmem_meta_range[i].min_address;
2402 		kmem_init_meta_front(meta, i, 0);
2403 
2404 		meta = kmem_addr_to_meta(kmem_ranges[i].max_address, i, &addr,
2405 		    &meta_idx);
2406 		kmem_init_meta_front(meta, i, 1);
2407 	}
2408 }
2409 
2410 __startup_func
2411 static void
2412 kmem_init_front_head(
2413 	struct kmem_sizeclass  *ks,
2414 	uint32_t                front)
2415 {
2416 	LIST_INIT(&ks->ks_allfree_head[front]);
2417 	LIST_INIT(&ks->ks_partial_head[front]);
2418 	LIST_INIT(&ks->ks_full_head[front]);
2419 }
2420 
2421 __startup_func
2422 static void
2423 kmem_sizeclass_init(void)
2424 {
2425 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2426 		struct kmem_sizeclass *ks = &kmem_size_array[i];
2427 		kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
2428 
2429 		ks->ks_size = kmem_get_size_from_idx(i);
2430 		ks->ks_num_chunk = roundup(8 * ks->ks_size, KMEM_CHUNK_SIZE_MIN) /
2431 		    KMEM_CHUNK_SIZE_MIN;
2432 		ks->ks_num_elem = (ks->ks_num_chunk * KMEM_CHUNK_SIZE_MIN) / ks->ks_size;
2433 		assert(ks->ks_num_elem <=
2434 		    (sizeof(((struct kmem_page_meta *)0)->km_bitmap) * 8));
2435 		for (; range_id <= KMEM_RANGE_ID_NUM_PTR; range_id++) {
2436 			kmem_init_front_head(ks, kmem_get_front(range_id, 0));
2437 			kmem_init_front_head(ks, kmem_get_front(range_id, 1));
2438 		}
2439 	}
2440 }
2441 
2442 /*
2443  * This is done during EARLY_BOOT as it needs the corecrypto module to be
2444  * set up.
2445  */
2446 __startup_func
2447 static void
2448 kmem_crypto_init(void)
2449 {
2450 	vm_size_t ctx_size = crypto_random_kmem_ctx_size();
2451 
2452 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2453 		struct kmem_sizeclass *ks = &kmem_size_array[i];
2454 
2455 		ks->ks_rng_ctx = zalloc_percpu_permanent(ctx_size, ZALIGN_PTR);
2456 		zpercpu_foreach(ctx, ks->ks_rng_ctx) {
2457 			crypto_random_kmem_init(ctx);
2458 		}
2459 	}
2460 }
2461 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, kmem_crypto_init);
2462 
2463 __abortlike
2464 static void
2465 kmem_validate_slot_panic(
2466 	vm_map_offset_t         addr,
2467 	struct kmem_page_meta  *meta,
2468 	uint32_t                slot_idx,
2469 	uint32_t                size_idx)
2470 {
2471 	if (meta->km_page_marker != KMEM_META_PRIMARY) {
2472 		panic("Metadata (%p) for addr (%p) not primary", meta, (void *)addr);
2473 	}
2474 	if (meta->km_sizeclass != size_idx) {
2475 		panic("Metadata's (%p) sizeclass (%u != %u) changed during deletion",
2476 		    meta, meta->km_sizeclass, size_idx);
2477 	}
2478 	panic("Double free detected: Slot (%u) in meta (%p) for addr %p marked free",
2479 	    slot_idx, meta, (void *)addr);
2480 }
2481 
2482 __abortlike
2483 static void
2484 kmem_invalid_slot_for_addr(
2485 	mach_vm_range_t         slot,
2486 	vm_map_offset_t         start,
2487 	vm_map_offset_t         end)
2488 {
2489 	panic("Invalid kmem ptr slot [%p:%p] for allocation [%p:%p]",
2490 	    (void *)slot->min_address, (void *)slot->max_address,
2491 	    (void *)start, (void *)end);
2492 }
2493 
2494 void
2495 kmem_validate_slot(
2496 	vm_map_offset_t         addr,
2497 	struct kmem_page_meta  *meta,
2498 	uint32_t                size_idx,
2499 	uint32_t                slot_idx)
2500 {
2501 	if ((meta->km_page_marker != KMEM_META_PRIMARY) ||
2502 	    (meta->km_sizeclass != size_idx) ||
2503 	    ((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) != 0)) {
2504 		kmem_validate_slot_panic(addr, meta, slot_idx, size_idx);
2505 	}
2506 }
2507 
2508 static void
2509 kmem_validate_slot_initial(
2510 	mach_vm_range_t         slot,
2511 	vm_map_offset_t         start,
2512 	vm_map_offset_t         end,
2513 	struct kmem_page_meta  *meta,
2514 	uint32_t                size_idx,
2515 	uint32_t                slot_idx)
2516 {
2517 	if ((slot->min_address == 0) || (slot->max_address == 0) ||
2518 	    (start < slot->min_address) || (start >= slot->max_address) ||
2519 	    (end > slot->max_address)) {
2520 		kmem_invalid_slot_for_addr(slot, start, end);
2521 	}
2522 
2523 	kmem_validate_slot(start, meta, size_idx, slot_idx);
2524 }
2525 
2526 uint32_t
2527 kmem_addr_get_slot_idx(
2528 	vm_map_offset_t         start,
2529 	vm_map_offset_t         end,
2530 	vm_map_range_id_t       range_id,
2531 	struct kmem_page_meta **meta,
2532 	uint32_t               *size_idx,
2533 	mach_vm_range_t         slot)
2534 {
2535 	vm_map_offset_t chunk_start;
2536 	vm_map_size_t slot_size;
2537 	uint32_t slot_idx;
2538 
2539 	*meta = kmem_addr_to_meta_start(start, range_id, &chunk_start);
2540 	*size_idx = (*meta)->km_sizeclass;
2541 	slot_size = kmem_get_size_from_idx(*size_idx);
2542 	slot_idx = (start - chunk_start) / slot_size;
2543 	slot->min_address = chunk_start + slot_idx * slot_size;
2544 	slot->max_address = slot->min_address + slot_size;
2545 
2546 	kmem_validate_slot_initial(slot, start, end, *meta, *size_idx, slot_idx);
2547 
2548 	return slot_idx;
2549 }
2550 
2551 static bool
2552 kmem_populate_needed(vm_offset_t from, vm_offset_t to)
2553 {
2554 #if KASAN
2555 #pragma unused(from, to)
2556 	return true;
2557 #else
2558 	vm_offset_t page_addr = trunc_page(from);
2559 
2560 	for (; page_addr < to; page_addr += PAGE_SIZE) {
2561 		/*
2562 		 * This can race with another thread doing a populate on the same metadata
2563 		 * page, where we see an updated pmap but unmapped KASan shadow, causing a
2564 		 * fault in the shadow when we first access the metadata page. Avoid this
2565 		 * by always synchronizing on the kmem_meta_lock with KASan.
2566 		 */
2567 		if (!pmap_find_phys(kernel_pmap, page_addr)) {
2568 			return true;
2569 		}
2570 	}
2571 
2572 	return false;
2573 #endif /* !KASAN */
2574 }
2575 
2576 static void
2577 kmem_populate_meta_locked(vm_offset_t from, vm_offset_t to)
2578 {
2579 	vm_offset_t page_addr = trunc_page(from);
2580 
2581 	vm_map_unlock(kernel_map);
2582 
2583 	for (; page_addr < to; page_addr += PAGE_SIZE) {
2584 		for (;;) {
2585 			kern_return_t ret = KERN_SUCCESS;
2586 
2587 			/*
2588 			 * All updates to kmem metadata are done under the kmem_meta_lock
2589 			 */
2590 			kmem_meta_lock();
2591 			if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
2592 				ret = kernel_memory_populate(page_addr,
2593 				    PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
2594 				    VM_KERN_MEMORY_OSFMK);
2595 			}
2596 			kmem_meta_unlock();
2597 
2598 			if (ret == KERN_SUCCESS) {
2599 				break;
2600 			}
2601 
2602 			/*
2603 			 * We can't pass KMA_NOPAGEWAIT under a global lock as it leads
2604 			 * to bad system deadlocks, so if the allocation failed,
2605 			 * we need to do the VM_PAGE_WAIT() outside of the lock.
2606 			 */
2607 			VM_PAGE_WAIT();
2608 		}
2609 	}
2610 
2611 	vm_map_lock(kernel_map);
2612 }
2613 
2614 __abortlike
2615 static void
2616 kmem_invalid_meta_panic(
2617 	struct kmem_page_meta  *meta,
2618 	uint32_t                slot_idx,
2619 	struct kmem_sizeclass   sizeclass)
2620 {
2621 	uint32_t size_idx = kmem_get_idx_from_size(sizeclass.ks_size);
2622 
2623 	if (slot_idx >= sizeclass.ks_num_elem) {
2624 		panic("Invalid slot idx %u [0:%u] for meta %p", slot_idx,
2625 		    sizeclass.ks_num_elem, meta);
2626 	}
2627 	if (meta->km_sizeclass != size_idx) {
2628 		panic("Invalid size_idx (%u != %u) in meta %p", size_idx,
2629 		    meta->km_sizeclass, meta);
2630 	}
2631 	panic("page_marker %u not primary in meta %p", meta->km_page_marker, meta);
2632 }
2633 
2634 __abortlike
2635 static void
2636 kmem_slot_has_entry_panic(
2637 	vm_map_entry_t          entry,
2638 	vm_map_offset_t         addr)
2639 {
2640 	panic("Entry (%p) already exists for addr (%p) being returned",
2641 	    entry, (void *)addr);
2642 }
2643 
2644 __abortlike
2645 static void
2646 kmem_slot_not_found(
2647 	struct kmem_page_meta  *meta,
2648 	uint32_t                slot_idx)
2649 {
2650 	panic("%uth free slot not found for meta %p bitmap %u", slot_idx, meta,
2651 	    meta->km_bitmap);
2652 }
2653 
2654 /*
2655  * Returns a 16-bit random number between 0 and
2656  * upper_limit (inclusive)
2657  */
2658 __startup_func
2659 uint16_t
2660 kmem_get_random16(
2661 	uint16_t                upper_limit)
2662 {
2663 	static uint64_t random_entropy;
2664 	assert(upper_limit < UINT16_MAX);
2665 	if (random_entropy == 0) {
2666 		random_entropy = early_random();
2667 	}
2668 	uint32_t result = random_entropy & UINT32_MAX;
2669 	random_entropy >>= 32;
2670 	return (uint16_t)(result % (upper_limit + 1));
2671 }
2672 
2673 static uint32_t
2674 kmem_get_nth_free_slot(
2675 	struct kmem_page_meta  *meta,
2676 	uint32_t                n,
2677 	uint32_t                bitmap)
2678 {
2679 	uint32_t zeros_seen = 0, ones_seen = 0;
2680 
2681 	while (bitmap) {
2682 		uint32_t count = __builtin_ctz(bitmap);
2683 
2684 		zeros_seen += count;
2685 		bitmap >>= count;
2686 		if (__probable(~bitmap)) {
2687 			count = __builtin_ctz(~bitmap);
2688 		} else {
2689 			count = 32;
2690 		}
2691 		if (count + ones_seen > n) {
2692 			return zeros_seen + n;
2693 		}
2694 		ones_seen += count;
2695 		bitmap >>= count;
2696 	}
2697 
2698 	kmem_slot_not_found(meta, n);
2699 }
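/*
 * For example, with bitmap == 0xb4 (free slots at bit positions 2, 4, 5
 * and 7), kmem_get_nth_free_slot(meta, 2, 0xb4) walks a run of two zeros,
 * one one, one zero, then a run of two ones and returns 5: the third
 * (n == 2, zero-based) free slot.
 */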
2700 
2701 
2702 static uint32_t
2703 kmem_get_next_slot(
2704 	struct kmem_page_meta  *meta,
2705 	struct kmem_sizeclass   sizeclass,
2706 	uint32_t                bitmap)
2707 {
2708 	uint32_t num_slots = __builtin_popcount(bitmap);
2709 	uint64_t slot_idx = 0;
2710 
2711 	assert(num_slots > 0);
2712 	if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
2713 		/*
2714 		 * Use early random prior to early boot as the ks_rng_ctx requires
2715 		 * the corecrypto module to be setup before it is initialized and
2716 		 * used.
2717 		 *
2718 		 * num_slots can't be 0 as we take this path when we have more than
2719 		 * one slot left.
2720 		 */
2721 		slot_idx = kmem_get_random16((uint16_t)num_slots - 1);
2722 	} else {
2723 		crypto_random_uniform(zpercpu_get(sizeclass.ks_rng_ctx), num_slots,
2724 		    &slot_idx);
2725 	}
2726 
2727 	return kmem_get_nth_free_slot(meta, slot_idx, bitmap);
2728 }
2729 
2730 /*
2731  * Returns an unallocated slot from the given metadata
2732  */
2733 static vm_map_offset_t
2734 kmem_get_addr_from_meta(
2735 	struct kmem_page_meta  *meta,
2736 	vm_map_range_id_t       range_id,
2737 	struct kmem_sizeclass   sizeclass,
2738 	vm_map_entry_t         *entry)
2739 {
2740 	vm_map_offset_t addr;
2741 	vm_map_size_t size = sizeclass.ks_size;
2742 	uint32_t size_idx = kmem_get_idx_from_size(size);
2743 	uint64_t meta_idx = meta - kmem_meta_base[range_id];
2744 	mach_vm_offset_t range_start = kmem_ranges[range_id].min_address;
2745 	uint32_t slot_bit;
2746 	uint32_t slot_idx = kmem_get_next_slot(meta, sizeclass, meta->km_bitmap);
2747 
2748 	if ((slot_idx >= sizeclass.ks_num_elem) ||
2749 	    (meta->km_sizeclass != size_idx) ||
2750 	    (meta->km_page_marker != KMEM_META_PRIMARY)) {
2751 		kmem_invalid_meta_panic(meta, slot_idx, sizeclass);
2752 	}
2753 
2754 	slot_bit = kmem_slot_idx_to_bit(slot_idx, size_idx);
2755 	meta->km_bitmap &= ~slot_bit;
2756 
2757 	addr = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN) + (slot_idx * size);
2758 	assert(kmem_range_contains_fully(range_id, addr, size));
2759 	if (vm_map_lookup_entry(kernel_map, addr, entry)) {
2760 		kmem_slot_has_entry_panic(*entry, addr);
2761 	}
2762 	if ((*entry != vm_map_to_entry(kernel_map)) &&
2763 	    ((*entry)->vme_next != vm_map_to_entry(kernel_map)) &&
2764 	    ((*entry)->vme_next->vme_start < (addr + size))) {
2765 		kmem_slot_has_entry_panic(*entry, addr);
2766 	}
2767 	return addr;
2768 }
2769 
2770 __abortlike
2771 static void
2772 kmem_range_out_of_va(
2773 	kmem_range_id_t         range_id,
2774 	uint32_t                num_chunks)
2775 {
2776 	panic("No more VA to allocate %u chunks in range %u", num_chunks, range_id);
2777 }
2778 
2779 static void
2780 kmem_init_allocated_chunk(
2781 	struct kmem_page_meta  *meta,
2782 	struct kmem_sizeclass   sizeclass,
2783 	uint32_t                size_idx)
2784 {
2785 	uint32_t meta_num = sizeclass.ks_num_chunk;
2786 	uint32_t num_elem = sizeclass.ks_num_elem;
2787 
2788 	meta->km_bitmap = (1ull << num_elem) - 1;
2789 	meta->km_chunk_len = (uint16_t)meta_num;
2790 	assert(LIST_NEXT(meta, km_link) == NULL);
2791 	assert(meta->km_link.le_prev == NULL);
2792 	meta->km_sizeclass = (uint8_t)size_idx;
2793 	meta->km_page_marker = KMEM_META_PRIMARY;
2794 	meta++;
2795 	for (uint32_t i = 1; i < meta_num; i++) {
2796 		meta->km_page_idx = (uint16_t)i;
2797 		meta->km_sizeclass = (uint8_t)size_idx;
2798 		meta->km_page_marker = 0;
2799 		meta->km_bitmap = 0;
2800 		meta++;
2801 	}
2802 }
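/*
 * For instance (a sketch assuming a 16K-page configuration), the 1 MB
 * sizeclass spans ks_num_chunk = 32 chunks of 256 KB with ks_num_elem = 8
 * slots: meta[0] becomes the primary (km_page_marker == KMEM_META_PRIMARY,
 * km_chunk_len == 32, km_bitmap == 0xff), and meta[1..31] only record
 * their km_page_idx so kmem_addr_to_meta_start() can walk back to it.
 */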
2803 
2804 static uint32_t
2805 kmem_get_additional_meta(
2806 	struct kmem_page_meta  *meta,
2807 	uint32_t                meta_req,
2808 	bool                    from_right,
2809 	struct kmem_page_meta **adj_free_meta)
2810 {
2811 	struct kmem_page_meta *meta_prev = from_right ? meta : (meta - 1);
2812 
2813 	if (meta_prev->km_page_marker == KMEM_META_FREE) {
2814 		uint32_t chunk_len = kmem_get_free_chunk_len(meta_prev);
2815 
2816 		*adj_free_meta = from_right ? meta_prev : (meta_prev - chunk_len + 1);
2817 		meta_req -= chunk_len;
2818 	} else {
2819 		*adj_free_meta = NULL;
2820 	}
2821 
2822 	return meta_req;
2823 }
2824 
2825 
2826 static struct kmem_page_meta *
2827 kmem_get_new_chunk(
2828 	vm_map_range_id_t       range_id,
2829 	bool                    from_right,
2830 	uint32_t                size_idx)
2831 {
2832 	struct kmem_sizeclass sizeclass = kmem_size_array[size_idx];
2833 	struct kmem_page_meta *start, *end, *meta_update;
2834 	struct kmem_page_meta *adj_free_meta = NULL;
2835 	uint32_t meta_req = sizeclass.ks_num_chunk;
2836 
2837 	for (;;) {
2838 		struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
2839 		struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
2840 		struct kmem_page_meta *meta;
2841 		vm_offset_t start_addr, end_addr;
2842 		uint32_t meta_num;
2843 
2844 		meta = from_right ? metab : metaf;
2845 		meta_num = kmem_get_additional_meta(meta, meta_req, from_right,
2846 		    &adj_free_meta);
2847 
2848 		if (metaf + meta_num >= metab) {
2849 			kmem_range_out_of_va(range_id, meta_num);
2850 		}
2851 
2852 		start = from_right ? (metab - meta_num) : metaf;
2853 		end = from_right ? metab : (metaf + meta_num);
2854 
2855 		start_addr = (vm_offset_t)start;
2856 		end_addr   = (vm_offset_t)end;
2857 
2858 		/*
2859 		 * If the new high watermark stays on the same page,
2860 		 * there is no need to populate (and thus to drop the lock).
2861 		 */
2862 		if (!page_aligned(from_right ? end_addr : start_addr) &&
2863 		    trunc_page(start_addr) == trunc_page(end_addr - 1)) {
2864 			break;
2865 		}
2866 		if (!kmem_populate_needed(start_addr, end_addr)) {
2867 			break;
2868 		}
2869 
2870 		kmem_populate_meta_locked(start_addr, end_addr);
2871 
2872 		/*
2873 		 * Since we dropped the lock, reassess whether the conditions still hold:
2874 		 * - the HWM we are changing must not have moved
2875 		 * - the other HWM must not intersect with ours
2876 		 * - in case of coalescing, the adjacent free meta must still
2877 		 *   be free and of the same size.
2878 		 *
2879 		 * If we failed to grow, reevaluate whether freelists have
2880 		 * entries now by returning NULL.
2881 		 */
2882 		metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
2883 		metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
2884 		if (meta != (from_right ? metab : metaf)) {
2885 			return NULL;
2886 		}
2887 		if (metaf + meta_num >= metab) {
2888 			kmem_range_out_of_va(range_id, meta_num);
2889 		}
2890 		if (adj_free_meta) {
2891 			if (adj_free_meta->km_page_marker != KMEM_META_FREE ||
2892 			    kmem_get_free_chunk_len(adj_free_meta) !=
2893 			    meta_req - meta_num) {
2894 				return NULL;
2895 			}
2896 		}
2897 
2898 		break;
2899 	}
2900 
2901 	/*
2902 	 * If there is an adjacent free chunk remove it from free list
2903 	 */
2904 	if (adj_free_meta) {
2905 		LIST_REMOVE(adj_free_meta, km_link);
2906 		LIST_NEXT(adj_free_meta, km_link) = NULL;
2907 		adj_free_meta->km_link.le_prev = NULL;
2908 	}
2909 
2910 	/*
2911 	 * Update hwm
2912 	 */
2913 	meta_update = from_right ? start : end;
2914 	kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta_update;
2915 
2916 	/*
2917 	 * Initialize metadata
2918 	 */
2919 	start = from_right ? start : (end - meta_req);
2920 	kmem_init_allocated_chunk(start, sizeclass, size_idx);
2921 
2922 	return start;
2923 }
2924 
2925 static void
2926 kmem_requeue_meta(
2927 	struct kmem_page_meta  *meta,
2928 	struct kmem_list_head  *head)
2929 {
2930 	LIST_REMOVE(meta, km_link);
2931 	LIST_INSERT_HEAD(head, meta, km_link);
2932 }
2933 
2934 /*
2935  * Return corresponding sizeclass to stash free chunks in
2936  */
2937 __abortlike
2938 static void
2939 kmem_invalid_chunk_num(uint32_t chunks)
2940 {
2941 	panic("Invalid number of chunks %u\n", chunks);
2942 }
2943 
2944 static uint32_t
2945 kmem_get_size_idx_for_chunks(uint32_t chunks)
2946 {
2947 	for (uint32_t i = KMEM_NUM_SIZECLASS - 1; i > 0; i--) {
2948 		if (chunks >= kmem_size_array[i].ks_num_chunk) {
2949 			return i;
2950 		}
2951 	}
2952 	kmem_invalid_chunk_num(chunks);
2953 }
2954 
2955 static void
2956 kmem_clear_meta_range(struct kmem_page_meta *meta, uint32_t count)
2957 {
2958 	bzero(meta, count * sizeof(struct kmem_page_meta));
2959 }
2960 
2961 static void
2962 kmem_check_meta_range_is_clear(struct kmem_page_meta *meta, uint32_t count)
2963 {
2964 #if MACH_ASSERT
2965 	size_t size = count * sizeof(struct kmem_page_meta);
2966 
2967 	assert(memcmp_zero_ptr_aligned(meta, size) == 0);
2968 #else
2969 #pragma unused(meta, count)
2970 #endif
2971 }
2972 
2973 /*!
2974  * @function kmem_init_free_chunk()
2975  *
2976  * @discussion
2977  * This function prepares a range of chunks to be put on a free list.
2978  * The first and last metadata might be dirty, but the "inner" ones
2979  * must be zero filled by the caller prior to calling this function.
2980  */
2981 static void
2982 kmem_init_free_chunk(
2983 	struct kmem_page_meta  *meta,
2984 	uint32_t                num_chunks,
2985 	uint32_t                front)
2986 {
2987 	struct kmem_sizeclass *sizeclass;
2988 	uint32_t size_idx = kmem_get_size_idx_for_chunks(num_chunks);
2989 
2990 	if (num_chunks > 2) {
2991 		kmem_check_meta_range_is_clear(meta + 1, num_chunks - 2);
2992 	}
2993 
2994 	meta[0] = (struct kmem_page_meta){
2995 		.km_free_chunks = num_chunks,
2996 		.km_page_marker = KMEM_META_FREE,
2997 		.km_sizeclass   = (uint8_t)size_idx,
2998 	};
2999 	if (num_chunks > 1) {
3000 		meta[num_chunks - 1] = (struct kmem_page_meta){
3001 			.km_free_chunks = num_chunks,
3002 			.km_page_marker = KMEM_META_FREE,
3003 			.km_sizeclass   = (uint8_t)size_idx,
3004 		};
3005 	}
3006 
3007 	sizeclass = &kmem_size_array[size_idx];
3008 	LIST_INSERT_HEAD(&sizeclass->ks_allfree_head[front], meta, km_link);
3009 }
3010 
3011 static struct kmem_page_meta *
3012 kmem_get_free_chunk_from_list(
3013 	struct kmem_sizeclass  *org_sizeclass,
3014 	uint32_t                size_idx,
3015 	uint32_t                front)
3016 {
3017 	struct kmem_sizeclass *sizeclass;
3018 	uint32_t num_chunks = org_sizeclass->ks_num_chunk;
3019 	struct kmem_page_meta *meta;
3020 	uint32_t idx = size_idx;
3021 
3022 	while (idx < KMEM_NUM_SIZECLASS) {
3023 		sizeclass = &kmem_size_array[idx];
3024 		meta = LIST_FIRST(&sizeclass->ks_allfree_head[front]);
3025 		if (meta) {
3026 			break;
3027 		}
3028 		idx++;
3029 	}
3030 
3031 	/*
3032 	 * Trim if larger in size
3033 	 */
3034 	if (meta) {
3035 		uint32_t num_chunks_free = kmem_get_free_chunk_len(meta);
3036 
3037 		assert(meta->km_page_marker == KMEM_META_FREE);
3038 		LIST_REMOVE(meta, km_link);
3039 		LIST_NEXT(meta, km_link) = NULL;
3040 		meta->km_link.le_prev = NULL;
3041 		if (num_chunks_free > num_chunks) {
3042 			num_chunks_free -= num_chunks;
3043 			kmem_init_free_chunk(meta + num_chunks, num_chunks_free, front);
3044 		}
3045 
3046 		kmem_init_allocated_chunk(meta, *org_sizeclass, size_idx);
3047 	}
3048 
3049 	return meta;
3050 }
3051 
3052 kern_return_t
3053 kmem_locate_space(
3054 	vm_map_size_t           size,
3055 	vm_map_range_id_t       range_id,
3056 	bool                    from_right,
3057 	vm_map_offset_t        *start_inout,
3058 	vm_map_entry_t         *entry_out)
3059 {
3060 	vm_map_entry_t entry;
3061 	uint32_t size_idx = kmem_get_idx_from_size(size);
3062 	uint32_t front = kmem_get_front(range_id, from_right);
3063 	struct kmem_sizeclass *sizeclass = &kmem_size_array[size_idx];
3064 	struct kmem_page_meta *meta;
3065 
3066 	assert(size <= sizeclass->ks_size);
3067 again:
3068 	if ((meta = LIST_FIRST(&sizeclass->ks_partial_head[front])) != NULL) {
3069 		*start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3070 		/*
3071 		 * Requeue to full if necessary
3072 		 */
3073 		assert(meta->km_page_marker == KMEM_META_PRIMARY);
3074 		if (__builtin_popcount(meta->km_bitmap) == KMEM_NUM_GUARDS) {
3075 			kmem_requeue_meta(meta, &sizeclass->ks_full_head[front]);
3076 		}
3077 	} else if ((meta = kmem_get_free_chunk_from_list(sizeclass, size_idx,
3078 	    front)) != NULL) {
3079 		*start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3080 		/*
3081 		 * Queue to partial
3082 		 */
3083 		assert(meta->km_page_marker == KMEM_META_PRIMARY);
3084 		assert(__builtin_popcount(meta->km_bitmap) > KMEM_NUM_GUARDS);
3085 		LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link);
3086 	} else {
3087 		meta = kmem_get_new_chunk(range_id, from_right, size_idx);
3088 		if (meta == NULL) {
3089 			goto again;
3090 		}
3091 		*start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3092 		assert(meta->km_page_marker == KMEM_META_PRIMARY);
3093 		LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link);
3094 	}
3095 
3096 	if (entry_out) {
3097 		*entry_out = entry;
3098 	}
3099 
3100 	return KERN_SUCCESS;
3101 }
3102 
3103 /*
3104  * Determine whether the given metadata was allocated from the right
3105  */
3106 static bool
3107 kmem_meta_is_from_right(
3108 	kmem_range_id_t         range_id,
3109 	struct kmem_page_meta  *meta)
3110 {
3111 	struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3112 #if DEBUG || DEVELOPMENT
3113 	struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3114 #endif
3115 	struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
3116 	struct kmem_page_meta *meta_end;
3117 
3118 	meta_end = (struct kmem_page_meta *)kmem_meta_range[range_id].max_address;
3119 
3120 	if ((meta >= meta_base) && (meta < metaf)) {
3121 		return false;
3122 	}
3123 
3124 	assert(meta >= metab && meta < meta_end);
3125 	return true;
3126 }
3127 
3128 static void
3129 kmem_free_chunk(
3130 	kmem_range_id_t         range_id,
3131 	struct kmem_page_meta  *meta,
3132 	bool                    from_right)
3133 {
3134 	struct kmem_page_meta *meta_coalesce = meta - 1;
3135 	struct kmem_page_meta *meta_start = meta;
3136 	uint32_t num_chunks = kmem_get_chunk_len(meta);
3137 	uint32_t add_chunks;
3138 	struct kmem_page_meta *meta_end = meta + num_chunks;
3139 	struct kmem_page_meta *meta_hwm_l, *meta_hwm_r;
3140 	uint32_t front = kmem_get_front(range_id, from_right);
3141 
3142 	meta_hwm_l = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3143 	meta_hwm_r = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3144 
3145 	LIST_REMOVE(meta, km_link);
3146 	kmem_clear_meta_range(meta, num_chunks);
3147 
3148 	/*
3149 	 * Coalesce left
3150 	 */
3151 	if (((from_right && (meta_coalesce >= meta_hwm_r)) || !from_right) &&
3152 	    (meta_coalesce->km_page_marker == KMEM_META_FREE)) {
3153 		meta_start = meta_coalesce - kmem_get_free_chunk_len(meta_coalesce) + 1;
3154 		add_chunks = kmem_get_free_chunk_len(meta_start);
3155 		num_chunks += add_chunks;
3156 		LIST_REMOVE(meta_start, km_link);
3157 		kmem_clear_meta_range(meta_start + add_chunks - 1, 1);
3158 	}
3159 
3160 	/*
3161 	 * Coalesce right
3162 	 */
3163 	if (((!from_right && (meta_end < meta_hwm_l)) || from_right) &&
3164 	    (meta_end->km_page_marker == KMEM_META_FREE)) {
3165 		add_chunks = kmem_get_free_chunk_len(meta_end);
3166 		LIST_REMOVE(meta_end, km_link);
3167 		kmem_clear_meta_range(meta_end, 1);
3168 		meta_end = meta_end + add_chunks;
3169 		num_chunks += add_chunks;
3170 	}
3171 
3172 	kmem_init_free_chunk(meta_start, num_chunks, front);
3173 }
3174 
3175 static void
3176 kmem_free_slot(
3177 	kmem_range_id_t         range_id,
3178 	mach_vm_range_t         slot)
3179 {
3180 	struct kmem_page_meta *meta;
3181 	vm_map_offset_t chunk_start;
3182 	uint32_t size_idx, chunk_elem, slot_idx, num_elem;
3183 	struct kmem_sizeclass *sizeclass;
3184 	vm_map_size_t slot_size;
3185 
3186 	meta = kmem_addr_to_meta_start(slot->min_address, range_id, &chunk_start);
3187 	size_idx = meta->km_sizeclass;
3188 	slot_size = kmem_get_size_from_idx(size_idx);
3189 	slot_idx = (slot->min_address - chunk_start) / slot_size;
3190 	assert((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) == 0);
3191 	meta->km_bitmap |= kmem_slot_idx_to_bit(slot_idx, size_idx);
3192 
3193 	sizeclass = &kmem_size_array[size_idx];
3194 	chunk_elem = sizeclass->ks_num_elem;
3195 	num_elem = __builtin_popcount(meta->km_bitmap);
3196 
3197 	if (num_elem == chunk_elem) {
3198 		/*
3199 		 * If the entire chunk is empty, add it to the empty list
3200 		 */
3201 		bool from_right = kmem_meta_is_from_right(range_id, meta);
3202 
3203 		kmem_free_chunk(range_id, meta, from_right);
3204 	} else if (num_elem == KMEM_NUM_GUARDS + 1) {
3205 		/*
3206 		 * If we freed into a full chunk, move it to the partial list
3207 		 */
3208 		uint32_t front = kmem_get_front(range_id,
3209 		    kmem_meta_is_from_right(range_id, meta));
3210 
3211 		kmem_requeue_meta(meta, &sizeclass->ks_partial_head[front]);
3212 	}
3213 }
3214 
3215 void
3216 kmem_free_space(
3217 	vm_map_offset_t         start,
3218 	vm_map_offset_t         end,
3219 	vm_map_range_id_t       range_id,
3220 	mach_vm_range_t         slot)
3221 {
3222 	bool entry_present = false;
3223 	vm_map_entry_t prev_entry;
3224 	vm_map_entry_t next_entry;
3225 
3226 	if ((slot->min_address == start) && (slot->max_address == end)) {
3227 		/*
3228 		 * Entire slot is being freed at once
3229 		 */
3230 		return kmem_free_slot(range_id, slot);
3231 	}
3232 
3233 	entry_present = vm_map_lookup_entry(kernel_map, start, &prev_entry);
3234 	assert(!entry_present);
3235 	next_entry = prev_entry->vme_next;
3236 
3237 	if (((prev_entry == vm_map_to_entry(kernel_map) ||
3238 	    prev_entry->vme_end <= slot->min_address)) &&
3239 	    (next_entry == vm_map_to_entry(kernel_map) ||
3240 	    (next_entry->vme_start >= slot->max_address))) {
3241 		/*
3242 		 * Free entire slot
3243 		 */
3244 		kmem_free_slot(range_id, slot);
3245 	}
3246 }
3247 
3248 #pragma mark kmem init
3249 
3250 /*
3251  * The default percentage of memory that can be mlocked is scaled based on the total
3252  * amount of memory in the system. These percentages are calculated
3253  * offline and stored in this table. We index this table by
3254  * log2(max_mem) - VM_USER_WIREABLE_MIN_CONFIG. We clamp this index in the range
3255  * [0, sizeof(wire_limit_percents) / sizeof(vm_map_size_t))
3256  *
3257  * Note that these values were picked for mac.
3258  * If we ever have very large memory config arm devices, we may want to revisit
3259  * since the kernel overhead is smaller there due to the larger page size.
3260  */
3261 
3262 /* Start scaling iff we're managing > 2^32 = 4GB of RAM. */
3263 #define VM_USER_WIREABLE_MIN_CONFIG 32
3264 #if CONFIG_JETSAM
3265 /* Systems with jetsam can wire a bit more b/c the system can relieve wired
3266  * pressure.
3267  */
3268 static vm_map_size_t wire_limit_percents[] =
3269 { 80, 80, 80, 80, 82, 85, 88, 91, 94, 97};
3270 #else
3271 static vm_map_size_t wire_limit_percents[] =
3272 { 70, 73, 76, 79, 82, 85, 88, 91, 94, 97};
3273 #endif /* CONFIG_JETSAM */
3274 
3275 /*
3276  * Sets the default global user wire limit which limits the amount of
3277  * memory that can be locked via mlock() based on the above algorithm.
3278  * This can be overridden via a sysctl.
3279  */
3280 static void
3281 kmem_set_user_wire_limits(void)
3282 {
3283 	uint64_t available_mem_log;
3284 	uint64_t max_wire_percent;
3285 	size_t wire_limit_percents_length = sizeof(wire_limit_percents) /
3286 	    sizeof(vm_map_size_t);
3287 	vm_map_size_t limit;
3288 	uint64_t config_memsize = max_mem;
3289 #if defined(XNU_TARGET_OS_OSX)
3290 	config_memsize = max_mem_actual;
3291 #endif /* defined(XNU_TARGET_OS_OSX) */
3292 
3293 	available_mem_log = bit_floor(config_memsize);
3294 
3295 	if (available_mem_log < VM_USER_WIREABLE_MIN_CONFIG) {
3296 		available_mem_log = 0;
3297 	} else {
3298 		available_mem_log -= VM_USER_WIREABLE_MIN_CONFIG;
3299 	}
3300 	if (available_mem_log >= wire_limit_percents_length) {
3301 		available_mem_log = wire_limit_percents_length - 1;
3302 	}
3303 	max_wire_percent = wire_limit_percents[available_mem_log];
3304 
3305 	limit = config_memsize * max_wire_percent / 100;
3306 	/* Cap the number of non lockable bytes at VM_NOT_USER_WIREABLE_MAX */
3307 	if (config_memsize - limit > VM_NOT_USER_WIREABLE_MAX) {
3308 		limit = config_memsize - VM_NOT_USER_WIREABLE_MAX;
3309 	}
3310 
3311 	vm_global_user_wire_limit = limit;
3312 	/* the default per task limit is the same as the global limit */
3313 	vm_per_task_user_wire_limit = limit;
3314 	vm_add_wire_count_over_global_limit = 0;
3315 	vm_add_wire_count_over_user_limit = 0;
3316 }
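/*
 * Worked example (a sketch; it assumes bit_floor() yields the index of the
 * highest set bit, as the arithmetic above implies): on a 16 GB non-jetsam
 * configuration, log2(16 GB) == 34, so the table index is 34 - 32 == 2 and
 * max_wire_percent is 76, giving a global (and per-task) wire limit of
 * roughly 12.2 GB before the VM_NOT_USER_WIREABLE_MAX cap is applied.
 */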
3317 
3318 #define KMEM_MAX_CLAIMS 50
3319 __startup_data
3320 struct kmem_range_startup_spec kmem_claims[KMEM_MAX_CLAIMS] = {};
3321 __startup_data
3322 uint32_t kmem_claim_count = 0;
3323 
3324 __startup_func
3325 void
3326 kmem_range_startup_init(
3327 	struct kmem_range_startup_spec *sp)
3328 {
3329 	assert(kmem_claim_count < KMEM_MAX_CLAIMS - KMEM_RANGE_COUNT);
3330 	if (sp->kc_calculate_sz) {
3331 		sp->kc_size = (sp->kc_calculate_sz)();
3332 	}
3333 	if (sp->kc_size) {
3334 		kmem_claims[kmem_claim_count] = *sp;
3335 		kmem_claim_count++;
3336 	}
3337 }
3338 
3339 static vm_offset_t
3340 kmem_fuzz_start(void)
3341 {
3342 	vm_offset_t kmapoff_kaddr = 0;
3343 	uint32_t kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */
3344 	vm_map_size_t kmapoff_size = ptoa(kmapoff_pgcnt);
3345 
3346 	kmem_alloc(kernel_map, &kmapoff_kaddr, kmapoff_size,
3347 	    KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY,
3348 	    VM_KERN_MEMORY_OSFMK);
3349 	return kmapoff_kaddr + kmapoff_size;
3350 }
3351 
3352 /*
3353  * Generate a randomly shuffled array of indices from 0 to count - 1
3354  */
3355 __startup_func
3356 void
3357 kmem_shuffle(
3358 	uint16_t       *shuffle_buf,
3359 	uint16_t        count)
3360 {
3361 	for (uint16_t i = 0; i < count; i++) {
3362 		uint16_t j = kmem_get_random16(i);
3363 		if (j != i) {
3364 			shuffle_buf[i] = shuffle_buf[j];
3365 		}
3366 		shuffle_buf[j] = i;
3367 	}
3368 }
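/*
 * This is the "inside-out" Fisher-Yates construction: element i is placed at
 * a uniformly random position j in [0, i] and the previous occupant of j
 * moves to i.  A sketch of its use (kmem_shuffle_claims() below does the
 * same thing for the startup claims):
 *
 *	uint16_t order[KMEM_RANGE_ID_NUM_PTR];
 *	kmem_shuffle(order, KMEM_RANGE_ID_NUM_PTR);
 *	// order[] now holds a random permutation of 0 .. KMEM_RANGE_ID_NUM_PTR - 1
 */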
3369 
3370 __startup_func
3371 static void
3372 kmem_shuffle_claims(void)
3373 {
3374 	uint16_t shuffle_buf[KMEM_MAX_CLAIMS] = {};
3375 	uint16_t limit = (uint16_t)kmem_claim_count;
3376 
3377 	kmem_shuffle(&shuffle_buf[0], limit);
3378 	for (uint16_t i = 0; i < limit; i++) {
3379 		struct kmem_range_startup_spec tmp = kmem_claims[i];
3380 		kmem_claims[i] = kmem_claims[shuffle_buf[i]];
3381 		kmem_claims[shuffle_buf[i]] = tmp;
3382 	}
3383 }
3384 
3385 __startup_func
3386 static void
3387 kmem_readjust_ranges(
3388 	uint32_t        cur_idx)
3389 {
3390 	assert(cur_idx != 0);
3391 	uint32_t j = cur_idx - 1, random;
3392 	struct kmem_range_startup_spec sp = kmem_claims[cur_idx];
3393 	struct mach_vm_range *sp_range = sp.kc_range;
3394 
3395 	/*
3396 	 * Find max index where restriction is met
3397 	 */
3398 	for (; j > 0; j--) {
3399 		struct kmem_range_startup_spec spj = kmem_claims[j];
3400 		vm_map_offset_t max_start = spj.kc_range->min_address;
3401 		if (spj.kc_flags & KC_NO_MOVE) {
3402 			panic("kmem_range_init: Can't scramble with multiple constraints");
3403 		}
3404 		if (max_start <= sp_range->min_address) {
3405 			break;
3406 		}
3407 	}
3408 
3409 	/*
3410 	 * Pick a random index from 0 to the max index and shift claims to the right
3411 	 * to make room for the restricted claim
3412 	 */
3413 	random = kmem_get_random16((uint16_t)j);
3414 	assert(random <= j);
3415 
3416 	sp_range->min_address = kmem_claims[random].kc_range->min_address;
3417 	sp_range->max_address = sp_range->min_address + sp.kc_size;
3418 
3419 	for (j = cur_idx - 1; j >= random && j != UINT32_MAX; j--) {
3420 		struct kmem_range_startup_spec spj = kmem_claims[j];
3421 		struct mach_vm_range *range = spj.kc_range;
3422 		range->min_address += sp.kc_size;
3423 		range->max_address += sp.kc_size;
3424 		kmem_claims[j + 1] = spj;
3425 	}
3426 
3427 	sp.kc_flags = KC_NO_MOVE;
3428 	kmem_claims[random] = sp;
3429 }
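/*
 * Editorial note: restated, when claim cur_idx would start above its
 * restriction, kmem_readjust_ranges() walks back to the last already-placed
 * claim that still starts at or below the restriction, picks a random slot r
 * at or before it, gives the restricted claim the start address currently
 * held by slot r, slides claims r..cur_idx-1 up by the restricted claim's
 * size (moving each one slot to the right), and pins the restricted claim at
 * slot r with KC_NO_MOVE.
 */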
3430 
3431 __startup_func
3432 static vm_map_size_t
3433 kmem_add_ptr_claims(void)
3434 {
3435 	uint64_t kmem_meta_num, kmem_ptr_chunks;
3436 	vm_map_size_t org_ptr_range_size = ptr_range_size;
3437 
3438 	ptr_range_size -= PAGE_SIZE;
3439 	ptr_range_size *= KMEM_CHUNK_SIZE_MIN;
3440 	ptr_range_size /= (KMEM_CHUNK_SIZE_MIN + sizeof(struct kmem_page_meta));
3441 
3442 	kmem_ptr_chunks = ptr_range_size / KMEM_CHUNK_SIZE_MIN;
3443 	ptr_range_size = kmem_ptr_chunks * KMEM_CHUNK_SIZE_MIN;
3444 
3445 	kmem_meta_num = kmem_ptr_chunks + 2;
3446 	kmem_meta_size = round_page(kmem_meta_num * sizeof(struct kmem_page_meta));
3447 
3448 	assert(kmem_meta_size + ptr_range_size <= org_ptr_range_size);
3449 	/*
3450 	 * Add claims for kmem's ranges
3451 	 */
3452 	for (uint32_t i = 0; i < kmem_ptr_ranges; i++) {
3453 		struct kmem_range_startup_spec kmem_spec = {
3454 			.kc_name = "kmem_ptr_range",
3455 			.kc_range = &kmem_ranges[KMEM_RANGE_ID_PTR_0 + i],
3456 			.kc_size = ptr_range_size,
3457 			.kc_flags = KC_NO_ENTRY,
3458 		};
3459 		kmem_claims[kmem_claim_count++] = kmem_spec;
3460 
3461 		struct kmem_range_startup_spec kmem_meta_spec = {
3462 			.kc_name = "kmem_ptr_range_meta",
3463 			.kc_range = &kmem_meta_range[KMEM_RANGE_ID_PTR_0 + i],
3464 			.kc_size = kmem_meta_size,
3465 			.kc_flags = KC_NONE,
3466 		};
3467 		kmem_claims[kmem_claim_count++] = kmem_meta_spec;
3468 	}
3469 	return (org_ptr_range_size - ptr_range_size - kmem_meta_size) *
3470 	       kmem_ptr_ranges;
3471 }
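/*
 * Editorial sketch (not part of the build): kmem_add_ptr_claims() splits
 * each pointer-range budget B into a chunk-aligned usable portion and the
 * per-chunk metadata that describes it, roughly:
 *
 *   usable ~= (B - PAGE_SIZE) * CHUNK / (CHUNK + sizeof(meta)),
 *             rounded down to a whole number of chunks
 *   meta    = round_page((chunks + 2) * sizeof(meta))
 *
 * A standalone analogue with placeholder sizes (the real KMEM_CHUNK_SIZE_MIN
 * and struct kmem_page_meta are defined elsewhere in kmem):
 */
#if 0
#include <stdint.h>

#define EX_PAGE_SIZE  4096ull
#define EX_CHUNK      (16ull * 1024)  /* placeholder for KMEM_CHUNK_SIZE_MIN */
#define EX_META       16ull           /* placeholder for sizeof(struct kmem_page_meta) */

static void
example_split_budget(uint64_t budget, uint64_t *usable, uint64_t *meta)
{
	/* assumes budget is at least a few pages */
	uint64_t size = budget - EX_PAGE_SIZE;
	uint64_t chunks;

	size = size * EX_CHUNK / (EX_CHUNK + EX_META);
	chunks = size / EX_CHUNK;
	*usable = chunks * EX_CHUNK;
	*meta = ((chunks + 2) * EX_META + EX_PAGE_SIZE - 1) & ~(EX_PAGE_SIZE - 1);
	/* the kernel asserts that *usable + *meta still fits within budget */
}
#endif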
3472 
3473 __startup_func
3474 static void
3475 kmem_add_extra_claims(void)
3476 {
3477 	vm_map_size_t largest_free_size = 0, total_claims = 0;
3478 
3479 	vm_map_sizes(kernel_map, NULL, NULL, &largest_free_size);
3480 	largest_free_size = trunc_page(largest_free_size);
3481 
3482 	/*
3483 	 * kasan and configs w/o *TRR need to have just one ptr range due to
3484 	 * resource constraints.
3485 	 */
3486 #if !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
3487 	kmem_ptr_ranges = 1;
3488 #endif
3489 	/*
3490 	 * Determine size of data and pointer kmem_ranges
3491 	 */
3492 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3493 		total_claims += kmem_claims[i].kc_size;
3494 	}
3495 	assert((total_claims & PAGE_MASK) == 0);
3496 	largest_free_size -= total_claims;
3497 
3498 	/*
3499 	 * Use half the total available VA for all pointer allocations (this
3500 	 * includes the kmem_sprayqtn range). Given that we have 4 total
3501 	 * ranges, divide the available VA by 8.
3502 	 */
3503 	ptr_range_size = largest_free_size / ((kmem_ptr_ranges + 1) * 2);
3504 	sprayqtn_range_size = ptr_range_size;
3505 
3506 	if (sprayqtn_range_size > (sane_size / 2)) {
3507 		sprayqtn_range_size = sane_size / 2;
3508 	}
3509 
3510 	ptr_range_size = round_page(ptr_range_size);
3511 	sprayqtn_range_size = round_page(sprayqtn_range_size);
3512 
3513 
3514 	data_range_size = largest_free_size
3515 	    - (ptr_range_size * kmem_ptr_ranges)
3516 	    - sprayqtn_range_size;
3517 
3518 	/*
3519 	 * Add claims for kmem's ranges
3520 	 */
3521 	data_range_size += kmem_add_ptr_claims();
3522 	assert(data_range_size + sprayqtn_range_size +
3523 	    ((ptr_range_size + kmem_meta_size) * kmem_ptr_ranges) <=
3524 	    largest_free_size);
3525 
3526 	struct kmem_range_startup_spec kmem_spec_sprayqtn = {
3527 		.kc_name = "kmem_sprayqtn_range",
3528 		.kc_range = &kmem_ranges[KMEM_RANGE_ID_SPRAYQTN],
3529 		.kc_size = sprayqtn_range_size,
3530 		.kc_flags = KC_NO_ENTRY,
3531 	};
3532 	kmem_claims[kmem_claim_count++] = kmem_spec_sprayqtn;
3533 
3534 	struct kmem_range_startup_spec kmem_spec_data = {
3535 		.kc_name = "kmem_data_range",
3536 		.kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA],
3537 		.kc_size = data_range_size,
3538 		.kc_flags = KC_NO_ENTRY,
3539 	};
3540 	kmem_claims[kmem_claim_count++] = kmem_spec_data;
3541 }
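/*
 * Editorial note: with N = kmem_ptr_ranges and F = the largest free extent
 * left in kernel_map after the fixed claims, the sizing above works out to:
 *
 *   each pointer range   = F / ((N + 1) * 2)
 *   sprayqtn range       = min(F / ((N + 1) * 2), sane_size / 2)
 *   data range           = F - N * ptr_range_size - sprayqtn_range_size
 *                          + whatever kmem_add_ptr_claims() hands back after
 *                            carving per-range metadata out of each budget
 *
 * i.e. with the default N = 3, the pointer side gets half of F split eight
 * ways and the data range absorbs the remainder.
 */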
3542 
3543 __startup_func
3544 static void
3545 kmem_scramble_ranges(void)
3546 {
3547 	vm_map_offset_t start = 0;
3548 
3549 	/*
3550 	 * Initialize the KMEM_RANGE_ID_NONE range to use the entire map so that
3551 	 * the vm can find the requested ranges.
3552 	 */
3553 	kmem_ranges[KMEM_RANGE_ID_NONE].min_address = MAX(kernel_map->min_offset,
3554 	    VM_MAP_PAGE_SIZE(kernel_map));
3555 	kmem_ranges[KMEM_RANGE_ID_NONE].max_address = kernel_map->max_offset;
3556 
3557 	/*
3558 	 * Allocate the g_kext_map prior to randomizing the remaining submaps, as
3559 	 * this map is 2G in size and starts at the end of kernel_text on x86; it
3560 	 * could otherwise overflow into the heap.
3561 	 */
3562 	kext_alloc_init();
3563 
3564 	/*
3565 	 * Eat a random amount of kernel_map to fuzz subsequent heap, zone and
3566 	 * stack addresses. (With a 4K page and 9 bits of randomness, this
3567 	 * eats about 2M of VA from the map)
3568 	 *
3569 	 * Note that we always need to slide by at least one page because the VM
3570 	 * pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a base
3571 	 * do not admit this address to be part of any zone submap.
3572 	 */
3573 	start = kmem_fuzz_start();
3574 
3575 	/*
3576 	 * Add claims for ptr and data kmem_ranges
3577 	 */
3578 	kmem_add_extra_claims();
3579 
3580 	/*
3581 	 * Shuffle registered claims
3582 	 */
3583 	assert(kmem_claim_count < UINT16_MAX);
3584 	kmem_shuffle_claims();
3585 
3586 	/*
3587 	 * Apply restrictions and determine range for each claim
3588 	 */
3589 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3590 		vm_map_offset_t end = 0;
3591 		struct kmem_range_startup_spec sp = kmem_claims[i];
3592 		struct mach_vm_range *sp_range = sp.kc_range;
3593 		if (vm_map_locate_space(kernel_map, sp.kc_size, 0,
3594 		    VM_MAP_KERNEL_FLAGS_ANYWHERE(), &start, NULL) != KERN_SUCCESS) {
3595 			panic("kmem_range_init: vm_map_locate_space failing for claim %s",
3596 			    sp.kc_name);
3597 		}
3598 
3599 		end = start + sp.kc_size;
3600 		/*
3601 		 * Re-adjust ranges if restriction not met
3602 		 */
3603 		if (sp_range->min_address && start > sp_range->min_address) {
3604 			kmem_readjust_ranges(i);
3605 		} else {
3606 			sp_range->min_address = start;
3607 			sp_range->max_address = end;
3608 		}
3609 		start = end;
3610 	}
3611 
3612 	/*
3613 	 * We have settled on the ranges; now create temporary entries for the
3614 	 * claims.
3615 	 */
3616 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3617 		struct kmem_range_startup_spec sp = kmem_claims[i];
3618 		vm_map_entry_t entry = NULL;
3619 		if (sp.kc_flags & KC_NO_ENTRY) {
3620 			continue;
3621 		}
3622 		if (vm_map_find_space(kernel_map, sp.kc_range->min_address, sp.kc_size, 0,
3623 		    VM_MAP_KERNEL_FLAGS_ANYWHERE(), &entry) != KERN_SUCCESS) {
3624 			panic("kmem_range_init: vm_map_find_space failing for claim %s",
3625 			    sp.kc_name);
3626 		}
3627 		vm_object_reference(kernel_object);
3628 		VME_OBJECT_SET(entry, kernel_object, false, 0);
3629 		VME_OFFSET_SET(entry, entry->vme_start);
3630 		vm_map_unlock(kernel_map);
3631 	}
3632 	/*
3633 	 * Now that we are done assigning all the ranges, reset
3634 	 * kmem_ranges[KMEM_RANGE_ID_NONE]
3635 	 */
3636 	kmem_ranges[KMEM_RANGE_ID_NONE] = (struct mach_vm_range) {};
3637 
3638 #if DEBUG || DEVELOPMENT
3639 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3640 		struct kmem_range_startup_spec sp = kmem_claims[i];
3641 
3642 		printf("%-24s: %p - %p (%u%c)\n", sp.kc_name,
3643 		    (void *)sp.kc_range->min_address,
3644 		    (void *)sp.kc_range->max_address,
3645 		    mach_vm_size_pretty(sp.kc_size),
3646 		    mach_vm_size_unit(sp.kc_size));
3647 	}
3648 #endif /* DEBUG || DEVELOPMENT */
3649 }
3650 
3651 __startup_func
3652 static void
3653 kmem_range_init(void)
3654 {
3655 	vm_size_t range_adjustment;
3656 
3657 	kmem_scramble_ranges();
3658 
3659 	range_adjustment = sprayqtn_range_size >> 3;
3660 	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address =
3661 	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address + range_adjustment;
3662 	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address =
3663 	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address;
3664 
3665 	range_adjustment = data_range_size >> 3;
3666 	kmem_large_ranges[KMEM_RANGE_ID_DATA].min_address =
3667 	    kmem_ranges[KMEM_RANGE_ID_DATA].min_address + range_adjustment;
3668 	kmem_large_ranges[KMEM_RANGE_ID_DATA].max_address =
3669 	    kmem_ranges[KMEM_RANGE_ID_DATA].max_address;
3670 
3671 	pmap_init();
3672 	kmem_metadata_init();
3673 	kmem_sizeclass_init();
3674 
3675 #if DEBUG || DEVELOPMENT
3676 	for (kmem_range_id_t i = 1; i < KMEM_RANGE_COUNT; i++) {
3677 		vm_size_t range_size = mach_vm_range_size(&kmem_large_ranges[i]);
3678 		printf("kmem_large_ranges[%d]    : %p - %p (%u%c)\n", i,
3679 		    (void *)kmem_large_ranges[i].min_address,
3680 		    (void *)kmem_large_ranges[i].max_address,
3681 		    mach_vm_size_pretty(range_size),
3682 		    mach_vm_size_unit(range_size));
3683 	}
3684 #endif
3685 }
3686 STARTUP(KMEM, STARTUP_RANK_THIRD, kmem_range_init);
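/*
 * Editorial note: the "large" ranges derived in kmem_range_init() above are
 * simply the corresponding full ranges with the bottom eighth trimmed off:
 *
 *   large.min_address = full.min_address + (range_size >> 3)
 *   large.max_address = full.max_address
 *
 * presumably so that large allocations in the sprayqtn and data ranges are
 * steered away from the first eighth of each range's VA.
 */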
3687 
3688 #if DEBUG || DEVELOPMENT
3689 __startup_func
3690 static void
3691 kmem_log_init(void)
3692 {
3693 	/*
3694 	 * The log can only be created after the kmem subsystem is initialized, as
3695 	 * btlog creation uses kmem.
3696 	 */
3697 	kmem_outlier_log = btlog_create(BTLOG_LOG, KMEM_OUTLIER_LOG_SIZE, 0);
3698 }
3699 STARTUP(ZALLOC, STARTUP_RANK_FIRST, kmem_log_init);
3700 
3701 kmem_gobj_stats
3702 kmem_get_gobj_stats(void)
3703 {
3704 	kmem_gobj_stats stats = {};
3705 
3706 	vm_map_lock(kernel_map);
3707 	for (uint32_t i = 0; i < kmem_ptr_ranges; i++) {
3708 		kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST + i;
3709 		struct mach_vm_range range = kmem_ranges[range_id];
3710 		struct kmem_page_meta *meta = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3711 		struct kmem_page_meta *meta_end;
3712 		uint64_t meta_idx = meta - kmem_meta_base[range_id];
3713 		vm_map_size_t used = 0, va = 0, meta_sz = 0, pte_sz = 0;
3714 		vm_map_offset_t addr;
3715 		vm_map_entry_t entry;
3716 
3717 		/*
3718 		 * Left front
3719 		 */
3720 		va = (meta_idx * KMEM_CHUNK_SIZE_MIN);
3721 		meta_sz = round_page(meta_idx * sizeof(struct kmem_page_meta));
3722 
3723 		/*
3724 		 * Right front
3725 		 */
3726 		meta = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3727 		meta_end = kmem_addr_to_meta(range.max_address, range_id, &addr,
3728 		    &meta_idx);
3729 		meta_idx = meta_end - meta;
3730 		meta_sz += round_page(meta_idx * sizeof(struct kmem_page_meta));
3731 		va += (meta_idx * KMEM_CHUNK_SIZE_MIN);
3732 
3733 		/*
3734 		 * Compute VA allocated in entire range
3735 		 */
3736 		if (vm_map_lookup_entry(kernel_map, range.min_address, &entry) == false) {
3737 			entry = entry->vme_next;
3738 		}
3739 		while (entry != vm_map_to_entry(kernel_map) &&
3740 		    entry->vme_start < range.max_address) {
3741 			used += (entry->vme_end - entry->vme_start);
3742 			entry = entry->vme_next;
3743 		}
3744 
3745 		pte_sz = round_page(atop(va - used) * 8);
3746 
3747 		stats.total_used += used;
3748 		stats.total_va += va;
3749 		stats.pte_sz += pte_sz;
3750 		stats.meta_sz += meta_sz;
3751 	}
3752 	vm_map_unlock(kernel_map);
3753 
3754 	return stats;
3755 }
3756 
3757 #endif /* DEBUG || DEVELOPMENT */
3758 
3759 /*
3760  *	kmem_init:
3761  *
3762  *	Initialize the kernel's virtual memory map, taking
3763  *	into account all memory allocated up to this time.
3764  */
3765 __startup_func
3766 void
3767 kmem_init(
3768 	vm_offset_t     start,
3769 	vm_offset_t     end)
3770 {
3771 	vm_map_offset_t map_start;
3772 	vm_map_offset_t map_end;
3773 
3774 	map_start = vm_map_trunc_page(start,
3775 	    VM_MAP_PAGE_MASK(kernel_map));
3776 	map_end = vm_map_round_page(end,
3777 	    VM_MAP_PAGE_MASK(kernel_map));
3778 
3779 	vm_map_will_allocate_early_map(&kernel_map);
3780 #if defined(__arm64__)
3781 	kernel_map = vm_map_create_options(pmap_kernel(),
3782 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS,
3783 	    VM_MAX_KERNEL_ADDRESS,
3784 	    VM_MAP_CREATE_DEFAULT);
3785 	/*
3786 	 *	Reserve virtual memory allocated up to this time.
3787 	 */
3788 	{
3789 		unsigned int    region_select = 0;
3790 		vm_map_offset_t region_start;
3791 		vm_map_size_t   region_size;
3792 		vm_map_offset_t map_addr;
3793 		kern_return_t kr;
3794 
3795 		while (pmap_virtual_region(region_select, &region_start, &region_size)) {
3796 			map_addr = region_start;
3797 			kr = vm_map_enter(kernel_map, &map_addr,
3798 			    vm_map_round_page(region_size,
3799 			    VM_MAP_PAGE_MASK(kernel_map)),
3800 			    (vm_map_offset_t) 0,
3801 			    VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(.vmkf_no_pmap_check = true),
3802 			    VM_OBJECT_NULL,
3803 			    (vm_object_offset_t) 0, FALSE, VM_PROT_NONE, VM_PROT_NONE,
3804 			    VM_INHERIT_DEFAULT);
3805 
3806 			if (kr != KERN_SUCCESS) {
3807 				panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
3808 				    (uint64_t) start, (uint64_t) end, (uint64_t) region_start,
3809 				    (uint64_t) region_size, kr);
3810 			}
3811 
3812 			region_select++;
3813 		}
3814 	}
3815 #else
3816 	kernel_map = vm_map_create_options(pmap_kernel(),
3817 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS, map_end,
3818 	    VM_MAP_CREATE_DEFAULT);
3819 	/*
3820 	 *	Reserve virtual memory allocated up to this time.
3821 	 */
3822 	if (start != VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
3823 		vm_map_offset_t map_addr;
3824 		kern_return_t kr;
3825 
3826 		map_addr = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
3827 		kr = vm_map_enter(kernel_map,
3828 		    &map_addr,
3829 		    (vm_map_size_t)(map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
3830 		    (vm_map_offset_t) 0,
3831 		    VM_MAP_KERNEL_FLAGS_FIXED(.vmkf_no_pmap_check = true),
3832 		    VM_OBJECT_NULL,
3833 		    (vm_object_offset_t) 0, FALSE,
3834 		    VM_PROT_NONE, VM_PROT_NONE,
3835 		    VM_INHERIT_DEFAULT);
3836 
3837 		if (kr != KERN_SUCCESS) {
3838 			panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
3839 			    (uint64_t) start, (uint64_t) end,
3840 			    (uint64_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS,
3841 			    (uint64_t) (map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
3842 			    kr);
3843 		}
3844 	}
3845 #endif
3846 
3847 	kmem_set_user_wire_limits();
3848 }
3849 
3850 
3851 #pragma mark map copyio
3852 
3853 /*
3854  *	Routine:	copyinmap
3855  *	Purpose:
3856  *		Like copyin, except that fromaddr is an address
3857  *		in the specified VM map.  This implementation
3858  *		is incomplete; it handles the current user map
3859  *		and the kernel map/submaps.
3860  */
3861 kern_return_t
3862 copyinmap(
3863 	vm_map_t                map,
3864 	vm_map_offset_t         fromaddr,
3865 	void                    *todata,
3866 	vm_size_t               length)
3867 {
3868 	kern_return_t   kr = KERN_SUCCESS;
3869 	vm_map_t oldmap;
3870 
3871 	if (vm_map_pmap(map) == pmap_kernel()) {
3872 		/* assume a correct copy */
3873 		memcpy(todata, CAST_DOWN(void *, fromaddr), length);
3874 	} else if (current_map() == map) {
3875 		if (copyin(fromaddr, todata, length) != 0) {
3876 			kr = KERN_INVALID_ADDRESS;
3877 		}
3878 	} else {
3879 		vm_map_reference(map);
3880 		oldmap = vm_map_switch(map);
3881 		if (copyin(fromaddr, todata, length) != 0) {
3882 			kr = KERN_INVALID_ADDRESS;
3883 		}
3884 		vm_map_switch(oldmap);
3885 		vm_map_deallocate(map);
3886 	}
3887 	return kr;
3888 }
3889 
3890 /*
3891  *	Routine:	copyoutmap
3892  *	Purpose:
3893  *		Like copyout, except that toaddr is an address
3894  *		in the specified VM map.
3895  */
3896 kern_return_t
3897 copyoutmap(
3898 	vm_map_t                map,
3899 	void                    *fromdata,
3900 	vm_map_address_t        toaddr,
3901 	vm_size_t               length)
3902 {
3903 	kern_return_t   kr = KERN_SUCCESS;
3904 	vm_map_t        oldmap;
3905 
3906 	if (vm_map_pmap(map) == pmap_kernel()) {
3907 		/* assume a correct copy */
3908 		memcpy(CAST_DOWN(void *, toaddr), fromdata, length);
3909 	} else if (current_map() == map) {
3910 		if (copyout(fromdata, toaddr, length) != 0) {
3911 			kr = KERN_INVALID_ADDRESS;
3912 		}
3913 	} else {
3914 		vm_map_reference(map);
3915 		oldmap = vm_map_switch(map);
3916 		if (copyout(fromdata, toaddr, length) != 0) {
3917 			kr = KERN_INVALID_ADDRESS;
3918 		}
3919 		vm_map_switch(oldmap);
3920 		vm_map_deallocate(map);
3921 	}
3922 	return kr;
3923 }
3924 
3925 /*
3926  *	Routine:	copyoutmap_atomic{32, 64}
3927  *	Purpose:
3928  *		Like copyoutmap, except that the operation is atomic.
3929  *		Takes a value rather than a *fromdata pointer.
3930  */
3931 kern_return_t
3932 copyoutmap_atomic32(
3933 	vm_map_t                map,
3934 	uint32_t                value,
3935 	vm_map_address_t        toaddr)
3936 {
3937 	kern_return_t   kr = KERN_SUCCESS;
3938 	vm_map_t        oldmap;
3939 
3940 	if (vm_map_pmap(map) == pmap_kernel()) {
3941 		/* assume a correct toaddr */
3942 		*(uint32_t *)toaddr = value;
3943 	} else if (current_map() == map) {
3944 		if (copyout_atomic32(value, toaddr) != 0) {
3945 			kr = KERN_INVALID_ADDRESS;
3946 		}
3947 	} else {
3948 		vm_map_reference(map);
3949 		oldmap = vm_map_switch(map);
3950 		if (copyout_atomic32(value, toaddr) != 0) {
3951 			kr = KERN_INVALID_ADDRESS;
3952 		}
3953 		vm_map_switch(oldmap);
3954 		vm_map_deallocate(map);
3955 	}
3956 	return kr;
3957 }
3958 
3959 kern_return_t
3960 copyoutmap_atomic64(
3961 	vm_map_t                map,
3962 	uint64_t                value,
3963 	vm_map_address_t        toaddr)
3964 {
3965 	kern_return_t   kr = KERN_SUCCESS;
3966 	vm_map_t        oldmap;
3967 
3968 	if (vm_map_pmap(map) == pmap_kernel()) {
3969 		/* assume a correct toaddr */
3970 		*(uint64_t *)toaddr = value;
3971 	} else if (current_map() == map) {
3972 		if (copyout_atomic64(value, toaddr) != 0) {
3973 			kr = KERN_INVALID_ADDRESS;
3974 		}
3975 	} else {
3976 		vm_map_reference(map);
3977 		oldmap = vm_map_switch(map);
3978 		if (copyout_atomic64(value, toaddr) != 0) {
3979 			kr = KERN_INVALID_ADDRESS;
3980 		}
3981 		vm_map_switch(oldmap);
3982 		vm_map_deallocate(map);
3983 	}
3984 	return kr;
3985 }
3986 
3987 
3988 #pragma mark pointer obfuscation / packing
3989 
3990 /*
3991  *
3992  *	The following two functions are to be used when exposing kernel
3993  *	addresses to userspace via any of the various debug or info
3994  *	facilities that exist. These are basically the same as VM_KERNEL_ADDRPERM()
3995  *	and VM_KERNEL_UNSLIDE_OR_PERM() except they use a different random seed and
3996  *	are exported to KEXTs.
3997  *
3998  *	NOTE: USE THE MACRO VERSIONS OF THESE FUNCTIONS (in vm_param.h) FROM WITHIN THE KERNEL
3999  */
4000 
4001 vm_offset_t
4002 vm_kernel_addrhash_internal(vm_offset_t addr, uint64_t salt)
4003 {
4004 	assert(salt != 0);
4005 
4006 	if (addr == 0) {
4007 		return 0ul;
4008 	}
4009 
4010 	if (VM_KERNEL_IS_SLID(addr)) {
4011 		return VM_KERNEL_UNSLIDE(addr);
4012 	}
4013 
4014 	vm_offset_t sha_digest[SHA256_DIGEST_LENGTH / sizeof(vm_offset_t)];
4015 	SHA256_CTX sha_ctx;
4016 
4017 	SHA256_Init(&sha_ctx);
4018 	SHA256_Update(&sha_ctx, &salt, sizeof(salt));
4019 	SHA256_Update(&sha_ctx, &addr, sizeof(addr));
4020 	SHA256_Final(sha_digest, &sha_ctx);
4021 
4022 	return sha_digest[0];
4023 }
4024 
4025 __exported vm_offset_t
4026 vm_kernel_addrhash_external(vm_offset_t addr);
4027 vm_offset_t
4028 vm_kernel_addrhash_external(vm_offset_t addr)
4029 {
4030 	return vm_kernel_addrhash_internal(addr, vm_kernel_addrhash_salt_ext);
4031 }
4032 
4033 void
4034 vm_kernel_addrhide(
4035 	vm_offset_t addr,
4036 	vm_offset_t *hide_addr)
4037 {
4038 	*hide_addr = VM_KERNEL_ADDRHIDE(addr);
4039 }
4040 
4041 /*
4042  *	vm_kernel_addrperm_external:
4043  *	vm_kernel_unslide_or_perm_external:
4044  *
4045  *	Use these macros when exposing an address to userspace that could come from
4046  *	either kernel text/data *or* the heap.
4047  */
4048 void
4049 vm_kernel_addrperm_external(
4050 	vm_offset_t addr,
4051 	vm_offset_t *perm_addr)
4052 {
4053 	if (VM_KERNEL_IS_SLID(addr)) {
4054 		*perm_addr = VM_KERNEL_UNSLIDE(addr);
4055 	} else if (VM_KERNEL_ADDRESS(addr)) {
4056 		*perm_addr = addr + vm_kernel_addrperm_ext;
4057 	} else {
4058 		*perm_addr = addr;
4059 	}
4060 }
4061 
4062 void
4063 vm_kernel_unslide_or_perm_external(
4064 	vm_offset_t addr,
4065 	vm_offset_t *up_addr)
4066 {
4067 	vm_kernel_addrperm_external(addr, up_addr);
4068 }
4069 
4070 void
4071 vm_packing_pointer_invalid(vm_offset_t ptr, vm_packing_params_t params)
4072 {
4073 	if (ptr & ((1ul << params.vmpp_shift) - 1)) {
4074 		panic("pointer %p can't be packed: low %d bits aren't 0",
4075 		    (void *)ptr, params.vmpp_shift);
4076 	} else if (ptr <= params.vmpp_base) {
4077 		panic("pointer %p can't be packed: below base %p",
4078 		    (void *)ptr, (void *)params.vmpp_base);
4079 	} else {
4080 		panic("pointer %p can't be packed: maximum encodable pointer is %p",
4081 		    (void *)ptr, (void *)vm_packing_max_packable(params));
4082 	}
4083 }
4084 
4085 void
4086 vm_packing_verify_range(
4087 	const char *subsystem,
4088 	vm_offset_t min_address,
4089 	vm_offset_t max_address,
4090 	vm_packing_params_t params)
4091 {
4092 	if (min_address > max_address) {
4093 		panic("%s: %s range invalid min:%p > max:%p",
4094 		    __func__, subsystem, (void *)min_address, (void *)max_address);
4095 	}
4096 
4097 	if (!params.vmpp_base_relative) {
4098 		return;
4099 	}
4100 
4101 	if (min_address <= params.vmpp_base) {
4102 		panic("%s: %s range invalid min:%p <= base:%p",
4103 		    __func__, subsystem, (void *)min_address, (void *)params.vmpp_base);
4104 	}
4105 
4106 	if (max_address > vm_packing_max_packable(params)) {
4107 		panic("%s: %s range invalid max:%p >= max packable:%p",
4108 		    __func__, subsystem, (void *)max_address,
4109 		    (void *)vm_packing_max_packable(params));
4110 	}
4111 }
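/*
 * Editorial sketch (not part of the build): the constraints enforced by
 * vm_packing_pointer_invalid() and vm_packing_verify_range() above follow
 * from the packed-pointer encoding, which stores a base-relative, shifted
 * value in a small fixed-width field.  A standalone analogue with
 * hypothetical parameters (EX_BASE, EX_SHIFT and EX_BITS are illustrative
 * stand-ins for the vmpp_* fields):
 */
#if 0
#include <stdint.h>

#define EX_BASE   0xffffff8000000000ull  /* hypothetical vmpp_base */
#define EX_SHIFT  6                      /* hypothetical vmpp_shift */
#define EX_BITS   32                     /* hypothetical packed field width */

/* largest pointer the encoding can represent */
#define EX_MAX_PACKABLE \
	(EX_BASE + (((1ull << EX_BITS) - 1) << EX_SHIFT))

static uint32_t
example_pack(uintptr_t ptr)
{
	/* requires: ptr aligned to (1 << EX_SHIFT), ptr > EX_BASE, ptr <= EX_MAX_PACKABLE */
	return (uint32_t)((ptr - EX_BASE) >> EX_SHIFT);
}

static uintptr_t
example_unpack(uint32_t packed)
{
	return EX_BASE + ((uintptr_t)packed << EX_SHIFT);
}
#endif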
4112 
4113 #pragma mark tests
4114 #if DEBUG || DEVELOPMENT
4115 #include <sys/errno.h>
4116 
4117 static void
4118 kmem_test_for_entry(
4119 	vm_map_t                map,
4120 	vm_offset_t             addr,
4121 	void                  (^block)(vm_map_entry_t))
4122 {
4123 	vm_map_entry_t entry;
4124 
4125 	vm_map_lock(map);
4126 	block(vm_map_lookup_entry(map, addr, &entry) ? entry : NULL);
4127 	vm_map_unlock(map);
4128 }
4129 
4130 #define kmem_test_assert_map(map, pg, entries) ({ \
4131 	assert3u((map)->size, ==, ptoa(pg)); \
4132 	assert3u((map)->hdr.nentries, ==, entries); \
4133 })
4134 
4135 static bool
4136 can_write_at(vm_offset_t offs, uint32_t page)
4137 {
4138 	static const int zero;
4139 
4140 	return verify_write(&zero, (void *)(offs + ptoa(page) + 128), 1) == 0;
4141 }
4142 #define assert_writeable(offs, page) \
4143 	assertf(can_write_at(offs, page), \
4144 	    "can write at %p + ptoa(%d)", (void *)offs, page)
4145 
4146 #define assert_faults(offs, page) \
4147 	assertf(!can_write_at(offs, page), \
4148 	    "can write at %p + ptoa(%d)", (void *)offs, page)
4149 
4150 #define peek(offs, page) \
4151 	(*(uint32_t *)((offs) + ptoa(page)))
4152 
4153 #define poke(offs, page, v) \
4154 	(*(uint32_t *)((offs) + ptoa(page)) = (v))
4155 
4156 __attribute__((noinline))
4157 static void
4158 kmem_alloc_basic_test(vm_map_t map)
4159 {
4160 	kmem_guard_t guard = {
4161 		.kmg_tag = VM_KERN_MEMORY_DIAG,
4162 	};
4163 	vm_offset_t addr;
4164 
4165 	/*
4166 	 * Test wired basics:
4167 	 * - KMA_KOBJECT
4168 	 * - KMA_GUARD_FIRST, KMA_GUARD_LAST
4169 	 * - allocation alignment
4170 	 */
4171 	addr = kmem_alloc_guard(map, ptoa(10), ptoa(2) - 1,
4172 	    KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard).kmr_address;
4173 	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | GF | GL)", map);
4174 	assert3u((addr + PAGE_SIZE) % ptoa(2), ==, 0);
4175 	kmem_test_assert_map(map, 10, 1);
4176 
4177 	kmem_test_for_entry(map, addr, ^(vm_map_entry_t e){
4178 		assertf(e, "unable to find address %p in map %p", (void *)addr, map);
4179 		assert(e->vme_kernel_object);
4180 		assert(!e->vme_atomic);
4181 		assert3u(e->vme_start, <=, addr);
4182 		assert3u(addr + ptoa(10), <=, e->vme_end);
4183 	});
4184 
4185 	assert_faults(addr, 0);
4186 	for (int i = 1; i < 9; i++) {
4187 		assert_writeable(addr, i);
4188 	}
4189 	assert_faults(addr, 9);
4190 
4191 	kmem_free(map, addr, ptoa(10));
4192 	kmem_test_assert_map(map, 0, 0);
4193 
4194 	/*
4195 	 * Test pageable basics.
4196 	 */
4197 	addr = kmem_alloc_guard(map, ptoa(10), 0,
4198 	    KMA_PAGEABLE, guard).kmr_address;
4199 	assertf(addr != 0ull, "kma(%p, 10p, 0, PG)", map);
4200 	kmem_test_assert_map(map, 10, 1);
4201 
4202 	for (int i = 0; i < 9; i++) {
4203 		assert_faults(addr, i);
4204 		poke(addr, i, 42);
4205 		assert_writeable(addr, i);
4206 	}
4207 
4208 	kmem_free(map, addr, ptoa(10));
4209 	kmem_test_assert_map(map, 0, 0);
4210 }
4211 
4212 __attribute__((noinline))
4213 static void
4214 kmem_realloc_basic_test(vm_map_t map, kmr_flags_t kind)
4215 {
4216 	kmem_guard_t guard = {
4217 		.kmg_atomic  = !(kind & KMR_DATA),
4218 		.kmg_tag     = VM_KERN_MEMORY_DIAG,
4219 		.kmg_context = 0xefface,
4220 	};
4221 	vm_offset_t addr, newaddr;
4222 	const int N = 10;
4223 
4224 	/*
4225 	 *	This isn't something kmem_realloc_guard() _needs_ to do;
4226 	 *	we could conceive of an implementation where it grows in place
4227 	 *	if there's space after it.
4228 	 *
4229 	 *	However, this is what the implementation does today.
4230 	 */
4231 	bool realloc_growth_changes_address = true;
4232 	bool GL = (kind & KMR_GUARD_LAST);
4233 
4234 	/*
4235 	 *	Initial N page allocation
4236 	 */
4237 	addr = kmem_alloc_guard(map, ptoa(N), 0,
4238 	    (kind & (KMA_KOBJECT | KMA_GUARD_LAST)) | KMA_ZERO,
4239 	    guard).kmr_address;
4240 	assert3u(addr, !=, 0);
4241 	kmem_test_assert_map(map, N, 1);
4242 	for (int pg = 0; pg < N - GL; pg++) {
4243 		poke(addr, pg, 42 + pg);
4244 	}
4245 	for (int pg = N - GL; pg < N; pg++) {
4246 		assert_faults(addr, pg);
4247 	}
4248 
4249 
4250 	/*
4251 	 *	Grow to N + 3 pages
4252 	 */
4253 	newaddr = kmem_realloc_guard(map, addr, ptoa(N), ptoa(N + 3),
4254 	    kind | KMR_ZERO, guard).kmr_address;
4255 	assert3u(newaddr, !=, 0);
4256 	if (realloc_growth_changes_address) {
4257 		assert3u(addr, !=, newaddr);
4258 	}
4259 	if ((kind & KMR_FREEOLD) || (addr == newaddr)) {
4260 		kmem_test_assert_map(map, N + 3, 1);
4261 	} else {
4262 		kmem_test_assert_map(map, 2 * N + 3, 2);
4263 	}
4264 	for (int pg = 0; pg < N - GL; pg++) {
4265 		assert3u(peek(newaddr, pg), ==, 42 + pg);
4266 	}
4267 	if ((kind & KMR_FREEOLD) == 0) {
4268 		for (int pg = 0; pg < N - GL; pg++) {
4269 			assert3u(peek(addr, pg), ==, 42 + pg);
4270 		}
4271 		/* check for true sharing */
4272 		poke(addr + 16, 0, 1234);
4273 		assert3u(peek(newaddr + 16, 0), ==, 1234);
4274 		kmem_free_guard(map, addr, ptoa(N), KMF_NONE, guard);
4275 		kmem_test_assert_map(map, N + 3, 1);
4276 	}
4277 	if (addr != newaddr) {
4278 		for (int pg = 0; pg < N - GL; pg++) {
4279 			assert_faults(addr, pg);
4280 		}
4281 	}
4282 	for (int pg = N - GL; pg < N + 3 - GL; pg++) {
4283 		assert3u(peek(newaddr, pg), ==, 0);
4284 	}
4285 	for (int pg = N + 3 - GL; pg < N + 3; pg++) {
4286 		assert_faults(newaddr, pg);
4287 	}
4288 	addr = newaddr;
4289 
4290 
4291 	/*
4292 	 *	Shrink to N - 2 pages
4293 	 */
4294 	newaddr = kmem_realloc_guard(map, addr, ptoa(N + 3), ptoa(N - 2),
4295 	    kind | KMR_ZERO, guard).kmr_address;
4296 	assert3u(map->size, ==, ptoa(N - 2));
4297 	assert3u(newaddr, ==, addr);
4298 	kmem_test_assert_map(map, N - 2, 1);
4299 
4300 	for (int pg = 0; pg < N - 2 - GL; pg++) {
4301 		assert3u(peek(addr, pg), ==, 42 + pg);
4302 	}
4303 	for (int pg = N - 2 - GL; pg < N + 3; pg++) {
4304 		assert_faults(addr, pg);
4305 	}
4306 
4307 	kmem_free_guard(map, addr, ptoa(N - 2), KMF_NONE, guard);
4308 	kmem_test_assert_map(map, 0, 0);
4309 }
4310 
4311 static int
4312 kmem_basic_test(__unused int64_t in, int64_t *out)
4313 {
4314 	mach_vm_offset_t addr;
4315 	vm_map_t map;
4316 
4317 	printf("%s: test running\n", __func__);
4318 
4319 	map = kmem_suballoc(kernel_map, &addr, 64U << 20,
4320 	        VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
4321 	        KMS_NOFAIL | KMS_DATA, VM_KERN_MEMORY_DIAG).kmr_submap;
4322 
4323 	printf("%s: kmem_alloc ...\n", __func__);
4324 	kmem_alloc_basic_test(map);
4325 	printf("%s:     PASS\n", __func__);
4326 
4327 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
4328 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD);
4329 	printf("%s:     PASS\n", __func__);
4330 
4331 	printf("%s: kmem_realloc (KMR_FREEOLD) ...\n", __func__);
4332 	kmem_realloc_basic_test(map, KMR_FREEOLD);
4333 	printf("%s:     PASS\n", __func__);
4334 
4335 	printf("%s: kmem_realloc (KMR_NONE) ...\n", __func__);
4336 	kmem_realloc_basic_test(map, KMR_NONE);
4337 	printf("%s:     PASS\n", __func__);
4338 
4339 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
4340 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
4341 	printf("%s:     PASS\n", __func__);
4342 
4343 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
4344 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_LAST);
4345 	printf("%s:     PASS\n", __func__);
4346 
4347 	/* using KMR_DATA exercises the non-atomic realloc path */
4348 	printf("%s: kmem_realloc (KMR_DATA | KMR_FREEOLD) ...\n", __func__);
4349 	kmem_realloc_basic_test(map, KMR_DATA | KMR_FREEOLD);
4350 	printf("%s:     PASS\n", __func__);
4351 
4352 	printf("%s: kmem_realloc (KMR_DATA) ...\n", __func__);
4353 	kmem_realloc_basic_test(map, KMR_DATA);
4354 	printf("%s:     PASS\n", __func__);
4355 
4356 	kmem_free_guard(kernel_map, addr, 64U << 20, KMF_NONE, KMEM_GUARD_SUBMAP);
4357 	vm_map_deallocate(map);
4358 
4359 	printf("%s: test passed\n", __func__);
4360 	*out = 1;
4361 	return 0;
4362 }
4363 SYSCTL_TEST_REGISTER(kmem_basic, kmem_basic_test);
4364 
4365 static void
4366 kmem_test_get_size_idx_for_chunks(uint32_t chunks)
4367 {
4368 	uint32_t idx = kmem_get_size_idx_for_chunks(chunks);
4369 
4370 	assert(chunks >= kmem_size_array[idx].ks_num_chunk);
4371 }
4372 
4373 __attribute__((noinline))
4374 static void
4375 kmem_test_get_size_idx_for_all_chunks()
4376 {
4377 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
4378 		uint32_t chunks = kmem_size_array[i].ks_num_chunk;
4379 
4380 		if (chunks != 1) {
4381 			kmem_test_get_size_idx_for_chunks(chunks - 1);
4382 		}
4383 		kmem_test_get_size_idx_for_chunks(chunks);
4384 		kmem_test_get_size_idx_for_chunks(chunks + 1);
4385 	}
4386 }
4387 
4388 static int
4389 kmem_guard_obj_test(__unused int64_t in, int64_t *out)
4390 {
4391 	printf("%s: test running\n", __func__);
4392 
4393 	printf("%s: kmem_get_size_idx_for_chunks\n", __func__);
4394 	kmem_test_get_size_idx_for_all_chunks();
4395 	printf("%s:     PASS\n", __func__);
4396 
4397 	printf("%s: test passed\n", __func__);
4398 	*out = 1;
4399 	return 0;
4400 }
4401 SYSCTL_TEST_REGISTER(kmem_guard_obj, kmem_guard_obj_test);
4402 #endif /* DEBUG || DEVELOPMENT */
4403