xref: /xnu-10002.41.9/osfmk/vm/vm_kern.c (revision 699cd48037512bf4380799317ca44ca453c82f57)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_kern.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Kernel memory management.
64  */
65 
66 #include <mach/kern_return.h>
67 #include <mach/vm_param.h>
68 #include <kern/assert.h>
69 #include <kern/thread.h>
70 #include <vm/vm_kern.h>
71 #include <vm/vm_map_internal.h>
72 #include <vm/vm_object.h>
73 #include <vm/vm_page.h>
74 #include <vm/vm_compressor.h>
75 #include <vm/vm_pageout.h>
76 #include <vm/vm_init.h>
77 #include <vm/vm_fault.h>
78 #include <vm/vm_memtag.h>
79 #include <kern/misc_protos.h>
80 #include <vm/cpm.h>
81 #include <kern/ledger.h>
82 #include <kern/bits.h>
83 #include <kern/startup.h>
84 
85 #include <string.h>
86 
87 #include <libkern/OSDebug.h>
88 #include <libkern/crypto/sha2.h>
89 #include <libkern/section_keywords.h>
90 #include <sys/kdebug.h>
91 #include <sys/kdebug_triage.h>
92 
93 #include <san/kasan.h>
94 #include <kern/kext_alloc.h>
95 #include <kern/backtrace.h>
96 #include <os/hash.h>
97 #include <kern/zalloc_internal.h>
98 #include <libkern/crypto/rand.h>
99 
100 /*
101  *	Variables exported by this module.
102  */
103 
104 SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map;
105 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_ranges[KMEM_RANGE_COUNT];
106 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_large_ranges[KMEM_RANGE_COUNT];
107 
108 static TUNABLE(uint32_t, kmem_ptr_ranges, "kmem_ptr_ranges",
109     KMEM_RANGE_ID_NUM_PTR);
110 #define KMEM_GOBJ_THRESHOLD   (32ULL << 20)
111 #if DEBUG || DEVELOPMENT
112 #define KMEM_OUTLIER_LOG_SIZE (16ULL << 10)
113 #define KMEM_OUTLIER_SIZE      0
114 #define KMEM_OUTLIER_ALIGN     1
115 btlog_t kmem_outlier_log;
116 #endif /* DEBUG || DEVELOPMENT */
117 
118 __startup_data static vm_map_size_t data_range_size;
119 __startup_data static vm_map_size_t ptr_range_size;
120 __startup_data static vm_map_size_t sprayqtn_range_size;
121 
122 #pragma mark helpers
123 
124 __attribute__((overloadable))
125 __header_always_inline kmem_flags_t
126 ANYF(kma_flags_t flags)
127 {
128 	return (kmem_flags_t)flags;
129 }
130 
131 __attribute__((overloadable))
132 __header_always_inline kmem_flags_t
133 ANYF(kmr_flags_t flags)
134 {
135 	return (kmem_flags_t)flags;
136 }
137 
138 __attribute__((overloadable))
139 __header_always_inline kmem_flags_t
140 ANYF(kmf_flags_t flags)
141 {
142 	return (kmem_flags_t)flags;
143 }
144 
145 __abortlike
146 static void
147 __kmem_invalid_size_panic(
148 	vm_map_t        map,
149 	vm_size_t       size,
150 	uint32_t        flags)
151 {
152 	panic("kmem(map=%p, flags=0x%x): invalid size %zd",
153 	    map, flags, (size_t)size);
154 }
155 
156 __abortlike
157 static void
158 __kmem_invalid_arguments_panic(
159 	const char     *what,
160 	vm_map_t        map,
161 	vm_address_t    address,
162 	vm_size_t       size,
163 	uint32_t        flags)
164 {
165 	panic("kmem_%s(map=%p, addr=%p, size=%zd, flags=0x%x): "
166 	    "invalid arguments passed",
167 	    what, map, (void *)address, (size_t)size, flags);
168 }
169 
170 __abortlike
171 static void
172 __kmem_failed_panic(
173 	vm_map_t        map,
174 	vm_size_t       size,
175 	uint32_t        flags,
176 	kern_return_t   kr,
177 	const char     *what)
178 {
179 	panic("kmem_%s(%p, %zd, 0x%x): failed with %d",
180 	    what, map, (size_t)size, flags, kr);
181 }
182 
183 __abortlike
184 static void
185 __kmem_entry_not_found_panic(
186 	vm_map_t        map,
187 	vm_offset_t     addr)
188 {
189 	panic("kmem(map=%p) no entry found at %p", map, (void *)addr);
190 }
191 
192 static inline vm_object_t
193 __kmem_object(kmem_flags_t flags)
194 {
195 	if (flags & KMEM_COMPRESSOR) {
196 		if (flags & KMEM_KOBJECT) {
197 			panic("both KMEM_KOBJECT and KMEM_COMPRESSOR specified");
198 		}
199 		return compressor_object;
200 	}
201 	if (!(flags & KMEM_KOBJECT)) {
202 		panic("KMEM_KOBJECT or KMEM_COMPRESSOR is required");
203 	}
204 	return kernel_object_default;
205 }
206 
207 static inline vm_size_t
208 __kmem_guard_left(kmem_flags_t flags)
209 {
210 	return (flags & KMEM_GUARD_FIRST) ? PAGE_SIZE : 0;
211 }
212 
213 static inline vm_size_t
__kmem_guard_right(kmem_flags_t flags)214 __kmem_guard_right(kmem_flags_t flags)
215 {
216 	return (flags & KMEM_GUARD_LAST) ? PAGE_SIZE : 0;
217 }
218 
219 static inline vm_size_t
220 __kmem_guard_size(kmem_flags_t flags)
221 {
222 	return __kmem_guard_left(flags) + __kmem_guard_right(flags);
223 }
224 
225 __pure2
226 static inline vm_size_t
227 __kmem_entry_orig_size(vm_map_entry_t entry)
228 {
229 	vm_object_t object = VME_OBJECT(entry);
230 
231 	if (entry->vme_kernel_object) {
232 		return entry->vme_end - entry->vme_start -
233 		       entry->vme_object_or_delta;
234 	} else {
235 		return object->vo_size - object->vo_size_delta;
236 	}
237 }
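/*
 * [Editor's illustration, not part of the original source.]
 * Worked example of the bookkeeping above, assuming the delta field is
 * maintained the way kmem_alloc_guard_internal() does below: a request
 * for 0x1800 bytes is rounded up to a 0x2000-byte entry and the entry's
 * vme_object_or_delta records the 0x800 bytes of rounding, so the
 * original size is recovered as
 *
 *   (vme_end - vme_start) - vme_object_or_delta == 0x2000 - 0x800 == 0x1800
 *
 * For regular objects the same information lives in vo_size/vo_size_delta.
 */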
238 
239 
240 #pragma mark kmem range methods
241 
242 #if __arm64__
243 // <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
244 #define mach_vm_range_load(r, r_min, r_max) \
245 	asm("ldp %[rmin], %[rmax], [%[range]]" \
246 	    : [rmin] "=r"(r_min), [rmax] "=r"(r_max) \
247 	    : [range] "r"(r), "m"((r)->min_address), "m"((r)->max_address))
248 #else
249 #define mach_vm_range_load(r, rmin, rmax) \
250 	({ rmin = (r)->min_address; rmax = (r)->max_address; })
251 #endif
252 
253 __abortlike
254 static void
255 __mach_vm_range_overflow(
256 	mach_vm_offset_t        addr,
257 	mach_vm_offset_t        size)
258 {
259 	panic("invalid vm range: [0x%llx, 0x%llx + 0x%llx) wraps around",
260 	    addr, addr, size);
261 }
262 
263 __abortlike
264 static void
265 __mach_vm_range_invalid(
266 	mach_vm_offset_t        min_address,
267 	mach_vm_offset_t        max_address)
268 {
269 	panic("invalid vm range: [0x%llx, 0x%llx) wraps around",
270 	    min_address, max_address);
271 }
272 
273 __header_always_inline mach_vm_size_t
274 mach_vm_range_size(const struct mach_vm_range *r)
275 {
276 	mach_vm_offset_t rmin, rmax;
277 
278 	mach_vm_range_load(r, rmin, rmax);
279 	return rmax - rmin;
280 }
281 
282 __attribute__((overloadable))
283 __header_always_inline bool
284 mach_vm_range_contains(const struct mach_vm_range *r, mach_vm_offset_t addr)
285 {
286 	mach_vm_offset_t rmin, rmax;
287 
288 #if CONFIG_KERNEL_TAGGING
289 	if (VM_KERNEL_ADDRESS(addr)) {
290 		addr = vm_memtag_canonicalize_address(addr);
291 	}
292 #endif /* CONFIG_KERNEL_TAGGING */
293 
294 	/*
295 	 * The `&` is not a typo: we really expect the check to pass,
296 	 * so encourage the compiler to eagerly load and test without branches
297 	 */
298 	mach_vm_range_load(r, rmin, rmax);
299 	return (addr >= rmin) & (addr < rmax);
300 }
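/*
 * [Editor's note, not part of the original source.] The bitwise `&`
 * keeps both comparisons unconditional so the compiler can emit a
 * compare/conditional-compare sequence instead of two branches; the
 * short-circuiting equivalent would simply be
 *
 *   return addr >= rmin && addr < rmax;
 */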
301 
302 __attribute__((overloadable))
303 __header_always_inline bool
304 mach_vm_range_contains(
305 	const struct mach_vm_range *r,
306 	mach_vm_offset_t        addr,
307 	mach_vm_offset_t        size)
308 {
309 	mach_vm_offset_t rmin, rmax;
310 
311 #if CONFIG_KERNEL_TAGGING
312 	if (VM_KERNEL_ADDRESS(addr)) {
313 		addr = vm_memtag_canonicalize_address(addr);
314 	}
315 #endif /* CONFIG_KERNEL_TAGGING */
316 
317 	/*
318 	 * The `&` is not a typo: we really expect the check to pass,
319 	 * so encourage the compiler to eagerly load and test without branches
320 	 */
321 	mach_vm_range_load(r, rmin, rmax);
322 	return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
323 }
324 
325 __attribute__((overloadable))
326 __header_always_inline bool
327 mach_vm_range_intersects(
328 	const struct mach_vm_range *r1,
329 	const struct mach_vm_range *r2)
330 {
331 	mach_vm_offset_t r1_min, r1_max;
332 	mach_vm_offset_t r2_min, r2_max;
333 
334 	mach_vm_range_load(r1, r1_min, r1_max);
335 	r2_min = r2->min_address;
336 	r2_max = r2->max_address;
337 
338 	if (r1_min > r1_max) {
339 		__mach_vm_range_invalid(r1_min, r1_max);
340 	}
341 
342 	if (r2_min > r2_max) {
343 		__mach_vm_range_invalid(r2_min, r2_max);
344 	}
345 
346 	return r1_max > r2_min && r1_min < r2_max;
347 }
348 
349 __attribute__((overloadable))
350 __header_always_inline bool
351 mach_vm_range_intersects(
352 	const struct mach_vm_range *r1,
353 	mach_vm_offset_t        addr,
354 	mach_vm_offset_t        size)
355 {
356 	struct mach_vm_range r2;
357 
358 	addr = VM_KERNEL_STRIP_UPTR(addr);
359 	r2.min_address = addr;
360 	if (os_add_overflow(addr, size, &r2.max_address)) {
361 		__mach_vm_range_overflow(addr, size);
362 	}
363 
364 	return mach_vm_range_intersects(r1, &r2);
365 }
366 
367 bool
368 kmem_range_id_contains(
369 	kmem_range_id_t         range_id,
370 	vm_map_offset_t         addr,
371 	vm_map_size_t           size)
372 {
373 	return mach_vm_range_contains(&kmem_ranges[range_id], addr, size);
374 }
375 
376 __abortlike
377 static void
378 kmem_range_invalid_panic(
379 	kmem_range_id_t         range_id,
380 	vm_map_offset_t         addr,
381 	vm_map_size_t           size)
382 {
383 	const struct mach_vm_range *r = &kmem_ranges[range_id];
384 	mach_vm_offset_t rmin, rmax;
385 
386 	mach_vm_range_load(r, rmin, rmax);
387 	if (addr + size < rmin) {
388 		panic("addr %p + size %llu overflows %p", (void *)addr, size,
389 		    (void *)(addr + size));
390 	}
391 	panic("addr %p + size %llu doesnt fit in one range (id: %u min: %p max: %p)",
392 	    (void *)addr, size, range_id, (void *)rmin, (void *)rmax);
393 }
394 
395 /*
396  * Return whether the entire allocation is contained in the given range
397  */
398 static bool
399 kmem_range_contains_fully(
400 	kmem_range_id_t         range_id,
401 	vm_map_offset_t         addr,
402 	vm_map_size_t           size)
403 {
404 	const struct mach_vm_range *r = &kmem_ranges[range_id];
405 	mach_vm_offset_t rmin, rmax;
406 	bool result = false;
407 
408 	if (VM_KERNEL_ADDRESS(addr)) {
409 		addr = vm_memtag_canonicalize_address(addr);
410 	}
411 
412 	/*
413 	 * The `&` is not a typo: we really expect the check to pass,
414 	 * so encourage the compiler to eagerly load and test without branches
415 	 */
416 	mach_vm_range_load(r, rmin, rmax);
417 	result = (addr >= rmin) & (addr < rmax);
418 	if (__improbable(result
419 	    && ((addr + size < rmin) || (addr + size > rmax)))) {
420 		kmem_range_invalid_panic(range_id, addr, size);
421 	}
422 	return result;
423 }
424 
425 vm_map_size_t
426 kmem_range_id_size(kmem_range_id_t range_id)
427 {
428 	return mach_vm_range_size(&kmem_ranges[range_id]);
429 }
430 
431 kmem_range_id_t
432 kmem_addr_get_range(vm_map_offset_t addr, vm_map_size_t size)
433 {
434 	kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
435 
436 	for (; range_id < KMEM_RANGE_COUNT; range_id++) {
437 		if (kmem_range_contains_fully(range_id, addr, size)) {
438 			return range_id;
439 		}
440 	}
441 	return KMEM_RANGE_ID_NONE;
442 }
443 
444 bool
445 kmem_is_ptr_range(vm_map_range_id_t range_id)
446 {
447 	return (range_id >= KMEM_RANGE_ID_FIRST) &&
448 	       (range_id <= KMEM_RANGE_ID_NUM_PTR);
449 }
450 
451 __abortlike
452 static void
453 kmem_range_invalid_for_overwrite(vm_map_offset_t addr)
454 {
455 	panic("Can't overwrite mappings (addr: %p) in kmem ptr ranges",
456 	    (void *)addr);
457 }
458 
459 mach_vm_range_t
460 kmem_validate_range_for_overwrite(
461 	vm_map_offset_t         addr,
462 	vm_map_size_t           size)
463 {
464 	vm_map_range_id_t range_id = kmem_addr_get_range(addr, size);
465 
466 	if (kmem_is_ptr_range(range_id)) {
467 		kmem_range_invalid_for_overwrite(addr);
468 	}
469 
470 	return &kmem_ranges[range_id];
471 }
472 
473 
474 #pragma mark entry parameters
475 
476 
477 __abortlike
478 static void
479 __kmem_entry_validate_panic(
480 	vm_map_t        map,
481 	vm_map_entry_t  entry,
482 	vm_offset_t     addr,
483 	vm_size_t       size,
484 	uint32_t        flags,
485 	kmem_guard_t    guard)
486 {
487 	const char *what = "???";
488 
489 	if (entry->vme_atomic != guard.kmg_atomic) {
490 		what = "atomicity";
491 	} else if (entry->is_sub_map != guard.kmg_submap) {
492 		what = "objectness";
493 	} else if (addr != entry->vme_start) {
494 		what = "left bound";
495 	} else if ((flags & KMF_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
496 		what = "right bound";
497 	} else if (guard.kmg_context != entry->vme_context) {
498 		what = "guard";
499 	}
500 
501 	panic("kmem(map=%p, addr=%p, size=%zd, flags=0x%x): "
502 	    "entry:%p %s mismatch guard(0x%08x)",
503 	    map, (void *)addr, size, flags, entry,
504 	    what, guard.kmg_context);
505 }
506 
507 static bool
508 __kmem_entry_validate_guard(
509 	vm_map_entry_t  entry,
510 	vm_offset_t     addr,
511 	vm_size_t       size,
512 	kmem_flags_t    flags,
513 	kmem_guard_t    guard)
514 {
515 	if (entry->vme_atomic != guard.kmg_atomic) {
516 		return false;
517 	}
518 
519 	if (!guard.kmg_atomic) {
520 		return true;
521 	}
522 
523 	if (entry->is_sub_map != guard.kmg_submap) {
524 		return false;
525 	}
526 
527 	if (addr != entry->vme_start) {
528 		return false;
529 	}
530 
531 	if ((flags & KMEM_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
532 		return false;
533 	}
534 
535 	if (!guard.kmg_submap && guard.kmg_context != entry->vme_context) {
536 		return false;
537 	}
538 
539 	return true;
540 }
541 
542 void
543 kmem_entry_validate_guard(
544 	vm_map_t        map,
545 	vm_map_entry_t  entry,
546 	vm_offset_t     addr,
547 	vm_size_t       size,
548 	kmem_guard_t    guard)
549 {
550 	if (!__kmem_entry_validate_guard(entry, addr, size, KMEM_NONE, guard)) {
551 		__kmem_entry_validate_panic(map, entry, addr, size, KMEM_NONE, guard);
552 	}
553 }
554 
555 __abortlike
556 static void
557 __kmem_entry_validate_object_panic(
558 	vm_map_t        map,
559 	vm_map_entry_t  entry,
560 	kmem_flags_t    flags)
561 {
562 	const char *what;
563 	const char *verb;
564 
565 	if (entry->is_sub_map) {
566 		panic("kmem(map=%p) entry %p is a submap", map, entry);
567 	}
568 
569 	if (flags & KMEM_KOBJECT) {
570 		what = "kernel";
571 		verb = "isn't";
572 	} else if (flags & KMEM_COMPRESSOR) {
573 		what = "compressor";
574 		verb = "isn't";
575 	} else if (entry->vme_kernel_object) {
576 		what = "kernel";
577 		verb = "is unexpectedly";
578 	} else {
579 		what = "compressor";
580 		verb = "is unexpectedly";
581 	}
582 
583 	panic("kmem(map=%p, flags=0x%x): entry %p %s for the %s object",
584 	    map, flags, entry, verb, what);
585 }
586 
587 static bool
588 __kmem_entry_validate_object(
589 	vm_map_entry_t  entry,
590 	kmem_flags_t    flags)
591 {
592 	if (entry->is_sub_map) {
593 		return false;
594 	}
595 	if ((bool)(flags & KMEM_KOBJECT) != entry->vme_kernel_object) {
596 		return false;
597 	}
598 
599 	return (bool)(flags & KMEM_COMPRESSOR) ==
600 	       (VME_OBJECT(entry) == compressor_object);
601 }
602 
603 vm_size_t
604 kmem_size_guard(
605 	vm_map_t        map,
606 	vm_offset_t     addr,
607 	kmem_guard_t    guard)
608 {
609 	kmem_flags_t flags = KMEM_GUESS_SIZE;
610 	vm_map_entry_t entry;
611 	vm_size_t size;
612 
613 	vm_map_lock_read(map);
614 
615 #if KASAN_CLASSIC
616 	addr -= PAGE_SIZE;
617 #endif /* KASAN_CLASSIC */
618 	addr = vm_memtag_canonicalize_address(addr);
619 
620 	if (!vm_map_lookup_entry(map, addr, &entry)) {
621 		__kmem_entry_not_found_panic(map, addr);
622 	}
623 
624 	if (!__kmem_entry_validate_guard(entry, addr, 0, flags, guard)) {
625 		__kmem_entry_validate_panic(map, entry, addr, 0, flags, guard);
626 	}
627 
628 	size = __kmem_entry_orig_size(entry);
629 
630 	vm_map_unlock_read(map);
631 
632 	return size;
633 }
634 
635 static inline uint16_t
636 kmem_hash_backtrace(
637 	void                     *fp)
638 {
639 	uint64_t  bt_count;
640 	uintptr_t bt[8] = {};
641 
642 	struct backtrace_control ctl = {
643 		.btc_frame_addr = (uintptr_t)fp,
644 	};
645 
646 	bt_count = backtrace(bt, sizeof(bt) / sizeof(bt[0]), &ctl, NULL);
647 	return (uint16_t) os_hash_jenkins(bt, bt_count * sizeof(bt[0]));
648 }
649 
650 static_assert(KMEM_RANGE_ID_DATA - 1 <= KMEM_RANGE_MASK,
651     "Insufficient bits to represent ptr ranges");
652 
653 kmem_range_id_t
654 kmem_adjust_range_id(
655 	uint32_t                  hash)
656 {
657 	return (kmem_range_id_t) (KMEM_RANGE_ID_PTR_0 +
658 	       (hash & KMEM_RANGE_MASK) % kmem_ptr_ranges);
659 }
660 
661 static bool
662 kmem_use_sprayqtn(
663 	kma_flags_t               kma_flags,
664 	vm_map_size_t             map_size,
665 	vm_offset_t               mask)
666 {
667 	/*
668 	 * Pointer allocations that are above the guard objects threshold or have
669 	 * leading guard pages with non standard alignment requests are redirected
670 	 * to the sprayqtn range.
671 	 */
672 #if DEBUG || DEVELOPMENT
673 	btref_get_flags_t flags = (kma_flags & KMA_NOPAGEWAIT) ?
674 	    BTREF_GET_NOWAIT : 0;
675 
676 	if ((kma_flags & KMA_SPRAYQTN) == 0) {
677 		if (map_size > KMEM_GOBJ_THRESHOLD) {
678 			btlog_record(kmem_outlier_log, (void *)map_size, KMEM_OUTLIER_SIZE,
679 			    btref_get(__builtin_frame_address(0), flags));
680 		} else if ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK)) {
681 			btlog_record(kmem_outlier_log, (void *)mask, KMEM_OUTLIER_ALIGN,
682 			    btref_get(__builtin_frame_address(0), flags));
683 		}
684 	}
685 #endif /* DEBUG || DEVELOPMENT */
686 
687 	return (kma_flags & KMA_SPRAYQTN) ||
688 	       (map_size > KMEM_GOBJ_THRESHOLD) ||
689 	       ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK));
690 }
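/*
 * [Editor's note, not part of the original source.] Summarizing the check
 * above: an allocation goes to the spray-quarantine range when the caller
 * passes KMA_SPRAYQTN explicitly, when it exceeds KMEM_GOBJ_THRESHOLD
 * (32MB), or when it wants a leading guard page together with an alignment
 * mask larger than a page. A hypothetical 64MB pointer allocation, for
 * instance, is redirected there even without KMA_SPRAYQTN.
 */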
691 
692 static void
693 kmem_apply_security_policy(
694 	vm_map_t                  map,
695 	kma_flags_t               kma_flags,
696 	kmem_guard_t              guard,
697 	vm_map_size_t             map_size,
698 	vm_offset_t               mask,
699 	vm_map_kernel_flags_t    *vmk_flags,
700 	bool                      assert_dir __unused)
701 {
702 	kmem_range_id_t range_id;
703 	bool from_right;
704 	uint16_t type_hash = guard.kmg_type_hash;
705 
706 	if (startup_phase < STARTUP_SUB_KMEM || map != kernel_map) {
707 		return;
708 	}
709 
710 	/*
711 	 * A non-zero type-hash must be passed by krealloc_type
712 	 */
713 #if (DEBUG || DEVELOPMENT)
714 	if (assert_dir && !(kma_flags & KMA_DATA)) {
715 		assert(type_hash != 0);
716 	}
717 #endif
718 
719 	if (kma_flags & KMA_DATA) {
720 		range_id  = KMEM_RANGE_ID_DATA;
721 		/*
722 		 * As an optimization in KMA_DATA to avoid fragmentation,
723 		 * allocate static carveouts at the end of the DATA range.
724 		 */
725 		from_right = (bool)(kma_flags & KMA_PERMANENT);
726 	} else if (kmem_use_sprayqtn(kma_flags, map_size, mask)) {
727 		range_id = KMEM_RANGE_ID_SPRAYQTN;
728 		from_right = (bool)(kma_flags & KMA_PERMANENT);
729 	} else if (type_hash) {
730 		range_id  = (kmem_range_id_t)(type_hash & KMEM_RANGE_MASK);
731 		from_right = type_hash & KMEM_DIRECTION_MASK;
732 	} else {
733 		/*
734 		 * Range id needs to correspond to one of the PTR ranges
735 		 */
736 		type_hash = (uint16_t) kmem_hash_backtrace(__builtin_frame_address(0));
737 		range_id  = kmem_adjust_range_id(type_hash);
738 		from_right = type_hash & KMEM_DIRECTION_MASK;
739 	}
740 
741 	vmk_flags->vmkf_range_id = range_id;
742 	vmk_flags->vmkf_last_free = from_right;
743 }
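/*
 * [Editor's summary, not part of the original source.] The resulting
 * placement policy is roughly:
 *
 *   KMA_DATA                    -> KMEM_RANGE_ID_DATA (KMA_PERMANENT
 *                                  carveouts packed from the right)
 *   kmem_use_sprayqtn() true    -> KMEM_RANGE_ID_SPRAYQTN
 *   non-zero kmg_type_hash      -> pointer range and direction encoded
 *                                  in the hash
 *   otherwise                   -> pointer range picked from a hash of
 *                                  the caller's backtrace
 */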
744 
745 #pragma mark allocation
746 
747 static kmem_return_t
748 kmem_alloc_guard_internal(
749 	vm_map_t                map,
750 	vm_size_t               size,
751 	vm_offset_t             mask,
752 	kma_flags_t             flags,
753 	kmem_guard_t            guard,
754 	kern_return_t         (^alloc_pages)(vm_size_t, kma_flags_t, vm_page_t *))
755 {
756 	vm_object_t             object;
757 	vm_offset_t             delta = 0;
758 	vm_map_entry_t          entry = NULL;
759 	vm_map_offset_t         map_addr, fill_start;
760 	vm_map_size_t           map_size, fill_size;
761 	vm_page_t               guard_left = VM_PAGE_NULL;
762 	vm_page_t               guard_right = VM_PAGE_NULL;
763 	vm_page_t               wired_page_list = VM_PAGE_NULL;
764 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
765 	bool                    skip_guards;
766 	kmem_return_t           kmr = { };
767 
768 	assert(kernel_map && map->pmap == kernel_pmap);
769 
770 #if DEBUG || DEVELOPMENT
771 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
772 	    size, 0, 0, 0);
773 #endif
774 
775 	if (size == 0 ||
776 	    (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) ||
777 	    (size < __kmem_guard_size(ANYF(flags)))) {
778 		__kmem_invalid_size_panic(map, size, flags);
779 	}
780 
781 	/*
782 	 * limit the size of a single extent of wired memory
783 	 * to try and limit the damage to the system if
784 	 * too many pages get wired down
785 	 * limit raised to 2GB with 128GB max physical limit,
786 	 * but scaled by installed memory above this
787 	 *
788 	 * Note: kmem_alloc_contig_guard() is immune to this check.
789 	 */
790 	if (__improbable(!(flags & (KMA_VAONLY | KMA_PAGEABLE)) &&
791 	    alloc_pages == NULL &&
792 	    size > MAX(1ULL << 31, sane_size / 64))) {
793 		kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
794 		goto out_error;
795 	}
796 
797 	/*
798 	 * Guard pages:
799 	 *
800 	 * Guard pages are implemented as fictitious pages.
801 	 *
802 	 * However, some maps, and some objects are known
803 	 * to manage their memory explicitly, and do not need
804 	 * those to be materialized, which saves memory.
805 	 *
806 	 * By placing guard pages on either end of a stack,
807 	 * they can help detect cases where a thread walks
808 	 * off either end of its stack.
809 	 *
810 	 * They are allocated and set up here and attempts
811 	 * to access those pages are trapped in vm_fault_page().
812 	 *
813 	 * The map_size we were passed may include extra space for
814 	 * guard pages. fill_size represents the actual size to populate.
815 	 * Similarly, fill_start indicates where the actual pages
816 	 * will begin in the range.
817 	 */
818 
819 	map_size   = round_page(size);
820 	fill_start = 0;
821 	fill_size  = map_size - __kmem_guard_size(ANYF(flags));
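	/*
	 * [Editor's illustration, not part of the original source.]
	 * Hypothetical example: a caller asking for four usable pages with
	 * KMA_GUARD_FIRST | KMA_GUARD_LAST passes a `size` that already
	 * includes the two guard pages (six pages total).  Here map_size is
	 * six pages and fill_size four; fill_start becomes PAGE_SIZE below,
	 * so only the four interior pages are backed by real memory.
	 */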
822 
823 #if KASAN_CLASSIC
824 	if (flags & KMA_KASAN_GUARD) {
825 		assert((flags & (KMA_GUARD_FIRST | KMA_GUARD_LAST)) == 0);
826 		flags |= KMA_GUARD_FIRST | KMA_GUARD_LAST;
827 		delta     = ptoa(2);
828 		map_size += delta;
829 	}
830 #else
831 	(void)delta;
832 #endif /* KASAN_CLASSIC */
833 
834 	skip_guards = (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) ||
835 	    map->never_faults;
836 
837 	if (flags & KMA_GUARD_FIRST) {
838 		vmk_flags.vmkf_guard_before = true;
839 		fill_start += PAGE_SIZE;
840 	}
841 	if ((flags & KMA_GUARD_FIRST) && !skip_guards) {
842 		guard_left = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
843 		if (__improbable(guard_left == VM_PAGE_NULL)) {
844 			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
845 			goto out_error;
846 		}
847 	}
848 	if ((flags & KMA_GUARD_LAST) && !skip_guards) {
849 		guard_right = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
850 		if (__improbable(guard_right == VM_PAGE_NULL)) {
851 			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
852 			goto out_error;
853 		}
854 	}
855 
856 	if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
857 		if (alloc_pages) {
858 			kmr.kmr_return = alloc_pages(fill_size, flags,
859 			    &wired_page_list);
860 		} else {
861 			kmr.kmr_return = vm_page_alloc_list(atop(fill_size), flags,
862 			    &wired_page_list);
863 		}
864 		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
865 			goto out_error;
866 		}
867 	}
868 
869 	/*
870 	 *	Allocate a new object (if necessary).  We must do this before
871 	 *	locking the map, or risk deadlock with the default pager.
872 	 */
873 	if (flags & KMA_KOBJECT) {
874 		object = kernel_object_default;
875 		vm_object_reference(object);
876 	} else if (flags & KMA_COMPRESSOR) {
877 		object = compressor_object;
878 		vm_object_reference(object);
879 	} else {
880 		object = vm_object_allocate(map_size);
881 		vm_object_set_size(object, map_size, size);
882 		/* stabilize the object to prevent shadowing */
883 		object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
884 		object->true_share = TRUE;
885 	}
886 
887 	if (flags & KMA_LAST_FREE) {
888 		vmk_flags.vmkf_last_free = true;
889 	}
890 	if (flags & KMA_PERMANENT) {
891 		vmk_flags.vmf_permanent = true;
892 	}
893 	kmem_apply_security_policy(map, flags, guard, map_size, mask, &vmk_flags,
894 	    false);
895 
896 	kmr.kmr_return = vm_map_find_space(map, 0, map_size, mask,
897 	    vmk_flags, &entry);
898 	if (__improbable(KERN_SUCCESS != kmr.kmr_return)) {
899 		vm_object_deallocate(object);
900 		goto out_error;
901 	}
902 
903 	map_addr = entry->vme_start;
904 	VME_OBJECT_SET(entry, object, guard.kmg_atomic, guard.kmg_context);
905 	VME_ALIAS_SET(entry, guard.kmg_tag);
906 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
907 		VME_OFFSET_SET(entry, map_addr);
908 	}
909 
910 #if KASAN
911 	if ((flags & KMA_KOBJECT) && guard.kmg_atomic) {
912 		entry->vme_object_or_delta = (-size & PAGE_MASK) + delta;
913 	}
914 #endif /* KASAN */
915 
916 	if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE))) {
917 		entry->wired_count = 1;
918 		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
919 	}
920 
921 	if (guard_left || guard_right || wired_page_list) {
922 		vm_object_offset_t offset = 0ull;
923 
924 		vm_object_lock(object);
925 		vm_map_unlock(map);
926 
927 		if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
928 			offset = map_addr;
929 		}
930 
931 		if (guard_left) {
932 			vm_page_insert(guard_left, object, offset);
933 			guard_left->vmp_busy = FALSE;
934 			guard_left = VM_PAGE_NULL;
935 		}
936 
937 		if (guard_right) {
938 			vm_page_insert(guard_right, object,
939 			    offset + fill_start + fill_size);
940 			guard_right->vmp_busy = FALSE;
941 			guard_right = VM_PAGE_NULL;
942 		}
943 
944 		if (wired_page_list) {
945 			kernel_memory_populate_object_and_unlock(object,
946 			    map_addr + fill_start, offset + fill_start, fill_size,
947 			    wired_page_list, flags, guard.kmg_tag, VM_PROT_DEFAULT);
948 		} else {
949 			vm_object_unlock(object);
950 		}
951 	} else {
952 		vm_map_unlock(map);
953 	}
954 
955 	/*
956 	 * now that the pages are wired, we no longer have to fear coalesce
957 	 */
958 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
959 		vm_map_simplify(map, map_addr);
960 	}
961 
962 #if DEBUG || DEVELOPMENT
963 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
964 	    atop(fill_size), 0, 0, 0);
965 #endif /* DEBUG || DEVELOPMENT */
966 	kmr.kmr_address = CAST_DOWN(vm_offset_t, map_addr);
967 
968 #if KASAN
969 	if (flags & (KMA_KASAN_GUARD | KMA_PAGEABLE)) {
970 		/*
971 		 * We need to allow the range for pageable memory,
972 		 * or faulting will not be allowed.
973 		 */
974 		kasan_notify_address(map_addr, map_size);
975 	}
976 #endif /* KASAN */
977 #if KASAN_CLASSIC
978 	if (flags & KMA_KASAN_GUARD) {
979 		kmr.kmr_address += PAGE_SIZE;
980 		kasan_alloc_large(kmr.kmr_address, size);
981 	}
982 #endif /* KASAN_CLASSIC */
983 #if CONFIG_KERNEL_TAGGING
984 	if (!(flags & KMA_VAONLY) && (flags & KMA_TAG)) {
985 		kmr.kmr_address = vm_memtag_assign_tag(kmr.kmr_address, size);
986 		vm_memtag_set_tag((vm_offset_t)kmr.kmr_address, size);
987 #if KASAN_TBI
988 		kasan_tbi_retag_unused_space((vm_offset_t)kmr.kmr_address, map_size, size);
989 #endif /* KASAN_TBI */
990 	}
991 #endif /* CONFIG_KERNEL_TAGGING */
992 	return kmr;
993 
994 out_error:
995 	if (flags & KMA_NOFAIL) {
996 		__kmem_failed_panic(map, size, flags, kmr.kmr_return, "alloc");
997 	}
998 	if (guard_left) {
999 		guard_left->vmp_snext = wired_page_list;
1000 		wired_page_list = guard_left;
1001 	}
1002 	if (guard_right) {
1003 		guard_right->vmp_snext = wired_page_list;
1004 		wired_page_list = guard_right;
1005 	}
1006 	if (wired_page_list) {
1007 		vm_page_free_list(wired_page_list, FALSE);
1008 	}
1009 
1010 #if DEBUG || DEVELOPMENT
1011 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
1012 	    0, 0, 0, 0);
1013 #endif /* DEBUG || DEVELOPMENT */
1014 
1015 	return kmr;
1016 }
1017 
1018 kmem_return_t
1019 kmem_alloc_guard(
1020 	vm_map_t        map,
1021 	vm_size_t       size,
1022 	vm_offset_t     mask,
1023 	kma_flags_t     flags,
1024 	kmem_guard_t    guard)
1025 {
1026 	return kmem_alloc_guard_internal(map, size, mask, flags, guard, NULL);
1027 }
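/*
 * [Editor's sketch, not part of the original source; the tag and context
 * values are made up for illustration.] A wired, guarded kernel-object
 * allocation might be requested roughly like this:
 *
 *   kmem_guard_t guard = {
 *       .kmg_atomic  = true,
 *       .kmg_tag     = VM_KERN_MEMORY_KALLOC,
 *       .kmg_context = 0x1234,
 *   };
 *   kmem_return_t kmr = kmem_alloc_guard(kernel_map, 6 * PAGE_SIZE, 0,
 *       KMA_KOBJECT | KMA_ZERO | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard);
 *   if (kmr.kmr_return == KERN_SUCCESS) {
 *       // kmr.kmr_address spans the whole entry, leading guard included;
 *       // the same guard must be passed back to kmem_free_guard().
 *   }
 */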
1028 
1029 kmem_return_t
1030 kmem_alloc_contig_guard(
1031 	vm_map_t                map,
1032 	vm_size_t               size,
1033 	vm_offset_t             mask,
1034 	ppnum_t                 max_pnum,
1035 	ppnum_t                 pnum_mask,
1036 	kma_flags_t             flags,
1037 	kmem_guard_t            guard)
1038 {
1039 	__auto_type alloc_pages = ^(vm_size_t fill_size, kma_flags_t kma_flags, vm_page_t *pages) {
1040 		return cpm_allocate(fill_size, pages, max_pnum, pnum_mask, FALSE, kma_flags);
1041 	};
1042 
1043 	return kmem_alloc_guard_internal(map, size, mask, flags, guard, alloc_pages);
1044 }
1045 
1046 kmem_return_t
1047 kmem_suballoc(
1048 	vm_map_t                parent,
1049 	mach_vm_offset_t       *addr,
1050 	vm_size_t               size,
1051 	vm_map_create_options_t vmc_options,
1052 	int                     vm_flags,
1053 	kms_flags_t             flags,
1054 	vm_tag_t                tag)
1055 {
1056 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1057 	vm_map_offset_t map_addr = 0;
1058 	kmem_return_t kmr = { };
1059 	vm_map_t map;
1060 
1061 	assert(page_aligned(size));
1062 	assert(parent->pmap == kernel_pmap);
1063 
1064 	vm_map_kernel_flags_set_vmflags(&vmk_flags, vm_flags, tag);
1065 
1066 	if (parent == kernel_map) {
1067 		assert(vmk_flags.vmf_overwrite || (flags & KMS_DATA));
1068 	}
1069 
1070 	if (vmk_flags.vmf_fixed) {
1071 		map_addr = trunc_page(*addr);
1072 	}
1073 
1074 	pmap_reference(vm_map_pmap(parent));
1075 	map = vm_map_create_options(vm_map_pmap(parent), 0, size, vmc_options);
1076 
1077 	/*
1078 	 * 1. vm_map_enter() will consume one ref on success.
1079 	 *
1080 	 * 2. make the entry atomic as kernel submaps should never be split.
1081 	 *
1082 	 * 3. instruct vm_map_enter() that it is a fresh submap
1083 	 *    that needs to be taught its bounds as it inserted.
1084 	 */
1085 	vm_map_reference(map);
1086 
1087 	vmk_flags.vmkf_submap = true;
1088 	if ((flags & KMS_DATA) == 0) {
1089 		/* FIXME: IOKit submaps get fragmented and can't be atomic */
1090 		vmk_flags.vmkf_submap_atomic = true;
1091 	}
1092 	vmk_flags.vmkf_submap_adjust = true;
1093 	if (flags & KMS_LAST_FREE) {
1094 		vmk_flags.vmkf_last_free = true;
1095 	}
1096 	if (flags & KMS_PERMANENT) {
1097 		vmk_flags.vmf_permanent = true;
1098 	}
1099 	if (flags & KMS_DATA) {
1100 		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
1101 	}
1102 
1103 	kmr.kmr_return = vm_map_enter(parent, &map_addr, size, 0,
1104 	    vmk_flags, (vm_object_t)map, 0, FALSE,
1105 	    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
1106 
1107 	if (kmr.kmr_return != KERN_SUCCESS) {
1108 		if (flags & KMS_NOFAIL) {
1109 			panic("kmem_suballoc(map=%p, size=%zd) failed with %d",
1110 			    parent, size, kmr.kmr_return);
1111 		}
1112 		assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1113 		vm_map_deallocate(map);
1114 		vm_map_deallocate(map); /* also removes ref to pmap */
1115 		return kmr;
1116 	}
1117 
1118 	/*
1119 	 * For kmem_suballocs that register a claim and are assigned a range, ensure
1120 	 * that the exact same range is returned.
1121 	 */
1122 	if (*addr != 0 && parent == kernel_map &&
1123 	    startup_phase > STARTUP_SUB_KMEM) {
1124 		assert(CAST_DOWN(vm_offset_t, map_addr) == *addr);
1125 	} else {
1126 		*addr = map_addr;
1127 	}
1128 
1129 	kmr.kmr_submap = map;
1130 	return kmr;
1131 }
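/*
 * [Editor's sketch, not part of the original source; the create options,
 * flags and tag are illustrative only.] A permanent data submap might be
 * carved out roughly like this:
 *
 *   mach_vm_offset_t addr = 0;
 *   kmem_return_t kmr = kmem_suballoc(kernel_map, &addr, submap_size,
 *       VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
 *       KMS_PERMANENT | KMS_DATA | KMS_NOFAIL, VM_KERN_MEMORY_IOKIT);
 *
 *   // On success kmr.kmr_submap is the new map and *addr its base address.
 */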
1132 
1133 /*
1134  *	kmem_alloc:
1135  *
1136  *	Allocate wired-down memory in the kernel's address map
1137  *	or a submap.  The memory is not zero-filled.
1138  */
1139 
1140 __exported kern_return_t
1141 kmem_alloc_external(
1142 	vm_map_t        map,
1143 	vm_offset_t     *addrp,
1144 	vm_size_t       size);
1145 kern_return_t
1146 kmem_alloc_external(
1147 	vm_map_t        map,
1148 	vm_offset_t     *addrp,
1149 	vm_size_t       size)
1150 {
1151 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1152 		return kmem_alloc(map, addrp, size, KMA_NONE, vm_tag_bt());
1153 	}
1154 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1155 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1156 }
1157 
1158 
1159 /*
1160  *	kmem_alloc_kobject:
1161  *
1162  *	Allocate wired-down memory in the kernel's address map
1163  *	or a submap.  The memory is not zero-filled.
1164  *
1165  *	The memory is allocated in the kernel_object.
1166  *	It may not be copied with vm_map_copy, and
1167  *	it may not be reallocated with kmem_realloc.
1168  */
1169 
1170 __exported kern_return_t
1171 kmem_alloc_kobject_external(
1172 	vm_map_t        map,
1173 	vm_offset_t     *addrp,
1174 	vm_size_t       size);
1175 kern_return_t
1176 kmem_alloc_kobject_external(
1177 	vm_map_t        map,
1178 	vm_offset_t     *addrp,
1179 	vm_size_t       size)
1180 {
1181 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1182 		return kmem_alloc(map, addrp, size, KMA_KOBJECT, vm_tag_bt());
1183 	}
1184 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1185 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1186 }
1187 
1188 /*
1189  *	kmem_alloc_pageable:
1190  *
1191  *	Allocate pageable memory in the kernel's address map.
1192  */
1193 
1194 __exported kern_return_t
1195 kmem_alloc_pageable_external(
1196 	vm_map_t        map,
1197 	vm_offset_t     *addrp,
1198 	vm_size_t       size);
1199 kern_return_t
1200 kmem_alloc_pageable_external(
1201 	vm_map_t        map,
1202 	vm_offset_t     *addrp,
1203 	vm_size_t       size)
1204 {
1205 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1206 		return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA, vm_tag_bt());
1207 	}
1208 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1209 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1210 }
1211 
1212 
1213 #pragma mark population
1214 
1215 static void
1216 kernel_memory_populate_pmap_enter(
1217 	vm_object_t             object,
1218 	vm_address_t            addr,
1219 	vm_object_offset_t      offset,
1220 	vm_page_t               mem,
1221 	vm_prot_t               prot,
1222 	int                     pe_flags)
1223 {
1224 	kern_return_t   pe_result;
1225 	int             pe_options;
1226 
1227 	if (VMP_ERROR_GET(mem)) {
1228 		panic("VM page %p should not have an error", mem);
1229 	}
1230 
1231 	pe_options = PMAP_OPTIONS_NOWAIT;
1232 	if (object->internal) {
1233 		pe_options |= PMAP_OPTIONS_INTERNAL;
1234 	}
1235 	if (mem->vmp_reusable || object->all_reusable) {
1236 		pe_options |= PMAP_OPTIONS_REUSABLE;
1237 	}
1238 
1239 	pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1240 	    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1241 	    pe_flags, /* wired */ TRUE, pe_options, NULL);
1242 
1243 	if (pe_result == KERN_RESOURCE_SHORTAGE) {
1244 		vm_object_unlock(object);
1245 
1246 		pe_options &= ~PMAP_OPTIONS_NOWAIT;
1247 
1248 		pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1249 		    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1250 		    pe_flags, /* wired */ TRUE, pe_options, NULL);
1251 
1252 		vm_object_lock(object);
1253 	}
1254 
1255 	assert(pe_result == KERN_SUCCESS);
1256 }
1257 
1258 void
1259 kernel_memory_populate_object_and_unlock(
1260 	vm_object_t     object, /* must be locked */
1261 	vm_address_t    addr,
1262 	vm_offset_t     offset,
1263 	vm_size_t       size,
1264 	vm_page_t       page_list,
1265 	kma_flags_t     flags,
1266 	vm_tag_t        tag,
1267 	vm_prot_t       prot)
1268 {
1269 	vm_page_t       mem;
1270 	int             pe_flags;
1271 	bool            gobbled_list = page_list && page_list->vmp_gobbled;
1272 
1273 	assert(((flags & KMA_KOBJECT) != 0) == (is_kernel_object(object) != 0));
1274 	assert3u((bool)(flags & KMA_COMPRESSOR), ==, object == compressor_object);
1275 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
1276 		assert3u(offset, ==, addr);
1277 	} else {
1278 		/*
1279 		 * kernel_memory_populate_pmap_enter() might drop the object
1280 		 * lock, and the caller might not own a reference anymore
1281 		 * and rely on holding the vm object lock for liveness.
1282 		 */
1283 		vm_object_reference_locked(object);
1284 	}
1285 
1286 	if (flags & KMA_KSTACK) {
1287 		pe_flags = VM_MEM_STACK;
1288 	} else {
1289 		pe_flags = 0;
1290 	}
1291 
1292 
1293 	for (vm_object_offset_t pg_offset = 0;
1294 	    pg_offset < size;
1295 	    pg_offset += PAGE_SIZE_64) {
1296 		if (page_list == NULL) {
1297 			panic("%s: page_list too short", __func__);
1298 		}
1299 
1300 		mem = page_list;
1301 		page_list = mem->vmp_snext;
1302 		mem->vmp_snext = NULL;
1303 
1304 		assert(mem->vmp_wire_count == 0);
1305 		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
1306 		assert(!mem->vmp_fictitious && !mem->vmp_private);
1307 
1308 		if (flags & KMA_COMPRESSOR) {
1309 			mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
1310 			/*
1311 			 * Background processes doing I/O accounting can call
1312 			 * into NVME driver to do some work which results in
1313 			 * an allocation here and so we want to make sure
1314 			 * that the pages used by compressor, regardless of
1315 			 * process context, are never on the special Q.
1316 			 */
1317 			mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
1318 
1319 			vm_page_insert(mem, object, offset + pg_offset);
1320 		} else {
1321 			mem->vmp_q_state = VM_PAGE_IS_WIRED;
1322 			mem->vmp_wire_count = 1;
1323 
1324 			vm_page_insert_wired(mem, object, offset + pg_offset, tag);
1325 		}
1326 
1327 		mem->vmp_gobbled = false;
1328 		mem->vmp_busy = false;
1329 		mem->vmp_pmapped = true;
1330 		mem->vmp_wpmapped = true;
1331 
1332 		/*
1333 		 * Manual PMAP_ENTER_OPTIONS() with shortcuts
1334 		 * for the kernel and compressor objects.
1335 		 */
1336 
1337 		kernel_memory_populate_pmap_enter(object, addr, pg_offset,
1338 		    mem, prot, pe_flags);
1339 
1340 		if (flags & KMA_NOENCRYPT) {
1341 			pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
1342 		}
1343 	}
1344 
1345 	if (page_list) {
1346 		panic("%s: page_list too long", __func__);
1347 	}
1348 
1349 	vm_object_unlock(object);
1350 	if ((flags & (KMA_KOBJECT | KMA_COMPRESSOR)) == 0) {
1351 		vm_object_deallocate(object);
1352 	}
1353 
1354 	/*
1355 	 * Update the accounting:
1356 	 * - the compressor "wired" pages don't really count as wired
1357 	 * - kmem_alloc_contig_guard() gives gobbled pages,
1358 	 *   which already count as wired but need to be ungobbled.
1359 	 */
1360 	if (gobbled_list) {
1361 		vm_page_lockspin_queues();
1362 		if (flags & KMA_COMPRESSOR) {
1363 			vm_page_wire_count -= atop(size);
1364 		}
1365 		vm_page_gobble_count -= atop(size);
1366 		vm_page_unlock_queues();
1367 	} else if ((flags & KMA_COMPRESSOR) == 0) {
1368 		vm_page_lockspin_queues();
1369 		vm_page_wire_count += atop(size);
1370 		vm_page_unlock_queues();
1371 	}
1372 
1373 	if (flags & KMA_KOBJECT) {
1374 		/* vm_page_insert_wired() handles regular objects already */
1375 		vm_tag_update_size(tag, size, NULL);
1376 	}
1377 
1378 #if KASAN
1379 	if (flags & KMA_COMPRESSOR) {
1380 		kasan_notify_address_nopoison(addr, size);
1381 	} else {
1382 		kasan_notify_address(addr, size);
1383 	}
1384 #endif /* KASAN */
1385 }
1386 
1387 
1388 kern_return_t
1389 kernel_memory_populate(
1390 	vm_offset_t     addr,
1391 	vm_size_t       size,
1392 	kma_flags_t     flags,
1393 	vm_tag_t        tag)
1394 {
1395 	kern_return_t   kr = KERN_SUCCESS;
1396 	vm_page_t       page_list = NULL;
1397 	vm_size_t       page_count = atop_64(size);
1398 	vm_object_t     object = __kmem_object(ANYF(flags));
1399 
1400 #if DEBUG || DEVELOPMENT
1401 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
1402 	    size, 0, 0, 0);
1403 #endif /* DEBUG || DEVELOPMENT */
1404 
1405 	kr = vm_page_alloc_list(page_count, flags, &page_list);
1406 	if (kr == KERN_SUCCESS) {
1407 		vm_object_lock(object);
1408 		kernel_memory_populate_object_and_unlock(object, addr,
1409 		    addr, size, page_list, flags, tag, VM_PROT_DEFAULT);
1410 	}
1411 
1412 #if DEBUG || DEVELOPMENT
1413 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
1414 	    page_count, 0, 0, 0);
1415 #endif /* DEBUG || DEVELOPMENT */
1416 	return kr;
1417 }
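/*
 * [Editor's sketch, not part of the original source.] This routine pairs
 * with kernel_memory_depopulate() below for ranges reserved with
 * KMA_VAONLY; a hypothetical caller might do roughly:
 *
 *   kmem_alloc(kernel_map, &addr, size, KMA_KOBJECT | KMA_VAONLY, tag);
 *   kernel_memory_populate(addr, size, KMA_KOBJECT | KMA_ZERO, tag);
 *   ...
 *   kernel_memory_depopulate(addr, size, KMA_KOBJECT, tag);
 */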
1418 
1419 void
1420 kernel_memory_depopulate(
1421 	vm_offset_t        addr,
1422 	vm_size_t          size,
1423 	kma_flags_t        flags,
1424 	vm_tag_t           tag)
1425 {
1426 	vm_object_t        object = __kmem_object(ANYF(flags));
1427 	vm_object_offset_t offset = addr;
1428 	vm_page_t          mem;
1429 	vm_page_t          local_freeq = NULL;
1430 	unsigned int       pages_unwired = 0;
1431 
1432 	vm_object_lock(object);
1433 
1434 	pmap_protect(kernel_pmap, offset, offset + size, VM_PROT_NONE);
1435 
1436 	for (vm_object_offset_t pg_offset = 0;
1437 	    pg_offset < size;
1438 	    pg_offset += PAGE_SIZE_64) {
1439 		mem = vm_page_lookup(object, offset + pg_offset);
1440 
1441 		assert(mem);
1442 
1443 		if (flags & KMA_COMPRESSOR) {
1444 			assert(mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
1445 		} else {
1446 			assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
1447 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
1448 			pages_unwired++;
1449 		}
1450 
1451 		mem->vmp_busy = TRUE;
1452 
1453 		assert(mem->vmp_tabled);
1454 		vm_page_remove(mem, TRUE);
1455 		assert(mem->vmp_busy);
1456 
1457 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
1458 
1459 		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
1460 		mem->vmp_snext = local_freeq;
1461 		local_freeq = mem;
1462 	}
1463 
1464 	vm_object_unlock(object);
1465 
1466 	vm_page_free_list(local_freeq, TRUE);
1467 
1468 	if (!(flags & KMA_COMPRESSOR)) {
1469 		vm_page_lockspin_queues();
1470 		vm_page_wire_count -= pages_unwired;
1471 		vm_page_unlock_queues();
1472 	}
1473 
1474 	if (flags & KMA_KOBJECT) {
1475 		/* vm_page_remove() handles regular objects already */
1476 		vm_tag_update_size(tag, -ptoa_64(pages_unwired), NULL);
1477 	}
1478 }
1479 
1480 #pragma mark reallocation
1481 
1482 __abortlike
1483 static void
1484 __kmem_realloc_invalid_object_size_panic(
1485 	vm_map_t                map,
1486 	vm_address_t            address,
1487 	vm_size_t               size,
1488 	vm_map_entry_t          entry)
1489 {
1490 	vm_object_t object  = VME_OBJECT(entry);
1491 	vm_size_t   objsize = __kmem_entry_orig_size(entry);
1492 
1493 	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
1494 	    "object %p has unexpected size %ld",
1495 	    map, (void *)address, (size_t)size, entry, object, objsize);
1496 }
1497 
1498 static kmem_return_t
1499 kmem_realloc_shrink_guard(
1500 	vm_map_t                map,
1501 	vm_offset_t             req_oldaddr,
1502 	vm_size_t               req_oldsize,
1503 	vm_size_t               req_newsize,
1504 	kmr_flags_t             flags,
1505 	kmem_guard_t            guard,
1506 	vm_map_entry_t          entry)
1507 {
1508 	vmr_flags_t             vmr_flags = VM_MAP_REMOVE_KUNWIRE;
1509 	vm_object_t             object;
1510 	vm_offset_t             delta = 0;
1511 	kmem_return_t           kmr;
1512 	bool                    was_atomic;
1513 	vm_size_t               oldsize = round_page(req_oldsize);
1514 	vm_size_t               newsize = round_page(req_newsize);
1515 	vm_address_t            oldaddr = req_oldaddr;
1516 
1517 #if KASAN_CLASSIC
1518 	if (flags & KMR_KASAN_GUARD) {
1519 		assert((flags & (KMR_GUARD_FIRST | KMR_GUARD_LAST)) == 0);
1520 		flags   |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1521 		oldaddr -= PAGE_SIZE;
1522 		delta    = ptoa(2);
1523 		oldsize += delta;
1524 		newsize += delta;
1525 	}
1526 #endif /* KASAN_CLASSIC */
1527 
1528 	if (flags & KMR_TAG) {
1529 		oldaddr = vm_memtag_canonicalize_address(req_oldaddr);
1530 	}
1531 
1532 	vm_map_lock_assert_exclusive(map);
1533 
1534 	if ((flags & KMR_KOBJECT) == 0) {
1535 		object = VME_OBJECT(entry);
1536 		vm_object_reference(object);
1537 	}
1538 
1539 	/*
1540 	 *	Shrinking an atomic entry starts with splitting it,
1541 	 *	and removing the second half.
1542 	 */
1543 	was_atomic = entry->vme_atomic;
1544 	entry->vme_atomic = false;
1545 	vm_map_clip_end(map, entry, entry->vme_start + newsize);
1546 	entry->vme_atomic = was_atomic;
1547 
1548 #if KASAN
1549 	if (entry->vme_kernel_object && was_atomic) {
1550 		entry->vme_object_or_delta = (-req_newsize & PAGE_MASK) + delta;
1551 	}
1552 #if KASAN_CLASSIC
1553 	if (flags & KMR_KASAN_GUARD) {
1554 		kasan_poison_range(oldaddr + newsize, oldsize - newsize,
1555 		    ASAN_VALID);
1556 	}
1557 #endif
1558 #if KASAN_TBI
1559 	if (flags & KMR_TAG) {
1560 		kasan_tbi_mark_free_space(req_oldaddr + newsize, oldsize - newsize);
1561 	}
1562 #endif /* KASAN_TBI */
1563 #endif /* KASAN */
1564 	(void)vm_map_remove_and_unlock(map,
1565 	    oldaddr + newsize, oldaddr + oldsize,
1566 	    vmr_flags, KMEM_GUARD_NONE);
1567 
1568 
1569 	/*
1570 	 *	Lastly, if there are guard pages, deal with them.
1571 	 *
1572 	 *	The kernel object just needs to depopulate,
1573 	 *	regular objects require freeing the last page
1574 	 *	and replacing it with a guard.
1575 	 */
1576 	if (flags & KMR_KOBJECT) {
1577 		if (flags & KMR_GUARD_LAST) {
1578 			kernel_memory_depopulate(oldaddr + newsize - PAGE_SIZE,
1579 			    PAGE_SIZE, KMA_KOBJECT, guard.kmg_tag);
1580 		}
1581 	} else {
1582 		vm_page_t guard_right = VM_PAGE_NULL;
1583 		vm_offset_t remove_start = newsize;
1584 
1585 		if (flags & KMR_GUARD_LAST) {
1586 			if (!map->never_faults) {
1587 				guard_right = vm_page_grab_guard(true);
1588 			}
1589 			remove_start -= PAGE_SIZE;
1590 		}
1591 
1592 		vm_object_lock(object);
1593 
1594 		if (object->vo_size != oldsize) {
1595 			__kmem_realloc_invalid_object_size_panic(map,
1596 			    req_oldaddr, req_oldsize + delta, entry);
1597 		}
1598 		vm_object_set_size(object, newsize, req_newsize);
1599 
1600 		vm_object_page_remove(object, remove_start, oldsize);
1601 
1602 		if (guard_right) {
1603 			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
1604 			guard_right->vmp_busy = false;
1605 		}
1606 		vm_object_unlock(object);
1607 		vm_object_deallocate(object);
1608 	}
1609 
1610 	kmr.kmr_address = req_oldaddr;
1611 	kmr.kmr_return  = 0;
1612 #if KASAN_CLASSIC
1613 	if (flags & KMA_KASAN_GUARD) {
1614 		kasan_alloc_large(kmr.kmr_address, req_newsize);
1615 	}
1616 #endif /* KASAN_CLASSIC */
1617 #if KASAN_TBI
1618 	if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
1619 		kmr.kmr_address = vm_memtag_assign_tag(kmr.kmr_address, req_newsize);
1620 		vm_memtag_set_tag(kmr.kmr_address, req_newsize);
1621 		kasan_tbi_retag_unused_space(kmr.kmr_address, newsize, req_newsize);
1622 	}
1623 #endif /* KASAN_TBI */
1624 
1625 	return kmr;
1626 }
1627 
1628 kmem_return_t
1629 kmem_realloc_guard(
1630 	vm_map_t                map,
1631 	vm_offset_t             req_oldaddr,
1632 	vm_size_t               req_oldsize,
1633 	vm_size_t               req_newsize,
1634 	kmr_flags_t             flags,
1635 	kmem_guard_t            guard)
1636 {
1637 	vm_object_t             object;
1638 	vm_size_t               oldsize;
1639 	vm_size_t               newsize;
1640 	vm_offset_t             delta = 0;
1641 	vm_map_offset_t         oldaddr;
1642 	vm_map_offset_t         newaddr;
1643 	vm_object_offset_t      newoffs;
1644 	vm_map_entry_t          oldentry;
1645 	vm_map_entry_t          newentry;
1646 	vm_page_t               page_list = NULL;
1647 	bool                    needs_wakeup = false;
1648 	kmem_return_t           kmr = { };
1649 	unsigned int            last_timestamp;
1650 	vm_map_kernel_flags_t   vmk_flags = {
1651 		.vmkf_last_free = (bool)(flags & KMR_LAST_FREE),
1652 	};
1653 
1654 	assert(KMEM_REALLOC_FLAGS_VALID(flags));
1655 	if (!guard.kmg_atomic && (flags & (KMR_DATA | KMR_KOBJECT)) != KMR_DATA) {
1656 		__kmem_invalid_arguments_panic("realloc", map, req_oldaddr,
1657 		    req_oldsize, flags);
1658 	}
1659 
1660 	if (req_oldaddr == 0ul) {
1661 		return kmem_alloc_guard(map, req_newsize, 0, (kma_flags_t)flags, guard);
1662 	}
1663 
1664 	if (req_newsize == 0ul) {
1665 		kmem_free_guard(map, req_oldaddr, req_oldsize,
1666 		    (kmf_flags_t)flags, guard);
1667 		return kmr;
1668 	}
1669 
1670 	if (req_newsize >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) {
1671 		__kmem_invalid_size_panic(map, req_newsize, flags);
1672 	}
1673 	if (req_newsize < __kmem_guard_size(ANYF(flags))) {
1674 		__kmem_invalid_size_panic(map, req_newsize, flags);
1675 	}
1676 
1677 	oldsize = round_page(req_oldsize);
1678 	newsize = round_page(req_newsize);
1679 	oldaddr = req_oldaddr;
1680 #if KASAN_CLASSIC
1681 	if (flags & KMR_KASAN_GUARD) {
1682 		flags   |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1683 		oldaddr -= PAGE_SIZE;
1684 		delta    = ptoa(2);
1685 		oldsize += delta;
1686 		newsize += delta;
1687 	}
1688 #endif /* KASAN_CLASSIC */
1689 #if CONFIG_KERNEL_TAGGING
1690 	if (flags & KMR_TAG) {
1691 		vm_memtag_verify_tag(req_oldaddr);
1692 		oldaddr = vm_memtag_canonicalize_address(req_oldaddr);
1693 	}
1694 #endif /* CONFIG_KERNEL_TAGGING */
1695 
1696 #if !KASAN
1697 	/*
1698 	 *	If not on a KASAN variant and no difference in requested size,
1699 	 *  just return.
1700 	 *
1701 	 *	Otherwise we want to validate the size and re-tag for KASAN_TBI.
1702 	 */
1703 	if (oldsize == newsize) {
1704 		kmr.kmr_address = req_oldaddr;
1705 		return kmr;
1706 	}
1707 #endif /* !KASAN */
1708 
1709 	/*
1710 	 *	If we're growing the allocation,
1711 	 *	then reserve the pages we'll need,
1712 	 *	and find a spot for its new place.
1713 	 */
1714 	if (oldsize < newsize) {
1715 #if DEBUG || DEVELOPMENT
1716 		VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1717 		    VM_KERN_REQUEST, DBG_FUNC_START,
1718 		    newsize - oldsize, 0, 0, 0);
1719 #endif /* DEBUG || DEVELOPMENT */
1720 		kmr.kmr_return = vm_page_alloc_list(atop(newsize - oldsize),
1721 		    (kma_flags_t)flags, &page_list);
1722 		if (kmr.kmr_return == KERN_SUCCESS) {
1723 			kmem_apply_security_policy(map, (kma_flags_t)flags, guard,
1724 			    newsize, 0, &vmk_flags, true);
1725 			kmr.kmr_return = vm_map_find_space(map, 0, newsize, 0,
1726 			    vmk_flags, &newentry);
1727 		}
1728 		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
1729 			if (flags & KMR_REALLOCF) {
1730 				kmem_free_guard(map, req_oldaddr, req_oldsize,
1731 				    KMF_NONE, guard);
1732 			}
1733 			if (page_list) {
1734 				vm_page_free_list(page_list, FALSE);
1735 			}
1736 #if DEBUG || DEVELOPMENT
1737 			VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1738 			    VM_KERN_REQUEST, DBG_FUNC_END,
1739 			    0, 0, 0, 0);
1740 #endif /* DEBUG || DEVELOPMENT */
1741 			return kmr;
1742 		}
1743 
1744 		/* map is locked */
1745 	} else {
1746 		vm_map_lock(map);
1747 	}
1748 
1749 
1750 	/*
1751 	 *	Locate the entry:
1752 	 *	- wait for it to quiesce,
1753 	 *	- validate its guard,
1754 	 *	- learn its correct tag.
1755 	 */
1756 again:
1757 	if (!vm_map_lookup_entry(map, oldaddr, &oldentry)) {
1758 		__kmem_entry_not_found_panic(map, req_oldaddr);
1759 	}
1760 	if ((flags & KMR_KOBJECT) && oldentry->in_transition) {
1761 		oldentry->needs_wakeup = true;
1762 		vm_map_entry_wait(map, THREAD_UNINT);
1763 		goto again;
1764 	}
1765 	kmem_entry_validate_guard(map, oldentry, oldaddr, oldsize, guard);
1766 	if (!__kmem_entry_validate_object(oldentry, ANYF(flags))) {
1767 		__kmem_entry_validate_object_panic(map, oldentry, ANYF(flags));
1768 	}
1769 	/*
1770 	 *	TODO: We should validate for non atomic entries that the range
1771 	 *	      we are acting on is what we expect here.
1772 	 */
1773 #if KASAN
1774 	if (__kmem_entry_orig_size(oldentry) != req_oldsize) {
1775 		__kmem_realloc_invalid_object_size_panic(map,
1776 		    req_oldaddr, req_oldsize + delta, oldentry);
1777 	}
1778 
1779 	if (oldsize == newsize) {
1780 		kmr.kmr_address = req_oldaddr;
1781 		if (oldentry->vme_kernel_object) {
1782 			oldentry->vme_object_or_delta = delta +
1783 			    (-req_newsize & PAGE_MASK);
1784 		} else {
1785 			object = VME_OBJECT(oldentry);
1786 			vm_object_lock(object);
1787 			vm_object_set_size(object, newsize, req_newsize);
1788 			vm_object_unlock(object);
1789 		}
1790 		vm_map_unlock(map);
1791 
1792 #if KASAN_CLASSIC
1793 		if (flags & KMA_KASAN_GUARD) {
1794 			kasan_alloc_large(kmr.kmr_address, req_newsize);
1795 		}
1796 #endif /* KASAN_CLASSIC */
1797 #if KASAN_TBI
1798 		if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
1799 			kmr.kmr_address = vm_memtag_assign_tag(kmr.kmr_address, req_newsize);
1800 			vm_memtag_set_tag(kmr.kmr_address, req_newsize);
1801 			kasan_tbi_retag_unused_space(kmr.kmr_address, newsize, req_newsize);
1802 		}
1803 #endif /* KASAN_TBI */
1804 		return kmr;
1805 	}
1806 #endif /* KASAN */
1807 
1808 	guard.kmg_tag = VME_ALIAS(oldentry);
1809 
1810 	if (newsize < oldsize) {
1811 		return kmem_realloc_shrink_guard(map, req_oldaddr,
1812 		           req_oldsize, req_newsize, flags, guard, oldentry);
1813 	}
1814 
1815 
1816 	/*
1817 	 *	We are growing the entry
1818 	 *
1819 	 *	For regular objects we use the object `vo_size` updates
1820 	 *	as a guarantee that no 2 kmem_realloc() can happen
1821 	 *	concurrently (by doing it before the map is unlocked).
1822 	 *
1823 	 *	For the kernel object, prevent the entry from being
1824 	 *	reallocated or changed by marking it "in_transition".
1825 	 */
1826 
1827 	object = VME_OBJECT(oldentry);
1828 	vm_object_lock(object);
1829 	vm_object_reference_locked(object);
1830 
1831 	newaddr = newentry->vme_start;
1832 	newoffs = oldsize;
1833 
1834 	VME_OBJECT_SET(newentry, object, guard.kmg_atomic, guard.kmg_context);
1835 	VME_ALIAS_SET(newentry, guard.kmg_tag);
1836 	if (flags & KMR_KOBJECT) {
1837 		oldentry->in_transition = true;
1838 		VME_OFFSET_SET(newentry, newaddr);
1839 		newentry->wired_count = 1;
1840 		vme_btref_consider_and_set(newentry, __builtin_frame_address(0));
1841 		newoffs = newaddr + oldsize;
1842 	} else {
1843 		if (object->vo_size != oldsize) {
1844 			__kmem_realloc_invalid_object_size_panic(map,
1845 			    req_oldaddr, req_oldsize + delta, oldentry);
1846 		}
1847 		vm_object_set_size(object, newsize, req_newsize);
1848 	}
1849 
1850 	last_timestamp = map->timestamp;
1851 	vm_map_unlock(map);
1852 
1853 
1854 	/*
1855 	 *	Now proceed with the population of pages.
1856 	 *
1857 	 *	Kernel objects can use the kmem population helpers.
1858 	 *
1859 	 *	Regular objects will insert pages manually,
1860 	 *	then wire the memory into the new range.
1861 	 */
1862 
1863 	vm_size_t guard_right_size = __kmem_guard_right(ANYF(flags));
1864 
1865 	if (flags & KMR_KOBJECT) {
1866 		pmap_protect(kernel_pmap,
1867 		    oldaddr, oldaddr + oldsize - guard_right_size,
1868 		    VM_PROT_NONE);
1869 
1870 		for (vm_object_offset_t offset = 0;
1871 		    offset < oldsize - guard_right_size;
1872 		    offset += PAGE_SIZE_64) {
1873 			vm_page_t mem;
1874 
1875 			mem = vm_page_lookup(object, oldaddr + offset);
1876 			if (mem == VM_PAGE_NULL) {
1877 				continue;
1878 			}
1879 
1880 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
1881 
1882 			mem->vmp_busy = true;
1883 			vm_page_remove(mem, true);
1884 			vm_page_insert_wired(mem, object, newaddr + offset,
1885 			    guard.kmg_tag);
1886 			mem->vmp_busy = false;
1887 
1888 			kernel_memory_populate_pmap_enter(object, newaddr,
1889 			    offset, mem, VM_PROT_DEFAULT, 0);
1890 		}
1891 
1892 		kernel_memory_populate_object_and_unlock(object,
1893 		    newaddr + oldsize - guard_right_size,
1894 		    newoffs - guard_right_size,
1895 		    newsize - oldsize,
1896 		    page_list, (kma_flags_t)flags,
1897 		    guard.kmg_tag, VM_PROT_DEFAULT);
1898 	} else {
1899 		vm_page_t guard_right = VM_PAGE_NULL;
1900 		kern_return_t kr;
1901 
1902 		/*
1903 		 *	Note: we are borrowing the new entry reference
1904 		 *	on the object for the duration of this code,
1905 		 *	which works because we keep the object locked
1906 		 *	throughout.
1907 		 */
1908 		if ((flags & KMR_GUARD_LAST) && !map->never_faults) {
1909 			guard_right = vm_page_lookup(object, oldsize - PAGE_SIZE);
1910 			assert(guard_right->vmp_fictitious);
1911 			guard_right->vmp_busy = true;
1912 			vm_page_remove(guard_right, true);
1913 		}
1914 
1915 		for (vm_object_offset_t offset = oldsize - guard_right_size;
1916 		    offset < newsize - guard_right_size;
1917 		    offset += PAGE_SIZE_64) {
1918 			vm_page_t mem = page_list;
1919 
1920 			page_list = mem->vmp_snext;
1921 			mem->vmp_snext = VM_PAGE_NULL;
1922 
1923 			vm_page_insert(mem, object, offset);
1924 			mem->vmp_busy = false;
1925 		}
1926 
1927 		if (guard_right) {
1928 			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
1929 			guard_right->vmp_busy = false;
1930 		}
1931 
1932 		vm_object_unlock(object);
1933 
1934 		kr = vm_map_wire_kernel(map, newaddr, newaddr + newsize,
1935 		    VM_PROT_DEFAULT, guard.kmg_tag, FALSE);
1936 		assert(kr == KERN_SUCCESS);
1937 	}
1938 
1939 	/*
1940 	 *	Mark the entry as idle again,
1941 	 *	and honor KMR_FREEOLD if needed.
1942 	 */
1943 
1944 	vm_map_lock(map);
1945 	if (last_timestamp + 1 != map->timestamp &&
1946 	    !vm_map_lookup_entry(map, oldaddr, &oldentry)) {
1947 		__kmem_entry_not_found_panic(map, req_oldaddr);
1948 	}
1949 
1950 	if (flags & KMR_KOBJECT) {
1951 		assert(oldentry->in_transition);
1952 		oldentry->in_transition = false;
1953 		if (oldentry->needs_wakeup) {
1954 			needs_wakeup = true;
1955 			oldentry->needs_wakeup = false;
1956 		}
1957 	}
1958 
1959 	if (flags & KMR_FREEOLD) {
1960 		vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
1961 
1962 #if KASAN_CLASSIC
1963 		if (flags & KMR_KASAN_GUARD) {
1964 			kasan_poison_range(oldaddr, oldsize, ASAN_VALID);
1965 		}
1966 #endif
1967 #if KASAN_TBI
1968 		if (flags & KMR_TAG) {
1969 			kasan_tbi_mark_free_space(req_oldaddr, oldsize);
1970 		}
1971 #endif /* KASAN_TBI */
1972 		if (flags & KMR_GUARD_LAST) {
1973 			vmr_flags |= VM_MAP_REMOVE_NOKUNWIRE_LAST;
1974 		}
1975 		(void)vm_map_remove_and_unlock(map,
1976 		    oldaddr, oldaddr + oldsize,
1977 		    vmr_flags, guard);
1978 	} else {
1979 		vm_map_unlock(map);
1980 	}
1981 
1982 	if (needs_wakeup) {
1983 		vm_map_entry_wakeup(map);
1984 	}
1985 
1986 #if DEBUG || DEVELOPMENT
1987 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
1988 	    atop(newsize - oldsize), 0, 0, 0);
1989 #endif /* DEBUG || DEVELOPMENT */
1990 	kmr.kmr_address = newaddr;
1991 
1992 #if KASAN
1993 	kasan_notify_address(kmr.kmr_address, newsize);
1994 #endif /* KASAN */
1995 #if KASAN_CLASSIC
1996 	if (flags & KMR_KASAN_GUARD) {
1997 		kmr.kmr_address += PAGE_SIZE;
1998 		kasan_alloc_large(kmr.kmr_address, req_newsize);
1999 	}
2000 #endif /* KASAN_CLASSIC */
2001 #if KASAN_TBI
2002 	if (flags & KMR_TAG) {
2003 		kmr.kmr_address = vm_memtag_assign_tag(kmr.kmr_address, req_newsize);
2004 		vm_memtag_set_tag(kmr.kmr_address, req_newsize);
2005 		kasan_tbi_retag_unused_space(kmr.kmr_address, newsize, req_newsize);
2006 	}
2007 #endif /* KASAN_TBI */
2008 
2009 	return kmr;
2010 }
2011 
2012 
2013 #pragma mark free
2014 
2015 #if KASAN
2016 
2017 __abortlike
2018 static void
2019 __kmem_free_invalid_object_size_panic(
2020 	vm_map_t                map,
2021 	vm_address_t            address,
2022 	vm_size_t               size,
2023 	vm_map_entry_t          entry)
2024 {
2025 	vm_object_t object  = VME_OBJECT(entry);
2026 	vm_size_t   objsize = __kmem_entry_orig_size(entry);
2027 
2028 	panic("kmem_free(map=%p, addr=%p, size=%zd, entry=%p): "
2029 	    "object %p has unexpected size %ld",
2030 	    map, (void *)address, (size_t)size, entry, object, objsize);
2031 }
2032 
2033 #endif /* KASAN */
2034 
2035 vm_size_t
2036 kmem_free_guard(
2037 	vm_map_t        map,
2038 	vm_offset_t     req_addr,
2039 	vm_size_t       req_size,
2040 	kmf_flags_t     flags,
2041 	kmem_guard_t    guard)
2042 {
2043 	vmr_flags_t     vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2044 	vm_address_t    addr      = req_addr;
2045 	vm_offset_t     delta     = 0;
2046 	vm_size_t       size;
2047 #if KASAN
2048 	vm_map_entry_t  entry;
2049 #endif /* KASAN */
2050 
2051 	assert(map->pmap == kernel_pmap);
2052 
2053 #if KASAN_CLASSIC
2054 	if (flags & KMF_KASAN_GUARD) {
2055 		addr  -= PAGE_SIZE;
2056 		delta  = ptoa(2);
2057 	}
2058 #endif /* KASAN_CLASSIC */
2059 #if CONFIG_KERNEL_TAGGING
2060 	if (flags & KMF_TAG) {
2061 		vm_memtag_verify_tag(req_addr);
2062 		addr = vm_memtag_canonicalize_address(req_addr);
2063 	}
2064 #endif /* CONFIG_KERNEL_TAGGING */
2065 
2066 	if (flags & KMF_GUESS_SIZE) {
2067 		vmr_flags |= VM_MAP_REMOVE_GUESS_SIZE;
2068 		size = PAGE_SIZE;
2069 	} else if (req_size == 0) {
2070 		__kmem_invalid_size_panic(map, req_size, flags);
2071 	} else {
2072 		size = round_page(req_size) + delta;
2073 	}
2074 
2075 	vm_map_lock(map);
2076 
2077 #if KASAN
2078 	if (!vm_map_lookup_entry(map, addr, &entry)) {
2079 		__kmem_entry_not_found_panic(map, req_addr);
2080 	}
2081 	if (flags & KMF_GUESS_SIZE) {
2082 		vmr_flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
2083 		req_size = __kmem_entry_orig_size(entry);
2084 		size = round_page(req_size + delta);
2085 	} else if (guard.kmg_atomic && entry->vme_kernel_object &&
2086 	    __kmem_entry_orig_size(entry) != req_size) {
2087 		/*
2088 		 * We can't make a strict check for regular
2089 		 * VM objects because it could be:
2090 		 *
2091 		 * - the kmem_guard_free() of a kmem_realloc_guard() without
2092 		 *   KMR_FREEOLD, and in that case the object size won't match.
2093 		 *
2094 		 * - a submap, in which case there is no "orig size".
2095 		 */
2096 		__kmem_free_invalid_object_size_panic(map,
2097 		    req_addr, req_size + delta, entry);
2098 	}
2099 #endif /* KASAN */
2100 #if KASAN_CLASSIC
2101 	if (flags & KMR_KASAN_GUARD) {
2102 		kasan_poison_range(addr, size, ASAN_VALID);
2103 	}
2104 #endif
2105 #if KASAN_TBI
2106 	if (flags & KMF_TAG) {
2107 		kasan_tbi_mark_free_space(req_addr, size);
2108 	}
2109 #endif /* KASAN_TBI */
2110 
2111 	/*
2112 	 * vm_map_remove_and_unlock is called with VM_MAP_REMOVE_KUNWIRE, which
2113 	 * unwires the kernel mapping. The page won't be mapped any longer so
2114 	 * there is no extra step that is required for memory tagging to "clear"
2115 	 * it -- the page will be later laundered when reused.
2116 	 */
2117 	return vm_map_remove_and_unlock(map, addr, addr + size,
2118 	           vmr_flags, guard).kmr_size - delta;
2119 }
2120 
2121 __exported void
2122 kmem_free_external(
2123 	vm_map_t        map,
2124 	vm_offset_t     addr,
2125 	vm_size_t       size);
2126 void
2127 kmem_free_external(
2128 	vm_map_t        map,
2129 	vm_offset_t     addr,
2130 	vm_size_t       size)
2131 {
2132 	if (size) {
2133 		kmem_free(map, trunc_page(addr), size);
2134 #if MACH_ASSERT
2135 	} else {
2136 		printf("kmem_free(map=%p, addr=%p) called with size=0, lr: %p\n",
2137 		    map, (void *)addr, __builtin_return_address(0));
2138 #endif
2139 	}
2140 }
2141 
2142 #pragma mark kmem metadata
2143 
2144 /*
2145  * Guard objects for kmem pointer allocation:
2146  *
2147  * Guard objects introduce size slabs to kmem pointer allocations that are
2148  * allocated in chunks of n * sizeclass. When an allocation of a specific
2149  * sizeclass is requested, a random slot from [0, n) is returned.
2150  * Allocations are returned from that chunk until m slots are left. The
2151  * remaining m slots are referred to as guard objects. They don't get
2152  * allocated and the chunk is now considered full. When an allocation is
2153  * freed back to a full chunk, one of the m + 1 now-free slots becomes
2154  * available for the next allocation of that sizeclass.
2155  *
2156  * Guard objects are intended to make exploitation of use after frees harder
2157  * as allocations that are freed can no longer be reliably reallocated.
2158  * They also make exploitation of OOBs harder as overflowing out of an
2159  * allocation can no longer be safe even with sufficient spraying.
2160  */
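/*
 * Illustrative example: for a sizeclass with n = 16 slots per chunk and
 * m = KMEM_NUM_GUARDS = 2, a fresh chunk serves allocations from randomly
 * chosen free slots until only 2 slots remain, at which point the chunk is
 * treated as full; freeing one element back makes the chunk partial again
 * and the next allocation is drawn at random from the 3 free slots.
 */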
2161 
2162 #define KMEM_META_PRIMARY    UINT8_MAX
2163 #define KMEM_META_START     (UINT8_MAX - 1)
2164 #define KMEM_META_FREE      (UINT8_MAX - 2)
2165 #if __ARM_16K_PG__
2166 #define KMEM_MIN_SIZE        PAGE_SIZE
2167 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 16)
2168 #else /* __ARM_16K_PG__ */
2169 /*
2170  * PAGE_SIZE isn't a compile time constant on some arm64 devices. Those
2171  * devices use 4k page size when their RAM is <= 1GB and 16k otherwise.
2172  * Therefore populate sizeclasses from 4k for those devices.
2173  */
2174 #define KMEM_MIN_SIZE       (4 * 1024)
2175 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 32)
2176 #endif /* __ARM_16K_PG__ */
2177 #define KMEM_MAX_SIZE       (32ULL << 20)
2178 #define KMEM_START_IDX      (kmem_log2down(KMEM_MIN_SIZE))
2179 #define KMEM_LAST_IDX       (kmem_log2down(KMEM_MAX_SIZE))
2180 #define KMEM_NUM_SIZECLASS  (KMEM_LAST_IDX - KMEM_START_IDX + 1)
2181 #define KMEM_FRONTS         (KMEM_RANGE_ID_NUM_PTR * 2)
2182 #define KMEM_NUM_GUARDS      2
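/*
 * Example: with KMEM_MIN_SIZE = 16K, KMEM_START_IDX is 14 and KMEM_LAST_IDX
 * is 25 (32MB == 1 << 25), giving KMEM_NUM_SIZECLASS = 12 power-of-two
 * sizeclasses (16K, 32K, ..., 32MB); with the 4K minimum there are 14
 * sizeclasses starting at 4K.
 */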
2183 
2184 struct kmem_page_meta {
2185 	union {
2186 		/*
2187 		 * On primary allocated chunk with KMEM_META_PRIMARY marker
2188 		 */
2189 		uint32_t km_bitmap;
2190 		/*
2191 		 * On start and end of free chunk with KMEM_META_FREE marker
2192 		 */
2193 		uint32_t km_free_chunks;
2194 	};
2195 	/*
2196 	 * KMEM_META_PRIMARY: Start meta of allocated chunk
2197 	 * KMEM_META_FREE   : Start and end meta of free chunk
2198 	 * KMEM_META_START  : Meta region start and end
2199 	 */
2200 	uint8_t  km_page_marker;
2201 	uint8_t  km_sizeclass;
2202 	union {
2203 		/*
2204 		 * On primary allocated chunk with KMEM_META_PRIMARY marker
2205 		 */
2206 		uint16_t km_chunk_len;
2207 		/*
2208 		 * On secondary allocated chunks
2209 		 */
2210 		uint16_t km_page_idx;
2211 	};
2212 	LIST_ENTRY(kmem_page_meta) km_link;
2213 } kmem_page_meta_t;
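/*
 * Each kmem_page_meta describes KMEM_CHUNK_SIZE_MIN bytes of VA in its
 * pointer range.  An allocated chunk spanning several minimum-size chunks
 * keeps its bitmap and length in the first ("primary") metadata; the
 * trailing metadata only record their distance back to the primary.
 */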
2214 
2215 typedef LIST_HEAD(kmem_list_head, kmem_page_meta) kmem_list_head_t;
2216 struct kmem_sizeclass {
2217 	vm_map_size_t                   ks_size;
2218 	uint32_t                        ks_num_chunk;
2219 	uint32_t                        ks_num_elem;
2220 	crypto_random_ctx_t __zpercpu   ks_rng_ctx;
2221 	kmem_list_head_t                ks_allfree_head[KMEM_FRONTS];
2222 	kmem_list_head_t                ks_partial_head[KMEM_FRONTS];
2223 	kmem_list_head_t                ks_full_head[KMEM_FRONTS];
2224 };
2225 
2226 static struct kmem_sizeclass kmem_size_array[KMEM_NUM_SIZECLASS];
2227 
2228 /*
2229  * Locks to synchronize metadata population
2230  */
2231 static LCK_GRP_DECLARE(kmem_locks_grp, "kmem_locks");
2232 static LCK_MTX_DECLARE(kmem_meta_region_lck, &kmem_locks_grp);
2233 #define kmem_meta_lock()   lck_mtx_lock(&kmem_meta_region_lck)
2234 #define kmem_meta_unlock() lck_mtx_unlock(&kmem_meta_region_lck)
2235 
2236 static SECURITY_READ_ONLY_LATE(struct mach_vm_range)
2237 kmem_meta_range[KMEM_RANGE_ID_NUM_PTR + 1];
2238 static SECURITY_READ_ONLY_LATE(struct kmem_page_meta *)
2239 kmem_meta_base[KMEM_RANGE_ID_NUM_PTR + 1];
2240 /*
2241  * Keeps track of metadata high water mark for each front
2242  */
2243 static struct kmem_page_meta *kmem_meta_hwm[KMEM_FRONTS];
2244 static SECURITY_READ_ONLY_LATE(vm_map_t)
2245 kmem_meta_map[KMEM_RANGE_ID_NUM_PTR + 1];
2246 static vm_map_size_t kmem_meta_size;
2247 
2248 static uint32_t
2249 kmem_get_front(
2250 	kmem_range_id_t         range_id,
2251 	bool                    from_right)
2252 {
2253 	assert((range_id >= KMEM_RANGE_ID_FIRST) &&
2254 	    (range_id <= KMEM_RANGE_ID_NUM_PTR));
2255 	return (range_id - KMEM_RANGE_ID_FIRST) * 2 + from_right;
2256 }
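/*
 * kmem_get_front(): each pointer range allocates from two fronts, an
 * even-numbered front growing from the left and the following odd-numbered
 * front growing from the right (e.g. the first range uses fronts 0 and 1).
 */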
2257 
2258 static inline uint32_t
2259 kmem_slot_idx_to_bit(
2260 	uint32_t                slot_idx,
2261 	uint32_t                size_idx __unused)
2262 {
2263 	assert(slot_idx < kmem_size_array[size_idx].ks_num_elem);
2264 	return 1ull << slot_idx;
2265 }
2266 
2267 static uint32_t
2268 kmem_get_idx_from_size(vm_map_size_t size)
2269 {
2270 	assert(size >= KMEM_MIN_SIZE && size <= KMEM_MAX_SIZE);
2271 	return kmem_log2down(size - 1) - KMEM_START_IDX + 1;
2272 }
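/*
 * kmem_get_idx_from_size() rounds up to the next power-of-two sizeclass,
 * e.g. with a 16K KMEM_MIN_SIZE a 16K request maps to idx 0 and a 20K
 * request maps to idx 1 (the 32K sizeclass).
 */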
2273 
2274 __abortlike
2275 static void
2276 kmem_invalid_size_idx(uint32_t idx)
2277 {
2278 	panic("Invalid sizeclass idx %u", idx);
2279 }
2280 
2281 static vm_map_size_t
2282 kmem_get_size_from_idx(uint32_t idx)
2283 {
2284 	if (__improbable(idx >= KMEM_NUM_SIZECLASS)) {
2285 		kmem_invalid_size_idx(idx);
2286 	}
2287 	return 1ul << (idx + KMEM_START_IDX);
2288 }
2289 
2290 static inline uint16_t
2291 kmem_get_page_idx(struct kmem_page_meta *meta)
2292 {
2293 	uint8_t page_marker = meta->km_page_marker;
2294 
2295 	return (page_marker == KMEM_META_PRIMARY) ? 0 : meta->km_page_idx;
2296 }
2297 
2298 __abortlike
2299 static void
2300 kmem_invalid_chunk_len(struct kmem_page_meta *meta)
2301 {
2302 	panic("Reading free chunks for meta %p where marker != KMEM_META_PRIMARY",
2303 	    meta);
2304 }
2305 
2306 static inline uint16_t
2307 kmem_get_chunk_len(struct kmem_page_meta *meta)
2308 {
2309 	if (__improbable(meta->km_page_marker != KMEM_META_PRIMARY)) {
2310 		kmem_invalid_chunk_len(meta);
2311 	}
2312 
2313 	return meta->km_chunk_len;
2314 }
2315 
2316 __abortlike
2317 static void
2318 kmem_invalid_free_chunk_len(struct kmem_page_meta *meta)
2319 {
2320 	panic("Reading free chunks for meta %p where marker != KMEM_META_FREE",
2321 	    meta);
2322 }
2323 
2324 static inline uint32_t
2325 kmem_get_free_chunk_len(struct kmem_page_meta *meta)
2326 {
2327 	if (__improbable(meta->km_page_marker != KMEM_META_FREE)) {
2328 		kmem_invalid_free_chunk_len(meta);
2329 	}
2330 
2331 	return meta->km_free_chunks;
2332 }
2333 
2334 /*
2335  * Return the metadata corresponding to the specified address
2336  */
2337 static struct kmem_page_meta *
2338 kmem_addr_to_meta(
2339 	vm_map_offset_t         addr,
2340 	vm_map_range_id_t       range_id,
2341 	vm_map_offset_t        *range_start,
2342 	uint64_t               *meta_idx)
2343 {
2344 	struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
2345 
2346 	*range_start = kmem_ranges[range_id].min_address;
2347 	*meta_idx = (addr - *range_start) / KMEM_CHUNK_SIZE_MIN;
2348 	return &meta_base[*meta_idx];
2349 }
2350 
2351 /*
2352  * Return the metadata start of the chunk that the address belongs to
2353  */
2354 static struct kmem_page_meta *
2355 kmem_addr_to_meta_start(
2356 	vm_address_t            addr,
2357 	vm_map_range_id_t       range_id,
2358 	vm_map_offset_t        *chunk_start)
2359 {
2360 	vm_map_offset_t range_start;
2361 	uint64_t meta_idx;
2362 	struct kmem_page_meta *meta;
2363 
2364 	meta = kmem_addr_to_meta(addr, range_id, &range_start, &meta_idx);
2365 	meta_idx -= kmem_get_page_idx(meta);
2366 	meta -= kmem_get_page_idx(meta);
2367 	assert(meta->km_page_marker == KMEM_META_PRIMARY);
2368 	*chunk_start = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN);
2369 	return meta;
2370 }
2371 
2372 __startup_func
2373 static void
2374 kmem_init_meta_front(
2375 	struct kmem_page_meta  *meta,
2376 	kmem_range_id_t         range_id,
2377 	bool                    from_right)
2378 {
2379 	kernel_memory_populate(trunc_page((vm_map_offset_t) meta), PAGE_SIZE,
2380 	    KMA_KOBJECT | KMA_ZERO | KMA_NOFAIL, VM_KERN_MEMORY_OSFMK);
2381 	meta->km_page_marker = KMEM_META_START;
2382 	if (!from_right) {
2383 		meta++;
2384 		kmem_meta_base[range_id] = meta;
2385 	}
2386 	kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta;
2387 }
2388 
2389 __startup_func
2390 static void
2391 kmem_metadata_init(void)
2392 {
2393 	for (kmem_range_id_t i = KMEM_RANGE_ID_FIRST; i <= kmem_ptr_ranges; i++) {
2394 		vm_map_offset_t addr = kmem_meta_range[i].min_address;
2395 		struct kmem_page_meta *meta;
2396 		uint64_t meta_idx;
2397 
2398 		vm_map_will_allocate_early_map(&kmem_meta_map[i]);
2399 		kmem_meta_map[i] = kmem_suballoc(kernel_map, &addr, kmem_meta_size,
2400 		    VM_MAP_CREATE_NEVER_FAULTS | VM_MAP_CREATE_DISABLE_HOLELIST,
2401 		    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_PERMANENT | KMS_NOFAIL,
2402 		    VM_KERN_MEMORY_OSFMK).kmr_submap;
2403 
2404 		kmem_meta_range[i].min_address = addr;
2405 		kmem_meta_range[i].max_address = addr + kmem_meta_size;
2406 
2407 		meta = (struct kmem_page_meta *) kmem_meta_range[i].min_address;
2408 		kmem_init_meta_front(meta, i, 0);
2409 
2410 		meta = kmem_addr_to_meta(kmem_ranges[i].max_address, i, &addr,
2411 		    &meta_idx);
2412 		kmem_init_meta_front(meta, i, 1);
2413 	}
2414 }
2415 
2416 __startup_func
2417 static void
2418 kmem_init_front_head(
2419 	struct kmem_sizeclass  *ks,
2420 	uint32_t                front)
2421 {
2422 	LIST_INIT(&ks->ks_allfree_head[front]);
2423 	LIST_INIT(&ks->ks_partial_head[front]);
2424 	LIST_INIT(&ks->ks_full_head[front]);
2425 }
2426 
2427 __startup_func
2428 static void
2429 kmem_sizeclass_init(void)
2430 {
2431 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2432 		struct kmem_sizeclass *ks = &kmem_size_array[i];
2433 		kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
2434 
2435 		ks->ks_size = kmem_get_size_from_idx(i);
2436 		ks->ks_num_chunk = roundup(8 * ks->ks_size, KMEM_CHUNK_SIZE_MIN) /
2437 		    KMEM_CHUNK_SIZE_MIN;
2438 		ks->ks_num_elem = (ks->ks_num_chunk * KMEM_CHUNK_SIZE_MIN) / ks->ks_size;
2439 		assert(ks->ks_num_elem <=
2440 		    (sizeof(((struct kmem_page_meta *)0)->km_bitmap) * 8));
2441 		for (; range_id <= KMEM_RANGE_ID_NUM_PTR; range_id++) {
2442 			kmem_init_front_head(ks, kmem_get_front(range_id, 0));
2443 			kmem_init_front_head(ks, kmem_get_front(range_id, 1));
2444 		}
2445 	}
2446 }
2447 
2448 /*
2449  * This is done during EARLY_BOOT as it needs the corecrypto module to be
2450  * set up.
2451  */
2452 __startup_func
2453 static void
2454 kmem_crypto_init(void)
2455 {
2456 	vm_size_t ctx_size = crypto_random_kmem_ctx_size();
2457 
2458 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2459 		struct kmem_sizeclass *ks = &kmem_size_array[i];
2460 
2461 		ks->ks_rng_ctx = zalloc_percpu_permanent(ctx_size, ZALIGN_PTR);
2462 		zpercpu_foreach(ctx, ks->ks_rng_ctx) {
2463 			crypto_random_kmem_init(ctx);
2464 		}
2465 	}
2466 }
2467 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, kmem_crypto_init);
2468 
2469 __abortlike
2470 static void
2471 kmem_validate_slot_panic(
2472 	vm_map_offset_t         addr,
2473 	struct kmem_page_meta  *meta,
2474 	uint32_t                slot_idx,
2475 	uint32_t                size_idx)
2476 {
2477 	if (meta->km_page_marker != KMEM_META_PRIMARY) {
2478 		panic("Metadata (%p) for addr (%p) not primary", meta, (void *)addr);
2479 	}
2480 	if (meta->km_sizeclass != size_idx) {
2481 		panic("Metadata's (%p) sizeclass (%u != %u) changed during deletion",
2482 		    meta, meta->km_sizeclass, size_idx);
2483 	}
2484 	panic("Double free detected: Slot (%u) in meta (%p) for addr %p marked free",
2485 	    slot_idx, meta, (void *)addr);
2486 }
2487 
2488 __abortlike
2489 static void
2490 kmem_invalid_slot_for_addr(
2491 	mach_vm_range_t         slot,
2492 	vm_map_offset_t         start,
2493 	vm_map_offset_t         end)
2494 {
2495 	panic("Invalid kmem ptr slot [%p:%p] for allocation [%p:%p]",
2496 	    (void *)slot->min_address, (void *)slot->max_address,
2497 	    (void *)start, (void *)end);
2498 }
2499 
2500 void
2501 kmem_validate_slot(
2502 	vm_map_offset_t         addr,
2503 	struct kmem_page_meta  *meta,
2504 	uint32_t                size_idx,
2505 	uint32_t                slot_idx)
2506 {
2507 	if ((meta->km_page_marker != KMEM_META_PRIMARY) ||
2508 	    (meta->km_sizeclass != size_idx) ||
2509 	    ((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) != 0)) {
2510 		kmem_validate_slot_panic(addr, meta, slot_idx, size_idx);
2511 	}
2512 }
2513 
2514 static void
2515 kmem_validate_slot_initial(
2516 	mach_vm_range_t         slot,
2517 	vm_map_offset_t         start,
2518 	vm_map_offset_t         end,
2519 	struct kmem_page_meta  *meta,
2520 	uint32_t                size_idx,
2521 	uint32_t                slot_idx)
2522 {
2523 	if ((slot->min_address == 0) || (slot->max_address == 0) ||
2524 	    (start < slot->min_address) || (start >= slot->max_address) ||
2525 	    (end > slot->max_address)) {
2526 		kmem_invalid_slot_for_addr(slot, start, end);
2527 	}
2528 
2529 	kmem_validate_slot(start, meta, size_idx, slot_idx);
2530 }
2531 
2532 uint32_t
2533 kmem_addr_get_slot_idx(
2534 	vm_map_offset_t         start,
2535 	vm_map_offset_t         end,
2536 	vm_map_range_id_t       range_id,
2537 	struct kmem_page_meta **meta,
2538 	uint32_t               *size_idx,
2539 	mach_vm_range_t         slot)
2540 {
2541 	vm_map_offset_t chunk_start;
2542 	vm_map_size_t slot_size;
2543 	uint32_t slot_idx;
2544 
2545 	*meta = kmem_addr_to_meta_start(start, range_id, &chunk_start);
2546 	*size_idx = (*meta)->km_sizeclass;
2547 	slot_size = kmem_get_size_from_idx(*size_idx);
2548 	slot_idx = (start - chunk_start) / slot_size;
2549 	slot->min_address = chunk_start + slot_idx * slot_size;
2550 	slot->max_address = slot->min_address + slot_size;
2551 
2552 	kmem_validate_slot_initial(slot, start, end, *meta, *size_idx, slot_idx);
2553 
2554 	return slot_idx;
2555 }
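/*
 * Example for kmem_addr_get_slot_idx(): in a 32K sizeclass chunk, an
 * address 0x10000 bytes past chunk_start resolves to slot_idx 2, i.e. the
 * slot [chunk_start + 0x10000, chunk_start + 0x18000).
 */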
2556 
2557 static bool
2558 kmem_populate_needed(vm_offset_t from, vm_offset_t to)
2559 {
2560 #if KASAN
2561 #pragma unused(from, to)
2562 	return true;
2563 #else
2564 	vm_offset_t page_addr = trunc_page(from);
2565 
2566 	for (; page_addr < to; page_addr += PAGE_SIZE) {
2567 		/*
2568 		 * This can race with another thread doing a populate on the same metadata
2569 		 * page, where we see an updated pmap but unmapped KASan shadow, causing a
2570 		 * fault in the shadow when we first access the metadata page. Avoid this
2571 		 * by always synchronizing on the kmem_meta_lock with KASan.
2572 		 */
2573 		if (!pmap_find_phys(kernel_pmap, page_addr)) {
2574 			return true;
2575 		}
2576 	}
2577 
2578 	return false;
2579 #endif /* !KASAN */
2580 }
2581 
2582 static void
2583 kmem_populate_meta_locked(vm_offset_t from, vm_offset_t to)
2584 {
2585 	vm_offset_t page_addr = trunc_page(from);
2586 
2587 	vm_map_unlock(kernel_map);
2588 
2589 	for (; page_addr < to; page_addr += PAGE_SIZE) {
2590 		for (;;) {
2591 			kern_return_t ret = KERN_SUCCESS;
2592 
2593 			/*
2594 			 * All updates to kmem metadata are done under the kmem_meta_lock
2595 			 */
2596 			kmem_meta_lock();
2597 			if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
2598 				ret = kernel_memory_populate(page_addr,
2599 				    PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
2600 				    VM_KERN_MEMORY_OSFMK);
2601 			}
2602 			kmem_meta_unlock();
2603 
2604 			if (ret == KERN_SUCCESS) {
2605 				break;
2606 			}
2607 
2608 			/*
2609 			 * We can't block for pages (hence KMA_NOPAGEWAIT) under a global
2610 			 * lock as it leads to bad system deadlocks, so if the allocation failed,
2611 			 * we need to do the VM_PAGE_WAIT() outside of the lock.
2612 			 */
2613 			VM_PAGE_WAIT();
2614 		}
2615 	}
2616 
2617 	vm_map_lock(kernel_map);
2618 }
2619 
2620 __abortlike
2621 static void
2622 kmem_invalid_meta_panic(
2623 	struct kmem_page_meta  *meta,
2624 	uint32_t                slot_idx,
2625 	struct kmem_sizeclass   sizeclass)
2626 {
2627 	uint32_t size_idx = kmem_get_idx_from_size(sizeclass.ks_size);
2628 
2629 	if (slot_idx >= sizeclass.ks_num_elem) {
2630 		panic("Invalid slot idx %u [0:%u] for meta %p", slot_idx,
2631 		    sizeclass.ks_num_elem, meta);
2632 	}
2633 	if (meta->km_sizeclass != size_idx) {
2634 		panic("Invalid size_idx (%u != %u) in meta %p", size_idx,
2635 		    meta->km_sizeclass, meta);
2636 	}
2637 	panic("page_marker %u not primary in meta %p", meta->km_page_marker, meta);
2638 }
2639 
2640 __abortlike
2641 static void
2642 kmem_slot_has_entry_panic(
2643 	vm_map_entry_t          entry,
2644 	vm_map_offset_t         addr)
2645 {
2646 	panic("Entry (%p) already exists for addr (%p) being returned",
2647 	    entry, (void *)addr);
2648 }
2649 
2650 __abortlike
2651 static void
2652 kmem_slot_not_found(
2653 	struct kmem_page_meta  *meta,
2654 	uint32_t                slot_idx)
2655 {
2656 	panic("%uth free slot not found for meta %p bitmap %u", slot_idx, meta,
2657 	    meta->km_bitmap);
2658 }
2659 
2660 /*
2661  * Returns a 16-bit random number between 0 and
2662  * upper_limit (inclusive)
2663  */
2664 __startup_func
2665 uint16_t
2666 kmem_get_random16(
2667 	uint16_t                upper_limit)
2668 {
2669 	static uint64_t random_entropy;
2670 	assert(upper_limit < UINT16_MAX);
2671 	if (random_entropy == 0) {
2672 		random_entropy = early_random();
2673 	}
2674 	uint32_t result = random_entropy & UINT32_MAX;
2675 	random_entropy >>= 32;
2676 	return (uint16_t)(result % (upper_limit + 1));
2677 }
2678 
2679 static uint32_t
2680 kmem_get_nth_free_slot(
2681 	struct kmem_page_meta  *meta,
2682 	uint32_t                n,
2683 	uint32_t                bitmap)
2684 {
2685 	uint32_t zeros_seen = 0, ones_seen = 0;
2686 
2687 	while (bitmap) {
2688 		uint32_t count = __builtin_ctz(bitmap);
2689 
2690 		zeros_seen += count;
2691 		bitmap >>= count;
2692 		if (__probable(~bitmap)) {
2693 			count = __builtin_ctz(~bitmap);
2694 		} else {
2695 			count = 32;
2696 		}
2697 		if (count + ones_seen > n) {
2698 			return zeros_seen + n;
2699 		}
2700 		ones_seen += count;
2701 		bitmap >>= count;
2702 	}
2703 
2704 	kmem_slot_not_found(meta, n);
2705 }
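/*
 * Example for kmem_get_nth_free_slot(): with bitmap 0b11010 (slots 1, 3
 * and 4 free), n = 1 selects slot index 3, the second free slot counting
 * from bit 0.
 */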
2706 
2707 
2708 static uint32_t
2709 kmem_get_next_slot(
2710 	struct kmem_page_meta  *meta,
2711 	struct kmem_sizeclass   sizeclass,
2712 	uint32_t                bitmap)
2713 {
2714 	uint32_t num_slots = __builtin_popcount(bitmap);
2715 	uint64_t slot_idx = 0;
2716 
2717 	assert(num_slots > 0);
2718 	if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
2719 		/*
2720 		 * Use early random prior to early boot as the ks_rng_ctx requires
2721 		 * the corecrypto module to be setup before it is initialized and
2722 		 * used.
2723 		 *
2724 		 * num_slots can't be 0 as we take this path when we have more than
2725 		 * one slot left.
2726 		 */
2727 		slot_idx = kmem_get_random16((uint16_t)num_slots - 1);
2728 	} else {
2729 		crypto_random_uniform(zpercpu_get(sizeclass.ks_rng_ctx), num_slots,
2730 		    &slot_idx);
2731 	}
2732 
2733 	return kmem_get_nth_free_slot(meta, slot_idx, bitmap);
2734 }
2735 
2736 /*
2737  * Returns an unallocated slot from the given metadata
2738  */
2739 static vm_map_offset_t
2740 kmem_get_addr_from_meta(
2741 	struct kmem_page_meta  *meta,
2742 	vm_map_range_id_t       range_id,
2743 	struct kmem_sizeclass   sizeclass,
2744 	vm_map_entry_t         *entry)
2745 {
2746 	vm_map_offset_t addr;
2747 	vm_map_size_t size = sizeclass.ks_size;
2748 	uint32_t size_idx = kmem_get_idx_from_size(size);
2749 	uint64_t meta_idx = meta - kmem_meta_base[range_id];
2750 	mach_vm_offset_t range_start = kmem_ranges[range_id].min_address;
2751 	uint32_t slot_bit;
2752 	uint32_t slot_idx = kmem_get_next_slot(meta, sizeclass, meta->km_bitmap);
2753 
2754 	if ((slot_idx >= sizeclass.ks_num_elem) ||
2755 	    (meta->km_sizeclass != size_idx) ||
2756 	    (meta->km_page_marker != KMEM_META_PRIMARY)) {
2757 		kmem_invalid_meta_panic(meta, slot_idx, sizeclass);
2758 	}
2759 
2760 	slot_bit = kmem_slot_idx_to_bit(slot_idx, size_idx);
2761 	meta->km_bitmap &= ~slot_bit;
2762 
2763 	addr = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN) + (slot_idx * size);
2764 	assert(kmem_range_contains_fully(range_id, addr, size));
2765 	if (vm_map_lookup_entry(kernel_map, addr, entry)) {
2766 		kmem_slot_has_entry_panic(*entry, addr);
2767 	}
2768 	if ((*entry != vm_map_to_entry(kernel_map)) &&
2769 	    ((*entry)->vme_next != vm_map_to_entry(kernel_map)) &&
2770 	    ((*entry)->vme_next->vme_start < (addr + size))) {
2771 		kmem_slot_has_entry_panic(*entry, addr);
2772 	}
2773 	return addr;
2774 }
2775 
2776 __abortlike
2777 static void
2778 kmem_range_out_of_va(
2779 	kmem_range_id_t         range_id,
2780 	uint32_t                num_chunks)
2781 {
2782 	panic("No more VA to allocate %u chunks in range %u", num_chunks, range_id);
2783 }
2784 
2785 static void
2786 kmem_init_allocated_chunk(
2787 	struct kmem_page_meta  *meta,
2788 	struct kmem_sizeclass   sizeclass,
2789 	uint32_t                size_idx)
2790 {
2791 	uint32_t meta_num = sizeclass.ks_num_chunk;
2792 	uint32_t num_elem = sizeclass.ks_num_elem;
2793 
2794 	meta->km_bitmap = (1ull << num_elem) - 1;
2795 	meta->km_chunk_len = (uint16_t)meta_num;
2796 	assert(LIST_NEXT(meta, km_link) == NULL);
2797 	assert(meta->km_link.le_prev == NULL);
2798 	meta->km_sizeclass = (uint8_t)size_idx;
2799 	meta->km_page_marker = KMEM_META_PRIMARY;
2800 	meta++;
2801 	for (uint32_t i = 1; i < meta_num; i++) {
2802 		meta->km_page_idx = (uint16_t)i;
2803 		meta->km_sizeclass = (uint8_t)size_idx;
2804 		meta->km_page_marker = 0;
2805 		meta->km_bitmap = 0;
2806 		meta++;
2807 	}
2808 }
2809 
2810 static uint32_t
2811 kmem_get_additional_meta(
2812 	struct kmem_page_meta  *meta,
2813 	uint32_t                meta_req,
2814 	bool                    from_right,
2815 	struct kmem_page_meta **adj_free_meta)
2816 {
2817 	struct kmem_page_meta *meta_prev = from_right ? meta : (meta - 1);
2818 
2819 	if (meta_prev->km_page_marker == KMEM_META_FREE) {
2820 		uint32_t chunk_len = kmem_get_free_chunk_len(meta_prev);
2821 
2822 		*adj_free_meta = from_right ? meta_prev : (meta_prev - chunk_len + 1);
2823 		meta_req -= chunk_len;
2824 	} else {
2825 		*adj_free_meta = NULL;
2826 	}
2827 
2828 	return meta_req;
2829 }
2830 
2831 
2832 static struct kmem_page_meta *
2833 kmem_get_new_chunk(
2834 	vm_map_range_id_t       range_id,
2835 	bool                    from_right,
2836 	uint32_t                size_idx)
2837 {
2838 	struct kmem_sizeclass sizeclass = kmem_size_array[size_idx];
2839 	struct kmem_page_meta *start, *end, *meta_update;
2840 	struct kmem_page_meta *adj_free_meta = NULL;
2841 	uint32_t meta_req = sizeclass.ks_num_chunk;
2842 
2843 	for (;;) {
2844 		struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
2845 		struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
2846 		struct kmem_page_meta *meta;
2847 		vm_offset_t start_addr, end_addr;
2848 		uint32_t meta_num;
2849 
2850 		meta = from_right ? metab : metaf;
2851 		meta_num = kmem_get_additional_meta(meta, meta_req, from_right,
2852 		    &adj_free_meta);
2853 
2854 		if (metaf + meta_num >= metab) {
2855 			kmem_range_out_of_va(range_id, meta_num);
2856 		}
2857 
2858 		start = from_right ? (metab - meta_num) : metaf;
2859 		end = from_right ? metab : (metaf + meta_num);
2860 
2861 		start_addr = (vm_offset_t)start;
2862 		end_addr   = (vm_offset_t)end;
2863 
2864 		/*
2865 		 * If the new high watermark stays on the same page,
2866 		 * no need to populate and drop the lock.
2867 		 */
2868 		if (!page_aligned(from_right ? end_addr : start_addr) &&
2869 		    trunc_page(start_addr) == trunc_page(end_addr - 1)) {
2870 			break;
2871 		}
2872 		if (!kmem_populate_needed(start_addr, end_addr)) {
2873 			break;
2874 		}
2875 
2876 		kmem_populate_meta_locked(start_addr, end_addr);
2877 
2878 		/*
2879 		 * Since we dropped the lock, reassess whether the conditions still hold:
2880 		 * - the HWM we are changing must not have moved
2881 		 * - the other HWM must not intersect with ours
2882 		 * - in case of coalescing, the adjacent free meta must still
2883 		 *   be free and of the same size.
2884 		 *
2885 		 * If we failed to grow, reevaluate whether freelists have
2886 		 * entries now by returning NULL.
2887 		 */
2888 		metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
2889 		metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
2890 		if (meta != (from_right ? metab : metaf)) {
2891 			return NULL;
2892 		}
2893 		if (metaf + meta_num >= metab) {
2894 			kmem_range_out_of_va(range_id, meta_num);
2895 		}
2896 		if (adj_free_meta) {
2897 			if (adj_free_meta->km_page_marker != KMEM_META_FREE ||
2898 			    kmem_get_free_chunk_len(adj_free_meta) !=
2899 			    meta_req - meta_num) {
2900 				return NULL;
2901 			}
2902 		}
2903 
2904 		break;
2905 	}
2906 
2907 	/*
2908 	 * If there is an adjacent free chunk remove it from free list
2909 	 */
2910 	if (adj_free_meta) {
2911 		LIST_REMOVE(adj_free_meta, km_link);
2912 		LIST_NEXT(adj_free_meta, km_link) = NULL;
2913 		adj_free_meta->km_link.le_prev = NULL;
2914 	}
2915 
2916 	/*
2917 	 * Update hwm
2918 	 */
2919 	meta_update = from_right ? start : end;
2920 	kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta_update;
2921 
2922 	/*
2923 	 * Initialize metadata
2924 	 */
2925 	start = from_right ? start : (end - meta_req);
2926 	kmem_init_allocated_chunk(start, sizeclass, size_idx);
2927 
2928 	return start;
2929 }
2930 
2931 static void
2932 kmem_requeue_meta(
2933 	struct kmem_page_meta  *meta,
2934 	struct kmem_list_head  *head)
2935 {
2936 	LIST_REMOVE(meta, km_link);
2937 	LIST_INSERT_HEAD(head, meta, km_link);
2938 }
2939 
2940 /*
2941  * Return corresponding sizeclass to stash free chunks in
2942  */
2943 __abortlike
2944 static void
2945 kmem_invalid_chunk_num(uint32_t chunks)
2946 {
2947 	panic("Invalid number of chunks %u\n", chunks);
2948 }
2949 
2950 static uint32_t
2951 kmem_get_size_idx_for_chunks(uint32_t chunks)
2952 {
2953 	for (uint32_t i = KMEM_NUM_SIZECLASS - 1; i > 0; i--) {
2954 		if (chunks >= kmem_size_array[i].ks_num_chunk) {
2955 			return i;
2956 		}
2957 	}
2958 	kmem_invalid_chunk_num(chunks);
2959 }
2960 
2961 static void
2962 kmem_clear_meta_range(struct kmem_page_meta *meta, uint32_t count)
2963 {
2964 	bzero(meta, count * sizeof(struct kmem_page_meta));
2965 }
2966 
2967 static void
2968 kmem_check_meta_range_is_clear(struct kmem_page_meta *meta, uint32_t count)
2969 {
2970 #if MACH_ASSERT
2971 	size_t size = count * sizeof(struct kmem_page_meta);
2972 
2973 	assert(memcmp_zero_ptr_aligned(meta, size) == 0);
2974 #else
2975 #pragma unused(meta, count)
2976 #endif
2977 }
2978 
2979 /*!
2980  * @function kmem_init_free_chunk()
2981  *
2982  * @discussion
2983  * This function prepares a range of chunks to be put on a free list.
2984  * The first and last metadata might be dirty, but the "inner" ones
2985  * must be zero filled by the caller prior to calling this function.
2986  */
2987 static void
2988 kmem_init_free_chunk(
2989 	struct kmem_page_meta  *meta,
2990 	uint32_t                num_chunks,
2991 	uint32_t                front)
2992 {
2993 	struct kmem_sizeclass *sizeclass;
2994 	uint32_t size_idx = kmem_get_size_idx_for_chunks(num_chunks);
2995 
2996 	if (num_chunks > 2) {
2997 		kmem_check_meta_range_is_clear(meta + 1, num_chunks - 2);
2998 	}
2999 
3000 	meta[0] = (struct kmem_page_meta){
3001 		.km_free_chunks = num_chunks,
3002 		.km_page_marker = KMEM_META_FREE,
3003 		.km_sizeclass   = (uint8_t)size_idx,
3004 	};
3005 	if (num_chunks > 1) {
3006 		meta[num_chunks - 1] = (struct kmem_page_meta){
3007 			.km_free_chunks = num_chunks,
3008 			.km_page_marker = KMEM_META_FREE,
3009 			.km_sizeclass   = (uint8_t)size_idx,
3010 		};
3011 	}
3012 
3013 	sizeclass = &kmem_size_array[size_idx];
3014 	LIST_INSERT_HEAD(&sizeclass->ks_allfree_head[front], meta, km_link);
3015 }
3016 
3017 static struct kmem_page_meta *
3018 kmem_get_free_chunk_from_list(
3019 	struct kmem_sizeclass  *org_sizeclass,
3020 	uint32_t                size_idx,
3021 	uint32_t                front)
3022 {
3023 	struct kmem_sizeclass *sizeclass;
3024 	uint32_t num_chunks = org_sizeclass->ks_num_chunk;
3025 	struct kmem_page_meta *meta;
3026 	uint32_t idx = size_idx;
3027 
3028 	while (idx < KMEM_NUM_SIZECLASS) {
3029 		sizeclass = &kmem_size_array[idx];
3030 		meta = LIST_FIRST(&sizeclass->ks_allfree_head[front]);
3031 		if (meta) {
3032 			break;
3033 		}
3034 		idx++;
3035 	}
3036 
3037 	/*
3038 	 * Trim if larger in size
3039 	 */
3040 	if (meta) {
3041 		uint32_t num_chunks_free = kmem_get_free_chunk_len(meta);
3042 
3043 		assert(meta->km_page_marker == KMEM_META_FREE);
3044 		LIST_REMOVE(meta, km_link);
3045 		LIST_NEXT(meta, km_link) = NULL;
3046 		meta->km_link.le_prev = NULL;
3047 		if (num_chunks_free > num_chunks) {
3048 			num_chunks_free -= num_chunks;
3049 			kmem_init_free_chunk(meta + num_chunks, num_chunks_free, front);
3050 		}
3051 
3052 		kmem_init_allocated_chunk(meta, *org_sizeclass, size_idx);
3053 	}
3054 
3055 	return meta;
3056 }
3057 
3058 kern_return_t
3059 kmem_locate_space(
3060 	vm_map_size_t           size,
3061 	vm_map_range_id_t       range_id,
3062 	bool                    from_right,
3063 	vm_map_offset_t        *start_inout,
3064 	vm_map_entry_t         *entry_out)
3065 {
3066 	vm_map_entry_t entry;
3067 	uint32_t size_idx = kmem_get_idx_from_size(size);
3068 	uint32_t front = kmem_get_front(range_id, from_right);
3069 	struct kmem_sizeclass *sizeclass = &kmem_size_array[size_idx];
3070 	struct kmem_page_meta *meta;
3071 
3072 	assert(size <= sizeclass->ks_size);
3073 again:
3074 	if ((meta = LIST_FIRST(&sizeclass->ks_partial_head[front])) != NULL) {
3075 		*start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3076 		/*
3077 		 * Requeue to full if necessary
3078 		 */
3079 		assert(meta->km_page_marker == KMEM_META_PRIMARY);
3080 		if (__builtin_popcount(meta->km_bitmap) == KMEM_NUM_GUARDS) {
3081 			kmem_requeue_meta(meta, &sizeclass->ks_full_head[front]);
3082 		}
3083 	} else if ((meta = kmem_get_free_chunk_from_list(sizeclass, size_idx,
3084 	    front)) != NULL) {
3085 		*start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3086 		/*
3087 		 * Queue to partial
3088 		 */
3089 		assert(meta->km_page_marker == KMEM_META_PRIMARY);
3090 		assert(__builtin_popcount(meta->km_bitmap) > KMEM_NUM_GUARDS);
3091 		LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link);
3092 	} else {
3093 		meta = kmem_get_new_chunk(range_id, from_right, size_idx);
3094 		if (meta == NULL) {
3095 			goto again;
3096 		}
3097 		*start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3098 		assert(meta->km_page_marker == KMEM_META_PRIMARY);
3099 		LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link);
3100 	}
3101 
3102 	if (entry_out) {
3103 		*entry_out = entry;
3104 	}
3105 
3106 	return KERN_SUCCESS;
3107 }
3108 
3109 /*
3110  * Determine whether the given metadata was allocated from the right
3111  */
3112 static bool
3113 kmem_meta_is_from_right(
3114 	kmem_range_id_t         range_id,
3115 	struct kmem_page_meta  *meta)
3116 {
3117 	struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3118 #if DEBUG || DEVELOPMENT
3119 	struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3120 #endif
3121 	struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
3122 	struct kmem_page_meta *meta_end;
3123 
3124 	meta_end = (struct kmem_page_meta *)kmem_meta_range[range_id].max_address;
3125 
3126 	if ((meta >= meta_base) && (meta < metaf)) {
3127 		return false;
3128 	}
3129 
3130 	assert(meta >= metab && meta < meta_end);
3131 	return true;
3132 }
3133 
3134 static void
3135 kmem_free_chunk(
3136 	kmem_range_id_t         range_id,
3137 	struct kmem_page_meta  *meta,
3138 	bool                    from_right)
3139 {
3140 	struct kmem_page_meta *meta_coalesce = meta - 1;
3141 	struct kmem_page_meta *meta_start = meta;
3142 	uint32_t num_chunks = kmem_get_chunk_len(meta);
3143 	uint32_t add_chunks;
3144 	struct kmem_page_meta *meta_end = meta + num_chunks;
3145 	struct kmem_page_meta *meta_hwm_l, *meta_hwm_r;
3146 	uint32_t front = kmem_get_front(range_id, from_right);
3147 
3148 	meta_hwm_l = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3149 	meta_hwm_r = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3150 
3151 	LIST_REMOVE(meta, km_link);
3152 	kmem_clear_meta_range(meta, num_chunks);
3153 
3154 	/*
3155 	 * Coalesce left
3156 	 */
3157 	if (((from_right && (meta_coalesce >= meta_hwm_r)) || !from_right) &&
3158 	    (meta_coalesce->km_page_marker == KMEM_META_FREE)) {
3159 		meta_start = meta_coalesce - kmem_get_free_chunk_len(meta_coalesce) + 1;
3160 		add_chunks = kmem_get_free_chunk_len(meta_start);
3161 		num_chunks += add_chunks;
3162 		LIST_REMOVE(meta_start, km_link);
3163 		kmem_clear_meta_range(meta_start + add_chunks - 1, 1);
3164 	}
3165 
3166 	/*
3167 	 * Coalesce right
3168 	 */
3169 	if (((!from_right && (meta_end < meta_hwm_l)) || from_right) &&
3170 	    (meta_end->km_page_marker == KMEM_META_FREE)) {
3171 		add_chunks = kmem_get_free_chunk_len(meta_end);
3172 		LIST_REMOVE(meta_end, km_link);
3173 		kmem_clear_meta_range(meta_end, 1);
3174 		meta_end = meta_end + add_chunks;
3175 		num_chunks += add_chunks;
3176 	}
3177 
3178 	kmem_init_free_chunk(meta_start, num_chunks, front);
3179 }
3180 
3181 static void
3182 kmem_free_slot(
3183 	kmem_range_id_t         range_id,
3184 	mach_vm_range_t         slot)
3185 {
3186 	struct kmem_page_meta *meta;
3187 	vm_map_offset_t chunk_start;
3188 	uint32_t size_idx, chunk_elem, slot_idx, num_elem;
3189 	struct kmem_sizeclass *sizeclass;
3190 	vm_map_size_t slot_size;
3191 
3192 	meta = kmem_addr_to_meta_start(slot->min_address, range_id, &chunk_start);
3193 	size_idx = meta->km_sizeclass;
3194 	slot_size = kmem_get_size_from_idx(size_idx);
3195 	slot_idx = (slot->min_address - chunk_start) / slot_size;
3196 	assert((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) == 0);
3197 	meta->km_bitmap |= kmem_slot_idx_to_bit(slot_idx, size_idx);
3198 
3199 	sizeclass = &kmem_size_array[size_idx];
3200 	chunk_elem = sizeclass->ks_num_elem;
3201 	num_elem = __builtin_popcount(meta->km_bitmap);
3202 
3203 	if (num_elem == chunk_elem) {
3204 		/*
3205 		 * If the entire chunk is empty, add it to the empty list
3206 		 */
3207 		bool from_right = kmem_meta_is_from_right(range_id, meta);
3208 
3209 		kmem_free_chunk(range_id, meta, from_right);
3210 	} else if (num_elem == KMEM_NUM_GUARDS + 1) {
3211 		/*
3212 		 * If we freed into a full chunk, move it to partial
3213 		 */
3214 		uint32_t front = kmem_get_front(range_id,
3215 		    kmem_meta_is_from_right(range_id, meta));
3216 
3217 		kmem_requeue_meta(meta, &sizeclass->ks_partial_head[front]);
3218 	}
3219 }
3220 
3221 void
3222 kmem_free_space(
3223 	vm_map_offset_t         start,
3224 	vm_map_offset_t         end,
3225 	vm_map_range_id_t       range_id,
3226 	mach_vm_range_t         slot)
3227 {
3228 	bool entry_present = false;
3229 	vm_map_entry_t prev_entry;
3230 	vm_map_entry_t next_entry;
3231 
3232 	if ((slot->min_address == start) && (slot->max_address == end)) {
3233 		/*
3234 		 * Entire slot is being freed at once
3235 		 */
3236 		return kmem_free_slot(range_id, slot);
3237 	}
3238 
3239 	entry_present = vm_map_lookup_entry(kernel_map, start, &prev_entry);
3240 	assert(!entry_present);
3241 	next_entry = prev_entry->vme_next;
3242 
3243 	if (((prev_entry == vm_map_to_entry(kernel_map) ||
3244 	    prev_entry->vme_end <= slot->min_address)) &&
3245 	    (next_entry == vm_map_to_entry(kernel_map) ||
3246 	    (next_entry->vme_start >= slot->max_address))) {
3247 		/*
3248 		 * Free entire slot
3249 		 */
3250 		kmem_free_slot(range_id, slot);
3251 	}
3252 }
3253 
3254 #pragma mark kmem init
3255 
3256 /*
3257  * The default percentage of memory that can be mlocked is scaled based on the total
3258  * amount of memory in the system. These percentages are calculated
3259  * offline and stored in this table. We index this table by
3260  * log2(max_mem) - VM_USER_WIREABLE_MIN_CONFIG. We clamp this index in the range
3261  * [0, sizeof(wire_limit_percents) / sizeof(vm_map_size_t))
3262  *
3263  * Note that these values were picked for mac.
3264  * If we ever have very large memory config arm devices, we may want to revisit
3265  * since the kernel overhead is smaller there due to the larger page size.
3266  */
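/*
 * Example: a 64GB (1 << 36 bytes) configuration indexes
 * wire_limit_percents[36 - 32] = 82, so roughly 82% of memory is user
 * wireable, subject to the VM_NOT_USER_WIREABLE_MAX cap below.
 */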
3267 
3268 /* Start scaling iff we're managing > 2^32 = 4GB of RAM. */
3269 #define VM_USER_WIREABLE_MIN_CONFIG 32
3270 #if CONFIG_JETSAM
3271 /* Systems with jetsam can wire a bit more b/c the system can relieve wired
3272  * pressure.
3273  */
3274 static vm_map_size_t wire_limit_percents[] =
3275 { 80, 80, 80, 80, 82, 85, 88, 91, 94, 97};
3276 #else
3277 static vm_map_size_t wire_limit_percents[] =
3278 { 70, 73, 76, 79, 82, 85, 88, 91, 94, 97};
3279 #endif /* CONFIG_JETSAM */
3280 
3281 /*
3282  * Sets the default global user wire limit which limits the amount of
3283  * memory that can be locked via mlock() based on the above algorithm.
3284  * This can be overridden via a sysctl.
3285  */
3286 static void
3287 kmem_set_user_wire_limits(void)
3288 {
3289 	uint64_t available_mem_log;
3290 	uint64_t max_wire_percent;
3291 	size_t wire_limit_percents_length = sizeof(wire_limit_percents) /
3292 	    sizeof(vm_map_size_t);
3293 	vm_map_size_t limit;
3294 	uint64_t config_memsize = max_mem;
3295 #if defined(XNU_TARGET_OS_OSX)
3296 	config_memsize = max_mem_actual;
3297 #endif /* defined(XNU_TARGET_OS_OSX) */
3298 
3299 	available_mem_log = bit_floor(config_memsize);
3300 
3301 	if (available_mem_log < VM_USER_WIREABLE_MIN_CONFIG) {
3302 		available_mem_log = 0;
3303 	} else {
3304 		available_mem_log -= VM_USER_WIREABLE_MIN_CONFIG;
3305 	}
3306 	if (available_mem_log >= wire_limit_percents_length) {
3307 		available_mem_log = wire_limit_percents_length - 1;
3308 	}
3309 	max_wire_percent = wire_limit_percents[available_mem_log];
3310 
3311 	limit = config_memsize * max_wire_percent / 100;
3312 	/* Cap the number of non lockable bytes at VM_NOT_USER_WIREABLE_MAX */
3313 	if (config_memsize - limit > VM_NOT_USER_WIREABLE_MAX) {
3314 		limit = config_memsize - VM_NOT_USER_WIREABLE_MAX;
3315 	}
3316 
3317 	vm_global_user_wire_limit = limit;
3318 	/* the default per task limit is the same as the global limit */
3319 	vm_per_task_user_wire_limit = limit;
3320 	vm_add_wire_count_over_global_limit = 0;
3321 	vm_add_wire_count_over_user_limit = 0;
3322 }
3323 
3324 #define KMEM_MAX_CLAIMS 50
3325 __startup_data
3326 struct kmem_range_startup_spec kmem_claims[KMEM_MAX_CLAIMS] = {};
3327 __startup_data
3328 uint32_t kmem_claim_count = 0;
3329 
3330 __startup_func
3331 void
3332 kmem_range_startup_init(
3333 	struct kmem_range_startup_spec *sp)
3334 {
3335 	assert(kmem_claim_count < KMEM_MAX_CLAIMS - KMEM_RANGE_COUNT);
3336 	if (sp->kc_calculate_sz) {
3337 		sp->kc_size = (sp->kc_calculate_sz)();
3338 	}
3339 	if (sp->kc_size) {
3340 		kmem_claims[kmem_claim_count] = *sp;
3341 		kmem_claim_count++;
3342 	}
3343 }
3344 
3345 static vm_offset_t
3346 kmem_fuzz_start(void)
3347 {
3348 	vm_offset_t kmapoff_kaddr = 0;
3349 	uint32_t kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */
3350 	vm_map_size_t kmapoff_size = ptoa(kmapoff_pgcnt);
3351 
3352 	kmem_alloc(kernel_map, &kmapoff_kaddr, kmapoff_size,
3353 	    KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY,
3354 	    VM_KERN_MEMORY_OSFMK);
3355 	return kmapoff_kaddr + kmapoff_size;
3356 }
3357 
3358 /*
3359  * Generate a randomly shuffled array of indices from 0 to count - 1
3360  */
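/*
 * This is the "inside-out" variant of the Fisher-Yates shuffle: it
 * initializes and permutes the index array in a single pass.
 */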
3361 __startup_func
3362 void
3363 kmem_shuffle(
3364 	uint16_t       *shuffle_buf,
3365 	uint16_t        count)
3366 {
3367 	for (uint16_t i = 0; i < count; i++) {
3368 		uint16_t j = kmem_get_random16(i);
3369 		if (j != i) {
3370 			shuffle_buf[i] = shuffle_buf[j];
3371 		}
3372 		shuffle_buf[j] = i;
3373 	}
3374 }
3375 
3376 __startup_func
3377 static void
3378 kmem_shuffle_claims(void)
3379 {
3380 	uint16_t shuffle_buf[KMEM_MAX_CLAIMS] = {};
3381 	uint16_t limit = (uint16_t)kmem_claim_count;
3382 
3383 	kmem_shuffle(&shuffle_buf[0], limit);
3384 	for (uint16_t i = 0; i < limit; i++) {
3385 		struct kmem_range_startup_spec tmp = kmem_claims[i];
3386 		kmem_claims[i] = kmem_claims[shuffle_buf[i]];
3387 		kmem_claims[shuffle_buf[i]] = tmp;
3388 	}
3389 }
3390 
3391 __startup_func
3392 static void
3393 kmem_readjust_ranges(
3394 	uint32_t        cur_idx)
3395 {
3396 	assert(cur_idx != 0);
3397 	uint32_t j = cur_idx - 1, random;
3398 	struct kmem_range_startup_spec sp = kmem_claims[cur_idx];
3399 	struct mach_vm_range *sp_range = sp.kc_range;
3400 
3401 	/*
3402 	 * Find max index where restriction is met
3403 	 */
3404 	for (; j > 0; j--) {
3405 		struct kmem_range_startup_spec spj = kmem_claims[j];
3406 		vm_map_offset_t max_start = spj.kc_range->min_address;
3407 		if (spj.kc_flags & KC_NO_MOVE) {
3408 			panic("kmem_range_init: Can't scramble with multiple constraints");
3409 		}
3410 		if (max_start <= sp_range->min_address) {
3411 			break;
3412 		}
3413 	}
3414 
3415 	/*
3416 	 * Pick a random index from 0 to max index and shift claims to the right
3417 	 * to make room for restricted claim
3418 	 */
3419 	random = kmem_get_random16((uint16_t)j);
3420 	assert(random <= j);
3421 
3422 	sp_range->min_address = kmem_claims[random].kc_range->min_address;
3423 	sp_range->max_address = sp_range->min_address + sp.kc_size;
3424 
3425 	for (j = cur_idx - 1; j >= random && j != UINT32_MAX; j--) {
3426 		struct kmem_range_startup_spec spj = kmem_claims[j];
3427 		struct mach_vm_range *range = spj.kc_range;
3428 		range->min_address += sp.kc_size;
3429 		range->max_address += sp.kc_size;
3430 		kmem_claims[j + 1] = spj;
3431 	}
3432 
3433 	sp.kc_flags = KC_NO_MOVE;
3434 	kmem_claims[random] = sp;
3435 }
3436 
3437 __startup_func
3438 static vm_map_size_t
3439 kmem_add_ptr_claims(void)
3440 {
3441 	uint64_t kmem_meta_num, kmem_ptr_chunks;
3442 	vm_map_size_t org_ptr_range_size = ptr_range_size;
3443 
3444 	ptr_range_size -= PAGE_SIZE;
3445 	ptr_range_size *= KMEM_CHUNK_SIZE_MIN;
3446 	ptr_range_size /= (KMEM_CHUNK_SIZE_MIN + sizeof(struct kmem_page_meta));
3447 
3448 	kmem_ptr_chunks = ptr_range_size / KMEM_CHUNK_SIZE_MIN;
3449 	ptr_range_size = kmem_ptr_chunks * KMEM_CHUNK_SIZE_MIN;
3450 
3451 	kmem_meta_num = kmem_ptr_chunks + 2;
3452 	kmem_meta_size = round_page(kmem_meta_num * sizeof(struct kmem_page_meta));
3453 
3454 	assert(kmem_meta_size + ptr_range_size <= org_ptr_range_size);
3455 	/*
3456 	 * Add claims for kmem's ranges
3457 	 */
3458 	for (uint32_t i = 0; i < kmem_ptr_ranges; i++) {
3459 		struct kmem_range_startup_spec kmem_spec = {
3460 			.kc_name = "kmem_ptr_range",
3461 			.kc_range = &kmem_ranges[KMEM_RANGE_ID_PTR_0 + i],
3462 			.kc_size = ptr_range_size,
3463 			.kc_flags = KC_NO_ENTRY,
3464 		};
3465 		kmem_claims[kmem_claim_count++] = kmem_spec;
3466 
3467 		struct kmem_range_startup_spec kmem_meta_spec = {
3468 			.kc_name = "kmem_ptr_range_meta",
3469 			.kc_range = &kmem_meta_range[KMEM_RANGE_ID_PTR_0 + i],
3470 			.kc_size = kmem_meta_size,
3471 			.kc_flags = KC_NONE,
3472 		};
3473 		kmem_claims[kmem_claim_count++] = kmem_meta_spec;
3474 	}
3475 	return (org_ptr_range_size - ptr_range_size - kmem_meta_size) *
3476 	       kmem_ptr_ranges;
3477 }
3478 
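/*
 * Editor's note (worked arithmetic, not part of the original source): the
 * scaling above splits each pointer range's VA budget between usable chunks
 * and their page metadata. Starting from an original budget S, the usable
 * portion is roughly
 *
 *     usable = (S - PAGE_SIZE) * KMEM_CHUNK_SIZE_MIN
 *              / (KMEM_CHUNK_SIZE_MIN + sizeof(struct kmem_page_meta))
 *
 * which is then rounded down to a whole number of chunks; the metadata for
 * those chunks (plus two extra metadata entries) is rounded up to a page. The
 * assert confirms that chunks plus metadata still fit in the original budget,
 * and the per-range leftover, multiplied by the number of pointer ranges, is
 * returned so the caller can fold it back into the data range.
 */
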
3479 __startup_func
3480 static void
3481 kmem_add_extra_claims(void)
3482 {
3483 	vm_map_size_t largest_free_size = 0, total_claims = 0;
3484 
3485 	vm_map_sizes(kernel_map, NULL, NULL, &largest_free_size);
3486 	largest_free_size = trunc_page(largest_free_size);
3487 
3488 	/*
3489 	 * kasan and configs w/o *TRR need to have just one ptr range due to
3490 	 * resource constraints.
3491 	 */
3492 #if !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
3493 	kmem_ptr_ranges = 1;
3494 #endif
3495 	/*
3496 	 * Determine size of data and pointer kmem_ranges
3497 	 */
3498 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3499 		total_claims += kmem_claims[i].kc_size;
3500 	}
3501 	assert((total_claims & PAGE_MASK) == 0);
3502 	largest_free_size -= total_claims;
3503 
3504 	/*
3505 	 * Use half the total available VA for all pointer allocations (this
3506 	 * includes the kmem_sprayqtn range). Given that we have 4 total
3507 	 * ranges, divide the available VA by 8.
3508 	 */
3509 	ptr_range_size = largest_free_size / ((kmem_ptr_ranges + 1) * 2);
3510 	sprayqtn_range_size = ptr_range_size;
3511 
3512 	if (sprayqtn_range_size > (sane_size / 2)) {
3513 		sprayqtn_range_size = sane_size / 2;
3514 	}
3515 
3516 	ptr_range_size = round_page(ptr_range_size);
3517 	sprayqtn_range_size = round_page(sprayqtn_range_size);
3518 
3519 
3520 	data_range_size = largest_free_size
3521 	    - (ptr_range_size * kmem_ptr_ranges)
3522 	    - sprayqtn_range_size;
3523 
3524 	/*
3525 	 * Add claims for kmem's ranges
3526 	 */
3527 	data_range_size += kmem_add_ptr_claims();
3528 	assert(data_range_size + sprayqtn_range_size +
3529 	    ((ptr_range_size + kmem_meta_size) * kmem_ptr_ranges) <=
3530 	    largest_free_size);
3531 
3532 	struct kmem_range_startup_spec kmem_spec_sprayqtn = {
3533 		.kc_name = "kmem_sprayqtn_range",
3534 		.kc_range = &kmem_ranges[KMEM_RANGE_ID_SPRAYQTN],
3535 		.kc_size = sprayqtn_range_size,
3536 		.kc_flags = KC_NO_ENTRY,
3537 	};
3538 	kmem_claims[kmem_claim_count++] = kmem_spec_sprayqtn;
3539 
3540 	struct kmem_range_startup_spec kmem_spec_data = {
3541 		.kc_name = "kmem_data_range",
3542 		.kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA],
3543 		.kc_size = data_range_size,
3544 		.kc_flags = KC_NO_ENTRY,
3545 	};
3546 	kmem_claims[kmem_claim_count++] = kmem_spec_data;
3547 }
3548 
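/*
 * Editor's note (worked arithmetic, not part of the original source): with the
 * three pointer ranges implied by the comment above, the divisor is
 * (3 + 1) * 2 == 8, so each pointer range and the sprayqtn range initially
 * receives 1/8 of the largest free region, i.e. half of it in total. The
 * sprayqtn range is additionally capped at sane_size / 2, and the remainder
 * (plus whatever the pointer ranges give back for metadata rounding) becomes
 * the data range.
 */
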
3549 __startup_func
3550 static void
3551 kmem_scramble_ranges(void)
3552 {
3553 	vm_map_offset_t start = 0;
3554 
3555 	/*
3556 	 * Initialize the KMEM_RANGE_ID_NONE range to use the entire map so that
3557 	 * the VM can find the requested ranges.
3558 	 */
3559 	kmem_ranges[KMEM_RANGE_ID_NONE].min_address = MAX(kernel_map->min_offset,
3560 	    VM_MAP_PAGE_SIZE(kernel_map));
3561 	kmem_ranges[KMEM_RANGE_ID_NONE].max_address = kernel_map->max_offset;
3562 
3563 	/*
3564 	 * Allocate the g_kext_map prior to randomizing the remaining submaps, as
3565 	 * this map is 2G in size and starts at the end of kernel_text on x86; it
3566 	 * could otherwise overflow into the heap.
3567 	 */
3568 	kext_alloc_init();
3569 
3570 	/*
3571 	 * Eat a random amount of kernel_map to fuzz subsequent heap, zone and
3572 	 * stack addresses. (With a 4K page and 9 bits of randomness, this
3573 	 * eats about 2M of VA from the map)
3574 	 *
3575 	 * Note that we always need to slide by at least one page because the VM
3576 	 * pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a base
3577 	 * do not admit this address to be part of any zone submap.
3578 	 */
3579 	start = kmem_fuzz_start();
3580 
3581 	/*
3582 	 * Add claims for ptr and data kmem_ranges
3583 	 */
3584 	kmem_add_extra_claims();
3585 
3586 	/*
3587 	 * Shuffle registered claims
3588 	 */
3589 	assert(kmem_claim_count < UINT16_MAX);
3590 	kmem_shuffle_claims();
3591 
3592 	/*
3593 	 * Apply restrictions and determine range for each claim
3594 	 */
3595 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3596 		vm_map_offset_t end = 0;
3597 		struct kmem_range_startup_spec sp = kmem_claims[i];
3598 		struct mach_vm_range *sp_range = sp.kc_range;
3599 		if (vm_map_locate_space(kernel_map, sp.kc_size, 0,
3600 		    VM_MAP_KERNEL_FLAGS_ANYWHERE(), &start, NULL) != KERN_SUCCESS) {
3601 			panic("kmem_range_init: vm_map_locate_space failing for claim %s",
3602 			    sp.kc_name);
3603 		}
3604 
3605 		end = start + sp.kc_size;
3606 		/*
3607 		 * Re-adjust ranges if restriction not met
3608 		 */
3609 		if (sp_range->min_address && start > sp_range->min_address) {
3610 			kmem_readjust_ranges(i);
3611 		} else {
3612 			sp_range->min_address = start;
3613 			sp_range->max_address = end;
3614 		}
3615 		start = end;
3616 	}
3617 
3618 	/*
3619 	 * We have settled on the ranges, now create temporary entries for the
3620 	 * claims
3621 	 */
3622 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3623 		struct kmem_range_startup_spec sp = kmem_claims[i];
3624 		vm_map_entry_t entry = NULL;
3625 		if (sp.kc_flags & KC_NO_ENTRY) {
3626 			continue;
3627 		}
3628 		if (vm_map_find_space(kernel_map, sp.kc_range->min_address, sp.kc_size, 0,
3629 		    VM_MAP_KERNEL_FLAGS_ANYWHERE(), &entry) != KERN_SUCCESS) {
3630 			panic("kmem_range_init: vm_map_find_space failing for claim %s",
3631 			    sp.kc_name);
3632 		}
3633 		vm_object_reference(kernel_object_default);
3634 		VME_OBJECT_SET(entry, kernel_object_default, false, 0);
3635 		VME_OFFSET_SET(entry, entry->vme_start);
3636 		vm_map_unlock(kernel_map);
3637 	}
3638 	/*
3639 	 * Now that we are done assigning all the ranges, reset
3640 	 * kmem_ranges[KMEM_RANGE_ID_NONE]
3641 	 */
3642 	kmem_ranges[KMEM_RANGE_ID_NONE] = (struct mach_vm_range) {};
3643 
3644 #if DEBUG || DEVELOPMENT
3645 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3646 		struct kmem_range_startup_spec sp = kmem_claims[i];
3647 
3648 		printf("%-24s: %p - %p (%u%c)\n", sp.kc_name,
3649 		    (void *)sp.kc_range->min_address,
3650 		    (void *)sp.kc_range->max_address,
3651 		    mach_vm_size_pretty(sp.kc_size),
3652 		    mach_vm_size_unit(sp.kc_size));
3653 	}
3654 #endif /* DEBUG || DEVELOPMENT */
3655 }
3656 
3657 __startup_func
3658 static void
3659 kmem_range_init(void)
3660 {
3661 	vm_size_t range_adjustment;
3662 
3663 	kmem_scramble_ranges();
3664 
3665 	range_adjustment = sprayqtn_range_size >> 3;
3666 	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address =
3667 	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address + range_adjustment;
3668 	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address =
3669 	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address;
3670 
3671 	range_adjustment = data_range_size >> 3;
3672 	kmem_large_ranges[KMEM_RANGE_ID_DATA].min_address =
3673 	    kmem_ranges[KMEM_RANGE_ID_DATA].min_address + range_adjustment;
3674 	kmem_large_ranges[KMEM_RANGE_ID_DATA].max_address =
3675 	    kmem_ranges[KMEM_RANGE_ID_DATA].max_address;
3676 
3677 	pmap_init();
3678 	kmem_metadata_init();
3679 	kmem_sizeclass_init();
3680 
3681 #if DEBUG || DEVELOPMENT
3682 	for (kmem_range_id_t i = 1; i < KMEM_RANGE_COUNT; i++) {
3683 		vm_size_t range_size = mach_vm_range_size(&kmem_large_ranges[i]);
3684 		printf("kmem_large_ranges[%d]    : %p - %p (%u%c)\n", i,
3685 		    (void *)kmem_large_ranges[i].min_address,
3686 		    (void *)kmem_large_ranges[i].max_address,
3687 		    mach_vm_size_pretty(range_size),
3688 		    mach_vm_size_unit(range_size));
3689 	}
3690 #endif
3691 }
3692 STARTUP(KMEM, STARTUP_RANK_THIRD, kmem_range_init);
3693 
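/*
 * Editor's note (worked arithmetic, not part of the original source): the
 * ">> 3" in kmem_range_init() above means kmem_large_ranges for the sprayqtn
 * and data ranges start 1/8 (12.5%) of the way into the corresponding
 * kmem_ranges entry while sharing its end address, presumably so that large
 * allocations are steered away from the first eighth of each range.
 */
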
3694 #if DEBUG || DEVELOPMENT
3695 __startup_func
3696 static void
3697 kmem_log_init(void)
3698 {
3699 	/*
3700 	 * The log can only be created after the kmem subsystem is initialized,
3701 	 * as btlog creation itself uses kmem.
3702 	 */
3703 	kmem_outlier_log = btlog_create(BTLOG_LOG, KMEM_OUTLIER_LOG_SIZE, 0);
3704 }
3705 STARTUP(ZALLOC, STARTUP_RANK_FIRST, kmem_log_init);
3706 
3707 kmem_gobj_stats
3708 kmem_get_gobj_stats(void)
3709 {
3710 	kmem_gobj_stats stats = {};
3711 
3712 	vm_map_lock(kernel_map);
3713 	for (uint8_t i = 0; i < kmem_ptr_ranges; i++) {
3714 		kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST + i;
3715 		struct mach_vm_range range = kmem_ranges[range_id];
3716 		struct kmem_page_meta *meta = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3717 		struct kmem_page_meta *meta_end;
3718 		uint64_t meta_idx = meta - kmem_meta_base[range_id];
3719 		vm_map_size_t used = 0, va = 0, meta_sz = 0, pte_sz = 0;
3720 		vm_map_offset_t addr;
3721 		vm_map_entry_t entry;
3722 
3723 		/*
3724 		 * Left front
3725 		 */
3726 		va = (meta_idx * KMEM_CHUNK_SIZE_MIN);
3727 		meta_sz = round_page(meta_idx * sizeof(struct kmem_page_meta));
3728 
3729 		/*
3730 		 * Right front
3731 		 */
3732 		meta = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3733 		meta_end = kmem_addr_to_meta(range.max_address, range_id, &addr,
3734 		    &meta_idx);
3735 		meta_idx = meta_end - meta;
3736 		meta_sz += round_page(meta_idx * sizeof(struct kmem_page_meta));
3737 		va += (meta_idx * KMEM_CHUNK_SIZE_MIN);
3738 
3739 		/*
3740 		 * Compute VA allocated in entire range
3741 		 */
3742 		if (vm_map_lookup_entry(kernel_map, range.min_address, &entry) == false) {
3743 			entry = entry->vme_next;
3744 		}
3745 		while (entry != vm_map_to_entry(kernel_map) &&
3746 		    entry->vme_start < range.max_address) {
3747 			used += (entry->vme_end - entry->vme_start);
3748 			entry = entry->vme_next;
3749 		}
3750 
3751 		pte_sz = round_page(atop(va - used) * 8);
3752 
3753 		stats.total_used += used;
3754 		stats.total_va += va;
3755 		stats.pte_sz += pte_sz;
3756 		stats.meta_sz += meta_sz;
3757 	}
3758 	vm_map_unlock(kernel_map);
3759 
3760 	return stats;
3761 }
3762 
3763 #endif /* DEBUG || DEVELOPMENT */
3764 
3765 /*
3766  *	kmem_init:
3767  *
3768  *	Initialize the kernel's virtual memory map, taking
3769  *	into account all memory allocated up to this time.
3770  */
3771 __startup_func
3772 void
3773 kmem_init(
3774 	vm_offset_t     start,
3775 	vm_offset_t     end)
3776 {
3777 	vm_map_offset_t map_start;
3778 	vm_map_offset_t map_end;
3779 
3780 	map_start = vm_map_trunc_page(start,
3781 	    VM_MAP_PAGE_MASK(kernel_map));
3782 	map_end = vm_map_round_page(end,
3783 	    VM_MAP_PAGE_MASK(kernel_map));
3784 
3785 	vm_map_will_allocate_early_map(&kernel_map);
3786 #if defined(__arm64__)
3787 	kernel_map = vm_map_create_options(pmap_kernel(),
3788 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS,
3789 	    VM_MAX_KERNEL_ADDRESS,
3790 	    VM_MAP_CREATE_DEFAULT);
3791 	/*
3792 	 *	Reserve virtual memory allocated up to this time.
3793 	 */
3794 	{
3795 		unsigned int    region_select = 0;
3796 		vm_map_offset_t region_start;
3797 		vm_map_size_t   region_size;
3798 		vm_map_offset_t map_addr;
3799 		kern_return_t kr;
3800 
3801 		while (pmap_virtual_region(region_select, &region_start, &region_size)) {
3802 			map_addr = region_start;
3803 			kr = vm_map_enter(kernel_map, &map_addr,
3804 			    vm_map_round_page(region_size,
3805 			    VM_MAP_PAGE_MASK(kernel_map)),
3806 			    (vm_map_offset_t) 0,
3807 			    VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(.vmkf_no_pmap_check = true),
3808 			    VM_OBJECT_NULL,
3809 			    (vm_object_offset_t) 0, FALSE, VM_PROT_NONE, VM_PROT_NONE,
3810 			    VM_INHERIT_DEFAULT);
3811 
3812 			if (kr != KERN_SUCCESS) {
3813 				panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
3814 				    (uint64_t) start, (uint64_t) end, (uint64_t) region_start,
3815 				    (uint64_t) region_size, kr);
3816 			}
3817 
3818 			region_select++;
3819 		}
3820 	}
3821 #else
3822 	kernel_map = vm_map_create_options(pmap_kernel(),
3823 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS, map_end,
3824 	    VM_MAP_CREATE_DEFAULT);
3825 	/*
3826 	 *	Reserve virtual memory allocated up to this time.
3827 	 */
3828 	if (start != VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
3829 		vm_map_offset_t map_addr;
3830 		kern_return_t kr;
3831 
3832 		map_addr = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
3833 		kr = vm_map_enter(kernel_map,
3834 		    &map_addr,
3835 		    (vm_map_size_t)(map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
3836 		    (vm_map_offset_t) 0,
3837 		    VM_MAP_KERNEL_FLAGS_FIXED(.vmkf_no_pmap_check = true),
3838 		    VM_OBJECT_NULL,
3839 		    (vm_object_offset_t) 0, FALSE,
3840 		    VM_PROT_NONE, VM_PROT_NONE,
3841 		    VM_INHERIT_DEFAULT);
3842 
3843 		if (kr != KERN_SUCCESS) {
3844 			panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
3845 			    (uint64_t) start, (uint64_t) end,
3846 			    (uint64_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS,
3847 			    (uint64_t) (map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
3848 			    kr);
3849 		}
3850 	}
3851 #endif
3852 
3853 	kmem_set_user_wire_limits();
3854 }
3855 
3856 
3857 #pragma mark map copyio
3858 
3859 /*
3860  *	Routine:	copyinmap
3861  *	Purpose:
3862  *		Like copyin, except that fromaddr is an address
3863  *		in the specified VM map.  This implementation
3864  *		is incomplete; it handles the current user map
3865  *		and the kernel map/submaps.
3866  */
3867 kern_return_t
3868 copyinmap(
3869 	vm_map_t                map,
3870 	vm_map_offset_t         fromaddr,
3871 	void                    *todata,
3872 	vm_size_t               length)
3873 {
3874 	kern_return_t   kr = KERN_SUCCESS;
3875 	vm_map_t oldmap;
3876 
3877 	if (vm_map_pmap(map) == pmap_kernel()) {
3878 		/* assume a correct copy */
3879 		memcpy(todata, CAST_DOWN(void *, fromaddr), length);
3880 	} else if (current_map() == map) {
3881 		if (copyin(fromaddr, todata, length) != 0) {
3882 			kr = KERN_INVALID_ADDRESS;
3883 		}
3884 	} else {
3885 		vm_map_reference(map);
3886 		oldmap = vm_map_switch(map);
3887 		if (copyin(fromaddr, todata, length) != 0) {
3888 			kr = KERN_INVALID_ADDRESS;
3889 		}
3890 		vm_map_switch(oldmap);
3891 		vm_map_deallocate(map);
3892 	}
3893 	return kr;
3894 }
3895 
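/*
 * Editor's illustration (not part of the original source): a sketch of how a
 * caller might use copyinmap() to read from another task's address space. The
 * map and user address passed in below are hypothetical.
 */
#if 0 /* illustrative sketch only, never compiled */
static kern_return_t
example_read_remote_u64(vm_map_t remote_map, vm_map_offset_t uaddr, uint64_t *out)
{
	/*
	 * copyinmap() handles the kernel map, the current user map, and a
	 * foreign map (by temporarily switching to it).
	 */
	return copyinmap(remote_map, uaddr, out, sizeof(*out));
}
#endif
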
3896 /*
3897  *	Routine:	copyoutmap
3898  *	Purpose:
3899  *		Like copyout, except that toaddr is an address
3900  *		in the specified VM map.
3901  */
3902 kern_return_t
3903 copyoutmap(
3904 	vm_map_t                map,
3905 	void                    *fromdata,
3906 	vm_map_address_t        toaddr,
3907 	vm_size_t               length)
3908 {
3909 	kern_return_t   kr = KERN_SUCCESS;
3910 	vm_map_t        oldmap;
3911 
3912 	if (vm_map_pmap(map) == pmap_kernel()) {
3913 		/* assume a correct copy */
3914 		memcpy(CAST_DOWN(void *, toaddr), fromdata, length);
3915 	} else if (current_map() == map) {
3916 		if (copyout(fromdata, toaddr, length) != 0) {
3917 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUTMAP_SAMEMAP_ERROR), KERN_INVALID_ADDRESS /* arg */);
3918 			kr = KERN_INVALID_ADDRESS;
3919 		}
3920 	} else {
3921 		vm_map_reference(map);
3922 		oldmap = vm_map_switch(map);
3923 		if (copyout(fromdata, toaddr, length) != 0) {
3924 			ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUTMAP_DIFFERENTMAP_ERROR), KERN_INVALID_ADDRESS /* arg */);
3925 			kr = KERN_INVALID_ADDRESS;
3926 		}
3927 		vm_map_switch(oldmap);
3928 		vm_map_deallocate(map);
3929 	}
3930 	return kr;
3931 }
3932 
3933 /*
3934  *	Routine:	copyoutmap_atomic{32, 64}
3935  *	Purpose:
3936  *		Like copyoutmap, except that the operation is atomic.
3937  *		Takes a value rather than a *fromdata pointer.
3938  */
3939 kern_return_t
3940 copyoutmap_atomic32(
3941 	vm_map_t                map,
3942 	uint32_t                value,
3943 	vm_map_address_t        toaddr)
3944 {
3945 	kern_return_t   kr = KERN_SUCCESS;
3946 	vm_map_t        oldmap;
3947 
3948 	if (vm_map_pmap(map) == pmap_kernel()) {
3949 		/* assume a correct toaddr */
3950 		*(uint32_t *)toaddr = value;
3951 	} else if (current_map() == map) {
3952 		if (copyout_atomic32(value, toaddr) != 0) {
3953 			kr = KERN_INVALID_ADDRESS;
3954 		}
3955 	} else {
3956 		vm_map_reference(map);
3957 		oldmap = vm_map_switch(map);
3958 		if (copyout_atomic32(value, toaddr) != 0) {
3959 			kr = KERN_INVALID_ADDRESS;
3960 		}
3961 		vm_map_switch(oldmap);
3962 		vm_map_deallocate(map);
3963 	}
3964 	return kr;
3965 }
3966 
3967 kern_return_t
3968 copyoutmap_atomic64(
3969 	vm_map_t                map,
3970 	uint64_t                value,
3971 	vm_map_address_t        toaddr)
3972 {
3973 	kern_return_t   kr = KERN_SUCCESS;
3974 	vm_map_t        oldmap;
3975 
3976 	if (vm_map_pmap(map) == pmap_kernel()) {
3977 		/* assume a correct toaddr */
3978 		*(uint64_t *)toaddr = value;
3979 	} else if (current_map() == map) {
3980 		if (copyout_atomic64(value, toaddr) != 0) {
3981 			kr = KERN_INVALID_ADDRESS;
3982 		}
3983 	} else {
3984 		vm_map_reference(map);
3985 		oldmap = vm_map_switch(map);
3986 		if (copyout_atomic64(value, toaddr) != 0) {
3987 			kr = KERN_INVALID_ADDRESS;
3988 		}
3989 		vm_map_switch(oldmap);
3990 		vm_map_deallocate(map);
3991 	}
3992 	return kr;
3993 }
3994 
3995 
3996 #pragma mark pointer obfuscation / packing
3997 
3998 /*
3999  *
4000  *	The following two functions are to be used when exposing kernel
4001  *	addresses to userspace via any of the various debug or info
4002  *	facilities that exist. These are basically the same as VM_KERNEL_ADDRPERM()
4003  *	and VM_KERNEL_UNSLIDE_OR_PERM() except they use a different random seed and
4004  *	are exported to KEXTs.
4005  *
4006  *	NOTE: USE THE MACRO VERSIONS OF THESE FUNCTIONS (in vm_param.h) FROM WITHIN THE KERNEL
4007  */
4008 
4009 vm_offset_t
4010 vm_kernel_addrhash_internal(vm_offset_t addr, uint64_t salt)
4011 {
4012 	assert(salt != 0);
4013 
4014 	if (addr == 0) {
4015 		return 0ul;
4016 	}
4017 
4018 	if (VM_KERNEL_IS_SLID(addr)) {
4019 		return VM_KERNEL_UNSLIDE(addr);
4020 	}
4021 
4022 	vm_offset_t sha_digest[SHA256_DIGEST_LENGTH / sizeof(vm_offset_t)];
4023 	SHA256_CTX sha_ctx;
4024 
4025 	SHA256_Init(&sha_ctx);
4026 	SHA256_Update(&sha_ctx, &salt, sizeof(salt));
4027 	SHA256_Update(&sha_ctx, &addr, sizeof(addr));
4028 	SHA256_Final(sha_digest, &sha_ctx);
4029 
4030 	return sha_digest[0];
4031 }
4032 
4033 __exported vm_offset_t
4034 vm_kernel_addrhash_external(vm_offset_t addr);
4035 vm_offset_t
4036 vm_kernel_addrhash_external(vm_offset_t addr)
4037 {
4038 	return vm_kernel_addrhash_internal(addr, vm_kernel_addrhash_salt_ext);
4039 }
4040 
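/*
 * Editor's illustration (not part of the original source): in-kernel callers
 * are expected to go through the macro wrappers in vm_param.h rather than call
 * these functions directly. The macro name below is an assumption based on
 * that note; treat the exact spelling and signature as unverified.
 */
#if 0 /* illustrative sketch only, never compiled */
static void
example_log_hashed_pointer(const void *obj)
{
	/* the macro feeds the kernel-internal salt on the caller's behalf */
	printf("object at %p\n", (void *)VM_KERNEL_ADDRHASH(obj));
}
#endif
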
4041 void
4042 vm_kernel_addrhide(
4043 	vm_offset_t addr,
4044 	vm_offset_t *hide_addr)
4045 {
4046 	*hide_addr = VM_KERNEL_ADDRHIDE(addr);
4047 }
4048 
4049 /*
4050  *	vm_kernel_addrperm_external:
4051  *	vm_kernel_unslide_or_perm_external:
4052  *
4053  *	Use these macros when exposing an address to userspace that could come from
4054  *	either kernel text/data *or* the heap.
4055  */
4056 void
4057 vm_kernel_addrperm_external(
4058 	vm_offset_t addr,
4059 	vm_offset_t *perm_addr)
4060 {
4061 	if (VM_KERNEL_IS_SLID(addr)) {
4062 		*perm_addr = VM_KERNEL_UNSLIDE(addr);
4063 	} else if (VM_KERNEL_ADDRESS(addr)) {
4064 		*perm_addr = addr + vm_kernel_addrperm_ext;
4065 	} else {
4066 		*perm_addr = addr;
4067 	}
4068 }
4069 
4070 void
4071 vm_kernel_unslide_or_perm_external(
4072 	vm_offset_t addr,
4073 	vm_offset_t *up_addr)
4074 {
4075 	vm_kernel_addrperm_external(addr, up_addr);
4076 }
4077 
4078 void
4079 vm_packing_pointer_invalid(vm_offset_t ptr, vm_packing_params_t params)
4080 {
4081 	if (ptr & ((1ul << params.vmpp_shift) - 1)) {
4082 		panic("pointer %p can't be packed: low %d bits aren't 0",
4083 		    (void *)ptr, params.vmpp_shift);
4084 	} else if (ptr <= params.vmpp_base) {
4085 		panic("pointer %p can't be packed: below base %p",
4086 		    (void *)ptr, (void *)params.vmpp_base);
4087 	} else {
4088 		panic("pointer %p can't be packed: maximum encodable pointer is %p",
4089 		    (void *)ptr, (void *)vm_packing_max_packable(params));
4090 	}
4091 }
4092 
4093 void
4094 vm_packing_verify_range(
4095 	const char *subsystem,
4096 	vm_offset_t min_address,
4097 	vm_offset_t max_address,
4098 	vm_packing_params_t params)
4099 {
4100 	if (min_address > max_address) {
4101 		panic("%s: %s range invalid min:%p > max:%p",
4102 		    __func__, subsystem, (void *)min_address, (void *)max_address);
4103 	}
4104 
4105 	if (!params.vmpp_base_relative) {
4106 		return;
4107 	}
4108 
4109 	if (min_address <= params.vmpp_base) {
4110 		panic("%s: %s range invalid min:%p <= base:%p",
4111 		    __func__, subsystem, (void *)min_address, (void *)params.vmpp_base);
4112 	}
4113 
4114 	if (max_address > vm_packing_max_packable(params)) {
4115 		panic("%s: %s range invalid max:%p >= max packable:%p",
4116 		    __func__, subsystem, (void *)max_address,
4117 		    (void *)vm_packing_max_packable(params));
4118 	}
4119 }
4120 
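/*
 * Editor's note (worked arithmetic, not part of the original source): with a
 * base-relative packing scheme, a pointer is representable only if it is
 * aligned to (1 << vmpp_shift), lies strictly above vmpp_base, and the shifted
 * offset fits in the packed field; the three panics in
 * vm_packing_pointer_invalid() above correspond to those three conditions.
 * For illustration, packing and unpacking look roughly like:
 *
 *     packed   = (ptr - vmpp_base) >> vmpp_shift;
 *     restored = vmpp_base + ((vm_offset_t)packed << vmpp_shift);
 *
 * and vm_packing_max_packable() is the largest ptr whose packed form still
 * fits in the available bits.
 */
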
4121 #pragma mark tests
4122 #if DEBUG || DEVELOPMENT
4123 #include <sys/errno.h>
4124 
4125 static void
4126 kmem_test_for_entry(
4127 	vm_map_t                map,
4128 	vm_offset_t             addr,
4129 	void                  (^block)(vm_map_entry_t))
4130 {
4131 	vm_map_entry_t entry;
4132 
4133 	vm_map_lock(map);
4134 	block(vm_map_lookup_entry(map, addr, &entry) ? entry : NULL);
4135 	vm_map_unlock(map);
4136 }
4137 
4138 #define kmem_test_assert_map(map, pg, entries) ({ \
4139 	assert3u((map)->size, ==, ptoa(pg)); \
4140 	assert3u((map)->hdr.nentries, ==, entries); \
4141 })
4142 
4143 static bool
4144 can_write_at(vm_offset_t offs, uint32_t page)
4145 {
4146 	static const int zero;
4147 
4148 	return verify_write(&zero, (void *)(offs + ptoa(page) + 128), 1) == 0;
4149 }
4150 #define assert_writeable(offs, page) \
4151 	assertf(can_write_at(offs, page), \
4152 	    "can write at %p + ptoa(%d)", (void *)offs, page)
4153 
4154 #define assert_faults(offs, page) \
4155 	assertf(!can_write_at(offs, page), \
4156 	    "can write at %p + ptoa(%d)", (void *)offs, page)
4157 
4158 #define peek(offs, page) \
4159 	(*(uint32_t *)((offs) + ptoa(page)))
4160 
4161 #define poke(offs, page, v) \
4162 	(*(uint32_t *)((offs) + ptoa(page)) = (v))
4163 
4164 __attribute__((noinline))
4165 static void
4166 kmem_alloc_basic_test(vm_map_t map)
4167 {
4168 	kmem_guard_t guard = {
4169 		.kmg_tag = VM_KERN_MEMORY_DIAG,
4170 	};
4171 	vm_offset_t addr;
4172 
4173 	/*
4174 	 * Test wired basics:
4175 	 * - KMA_KOBJECT
4176 	 * - KMA_GUARD_FIRST, KMA_GUARD_LAST
4177 	 * - allocation alignment
4178 	 */
4179 	addr = kmem_alloc_guard(map, ptoa(10), ptoa(2) - 1,
4180 	    KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard).kmr_address;
4181 	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | GF | GL)", map);
4182 	assert3u((addr + PAGE_SIZE) % ptoa(2), ==, 0);
4183 	kmem_test_assert_map(map, 10, 1);
4184 
4185 	kmem_test_for_entry(map, addr, ^(vm_map_entry_t e){
4186 		assertf(e, "unable to find address %p in map %p", (void *)addr, map);
4187 		assert(e->vme_kernel_object);
4188 		assert(!e->vme_atomic);
4189 		assert3u(e->vme_start, <=, addr);
4190 		assert3u(addr + ptoa(10), <=, e->vme_end);
4191 	});
4192 
4193 	assert_faults(addr, 0);
4194 	for (int i = 1; i < 9; i++) {
4195 		assert_writeable(addr, i);
4196 	}
4197 	assert_faults(addr, 9);
4198 
4199 	kmem_free(map, addr, ptoa(10));
4200 	kmem_test_assert_map(map, 0, 0);
4201 
4202 	/*
4203 	 * Test pageable basics.
4204 	 */
4205 	addr = kmem_alloc_guard(map, ptoa(10), 0,
4206 	    KMA_PAGEABLE, guard).kmr_address;
4207 	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | PG)", map);
4208 	kmem_test_assert_map(map, 10, 1);
4209 
4210 	for (int i = 0; i < 9; i++) {
4211 		assert_faults(addr, i);
4212 		poke(addr, i, 42);
4213 		assert_writeable(addr, i);
4214 	}
4215 
4216 	kmem_free(map, addr, ptoa(10));
4217 	kmem_test_assert_map(map, 0, 0);
4218 }
4219 
4220 __attribute__((noinline))
4221 static void
4222 kmem_realloc_basic_test(vm_map_t map, kmr_flags_t kind)
4223 {
4224 	kmem_guard_t guard = {
4225 		.kmg_atomic  = !(kind & KMR_DATA),
4226 		.kmg_tag     = VM_KERN_MEMORY_DIAG,
4227 		.kmg_context = 0xefface,
4228 	};
4229 	vm_offset_t addr, newaddr;
4230 	const int N = 10;
4231 
4232 	/*
4233 	 *	This isn't something kmem_realloc_guard() _needs_ to do;
4234 	 *	we could conceive of an implementation where it grows in place
4235 	 *	if there's space after it.
4236 	 *
4237 	 *	However, this is what the implementation does today.
4238 	 */
4239 	bool realloc_growth_changes_address = true;
4240 	bool GL = (kind & KMR_GUARD_LAST);
4241 
4242 	/*
4243 	 *	Initial N page allocation
4244 	 */
4245 	addr = kmem_alloc_guard(map, ptoa(N), 0,
4246 	    (kind & (KMA_KOBJECT | KMA_GUARD_LAST)) | KMA_ZERO,
4247 	    guard).kmr_address;
4248 	assert3u(addr, !=, 0);
4249 	kmem_test_assert_map(map, N, 1);
4250 	for (int pg = 0; pg < N - GL; pg++) {
4251 		poke(addr, pg, 42 + pg);
4252 	}
4253 	for (int pg = N - GL; pg < N; pg++) {
4254 		assert_faults(addr, pg);
4255 	}
4256 
4257 
4258 	/*
4259 	 *	Grow to N + 3 pages
4260 	 */
4261 	newaddr = kmem_realloc_guard(map, addr, ptoa(N), ptoa(N + 3),
4262 	    kind | KMR_ZERO, guard).kmr_address;
4263 	assert3u(newaddr, !=, 0);
4264 	if (realloc_growth_changes_address) {
4265 		assert3u(addr, !=, newaddr);
4266 	}
4267 	if ((kind & KMR_FREEOLD) || (addr == newaddr)) {
4268 		kmem_test_assert_map(map, N + 3, 1);
4269 	} else {
4270 		kmem_test_assert_map(map, 2 * N + 3, 2);
4271 	}
4272 	for (int pg = 0; pg < N - GL; pg++) {
4273 		assert3u(peek(newaddr, pg), ==, 42 + pg);
4274 	}
4275 	if ((kind & KMR_FREEOLD) == 0) {
4276 		for (int pg = 0; pg < N - GL; pg++) {
4277 			assert3u(peek(addr, pg), ==, 42 + pg);
4278 		}
4279 		/* check for tru-share */
4280 		poke(addr + 16, 0, 1234);
4281 		assert3u(peek(newaddr + 16, 0), ==, 1234);
4282 		kmem_free_guard(map, addr, ptoa(N), KMF_NONE, guard);
4283 		kmem_test_assert_map(map, N + 3, 1);
4284 	}
4285 	if (addr != newaddr) {
4286 		for (int pg = 0; pg < N - GL; pg++) {
4287 			assert_faults(addr, pg);
4288 		}
4289 	}
4290 	for (int pg = N - GL; pg < N + 3 - GL; pg++) {
4291 		assert3u(peek(newaddr, pg), ==, 0);
4292 	}
4293 	for (int pg = N + 3 - GL; pg < N + 3; pg++) {
4294 		assert_faults(newaddr, pg);
4295 	}
4296 	addr = newaddr;
4297 
4298 
4299 	/*
4300 	 *	Shrink to N - 2 pages
4301 	 */
4302 	newaddr = kmem_realloc_guard(map, addr, ptoa(N + 3), ptoa(N - 2),
4303 	    kind | KMR_ZERO, guard).kmr_address;
4304 	assert3u(map->size, ==, ptoa(N - 2));
4305 	assert3u(newaddr, ==, addr);
4306 	kmem_test_assert_map(map, N - 2, 1);
4307 
4308 	for (int pg = 0; pg < N - 2 - GL; pg++) {
4309 		assert3u(peek(addr, pg), ==, 42 + pg);
4310 	}
4311 	for (int pg = N - 2 - GL; pg < N + 3; pg++) {
4312 		assert_faults(addr, pg);
4313 	}
4314 
4315 	kmem_free_guard(map, addr, ptoa(N - 2), KMF_NONE, guard);
4316 	kmem_test_assert_map(map, 0, 0);
4317 }
4318 
4319 static int
4320 kmem_basic_test(__unused int64_t in, int64_t *out)
4321 {
4322 	mach_vm_offset_t addr;
4323 	vm_map_t map;
4324 
4325 	printf("%s: test running\n", __func__);
4326 
4327 	map = kmem_suballoc(kernel_map, &addr, 64U << 20,
4328 	        VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
4329 	        KMS_NOFAIL | KMS_DATA, VM_KERN_MEMORY_DIAG).kmr_submap;
4330 
4331 	printf("%s: kmem_alloc ...\n", __func__);
4332 	kmem_alloc_basic_test(map);
4333 	printf("%s:     PASS\n", __func__);
4334 
4335 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
4336 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD);
4337 	printf("%s:     PASS\n", __func__);
4338 
4339 	printf("%s: kmem_realloc (KMR_FREEOLD) ...\n", __func__);
4340 	kmem_realloc_basic_test(map, KMR_FREEOLD);
4341 	printf("%s:     PASS\n", __func__);
4342 
4343 	printf("%s: kmem_realloc (KMR_NONE) ...\n", __func__);
4344 	kmem_realloc_basic_test(map, KMR_NONE);
4345 	printf("%s:     PASS\n", __func__);
4346 
4347 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
4348 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
4349 	printf("%s:     PASS\n", __func__);
4350 
4351 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
4352 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_LAST);
4353 	printf("%s:     PASS\n", __func__);
4354 
4355 	/* using KMR_DATA signals to test the non atomic realloc path */
4356 	printf("%s: kmem_realloc (KMR_DATA | KMR_FREEOLD) ...\n", __func__);
4357 	kmem_realloc_basic_test(map, KMR_DATA | KMR_FREEOLD);
4358 	printf("%s:     PASS\n", __func__);
4359 
4360 	printf("%s: kmem_realloc (KMR_DATA) ...\n", __func__);
4361 	kmem_realloc_basic_test(map, KMR_DATA);
4362 	printf("%s:     PASS\n", __func__);
4363 
4364 	kmem_free_guard(kernel_map, addr, 64U << 20, KMF_NONE, KMEM_GUARD_SUBMAP);
4365 	vm_map_deallocate(map);
4366 
4367 	printf("%s: test passed\n", __func__);
4368 	*out = 1;
4369 	return 0;
4370 }
4371 SYSCTL_TEST_REGISTER(kmem_basic, kmem_basic_test);
4372 
4373 static void
4374 kmem_test_get_size_idx_for_chunks(uint32_t chunks)
4375 {
4376 	uint32_t idx = kmem_get_size_idx_for_chunks(chunks);
4377 
4378 	assert(chunks >= kmem_size_array[idx].ks_num_chunk);
4379 }
4380 
4381 __attribute__((noinline))
4382 static void
4383 kmem_test_get_size_idx_for_all_chunks()
4384 {
4385 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
4386 		uint32_t chunks = kmem_size_array[i].ks_num_chunk;
4387 
4388 		if (chunks != 1) {
4389 			kmem_test_get_size_idx_for_chunks(chunks - 1);
4390 		}
4391 		kmem_test_get_size_idx_for_chunks(chunks);
4392 		kmem_test_get_size_idx_for_chunks(chunks + 1);
4393 	}
4394 }
4395 
4396 static int
4397 kmem_guard_obj_test(__unused int64_t in, int64_t *out)
4398 {
4399 	printf("%s: test running\n", __func__);
4400 
4401 	printf("%s: kmem_get_size_idx_for_chunks\n", __func__);
4402 	kmem_test_get_size_idx_for_all_chunks();
4403 	printf("%s:     PASS\n", __func__);
4404 
4405 	printf("%s: test passed\n", __func__);
4406 	*out = 1;
4407 	return 0;
4408 }
4409 SYSCTL_TEST_REGISTER(kmem_guard_obj, kmem_guard_obj_test);
4410 #endif /* DEBUG || DEVELOPMENT */
4411