xref: /xnu-12377.61.12/osfmk/vm/vm_kern.c (revision 4d495c6e23c53686cf65f45067f79024cf5dcee8)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_kern.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Kernel memory management.
64  */
65 
66 #include <mach/kern_return.h>
67 #include <mach/vm_param.h>
68 #include <kern/assert.h>
69 #include <kern/thread.h>
70 #include <vm/vm_kern_internal.h>
71 #include <vm/vm_map_internal.h>
72 #include <vm/vm_object_internal.h>
73 #include <vm/vm_page_internal.h>
74 #include <vm/vm_compressor_xnu.h>
75 #include <vm/vm_pageout_xnu.h>
76 #include <vm/vm_init_xnu.h>
77 #include <vm/vm_fault.h>
78 #include <vm/vm_memtag.h>
79 #if HAS_MTE
80 #include <vm/vm_mteinfo_internal.h>
81 #endif /* HAS_MTE */
82 #include <vm/vm_far.h>
83 #include <kern/misc_protos.h>
84 #include <vm/cpm_internal.h>
85 #include <kern/ledger.h>
86 #include <kern/bits.h>
87 #include <kern/startup.h>
88 #include <kern/telemetry.h>
89 
90 #include <string.h>
91 
92 #include <libkern/OSDebug.h>
93 #include <libkern/crypto/sha2.h>
94 #include <libkern/section_keywords.h>
95 #include <sys/kdebug.h>
96 #include <sys/kdebug_triage.h>
97 
98 #include <san/kasan.h>
99 #include <kern/kext_alloc.h>
100 #include <kern/backtrace.h>
101 #include <os/hash.h>
102 #include <kern/zalloc_internal.h>
103 #include <libkern/crypto/rand.h>
104 
105 /*
106  *	Variables exported by this module.
107  */
108 
109 SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map;
110 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_ranges[KMEM_RANGE_COUNT];
111 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_large_ranges[KMEM_RANGE_COUNT];
112 
113 static TUNABLE(uint32_t, kmem_ptr_ranges, "kmem_ptr_ranges",
114     KMEM_RANGE_ID_NUM_PTR);
115 #define KMEM_GOBJ_THRESHOLD   (32ULL << 20)
116 #if DEBUG || DEVELOPMENT
117 #define KMEM_OUTLIER_LOG_SIZE (16ULL << 10)
118 #define KMEM_OUTLIER_SIZE      0
119 #define KMEM_OUTLIER_ALIGN     1
120 btlog_t kmem_outlier_log;
121 #endif /* DEBUG || DEVELOPMENT */
122 
123 __startup_data static vm_map_size_t data_range_size;
124 __startup_data static vm_map_size_t shared_data_range_size;
125 __startup_data static vm_map_size_t ptr_range_size;
126 __startup_data static vm_map_size_t sprayqtn_range_size;
127 
128 #pragma mark helpers
129 
130 __attribute__((overloadable))
131 __header_always_inline kmem_flags_t
ANYF(kma_flags_t flags)132 ANYF(kma_flags_t flags)
133 {
134 	return (kmem_flags_t)flags;
135 }
136 
137 __attribute__((overloadable))
138 __header_always_inline kmem_flags_t
ANYF(kmr_flags_t flags)139 ANYF(kmr_flags_t flags)
140 {
141 	return (kmem_flags_t)flags;
142 }
143 
144 __attribute__((overloadable))
145 __header_always_inline kmem_flags_t
ANYF(kmf_flags_t flags)146 ANYF(kmf_flags_t flags)
147 {
148 	return (kmem_flags_t)flags;
149 }
150 
151 __abortlike
152 static void
__kmem_invalid_size_panic(vm_map_t map,vm_size_t size,uint32_t flags)153 __kmem_invalid_size_panic(
154 	vm_map_t        map,
155 	vm_size_t       size,
156 	uint32_t        flags)
157 {
158 	panic("kmem(map=%p, flags=0x%x): invalid size %zd",
159 	    map, flags, (size_t)size);
160 }
161 
162 __abortlike
163 static void
__kmem_invalid_arguments_panic(const char * what,vm_map_t map,vm_address_t address,vm_size_t size,uint32_t flags)164 __kmem_invalid_arguments_panic(
165 	const char     *what,
166 	vm_map_t        map,
167 	vm_address_t    address,
168 	vm_size_t       size,
169 	uint32_t        flags)
170 {
171 	panic("kmem_%s(map=%p, addr=%p, size=%zd, flags=0x%x): "
172 	    "invalid arguments passed",
173 	    what, map, (void *)address, (size_t)size, flags);
174 }
175 
176 __abortlike
177 static void
__kmem_failed_panic(vm_map_t map,vm_size_t size,uint32_t flags,kern_return_t kr,const char * what)178 __kmem_failed_panic(
179 	vm_map_t        map,
180 	vm_size_t       size,
181 	uint32_t        flags,
182 	kern_return_t   kr,
183 	const char     *what)
184 {
185 	panic("kmem_%s(%p, %zd, 0x%x): failed with %d",
186 	    what, map, (size_t)size, flags, kr);
187 }
188 
189 __abortlike
190 static void
__kmem_entry_not_found_panic(vm_map_t map,vm_offset_t addr)191 __kmem_entry_not_found_panic(
192 	vm_map_t        map,
193 	vm_offset_t     addr)
194 {
195 	panic("kmem(map=%p) no entry found at %p", map, (void *)addr);
196 }
197 
198 static inline vm_object_t
__kmem_object(kmem_flags_t flags)199 __kmem_object(kmem_flags_t flags)
200 {
201 	if (flags & KMEM_COMPRESSOR) {
202 		if (flags & KMEM_KOBJECT) {
203 			panic("both KMEM_KOBJECT and KMEM_COMPRESSOR specified");
204 		}
205 		return compressor_object;
206 	}
207 	if (!(flags & KMEM_KOBJECT)) {
208 		panic("KMEM_KOBJECT or KMEM_COMPRESSOR is required");
209 	}
210 #if HAS_MTE
211 	if (flags & KMEM_TAG) {
212 		return kernel_object_tagged;
213 	}
214 #endif /* HAS_MTE */
215 	return kernel_object_default;
216 }
217 
218 static inline pmap_mapping_type_t
__kmem_mapping_type(kmem_flags_t flags)219 __kmem_mapping_type(kmem_flags_t flags)
220 {
221 	if (flags & (KMEM_COMPRESSOR | KMEM_DATA_SHARED)) {
222 		return PMAP_MAPPING_TYPE_DEFAULT;
223 	} else if (flags & KMEM_DATA) {
224 		return kalloc_is_restricted_data_mode_enforced() ?
225 		       PMAP_MAPPING_TYPE_RESTRICTED : PMAP_MAPPING_TYPE_DEFAULT;
226 	} else {
227 		return PMAP_MAPPING_TYPE_RESTRICTED;
228 	}
229 }
230 
231 static inline vm_size_t
__kmem_guard_left(kmem_flags_t flags)232 __kmem_guard_left(kmem_flags_t flags)
233 {
234 	return (flags & KMEM_GUARD_FIRST) ? PAGE_SIZE : 0;
235 }
236 
237 static inline vm_size_t
__kmem_guard_right(kmem_flags_t flags)238 __kmem_guard_right(kmem_flags_t flags)
239 {
240 	return (flags & KMEM_GUARD_LAST) ? PAGE_SIZE : 0;
241 }
242 
243 static inline vm_size_t
__kmem_guard_size(kmem_flags_t flags)244 __kmem_guard_size(kmem_flags_t flags)
245 {
246 	return __kmem_guard_left(flags) + __kmem_guard_right(flags);
247 }
248 
249 __pure2
250 static inline vm_size_t
__kmem_entry_orig_size(vm_map_entry_t entry)251 __kmem_entry_orig_size(vm_map_entry_t entry)
252 {
253 	vm_object_t object = VME_OBJECT(entry);
254 
255 	if (entry->vme_kernel_object) {
256 		return entry->vme_end - entry->vme_start -
257 		       entry->vme_object_or_delta;
258 	} else {
259 		return object->vo_size - object->vo_size_delta;
260 	}
261 }
262 
263 
264 #pragma mark kmem range methods
265 
266 #define mach_vm_range_load(r, rmin, rmax) \
267 	({ (rmin) = (r)->min_address; (rmax) = (r)->max_address; })
268 
269 __abortlike
270 static void
__mach_vm_range_overflow(mach_vm_offset_t addr,mach_vm_offset_t size)271 __mach_vm_range_overflow(
272 	mach_vm_offset_t        addr,
273 	mach_vm_offset_t        size)
274 {
275 	panic("invalid vm range: [0x%llx, 0x%llx + 0x%llx) wraps around",
276 	    addr, addr, size);
277 }
278 
279 __abortlike
280 static void
__mach_vm_range_invalid(mach_vm_offset_t min_address,mach_vm_offset_t max_address)281 __mach_vm_range_invalid(
282 	mach_vm_offset_t        min_address,
283 	mach_vm_offset_t        max_address)
284 {
285 	panic("invalid vm range: [0x%llx, 0x%llx) wraps around",
286 	    min_address, max_address);
287 }
288 
289 __header_always_inline mach_vm_size_t
mach_vm_range_size(const struct mach_vm_range * r)290 mach_vm_range_size(const struct mach_vm_range *r)
291 {
292 	mach_vm_offset_t rmin, rmax;
293 
294 	mach_vm_range_load(r, rmin, rmax);
295 	return rmax - rmin;
296 }
297 
298 __attribute__((overloadable))
299 __header_always_inline bool
mach_vm_range_contains(const struct mach_vm_range * r,mach_vm_offset_t addr)300 mach_vm_range_contains(const struct mach_vm_range *r, mach_vm_offset_t addr)
301 {
302 	mach_vm_offset_t rmin, rmax;
303 	/*
304 	 * The `&` is not a typo: we really expect the check to pass,
305 	 * so encourage the compiler to eagerly load and test without branches
306 	 */
307 	mach_vm_range_load(r, rmin, rmax);
308 	return (addr >= rmin) & (addr < rmax);
309 }
310 
311 __attribute__((overloadable))
312 __header_always_inline bool
mach_vm_range_contains(const struct mach_vm_range * r,mach_vm_offset_t addr,mach_vm_offset_t size)313 mach_vm_range_contains(
314 	const struct mach_vm_range *r,
315 	mach_vm_offset_t        addr,
316 	mach_vm_offset_t        size)
317 {
318 	mach_vm_offset_t rmin, rmax;
319 	mach_vm_offset_t end;
320 
321 	if (__improbable(os_add_overflow(addr, size, &end))) {
322 		return false;
323 	}
324 
325 	/*
326 	 *	 The `&` is not a typo: we really expect the check to pass,
327 	 *   so encourage the compiler to eagerly load and test without branches
328 	 */
329 	mach_vm_range_load(r, rmin, rmax);
330 	return (addr >= rmin) & (end >= rmin) & (end <= rmax);
331 }
332 
333 __attribute__((overloadable))
334 __header_always_inline bool
mach_vm_range_intersects(const struct mach_vm_range * r1,const struct mach_vm_range * r2)335 mach_vm_range_intersects(
336 	const struct mach_vm_range *r1,
337 	const struct mach_vm_range *r2)
338 {
339 	mach_vm_offset_t r1_min, r1_max;
340 	mach_vm_offset_t r2_min, r2_max;
341 
342 	mach_vm_range_load(r1, r1_min, r1_max);
343 	r2_min = r2->min_address;
344 	r2_max = r2->max_address;
345 
346 	if (r1_min > r1_max) {
347 		__mach_vm_range_invalid(r1_min, r1_max);
348 	}
349 
350 	if (r2_min > r2_max) {
351 		__mach_vm_range_invalid(r2_min, r2_max);
352 	}
353 
354 	return r1_max > r2_min && r1_min < r2_max;
355 }
356 
357 __attribute__((overloadable))
358 __header_always_inline bool
mach_vm_range_intersects(const struct mach_vm_range * r1,mach_vm_offset_t addr,mach_vm_offset_t size)359 mach_vm_range_intersects(
360 	const struct mach_vm_range *r1,
361 	mach_vm_offset_t        addr,
362 	mach_vm_offset_t        size)
363 {
364 	struct mach_vm_range r2;
365 
366 	r2.min_address = addr;
367 	if (os_add_overflow(addr, size, &r2.max_address)) {
368 		__mach_vm_range_overflow(addr, size);
369 	}
370 
371 	return mach_vm_range_intersects(r1, &r2);
372 }
373 
374 bool
kmem_range_id_contains(kmem_range_id_t range_id,vm_map_offset_t addr,vm_map_size_t size)375 kmem_range_id_contains(
376 	kmem_range_id_t         range_id,
377 	vm_map_offset_t         addr,
378 	vm_map_size_t           size)
379 {
380 	return mach_vm_range_contains(&kmem_ranges[range_id], vm_memtag_canonicalize_kernel(addr), size);
381 }
382 
383 __abortlike
384 static void
kmem_range_invalid_panic(kmem_range_id_t range_id,vm_map_offset_t addr,vm_map_size_t size)385 kmem_range_invalid_panic(
386 	kmem_range_id_t         range_id,
387 	vm_map_offset_t         addr,
388 	vm_map_size_t           size)
389 {
390 	const struct mach_vm_range *r = &kmem_ranges[range_id];
391 	mach_vm_offset_t rmin, rmax;
392 
393 	mach_vm_range_load(r, rmin, rmax);
394 	if (addr + size < rmin) {
395 		panic("addr %p + size %llu overflows %p", (void *)addr, size,
396 		    (void *)(addr + size));
397 	}
398 	panic("addr %p + size %llu doesnt fit in one range (id: %u min: %p max: %p)",
399 	    (void *)addr, size, range_id, (void *)rmin, (void *)rmax);
400 }
401 
402 /*
403  * Return whether the entire allocation is contained in the given range
404  */
405 static bool
kmem_range_contains_fully(kmem_range_id_t range_id,vm_map_offset_t addr,vm_map_size_t size)406 kmem_range_contains_fully(
407 	kmem_range_id_t         range_id,
408 	vm_map_offset_t         addr,
409 	vm_map_size_t           size)
410 {
411 	const struct mach_vm_range *r = &kmem_ranges[range_id];
412 	mach_vm_offset_t rmin, rmax;
413 	bool result = false;
414 
415 	if (VM_KERNEL_ADDRESS(addr)) {
416 		addr = vm_memtag_canonicalize_kernel(addr);
417 	}
418 
419 	/*
420 	 * The `&` is not a typo: we really expect the check to pass,
421 	 * so encourage the compiler to eagerly load and test without branches
422 	 */
423 	mach_vm_range_load(r, rmin, rmax);
424 	result = (addr >= rmin) & (addr < rmax);
425 	if (__improbable(result
426 	    && ((addr + size < rmin) || (addr + size > rmax)))) {
427 		kmem_range_invalid_panic(range_id, addr, size);
428 	}
429 	return result;
430 }
431 
432 vm_map_size_t
kmem_range_id_size(kmem_range_id_t range_id)433 kmem_range_id_size(kmem_range_id_t range_id)
434 {
435 	return mach_vm_range_size(&kmem_ranges[range_id]);
436 }
437 
438 kmem_range_id_t
kmem_addr_get_range(vm_map_offset_t addr,vm_map_size_t size)439 kmem_addr_get_range(vm_map_offset_t addr, vm_map_size_t size)
440 {
441 	kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
442 
443 	for (; range_id < KMEM_RANGE_COUNT; range_id++) {
444 		if (kmem_range_contains_fully(range_id, addr, size)) {
445 			return range_id;
446 		}
447 	}
448 	return KMEM_RANGE_ID_NONE;
449 }
450 
451 bool
kmem_is_ptr_range(vm_map_range_id_t range_id)452 kmem_is_ptr_range(vm_map_range_id_t range_id)
453 {
454 	return (range_id >= KMEM_RANGE_ID_FIRST) &&
455 	       (range_id <= KMEM_RANGE_ID_NUM_PTR);
456 }
457 
458 __abortlike
459 static void
kmem_range_invalid_for_overwrite(vm_map_offset_t addr)460 kmem_range_invalid_for_overwrite(vm_map_offset_t addr)
461 {
462 	panic("Can't overwrite mappings (addr: %p) in kmem ptr ranges",
463 	    (void *)addr);
464 }
465 
466 mach_vm_range_t
kmem_validate_range_for_overwrite(vm_map_offset_t addr,vm_map_size_t size)467 kmem_validate_range_for_overwrite(
468 	vm_map_offset_t         addr,
469 	vm_map_size_t           size)
470 {
471 	vm_map_range_id_t range_id = kmem_addr_get_range(addr, size);
472 
473 	if (kmem_is_ptr_range(range_id)) {
474 		kmem_range_invalid_for_overwrite(addr);
475 	}
476 
477 	return &kmem_ranges[range_id];
478 }
479 
480 
481 #pragma mark entry parameters
482 
483 
484 __abortlike
485 static void
__kmem_entry_validate_panic(vm_map_t map,vm_map_entry_t entry,vm_offset_t addr,vm_size_t size,uint32_t flags,kmem_guard_t guard)486 __kmem_entry_validate_panic(
487 	vm_map_t        map,
488 	vm_map_entry_t  entry,
489 	vm_offset_t     addr,
490 	vm_size_t       size,
491 	uint32_t        flags,
492 	kmem_guard_t    guard)
493 {
494 	const char *what = "???";
495 
496 	if (entry->vme_atomic != guard.kmg_atomic) {
497 		what = "atomicity";
498 	} else if (entry->is_sub_map != guard.kmg_submap) {
499 		what = "objectness";
500 	} else if (addr != entry->vme_start) {
501 		what = "left bound";
502 	} else if ((flags & KMF_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
503 		what = "right bound";
504 	} else if (guard.kmg_context != entry->vme_context) {
505 		what = "guard";
506 	}
507 
508 	panic("kmem(map=%p, addr=%p, size=%zd, flags=0x%x): "
509 	    "entry:%p %s mismatch guard(0x%08x)",
510 	    map, (void *)addr, size, flags, entry,
511 	    what, guard.kmg_context);
512 }
513 
514 static bool
__kmem_entry_validate_guard(vm_map_entry_t entry,vm_offset_t addr,vm_size_t size,kmem_flags_t flags,kmem_guard_t guard)515 __kmem_entry_validate_guard(
516 	vm_map_entry_t  entry,
517 	vm_offset_t     addr,
518 	vm_size_t       size,
519 	kmem_flags_t    flags,
520 	kmem_guard_t    guard)
521 {
522 	if (entry->vme_atomic != guard.kmg_atomic) {
523 		return false;
524 	}
525 
526 	if (!guard.kmg_atomic) {
527 		return true;
528 	}
529 
530 	if (entry->is_sub_map != guard.kmg_submap) {
531 		return false;
532 	}
533 
534 	if (addr != entry->vme_start) {
535 		return false;
536 	}
537 
538 	if ((flags & KMEM_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
539 		return false;
540 	}
541 
542 	if (!guard.kmg_submap && guard.kmg_context != entry->vme_context) {
543 		return false;
544 	}
545 
546 	return true;
547 }
548 
549 void
kmem_entry_validate_guard(vm_map_t map,vm_map_entry_t entry,vm_offset_t addr,vm_size_t size,kmem_guard_t guard)550 kmem_entry_validate_guard(
551 	vm_map_t        map,
552 	vm_map_entry_t  entry,
553 	vm_offset_t     addr,
554 	vm_size_t       size,
555 	kmem_guard_t    guard)
556 {
557 	if (!__kmem_entry_validate_guard(entry, addr, size, KMEM_NONE, guard)) {
558 		__kmem_entry_validate_panic(map, entry, addr, size, KMEM_NONE, guard);
559 	}
560 }
561 
562 __abortlike
563 static void
__kmem_entry_validate_object_panic(vm_map_t map,vm_map_entry_t entry,kmem_flags_t flags)564 __kmem_entry_validate_object_panic(
565 	vm_map_t        map,
566 	vm_map_entry_t  entry,
567 	kmem_flags_t    flags)
568 {
569 	const char *what;
570 	const char *verb;
571 
572 	if (entry->is_sub_map) {
573 		panic("kmem(map=%p) entry %p is a submap", map, entry);
574 	}
575 
576 	if (flags & KMEM_KOBJECT) {
577 		what = "kernel";
578 		verb = "isn't";
579 	} else if (flags & KMEM_COMPRESSOR) {
580 		what = "compressor";
581 		verb = "isn't";
582 	} else if (entry->vme_kernel_object) {
583 		what = "kernel";
584 		verb = "is unexpectedly";
585 	} else {
586 		what = "compressor";
587 		verb = "is unexpectedly";
588 	}
589 
590 	panic("kmem(map=%p, flags=0x%x): entry %p %s for the %s object",
591 	    map, flags, entry, verb, what);
592 }
593 
594 static bool
__kmem_entry_validate_object(vm_map_entry_t entry,kmem_flags_t flags)595 __kmem_entry_validate_object(
596 	vm_map_entry_t  entry,
597 	kmem_flags_t    flags)
598 {
599 	if (entry->is_sub_map) {
600 		return false;
601 	}
602 	if ((bool)(flags & KMEM_KOBJECT) != entry->vme_kernel_object) {
603 		return false;
604 	}
605 
606 	return (bool)(flags & KMEM_COMPRESSOR) ==
607 	       (VME_OBJECT(entry) == compressor_object);
608 }
609 
610 vm_size_t
kmem_size_guard(vm_map_t map,vm_offset_t addr,kmem_guard_t guard)611 kmem_size_guard(
612 	vm_map_t        map,
613 	vm_offset_t     addr,
614 	kmem_guard_t    guard)
615 {
616 	kmem_flags_t flags = KMEM_GUESS_SIZE;
617 	vm_map_entry_t entry;
618 	vm_size_t size;
619 
620 	vmlp_api_start(KMEM_SIZE_GUARD);
621 
622 	vm_map_lock_read(map);
623 
624 #if KASAN_CLASSIC
625 	addr -= PAGE_SIZE;
626 #endif /* KASAN_CLASSIC */
627 	addr = vm_memtag_canonicalize_kernel(addr);
628 
629 	if (!vm_map_lookup_entry(map, addr, &entry)) {
630 		__kmem_entry_not_found_panic(map, addr);
631 	}
632 
633 	vmlp_range_event_entry(map, entry);
634 
635 	if (!__kmem_entry_validate_guard(entry, addr, 0, flags, guard)) {
636 		__kmem_entry_validate_panic(map, entry, addr, 0, flags, guard);
637 	}
638 
639 	size = __kmem_entry_orig_size(entry);
640 
641 	vm_map_unlock_read(map);
642 
643 	vmlp_api_end(KMEM_SIZE_GUARD, 0);
644 	return size;
645 }
646 
647 static inline uint16_t
kmem_hash_backtrace(void * fp)648 kmem_hash_backtrace(
649 	void                     *fp)
650 {
651 	uint64_t  bt_count;
652 	uintptr_t bt[8] = {};
653 
654 	struct backtrace_control ctl = {
655 		.btc_frame_addr = (uintptr_t)fp,
656 	};
657 
658 	bt_count = backtrace(bt, sizeof(bt) / sizeof(bt[0]), &ctl, NULL);
659 	return (uint16_t) os_hash_jenkins(bt, bt_count * sizeof(bt[0]));
660 }
661 
662 static_assert(KMEM_RANGE_ID_DATA_SHARED - 1 <= KMEM_RANGE_MASK,
663     "Insufficient bits to represent ptr ranges");
664 
665 kmem_range_id_t
kmem_adjust_range_id(uint32_t hash)666 kmem_adjust_range_id(
667 	uint32_t                  hash)
668 {
669 	return (kmem_range_id_t) (KMEM_RANGE_ID_PTR_0 +
670 	       (hash & KMEM_RANGE_MASK) % kmem_ptr_ranges);
671 }
672 
673 static bool
kmem_use_sprayqtn(kma_flags_t kma_flags,vm_map_size_t map_size,vm_offset_t mask)674 kmem_use_sprayqtn(
675 	kma_flags_t               kma_flags,
676 	vm_map_size_t             map_size,
677 	vm_offset_t               mask)
678 {
679 	/*
680 	 * Pointer allocations that are above the guard objects threshold or have
681 	 * leading guard pages with non standard alignment requests are redirected
682 	 * to the sprayqtn range.
683 	 */
684 #if DEBUG || DEVELOPMENT
685 	btref_get_flags_t flags = (kma_flags & KMA_NOPAGEWAIT) ?
686 	    BTREF_GET_NOWAIT : 0;
687 
688 	if ((kma_flags & KMA_SPRAYQTN) == 0) {
689 		if (map_size > KMEM_GOBJ_THRESHOLD) {
690 			btlog_record(kmem_outlier_log, (void *)map_size, KMEM_OUTLIER_SIZE,
691 			    btref_get(__builtin_frame_address(0), flags));
692 		} else if ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK)) {
693 			btlog_record(kmem_outlier_log, (void *)mask, KMEM_OUTLIER_ALIGN,
694 			    btref_get(__builtin_frame_address(0), flags));
695 		}
696 	}
697 #endif /* DEBUG || DEVELOPMENT */
698 
699 	return (kma_flags & KMA_SPRAYQTN) ||
700 	       (map_size > KMEM_GOBJ_THRESHOLD) ||
701 	       ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK));
702 }
703 
704 static void
kmem_apply_security_policy(vm_map_t map,kma_flags_t kma_flags,kmem_guard_t guard,vm_map_size_t map_size,vm_offset_t mask,vm_map_kernel_flags_t * vmk_flags,bool assert_dir __unused)705 kmem_apply_security_policy(
706 	vm_map_t                  map,
707 	kma_flags_t               kma_flags,
708 	kmem_guard_t              guard,
709 	vm_map_size_t             map_size,
710 	vm_offset_t               mask,
711 	vm_map_kernel_flags_t    *vmk_flags,
712 	bool                      assert_dir __unused)
713 {
714 	kmem_range_id_t range_id;
715 	bool from_right;
716 	uint16_t type_hash = guard.kmg_type_hash;
717 
718 	if (startup_phase < STARTUP_SUB_KMEM || map != kernel_map) {
719 		return;
720 	}
721 
722 	/*
723 	 * A non-zero type-hash must be passed by krealloc_type
724 	 */
725 #if (DEBUG || DEVELOPMENT)
726 	if (assert_dir && !(kma_flags & (KMA_DATA | KMA_DATA_SHARED))) {
727 		assert(type_hash != 0);
728 	}
729 #endif
730 
731 	if (kma_flags & (KMA_DATA | KMA_DATA_SHARED)) {
732 		/*
733 		 * Choose the specific which data range.
734 		 */
735 		if (kma_flags & KMA_DATA) {
736 			range_id  = KMEM_RANGE_ID_DATA;
737 		} else {
738 			range_id  = kmem_needs_data_share_range() ?
739 			    KMEM_RANGE_ID_DATA_SHARED : KMEM_RANGE_ID_DATA;
740 		}
741 
742 		/*
743 		 * As an optimization in KMA_DATA to avoid fragmentation,
744 		 * allocate static carveouts at the end of the DATA range.
745 		 */
746 		from_right = (bool)(kma_flags & KMA_PERMANENT);
747 	} else if (kmem_use_sprayqtn(kma_flags, map_size, mask)) {
748 		range_id = KMEM_RANGE_ID_SPRAYQTN;
749 		from_right = (bool)(kma_flags & KMA_PERMANENT);
750 	} else if (type_hash) {
751 		range_id  = (kmem_range_id_t)(type_hash & KMEM_RANGE_MASK);
752 		from_right = type_hash & KMEM_DIRECTION_MASK;
753 	} else {
754 		/*
755 		 * Range id needs to correspond to one of the PTR ranges
756 		 */
757 		type_hash = (uint16_t) kmem_hash_backtrace(__builtin_frame_address(0));
758 		range_id  = kmem_adjust_range_id(type_hash);
759 		from_right = type_hash & KMEM_DIRECTION_MASK;
760 	}
761 
762 	vmk_flags->vmkf_range_id = range_id;
763 	vmk_flags->vmkf_last_free = from_right;
764 }
765 
766 #pragma mark allocation
767 
768 static kmem_return_t
769 kmem_alloc_guard_internal(
770 	vm_map_t                map,
771 	vm_size_t               size,
772 	vm_offset_t             mask,
773 	kma_flags_t             flags,
774 	kmem_guard_t            guard,
775 	kern_return_t         (^alloc_pages)(vm_size_t, kma_flags_t, vm_page_t *))
776 {
777 	vm_object_t             object;
778 	vm_offset_t             delta = 0;
779 	vm_map_entry_t          entry = NULL;
780 	vm_map_offset_t         map_addr, fill_start;
781 	vm_map_size_t           map_size, fill_size;
782 	vm_page_t               guard_left = VM_PAGE_NULL;
783 	vm_page_t               guard_right = VM_PAGE_NULL;
784 	vm_page_t               wired_page_list = VM_PAGE_NULL;
785 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
786 	bool                    skip_guards;
787 	kmem_return_t           kmr = { };
788 
789 	vmlp_api_start(KMEM_ALLOC_GUARD_INTERNAL);
790 
791 	assert(kernel_map && map->pmap == kernel_pmap);
792 
793 	/* DATA and DATA_SHARED are mutually exclusive */
794 	assert((flags & (KMA_DATA | KMA_DATA_SHARED)) != (KMA_DATA | KMA_DATA_SHARED));
795 
796 #if defined(__arm64__)
797 	/*
798 	 * Pageable allocations should be marked as shared.
799 	 *
800 	 * Only assert this on arm64 architectures, since we do not
801 	 * adopt the shared heap on older ones.
802 	 */
803 	assert((flags & (KMA_PAGEABLE | KMA_DATA)) != (KMA_PAGEABLE | KMA_DATA));
804 #endif /* defined(__arm64__) */
805 
806 #if DEBUG || DEVELOPMENT
807 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_START,
808 	    size, 0, 0, 0);
809 #endif
810 
811 #if HAS_MTE
812 	if (__improbable(!is_mte_enabled)) {
813 		flags &= ~KMA_TAG;
814 	}
815 #endif /* HAS_MTE */
816 
817 	if (size == 0 ||
818 	    (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) ||
819 	    (size < __kmem_guard_size(ANYF(flags)))) {
820 		__kmem_invalid_size_panic(map, size, flags);
821 	}
822 
823 	/*
824 	 * limit the size of a single extent of wired memory
825 	 * to try and limit the damage to the system if
826 	 * too many pages get wired down
827 	 * limit raised to 2GB with 128GB max physical limit,
828 	 * but scaled by installed memory above this
829 	 *
830 	 * Note: kmem_alloc_contig_guard() is immune to this check.
831 	 */
832 	if (__improbable(!(flags & (KMA_VAONLY | KMA_PAGEABLE)) &&
833 	    alloc_pages == NULL &&
834 	    size > MAX(1ULL << 31, sane_size / 64))) {
835 		kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
836 		goto out_error;
837 	}
838 
839 	/*
840 	 * Guard pages:
841 	 *
842 	 * Guard pages are implemented as fictitious pages.
843 	 *
844 	 * However, some maps, and some objects are known
845 	 * to manage their memory explicitly, and do not need
846 	 * those to be materialized, which saves memory.
847 	 *
848 	 * By placing guard pages on either end of a stack,
849 	 * they can help detect cases where a thread walks
850 	 * off either end of its stack.
851 	 *
852 	 * They are allocated and set up here and attempts
853 	 * to access those pages are trapped in vm_fault_page().
854 	 *
855 	 * The map_size we were passed may include extra space for
856 	 * guard pages. fill_size represents the actual size to populate.
857 	 * Similarly, fill_start indicates where the actual pages
858 	 * will begin in the range.
859 	 */
860 
861 	map_size   = round_page(size);
862 	fill_start = 0;
863 	fill_size  = map_size - __kmem_guard_size(ANYF(flags));
864 
865 #if KASAN_CLASSIC
866 	if (flags & KMA_KASAN_GUARD) {
867 		assert((flags & (KMA_GUARD_FIRST | KMA_GUARD_LAST)) == 0);
868 		flags |= KMA_GUARD_FIRST | KMEM_GUARD_LAST;
869 		delta     = ptoa(2);
870 		map_size += delta;
871 	}
872 #else
873 	(void)delta;
874 #endif /* KASAN_CLASSIC */
875 
876 	skip_guards = (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) ||
877 	    map->never_faults;
878 
879 	if (flags & KMA_GUARD_FIRST) {
880 		vmk_flags.vmkf_guard_before = true;
881 		fill_start += PAGE_SIZE;
882 	}
883 	if (flags & KMA_NOSOFTLIMIT) {
884 		vmk_flags.vmkf_no_soft_limit = true;
885 	}
886 	if ((flags & KMA_GUARD_FIRST) && !skip_guards) {
887 		guard_left = vm_page_create_guard((flags & KMA_NOPAGEWAIT) == 0);
888 		if (__improbable(guard_left == VM_PAGE_NULL)) {
889 			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
890 			goto out_error;
891 		}
892 	}
893 	if ((flags & KMA_GUARD_LAST) && !skip_guards) {
894 		guard_right = vm_page_create_guard((flags & KMA_NOPAGEWAIT) == 0);
895 		if (__improbable(guard_right == VM_PAGE_NULL)) {
896 			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
897 			goto out_error;
898 		}
899 	}
900 
901 	if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
902 		if (alloc_pages) {
903 			kmr.kmr_return = alloc_pages(fill_size, flags,
904 			    &wired_page_list);
905 		} else {
906 			kmr.kmr_return = vm_page_alloc_list(atop(fill_size), flags,
907 			    &wired_page_list);
908 		}
909 		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
910 			goto out_error;
911 		}
912 	}
913 
914 	/*
915 	 *	Allocate a new object (if necessary).  We must do this before
916 	 *	locking the map, or risk deadlock with the default pager.
917 	 */
918 	if (flags & KMA_KOBJECT) {
919 #if HAS_MTE
920 		if (flags & KMA_TAG) {
921 			object = kernel_object_tagged;
922 			vmk_flags.vmf_mte = true;
923 		} else
924 #endif /* HAS_MTE */
925 		{
926 			object = kernel_object_default;
927 		}
928 		vm_object_reference(object);
929 	} else if (flags & KMA_COMPRESSOR) {
930 		object = compressor_object;
931 		vm_object_reference(object);
932 	} else {
933 		object = vm_object_allocate(map_size, map->serial_id);
934 		vm_object_lock(object);
935 		vm_object_set_size(object, map_size, size);
936 		/* stabilize the object to prevent shadowing */
937 		object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
938 		VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
939 #if HAS_MTE
940 		if (flags & KMA_TAG) {
941 			object->wimg_bits = VM_WIMG_MTE;
942 			object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
943 			VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
944 		}
945 #endif /* HAS_MTE */
946 		vm_object_unlock(object);
947 	}
948 
949 	if (flags & KMA_LAST_FREE) {
950 		vmk_flags.vmkf_last_free = true;
951 	}
952 	if (flags & KMA_PERMANENT) {
953 		vmk_flags.vmf_permanent = true;
954 	}
955 	kmem_apply_security_policy(map, flags, guard, map_size, mask, &vmk_flags,
956 	    false);
957 
958 	kmr.kmr_return = vm_map_find_space(map, 0, map_size, mask,
959 	    vmk_flags, &entry);
960 	if (__improbable(KERN_SUCCESS != kmr.kmr_return)) {
961 		vm_object_deallocate(object);
962 		goto out_error;
963 	}
964 
965 	vmlp_range_event_entry(map, entry);
966 
967 	map_addr = entry->vme_start;
968 	VME_OBJECT_SET(entry, object, guard.kmg_atomic, guard.kmg_context);
969 	VME_ALIAS_SET(entry, guard.kmg_tag);
970 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
971 		VME_OFFSET_SET(entry, map_addr);
972 	}
973 
974 #if KASAN
975 	if ((flags & KMA_KOBJECT) && guard.kmg_atomic) {
976 		entry->vme_object_or_delta = (-size & PAGE_MASK) + delta;
977 	}
978 #endif /* KASAN */
979 
980 	if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE))) {
981 		entry->wired_count = 1;
982 		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
983 	}
984 
985 	if (guard_left || guard_right || wired_page_list) {
986 		vm_object_offset_t offset = 0ull;
987 
988 		vm_object_lock(object);
989 		vm_map_unlock(map);
990 
991 		if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
992 			offset = map_addr;
993 		}
994 
995 		if (guard_left) {
996 			vm_page_insert(guard_left, object, offset);
997 			guard_left->vmp_busy = FALSE;
998 			guard_left = VM_PAGE_NULL;
999 		}
1000 
1001 		if (guard_right) {
1002 			vm_page_insert(guard_right, object,
1003 			    offset + fill_start + fill_size);
1004 			guard_right->vmp_busy = FALSE;
1005 			guard_right = VM_PAGE_NULL;
1006 		}
1007 
1008 		if (wired_page_list) {
1009 			kernel_memory_populate_object_and_unlock(object,
1010 			    map_addr + fill_start, offset + fill_start, fill_size,
1011 			    wired_page_list, flags, guard.kmg_tag, VM_PROT_DEFAULT,
1012 			    __kmem_mapping_type(ANYF(flags)));
1013 		} else {
1014 			vm_object_unlock(object);
1015 		}
1016 	} else {
1017 		vm_map_unlock(map);
1018 	}
1019 
1020 	/*
1021 	 * now that the pages are wired, we no longer have to fear coalesce
1022 	 */
1023 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
1024 		vm_map_simplify(map, map_addr);
1025 	}
1026 
1027 #if DEBUG || DEVELOPMENT
1028 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
1029 	    atop(fill_size), 0, 0, 0);
1030 #endif /* DEBUG || DEVELOPMENT */
1031 	kmr.kmr_address = CAST_DOWN(vm_offset_t, map_addr);
1032 
1033 #if KASAN
1034 	if (flags & (KMA_KASAN_GUARD | KMA_PAGEABLE)) {
1035 		/*
1036 		 * We need to allow the range for pageable memory,
1037 		 * or faulting will not be allowed.
1038 		 */
1039 		kasan_notify_address(map_addr, map_size);
1040 	}
1041 #endif /* KASAN */
1042 #if KASAN_CLASSIC
1043 	if (flags & KMA_KASAN_GUARD) {
1044 		kmr.kmr_address += PAGE_SIZE;
1045 		kasan_alloc_large(kmr.kmr_address, size);
1046 	}
1047 #endif /* KASAN_CLASSIC */
1048 #if CONFIG_KERNEL_TAGGING
1049 	if (!(flags & KMA_VAONLY) && (flags & KMA_TAG)) {
1050 		kmr.kmr_ptr = vm_memtag_generate_and_store_tag((caddr_t)kmr.kmr_address + fill_start, fill_size);
1051 		kmr.kmr_ptr = (caddr_t)kmr.kmr_ptr - fill_start;
1052 #if KASAN_TBI
1053 		kasan_tbi_retag_unused_space(kmr.kmr_ptr, map_size, size);
1054 #endif /* KASAN_TBI */
1055 	}
1056 #endif /* CONFIG_KERNEL_TAGGING */
1057 	vmlp_api_end(KMEM_ALLOC_GUARD_INTERNAL, kmr.kmr_return);
1058 	return kmr;
1059 
1060 out_error:
1061 	if (flags & KMA_NOFAIL) {
1062 		__kmem_failed_panic(map, size, flags, kmr.kmr_return, "alloc");
1063 	}
1064 	if (guard_left) {
1065 		guard_left->vmp_snext = wired_page_list;
1066 		wired_page_list = guard_left;
1067 	}
1068 	if (guard_right) {
1069 		guard_right->vmp_snext = wired_page_list;
1070 		wired_page_list = guard_right;
1071 	}
1072 	if (wired_page_list) {
1073 		vm_page_free_list(wired_page_list, FALSE);
1074 	}
1075 
1076 #if DEBUG || DEVELOPMENT
1077 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
1078 	    0, 0, 0, 0);
1079 #endif /* DEBUG || DEVELOPMENT */
1080 
1081 	vmlp_api_end(KMEM_ALLOC_GUARD_INTERNAL, kmr.kmr_return);
1082 	return kmr;
1083 }
1084 
1085 __mockable kmem_return_t
kmem_alloc_guard(vm_map_t map,vm_size_t size,vm_offset_t mask,kma_flags_t flags,kmem_guard_t guard)1086 kmem_alloc_guard(
1087 	vm_map_t        map,
1088 	vm_size_t       size,
1089 	vm_offset_t     mask,
1090 	kma_flags_t     flags,
1091 	kmem_guard_t    guard)
1092 {
1093 	return kmem_alloc_guard_internal(map, size, mask, flags, guard, NULL);
1094 }
1095 
1096 kmem_return_t
kmem_alloc_contig_guard(vm_map_t map,vm_size_t size,vm_offset_t mask,ppnum_t max_pnum,ppnum_t pnum_mask,kma_flags_t flags,kmem_guard_t guard)1097 kmem_alloc_contig_guard(
1098 	vm_map_t                map,
1099 	vm_size_t               size,
1100 	vm_offset_t             mask,
1101 	ppnum_t                 max_pnum,
1102 	ppnum_t                 pnum_mask,
1103 	kma_flags_t             flags,
1104 	kmem_guard_t            guard)
1105 {
1106 	__auto_type alloc_pages = ^(vm_size_t fill_size, kma_flags_t kma_flags, vm_page_t *pages) {
1107 		return cpm_allocate(fill_size, pages, max_pnum, pnum_mask, FALSE, kma_flags);
1108 	};
1109 
1110 	return kmem_alloc_guard_internal(map, size, mask, flags, guard, alloc_pages);
1111 }
1112 
1113 kmem_return_t
kmem_suballoc(vm_map_t parent,mach_vm_offset_t * addr,vm_size_t size,vm_map_create_options_t vmc_options,int vm_flags,kms_flags_t flags,vm_tag_t tag)1114 kmem_suballoc(
1115 	vm_map_t                parent,
1116 	mach_vm_offset_t       *addr,
1117 	vm_size_t               size,
1118 	vm_map_create_options_t vmc_options,
1119 	int                     vm_flags,
1120 	kms_flags_t             flags,
1121 	vm_tag_t                tag)
1122 {
1123 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1124 	vm_map_offset_t map_addr = 0;
1125 	kmem_return_t kmr = { };
1126 	vm_map_t map;
1127 
1128 	assert(page_aligned(size));
1129 	assert(parent->pmap == kernel_pmap);
1130 
1131 	vm_map_kernel_flags_set_vmflags(&vmk_flags, vm_flags, tag);
1132 
1133 	if (parent == kernel_map) {
1134 		assert(vmk_flags.vmf_overwrite || (flags & (KMS_DATA | KMS_DATA_SHARED)));
1135 	}
1136 
1137 	if (vmk_flags.vmf_fixed) {
1138 		map_addr = trunc_page(*addr);
1139 	}
1140 
1141 	pmap_reference(vm_map_pmap(parent));
1142 	map = vm_map_create_options(vm_map_pmap(parent), 0, size, vmc_options);
1143 
1144 	/*
1145 	 * 1. vm_map_enter() will consume one ref on success.
1146 	 *
1147 	 * 2. make the entry atomic as kernel submaps should never be split.
1148 	 *
1149 	 * 3. instruct vm_map_enter() that it is a fresh submap
1150 	 *    that needs to be taught its bounds as it inserted.
1151 	 */
1152 	vm_map_reference(map);
1153 
1154 	vmk_flags.vmkf_submap = true;
1155 	if ((flags & (KMS_DATA | KMS_DATA_SHARED)) == 0) {
1156 		/* FIXME: IOKit submaps get fragmented and can't be atomic */
1157 		vmk_flags.vmkf_submap_atomic = true;
1158 	}
1159 	vmk_flags.vmkf_submap_adjust = true;
1160 	if (flags & KMS_LAST_FREE) {
1161 		vmk_flags.vmkf_last_free = true;
1162 	}
1163 	if (flags & KMS_PERMANENT) {
1164 		vmk_flags.vmf_permanent = true;
1165 	}
1166 	if (flags & (KMS_DATA | KMS_DATA_SHARED)) {
1167 		if (flags & KMS_DATA) {
1168 			vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
1169 		} else {
1170 			vmk_flags.vmkf_range_id = kmem_needs_data_share_range() ?
1171 			    KMEM_RANGE_ID_DATA_SHARED : KMEM_RANGE_ID_DATA;
1172 		}
1173 	}
1174 	if (flags & KMS_NOSOFTLIMIT) {
1175 		vmk_flags.vmkf_no_soft_limit = true;
1176 	}
1177 
1178 	kmr.kmr_return = vm_map_enter(parent, &map_addr, size, 0,
1179 	    vmk_flags, (vm_object_t)map, 0, FALSE,
1180 	    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
1181 
1182 	if (kmr.kmr_return != KERN_SUCCESS) {
1183 		if (flags & KMS_NOFAIL) {
1184 			panic("kmem_suballoc(map=%p, size=%zd) failed with %d",
1185 			    parent, size, kmr.kmr_return);
1186 		}
1187 		assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1188 		vm_map_deallocate(map);
1189 		vm_map_deallocate(map); /* also removes ref to pmap */
1190 		return kmr;
1191 	}
1192 
1193 	/*
1194 	 * For kmem_suballocs that register a claim and are assigned a range, ensure
1195 	 * that the exact same range is returned.
1196 	 */
1197 	if (*addr != 0 && parent == kernel_map &&
1198 	    startup_phase > STARTUP_SUB_KMEM) {
1199 		assert(CAST_DOWN(vm_offset_t, map_addr) == *addr);
1200 	} else {
1201 		*addr = map_addr;
1202 	}
1203 
1204 	kmr.kmr_submap = map;
1205 	return kmr;
1206 }
1207 
1208 /*
1209  *	kmem_alloc:
1210  *
1211  *	Allocate wired-down memory in the kernel's address map
1212  *	or a submap.  The memory is not zero-filled.
1213  */
1214 
1215 __exported kern_return_t
1216 kmem_alloc_external(
1217 	vm_map_t        map,
1218 	vm_offset_t     *addrp,
1219 	vm_size_t       size);
1220 kern_return_t
kmem_alloc_external(vm_map_t map,vm_offset_t * addrp,vm_size_t size)1221 kmem_alloc_external(
1222 	vm_map_t        map,
1223 	vm_offset_t     *addrp,
1224 	vm_size_t       size)
1225 {
1226 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1227 		return kmem_alloc(map, addrp, size, KMA_NONE, vm_tag_bt());
1228 	}
1229 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1230 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1231 }
1232 
1233 
1234 /*
1235  *	kmem_alloc_kobject:
1236  *
1237  *	Allocate wired-down memory in the kernel's address map
1238  *	or a submap.  The memory is not zero-filled.
1239  *
1240  *	The memory is allocated in the kernel_object.
1241  *	It may not be copied with vm_map_copy, and
1242  *	it may not be reallocated with kmem_realloc.
1243  */
1244 
1245 __exported kern_return_t
1246 kmem_alloc_kobject_external(
1247 	vm_map_t        map,
1248 	vm_offset_t     *addrp,
1249 	vm_size_t       size);
1250 kern_return_t
kmem_alloc_kobject_external(vm_map_t map,vm_offset_t * addrp,vm_size_t size)1251 kmem_alloc_kobject_external(
1252 	vm_map_t        map,
1253 	vm_offset_t     *addrp,
1254 	vm_size_t       size)
1255 {
1256 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1257 		return kmem_alloc(map, addrp, size, KMA_KOBJECT, vm_tag_bt());
1258 	}
1259 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1260 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1261 }
1262 
1263 /*
1264  *	kmem_alloc_pageable:
1265  *
1266  *	Allocate pageable memory in the kernel's address map.
1267  */
1268 
1269 __exported kern_return_t
1270 kmem_alloc_pageable_external(
1271 	vm_map_t        map,
1272 	vm_offset_t     *addrp,
1273 	vm_size_t       size);
1274 kern_return_t
kmem_alloc_pageable_external(vm_map_t map,vm_offset_t * addrp,vm_size_t size)1275 kmem_alloc_pageable_external(
1276 	vm_map_t        map,
1277 	vm_offset_t     *addrp,
1278 	vm_size_t       size)
1279 {
1280 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1281 		return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA_SHARED, vm_tag_bt());
1282 	}
1283 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1284 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1285 }
1286 
1287 static __attribute__((always_inline, warn_unused_result))
1288 kern_return_t
mach_vm_allocate_kernel_sanitize(vm_map_t map,mach_vm_offset_ut addr_u,mach_vm_size_ut size_u,vm_map_kernel_flags_t vmk_flags,vm_map_offset_t * map_addr,vm_map_size_t * map_size)1289 mach_vm_allocate_kernel_sanitize(
1290 	vm_map_t                map,
1291 	mach_vm_offset_ut       addr_u,
1292 	mach_vm_size_ut         size_u,
1293 	vm_map_kernel_flags_t   vmk_flags,
1294 	vm_map_offset_t        *map_addr,
1295 	vm_map_size_t          *map_size)
1296 {
1297 	kern_return_t   result;
1298 	vm_map_offset_t map_end;
1299 
1300 	if (vmk_flags.vmf_fixed) {
1301 		result = vm_sanitize_addr_size(addr_u, size_u,
1302 		    VM_SANITIZE_CALLER_VM_ALLOCATE_FIXED,
1303 		    map,
1304 		    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS | VM_SANITIZE_FLAGS_REALIGN_START,
1305 		    map_addr, &map_end, map_size);
1306 		if (__improbable(result != KERN_SUCCESS)) {
1307 			return result;
1308 		}
1309 	} else {
1310 		*map_addr = 0;
1311 		result = vm_sanitize_size(0, size_u,
1312 		    VM_SANITIZE_CALLER_VM_ALLOCATE_ANYWHERE, map,
1313 		    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS,
1314 		    map_size);
1315 		if (__improbable(result != KERN_SUCCESS)) {
1316 			return result;
1317 		}
1318 	}
1319 
1320 	return KERN_SUCCESS;
1321 }
1322 
1323 kern_return_t
mach_vm_allocate_kernel(vm_map_t map,mach_vm_offset_ut * addr_u,mach_vm_size_ut size_u,vm_map_kernel_flags_t vmk_flags)1324 mach_vm_allocate_kernel(
1325 	vm_map_t                map,
1326 	mach_vm_offset_ut      *addr_u,
1327 	mach_vm_size_ut         size_u,
1328 	vm_map_kernel_flags_t   vmk_flags)
1329 {
1330 	vm_map_offset_t map_addr;
1331 	vm_map_size_t   map_size;
1332 	kern_return_t   result;
1333 
1334 	if (map == VM_MAP_NULL) {
1335 		ktriage_record(thread_tid(current_thread()),
1336 		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
1337 		    KDBG_TRIAGE_RESERVED,
1338 		    KDBG_TRIAGE_VM_ALLOCATE_KERNEL_BADMAP_ERROR),
1339 		    KERN_INVALID_ARGUMENT /* arg */);
1340 		return KERN_INVALID_ARGUMENT;
1341 	}
1342 
1343 	if (!vm_map_kernel_flags_check_vm_and_kflags(vmk_flags,
1344 	    VM_FLAGS_USER_ALLOCATE)) {
1345 		return KERN_INVALID_ARGUMENT;
1346 	}
1347 
1348 	result = mach_vm_allocate_kernel_sanitize(map,
1349 	    *addr_u,
1350 	    size_u,
1351 	    vmk_flags,
1352 	    &map_addr,
1353 	    &map_size);
1354 	if (__improbable(result != KERN_SUCCESS)) {
1355 		result = vm_sanitize_get_kr(result);
1356 		if (result == KERN_SUCCESS) {
1357 			*addr_u = vm_sanitize_wrap_addr(0);
1358 		} else {
1359 			ktriage_record(thread_tid(current_thread()),
1360 			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
1361 			    KDBG_TRIAGE_RESERVED,
1362 			    KDBG_TRIAGE_VM_ALLOCATE_KERNEL_BADSIZE_ERROR),
1363 			    KERN_INVALID_ARGUMENT /* arg */);
1364 		}
1365 		return result;
1366 	}
1367 
1368 	vm_map_kernel_flags_update_range_id(&vmk_flags, map, map_size);
1369 
1370 	result = vm_map_enter(
1371 		map,
1372 		&map_addr,
1373 		map_size,
1374 		(vm_map_offset_t)0,
1375 		vmk_flags,
1376 		VM_OBJECT_NULL,
1377 		(vm_object_offset_t)0,
1378 		FALSE,
1379 		VM_PROT_DEFAULT,
1380 		VM_PROT_ALL,
1381 		VM_INHERIT_DEFAULT);
1382 
1383 	if (result == KERN_SUCCESS) {
1384 #if KASAN
1385 		if (map->pmap == kernel_pmap) {
1386 			kasan_notify_address(map_addr, map_size);
1387 		}
1388 #endif
1389 		*addr_u = vm_sanitize_wrap_addr(map_addr);
1390 	} else {
1391 		ktriage_record(thread_tid(current_thread()),
1392 		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
1393 		    KDBG_TRIAGE_RESERVED,
1394 		    KDBG_TRIAGE_VM_ALLOCATE_KERNEL_VMMAPENTER_ERROR),
1395 		    result /* arg */);
1396 	}
1397 	return result;
1398 }
1399 
1400 #pragma mark population
1401 
1402 static void
kernel_memory_populate_pmap_enter(vm_object_t object,vm_address_t addr,vm_object_offset_t offset,vm_page_t mem,vm_prot_t prot,int pe_flags,pmap_mapping_type_t mapping_type)1403 kernel_memory_populate_pmap_enter(
1404 	vm_object_t             object,
1405 	vm_address_t            addr,
1406 	vm_object_offset_t      offset,
1407 	vm_page_t               mem,
1408 	vm_prot_t               prot,
1409 	int                     pe_flags,
1410 	pmap_mapping_type_t     mapping_type)
1411 {
1412 	kern_return_t   pe_result;
1413 	int             pe_options;
1414 
1415 	if (VMP_ERROR_GET(mem)) {
1416 		panic("VM page %p should not have an error", mem);
1417 	}
1418 
1419 	pe_options = PMAP_OPTIONS_NOWAIT;
1420 	if (object->internal) {
1421 		pe_options |= PMAP_OPTIONS_INTERNAL;
1422 	}
1423 	if (mem->vmp_reusable || object->all_reusable) {
1424 		pe_options |= PMAP_OPTIONS_REUSABLE;
1425 	}
1426 
1427 	pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1428 	    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1429 	    pe_flags, /* wired */ TRUE, pe_options, NULL, mapping_type);
1430 
1431 	if (pe_result == KERN_RESOURCE_SHORTAGE) {
1432 		vm_object_unlock(object);
1433 
1434 		pe_options &= ~PMAP_OPTIONS_NOWAIT;
1435 
1436 		pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1437 		    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1438 		    pe_flags, /* wired */ TRUE, pe_options, NULL, mapping_type);
1439 
1440 		vm_object_lock(object);
1441 	}
1442 
1443 	assert(pe_result == KERN_SUCCESS);
1444 }
1445 
1446 void
kernel_memory_populate_object_and_unlock(vm_object_t object,vm_address_t addr,vm_offset_t offset,vm_size_t size,vm_page_t page_list,kma_flags_t flags,vm_tag_t tag,vm_prot_t prot,pmap_mapping_type_t mapping_type)1447 kernel_memory_populate_object_and_unlock(
1448 	vm_object_t             object, /* must be locked */
1449 	vm_address_t            addr,
1450 	vm_offset_t             offset,
1451 	vm_size_t               size,
1452 	vm_page_t               page_list,
1453 	kma_flags_t             flags,
1454 	vm_tag_t                tag,
1455 	vm_prot_t               prot,
1456 	pmap_mapping_type_t     mapping_type)
1457 {
1458 	vm_page_t       mem;
1459 	int             pe_flags;
1460 	bool            gobbled_list = page_list && page_list->vmp_gobbled;
1461 
1462 	assert(((flags & KMA_KOBJECT) != 0) == (is_kernel_object(object) != 0));
1463 	assert3u((bool)(flags & KMA_COMPRESSOR), ==, object == compressor_object);
1464 
1465 #if HAS_MTE
1466 	if (!is_mte_enabled) {
1467 		assert(!(flags & KMA_TAG));
1468 	}
1469 #endif /* HAS_MTE */
1470 
1471 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
1472 		assert3u(offset, ==, addr);
1473 	} else {
1474 		/*
1475 		 * kernel_memory_populate_pmap_enter() might drop the object
1476 		 * lock, and the caller might not own a reference anymore
1477 		 * and rely on holding the vm object lock for liveness.
1478 		 */
1479 		vm_object_reference_locked(object);
1480 	}
1481 
1482 	if (flags & KMA_KSTACK) {
1483 		pe_flags = VM_MEM_STACK;
1484 	} else {
1485 		pe_flags = 0;
1486 	}
1487 
1488 #if HAS_MTE
1489 	/* Inform the PMAP layer that we want an MTE backed page. */
1490 	if (flags & KMA_TAG) {
1491 		pe_flags |= VM_MEM_MAP_MTE;
1492 		assert((object->wimg_bits & VM_WIMG_MTE) != 0);
1493 	} else {
1494 		assert((object->wimg_bits & VM_WIMG_MTE) == 0);
1495 	}
1496 #endif /* HAS_MTE */
1497 
1498 	for (vm_object_offset_t pg_offset = 0;
1499 	    pg_offset < size;
1500 	    pg_offset += PAGE_SIZE_64) {
1501 		if (page_list == NULL) {
1502 			panic("%s: page_list too short", __func__);
1503 		}
1504 
1505 		mem = page_list;
1506 		page_list = mem->vmp_snext;
1507 		mem->vmp_snext = NULL;
1508 
1509 		assert(mem->vmp_wire_count == 0);
1510 		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
1511 		assert(vm_page_is_canonical(mem));
1512 
1513 		if (flags & KMA_COMPRESSOR) {
1514 			mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
1515 			/*
1516 			 * Background processes doing I/O accounting can call
1517 			 * into NVME driver to do some work which results in
1518 			 * an allocation here and so we want to make sure
1519 			 * that the pages used by compressor, regardless of
1520 			 * process context, are never on the special Q.
1521 			 */
1522 			mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
1523 
1524 			vm_page_insert(mem, object, offset + pg_offset);
1525 		} else {
1526 			mem->vmp_q_state = VM_PAGE_IS_WIRED;
1527 			mem->vmp_wire_count = 1;
1528 
1529 #if HAS_MTE
1530 			mteinfo_increment_wire_count(mem);
1531 #endif /* HAS_MTE */
1532 
1533 			vm_page_insert_wired(mem, object, offset + pg_offset, tag);
1534 		}
1535 
1536 		mem->vmp_gobbled = false;
1537 		mem->vmp_busy = false;
1538 		mem->vmp_pmapped = true;
1539 		mem->vmp_wpmapped = true;
1540 
1541 		/*
1542 		 * Manual PMAP_ENTER_OPTIONS() with shortcuts
1543 		 * for the kernel and compressor objects.
1544 		 */
1545 		kernel_memory_populate_pmap_enter(object, addr, pg_offset,
1546 		    mem, prot, pe_flags, mapping_type);
1547 
1548 		if (flags & KMA_NOENCRYPT) {
1549 			pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
1550 		}
1551 	}
1552 
1553 	if (page_list) {
1554 		panic("%s: page_list too long", __func__);
1555 	}
1556 
1557 	vm_object_unlock(object);
1558 	if ((flags & (KMA_KOBJECT | KMA_COMPRESSOR)) == 0) {
1559 		vm_object_deallocate(object);
1560 	}
1561 
1562 	/*
1563 	 * Update the accounting:
1564 	 * - the compressor "wired" pages don't really count as wired
1565 	 * - kmem_alloc_contig_guard() gives gobbled pages,
1566 	 *   which already count as wired but need to be ungobbled.
1567 	 */
1568 	if (gobbled_list) {
1569 		vm_page_lockspin_queues();
1570 		if (flags & KMA_COMPRESSOR) {
1571 			vm_page_wire_count -= atop(size);
1572 		}
1573 		vm_page_gobble_count -= atop(size);
1574 		vm_page_unlock_queues();
1575 	} else if ((flags & KMA_COMPRESSOR) == 0) {
1576 		vm_page_lockspin_queues();
1577 		vm_page_wire_count += atop(size);
1578 		vm_page_unlock_queues();
1579 	}
1580 
1581 	if (flags & KMA_KOBJECT) {
1582 		/* vm_page_insert_wired() handles regular objects already */
1583 		vm_tag_update_size(tag, size, NULL);
1584 	}
1585 
1586 #if KASAN
1587 	if (flags & KMA_COMPRESSOR) {
1588 		kasan_notify_address_nopoison(addr, size);
1589 	} else {
1590 		kasan_notify_address(addr, size);
1591 	}
1592 #endif /* KASAN */
1593 }
1594 
1595 
1596 kern_return_t
kernel_memory_populate(vm_offset_t addr,vm_size_t size,kma_flags_t flags,vm_tag_t tag)1597 kernel_memory_populate(
1598 	vm_offset_t     addr,
1599 	vm_size_t       size,
1600 	kma_flags_t     flags,
1601 	vm_tag_t        tag)
1602 {
1603 	kern_return_t   kr = KERN_SUCCESS;
1604 	vm_page_t       page_list = NULL;
1605 	vm_size_t       page_count = atop_64(size);
1606 	vm_object_t     object = __kmem_object(ANYF(flags));
1607 
1608 #if DEBUG || DEVELOPMENT
1609 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_START,
1610 	    size, 0, 0, 0);
1611 #endif /* DEBUG || DEVELOPMENT */
1612 
1613 #if HAS_MTE
1614 	if (!is_mte_enabled) {
1615 		assert(!(flags & KMA_TAG));
1616 	}
1617 #endif /* HAS_MTE */
1618 
1619 	kr = vm_page_alloc_list(page_count, flags, &page_list);
1620 	if (kr == KERN_SUCCESS) {
1621 		vm_object_lock(object);
1622 		kernel_memory_populate_object_and_unlock(object, addr,
1623 		    addr, size, page_list, flags, tag, VM_PROT_DEFAULT,
1624 		    __kmem_mapping_type(ANYF(flags)));
1625 	}
1626 
1627 #if DEBUG || DEVELOPMENT
1628 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
1629 	    page_count, 0, 0, 0);
1630 #endif /* DEBUG || DEVELOPMENT */
1631 	return kr;
1632 }
1633 
1634 void
kernel_memory_depopulate(vm_offset_t addr,vm_size_t size,kma_flags_t flags,vm_tag_t tag)1635 kernel_memory_depopulate(
1636 	vm_offset_t        addr,
1637 	vm_size_t          size,
1638 	kma_flags_t        flags,
1639 	vm_tag_t           tag)
1640 {
1641 	vm_object_t        object = __kmem_object(ANYF(flags));
1642 	vm_object_offset_t offset = addr;
1643 	vm_page_t          mem;
1644 	vm_page_t          local_freeq = NULL;
1645 	unsigned int       pages_unwired = 0;
1646 
1647 	vm_object_lock(object);
1648 
1649 	pmap_protect(kernel_pmap, offset, offset + size, VM_PROT_NONE);
1650 
1651 	for (vm_object_offset_t pg_offset = 0;
1652 	    pg_offset < size;
1653 	    pg_offset += PAGE_SIZE_64) {
1654 		mem = vm_page_lookup(object, offset + pg_offset);
1655 
1656 		assert(mem);
1657 
1658 		if (flags & KMA_COMPRESSOR) {
1659 			assert(mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
1660 		} else {
1661 			assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
1662 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
1663 			pages_unwired++;
1664 		}
1665 
1666 		mem->vmp_busy = TRUE;
1667 
1668 		assert(mem->vmp_tabled);
1669 		vm_page_remove(mem, TRUE);
1670 		assert(mem->vmp_busy);
1671 
1672 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
1673 
1674 		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
1675 		mem->vmp_snext = local_freeq;
1676 		local_freeq = mem;
1677 	}
1678 
1679 	vm_object_unlock(object);
1680 
1681 	vm_page_free_list(local_freeq, TRUE);
1682 
1683 	if (!(flags & KMA_COMPRESSOR)) {
1684 		vm_page_lockspin_queues();
1685 		vm_page_wire_count -= pages_unwired;
1686 		vm_page_unlock_queues();
1687 	}
1688 
1689 	if (flags & KMA_KOBJECT) {
1690 		/* vm_page_remove() handles regular objects already */
1691 		vm_tag_update_size(tag, -ptoa_64(pages_unwired), NULL);
1692 	}
1693 }
1694 
1695 #pragma mark reallocation
1696 
1697 __abortlike
1698 static void
__kmem_realloc_invalid_object_size_panic(vm_map_t map,vm_address_t address,vm_size_t size,vm_map_entry_t entry)1699 __kmem_realloc_invalid_object_size_panic(
1700 	vm_map_t                map,
1701 	vm_address_t            address,
1702 	vm_size_t               size,
1703 	vm_map_entry_t          entry)
1704 {
1705 	vm_object_t object  = VME_OBJECT(entry);
1706 	vm_size_t   objsize = __kmem_entry_orig_size(entry);
1707 
1708 	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
1709 	    "object %p has unexpected size %ld",
1710 	    map, (void *)address, (size_t)size, entry, object, objsize);
1711 }
1712 
1713 __abortlike
1714 static void
__kmem_realloc_invalid_pager_panic(vm_map_t map,vm_address_t address,vm_size_t size,vm_map_entry_t entry)1715 __kmem_realloc_invalid_pager_panic(
1716 	vm_map_t                map,
1717 	vm_address_t            address,
1718 	vm_size_t               size,
1719 	vm_map_entry_t          entry)
1720 {
1721 	vm_object_t object     = VME_OBJECT(entry);
1722 	memory_object_t pager  = object->pager;
1723 	bool pager_created     = object->pager_created;
1724 	bool pager_initialized = object->pager_initialized;
1725 	bool pager_ready       = object->pager_ready;
1726 
1727 	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
1728 	    "object %p has unexpected pager %p (%d,%d,%d)",
1729 	    map, (void *)address, (size_t)size, entry, object,
1730 	    pager, pager_created, pager_initialized, pager_ready);
1731 }
1732 
1733 static kmem_return_t
kmem_realloc_shrink_guard(vm_map_t map,vm_offset_t req_oldaddr,vm_size_t req_oldsize,vm_size_t req_newsize,kmr_flags_t flags,kmem_guard_t guard,vm_map_entry_t entry)1734 kmem_realloc_shrink_guard(
1735 	vm_map_t                map,
1736 	vm_offset_t             req_oldaddr,
1737 	vm_size_t               req_oldsize,
1738 	vm_size_t               req_newsize,
1739 	kmr_flags_t             flags,
1740 	kmem_guard_t            guard,
1741 	vm_map_entry_t          entry)
1742 {
1743 	vmr_flags_t             vmr_flags = VM_MAP_REMOVE_KUNWIRE;
1744 	vm_object_t             object;
1745 	vm_offset_t             delta = 0;
1746 	kmem_return_t           kmr;
1747 	bool                    was_atomic;
1748 	vm_size_t               oldsize = round_page(req_oldsize);
1749 	vm_size_t               newsize = round_page(req_newsize);
1750 	vm_address_t            oldaddr = req_oldaddr;
1751 
1752 #if KASAN_CLASSIC
1753 	if (flags & KMR_KASAN_GUARD) {
1754 		assert((flags & (KMR_GUARD_FIRST | KMR_GUARD_LAST)) == 0);
1755 		flags   |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1756 		oldaddr -= PAGE_SIZE;
1757 		delta    = ptoa(2);
1758 		oldsize += delta;
1759 		newsize += delta;
1760 	}
1761 #endif /* KASAN_CLASSIC */
1762 
1763 	if (flags & KMR_TAG) {
1764 		oldaddr = vm_memtag_canonicalize_kernel(req_oldaddr);
1765 	}
1766 
1767 	vm_map_lock_assert_exclusive(map);
1768 
1769 	if ((flags & KMR_KOBJECT) == 0) {
1770 		object = VME_OBJECT(entry);
1771 		vm_object_reference(object);
1772 	}
1773 
1774 	/*
1775 	 *	Shrinking an atomic entry starts with splitting it,
1776 	 *	and removing the second half.
1777 	 */
1778 	was_atomic = entry->vme_atomic;
1779 	entry->vme_atomic = false;
1780 	vm_map_clip_end(map, entry, entry->vme_start + newsize);
1781 	entry->vme_atomic = was_atomic;
1782 
1783 #if KASAN
1784 	if (entry->vme_kernel_object && was_atomic) {
1785 		entry->vme_object_or_delta = (-req_newsize & PAGE_MASK) + delta;
1786 	}
1787 #if KASAN_CLASSIC
1788 	if (flags & KMR_KASAN_GUARD) {
1789 		kasan_poison_range(oldaddr + newsize, oldsize - newsize,
1790 		    ASAN_VALID);
1791 	}
1792 #endif
1793 #if KASAN_TBI
1794 	if (flags & KMR_TAG) {
1795 		kasan_tbi_mark_free_space((caddr_t)req_oldaddr + newsize, oldsize - newsize);
1796 	}
1797 #endif /* KASAN_TBI */
1798 #endif /* KASAN */
1799 	(void)vm_map_remove_and_unlock(map,
1800 	    oldaddr + newsize, oldaddr + oldsize,
1801 	    vmr_flags, KMEM_GUARD_NONE);
1802 
1803 
1804 	/*
1805 	 *	Lastly, if there are guard pages, deal with them.
1806 	 *
1807 	 *	The kernel object just needs to depopulate,
1808 	 *	regular objects require freeing the last page
1809 	 *	and replacing it with a guard.
1810 	 */
1811 	if (flags & KMR_KOBJECT) {
1812 		if (flags & KMR_GUARD_LAST) {
1813 			kma_flags_t dflags = KMA_KOBJECT;
1814 #if HAS_MTE
1815 			dflags |= (ANYF(flags) & KMEM_TAG);
1816 #endif
1817 			kernel_memory_depopulate(oldaddr + newsize - PAGE_SIZE,
1818 			    PAGE_SIZE, dflags, guard.kmg_tag);
1819 		}
1820 	} else {
1821 		vm_page_t guard_right = VM_PAGE_NULL;
1822 		vm_offset_t remove_start = newsize;
1823 
1824 		if (flags & KMR_GUARD_LAST) {
1825 			if (!map->never_faults) {
1826 				guard_right = vm_page_create_guard(true);
1827 			}
1828 			remove_start -= PAGE_SIZE;
1829 		}
1830 
1831 		vm_object_lock(object);
1832 
1833 		if (object->vo_size != oldsize) {
1834 			__kmem_realloc_invalid_object_size_panic(map,
1835 			    req_oldaddr, req_oldsize + delta, entry);
1836 		}
1837 		vm_object_set_size(object, newsize, req_newsize);
1838 
1839 		vm_object_page_remove(object, remove_start, oldsize);
1840 
1841 		if (guard_right) {
1842 			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
1843 			guard_right->vmp_busy = false;
1844 		}
1845 		vm_object_unlock(object);
1846 		vm_object_deallocate(object);
1847 	}
1848 
1849 	kmr.kmr_address = req_oldaddr;
1850 	kmr.kmr_return  = 0;
1851 #if KASAN_CLASSIC
1852 	if (flags & KMA_KASAN_GUARD) {
1853 		kasan_alloc_large(kmr.kmr_address, req_newsize);
1854 	}
1855 #endif /* KASAN_CLASSIC */
1856 #if KASAN_TBI
1857 	if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
1858 		kmr.kmr_ptr = vm_memtag_generate_and_store_tag(kmr.kmr_ptr, req_newsize);
1859 		kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize);
1860 	}
1861 #endif /* KASAN_TBI */
1862 
1863 	return kmr;
1864 }
1865 
1866 kmem_return_t
kmem_realloc_guard(vm_map_t map,vm_offset_t req_oldaddr,vm_size_t req_oldsize,vm_size_t req_newsize,kmr_flags_t flags,kmem_guard_t guard)1867 kmem_realloc_guard(
1868 	vm_map_t                map,
1869 	vm_offset_t             req_oldaddr,
1870 	vm_size_t               req_oldsize,
1871 	vm_size_t               req_newsize,
1872 	kmr_flags_t             flags,
1873 	kmem_guard_t            guard)
1874 {
1875 	vm_object_t             object;
1876 	vm_size_t               oldsize;
1877 	vm_size_t               newsize;
1878 	vm_offset_t             delta = 0;
1879 	vm_map_offset_t         oldaddr;
1880 	vm_map_offset_t         newaddr;
1881 	vm_object_offset_t      newoffs;
1882 	vm_map_entry_t          oldentry;
1883 	vm_map_entry_t          newentry;
1884 	vm_page_t               page_list = NULL;
1885 	bool                    needs_wakeup = false;
1886 	kmem_return_t           kmr = { };
1887 	unsigned int            last_timestamp;
1888 	vm_map_kernel_flags_t   vmk_flags = {
1889 		.vmkf_last_free = (bool)(flags & KMR_LAST_FREE),
1890 	};
1891 
1892 	vmlp_api_start(KMEM_REALLOC_GUARD);
1893 
1894 	assert(KMEM_REALLOC_FLAGS_VALID(flags));
1895 
1896 	if (!guard.kmg_atomic) {
1897 		if (!(flags & (KMR_DATA | KMR_DATA_SHARED))) {
1898 			__kmem_invalid_arguments_panic("realloc", map, req_oldaddr,
1899 			    req_oldsize, flags);
1900 		}
1901 
1902 		if (flags & KMR_KOBJECT) {
1903 			__kmem_invalid_arguments_panic("realloc", map, req_oldaddr,
1904 			    req_oldsize, flags);
1905 		}
1906 	}
1907 
1908 	if (req_oldaddr == 0ul) {
1909 		kmem_return_t ret = kmem_alloc_guard(map, req_newsize, 0, (kma_flags_t)flags, guard);
1910 		vmlp_api_end(KMEM_REALLOC_GUARD, ret.kmr_return);
1911 		return ret;
1912 	}
1913 
1914 	if (req_newsize == 0ul) {
1915 		kmem_free_guard(map, req_oldaddr, req_oldsize,
1916 		    (kmf_flags_t)flags, guard);
1917 		vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return);
1918 		return kmr;
1919 	}
1920 
1921 	if (req_newsize >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) {
1922 		__kmem_invalid_size_panic(map, req_newsize, flags);
1923 	}
1924 	if (req_newsize < __kmem_guard_size(ANYF(flags))) {
1925 		__kmem_invalid_size_panic(map, req_newsize, flags);
1926 	}
1927 
1928 	oldsize = round_page(req_oldsize);
1929 	newsize = round_page(req_newsize);
1930 	oldaddr = req_oldaddr;
1931 #if KASAN_CLASSIC
1932 	if (flags & KMR_KASAN_GUARD) {
1933 		flags   |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1934 		oldaddr -= PAGE_SIZE;
1935 		delta    = ptoa(2);
1936 		oldsize += delta;
1937 		newsize += delta;
1938 	}
1939 #endif /* KASAN_CLASSIC */
1940 #if CONFIG_KERNEL_TAGGING
1941 	if (flags & KMR_TAG) {
1942 		vm_memtag_verify_tag(req_oldaddr + __kmem_guard_left(ANYF(flags)));
1943 		oldaddr = vm_memtag_canonicalize_kernel(req_oldaddr);
1944 #if HAS_MTE
1945 		vmk_flags.vmf_mte = true;
1946 #endif /* HAS_MTE */
1947 	}
1948 #endif /* CONFIG_KERNEL_TAGGING */
1949 
1950 #if !KASAN
1951 	/*
1952 	 *	If not on a KASAN variant and no difference in requested size,
1953 	 *  just return.
1954 	 *
1955 	 *	Otherwise we want to validate the size and re-tag for KASAN_TBI.
1956 	 */
1957 	if (oldsize == newsize) {
1958 		kmr.kmr_address = req_oldaddr;
1959 		vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return);
1960 		return kmr;
1961 	}
1962 #endif /* !KASAN */
1963 
1964 	/*
1965 	 *	If we're growing the allocation,
1966 	 *	then reserve the pages we'll need,
1967 	 *	and find a spot for its new place.
1968 	 */
1969 	if (oldsize < newsize) {
1970 #if DEBUG || DEVELOPMENT
1971 		VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1972 		    DBG_VM_KERN_REQUEST, DBG_FUNC_START,
1973 		    newsize - oldsize, 0, 0, 0);
1974 #endif /* DEBUG || DEVELOPMENT */
1975 		kmr.kmr_return = vm_page_alloc_list(atop(newsize - oldsize),
1976 		    (kma_flags_t)flags, &page_list);
1977 		if (kmr.kmr_return == KERN_SUCCESS) {
1978 			kmem_apply_security_policy(map, (kma_flags_t)flags, guard,
1979 			    newsize, 0, &vmk_flags, true);
1980 			kmr.kmr_return = vm_map_find_space(map, 0, newsize, 0,
1981 			    vmk_flags, &newentry);
1982 		}
1983 		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
1984 			if (flags & KMR_REALLOCF) {
1985 				kmem_free_guard(map, req_oldaddr, req_oldsize,
1986 				    flags & (KMF_TAG | KMF_GUARD_FIRST |
1987 				    KMF_GUARD_LAST | KMF_KASAN_GUARD), guard);
1988 			}
1989 			if (page_list) {
1990 				vm_page_free_list(page_list, FALSE);
1991 			}
1992 #if DEBUG || DEVELOPMENT
1993 			VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1994 			    DBG_VM_KERN_REQUEST, DBG_FUNC_END,
1995 			    0, 0, 0, 0);
1996 #endif /* DEBUG || DEVELOPMENT */
1997 			vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return);
1998 			return kmr;
1999 		}
2000 
2001 		/* map is locked */
2002 	} else {
2003 		vm_map_lock(map);
2004 	}
2005 
2006 
2007 	/*
2008 	 *	Locate the entry:
2009 	 *	- wait for it to quiesce.
2010 	 *	- validate its guard,
2011 	 *	- learn its correct tag,
2012 	 */
2013 again:
2014 	if (!vm_map_lookup_entry(map, oldaddr, &oldentry)) {
2015 		__kmem_entry_not_found_panic(map, req_oldaddr);
2016 	}
2017 
2018 	vmlp_range_event_entry(map, oldentry);
2019 
2020 	if ((flags & KMR_KOBJECT) && oldentry->in_transition) {
2021 		oldentry->needs_wakeup = true;
2022 		vm_map_entry_wait(map, THREAD_UNINT);
2023 		goto again;
2024 	}
2025 	kmem_entry_validate_guard(map, oldentry, oldaddr, oldsize, guard);
2026 	if (!__kmem_entry_validate_object(oldentry, ANYF(flags))) {
2027 		__kmem_entry_validate_object_panic(map, oldentry, ANYF(flags));
2028 	}
2029 	/*
2030 	 *	TODO: We should validate for non atomic entries that the range
2031 	 *	      we are acting on is what we expect here.
2032 	 */
2033 #if KASAN
2034 	if (__kmem_entry_orig_size(oldentry) != req_oldsize) {
2035 		__kmem_realloc_invalid_object_size_panic(map,
2036 		    req_oldaddr, req_oldsize + delta, oldentry);
2037 	}
2038 
2039 	if (oldsize == newsize) {
2040 		kmr.kmr_address = req_oldaddr;
2041 		if (oldentry->vme_kernel_object) {
2042 			oldentry->vme_object_or_delta = delta +
2043 			    (-req_newsize & PAGE_MASK);
2044 		} else {
2045 			object = VME_OBJECT(oldentry);
2046 			vm_object_lock(object);
2047 			vm_object_set_size(object, newsize, req_newsize);
2048 			vm_object_unlock(object);
2049 		}
2050 		vm_map_unlock(map);
2051 
2052 #if KASAN_CLASSIC
2053 		if (flags & KMA_KASAN_GUARD) {
2054 			kasan_alloc_large(kmr.kmr_address, req_newsize);
2055 		}
2056 #endif /* KASAN_CLASSIC */
2057 #if KASAN_TBI
2058 		if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
2059 			kmr.kmr_ptr = vm_memtag_generate_and_store_tag(kmr.kmr_ptr, req_newsize);
2060 			kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize);
2061 		}
2062 #endif /* KASAN_TBI */
2063 		vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return);
2064 		return kmr;
2065 	}
2066 #endif /* KASAN */
2067 
2068 	guard.kmg_tag = VME_ALIAS(oldentry);
2069 
2070 	if (newsize < oldsize) {
2071 		kmem_return_t ret = kmem_realloc_shrink_guard(map, req_oldaddr,
2072 		    req_oldsize, req_newsize, flags, guard, oldentry);
2073 		vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return);
2074 		return ret;
2075 	}
2076 
2077 
2078 	/*
2079 	 *	We are growing the entry
2080 	 *
2081 	 *	For regular objects we use the object `vo_size` updates
2082 	 *	as a guarantee that no 2 kmem_realloc() can happen
2083 	 *	concurrently (by doing it before the map is unlocked.
2084 	 *
2085 	 *	For the kernel object, prevent the entry from being
2086 	 *	reallocated or changed by marking it "in_transition".
2087 	 */
2088 
2089 	object = VME_OBJECT(oldentry);
2090 	vm_object_lock(object);
2091 	vm_object_reference_locked(object);
2092 
2093 	newaddr = newentry->vme_start;
2094 	newoffs = oldsize;
2095 
2096 	vmlp_range_event_entry(map, newentry);
2097 
2098 	VME_OBJECT_SET(newentry, object, guard.kmg_atomic, guard.kmg_context);
2099 	VME_ALIAS_SET(newentry, guard.kmg_tag);
2100 	if (flags & KMR_KOBJECT) {
2101 		oldentry->in_transition = true;
2102 		VME_OFFSET_SET(newentry, newaddr);
2103 		newentry->wired_count = 1;
2104 		vme_btref_consider_and_set(newentry, __builtin_frame_address(0));
2105 		newoffs = newaddr + oldsize;
2106 #if KASAN
2107 		newentry->vme_object_or_delta = delta +
2108 		    (-req_newsize & PAGE_MASK);
2109 #endif /* KASAN */
2110 	} else {
2111 		if (object->pager_created || object->pager) {
2112 			/*
2113 			 * We can't "realloc/grow" the pager, so pageable
2114 			 * allocations should not go through this path.
2115 			 */
2116 			__kmem_realloc_invalid_pager_panic(map,
2117 			    req_oldaddr, req_oldsize + delta, oldentry);
2118 		}
2119 		if (object->vo_size != oldsize) {
2120 			__kmem_realloc_invalid_object_size_panic(map,
2121 			    req_oldaddr, req_oldsize + delta, oldentry);
2122 		}
2123 		vm_object_set_size(object, newsize, req_newsize);
2124 	}
2125 
2126 	last_timestamp = map->timestamp;
2127 	vm_map_unlock(map);
2128 
2129 
2130 	/*
2131 	 *	Now proceed with the population of pages.
2132 	 *
2133 	 *	Kernel objects can use the kmem population helpers.
2134 	 *
2135 	 *	Regular objects will insert pages manually,
2136 	 *	then wire the memory into the new range.
2137 	 */
2138 
2139 	vm_size_t guard_right_size = __kmem_guard_right(ANYF(flags));
2140 
2141 	if (flags & KMR_KOBJECT) {
2142 		pmap_mapping_type_t mapping_type = __kmem_mapping_type(ANYF(flags));
2143 
2144 		pmap_protect(kernel_pmap,
2145 		    oldaddr, oldaddr + oldsize - guard_right_size,
2146 		    VM_PROT_NONE);
2147 
2148 		for (vm_object_offset_t offset = 0;
2149 		    offset < oldsize - guard_right_size;
2150 		    offset += PAGE_SIZE_64) {
2151 			vm_page_t mem;
2152 
2153 			mem = vm_page_lookup(object, oldaddr + offset);
2154 			if (mem == VM_PAGE_NULL) {
2155 				continue;
2156 			}
2157 
2158 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
2159 
2160 			mem->vmp_busy = true;
2161 			vm_page_remove(mem, true);
2162 			vm_page_insert_wired(mem, object, newaddr + offset,
2163 			    guard.kmg_tag);
2164 			mem->vmp_busy = false;
2165 
2166 			kernel_memory_populate_pmap_enter(object, newaddr,
2167 			    offset, mem, VM_PROT_DEFAULT, 0, mapping_type);
2168 		}
2169 
2170 		kernel_memory_populate_object_and_unlock(object,
2171 		    newaddr + oldsize - guard_right_size,
2172 		    newoffs - guard_right_size,
2173 		    newsize - oldsize,
2174 		    page_list, (kma_flags_t)flags,
2175 		    guard.kmg_tag, VM_PROT_DEFAULT, mapping_type);
2176 	} else {
2177 		vm_page_t guard_right = VM_PAGE_NULL;
2178 
2179 		/*
2180 		 *	Note: we are borrowing the new entry reference
2181 		 *	on the object for the duration of this code,
2182 		 *	which works because we keep the object locked
2183 		 *	throughout.
2184 		 */
2185 		if ((flags & KMR_GUARD_LAST) && !map->never_faults) {
2186 			guard_right = vm_page_lookup(object, oldsize - PAGE_SIZE);
2187 			assert(vm_page_is_guard(guard_right));
2188 			guard_right->vmp_busy = true;
2189 			vm_page_remove(guard_right, true);
2190 		}
2191 
2192 		if (flags & KMR_FREEOLD) {
2193 			/*
2194 			 * Freeing the old mapping will make
2195 			 * the old pages become pageable until
2196 			 * the new mapping makes them wired again.
2197 			 * Let's take an extra "wire_count" to
2198 			 * prevent any accidental "page out".
2199 			 * We'll have to undo that after wiring
2200 			 * the new mapping.
2201 			 */
2202 			vm_object_reference_locked(object); /* keep object alive */
2203 			for (vm_object_offset_t offset = 0;
2204 			    offset < oldsize - guard_right_size;
2205 			    offset += PAGE_SIZE_64) {
2206 				vm_page_t mem;
2207 
2208 				mem = vm_page_lookup(object, offset);
2209 				assert(mem != VM_PAGE_NULL);
2210 				assertf(!VM_PAGE_PAGEABLE(mem),
2211 				    "mem %p qstate %d",
2212 				    mem, mem->vmp_q_state);
2213 				if (vm_page_is_guard(mem)) {
2214 					/* guard pages are not wired */
2215 				} else {
2216 					assertf(VM_PAGE_WIRED(mem),
2217 					    "mem %p qstate %d wirecount %d",
2218 					    mem,
2219 					    mem->vmp_q_state,
2220 					    mem->vmp_wire_count);
2221 					assertf(mem->vmp_wire_count >= 1,
2222 					    "mem %p wirecount %d",
2223 					    mem, mem->vmp_wire_count);
2224 					mem->vmp_wire_count++;
2225 				}
2226 			}
2227 		}
2228 
2229 		for (vm_object_offset_t offset = oldsize - guard_right_size;
2230 		    offset < newsize - guard_right_size;
2231 		    offset += PAGE_SIZE_64) {
2232 			vm_page_t mem = page_list;
2233 
2234 			page_list = mem->vmp_snext;
2235 			mem->vmp_snext = VM_PAGE_NULL;
2236 			assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
2237 			assert(!VM_PAGE_PAGEABLE(mem));
2238 
2239 			vm_page_insert(mem, object, offset);
2240 			mem->vmp_busy = false;
2241 		}
2242 
2243 		if (guard_right) {
2244 			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
2245 			guard_right->vmp_busy = false;
2246 		}
2247 
2248 		vm_object_unlock(object);
2249 	}
2250 
2251 	/*
2252 	 *	Mark the entry as idle again,
2253 	 *	and honor KMR_FREEOLD if needed.
2254 	 */
2255 
2256 	vm_map_lock(map);
2257 	if (last_timestamp + 1 != map->timestamp &&
2258 	    !vm_map_lookup_entry(map, oldaddr, &oldentry)) {
2259 		__kmem_entry_not_found_panic(map, req_oldaddr);
2260 	}
2261 
2262 	if (flags & KMR_KOBJECT) {
2263 		assert(oldentry->in_transition);
2264 		oldentry->in_transition = false;
2265 		if (oldentry->needs_wakeup) {
2266 			needs_wakeup = true;
2267 			oldentry->needs_wakeup = false;
2268 		}
2269 	}
2270 
2271 	if (flags & KMR_FREEOLD) {
2272 		vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2273 
2274 #if KASAN_CLASSIC
2275 		if (flags & KMR_KASAN_GUARD) {
2276 			kasan_poison_range(oldaddr, oldsize, ASAN_VALID);
2277 		}
2278 #endif
2279 #if KASAN_TBI
2280 		if (flags & KMR_TAG) {
2281 			kasan_tbi_mark_free_space((caddr_t)req_oldaddr, oldsize);
2282 		}
2283 #endif /* KASAN_TBI */
2284 		if (flags & KMR_GUARD_LAST) {
2285 			vmr_flags |= VM_MAP_REMOVE_NOKUNWIRE_LAST;
2286 		}
2287 		(void)vm_map_remove_and_unlock(map,
2288 		    oldaddr, oldaddr + oldsize,
2289 		    vmr_flags, guard);
2290 	} else {
2291 		vm_map_unlock(map);
2292 	}
2293 
2294 	if ((flags & KMR_KOBJECT) == 0) {
2295 		kern_return_t kr;
2296 		/*
2297 		 * This must happen _after_ we do the KMR_FREEOLD,
2298 		 * because wiring the pages will call into the pmap,
2299 		 * and if the pages are typed XNU_KERNEL_RESTRICTED,
2300 		 * this would cause a second mapping of the page and panic.
2301 		 */
2302 		kr = vm_map_wire_kernel(map,
2303 		    vm_sanitize_wrap_addr(newaddr),
2304 		    vm_sanitize_wrap_addr(newaddr + newsize),
2305 		    vm_sanitize_wrap_prot(VM_PROT_DEFAULT),
2306 		    guard.kmg_tag, FALSE);
2307 		assert(kr == KERN_SUCCESS);
2308 
2309 		if (flags & KMR_FREEOLD) {
2310 			/*
2311 			 * Undo the extra "wiring" we made above
2312 			 * and release the extra reference we took
2313 			 * on the object.
2314 			 */
2315 			vm_object_lock(object);
2316 			for (vm_object_offset_t offset = 0;
2317 			    offset < oldsize - guard_right_size;
2318 			    offset += PAGE_SIZE_64) {
2319 				vm_page_t mem;
2320 
2321 				mem = vm_page_lookup(object, offset);
2322 				assert(mem != VM_PAGE_NULL);
2323 				assertf(!VM_PAGE_PAGEABLE(mem),
2324 				    "mem %p qstate %d",
2325 				    mem, mem->vmp_q_state);
2326 				if (vm_page_is_guard(mem)) {
2327 					/* guard pages are not wired */
2328 				} else {
2329 					assertf(VM_PAGE_WIRED(mem),
2330 					    "mem %p qstate %d wirecount %d",
2331 					    mem,
2332 					    mem->vmp_q_state,
2333 					    mem->vmp_wire_count);
2334 					assertf(mem->vmp_wire_count >= 2,
2335 					    "mem %p wirecount %d",
2336 					    mem, mem->vmp_wire_count);
2337 					mem->vmp_wire_count--;
2338 					assert(VM_PAGE_WIRED(mem));
2339 					assert(mem->vmp_wire_count >= 1);
2340 				}
2341 			}
2342 			vm_object_unlock(object);
2343 			vm_object_deallocate(object); /* release extra ref */
2344 		}
2345 	}
2346 
2347 	if (needs_wakeup) {
2348 		vm_map_entry_wakeup(map);
2349 	}
2350 
2351 #if DEBUG || DEVELOPMENT
2352 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
2353 	    atop(newsize - oldsize), 0, 0, 0);
2354 #endif /* DEBUG || DEVELOPMENT */
2355 	kmr.kmr_address = newaddr;
2356 
2357 #if KASAN
2358 	kasan_notify_address(kmr.kmr_address, newsize);
2359 #endif /* KASAN */
2360 #if KASAN_CLASSIC
2361 	if (flags & KMR_KASAN_GUARD) {
2362 		kmr.kmr_address += PAGE_SIZE;
2363 		kasan_alloc_large(kmr.kmr_address, req_newsize);
2364 	}
2365 #endif /* KASAN_CLASSIC */
2366 #if CONFIG_KERNEL_TAGGING
2367 	if (flags & KMR_TAG) {
2368 #if HAS_MTE
2369 		kmr.kmr_address = vm_memtag_insert_tag(kmr.kmr_address,
2370 		    vm_memtag_extract_tag(req_oldaddr));
2371 		vm_memtag_store_tag((caddr_t)kmr.kmr_ptr + oldsize - guard_right_size,
2372 		    newsize - oldsize);
2373 #elif KASAN_TBI
2374 		/*
2375 		 * Validate the current buffer, then generate a new tag,
2376 		 * even if the address is stable, it's a "new" allocation.
2377 		 */
2378 		__asan_loadN((vm_offset_t)kmr.kmr_address, oldsize);
2379 		kmr.kmr_ptr = vm_memtag_generate_and_store_tag(kmr.kmr_ptr, req_newsize);
2380 		kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize);
2381 #endif /* KASAN_TBI */
2382 	}
2383 #endif /* CONFIG_KERNEL_TAGGING */
2384 
2385 	vmlp_api_end(KMEM_REALLOC_GUARD, kmr.kmr_return);
2386 	return kmr;
2387 }
2388 
2389 #pragma mark map/remap/wire
2390 
2391 kern_return_t
mach_vm_map_kernel(vm_map_t target_map,mach_vm_offset_ut * address,mach_vm_size_ut initial_size,mach_vm_offset_ut mask,vm_map_kernel_flags_t vmk_flags,ipc_port_t port,memory_object_offset_ut offset,boolean_t copy,vm_prot_ut cur_protection,vm_prot_ut max_protection,vm_inherit_ut inheritance)2392 mach_vm_map_kernel(
2393 	vm_map_t                target_map,
2394 	mach_vm_offset_ut      *address,
2395 	mach_vm_size_ut         initial_size,
2396 	mach_vm_offset_ut       mask,
2397 	vm_map_kernel_flags_t   vmk_flags,
2398 	ipc_port_t              port,
2399 	memory_object_offset_ut offset,
2400 	boolean_t               copy,
2401 	vm_prot_ut              cur_protection,
2402 	vm_prot_ut              max_protection,
2403 	vm_inherit_ut           inheritance)
2404 {
2405 	/* range_id is set by vm_map_enter_mem_object */
2406 	return vm_map_enter_mem_object(target_map,
2407 	           address,
2408 	           initial_size,
2409 	           mask,
2410 	           vmk_flags,
2411 	           port,
2412 	           offset,
2413 	           copy,
2414 	           cur_protection,
2415 	           max_protection,
2416 	           inheritance,
2417 	           NULL,
2418 	           0);
2419 }
2420 
2421 kern_return_t
mach_vm_remap_new_kernel(vm_map_t target_map,mach_vm_offset_ut * address,mach_vm_size_ut size,mach_vm_offset_ut mask,vm_map_kernel_flags_t vmk_flags,vm_map_t src_map,mach_vm_offset_ut memory_address,boolean_t copy,vm_prot_ut * cur_protection,vm_prot_ut * max_protection,vm_inherit_ut inheritance)2422 mach_vm_remap_new_kernel(
2423 	vm_map_t                target_map,
2424 	mach_vm_offset_ut      *address,
2425 	mach_vm_size_ut         size,
2426 	mach_vm_offset_ut       mask,
2427 	vm_map_kernel_flags_t   vmk_flags,
2428 	vm_map_t                src_map,
2429 	mach_vm_offset_ut       memory_address,
2430 	boolean_t               copy,
2431 	vm_prot_ut             *cur_protection,   /* IN/OUT */
2432 	vm_prot_ut             *max_protection,   /* IN/OUT */
2433 	vm_inherit_ut           inheritance)
2434 {
2435 	if (!vm_map_kernel_flags_check_vm_and_kflags(vmk_flags,
2436 	    VM_FLAGS_USER_REMAP)) {
2437 		return KERN_INVALID_ARGUMENT;
2438 	}
2439 
2440 
2441 	vmk_flags.vmf_return_data_addr = true;
2442 
2443 	/* range_id is set by vm_map_remap */
2444 	return vm_map_remap(target_map,
2445 	           address,
2446 	           size,
2447 	           mask,
2448 	           vmk_flags,
2449 	           src_map,
2450 	           memory_address,
2451 	           copy,
2452 	           cur_protection,
2453 	           max_protection,
2454 	           inheritance);
2455 }
2456 
2457 #pragma mark free
2458 
2459 #if KASAN
2460 
2461 __abortlike
2462 static void
__kmem_free_invalid_object_size_panic(vm_map_t map,vm_address_t address,vm_size_t size,vm_map_entry_t entry)2463 __kmem_free_invalid_object_size_panic(
2464 	vm_map_t                map,
2465 	vm_address_t            address,
2466 	vm_size_t               size,
2467 	vm_map_entry_t          entry)
2468 {
2469 	vm_object_t object  = VME_OBJECT(entry);
2470 	vm_size_t   objsize = __kmem_entry_orig_size(entry);
2471 
2472 	panic("kmem_free(map=%p, addr=%p, size=%zd, entry=%p): "
2473 	    "object %p has unexpected size %ld",
2474 	    map, (void *)address, (size_t)size, entry, object, objsize);
2475 }
2476 
2477 #endif /* KASAN */
2478 
2479 __mockable vm_size_t
kmem_free_guard(vm_map_t map,vm_offset_t req_addr,vm_size_t req_size,kmf_flags_t flags,kmem_guard_t guard)2480 kmem_free_guard(
2481 	vm_map_t        map,
2482 	vm_offset_t     req_addr,
2483 	vm_size_t       req_size,
2484 	kmf_flags_t     flags,
2485 	kmem_guard_t    guard)
2486 {
2487 	vmr_flags_t     vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2488 	vm_address_t    addr      = req_addr;
2489 	vm_offset_t     delta     = 0;
2490 	vm_size_t       size;
2491 #if KASAN
2492 	vm_map_entry_t  entry;
2493 #endif /* KASAN */
2494 
2495 	vmlp_api_start(KMEM_FREE_GUARD);
2496 
2497 	assert(map->pmap == kernel_pmap);
2498 
2499 #if KASAN_CLASSIC
2500 	if (flags & KMF_KASAN_GUARD) {
2501 		addr  -= PAGE_SIZE;
2502 		delta  = ptoa(2);
2503 	}
2504 #endif /* KASAN_CLASSIC */
2505 #if CONFIG_KERNEL_TAGGING
2506 	if (flags & KMF_TAG) {
2507 		vm_memtag_verify_tag(req_addr + __kmem_guard_left(ANYF(flags)));
2508 		addr = vm_memtag_canonicalize_kernel(req_addr);
2509 	}
2510 #endif /* CONFIG_KERNEL_TAGGING */
2511 
2512 	if (flags & KMF_GUESS_SIZE) {
2513 		vmr_flags |= VM_MAP_REMOVE_GUESS_SIZE;
2514 		size = PAGE_SIZE;
2515 	} else if (req_size == 0) {
2516 		__kmem_invalid_size_panic(map, req_size, flags);
2517 	} else {
2518 		size = round_page(req_size) + delta;
2519 	}
2520 
2521 	vm_map_lock(map);
2522 
2523 #if KASAN
2524 	if (!vm_map_lookup_entry(map, addr, &entry)) {
2525 		__kmem_entry_not_found_panic(map, req_addr);
2526 	}
2527 	if (flags & KMF_GUESS_SIZE) {
2528 		vmr_flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
2529 		req_size = __kmem_entry_orig_size(entry);
2530 		size = round_page(req_size + delta);
2531 	} else if (guard.kmg_atomic && entry->vme_kernel_object &&
2532 	    __kmem_entry_orig_size(entry) != req_size) {
2533 		/*
2534 		 * We can't make a strict check for regular
2535 		 * VM objects because it could be:
2536 		 *
2537 		 * - the kmem_guard_free() of a kmem_realloc_guard() without
2538 		 *   KMR_FREEOLD, and in that case the object size won't match.
2539 		 *
2540 		 * - a submap, in which case there is no "orig size".
2541 		 */
2542 		__kmem_free_invalid_object_size_panic(map,
2543 		    req_addr, req_size + delta, entry);
2544 	}
2545 #endif /* KASAN */
2546 #if KASAN_CLASSIC
2547 	if (flags & KMR_KASAN_GUARD) {
2548 		kasan_poison_range(addr, size, ASAN_VALID);
2549 	}
2550 #endif
2551 #if KASAN_TBI
2552 	if (flags & KMF_TAG) {
2553 		kasan_tbi_mark_free_space((caddr_t)req_addr, size);
2554 	}
2555 #endif /* KASAN_TBI */
2556 
2557 	/*
2558 	 * vm_map_remove_and_unlock is called with VM_MAP_REMOVE_KUNWIRE, which
2559 	 * unwires the kernel mapping. The page won't be mapped any longer so
2560 	 * there is no extra step that is required for memory tagging to "clear"
2561 	 * it -- the page will be later laundered when reused.
2562 	 */
2563 	vmlp_range_event(map, addr, size);
2564 	vmlp_api_end(KMEM_FREE_GUARD, 0);
2565 	return vm_map_remove_and_unlock(map, addr, addr + size,
2566 	           vmr_flags, guard).kmr_size - delta;
2567 }
2568 
2569 __exported void
2570 kmem_free_external(
2571 	vm_map_t        map,
2572 	vm_offset_t     addr,
2573 	vm_size_t       size);
2574 void
kmem_free_external(vm_map_t map,vm_offset_t addr,vm_size_t size)2575 kmem_free_external(
2576 	vm_map_t        map,
2577 	vm_offset_t     addr,
2578 	vm_size_t       size)
2579 {
2580 	if (size) {
2581 		kmem_free(map, trunc_page(addr), size);
2582 #if MACH_ASSERT
2583 	} else {
2584 		printf("kmem_free(map=%p, addr=%p) called with size=0, lr: %p\n",
2585 		    map, (void *)addr, __builtin_return_address(0));
2586 #endif
2587 	}
2588 }
2589 
2590 #pragma mark kmem metadata
2591 
2592 /*
2593  * Guard objects for kmem pointer allocation:
2594  *
2595  * Guard objects introduce size slabs to kmem pointer allocations that are
2596  * allocated in chunks of n * sizeclass. When an allocation of a specific
2597  * sizeclass is requested a random slot from [0, n) is returned.
2598  * Allocations are returned from that chunk until m slots are left. The
2599  * remaining m slots are referred to as guard objects. They don't get
2600  * allocated and the chunk is now considered full. When an allocation is
2601  * freed to the chunk 1 slot is now available from m + 1 for the next
2602  * allocation of that sizeclass.
2603  *
2604  * Guard objects are intended to make exploitation of use after frees harder
2605  * as allocations that are freed can no longer be reliable reallocated.
2606  * They also make exploitation of OOBs harder as overflowing out of an
2607  * allocation can no longer be safe even with sufficient spraying.
2608  */
2609 
2610 #define KMEM_META_PRIMARY    0xf
2611 #define KMEM_META_START      0xe
2612 #define KMEM_META_FREE       0xd
2613 #if __ARM_16K_PG__
2614 #define KMEM_MIN_SIZE        PAGE_SIZE
2615 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 16)
2616 #else /* __ARM_16K_PG__ */
2617 /*
2618  * PAGE_SIZE isn't a compile time constant on some arm64 devices. Those
2619  * devices use 4k page size when their RAM is <= 1GB and 16k otherwise.
2620  * Therefore populate sizeclasses from 4k for those devices.
2621  */
2622 #define KMEM_MIN_SIZE       (4 * 1024)
2623 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 32)
2624 #endif /* __ARM_16K_PG__ */
2625 #define KMEM_MAX_SIZE       (32ULL << 20)
2626 #define KMEM_START_IDX      (kmem_log2down(KMEM_MIN_SIZE))
2627 #define KMEM_LAST_IDX       (kmem_log2down(KMEM_MAX_SIZE))
2628 #define KMEM_NUM_SIZECLASS  (KMEM_LAST_IDX - KMEM_START_IDX + 1)
2629 #define KMEM_FRONTS         (KMEM_RANGE_ID_NUM_PTR * 2)
2630 #define KMEM_NUM_SLOTS       8
2631 #define KMEM_NUM_GUARDS      2
2632 #define KMEM_NUM_QUARANTINE  2
2633 
2634 #define KMEM_PAGEMARKER_BITS 4
2635 #define KMEM_SIZECLASS_BITS  4
2636 #define KMEM_QUARANTINE_BITS 3
2637 #define KMEM_AVAIL_BITS      5
2638 
2639 static_assert(KMEM_NUM_SIZECLASS <= (1u << KMEM_SIZECLASS_BITS));
2640 
2641 struct kmem_page_meta {
2642 	union {
2643 		/*
2644 		 * On primary allocated chunk with KMEM_META_PRIMARY marker
2645 		 */
2646 		uint32_t km_bitmap;
2647 		/*
2648 		 * On start and end of free chunk with KMEM_META_FREE marker
2649 		 */
2650 		uint32_t km_free_chunks;
2651 	};
2652 
2653 	/*
2654 	 * KMEM_META_PRIMARY: Start meta of allocated chunk
2655 	 * KMEM_META_FREE   : Start and end meta of free chunk
2656 	 * KMEM_META_START  : Meta region start and end
2657 	 */
2658 	uint8_t  km_page_marker : KMEM_PAGEMARKER_BITS;
2659 	uint8_t  km_sizeclass   : KMEM_SIZECLASS_BITS;
2660 	uint8_t  km_quarantined : KMEM_QUARANTINE_BITS;
2661 	uint8_t  km_avail_count : KMEM_AVAIL_BITS;
2662 
2663 	union {
2664 		/*
2665 		 * On primary allocated chunk with KMEM_META_PRIMARY marker
2666 		 */
2667 		uint16_t km_chunk_len;
2668 		/*
2669 		 * On secondary allocated chunks
2670 		 */
2671 		uint16_t km_page_idx;
2672 	};
2673 	LIST_ENTRY(kmem_page_meta) km_link;
2674 } kmem_page_meta_t;
2675 
2676 typedef LIST_HEAD(kmem_list_head, kmem_page_meta) kmem_list_head_t;
2677 struct kmem_sizeclass {
2678 	vm_map_size_t                   ks_size;
2679 	uint32_t                        ks_num_chunk;
2680 	uint32_t                        ks_num_elem;
2681 	crypto_random_ctx_t __zpercpu   ks_rng_ctx;
2682 	kmem_list_head_t                ks_allfree_head[KMEM_FRONTS];
2683 	kmem_list_head_t                ks_partial_head[KMEM_FRONTS];
2684 	kmem_list_head_t                ks_full_head[KMEM_FRONTS];
2685 };
2686 
2687 static struct kmem_sizeclass kmem_size_array[KMEM_NUM_SIZECLASS];
2688 
2689 /*
2690  * Locks to synchronize metadata population
2691  */
2692 static LCK_GRP_DECLARE(kmem_locks_grp, "kmem_locks");
2693 static LCK_MTX_DECLARE(kmem_meta_region_lck, &kmem_locks_grp);
2694 #define kmem_meta_lock()   lck_mtx_lock(&kmem_meta_region_lck)
2695 #define kmem_meta_unlock() lck_mtx_unlock(&kmem_meta_region_lck)
2696 
2697 static SECURITY_READ_ONLY_LATE(struct mach_vm_range)
2698 kmem_meta_range[KMEM_RANGE_ID_NUM_PTR + 1];
2699 static SECURITY_READ_ONLY_LATE(struct kmem_page_meta *)
2700 kmem_meta_base[KMEM_RANGE_ID_NUM_PTR + 1];
2701 /*
2702  * Keeps track of metadata high water mark for each front
2703  */
2704 static struct kmem_page_meta *kmem_meta_hwm[KMEM_FRONTS];
2705 static SECURITY_READ_ONLY_LATE(vm_map_t)
2706 kmem_meta_map[KMEM_RANGE_ID_NUM_PTR + 1];
2707 static vm_map_size_t kmem_meta_size;
2708 
2709 static uint32_t
kmem_guard_count(struct kmem_sizeclass * kmem)2710 kmem_guard_count(struct kmem_sizeclass *kmem)
2711 {
2712 	return kmem->ks_num_elem * KMEM_NUM_GUARDS / KMEM_NUM_SLOTS;
2713 }
2714 
2715 static uint32_t
kmem_quarantine_count(struct kmem_sizeclass * kmem)2716 kmem_quarantine_count(struct kmem_sizeclass *kmem)
2717 {
2718 	return kmem->ks_num_elem * KMEM_NUM_QUARANTINE /
2719 	       KMEM_NUM_SLOTS;
2720 }
2721 
2722 static uint32_t
kmem_get_front(kmem_range_id_t range_id,bool from_right)2723 kmem_get_front(
2724 	kmem_range_id_t         range_id,
2725 	bool                    from_right)
2726 {
2727 	assert((range_id >= KMEM_RANGE_ID_FIRST) &&
2728 	    (range_id <= KMEM_RANGE_ID_NUM_PTR));
2729 	return (range_id - KMEM_RANGE_ID_FIRST) * 2 + from_right;
2730 }
2731 
2732 static inline uint32_t
kmem_slot_idx_to_bit(uint32_t slot_idx,uint32_t size_idx __unused)2733 kmem_slot_idx_to_bit(
2734 	uint32_t                slot_idx,
2735 	uint32_t                size_idx __unused)
2736 {
2737 	assert(slot_idx < kmem_size_array[size_idx].ks_num_elem);
2738 	return 1ull << slot_idx;
2739 }
2740 
2741 static uint32_t
kmem_get_idx_from_size(vm_map_size_t size)2742 kmem_get_idx_from_size(vm_map_size_t size)
2743 {
2744 	assert(size >= KMEM_MIN_SIZE && size <= KMEM_MAX_SIZE);
2745 	return kmem_log2down(size - 1) - KMEM_START_IDX + 1;
2746 }
2747 
2748 __abortlike
2749 static void
kmem_invalid_size_idx(uint32_t idx)2750 kmem_invalid_size_idx(uint32_t idx)
2751 {
2752 	panic("Invalid sizeclass idx %u", idx);
2753 }
2754 
2755 static vm_map_size_t
kmem_get_size_from_idx(uint32_t idx)2756 kmem_get_size_from_idx(uint32_t idx)
2757 {
2758 	if (__improbable(idx >= KMEM_NUM_SIZECLASS)) {
2759 		kmem_invalid_size_idx(idx);
2760 	}
2761 	return 1ul << (idx + KMEM_START_IDX);
2762 }
2763 
2764 static inline uint16_t
kmem_get_page_idx(struct kmem_page_meta * meta)2765 kmem_get_page_idx(struct kmem_page_meta *meta)
2766 {
2767 	uint8_t page_marker = meta->km_page_marker;
2768 
2769 	return (page_marker == KMEM_META_PRIMARY) ? 0 : meta->km_page_idx;
2770 }
2771 
2772 __abortlike
2773 static void
kmem_invalid_chunk_len(struct kmem_page_meta * meta)2774 kmem_invalid_chunk_len(struct kmem_page_meta *meta)
2775 {
2776 	panic("Reading free chunks for meta %p where marker != KMEM_META_PRIMARY",
2777 	    meta);
2778 }
2779 
2780 static inline uint16_t
kmem_get_chunk_len(struct kmem_page_meta * meta)2781 kmem_get_chunk_len(struct kmem_page_meta *meta)
2782 {
2783 	if (__improbable(meta->km_page_marker != KMEM_META_PRIMARY)) {
2784 		kmem_invalid_chunk_len(meta);
2785 	}
2786 
2787 	return meta->km_chunk_len;
2788 }
2789 
2790 __abortlike
2791 static void
kmem_invalid_free_chunk_len(struct kmem_page_meta * meta)2792 kmem_invalid_free_chunk_len(struct kmem_page_meta *meta)
2793 {
2794 	panic("Reading free chunks for meta %p where marker != KMEM_META_FREE",
2795 	    meta);
2796 }
2797 
2798 static inline uint32_t
kmem_get_free_chunk_len(struct kmem_page_meta * meta)2799 kmem_get_free_chunk_len(struct kmem_page_meta *meta)
2800 {
2801 	if (__improbable(meta->km_page_marker != KMEM_META_FREE)) {
2802 		kmem_invalid_free_chunk_len(meta);
2803 	}
2804 
2805 	return meta->km_free_chunks;
2806 }
2807 
2808 /*
2809  * Return the metadata corresponding to the specified address
2810  */
2811 static struct kmem_page_meta *
kmem_addr_to_meta(vm_map_offset_t addr,vm_map_range_id_t range_id,vm_map_offset_t * range_start,uint64_t * meta_idx)2812 kmem_addr_to_meta(
2813 	vm_map_offset_t         addr,
2814 	vm_map_range_id_t       range_id,
2815 	vm_map_offset_t        *range_start,
2816 	uint64_t               *meta_idx)
2817 {
2818 	struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
2819 
2820 	*range_start = kmem_ranges[range_id].min_address;
2821 	*meta_idx = (addr - *range_start) / KMEM_CHUNK_SIZE_MIN;
2822 	return VM_FAR_ADD_PTR_UNBOUNDED(meta_base, *meta_idx);
2823 }
2824 
2825 /*
2826  * Return the metadata start of the chunk that the address belongs to
2827  */
2828 static struct kmem_page_meta *
kmem_addr_to_meta_start(vm_address_t addr,vm_map_range_id_t range_id,vm_map_offset_t * chunk_start)2829 kmem_addr_to_meta_start(
2830 	vm_address_t            addr,
2831 	vm_map_range_id_t       range_id,
2832 	vm_map_offset_t        *chunk_start)
2833 {
2834 	vm_map_offset_t range_start;
2835 	uint64_t meta_idx;
2836 	struct kmem_page_meta *meta;
2837 
2838 	meta = kmem_addr_to_meta(addr, range_id, &range_start, &meta_idx);
2839 	meta_idx -= kmem_get_page_idx(meta);
2840 	meta = VM_FAR_ADD_PTR_UNBOUNDED(meta, -(ptrdiff_t)kmem_get_page_idx(meta));
2841 	assert(meta->km_page_marker == KMEM_META_PRIMARY);
2842 	*chunk_start = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN);
2843 	return meta;
2844 }
2845 
2846 __startup_func
2847 static void
kmem_init_meta_front(struct kmem_page_meta * meta,kmem_range_id_t range_id,bool from_right)2848 kmem_init_meta_front(
2849 	struct kmem_page_meta  *meta,
2850 	kmem_range_id_t         range_id,
2851 	bool                    from_right)
2852 {
2853 	kernel_memory_populate(trunc_page((vm_map_offset_t) meta), PAGE_SIZE,
2854 	    KMA_KOBJECT | KMA_ZERO | KMA_NOFAIL, VM_KERN_MEMORY_OSFMK);
2855 	meta->km_page_marker = KMEM_META_START;
2856 	if (!from_right) {
2857 		meta++;
2858 		kmem_meta_base[range_id] = meta;
2859 	}
2860 	kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta;
2861 }
2862 
2863 __startup_func
2864 static void
kmem_metadata_init(void)2865 kmem_metadata_init(void)
2866 {
2867 	for (kmem_range_id_t i = KMEM_RANGE_ID_FIRST; i <= kmem_ptr_ranges; i++) {
2868 		vm_map_offset_t addr = kmem_meta_range[i].min_address;
2869 		struct kmem_page_meta *meta;
2870 		uint64_t meta_idx;
2871 
2872 		vm_map_will_allocate_early_map(&kmem_meta_map[i]);
2873 		kmem_meta_map[i] = kmem_suballoc(kernel_map, &addr, kmem_meta_size,
2874 		    VM_MAP_CREATE_NEVER_FAULTS | VM_MAP_CREATE_DISABLE_HOLELIST,
2875 		    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
2876 		    KMS_PERMANENT | KMS_NOFAIL | KMS_NOSOFTLIMIT,
2877 		    VM_KERN_MEMORY_OSFMK).kmr_submap;
2878 
2879 		kmem_meta_range[i].min_address = addr;
2880 		kmem_meta_range[i].max_address = addr + kmem_meta_size;
2881 
2882 		meta = (struct kmem_page_meta *) kmem_meta_range[i].min_address;
2883 		kmem_init_meta_front(meta, i, 0);
2884 
2885 		meta = kmem_addr_to_meta(kmem_ranges[i].max_address, i, &addr,
2886 		    &meta_idx);
2887 		kmem_init_meta_front(meta, i, 1);
2888 	}
2889 }
2890 
2891 __startup_func
2892 static void
kmem_init_front_head(struct kmem_sizeclass * ks,uint32_t front)2893 kmem_init_front_head(
2894 	struct kmem_sizeclass  *ks,
2895 	uint32_t                front)
2896 {
2897 	LIST_INIT(&ks->ks_allfree_head[front]);
2898 	LIST_INIT(&ks->ks_partial_head[front]);
2899 	LIST_INIT(&ks->ks_full_head[front]);
2900 }
2901 
2902 __startup_func
2903 static void
kmem_sizeclass_init(void)2904 kmem_sizeclass_init(void)
2905 {
2906 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2907 		struct kmem_sizeclass *ks = &kmem_size_array[i];
2908 		kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
2909 
2910 		ks->ks_size = kmem_get_size_from_idx(i);
2911 		ks->ks_num_chunk = roundup(KMEM_NUM_SLOTS * ks->ks_size,
2912 		    KMEM_CHUNK_SIZE_MIN) / KMEM_CHUNK_SIZE_MIN;
2913 		ks->ks_num_elem = (ks->ks_num_chunk * KMEM_CHUNK_SIZE_MIN) / ks->ks_size;
2914 
2915 		/*
2916 		 * Check that everything fits in the metadata.
2917 		 */
2918 		assert(ks->ks_num_elem <=
2919 		    (sizeof(((struct kmem_page_meta *)0)->km_bitmap) * 8));
2920 		assert(kmem_quarantine_count(ks) - 1 <
2921 		    (1u << KMEM_QUARANTINE_BITS));
2922 		assert(ks->ks_num_elem - kmem_guard_count(ks) <
2923 		    (1u << KMEM_AVAIL_BITS));
2924 
2925 		for (; range_id <= KMEM_RANGE_ID_NUM_PTR; range_id++) {
2926 			kmem_init_front_head(ks, kmem_get_front(range_id, 0));
2927 			kmem_init_front_head(ks, kmem_get_front(range_id, 1));
2928 		}
2929 	}
2930 }
2931 
2932 /*
2933  * This is done during EARLY_BOOT as it needs the corecrypto module to be
2934  * set up.
2935  */
2936 __startup_func
2937 static void
kmem_crypto_init(void)2938 kmem_crypto_init(void)
2939 {
2940 	vm_size_t ctx_size = crypto_random_kmem_ctx_size();
2941 
2942 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2943 		struct kmem_sizeclass *ks = &kmem_size_array[i];
2944 
2945 		ks->ks_rng_ctx = zalloc_percpu_permanent(ctx_size, ZALIGN_PTR);
2946 		zpercpu_foreach(ctx, ks->ks_rng_ctx) {
2947 			crypto_random_kmem_init(ctx);
2948 		}
2949 	}
2950 }
2951 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, kmem_crypto_init);
2952 
2953 __abortlike
2954 static void
kmem_validate_slot_panic(vm_map_offset_t addr,struct kmem_page_meta * meta,uint32_t slot_idx,uint32_t size_idx)2955 kmem_validate_slot_panic(
2956 	vm_map_offset_t         addr,
2957 	struct kmem_page_meta  *meta,
2958 	uint32_t                slot_idx,
2959 	uint32_t                size_idx)
2960 {
2961 	if (meta->km_page_marker != KMEM_META_PRIMARY) {
2962 		panic("Metadata (%p) for addr (%p) not primary", meta, (void *)addr);
2963 	}
2964 	if (meta->km_sizeclass != size_idx) {
2965 		panic("Metadata's (%p) sizeclass (%u != %u) changed during deletion",
2966 		    meta, meta->km_sizeclass, size_idx);
2967 	}
2968 	panic("Double free detected: Slot (%u) in meta (%p) for addr %p marked free",
2969 	    slot_idx, meta, (void *)addr);
2970 }
2971 
2972 __abortlike
2973 static void
kmem_invalid_slot_for_addr(mach_vm_range_t slot,vm_map_offset_t start,vm_map_offset_t end)2974 kmem_invalid_slot_for_addr(
2975 	mach_vm_range_t         slot,
2976 	vm_map_offset_t         start,
2977 	vm_map_offset_t         end)
2978 {
2979 	panic("Invalid kmem ptr slot [%p:%p] for allocation [%p:%p]",
2980 	    (void *)slot->min_address, (void *)slot->max_address,
2981 	    (void *)start, (void *)end);
2982 }
2983 
2984 void
kmem_validate_slot(vm_map_offset_t addr,struct kmem_page_meta * meta,uint32_t size_idx,uint32_t slot_idx)2985 kmem_validate_slot(
2986 	vm_map_offset_t         addr,
2987 	struct kmem_page_meta  *meta,
2988 	uint32_t                size_idx,
2989 	uint32_t                slot_idx)
2990 {
2991 	if ((meta->km_page_marker != KMEM_META_PRIMARY) ||
2992 	    (meta->km_sizeclass != size_idx) ||
2993 	    ((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) != 0)) {
2994 		kmem_validate_slot_panic(addr, meta, size_idx, slot_idx);
2995 	}
2996 }
2997 
2998 static void
kmem_validate_slot_initial(mach_vm_range_t slot,vm_map_offset_t start,vm_map_offset_t end,struct kmem_page_meta * meta,uint32_t size_idx,uint32_t slot_idx)2999 kmem_validate_slot_initial(
3000 	mach_vm_range_t         slot,
3001 	vm_map_offset_t         start,
3002 	vm_map_offset_t         end,
3003 	struct kmem_page_meta  *meta,
3004 	uint32_t                size_idx,
3005 	uint32_t                slot_idx)
3006 {
3007 	if ((slot->min_address == 0) || (slot->max_address == 0) ||
3008 	    (start < slot->min_address) || (start >= slot->max_address) ||
3009 	    (end > slot->max_address)) {
3010 		kmem_invalid_slot_for_addr(slot, start, end);
3011 	}
3012 
3013 	kmem_validate_slot(start, meta, size_idx, slot_idx);
3014 }
3015 
3016 uint32_t
kmem_addr_get_slot_idx(vm_map_offset_t start,vm_map_offset_t end,vm_map_range_id_t range_id,struct kmem_page_meta ** meta,uint32_t * size_idx,mach_vm_range_t slot)3017 kmem_addr_get_slot_idx(
3018 	vm_map_offset_t         start,
3019 	vm_map_offset_t         end,
3020 	vm_map_range_id_t       range_id,
3021 	struct kmem_page_meta **meta,
3022 	uint32_t               *size_idx,
3023 	mach_vm_range_t         slot)
3024 {
3025 	vm_map_offset_t chunk_start;
3026 	vm_map_size_t slot_size;
3027 	uint32_t slot_idx;
3028 
3029 	*meta = kmem_addr_to_meta_start(start, range_id, &chunk_start);
3030 	*size_idx = (*meta)->km_sizeclass;
3031 	slot_size = kmem_get_size_from_idx(*size_idx);
3032 	slot_idx = (start - chunk_start) / slot_size;
3033 	slot->min_address = chunk_start + slot_idx * slot_size;
3034 	slot->max_address = slot->min_address + slot_size;
3035 
3036 	kmem_validate_slot_initial(slot, start, end, *meta, *size_idx, slot_idx);
3037 
3038 	return slot_idx;
3039 }
3040 
3041 static bool
kmem_populate_needed(vm_offset_t from,vm_offset_t to)3042 kmem_populate_needed(vm_offset_t from, vm_offset_t to)
3043 {
3044 #if KASAN
3045 #pragma unused(from, to)
3046 	return true;
3047 #else
3048 	vm_offset_t page_addr = trunc_page(from);
3049 
3050 	for (; page_addr < to; page_addr += PAGE_SIZE) {
3051 		/*
3052 		 * This can race with another thread doing a populate on the same metadata
3053 		 * page, where we see an updated pmap but unmapped KASan shadow, causing a
3054 		 * fault in the shadow when we first access the metadata page. Avoid this
3055 		 * by always synchronizing on the kmem_meta_lock with KASan.
3056 		 */
3057 		if (!pmap_find_phys(kernel_pmap, page_addr)) {
3058 			return true;
3059 		}
3060 	}
3061 
3062 	return false;
3063 #endif /* !KASAN */
3064 }
3065 
3066 static void
kmem_populate_meta_locked(vm_offset_t from,vm_offset_t to)3067 kmem_populate_meta_locked(vm_offset_t from, vm_offset_t to)
3068 {
3069 	vm_offset_t page_addr = trunc_page(from);
3070 
3071 	vmlp_api_start(KMEM_POPULATE_META_LOCKED);
3072 
3073 	vm_map_unlock(kernel_map);
3074 
3075 	vmlp_range_event(kernel_map, from, to - from);
3076 
3077 	for (; page_addr < to; page_addr += PAGE_SIZE) {
3078 		for (;;) {
3079 			kern_return_t ret = KERN_SUCCESS;
3080 
3081 			/*
3082 			 * All updates to kmem metadata are done under the kmem_meta_lock
3083 			 */
3084 			kmem_meta_lock();
3085 			if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
3086 				ret = kernel_memory_populate(page_addr,
3087 				    PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
3088 				    VM_KERN_MEMORY_OSFMK);
3089 			}
3090 			kmem_meta_unlock();
3091 
3092 			if (ret == KERN_SUCCESS) {
3093 				break;
3094 			}
3095 
3096 			/*
3097 			 * We can't pass KMA_NOPAGEWAIT under a global lock as it leads
3098 			 * to bad system deadlocks, so if the allocation failed,
3099 			 * we need to do the VM_PAGE_WAIT() outside of the lock.
3100 			 */
3101 			VM_PAGE_WAIT();
3102 		}
3103 	}
3104 
3105 	vm_map_lock(kernel_map);
3106 	vmlp_api_end(KMEM_POPULATE_META_LOCKED, 0);
3107 }
3108 
3109 __abortlike
3110 static void
kmem_invalid_meta_panic(struct kmem_page_meta * meta,uint32_t slot_idx,struct kmem_sizeclass * sizeclass)3111 kmem_invalid_meta_panic(
3112 	struct kmem_page_meta  *meta,
3113 	uint32_t                slot_idx,
3114 	struct kmem_sizeclass  *sizeclass)
3115 {
3116 	uint32_t size_idx = kmem_get_idx_from_size(sizeclass->ks_size);
3117 
3118 	if (slot_idx >= sizeclass->ks_num_elem) {
3119 		panic("Invalid slot idx %u [0:%u] for meta %p", slot_idx,
3120 		    sizeclass->ks_num_elem, meta);
3121 	}
3122 	if (meta->km_sizeclass != size_idx) {
3123 		panic("Invalid size_idx (%u != %u) in meta %p", size_idx,
3124 		    meta->km_sizeclass, meta);
3125 	}
3126 	panic("page_marker %u not primary in meta %p", meta->km_page_marker, meta);
3127 }
3128 
3129 __abortlike
3130 static void
kmem_slot_has_entry_panic(vm_map_entry_t entry,vm_map_offset_t addr)3131 kmem_slot_has_entry_panic(
3132 	vm_map_entry_t          entry,
3133 	vm_map_offset_t         addr)
3134 {
3135 	panic("Entry (%p) already exists for addr (%p) being returned",
3136 	    entry, (void *)addr);
3137 }
3138 
3139 __abortlike
3140 static void
kmem_slot_not_found(struct kmem_page_meta * meta,uint32_t slot_idx)3141 kmem_slot_not_found(
3142 	struct kmem_page_meta  *meta,
3143 	uint32_t                slot_idx)
3144 {
3145 	panic("%uth free slot not found for meta %p bitmap %u", slot_idx, meta,
3146 	    meta->km_bitmap);
3147 }
3148 
3149 /*
3150  * Returns a 16bit random number between 0 and
3151  * upper_limit (inclusive)
3152  */
3153 __startup_func
3154 uint16_t
kmem_get_random16(uint16_t upper_limit)3155 kmem_get_random16(
3156 	uint16_t                upper_limit)
3157 {
3158 	static uint64_t random_entropy;
3159 	assert(upper_limit < UINT16_MAX);
3160 	if (random_entropy == 0) {
3161 		random_entropy = early_random();
3162 	}
3163 	uint32_t result = random_entropy & UINT32_MAX;
3164 	random_entropy >>= 32;
3165 	return (uint16_t)(result % (upper_limit + 1));
3166 }
3167 
3168 static uint32_t
kmem_get_nth_free_slot(struct kmem_page_meta * meta,uint32_t n,uint32_t bitmap)3169 kmem_get_nth_free_slot(
3170 	struct kmem_page_meta  *meta,
3171 	uint32_t                n,
3172 	uint32_t                bitmap)
3173 {
3174 	uint32_t zeros_seen = 0, ones_seen = 0;
3175 
3176 	while (bitmap) {
3177 		uint32_t count = __builtin_ctz(bitmap);
3178 
3179 		zeros_seen += count;
3180 		bitmap >>= count;
3181 		if (__probable(~bitmap)) {
3182 			count = __builtin_ctz(~bitmap);
3183 		} else {
3184 			count = 32;
3185 		}
3186 		if (count + ones_seen > n) {
3187 			meta->km_avail_count -= 1;
3188 			return zeros_seen + n;
3189 		}
3190 		ones_seen += count;
3191 		bitmap >>= count;
3192 	}
3193 
3194 	kmem_slot_not_found(meta, n);
3195 }
3196 
3197 
3198 static uint32_t
kmem_get_next_slot(struct kmem_page_meta * meta,struct kmem_sizeclass * sizeclass,uint32_t bitmap)3199 kmem_get_next_slot(
3200 	struct kmem_page_meta  *meta,
3201 	struct kmem_sizeclass  *sizeclass,
3202 	uint32_t                bitmap)
3203 {
3204 	uint32_t num_slots = meta->km_avail_count + meta->km_quarantined +
3205 	    kmem_guard_count(sizeclass);
3206 	uint64_t slot_idx = 0;
3207 
3208 	assert(meta->km_avail_count > 0 &&
3209 	    num_slots == __builtin_popcount(meta->km_bitmap));
3210 
3211 	if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
3212 		/*
3213 		 * Use early random prior to early boot as the ks_rng_ctx requires
3214 		 * the corecrypto module to be setup before it is initialized and
3215 		 * used.
3216 		 *
3217 		 * num_slots can't be 0 as we take this path when we have more than
3218 		 * one slot left.
3219 		 */
3220 		slot_idx = kmem_get_random16((uint16_t)num_slots - 1);
3221 	} else {
3222 		crypto_random_uniform(zpercpu_get(sizeclass->ks_rng_ctx),
3223 		    num_slots, &slot_idx);
3224 	}
3225 
3226 	return kmem_get_nth_free_slot(meta, slot_idx, bitmap);
3227 }
3228 
3229 /*
3230  * Returns an unallocated slot from the given metadata
3231  */
3232 static vm_map_offset_t
kmem_get_addr_from_meta(struct kmem_page_meta * meta,vm_map_range_id_t range_id,struct kmem_sizeclass * sizeclass,vm_map_entry_t * entry)3233 kmem_get_addr_from_meta(
3234 	struct kmem_page_meta  *meta,
3235 	vm_map_range_id_t       range_id,
3236 	struct kmem_sizeclass  *sizeclass,
3237 	vm_map_entry_t         *entry)
3238 {
3239 	vm_map_offset_t addr;
3240 	vm_map_size_t size = sizeclass->ks_size;
3241 	uint32_t size_idx = kmem_get_idx_from_size(size);
3242 	uint64_t meta_idx = meta - kmem_meta_base[range_id];
3243 	mach_vm_offset_t range_start = kmem_ranges[range_id].min_address;
3244 	uint32_t slot_bit;
3245 	uint32_t slot_idx = kmem_get_next_slot(meta, sizeclass, meta->km_bitmap);
3246 
3247 	if ((slot_idx >= sizeclass->ks_num_elem) ||
3248 	    (meta->km_sizeclass != size_idx) ||
3249 	    (meta->km_page_marker != KMEM_META_PRIMARY)) {
3250 		kmem_invalid_meta_panic(meta, slot_idx, sizeclass);
3251 	}
3252 
3253 	slot_bit = kmem_slot_idx_to_bit(slot_idx, size_idx);
3254 	meta->km_bitmap &= ~slot_bit;
3255 
3256 	addr = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN) + (slot_idx * size);
3257 	assert(kmem_range_contains_fully(range_id, addr, size));
3258 	if (vm_map_lookup_entry(kernel_map, addr, entry)) {
3259 		kmem_slot_has_entry_panic(*entry, addr);
3260 	}
3261 	if ((*entry != vm_map_to_entry(kernel_map)) &&
3262 	    ((*entry)->vme_next != vm_map_to_entry(kernel_map)) &&
3263 	    ((*entry)->vme_next->vme_start < (addr + size))) {
3264 		kmem_slot_has_entry_panic(*entry, addr);
3265 	}
3266 	return addr;
3267 }
3268 
3269 __abortlike
3270 static void
kmem_range_out_of_va(kmem_range_id_t range_id,uint32_t num_chunks)3271 kmem_range_out_of_va(
3272 	kmem_range_id_t         range_id,
3273 	uint32_t                num_chunks)
3274 {
3275 	panic("No more VA to allocate %u chunks in range %u", num_chunks, range_id);
3276 }
3277 
3278 static void
kmem_init_allocated_chunk(struct kmem_page_meta * meta,struct kmem_sizeclass * sizeclass,uint32_t size_idx)3279 kmem_init_allocated_chunk(
3280 	struct kmem_page_meta  *meta,
3281 	struct kmem_sizeclass  *sizeclass,
3282 	uint32_t                size_idx)
3283 {
3284 	uint32_t meta_num = sizeclass->ks_num_chunk;
3285 	uint32_t num_elem = sizeclass->ks_num_elem;
3286 
3287 	meta->km_bitmap = (1ull << num_elem) - 1;
3288 	meta->km_chunk_len = (uint16_t)meta_num;
3289 	meta->km_avail_count = (uint8_t)(num_elem - kmem_guard_count(sizeclass));
3290 	meta->km_quarantined = 0;
3291 	assert(LIST_NEXT(meta, km_link) == NULL);
3292 	assert(meta->km_link.le_prev == NULL);
3293 	meta->km_sizeclass = (uint8_t)size_idx;
3294 	meta->km_page_marker = KMEM_META_PRIMARY;
3295 	meta++;
3296 	for (uint32_t i = 1; i < meta_num; i++) {
3297 		meta->km_page_idx = (uint16_t)i;
3298 		meta->km_avail_count = 0;
3299 		meta->km_quarantined = 0;
3300 		meta->km_sizeclass = (uint8_t)size_idx;
3301 		meta->km_page_marker = 0;
3302 		meta->km_bitmap = 0;
3303 		meta++;
3304 	}
3305 }
3306 
3307 static uint32_t
kmem_get_additional_meta(struct kmem_page_meta * meta,uint32_t meta_req,bool from_right,struct kmem_page_meta ** adj_free_meta)3308 kmem_get_additional_meta(
3309 	struct kmem_page_meta  *meta,
3310 	uint32_t                meta_req,
3311 	bool                    from_right,
3312 	struct kmem_page_meta **adj_free_meta)
3313 {
3314 	struct kmem_page_meta *meta_prev = from_right ? meta : (meta - 1);
3315 
3316 	if (meta_prev->km_page_marker == KMEM_META_FREE) {
3317 		uint32_t chunk_len = kmem_get_free_chunk_len(meta_prev);
3318 
3319 		*adj_free_meta = from_right ? meta_prev : (meta_prev - chunk_len + 1);
3320 		meta_req -= chunk_len;
3321 	} else {
3322 		*adj_free_meta = NULL;
3323 	}
3324 
3325 	return meta_req;
3326 }
3327 
3328 
3329 static struct kmem_page_meta *
kmem_get_new_chunk(vm_map_range_id_t range_id,bool from_right,uint32_t size_idx,uint32_t front)3330 kmem_get_new_chunk(
3331 	vm_map_range_id_t       range_id,
3332 	bool                    from_right,
3333 	uint32_t                size_idx,
3334 	uint32_t                front)
3335 {
3336 	struct kmem_sizeclass *sizeclass = &kmem_size_array[size_idx];
3337 	struct kmem_page_meta *start, *end, *meta_update;
3338 	struct kmem_page_meta *adj_free_meta = NULL;
3339 	uint32_t meta_req = sizeclass->ks_num_chunk;
3340 
3341 	for (;;) {
3342 		struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3343 		struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3344 		struct kmem_page_meta *meta;
3345 		vm_offset_t start_addr, end_addr;
3346 		uint32_t meta_num;
3347 
3348 		meta = from_right ? metab : metaf;
3349 		meta_num = kmem_get_additional_meta(meta, meta_req, from_right,
3350 		    &adj_free_meta);
3351 
3352 		if (metaf + meta_num >= metab) {
3353 			kmem_range_out_of_va(range_id, meta_num);
3354 		}
3355 
3356 		start = from_right ? (metab - meta_num) : metaf;
3357 		end = from_right ? metab : (metaf + meta_num);
3358 
3359 		start_addr = (vm_offset_t)start;
3360 		end_addr   = (vm_offset_t)end;
3361 
3362 		/*
3363 		 * If the new high watermark stays on the same page,
3364 		 * no need to populate and drop the lock.
3365 		 */
3366 		if (!page_aligned(from_right ? end_addr : start_addr) &&
3367 		    trunc_page(start_addr) == trunc_page(end_addr - 1)) {
3368 			break;
3369 		}
3370 		if (!kmem_populate_needed(start_addr, end_addr)) {
3371 			break;
3372 		}
3373 
3374 		kmem_populate_meta_locked(start_addr, end_addr);
3375 
3376 		/*
3377 		 * Since we dropped the lock, reassess conditions still hold:
3378 		 * - the HWM we are changing must not have moved
3379 		 * - the other HWM must not intersect with ours
3380 		 * - in case of coalescing, the adjacent free meta must still
3381 		 *   be free and of the same size.
3382 		 *
3383 		 * If we failed to grow, reevaluate whether freelists have
3384 		 * entries now by returning NULL.
3385 		 */
3386 		metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3387 		metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3388 		if (meta != (from_right ? metab : metaf)) {
3389 			return NULL;
3390 		}
3391 		if (metaf + meta_num >= metab) {
3392 			kmem_range_out_of_va(range_id, meta_num);
3393 		}
3394 		if (adj_free_meta) {
3395 			if (adj_free_meta->km_page_marker != KMEM_META_FREE ||
3396 			    kmem_get_free_chunk_len(adj_free_meta) !=
3397 			    meta_req - meta_num) {
3398 				return NULL;
3399 			}
3400 		}
3401 
3402 		break;
3403 	}
3404 
3405 	/*
3406 	 * If there is an adjacent free chunk remove it from free list
3407 	 */
3408 	if (adj_free_meta) {
3409 		LIST_REMOVE(adj_free_meta, km_link);
3410 		LIST_NEXT(adj_free_meta, km_link) = NULL;
3411 		adj_free_meta->km_link.le_prev = NULL;
3412 	}
3413 
3414 	/*
3415 	 * Update hwm
3416 	 */
3417 	meta_update = from_right ? start : end;
3418 	kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta_update;
3419 
3420 	/*
3421 	 * Initialize metadata
3422 	 */
3423 	start = from_right ? start : (end - meta_req);
3424 	kmem_init_allocated_chunk(start, sizeclass, size_idx);
3425 	LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], start, km_link);
3426 
3427 	return start;
3428 }
3429 
3430 static void
kmem_requeue_meta(struct kmem_page_meta * meta,struct kmem_list_head * head)3431 kmem_requeue_meta(
3432 	struct kmem_page_meta  *meta,
3433 	struct kmem_list_head  *head)
3434 {
3435 	LIST_REMOVE(meta, km_link);
3436 	LIST_INSERT_HEAD(head, meta, km_link);
3437 }
3438 
3439 /*
3440  * Return corresponding sizeclass to stash free chunks in
3441  */
3442 __abortlike
3443 static void
kmem_invalid_chunk_num(uint32_t chunks)3444 kmem_invalid_chunk_num(uint32_t chunks)
3445 {
3446 	panic("Invalid number of chunks %u\n", chunks);
3447 }
3448 
3449 static uint32_t
kmem_get_size_idx_for_chunks(uint32_t chunks)3450 kmem_get_size_idx_for_chunks(uint32_t chunks)
3451 {
3452 	for (uint32_t i = KMEM_NUM_SIZECLASS - 1; i > 0; i--) {
3453 		if (chunks >= kmem_size_array[i].ks_num_chunk) {
3454 			return i;
3455 		}
3456 	}
3457 	kmem_invalid_chunk_num(chunks);
3458 }
3459 
3460 static void
kmem_clear_meta_range(struct kmem_page_meta * meta,uint32_t count)3461 kmem_clear_meta_range(struct kmem_page_meta *meta, uint32_t count)
3462 {
3463 	bzero(meta, count * sizeof(struct kmem_page_meta));
3464 }
3465 
3466 static void
kmem_check_meta_range_is_clear(struct kmem_page_meta * meta,uint32_t count)3467 kmem_check_meta_range_is_clear(struct kmem_page_meta *meta, uint32_t count)
3468 {
3469 #if MACH_ASSERT
3470 	size_t size = count * sizeof(struct kmem_page_meta);
3471 
3472 	assert(memcmp_zero_ptr_aligned(meta, size) == 0);
3473 #else
3474 #pragma unused(meta, count)
3475 #endif
3476 }
3477 
3478 /*!
3479  * @function kmem_init_free_chunk()
3480  *
3481  * @discussion
3482  * This function prepares a range of chunks to be put on a free list.
3483  * The first and last metadata might be dirty, but the "inner" ones
3484  * must be zero filled by the caller prior to calling this function.
3485  */
3486 static void
kmem_init_free_chunk(struct kmem_page_meta * meta,uint32_t num_chunks,uint32_t front)3487 kmem_init_free_chunk(
3488 	struct kmem_page_meta  *meta,
3489 	uint32_t                num_chunks,
3490 	uint32_t                front)
3491 {
3492 	struct kmem_sizeclass *sizeclass;
3493 	uint32_t size_idx = kmem_get_size_idx_for_chunks(num_chunks);
3494 
3495 	if (num_chunks > 2) {
3496 		kmem_check_meta_range_is_clear(meta + 1, num_chunks - 2);
3497 	}
3498 
3499 	meta[0] = (struct kmem_page_meta){
3500 		.km_free_chunks = num_chunks,
3501 		.km_page_marker = KMEM_META_FREE,
3502 		.km_sizeclass   = (uint8_t)size_idx,
3503 	};
3504 	if (num_chunks > 1) {
3505 		meta[num_chunks - 1] = (struct kmem_page_meta){
3506 			.km_free_chunks = num_chunks,
3507 			.km_page_marker = KMEM_META_FREE,
3508 			.km_sizeclass   = (uint8_t)size_idx,
3509 		};
3510 	}
3511 
3512 	sizeclass = &kmem_size_array[size_idx];
3513 	LIST_INSERT_HEAD(&sizeclass->ks_allfree_head[front], meta, km_link);
3514 }
3515 
3516 static struct kmem_page_meta *
kmem_get_free_chunk_from_list(struct kmem_sizeclass * org_sizeclass,uint32_t size_idx,uint32_t front)3517 kmem_get_free_chunk_from_list(
3518 	struct kmem_sizeclass  *org_sizeclass,
3519 	uint32_t                size_idx,
3520 	uint32_t                front)
3521 {
3522 	struct kmem_sizeclass *sizeclass;
3523 	uint32_t num_chunks = org_sizeclass->ks_num_chunk;
3524 	struct kmem_page_meta *meta;
3525 	uint32_t idx = size_idx;
3526 
3527 	while (idx < KMEM_NUM_SIZECLASS) {
3528 		sizeclass = &kmem_size_array[idx];
3529 		meta = LIST_FIRST(&sizeclass->ks_allfree_head[front]);
3530 		if (meta) {
3531 			break;
3532 		}
3533 		idx++;
3534 	}
3535 
3536 	/*
3537 	 * Trim if larger in size
3538 	 */
3539 	if (meta) {
3540 		uint32_t num_chunks_free = kmem_get_free_chunk_len(meta);
3541 
3542 		assert(meta->km_page_marker == KMEM_META_FREE);
3543 		LIST_REMOVE(meta, km_link);
3544 		LIST_NEXT(meta, km_link) = NULL;
3545 		meta->km_link.le_prev = NULL;
3546 		if (num_chunks_free > num_chunks) {
3547 			num_chunks_free -= num_chunks;
3548 			kmem_init_free_chunk(meta + num_chunks, num_chunks_free, front);
3549 		}
3550 
3551 		kmem_init_allocated_chunk(meta, org_sizeclass, size_idx);
3552 		LIST_INSERT_HEAD(&org_sizeclass->ks_partial_head[front], meta, km_link);
3553 	}
3554 
3555 	return meta;
3556 }
3557 
3558 kern_return_t
kmem_locate_space(vm_map_size_t size,vm_map_range_id_t range_id,bool from_right,vm_map_offset_t * start_inout,vm_map_entry_t * entry_out)3559 kmem_locate_space(
3560 	vm_map_size_t           size,
3561 	vm_map_range_id_t       range_id,
3562 	bool                    from_right,
3563 	vm_map_offset_t        *start_inout,
3564 	vm_map_entry_t         *entry_out)
3565 {
3566 	vm_map_entry_t entry;
3567 	uint32_t size_idx = kmem_get_idx_from_size(size);
3568 	uint32_t front = kmem_get_front(range_id, from_right);
3569 	struct kmem_sizeclass *sizeclass = &kmem_size_array[size_idx];
3570 	struct kmem_page_meta *meta;
3571 
3572 	assert(size <= sizeclass->ks_size);
3573 
3574 	do {
3575 		/*
3576 		 * Attempt to find space trying:
3577 		 * 1. partial heads;
3578 		 * 2. free chunks in the segregated free-lists;
3579 		 * 3. extending the metadata range.
3580 		 */
3581 		meta = LIST_FIRST(&sizeclass->ks_partial_head[front]) ?:
3582 		    kmem_get_free_chunk_from_list(sizeclass, size_idx, front) ?:
3583 		    kmem_get_new_chunk(range_id, from_right, size_idx, front);
3584 	} while (meta == NULL);
3585 
3586 	*start_inout = kmem_get_addr_from_meta(meta, range_id, sizeclass, &entry);
3587 
3588 	if (meta->km_avail_count == 0) {
3589 		kmem_requeue_meta(meta, &sizeclass->ks_full_head[front]);
3590 	}
3591 	if (entry_out) {
3592 		*entry_out = entry;
3593 	}
3594 
3595 	return KERN_SUCCESS;
3596 }
3597 
3598 /*
3599  * Determine whether the given metadata was allocated from the right
3600  */
3601 static bool
kmem_meta_is_from_right(kmem_range_id_t range_id,struct kmem_page_meta * meta)3602 kmem_meta_is_from_right(
3603 	kmem_range_id_t         range_id,
3604 	struct kmem_page_meta  *meta)
3605 {
3606 	struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3607 	__assert_only struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3608 	struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
3609 	struct kmem_page_meta *meta_end;
3610 
3611 	meta_end = (struct kmem_page_meta *)kmem_meta_range[range_id].max_address;
3612 
3613 	if ((meta >= meta_base) && (meta < metaf)) {
3614 		return false;
3615 	}
3616 
3617 	assert(meta >= metab && meta < meta_end);
3618 	return true;
3619 }
3620 
3621 static void
kmem_free_chunk(kmem_range_id_t range_id,struct kmem_page_meta * meta,bool from_right)3622 kmem_free_chunk(
3623 	kmem_range_id_t         range_id,
3624 	struct kmem_page_meta  *meta,
3625 	bool                    from_right)
3626 {
3627 	struct kmem_page_meta *meta_coalesce = meta - 1;
3628 	struct kmem_page_meta *meta_start = meta;
3629 	uint32_t num_chunks = kmem_get_chunk_len(meta);
3630 	uint32_t add_chunks;
3631 	struct kmem_page_meta *meta_end = meta + num_chunks;
3632 	struct kmem_page_meta *meta_hwm_l, *meta_hwm_r;
3633 	uint32_t front = kmem_get_front(range_id, from_right);
3634 
3635 	meta_hwm_l = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3636 	meta_hwm_r = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3637 
3638 	LIST_REMOVE(meta, km_link);
3639 	kmem_clear_meta_range(meta, num_chunks);
3640 
3641 	/*
3642 	 * Coalesce left
3643 	 */
3644 	if (((from_right && (meta_coalesce >= meta_hwm_r)) || !from_right) &&
3645 	    (meta_coalesce->km_page_marker == KMEM_META_FREE)) {
3646 		meta_start = meta_coalesce - kmem_get_free_chunk_len(meta_coalesce) + 1;
3647 		add_chunks = kmem_get_free_chunk_len(meta_start);
3648 		num_chunks += add_chunks;
3649 		LIST_REMOVE(meta_start, km_link);
3650 		kmem_clear_meta_range(meta_start + add_chunks - 1, 1);
3651 	}
3652 
3653 	/*
3654 	 * Coalesce right
3655 	 */
3656 	if (((!from_right && (meta_end < meta_hwm_l)) || from_right) &&
3657 	    (meta_end->km_page_marker == KMEM_META_FREE)) {
3658 		add_chunks = kmem_get_free_chunk_len(meta_end);
3659 		LIST_REMOVE(meta_end, km_link);
3660 		kmem_clear_meta_range(meta_end, 1);
3661 		meta_end = meta_end + add_chunks;
3662 		num_chunks += add_chunks;
3663 	}
3664 
3665 	kmem_init_free_chunk(meta_start, num_chunks, front);
3666 }
3667 
3668 static void
kmem_free_slot(kmem_range_id_t range_id,mach_vm_range_t slot)3669 kmem_free_slot(
3670 	kmem_range_id_t         range_id,
3671 	mach_vm_range_t         slot)
3672 {
3673 	struct kmem_page_meta *meta;
3674 	vm_map_offset_t chunk_start;
3675 	uint32_t size_idx, slot_idx;
3676 	struct kmem_sizeclass *sizeclass;
3677 	vm_map_size_t slot_size;
3678 
3679 	meta = kmem_addr_to_meta_start(slot->min_address, range_id, &chunk_start);
3680 	size_idx = meta->km_sizeclass;
3681 
3682 	sizeclass = &kmem_size_array[size_idx];
3683 	slot_size = kmem_get_size_from_idx(size_idx);
3684 	slot_idx = (slot->min_address - chunk_start) / slot_size;
3685 	assert((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) == 0);
3686 	meta->km_bitmap |= kmem_slot_idx_to_bit(slot_idx, size_idx);
3687 
3688 	if (meta->km_bitmap == ((1u << sizeclass->ks_num_elem) - 1)) {
3689 		/*
3690 		 * If entire chunk empty add to emtpy list
3691 		 */
3692 		bool from_right = kmem_meta_is_from_right(range_id, meta);
3693 
3694 		kmem_free_chunk(range_id, meta, from_right);
3695 	} else if (meta->km_avail_count + meta->km_quarantined + 1 <
3696 	    kmem_quarantine_count(sizeclass)) {
3697 		/*
3698 		 * If we're below quarantine levels, quarantine the slot
3699 		 * and move on.
3700 		 */
3701 		meta->km_quarantined += 1;
3702 	} else {
3703 		/*
3704 		 * If we freed to full chunk move it to partial
3705 		 */
3706 		if (meta->km_avail_count == 0) {
3707 			uint32_t front = kmem_get_front(range_id,
3708 			    kmem_meta_is_from_right(range_id, meta));
3709 
3710 			kmem_requeue_meta(meta, &sizeclass->ks_partial_head[front]);
3711 		}
3712 
3713 		meta->km_avail_count += meta->km_quarantined + 1;
3714 		meta->km_quarantined = 0;
3715 	}
3716 }
3717 
3718 void
kmem_free_space(vm_map_offset_t start,vm_map_offset_t end,vm_map_range_id_t range_id,mach_vm_range_t slot)3719 kmem_free_space(
3720 	vm_map_offset_t         start,
3721 	vm_map_offset_t         end,
3722 	vm_map_range_id_t       range_id,
3723 	mach_vm_range_t         slot)
3724 {
3725 	bool entry_present = false;
3726 	vm_map_entry_t prev_entry;
3727 	vm_map_entry_t next_entry;
3728 
3729 	if ((slot->min_address == start) && (slot->max_address == end)) {
3730 		/*
3731 		 * Entire slot is being freed at once
3732 		 */
3733 		return kmem_free_slot(range_id, slot);
3734 	}
3735 
3736 	entry_present = vm_map_lookup_entry(kernel_map, start, &prev_entry);
3737 	assert(!entry_present);
3738 	next_entry = prev_entry->vme_next;
3739 
3740 	if (((prev_entry == vm_map_to_entry(kernel_map) ||
3741 	    prev_entry->vme_end <= slot->min_address)) &&
3742 	    (next_entry == vm_map_to_entry(kernel_map) ||
3743 	    (next_entry->vme_start >= slot->max_address))) {
3744 		/*
3745 		 * Free entire slot
3746 		 */
3747 		kmem_free_slot(range_id, slot);
3748 	}
3749 }
3750 
3751 #pragma mark kmem init
3752 
3753 /*
3754  * The default percentage of memory that can be mlocked is scaled based on the total
3755  * amount of memory in the system. These percentages are caclulated
3756  * offline and stored in this table. We index this table by
3757  * log2(max_mem) - VM_USER_WIREABLE_MIN_CONFIG. We clamp this index in the range
3758  * [0, sizeof(wire_limit_percents) / sizeof(vm_map_size_t))
3759  *
3760  * Note that these values were picked for mac.
3761  * If we ever have very large memory config arm devices, we may want to revisit
3762  * since the kernel overhead is smaller there due to the larger page size.
3763  */
3764 
3765 /* Start scaling iff we're managing > 2^32 = 4GB of RAM. */
3766 #define VM_USER_WIREABLE_MIN_CONFIG 32
3767 #if CONFIG_JETSAM
3768 /* Systems with jetsam can wire a bit more b/c the system can relieve wired
3769  * pressure.
3770  */
3771 static vm_map_size_t wire_limit_percents[] =
3772 { 80, 80, 80, 80, 82, 85, 88, 91, 94, 97};
3773 #else
3774 static vm_map_size_t wire_limit_percents[] =
3775 { 70, 73, 76, 79, 82, 85, 88, 91, 94, 97};
3776 #endif /* CONFIG_JETSAM */
3777 
3778 /* Set limit to 95% of DRAM if serverperfmode=1 */
3779 #define VM_USER_SERVERPERF_WIRE_LIMIT_PERCENT 95
3780 /* Use special serverperfmode behavior iff DRAM > 2^35 = 32GiB of RAM. */
3781 #define VM_USER_SERVERPERF_WIREABLE_MIN_CONFIG 35
3782 
3783 /*
3784  * Sets the default global user wire limit which limits the amount of
3785  * memory that can be locked via mlock() based on the above algorithm..
3786  * This can be overridden via a sysctl.
3787  */
3788 static void
kmem_set_user_wire_limits(void)3789 kmem_set_user_wire_limits(void)
3790 {
3791 	uint64_t available_mem_log;
3792 	uint64_t max_wire_percent;
3793 	size_t wire_limit_percents_length = sizeof(wire_limit_percents) /
3794 	    sizeof(vm_map_size_t);
3795 	vm_map_size_t limit;
3796 	uint64_t config_memsize = max_mem;
3797 #if defined(XNU_TARGET_OS_OSX)
3798 	config_memsize = max_mem_actual;
3799 #endif /* defined(XNU_TARGET_OS_OSX) */
3800 
3801 	available_mem_log = bit_floor(config_memsize);
3802 
3803 	if (serverperfmode &&
3804 	    (available_mem_log >= VM_USER_SERVERPERF_WIREABLE_MIN_CONFIG)) {
3805 		max_wire_percent = VM_USER_SERVERPERF_WIRE_LIMIT_PERCENT;
3806 	} else {
3807 		if (available_mem_log < VM_USER_WIREABLE_MIN_CONFIG) {
3808 			available_mem_log = 0;
3809 		} else {
3810 			available_mem_log -= VM_USER_WIREABLE_MIN_CONFIG;
3811 		}
3812 		if (available_mem_log >= wire_limit_percents_length) {
3813 			available_mem_log = wire_limit_percents_length - 1;
3814 		}
3815 		max_wire_percent = wire_limit_percents[available_mem_log];
3816 	}
3817 
3818 	limit = config_memsize * max_wire_percent / 100;
3819 	/* Cap the number of non lockable bytes at VM_NOT_USER_WIREABLE_MAX */
3820 	if (config_memsize - limit > VM_NOT_USER_WIREABLE_MAX) {
3821 		limit = config_memsize - VM_NOT_USER_WIREABLE_MAX;
3822 	}
3823 
3824 	vm_global_user_wire_limit = limit;
3825 	/* the default per task limit is the same as the global limit */
3826 	vm_per_task_user_wire_limit = limit;
3827 	vm_add_wire_count_over_global_limit = 0;
3828 	vm_add_wire_count_over_user_limit = 0;
3829 }
3830 
3831 #define KMEM_MAX_CLAIMS 50
3832 __startup_data
3833 struct kmem_range_startup_spec kmem_claims[KMEM_MAX_CLAIMS] = {};
3834 
3835 #if !MACH_ASSERT
3836 __startup_data
3837 #endif /* !MACH_ASSERT */
3838 uint32_t kmem_claim_count = 0;
3839 
3840 #if MACH_ASSERT
3841 /**
3842  * Save off some minimal information about the ranges for consumption by
3843  * post-lockdown tests.
3844  */
3845 static struct mach_vm_range kmem_test_saved_ranges[KMEM_MAX_CLAIMS];
3846 #endif /* MACH_ASSERT */
3847 
3848 /**
3849  * For a requested claim size (i.e. kc_size), get the number of bytes which
3850  * should actually be allocated for a region in order to be able to properly
3851  * provide the requested size (the allocation size).
3852  *
3853  * This allocation size is always greater or equal to the claim size. It can,
3854  * for example, include additional space as required by the kernel memory
3855  * configuration.
3856  *
3857  * @param known_last Is the claim in question known to be the last region after
3858  * all placing has completed? The size for a known_last allocation is always
3859  * less than or equal to a non-known_last allocation of the same size.
3860  */
3861 __startup_func
3862 static vm_map_size_t
kmem_claim_to_allocation_size(vm_map_size_t claim_size,bool known_last)3863 kmem_claim_to_allocation_size(vm_map_size_t claim_size, bool known_last)
3864 {
3865 	(void)known_last;
3866 	/*
3867 	 * Allocation size and claim size are identical.
3868 	 */
3869 	return claim_size;
3870 }
3871 
3872 /**
3873  * Compute the largest claim which can be made from a given allocation size.
3874  */
3875 static vm_map_size_t
kmem_allocation_to_claim_size(vm_map_size_t allocation_size)3876 kmem_allocation_to_claim_size(vm_map_size_t allocation_size)
3877 {
3878 	/*
3879 	 * Allocation size and claim size are identical.
3880 	 */
3881 	return allocation_size;
3882 }
3883 
3884 __startup_func
3885 void
kmem_range_startup_init(struct kmem_range_startup_spec * sp)3886 kmem_range_startup_init(
3887 	struct kmem_range_startup_spec *sp)
3888 {
3889 	assert(kmem_claim_count < KMEM_MAX_CLAIMS - KMEM_RANGE_COUNT);
3890 	if (sp->kc_calculate_sz) {
3891 		sp->kc_size = (sp->kc_calculate_sz)();
3892 	}
3893 	if (sp->kc_size) {
3894 		kmem_claims[kmem_claim_count] = *sp;
3895 		kmem_claim_count++;
3896 	}
3897 }
3898 
3899 static vm_offset_t
kmem_fuzz_start(void)3900 kmem_fuzz_start(void)
3901 {
3902 	vm_offset_t kmapoff_kaddr = 0;
3903 	uint32_t kmapoff_pgcnt;
3904 
3905 	kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */
3906 
3907 	vm_map_size_t kmapoff_size = ptoa(kmapoff_pgcnt);
3908 
3909 	kmem_alloc(kernel_map, &kmapoff_kaddr, kmapoff_size,
3910 	    KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY,
3911 	    VM_KERN_MEMORY_OSFMK);
3912 
3913 
3914 	return kmapoff_kaddr + kmapoff_size;
3915 }
3916 
3917 /*
3918  * Generate a randomly shuffled array of indices from 0 to count - 1
3919  */
3920 __startup_func
3921 void
kmem_shuffle(uint16_t * shuffle_buf,uint16_t count)3922 kmem_shuffle(
3923 	uint16_t       *shuffle_buf,
3924 	uint16_t        count)
3925 {
3926 	for (uint16_t i = 0; i < count; i++) {
3927 		uint16_t j = kmem_get_random16(i);
3928 		if (j != i) {
3929 			shuffle_buf[i] = shuffle_buf[j];
3930 		}
3931 		shuffle_buf[j] = i;
3932 	}
3933 }
3934 
3935 __startup_func
3936 static void
kmem_shuffle_claims(void)3937 kmem_shuffle_claims(void)
3938 {
3939 	uint16_t shuffle_buf[KMEM_MAX_CLAIMS] = {};
3940 	uint16_t limit = (uint16_t)kmem_claim_count;
3941 
3942 	kmem_shuffle(&shuffle_buf[0], limit);
3943 	for (uint16_t i = 0; i < limit; i++) {
3944 		struct kmem_range_startup_spec tmp = kmem_claims[i];
3945 		kmem_claims[i] = kmem_claims[shuffle_buf[i]];
3946 		kmem_claims[shuffle_buf[i]] = tmp;
3947 	}
3948 }
3949 
3950 __startup_func
3951 static void
kmem_readjust_ranges(uint32_t cur_idx)3952 kmem_readjust_ranges(
3953 	uint32_t        cur_idx)
3954 {
3955 	assert(cur_idx != 0);
3956 	uint32_t j = cur_idx - 1, random;
3957 	struct kmem_range_startup_spec sp = kmem_claims[cur_idx];
3958 	struct mach_vm_range *sp_range = sp.kc_range;
3959 	/*
3960 	 * Even if sp is currently last, it will never be last after it is moved.
3961 	 * As such, we want to bump other claims over it and include any necessary
3962 	 * padding for a non-last claim.
3963 	 *
3964 	 * While changing which claim is last can impact the total VA usage, since a
3965 	 * known_last allocation size is guaranteed to always be less-than-or-equal
3966 	 * to a non-known_last allocation (which is used for pre-placement sizing),
3967 	 * we will always have enough space so long as the pre-placement sizing had
3968 	 * enough space.
3969 	 */
3970 	vm_map_offset_t sp_allocation_size =
3971 	    kmem_claim_to_allocation_size(sp.kc_size, /* known_last */ false);
3972 
3973 	/*
3974 	 * Find max index where restriction is met
3975 	 */
3976 	for (; j > 0; j--) {
3977 		struct kmem_range_startup_spec spj = kmem_claims[j];
3978 		vm_map_offset_t max_start = spj.kc_range->min_address;
3979 		if (spj.kc_flags & KC_NO_MOVE) {
3980 			panic("kmem_range_init: Can't scramble with multiple constraints");
3981 		}
3982 		if (max_start <= sp_range->min_address) {
3983 			break;
3984 		}
3985 	}
3986 
3987 	/*
3988 	 * Pick a random index from 0 to max index and shift claims to the right
3989 	 * to make room for restricted claim
3990 	 */
3991 	random = kmem_get_random16((uint16_t)j);
3992 	assert(random <= j);
3993 
3994 	sp_range->min_address = kmem_claims[random].kc_range->min_address;
3995 	sp_range->max_address = sp_range->min_address + sp.kc_size;
3996 
3997 	for (j = cur_idx - 1; j >= random && j != UINT32_MAX; j--) {
3998 		struct kmem_range_startup_spec spj = kmem_claims[j];
3999 		struct mach_vm_range *range = spj.kc_range;
4000 		range->min_address += sp_allocation_size;
4001 		range->max_address += sp_allocation_size;
4002 		kmem_claims[j + 1] = spj;
4003 	}
4004 
4005 	sp.kc_flags |= KC_NO_MOVE;
4006 	kmem_claims[random] = sp;
4007 }
4008 
4009 __startup_func
4010 static void
kmem_add_ptr_claims(void)4011 kmem_add_ptr_claims(void)
4012 {
4013 	uint64_t kmem_meta_num, kmem_ptr_chunks;
4014 	vm_map_size_t org_ptr_range_size __assert_only;
4015 
4016 	org_ptr_range_size = ptr_range_size;
4017 
4018 	ptr_range_size -= PAGE_SIZE;
4019 	ptr_range_size *= KMEM_CHUNK_SIZE_MIN;
4020 	ptr_range_size /= (KMEM_CHUNK_SIZE_MIN + sizeof(struct kmem_page_meta));
4021 
4022 	kmem_ptr_chunks = ptr_range_size / KMEM_CHUNK_SIZE_MIN;
4023 	ptr_range_size = kmem_ptr_chunks * KMEM_CHUNK_SIZE_MIN;
4024 
4025 	kmem_meta_num = kmem_ptr_chunks + 2;
4026 	kmem_meta_size = round_page(kmem_meta_num * sizeof(struct kmem_page_meta));
4027 
4028 	assert(kmem_meta_size + ptr_range_size <= org_ptr_range_size);
4029 	/*
4030 	 * Add claims for kmem's ranges
4031 	 */
4032 	for (uint32_t i = 0; i < kmem_ptr_ranges; i++) {
4033 		struct kmem_range_startup_spec kmem_spec = {
4034 			.kc_name = "kmem_ptr_range",
4035 			.kc_range = &kmem_ranges[KMEM_RANGE_ID_PTR_0 + i],
4036 			.kc_size = ptr_range_size,
4037 			.kc_flags = KC_NO_ENTRY,
4038 		};
4039 		kmem_claims[kmem_claim_count++] = kmem_spec;
4040 
4041 		struct kmem_range_startup_spec kmem_meta_spec = {
4042 			.kc_name = "kmem_ptr_range_meta",
4043 			.kc_range = &kmem_meta_range[KMEM_RANGE_ID_PTR_0 + i],
4044 			.kc_size = kmem_meta_size,
4045 			.kc_flags = KC_NONE,
4046 		};
4047 		kmem_claims[kmem_claim_count++] = kmem_meta_spec;
4048 	}
4049 }
4050 
4051 __startup_func
4052 static void
kmem_add_extra_claims(void)4053 kmem_add_extra_claims(void)
4054 {
4055 	vm_map_size_t largest_free_size = 0, total_claims = 0;
4056 	vm_map_size_t sane_sprayqtn_size = 0, sprayqtn_allocation_size = 0;
4057 	vm_map_size_t ptr_total_allocation_size = 0;
4058 
4059 	vm_map_sizes(kernel_map, NULL, NULL, &largest_free_size);
4060 	largest_free_size = trunc_page(largest_free_size);
4061 
4062 	/*
4063 	 * kasan and configs w/o *TRR need to have just one ptr range due to
4064 	 * resource constraints.
4065 	 */
4066 #if !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
4067 	kmem_ptr_ranges = 1;
4068 #endif
4069 	/*
4070 	 * Determine size of data and pointer kmem_ranges
4071 	 */
4072 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
4073 		struct kmem_range_startup_spec sp_i = kmem_claims[i];
4074 
4075 		total_claims += kmem_claim_to_allocation_size(
4076 			sp_i.kc_size, /* known_last */ false);
4077 	}
4078 	assert((total_claims & PAGE_MASK) == 0);
4079 
4080 
4081 	largest_free_size -= total_claims;
4082 
4083 	/*
4084 	 * Use half the total available VA for all pointer allocations (this
4085 	 * includes the kmem_sprayqtn range). Given that we have 4 total
4086 	 * ranges divide the available VA by 8.
4087 	 */
4088 	ptr_range_size = largest_free_size / ((kmem_ptr_ranges + 1) * 2);
4089 
4090 	sprayqtn_range_size = ptr_range_size;
4091 	sane_sprayqtn_size = kmem_claim_to_allocation_size(
4092 		/* claim_size */ sane_size / 2, /* known_last */ false);
4093 	if (sprayqtn_range_size > sane_sprayqtn_size) {
4094 		vm_map_size_t sprayqtn_extra;
4095 
4096 		/*
4097 		 * Spray quarantine doesn't need that much space.
4098 		 * Shrink it to something reasonable and equally share the leftover VA
4099 		 * with the other pointer ranges.
4100 		 */
4101 		sprayqtn_extra = sprayqtn_range_size - sane_sprayqtn_size;
4102 		sprayqtn_range_size -= sprayqtn_extra;
4103 		ptr_range_size += sprayqtn_extra / kmem_ptr_ranges;
4104 	}
4105 
4106 	ptr_range_size = round_page(ptr_range_size);
4107 	sprayqtn_range_size = round_page(sprayqtn_range_size);
4108 
4109 	/* Less any necessary allocation padding... */
4110 	ptr_range_size = kmem_allocation_to_claim_size(ptr_range_size);
4111 	sprayqtn_range_size = kmem_allocation_to_claim_size(sprayqtn_range_size);
4112 
4113 	/*
4114 	 * Add the pointer and metadata claims
4115 	 * Note: this call modifies ptr_range_size and may, depending on the padding
4116 	 * requirements, slightly increase or decrease the overall allocation size
4117 	 * of the pointer+metadata region.
4118 	 */
4119 	kmem_add_ptr_claims();
4120 
4121 	sprayqtn_allocation_size = kmem_claim_to_allocation_size(
4122 		sprayqtn_range_size, /* known_last */ false);
4123 	ptr_total_allocation_size =
4124 	    (kmem_claim_to_allocation_size(ptr_range_size, /* known_last */ false) +
4125 	    kmem_claim_to_allocation_size(kmem_meta_size, /* known_last */ false)) *
4126 	    kmem_ptr_ranges;
4127 
4128 	/*
4129 	 * Check: spray and ptr_range are minimally valid.
4130 	 * This is a useful assert as it should catch us if we were to end up with a
4131 	 * "negative" (or extremely large) data_range_size.
4132 	 */
4133 	assert(sprayqtn_allocation_size + ptr_total_allocation_size < largest_free_size);
4134 
4135 	/*
4136 	 * Finally, give any remaining allocable space to the data region.
4137 	 */
4138 	data_range_size = largest_free_size - sprayqtn_allocation_size -
4139 	    ptr_total_allocation_size;
4140 
4141 	/*
4142 	 * If we need the data shared range, divide the size
4143 	 * for the data ranges between BUFFERS and SHARED.
4144 	 *
4145 	 * If not, all data allocations go into KMEM_RANGE_ID_DATA.
4146 	 */
4147 	if (kmem_needs_data_share_range()) {
4148 		/*
4149 		 * Round down the size, because our kmem ranges logic round
4150 		 * these sizes to page size, and we need to make sure we never
4151 		 * exceed the remaining allocable space we divided.
4152 		 */
4153 		shared_data_range_size = data_range_size =
4154 		    trunc_page(data_range_size / 2);
4155 	} else {
4156 		shared_data_range_size = 0;
4157 	}
4158 
4159 	/* Less any necessary allocation padding... */
4160 	data_range_size = kmem_allocation_to_claim_size(data_range_size);
4161 	shared_data_range_size = shared_data_range_size ?
4162 	    kmem_allocation_to_claim_size(shared_data_range_size) : 0;
4163 
4164 	/* Check: our allocations should all still fit in the free space */
4165 	assert(sprayqtn_allocation_size + ptr_total_allocation_size +
4166 	    kmem_claim_to_allocation_size(data_range_size, /* known_last */ false) +
4167 	    kmem_claim_to_allocation_size(shared_data_range_size, /* known_last */ false) <=
4168 	    largest_free_size);
4169 
4170 	struct kmem_range_startup_spec kmem_spec_sprayqtn = {
4171 		.kc_name = "kmem_sprayqtn_range",
4172 		.kc_range = &kmem_ranges[KMEM_RANGE_ID_SPRAYQTN],
4173 		.kc_size = sprayqtn_range_size,
4174 		.kc_flags = KC_NO_ENTRY,
4175 	};
4176 	kmem_claims[kmem_claim_count++] = kmem_spec_sprayqtn;
4177 
4178 	struct kmem_range_startup_spec kmem_spec_data_buffers = {
4179 		.kc_name = "kmem_data_buffers_range",
4180 		.kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA],
4181 		.kc_size = data_range_size,
4182 		.kc_flags = KC_NO_ENTRY,
4183 	};
4184 	kmem_claims[kmem_claim_count++] = kmem_spec_data_buffers;
4185 
4186 	if (kmem_needs_data_share_range()) {
4187 		struct kmem_range_startup_spec kmem_spec_data_shared = {
4188 			.kc_name = "kmem_data_shared_range",
4189 			.kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA_SHARED],
4190 			.kc_size = shared_data_range_size,
4191 			.kc_flags = KC_NO_ENTRY,
4192 		};
4193 		kmem_claims[kmem_claim_count++] = kmem_spec_data_shared;
4194 	}
4195 }
4196 
4197 __startup_func
4198 static void
kmem_scramble_ranges(void)4199 kmem_scramble_ranges(void)
4200 {
4201 	vm_map_offset_t va_alloc_head = 0;
4202 
4203 	/*
4204 	 * Initiatize KMEM_RANGE_ID_NONE range to use the entire map so that
4205 	 * the vm can find the requested ranges.
4206 	 */
4207 	kmem_ranges[KMEM_RANGE_ID_NONE].min_address = MAX(kernel_map->min_offset,
4208 	    VM_MAP_PAGE_SIZE(kernel_map));
4209 	kmem_ranges[KMEM_RANGE_ID_NONE].max_address = kernel_map->max_offset;
4210 
4211 	/*
4212 	 * Allocating the g_kext_map prior to randomizing the remaining submaps as
4213 	 * this map is 2G in size and starts at the end of kernel_text on x86. It
4214 	 * could overflow into the heap.
4215 	 */
4216 	kext_alloc_init();
4217 
4218 	/*
4219 	 * Eat a random amount of kernel_map to fuzz subsequent heap, zone and
4220 	 * stack addresses. (With a 4K page and 9 bits of randomness, this
4221 	 * eats about 2M of VA from the map)
4222 	 *
4223 	 * Note that we always need to slide by at least one page because the VM
4224 	 * pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a base
4225 	 * do not admit this address to be part of any zone submap.
4226 	 */
4227 	va_alloc_head = kmem_fuzz_start();
4228 
4229 	/*
4230 	 * Add claims for ptr and data kmem_ranges
4231 	 */
4232 	kmem_add_extra_claims();
4233 
4234 	/*
4235 	 * Minimally verify that our placer will be able to resolve the constraints
4236 	 * of all claims
4237 	 */
4238 	bool has_min_address = false;
4239 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
4240 		struct kmem_range_startup_spec sp_i = kmem_claims[i];
4241 
4242 		/* Verify that we have only one claim with a min address constraint */
4243 		if (sp_i.kc_range->min_address) {
4244 			if (has_min_address) {
4245 				panic("Cannot place with multiple min_address constraints");
4246 			} else {
4247 				has_min_address = true;
4248 			}
4249 		}
4250 
4251 		if (sp_i.kc_range->max_address) {
4252 			panic("Cannot place with a max_address constraint");
4253 		}
4254 	}
4255 
4256 
4257 	/*
4258 	 * Shuffle registered claims
4259 	 */
4260 	assert(kmem_claim_count < UINT16_MAX);
4261 	kmem_shuffle_claims();
4262 
4263 	/*
4264 	 * Apply restrictions and determine range for each claim
4265 	 */
4266 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
4267 		struct kmem_range_startup_spec sp = kmem_claims[i];
4268 		struct mach_vm_range *sp_range = sp.kc_range;
4269 
4270 		/*
4271 		 * Find space using the allocation size (rather than the claim size) in
4272 		 * order to ensure we provide any applicable padding.
4273 		 */
4274 		bool is_last = (i == kmem_claim_count - 1);
4275 		vm_map_offset_t sp_allocation_size =
4276 		    kmem_claim_to_allocation_size(sp.kc_size, is_last);
4277 
4278 		if (vm_map_locate_space_anywhere(kernel_map, sp_allocation_size, 0,
4279 		    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vmkf_no_soft_limit = true),
4280 		    &va_alloc_head, NULL) != KERN_SUCCESS) {
4281 			panic("kmem_range_init: vm_map_locate_space failing for claim %s, "
4282 			    "size 0x%llx",
4283 			    sp.kc_name, sp_allocation_size);
4284 		}
4285 
4286 		/*
4287 		 * Re-adjust ranges if restriction not met
4288 		 */
4289 		if (sp_range->min_address && va_alloc_head > sp_range->min_address) {
4290 			kmem_readjust_ranges(i);
4291 		} else {
4292 			/*
4293 			 * Though the actual allocated space may be larger, provide only the
4294 			 * size requested by the original claim.
4295 			 */
4296 			sp_range->min_address = va_alloc_head;
4297 			sp_range->max_address = va_alloc_head + sp.kc_size;
4298 		}
4299 
4300 		va_alloc_head += sp_allocation_size;
4301 	}
4302 
4303 	/*
4304 	 * We have settled on the ranges, now create temporary entries for the
4305 	 * claims
4306 	 */
4307 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
4308 		struct kmem_range_startup_spec sp = kmem_claims[i];
4309 		bool is_last = (i == kmem_claim_count - 1);
4310 		vm_map_offset_t sp_allocation_size =
4311 		    kmem_claim_to_allocation_size(sp.kc_size, is_last);
4312 		vm_map_entry_t entry = NULL;
4313 		if (sp.kc_flags & KC_NO_ENTRY) {
4314 			continue;
4315 		}
4316 
4317 
4318 		/*
4319 		 * We reserve the full allocation size (rather than the claim size) so
4320 		 * that nothing ends up placed in the padding space (if applicable).
4321 		 */
4322 		if (vm_map_find_space(kernel_map, sp.kc_range->min_address,
4323 		    sp_allocation_size, 0,
4324 		    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vmkf_no_soft_limit = true),
4325 		    &entry) != KERN_SUCCESS) {
4326 			panic("kmem_range_init: vm_map_find_space failing for claim %s",
4327 			    sp.kc_name);
4328 		}
4329 		vm_object_reference(kernel_object_default);
4330 		VME_OBJECT_SET(entry, kernel_object_default, false, 0);
4331 		VME_OFFSET_SET(entry, entry->vme_start);
4332 		vm_map_unlock(kernel_map);
4333 	}
4334 
4335 	/*
4336 	 * Now that we are done assigning all the ranges, reset
4337 	 * kmem_ranges[KMEM_RANGE_ID_NONE]
4338 	 */
4339 	kmem_ranges[KMEM_RANGE_ID_NONE] = (struct mach_vm_range) {};
4340 
4341 #if DEBUG || DEVELOPMENT
4342 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
4343 		struct kmem_range_startup_spec sp = kmem_claims[i];
4344 
4345 		printf("%-24s: %p - %p (%u%c)\n", sp.kc_name,
4346 		    (void *)sp.kc_range->min_address,
4347 		    (void *)sp.kc_range->max_address,
4348 		    mach_vm_size_pretty(sp.kc_size),
4349 		    mach_vm_size_unit(sp.kc_size));
4350 	}
4351 #endif /* DEBUG || DEVELOPMENT */
4352 
4353 #if MACH_ASSERT
4354 	/*
4355 	 * Since many parts of the claim infrastructure are marked as startup data
4356 	 * (and are thus unavailable post-lockdown), save off information our tests
4357 	 * need now.
4358 	 */
4359 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
4360 		kmem_test_saved_ranges[i] = *(kmem_claims[i].kc_range);
4361 	}
4362 #endif /* MACH_ASSERT */
4363 }
4364 
4365 __startup_func
4366 static void
kmem_range_init(void)4367 kmem_range_init(void)
4368 {
4369 	vm_size_t range_adjustment;
4370 
4371 	kmem_scramble_ranges();
4372 
4373 	range_adjustment = sprayqtn_range_size >> 3;
4374 	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address =
4375 	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address + range_adjustment;
4376 	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address =
4377 	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address;
4378 
4379 	range_adjustment = data_range_size >> 3;
4380 	kmem_large_ranges[KMEM_RANGE_ID_DATA].min_address =
4381 	    kmem_ranges[KMEM_RANGE_ID_DATA].min_address + range_adjustment;
4382 	kmem_large_ranges[KMEM_RANGE_ID_DATA].max_address =
4383 	    kmem_ranges[KMEM_RANGE_ID_DATA].max_address;
4384 
4385 	if (kmem_needs_data_share_range()) {
4386 		range_adjustment = shared_data_range_size >> 3;
4387 		kmem_large_ranges[KMEM_RANGE_ID_DATA_SHARED].min_address =
4388 		    kmem_ranges[KMEM_RANGE_ID_DATA_SHARED].min_address + range_adjustment;
4389 		kmem_large_ranges[KMEM_RANGE_ID_DATA_SHARED].max_address =
4390 		    kmem_ranges[KMEM_RANGE_ID_DATA_SHARED].max_address;
4391 	}
4392 
4393 	pmap_init();
4394 	kmem_metadata_init();
4395 	kmem_sizeclass_init();
4396 
4397 #if DEBUG || DEVELOPMENT
4398 	for (kmem_range_id_t i = 1; i < KMEM_RANGE_COUNT; i++) {
4399 		vm_size_t range_size = mach_vm_range_size(&kmem_large_ranges[i]);
4400 		printf("kmem_large_ranges[%d]    : %p - %p (%u%c)\n", i,
4401 		    (void *)kmem_large_ranges[i].min_address,
4402 		    (void *)kmem_large_ranges[i].max_address,
4403 		    mach_vm_size_pretty(range_size),
4404 		    mach_vm_size_unit(range_size));
4405 	}
4406 #endif
4407 }
4408 #ifndef __BUILDING_XNU_LIB_UNITTEST__ /* kernel map is not maintained in unit-test */
4409 STARTUP(KMEM, STARTUP_RANK_THIRD, kmem_range_init);
4410 #endif /* __BUILDING_XNU_LIB_UNITTEST__ */
4411 
4412 #if DEBUG || DEVELOPMENT
4413 __startup_func
4414 static void
kmem_log_init(void)4415 kmem_log_init(void)
4416 {
4417 	/*
4418 	 * Log can only be created after the the kmem subsystem is initialized as
4419 	 * btlog creation uses kmem
4420 	 */
4421 	kmem_outlier_log = btlog_create(BTLOG_LOG, KMEM_OUTLIER_LOG_SIZE, 0);
4422 }
4423 STARTUP(ZALLOC, STARTUP_RANK_FIRST, kmem_log_init);
4424 
4425 kmem_gobj_stats
kmem_get_gobj_stats(void)4426 kmem_get_gobj_stats(void)
4427 {
4428 	vmlp_api_start(KMEM_GET_GOBJ_STATS);
4429 	kmem_gobj_stats stats = {};
4430 
4431 	vm_map_lock(kernel_map);
4432 	for (uint8_t i = 0; i < kmem_ptr_ranges; i++) {
4433 		kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST + i;
4434 		struct mach_vm_range range = kmem_ranges[range_id];
4435 		struct kmem_page_meta *meta = kmem_meta_hwm[kmem_get_front(range_id, 0)];
4436 		struct kmem_page_meta *meta_end;
4437 		uint64_t meta_idx = meta - kmem_meta_base[range_id];
4438 		vm_map_size_t used = 0, va = 0, meta_sz = 0, pte_sz = 0;
4439 		vm_map_offset_t addr;
4440 		vm_map_entry_t entry;
4441 
4442 		/*
4443 		 * Left front
4444 		 */
4445 		va = (meta_idx * KMEM_CHUNK_SIZE_MIN);
4446 		meta_sz = round_page(meta_idx * sizeof(struct kmem_page_meta));
4447 
4448 		/*
4449 		 * Right front
4450 		 */
4451 		meta = kmem_meta_hwm[kmem_get_front(range_id, 1)];
4452 		meta_end = kmem_addr_to_meta(range.max_address, range_id, &addr,
4453 		    &meta_idx);
4454 		meta_idx = meta_end - meta;
4455 		meta_sz += round_page(meta_idx * sizeof(struct kmem_page_meta));
4456 		va += (meta_idx * KMEM_CHUNK_SIZE_MIN);
4457 
4458 		/*
4459 		 * Compute VA allocated in entire range
4460 		 */
4461 		if (vm_map_lookup_entry(kernel_map, range.min_address, &entry) == false) {
4462 			entry = entry->vme_next;
4463 		}
4464 
4465 		vmlp_range_event_entry(kernel_map, entry);
4466 
4467 		while (entry != vm_map_to_entry(kernel_map) &&
4468 		    entry->vme_start < range.max_address) {
4469 			used += (entry->vme_end - entry->vme_start);
4470 			entry = entry->vme_next;
4471 		}
4472 
4473 		pte_sz = round_page(atop(va - used) * 8);
4474 
4475 		stats.total_used += used;
4476 		stats.total_va += va;
4477 		stats.pte_sz += pte_sz;
4478 		stats.meta_sz += meta_sz;
4479 	}
4480 	vm_map_unlock(kernel_map);
4481 
4482 	vmlp_api_end(KMEM_GET_GOBJ_STATS, 0);
4483 	return stats;
4484 }
4485 
4486 #endif /* DEBUG || DEVELOPMENT */
4487 
4488 /*
4489  *	kmem_init:
4490  *
4491  *	Initialize the kernel's virtual memory map, taking
4492  *	into account all memory allocated up to this time.
4493  */
4494 __startup_func
4495 void
kmem_init(vm_offset_t start,vm_offset_t end)4496 kmem_init(
4497 	vm_offset_t     start,
4498 	vm_offset_t     end)
4499 {
4500 	vm_map_offset_t map_start;
4501 	vm_map_offset_t map_end;
4502 
4503 	map_start = vm_map_trunc_page(start,
4504 	    VM_MAP_PAGE_MASK(kernel_map));
4505 	map_end = vm_map_round_page(end,
4506 	    VM_MAP_PAGE_MASK(kernel_map));
4507 
4508 	vm_map_will_allocate_early_map(&kernel_map);
4509 #if defined(__arm64__)
4510 	kernel_map = vm_map_create_options(pmap_kernel(),
4511 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS,
4512 	    VM_MAX_KERNEL_ADDRESS,
4513 	    VM_MAP_CREATE_DEFAULT);
4514 	/*
4515 	 *	Reserve virtual memory allocated up to this time.
4516 	 */
4517 	{
4518 		unsigned int    region_select = 0;
4519 		vm_map_offset_t region_start;
4520 		vm_map_size_t   region_size;
4521 		vm_map_offset_t map_addr;
4522 		kern_return_t kr;
4523 
4524 		while (pmap_virtual_region(region_select, &region_start, &region_size)) {
4525 			map_addr = region_start;
4526 			kr = vm_map_enter(kernel_map, &map_addr,
4527 			    vm_map_round_page(region_size,
4528 			    VM_MAP_PAGE_MASK(kernel_map)),
4529 			    (vm_map_offset_t) 0,
4530 			    VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(
4531 				    .vmkf_no_pmap_check = true,
4532 				    .vmkf_no_soft_limit = true),
4533 			    VM_OBJECT_NULL,
4534 			    (vm_object_offset_t) 0, FALSE, VM_PROT_NONE, VM_PROT_NONE,
4535 			    VM_INHERIT_DEFAULT);
4536 
4537 			if (kr != KERN_SUCCESS) {
4538 				panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
4539 				    (uint64_t) start, (uint64_t) end, (uint64_t) region_start,
4540 				    (uint64_t) region_size, kr);
4541 			}
4542 
4543 			region_select++;
4544 		}
4545 	}
4546 #else
4547 	kernel_map = vm_map_create_options(pmap_kernel(),
4548 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS, map_end,
4549 	    VM_MAP_CREATE_DEFAULT);
4550 	/*
4551 	 *	Reserve virtual memory allocated up to this time.
4552 	 */
4553 	if (start != VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
4554 		vm_map_offset_t map_addr;
4555 		kern_return_t kr;
4556 
4557 		map_addr = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
4558 		kr = vm_map_enter(kernel_map,
4559 		    &map_addr,
4560 		    (vm_map_size_t)(map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
4561 		    (vm_map_offset_t) 0,
4562 		    VM_MAP_KERNEL_FLAGS_FIXED(.vmkf_no_pmap_check = true),
4563 		    VM_OBJECT_NULL,
4564 		    (vm_object_offset_t) 0, FALSE,
4565 		    VM_PROT_NONE, VM_PROT_NONE,
4566 		    VM_INHERIT_DEFAULT);
4567 
4568 		if (kr != KERN_SUCCESS) {
4569 			panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
4570 			    (uint64_t) start, (uint64_t) end,
4571 			    (uint64_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS,
4572 			    (uint64_t) (map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
4573 			    kr);
4574 		}
4575 	}
4576 #endif
4577 
4578 	kmem_set_user_wire_limits();
4579 }
4580 
4581 
4582 #pragma mark map copyio
4583 
4584 /*
4585  * Note: semantic types aren't used as `copyio` already validates.
4586  */
4587 
4588 kern_return_t
copyinmap(vm_map_t map,vm_map_offset_t fromaddr,void * todata,vm_size_t length)4589 copyinmap(
4590 	vm_map_t                map,
4591 	vm_map_offset_t         fromaddr,
4592 	void                   *todata,
4593 	vm_size_t               length)
4594 {
4595 	kern_return_t kr = KERN_SUCCESS;
4596 	vm_map_switch_context_t switch_ctx;
4597 
4598 	if (vm_map_pmap(map) == pmap_kernel()) {
4599 		/* assume a correct copy */
4600 		memcpy(todata, CAST_DOWN(void *, fromaddr), length);
4601 	} else if (current_map() == map) {
4602 		if (copyin(fromaddr, todata, length) != 0) {
4603 			kr = KERN_INVALID_ADDRESS;
4604 		}
4605 	} else {
4606 		vm_map_reference(map);
4607 		switch_ctx = vm_map_switch_with_sec_override(map, TRUE);
4608 		if (copyin(fromaddr, todata, length) != 0) {
4609 			kr = KERN_INVALID_ADDRESS;
4610 		}
4611 		vm_map_switch_back(switch_ctx);
4612 		vm_map_deallocate(map);
4613 	}
4614 	return kr;
4615 }
4616 
4617 kern_return_t
copyoutmap(vm_map_t map,void * fromdata,vm_map_address_t toaddr,vm_size_t length)4618 copyoutmap(
4619 	vm_map_t                map,
4620 	void                   *fromdata,
4621 	vm_map_address_t        toaddr,
4622 	vm_size_t               length)
4623 {
4624 	kern_return_t kr = KERN_SUCCESS;
4625 	vm_map_switch_context_t switch_ctx;
4626 
4627 	if (vm_map_pmap(map) == pmap_kernel()) {
4628 		/* assume a correct copy */
4629 		memcpy(CAST_DOWN(void *, toaddr), fromdata, length);
4630 	} else if (current_map() == map) {
4631 		if (copyout(fromdata, toaddr, length) != 0) {
4632 			ktriage_record(thread_tid(current_thread()),
4633 			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
4634 			    KDBG_TRIAGE_RESERVED,
4635 			    KDBG_TRIAGE_VM_COPYOUTMAP_SAMEMAP_ERROR),
4636 			    KERN_INVALID_ADDRESS /* arg */);
4637 			kr = KERN_INVALID_ADDRESS;
4638 		}
4639 	} else {
4640 		vm_map_reference(map);
4641 		switch_ctx = vm_map_switch_with_sec_override(map, TRUE);
4642 		if (copyout(fromdata, toaddr, length) != 0) {
4643 			ktriage_record(thread_tid(current_thread()),
4644 			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
4645 			    KDBG_TRIAGE_RESERVED,
4646 			    KDBG_TRIAGE_VM_COPYOUTMAP_DIFFERENTMAP_ERROR),
4647 			    KERN_INVALID_ADDRESS /* arg */);
4648 			kr = KERN_INVALID_ADDRESS;
4649 		}
4650 		vm_map_switch_back(switch_ctx);
4651 		vm_map_deallocate(map);
4652 	}
4653 	return kr;
4654 }
4655 
4656 kern_return_t
copyoutmap_atomic32(vm_map_t map,uint32_t value,vm_map_address_t toaddr)4657 copyoutmap_atomic32(
4658 	vm_map_t                map,
4659 	uint32_t                value,
4660 	vm_map_address_t        toaddr)
4661 {
4662 	kern_return_t kr = KERN_SUCCESS;
4663 	vm_map_switch_context_t switch_ctx;
4664 
4665 	if (vm_map_pmap(map) == pmap_kernel()) {
4666 		/* assume a correct toaddr */
4667 		*(uint32_t *)toaddr = value;
4668 	} else if (current_map() == map) {
4669 		if (copyout_atomic32(value, toaddr) != 0) {
4670 			kr = KERN_INVALID_ADDRESS;
4671 		}
4672 	} else {
4673 		vm_map_reference(map);
4674 		switch_ctx = vm_map_switch_with_sec_override(map, TRUE);
4675 		if (copyout_atomic32(value, toaddr) != 0) {
4676 			kr = KERN_INVALID_ADDRESS;
4677 		}
4678 		vm_map_switch_back(switch_ctx);
4679 		vm_map_deallocate(map);
4680 	}
4681 	return kr;
4682 }
4683 
4684 kern_return_t
copyoutmap_atomic64(vm_map_t map,uint64_t value,vm_map_address_t toaddr)4685 copyoutmap_atomic64(
4686 	vm_map_t                map,
4687 	uint64_t                value,
4688 	vm_map_address_t        toaddr)
4689 {
4690 	kern_return_t kr = KERN_SUCCESS;
4691 	vm_map_switch_context_t switch_ctx;
4692 
4693 	if (vm_map_pmap(map) == pmap_kernel()) {
4694 		/* assume a correct toaddr */
4695 		*(uint64_t *)toaddr = value;
4696 	} else if (current_map() == map) {
4697 		if (copyout_atomic64(value, toaddr) != 0) {
4698 			kr = KERN_INVALID_ADDRESS;
4699 		}
4700 	} else {
4701 		vm_map_reference(map);
4702 		switch_ctx = vm_map_switch_with_sec_override(map, TRUE);
4703 		if (copyout_atomic64(value, toaddr) != 0) {
4704 			kr = KERN_INVALID_ADDRESS;
4705 		}
4706 		vm_map_switch_back(switch_ctx);
4707 		vm_map_deallocate(map);
4708 	}
4709 	return kr;
4710 }
4711 
4712 
4713 #pragma mark pointer obfuscation / packing
4714 
4715 /*
4716  *
4717  *	The following two functions are to be used when exposing kernel
4718  *	addresses to userspace via any of the various debug or info
4719  *	facilities that exist. These are basically the same as VM_KERNEL_ADDRPERM()
4720  *	and VM_KERNEL_UNSLIDE_OR_PERM() except they use a different random seed and
4721  *	are exported to KEXTs.
4722  *
4723  *	NOTE: USE THE MACRO VERSIONS OF THESE FUNCTIONS (in vm_param.h) FROM WITHIN THE KERNEL
4724  */
4725 
4726 vm_offset_t
vm_kernel_addrhash_internal(vm_offset_t addr,uint64_t salt)4727 vm_kernel_addrhash_internal(vm_offset_t addr, uint64_t salt)
4728 {
4729 	assert(salt != 0);
4730 
4731 	if (addr == 0) {
4732 		return 0ul;
4733 	}
4734 
4735 	if (VM_KERNEL_IS_SLID(addr)) {
4736 		return VM_KERNEL_UNSLIDE(addr);
4737 	}
4738 
4739 #if HAS_MTE
4740 	/*
4741 	 * Remove traces of MTE tags or PAC signatures, to prevent observers from seeing
4742 	 * identical repeated values.
4743 	 */
4744 #endif /* HAS_MTE */
4745 	addr = VM_KERNEL_STRIP_PTR(addr);
4746 
4747 	vm_offset_t sha_digest[SHA256_DIGEST_LENGTH / sizeof(vm_offset_t)];
4748 	SHA256_CTX sha_ctx;
4749 
4750 	SHA256_Init(&sha_ctx);
4751 	SHA256_Update(&sha_ctx, &salt, sizeof(salt));
4752 	SHA256_Update(&sha_ctx, &addr, sizeof(addr));
4753 	SHA256_Final(sha_digest, &sha_ctx);
4754 
4755 	return sha_digest[0];
4756 }
4757 
4758 __exported vm_offset_t
4759 vm_kernel_addrhash_external(vm_offset_t addr);
4760 vm_offset_t
vm_kernel_addrhash_external(vm_offset_t addr)4761 vm_kernel_addrhash_external(vm_offset_t addr)
4762 {
4763 	return vm_kernel_addrhash_internal(addr, vm_kernel_addrhash_salt_ext);
4764 }
4765 
4766 void
vm_kernel_addrhide(vm_offset_t addr,vm_offset_t * hide_addr)4767 vm_kernel_addrhide(
4768 	vm_offset_t addr,
4769 	vm_offset_t *hide_addr)
4770 {
4771 	*hide_addr = VM_KERNEL_ADDRHIDE(addr);
4772 }
4773 
4774 void
vm_kernel_addrperm_external(vm_offset_t addr,vm_offset_t * perm_addr)4775 vm_kernel_addrperm_external(
4776 	vm_offset_t addr,
4777 	vm_offset_t *perm_addr)
4778 {
4779 	addr = VM_KERNEL_STRIP_UPTR(addr);
4780 
4781 	if (VM_KERNEL_IS_SLID(addr)) {
4782 		*perm_addr = VM_KERNEL_UNSLIDE(addr);
4783 	} else if (VM_KERNEL_ADDRESS(addr)) {
4784 		*perm_addr = ML_ADDRPERM(addr, vm_kernel_addrperm_ext);
4785 	} else {
4786 		*perm_addr = addr;
4787 	}
4788 }
4789 
4790 void
vm_kernel_unslide_or_perm_external(vm_offset_t addr,vm_offset_t * up_addr)4791 vm_kernel_unslide_or_perm_external(
4792 	vm_offset_t addr,
4793 	vm_offset_t *up_addr)
4794 {
4795 	vm_kernel_addrperm_external(addr, up_addr);
4796 }
4797 
4798 void
vm_packing_pointer_invalid(vm_offset_t ptr,vm_packing_params_t params)4799 vm_packing_pointer_invalid(vm_offset_t ptr, vm_packing_params_t params)
4800 {
4801 	if (ptr & ((1ul << params.vmpp_shift) - 1)) {
4802 		panic("pointer %p can't be packed: low %d bits aren't 0",
4803 		    (void *)ptr, params.vmpp_shift);
4804 	} else if (ptr <= params.vmpp_base) {
4805 		panic("pointer %p can't be packed: below base %p",
4806 		    (void *)ptr, (void *)params.vmpp_base);
4807 	} else {
4808 		panic("pointer %p can't be packed: maximum encodable pointer is %p",
4809 		    (void *)ptr, (void *)vm_packing_max_packable(params));
4810 	}
4811 }
4812 
4813 void
vm_packing_verify_range(const char * subsystem,vm_offset_t min_address,vm_offset_t max_address,vm_packing_params_t params)4814 vm_packing_verify_range(
4815 	const char *subsystem,
4816 	vm_offset_t min_address,
4817 	vm_offset_t max_address,
4818 	vm_packing_params_t params)
4819 {
4820 	if (min_address > max_address) {
4821 		panic("%s: %s range invalid min:%p > max:%p",
4822 		    __func__, subsystem, (void *)min_address, (void *)max_address);
4823 	}
4824 
4825 	if (!params.vmpp_base_relative) {
4826 		return;
4827 	}
4828 
4829 	if (min_address <= params.vmpp_base) {
4830 		panic("%s: %s range invalid min:%p <= base:%p",
4831 		    __func__, subsystem, (void *)min_address, (void *)params.vmpp_base);
4832 	}
4833 
4834 	if (max_address > vm_packing_max_packable(params)) {
4835 		panic("%s: %s range invalid max:%p >= max packable:%p",
4836 		    __func__, subsystem, (void *)max_address,
4837 		    (void *)vm_packing_max_packable(params));
4838 	}
4839 }
4840 
4841 #pragma mark tests
4842 #if MACH_ASSERT
4843 #include <sys/errno.h>
4844 
4845 static void
4846 kmem_test_for_entry(
4847 	vm_map_t                map,
4848 	vm_offset_t             addr,
4849 	void                  (^block)(vm_map_entry_t))
4850 {
4851 	vm_map_entry_t entry;
4852 
4853 	vm_map_lock(map);
4854 	block(vm_map_lookup_entry(map, addr, &entry) ? entry : NULL);
4855 	vm_map_unlock(map);
4856 }
4857 
4858 #define kmem_test_assert_map(map, pg, entries) ({ \
4859 	assert3u((map)->size, ==, ptoa(pg)); \
4860 	assert3u((map)->hdr.nentries, ==, entries); \
4861 })
4862 
4863 static bool
can_write_at(vm_offset_t offs,uint32_t page)4864 can_write_at(vm_offset_t offs, uint32_t page)
4865 {
4866 	static const int zero;
4867 
4868 	return verify_write(&zero, (void *)(offs + ptoa(page) + 128), 1) == 0;
4869 }
4870 #define assert_writeable(offs, page) \
4871 	assertf(can_write_at(offs, page), \
4872 	    "can write at %p + ptoa(%d)", (void *)offs, page)
4873 
4874 #define assert_faults(offs, page) \
4875 	assertf(!can_write_at(offs, page), \
4876 	    "can write at %p + ptoa(%d)", (void *)offs, page)
4877 
4878 #define peek(offs, page) \
4879 	(*(uint32_t *)((offs) + ptoa(page)))
4880 
4881 #define poke(offs, page, v) \
4882 	(*(uint32_t *)((offs) + ptoa(page)) = (v))
4883 
4884 #if CONFIG_SPTM
4885 __attribute__((noinline))
4886 static void
kmem_test_verify_type_policy(vm_offset_t addr,kmem_flags_t flags)4887 kmem_test_verify_type_policy(vm_offset_t addr, kmem_flags_t flags)
4888 {
4889 	extern bool use_xnu_restricted;
4890 	pmap_mapping_type_t expected_type = PMAP_MAPPING_TYPE_RESTRICTED;
4891 
4892 	/* Explicitly state the expected policy */
4893 	if (flags & (KMEM_COMPRESSOR | KMEM_DATA_SHARED)) {
4894 		expected_type = PMAP_MAPPING_TYPE_DEFAULT;
4895 	} else if ((flags & KMEM_DATA) &&
4896 	    !kalloc_is_restricted_data_mode_enforced()) {
4897 		expected_type = PMAP_MAPPING_TYPE_DEFAULT;
4898 	}
4899 
4900 	/* If X_K_R is disabled, DEFAULT is the only possible mapping */
4901 	if (!use_xnu_restricted) {
4902 		expected_type = PMAP_MAPPING_TYPE_DEFAULT;
4903 	}
4904 
4905 	/* Verify if derived correctly */
4906 	assert3u(expected_type, ==, __kmem_mapping_type(flags));
4907 
4908 	pmap_paddr_t pa = kvtophys(addr);
4909 	if (pa == 0) {
4910 		return;
4911 	}
4912 
4913 	/* Verify if the mapped address actually got the expected type */
4914 	assert3u(expected_type, ==, sptm_get_frame_type(pa));
4915 }
4916 #endif /* CONFIG_SPTM */
4917 
4918 __attribute__((noinline))
4919 static void
kmem_alloc_basic_test(vm_map_t map)4920 kmem_alloc_basic_test(vm_map_t map)
4921 {
4922 	kmem_guard_t guard = {
4923 		.kmg_tag = VM_KERN_MEMORY_DIAG,
4924 	};
4925 	vm_offset_t addr;
4926 
4927 	/*
4928 	 * Test wired basics:
4929 	 * - KMA_KOBJECT
4930 	 * - KMA_GUARD_FIRST, KMA_GUARD_LAST
4931 	 * - allocation alignment
4932 	 */
4933 	addr = kmem_alloc_guard(map, ptoa(10), ptoa(2) - 1,
4934 	    KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard).kmr_address;
4935 	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | GF | GL)", map);
4936 	assert3u((addr + PAGE_SIZE) % ptoa(2), ==, 0);
4937 	kmem_test_assert_map(map, 10, 1);
4938 
4939 	kmem_test_for_entry(map, addr, ^(__assert_only vm_map_entry_t e){
4940 		assertf(e, "unable to find address %p in map %p", (void *)addr, map);
4941 		assert(e->vme_kernel_object);
4942 		assert(!e->vme_atomic);
4943 		assert3u(e->vme_start, <=, addr);
4944 		assert3u(addr + ptoa(10), <=, e->vme_end);
4945 	});
4946 
4947 	assert_faults(addr, 0);
4948 	for (int i = 1; i < 9; i++) {
4949 		assert_writeable(addr, i);
4950 	}
4951 	assert_faults(addr, 9);
4952 
4953 	kmem_free(map, addr, ptoa(10));
4954 	kmem_test_assert_map(map, 0, 0);
4955 
4956 	/*
4957 	 * Test pageable basics.
4958 	 */
4959 	addr = kmem_alloc_guard(map, ptoa(10), 0,
4960 	    KMA_PAGEABLE, guard).kmr_address;
4961 	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | PG)", map);
4962 	kmem_test_assert_map(map, 10, 1);
4963 
4964 	for (int i = 0; i < 9; i++) {
4965 		assert_faults(addr, i);
4966 		poke(addr, i, 42);
4967 		assert_writeable(addr, i);
4968 	}
4969 
4970 	kmem_free_guard(map, addr, ptoa(10),
4971 	    KMF_GUARD_FIRST | KMF_GUARD_LAST, guard);
4972 	kmem_test_assert_map(map, 0, 0);
4973 }
4974 
4975 __attribute__((noinline))
4976 static void
kmem_realloc_basic_test(vm_map_t map,kmr_flags_t kind)4977 kmem_realloc_basic_test(vm_map_t map, kmr_flags_t kind)
4978 {
4979 	kmem_guard_t guard = {
4980 		.kmg_atomic  = !(kind & (KMR_DATA | KMR_DATA_SHARED)),
4981 		.kmg_tag     = VM_KERN_MEMORY_DIAG,
4982 		.kmg_context = 0xefface,
4983 	};
4984 	vm_offset_t addr, newaddr;
4985 	const int N = 10;
4986 
4987 	/*
4988 	 *	This isn't something kmem_realloc_guard() _needs_ to do,
4989 	 *	we could conceive an implementation where it grows in place
4990 	 *	if there's space after it.
4991 	 *
4992 	 *	However, this is what the implementation does today.
4993 	 */
4994 	bool realloc_growth_changes_address = true;
4995 	bool GF = (kind & KMR_GUARD_FIRST);
4996 	bool GL = (kind & KMR_GUARD_LAST);
4997 
4998 	/*
4999 	 *	Initial N page allocation
5000 	 */
5001 	addr = kmem_alloc_guard(map, ptoa(N), 0,
5002 	    (kind & ~KMEM_FREEOLD) | KMA_ZERO, guard).kmr_address;
5003 	assert3u(addr, !=, 0);
5004 
5005 	kmem_test_assert_map(map, N, 1);
5006 	for (int pg = GF; pg < N - GL; pg++) {
5007 		poke(addr, pg, 42 + pg);
5008 	}
5009 	for (int pg = N - GL; pg < N; pg++) {
5010 		assert_faults(addr, pg);
5011 	}
5012 
5013 #if CONFIG_SPTM
5014 	kmem_test_verify_type_policy(addr, ANYF(kind));
5015 #endif /* CONFIG_SPTM */
5016 	/*
5017 	 *	Grow to N + 3 pages
5018 	 */
5019 	newaddr = kmem_realloc_guard(map, addr, ptoa(N), ptoa(N + 3),
5020 	    kind | KMR_ZERO, guard).kmr_address;
5021 	assert3u(newaddr, !=, 0);
5022 	if (realloc_growth_changes_address) {
5023 		assert3u(addr, !=, newaddr);
5024 	}
5025 	if ((kind & KMR_FREEOLD) || (addr == newaddr)) {
5026 		kmem_test_assert_map(map, N + 3, 1);
5027 	} else {
5028 		kmem_test_assert_map(map, 2 * N + 3, 2);
5029 	}
5030 	for (int pg = GF; pg < N - GL; pg++) {
5031 		assert3u(peek(newaddr, pg), ==, 42 + pg);
5032 	}
5033 	if ((kind & KMR_FREEOLD) == 0) {
5034 		for (int pg = GF; pg < N - GL; pg++) {
5035 			assert3u(peek(addr, pg), ==, 42 + pg);
5036 		}
5037 		/* check for tru-share */
5038 		poke(addr + 16, 0, 1234);
5039 		assert3u(peek(newaddr + 16, 0), ==, 1234);
5040 		kmem_free_guard(map, addr, ptoa(N),
5041 		    kind & (KMF_TAG | KMF_GUARD_FIRST | KMF_GUARD_LAST), guard);
5042 		kmem_test_assert_map(map, N + 3, 1);
5043 	}
5044 	if (addr != newaddr) {
5045 		for (int pg = GF; pg < N - GL; pg++) {
5046 			assert_faults(addr, pg);
5047 		}
5048 	}
5049 	for (int pg = N - GL; pg < N + 3 - GL; pg++) {
5050 		assert3u(peek(newaddr, pg), ==, 0);
5051 	}
5052 	for (int pg = N + 3 - GL; pg < N + 3; pg++) {
5053 		assert_faults(newaddr, pg);
5054 	}
5055 	addr = newaddr;
5056 
5057 
5058 	/*
5059 	 *	Shrink to N - 2 pages
5060 	 */
5061 	newaddr = kmem_realloc_guard(map, addr, ptoa(N + 3), ptoa(N - 2),
5062 	    kind | KMR_ZERO, guard).kmr_address;
5063 	assert3u(map->size, ==, ptoa(N - 2));
5064 	assert3u(newaddr, ==, addr);
5065 	kmem_test_assert_map(map, N - 2, 1);
5066 
5067 	for (int pg = GF; pg < N - 2 - GL; pg++) {
5068 		assert3u(peek(addr, pg), ==, 42 + pg);
5069 	}
5070 	for (int pg = N - 2 - GL; pg < N + 3; pg++) {
5071 		assert_faults(addr, pg);
5072 	}
5073 
5074 	kmem_free_guard(map, addr, ptoa(N - 2),
5075 	    kind & (KMF_TAG | KMF_GUARD_FIRST | KMF_GUARD_LAST), guard);
5076 	kmem_test_assert_map(map, 0, 0);
5077 }
5078 
5079 static int
kmem_basic_test(__unused int64_t in,int64_t * out)5080 kmem_basic_test(__unused int64_t in, int64_t *out)
5081 {
5082 	mach_vm_offset_t addr;
5083 	vm_map_t map;
5084 
5085 	printf("%s: test running\n", __func__);
5086 
5087 	map = kmem_suballoc(kernel_map, &addr, 64U << 20,
5088 	        VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
5089 	        KMS_NOFAIL | KMS_DATA_SHARED, VM_KERN_MEMORY_DIAG).kmr_submap;
5090 
5091 	printf("%s: kmem_alloc ...\n", __func__);
5092 	kmem_alloc_basic_test(map);
5093 	printf("%s:     PASS\n", __func__);
5094 
5095 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
5096 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD);
5097 	printf("%s:     PASS\n", __func__);
5098 
5099 	printf("%s: kmem_realloc (KMR_FREEOLD) ...\n", __func__);
5100 	kmem_realloc_basic_test(map, KMR_FREEOLD);
5101 	printf("%s:     PASS\n", __func__);
5102 
5103 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
5104 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST);
5105 	printf("%s:     PASS\n", __func__);
5106 
5107 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
5108 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
5109 	printf("%s:     PASS\n", __func__);
5110 
5111 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
5112 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
5113 	printf("%s:     PASS\n", __func__);
5114 
5115 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
5116 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_FIRST);
5117 	printf("%s:     PASS\n", __func__);
5118 
5119 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
5120 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_LAST);
5121 	printf("%s:     PASS\n", __func__);
5122 
5123 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
5124 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
5125 	printf("%s:     PASS\n", __func__);
5126 
5127 #if HAS_MTE
5128 	printf("%s: kmem_realloc (KMR_TAG | KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
5129 	kmem_realloc_basic_test(map, KMR_TAG | KMR_KOBJECT | KMR_FREEOLD);
5130 	printf("%s:     PASS\n", __func__);
5131 
5132 	printf("%s: kmem_realloc (KMR_TAG | KMR_FREEOLD) ...\n", __func__);
5133 	kmem_realloc_basic_test(map, KMR_TAG | KMR_FREEOLD);
5134 	printf("%s:     PASS\n", __func__);
5135 
5136 	printf("%s: kmem_realloc (KMR_TAG | KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
5137 	kmem_realloc_basic_test(map, KMR_TAG | KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST);
5138 	printf("%s:     PASS\n", __func__);
5139 
5140 	printf("%s: kmem_realloc (KMR_TAG | KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
5141 	kmem_realloc_basic_test(map, KMR_TAG | KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
5142 	printf("%s:     PASS\n", __func__);
5143 
5144 	printf("%s: kmem_realloc (KMR_TAG | KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
5145 	kmem_realloc_basic_test(map, KMR_TAG | KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
5146 	printf("%s:     PASS\n", __func__);
5147 
5148 	printf("%s: kmem_realloc (KMR_TAG | KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
5149 	kmem_realloc_basic_test(map, KMR_TAG | KMR_FREEOLD | KMR_GUARD_FIRST);
5150 	printf("%s:     PASS\n", __func__);
5151 
5152 	printf("%s: kmem_realloc (KMR_TAG | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
5153 	kmem_realloc_basic_test(map, KMR_TAG | KMR_FREEOLD | KMR_GUARD_LAST);
5154 	printf("%s:     PASS\n", __func__);
5155 
5156 	printf("%s: kmem_realloc (KMR_TAG | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
5157 	kmem_realloc_basic_test(map, KMR_TAG | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
5158 	printf("%s:     PASS\n", __func__);
5159 #endif /* HAS_MTE */
5160 
5161 	/* using KMR_DATA signals to test the non atomic realloc path */
5162 	printf("%s: kmem_realloc (KMR_DATA | KMR_FREEOLD) ...\n", __func__);
5163 	kmem_realloc_basic_test(map, KMR_DATA | KMR_FREEOLD);
5164 	printf("%s:     PASS\n", __func__);
5165 
5166 	/*
5167 	 * Using KMR_DATA without KMR_FREEOLD violates the
5168 	 * single-mappability of RESTRICTED pages.
5169 	 */
5170 
5171 	/* test KMR_SHARED_DATA for the new shared kheap */
5172 	printf("%s: kmem_realloc (KMR_DATA_SHARED | KMR_FREEOLD) ...\n", __func__);
5173 	kmem_realloc_basic_test(map, KMR_DATA_SHARED | KMR_FREEOLD);
5174 	printf("%s:     PASS\n", __func__);
5175 
5176 	/* test KMR_SHARED_DATA for the new shared kheap */
5177 	printf("%s: kmem_realloc (KMR_DATA_SHARED) ...\n", __func__);
5178 	kmem_realloc_basic_test(map, KMR_DATA_SHARED);
5179 	printf("%s:     PASS\n", __func__);
5180 
5181 	kmem_free_guard(kernel_map, addr, 64U << 20, KMF_NONE, KMEM_GUARD_SUBMAP);
5182 	vm_map_deallocate(map);
5183 
5184 	printf("%s: test passed\n", __func__);
5185 	*out = 1;
5186 	return 0;
5187 }
5188 SYSCTL_TEST_REGISTER(kmem_basic, kmem_basic_test);
5189 
5190 static void
kmem_test_get_size_idx_for_chunks(uint32_t chunks)5191 kmem_test_get_size_idx_for_chunks(uint32_t chunks)
5192 {
5193 	__assert_only uint32_t idx = kmem_get_size_idx_for_chunks(chunks);
5194 
5195 	assert(chunks >= kmem_size_array[idx].ks_num_chunk);
5196 }
5197 
5198 __attribute__((noinline))
5199 static void
kmem_test_get_size_idx_for_all_chunks()5200 kmem_test_get_size_idx_for_all_chunks()
5201 {
5202 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
5203 		uint32_t chunks = kmem_size_array[i].ks_num_chunk;
5204 
5205 		if (chunks != 1) {
5206 			kmem_test_get_size_idx_for_chunks(chunks - 1);
5207 		}
5208 		kmem_test_get_size_idx_for_chunks(chunks);
5209 		kmem_test_get_size_idx_for_chunks(chunks + 1);
5210 	}
5211 }
5212 
5213 static int
kmem_guard_obj_test(__unused int64_t in,int64_t * out)5214 kmem_guard_obj_test(__unused int64_t in, int64_t *out)
5215 {
5216 	printf("%s: test running\n", __func__);
5217 
5218 	printf("%s: kmem_get_size_idx_for_chunks\n", __func__);
5219 	kmem_test_get_size_idx_for_all_chunks();
5220 	printf("%s:     PASS\n", __func__);
5221 
5222 	printf("%s: test passed\n", __func__);
5223 	*out = 1;
5224 	return 0;
5225 }
5226 SYSCTL_TEST_REGISTER(kmem_guard_obj, kmem_guard_obj_test);
5227 
5228 
5229 #endif /* MACH_ASSERT */
5230