xref: /xnu-11417.140.69/osfmk/vm/vm_kern.c (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_kern.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Kernel memory management.
64  */
65 
66 #include <mach/kern_return.h>
67 #include <mach/vm_param.h>
68 #include <kern/assert.h>
69 #include <kern/thread.h>
70 #include <vm/vm_kern_internal.h>
71 #include <vm/vm_map_internal.h>
72 #include <vm/vm_object_internal.h>
73 #include <vm/vm_page_internal.h>
74 #include <vm/vm_compressor_xnu.h>
75 #include <vm/vm_pageout_xnu.h>
76 #include <vm/vm_init_xnu.h>
77 #include <vm/vm_fault.h>
78 #include <vm/vm_memtag.h>
79 #include <vm/vm_far.h>
80 #include <kern/misc_protos.h>
81 #include <vm/cpm_internal.h>
82 #include <kern/ledger.h>
83 #include <kern/bits.h>
84 #include <kern/startup.h>
85 #include <kern/telemetry.h>
86 
87 #include <string.h>
88 
89 #include <libkern/OSDebug.h>
90 #include <libkern/crypto/sha2.h>
91 #include <libkern/section_keywords.h>
92 #include <sys/kdebug.h>
93 #include <sys/kdebug_triage.h>
94 
95 #include <san/kasan.h>
96 #include <kern/kext_alloc.h>
97 #include <kern/backtrace.h>
98 #include <os/hash.h>
99 #include <kern/zalloc_internal.h>
100 #include <libkern/crypto/rand.h>
101 
102 /*
103  *	Variables exported by this module.
104  */
105 
106 SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map;
107 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_ranges[KMEM_RANGE_COUNT];
108 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_large_ranges[KMEM_RANGE_COUNT];
109 
110 static TUNABLE(uint32_t, kmem_ptr_ranges, "kmem_ptr_ranges",
111     KMEM_RANGE_ID_NUM_PTR);
112 #define KMEM_GOBJ_THRESHOLD   (32ULL << 20)
113 #if DEBUG || DEVELOPMENT
114 #define KMEM_OUTLIER_LOG_SIZE (16ULL << 10)
115 #define KMEM_OUTLIER_SIZE      0
116 #define KMEM_OUTLIER_ALIGN     1
117 btlog_t kmem_outlier_log;
118 #endif /* DEBUG || DEVELOPMENT */
119 
120 __startup_data static vm_map_size_t data_range_size;
121 __startup_data static vm_map_size_t ptr_range_size;
122 __startup_data static vm_map_size_t sprayqtn_range_size;
123 
124 #pragma mark helpers
125 
126 __attribute__((overloadable))
127 __header_always_inline kmem_flags_t
128 ANYF(kma_flags_t flags)
129 {
130 	return (kmem_flags_t)flags;
131 }
132 
133 __attribute__((overloadable))
134 __header_always_inline kmem_flags_t
135 ANYF(kmr_flags_t flags)
136 {
137 	return (kmem_flags_t)flags;
138 }
139 
140 __attribute__((overloadable))
141 __header_always_inline kmem_flags_t
142 ANYF(kmf_flags_t flags)
143 {
144 	return (kmem_flags_t)flags;
145 }
146 
147 __abortlike
148 static void
149 __kmem_invalid_size_panic(
150 	vm_map_t        map,
151 	vm_size_t       size,
152 	uint32_t        flags)
153 {
154 	panic("kmem(map=%p, flags=0x%x): invalid size %zd",
155 	    map, flags, (size_t)size);
156 }
157 
158 __abortlike
159 static void
160 __kmem_invalid_arguments_panic(
161 	const char     *what,
162 	vm_map_t        map,
163 	vm_address_t    address,
164 	vm_size_t       size,
165 	uint32_t        flags)
166 {
167 	panic("kmem_%s(map=%p, addr=%p, size=%zd, flags=0x%x): "
168 	    "invalid arguments passed",
169 	    what, map, (void *)address, (size_t)size, flags);
170 }
171 
172 __abortlike
173 static void
174 __kmem_failed_panic(
175 	vm_map_t        map,
176 	vm_size_t       size,
177 	uint32_t        flags,
178 	kern_return_t   kr,
179 	const char     *what)
180 {
181 	panic("kmem_%s(%p, %zd, 0x%x): failed with %d",
182 	    what, map, (size_t)size, flags, kr);
183 }
184 
185 __abortlike
186 static void
187 __kmem_entry_not_found_panic(
188 	vm_map_t        map,
189 	vm_offset_t     addr)
190 {
191 	panic("kmem(map=%p) no entry found at %p", map, (void *)addr);
192 }
193 
194 static inline vm_object_t
195 __kmem_object(kmem_flags_t flags)
196 {
197 	if (flags & KMEM_COMPRESSOR) {
198 		if (flags & KMEM_KOBJECT) {
199 			panic("both KMEM_KOBJECT and KMEM_COMPRESSOR specified");
200 		}
201 		return compressor_object;
202 	}
203 	if (!(flags & KMEM_KOBJECT)) {
204 		panic("KMEM_KOBJECT or KMEM_COMPRESSOR is required");
205 	}
206 	return kernel_object_default;
207 }
208 
209 static inline pmap_mapping_type_t
210 __kmem_mapping_type(kmem_flags_t flags)
211 {
212 	if (flags & (KMEM_DATA | KMEM_COMPRESSOR | KMEM_DATA_SHARED)) {
213 		return PMAP_MAPPING_TYPE_DEFAULT;
214 	} else {
215 		return PMAP_MAPPING_TYPE_RESTRICTED;
216 	}
217 }
218 
219 static inline vm_size_t
220 __kmem_guard_left(kmem_flags_t flags)
221 {
222 	return (flags & KMEM_GUARD_FIRST) ? PAGE_SIZE : 0;
223 }
224 
225 static inline vm_size_t
226 __kmem_guard_right(kmem_flags_t flags)
227 {
228 	return (flags & KMEM_GUARD_LAST) ? PAGE_SIZE : 0;
229 }
230 
231 static inline vm_size_t
232 __kmem_guard_size(kmem_flags_t flags)
233 {
234 	return __kmem_guard_left(flags) + __kmem_guard_right(flags);
235 }
236 
237 __pure2
238 static inline vm_size_t
239 __kmem_entry_orig_size(vm_map_entry_t entry)
240 {
241 	vm_object_t object = VME_OBJECT(entry);
242 
243 	if (entry->vme_kernel_object) {
244 		return entry->vme_end - entry->vme_start -
245 		       entry->vme_object_or_delta;
246 	} else {
247 		return object->vo_size - object->vo_size_delta;
248 	}
249 }
250 
251 
252 #pragma mark kmem range methods
253 
254 #define mach_vm_range_load(r, rmin, rmax) \
255 	({ (rmin) = (r)->min_address; (rmax) = (r)->max_address; })
256 
257 __abortlike
258 static void
259 __mach_vm_range_overflow(
260 	mach_vm_offset_t        addr,
261 	mach_vm_offset_t        size)
262 {
263 	panic("invalid vm range: [0x%llx, 0x%llx + 0x%llx) wraps around",
264 	    addr, addr, size);
265 }
266 
267 __abortlike
268 static void
269 __mach_vm_range_invalid(
270 	mach_vm_offset_t        min_address,
271 	mach_vm_offset_t        max_address)
272 {
273 	panic("invalid vm range: [0x%llx, 0x%llx) wraps around",
274 	    min_address, max_address);
275 }
276 
277 __header_always_inline mach_vm_size_t
278 mach_vm_range_size(const struct mach_vm_range *r)
279 {
280 	mach_vm_offset_t rmin, rmax;
281 
282 	mach_vm_range_load(r, rmin, rmax);
283 	return rmax - rmin;
284 }
285 
286 __attribute__((overloadable))
287 __header_always_inline bool
288 mach_vm_range_contains(const struct mach_vm_range *r, mach_vm_offset_t addr)
289 {
290 	mach_vm_offset_t rmin, rmax;
291 	/*
292 	 * The `&` is not a typo: we really expect the check to pass,
293 	 * so encourage the compiler to eagerly load and test without branches
294 	 */
295 	mach_vm_range_load(r, rmin, rmax);
296 	return (addr >= rmin) & (addr < rmax);
297 }
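/*
 * For contrast, a sketch of the branchy form the comment above avoids
 * (illustrative only, not used in this file): with `&&` the second
 * comparison may be skipped when the first fails, which forces control
 * flow instead of two unconditional loads and compares:
 *
 *	if (addr >= rmin) {
 *		if (addr < rmax) {
 *			return true;
 *		}
 *	}
 *	return false;
 */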
298 
299 __attribute__((overloadable))
300 __header_always_inline bool
301 mach_vm_range_contains(
302 	const struct mach_vm_range *r,
303 	mach_vm_offset_t        addr,
304 	mach_vm_offset_t        size)
305 {
306 	mach_vm_offset_t rmin, rmax;
307 	mach_vm_offset_t end;
308 
309 	if (__improbable(os_add_overflow(addr, size, &end))) {
310 		return false;
311 	}
312 
313 	/*
314 	 * The `&` is not a typo: we really expect the check to pass,
315 	 * so encourage the compiler to eagerly load and test without branches
316 	 */
317 	mach_vm_range_load(r, rmin, rmax);
318 	return (addr >= rmin) & (end >= rmin) & (end <= rmax);
319 }
320 
321 __attribute__((overloadable))
322 __header_always_inline bool
323 mach_vm_range_intersects(
324 	const struct mach_vm_range *r1,
325 	const struct mach_vm_range *r2)
326 {
327 	mach_vm_offset_t r1_min, r1_max;
328 	mach_vm_offset_t r2_min, r2_max;
329 
330 	mach_vm_range_load(r1, r1_min, r1_max);
331 	r2_min = r2->min_address;
332 	r2_max = r2->max_address;
333 
334 	if (r1_min > r1_max) {
335 		__mach_vm_range_invalid(r1_min, r1_max);
336 	}
337 
338 	if (r2_min > r2_max) {
339 		__mach_vm_range_invalid(r2_min, r2_max);
340 	}
341 
342 	return r1_max > r2_min && r1_min < r2_max;
343 }
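/*
 * Worked example (illustrative addresses): the half-open ranges
 * [0x1000, 0x3000) and [0x2000, 0x4000) intersect because
 * 0x3000 > 0x2000 and 0x1000 < 0x4000, while the adjacent ranges
 * [0x1000, 0x2000) and [0x2000, 0x3000) do not, since r1_max > r2_min
 * fails when r1_max == r2_min.
 */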
344 
345 __attribute__((overloadable))
346 __header_always_inline bool
347 mach_vm_range_intersects(
348 	const struct mach_vm_range *r1,
349 	mach_vm_offset_t        addr,
350 	mach_vm_offset_t        size)
351 {
352 	struct mach_vm_range r2;
353 
354 	r2.min_address = addr;
355 	if (os_add_overflow(addr, size, &r2.max_address)) {
356 		__mach_vm_range_overflow(addr, size);
357 	}
358 
359 	return mach_vm_range_intersects(r1, &r2);
360 }
361 
362 bool
363 kmem_range_id_contains(
364 	kmem_range_id_t         range_id,
365 	vm_map_offset_t         addr,
366 	vm_map_size_t           size)
367 {
368 	return mach_vm_range_contains(&kmem_ranges[range_id], vm_memtag_canonicalize_kernel(addr), size);
369 }
370 
371 __abortlike
372 static void
373 kmem_range_invalid_panic(
374 	kmem_range_id_t         range_id,
375 	vm_map_offset_t         addr,
376 	vm_map_size_t           size)
377 {
378 	const struct mach_vm_range *r = &kmem_ranges[range_id];
379 	mach_vm_offset_t rmin, rmax;
380 
381 	mach_vm_range_load(r, rmin, rmax);
382 	if (addr + size < rmin) {
383 		panic("addr %p + size %llu overflows %p", (void *)addr, size,
384 		    (void *)(addr + size));
385 	}
386 	panic("addr %p + size %llu doesnt fit in one range (id: %u min: %p max: %p)",
387 	    (void *)addr, size, range_id, (void *)rmin, (void *)rmax);
388 }
389 
390 /*
391  * Return whether the entire allocation is contained in the given range
392  */
393 static bool
394 kmem_range_contains_fully(
395 	kmem_range_id_t         range_id,
396 	vm_map_offset_t         addr,
397 	vm_map_size_t           size)
398 {
399 	const struct mach_vm_range *r = &kmem_ranges[range_id];
400 	mach_vm_offset_t rmin, rmax;
401 	bool result = false;
402 
403 	if (VM_KERNEL_ADDRESS(addr)) {
404 		addr = vm_memtag_canonicalize_kernel(addr);
405 	}
406 
407 	/*
408 	 * The `&` is not a typo: we really expect the check to pass,
409 	 * so encourage the compiler to eagerly load and test without branches
410 	 */
411 	mach_vm_range_load(r, rmin, rmax);
412 	result = (addr >= rmin) & (addr < rmax);
413 	if (__improbable(result
414 	    && ((addr + size < rmin) || (addr + size > rmax)))) {
415 		kmem_range_invalid_panic(range_id, addr, size);
416 	}
417 	return result;
418 }
419 
420 vm_map_size_t
421 kmem_range_id_size(kmem_range_id_t range_id)
422 {
423 	return mach_vm_range_size(&kmem_ranges[range_id]);
424 }
425 
426 kmem_range_id_t
427 kmem_addr_get_range(vm_map_offset_t addr, vm_map_size_t size)
428 {
429 	kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
430 
431 	for (; range_id < KMEM_RANGE_COUNT; range_id++) {
432 		if (kmem_range_contains_fully(range_id, addr, size)) {
433 			return range_id;
434 		}
435 	}
436 	return KMEM_RANGE_ID_NONE;
437 }
438 
439 bool
440 kmem_is_ptr_range(vm_map_range_id_t range_id)
441 {
442 	return (range_id >= KMEM_RANGE_ID_FIRST) &&
443 	       (range_id <= KMEM_RANGE_ID_NUM_PTR);
444 }
445 
446 __abortlike
447 static void
448 kmem_range_invalid_for_overwrite(vm_map_offset_t addr)
449 {
450 	panic("Can't overwrite mappings (addr: %p) in kmem ptr ranges",
451 	    (void *)addr);
452 }
453 
454 mach_vm_range_t
455 kmem_validate_range_for_overwrite(
456 	vm_map_offset_t         addr,
457 	vm_map_size_t           size)
458 {
459 	vm_map_range_id_t range_id = kmem_addr_get_range(addr, size);
460 
461 	if (kmem_is_ptr_range(range_id)) {
462 		kmem_range_invalid_for_overwrite(addr);
463 	}
464 
465 	return &kmem_ranges[range_id];
466 }
467 
468 
469 #pragma mark entry parameters
470 
471 
472 __abortlike
473 static void
474 __kmem_entry_validate_panic(
475 	vm_map_t        map,
476 	vm_map_entry_t  entry,
477 	vm_offset_t     addr,
478 	vm_size_t       size,
479 	uint32_t        flags,
480 	kmem_guard_t    guard)
481 {
482 	const char *what = "???";
483 
484 	if (entry->vme_atomic != guard.kmg_atomic) {
485 		what = "atomicity";
486 	} else if (entry->is_sub_map != guard.kmg_submap) {
487 		what = "objectness";
488 	} else if (addr != entry->vme_start) {
489 		what = "left bound";
490 	} else if ((flags & KMF_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
491 		what = "right bound";
492 	} else if (guard.kmg_context != entry->vme_context) {
493 		what = "guard";
494 	}
495 
496 	panic("kmem(map=%p, addr=%p, size=%zd, flags=0x%x): "
497 	    "entry:%p %s mismatch guard(0x%08x)",
498 	    map, (void *)addr, size, flags, entry,
499 	    what, guard.kmg_context);
500 }
501 
502 static bool
503 __kmem_entry_validate_guard(
504 	vm_map_entry_t  entry,
505 	vm_offset_t     addr,
506 	vm_size_t       size,
507 	kmem_flags_t    flags,
508 	kmem_guard_t    guard)
509 {
510 	if (entry->vme_atomic != guard.kmg_atomic) {
511 		return false;
512 	}
513 
514 	if (!guard.kmg_atomic) {
515 		return true;
516 	}
517 
518 	if (entry->is_sub_map != guard.kmg_submap) {
519 		return false;
520 	}
521 
522 	if (addr != entry->vme_start) {
523 		return false;
524 	}
525 
526 	if ((flags & KMEM_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
527 		return false;
528 	}
529 
530 	if (!guard.kmg_submap && guard.kmg_context != entry->vme_context) {
531 		return false;
532 	}
533 
534 	return true;
535 }
536 
537 void
538 kmem_entry_validate_guard(
539 	vm_map_t        map,
540 	vm_map_entry_t  entry,
541 	vm_offset_t     addr,
542 	vm_size_t       size,
543 	kmem_guard_t    guard)
544 {
545 	if (!__kmem_entry_validate_guard(entry, addr, size, KMEM_NONE, guard)) {
546 		__kmem_entry_validate_panic(map, entry, addr, size, KMEM_NONE, guard);
547 	}
548 }
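/*
 * Example (sketch; the context value is an arbitrary assumption): an
 * atomic entry created with kmg_context 0x1234 only validates against a
 * guard presenting the same submap-ness, bounds and context.  Presenting
 * a different context makes __kmem_entry_validate_guard() return false,
 * and the panic above reports a "guard" mismatch for that entry.
 */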
549 
550 __abortlike
551 static void
552 __kmem_entry_validate_object_panic(
553 	vm_map_t        map,
554 	vm_map_entry_t  entry,
555 	kmem_flags_t    flags)
556 {
557 	const char *what;
558 	const char *verb;
559 
560 	if (entry->is_sub_map) {
561 		panic("kmem(map=%p) entry %p is a submap", map, entry);
562 	}
563 
564 	if (flags & KMEM_KOBJECT) {
565 		what = "kernel";
566 		verb = "isn't";
567 	} else if (flags & KMEM_COMPRESSOR) {
568 		what = "compressor";
569 		verb = "isn't";
570 	} else if (entry->vme_kernel_object) {
571 		what = "kernel";
572 		verb = "is unexpectedly";
573 	} else {
574 		what = "compressor";
575 		verb = "is unexpectedly";
576 	}
577 
578 	panic("kmem(map=%p, flags=0x%x): entry %p %s for the %s object",
579 	    map, flags, entry, verb, what);
580 }
581 
582 static bool
583 __kmem_entry_validate_object(
584 	vm_map_entry_t  entry,
585 	kmem_flags_t    flags)
586 {
587 	if (entry->is_sub_map) {
588 		return false;
589 	}
590 	if ((bool)(flags & KMEM_KOBJECT) != entry->vme_kernel_object) {
591 		return false;
592 	}
593 
594 	return (bool)(flags & KMEM_COMPRESSOR) ==
595 	       (VME_OBJECT(entry) == compressor_object);
596 }
597 
598 vm_size_t
599 kmem_size_guard(
600 	vm_map_t        map,
601 	vm_offset_t     addr,
602 	kmem_guard_t    guard)
603 {
604 	kmem_flags_t flags = KMEM_GUESS_SIZE;
605 	vm_map_entry_t entry;
606 	vm_size_t size;
607 
608 	vm_map_lock_read(map);
609 
610 #if KASAN_CLASSIC
611 	addr -= PAGE_SIZE;
612 #endif /* KASAN_CLASSIC */
613 	addr = vm_memtag_canonicalize_kernel(addr);
614 
615 	if (!vm_map_lookup_entry(map, addr, &entry)) {
616 		__kmem_entry_not_found_panic(map, addr);
617 	}
618 
619 	if (!__kmem_entry_validate_guard(entry, addr, 0, flags, guard)) {
620 		__kmem_entry_validate_panic(map, entry, addr, 0, flags, guard);
621 	}
622 
623 	size = __kmem_entry_orig_size(entry);
624 
625 	vm_map_unlock_read(map);
626 
627 	return size;
628 }
629 
630 static inline uint16_t
631 kmem_hash_backtrace(
632 	void                     *fp)
633 {
634 	uint64_t  bt_count;
635 	uintptr_t bt[8] = {};
636 
637 	struct backtrace_control ctl = {
638 		.btc_frame_addr = (uintptr_t)fp,
639 	};
640 
641 	bt_count = backtrace(bt, sizeof(bt) / sizeof(bt[0]), &ctl, NULL);
642 	return (uint16_t) os_hash_jenkins(bt, bt_count * sizeof(bt[0]));
643 }
644 
645 static_assert(KMEM_RANGE_ID_DATA - 1 <= KMEM_RANGE_MASK,
646     "Insufficient bits to represent ptr ranges");
647 
648 kmem_range_id_t
649 kmem_adjust_range_id(
650 	uint32_t                  hash)
651 {
652 	return (kmem_range_id_t) (KMEM_RANGE_ID_PTR_0 +
653 	       (hash & KMEM_RANGE_MASK) % kmem_ptr_ranges);
654 }
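/*
 * Worked example (values are illustrative): if a caller's backtrace
 * hashes to 0x2b and kmem_ptr_ranges keeps its default of
 * KMEM_RANGE_ID_NUM_PTR, then (0x2b & KMEM_RANGE_MASK) reduced modulo
 * kmem_ptr_ranges yields an index in [0, kmem_ptr_ranges), and the
 * allocation is steered to KMEM_RANGE_ID_PTR_0 plus that index.
 */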
655 
656 static bool
657 kmem_use_sprayqtn(
658 	kma_flags_t               kma_flags,
659 	vm_map_size_t             map_size,
660 	vm_offset_t               mask)
661 {
662 	/*
663 	 * Pointer allocations that are above the guard objects threshold or have
664 	 * leading guard pages with non standard alignment requests are redirected
665 	 * to the sprayqtn range.
666 	 */
667 #if DEBUG || DEVELOPMENT
668 	btref_get_flags_t flags = (kma_flags & KMA_NOPAGEWAIT) ?
669 	    BTREF_GET_NOWAIT : 0;
670 
671 	if ((kma_flags & KMA_SPRAYQTN) == 0) {
672 		if (map_size > KMEM_GOBJ_THRESHOLD) {
673 			btlog_record(kmem_outlier_log, (void *)map_size, KMEM_OUTLIER_SIZE,
674 			    btref_get(__builtin_frame_address(0), flags));
675 		} else if ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK)) {
676 			btlog_record(kmem_outlier_log, (void *)mask, KMEM_OUTLIER_ALIGN,
677 			    btref_get(__builtin_frame_address(0), flags));
678 		}
679 	}
680 #endif /* DEBUG || DEVELOPMENT */
681 
682 	return (kma_flags & KMA_SPRAYQTN) ||
683 	       (map_size > KMEM_GOBJ_THRESHOLD) ||
684 	       ((kma_flags & KMA_GUARD_FIRST) && (mask > PAGE_MASK));
685 }
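/*
 * Example: a 48MB pointer allocation exceeds KMEM_GOBJ_THRESHOLD (32MB)
 * and is redirected to the spray quarantine range even when the caller
 * did not pass KMA_SPRAYQTN; so is a KMA_GUARD_FIRST allocation that
 * requests an alignment mask larger than PAGE_MASK.
 */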
686 
687 static void
688 kmem_apply_security_policy(
689 	vm_map_t                  map,
690 	kma_flags_t               kma_flags,
691 	kmem_guard_t              guard,
692 	vm_map_size_t             map_size,
693 	vm_offset_t               mask,
694 	vm_map_kernel_flags_t    *vmk_flags,
695 	bool                      assert_dir __unused)
696 {
697 	kmem_range_id_t range_id;
698 	bool from_right;
699 	uint16_t type_hash = guard.kmg_type_hash;
700 
701 	if (startup_phase < STARTUP_SUB_KMEM || map != kernel_map) {
702 		return;
703 	}
704 
705 	/*
706 	 * A non-zero type-hash must be passed by krealloc_type
707 	 */
708 #if (DEBUG || DEVELOPMENT)
709 	if (assert_dir && !(kma_flags & (KMA_DATA | KMA_DATA_SHARED))) {
710 		assert(type_hash != 0);
711 	}
712 #endif
713 
714 	if (kma_flags & (KMA_DATA | KMA_DATA_SHARED)) {
715 		range_id  = KMEM_RANGE_ID_DATA;
716 		/*
717 		 * As an optimization in KMA_DATA to avoid fragmentation,
718 		 * allocate static carveouts at the end of the DATA range.
719 		 */
720 		from_right = (bool)(kma_flags & KMA_PERMANENT);
721 	} else if (kmem_use_sprayqtn(kma_flags, map_size, mask)) {
722 		range_id = KMEM_RANGE_ID_SPRAYQTN;
723 		from_right = (bool)(kma_flags & KMA_PERMANENT);
724 	} else if (type_hash) {
725 		range_id  = (kmem_range_id_t)(type_hash & KMEM_RANGE_MASK);
726 		from_right = type_hash & KMEM_DIRECTION_MASK;
727 	} else {
728 		/*
729 		 * Range id needs to correspond to one of the PTR ranges
730 		 */
731 		type_hash = (uint16_t) kmem_hash_backtrace(__builtin_frame_address(0));
732 		range_id  = kmem_adjust_range_id(type_hash);
733 		from_right = type_hash & KMEM_DIRECTION_MASK;
734 	}
735 
736 	vmk_flags->vmkf_range_id = range_id;
737 	vmk_flags->vmkf_last_free = from_right;
738 }
739 
740 #pragma mark allocation
741 
742 static kmem_return_t
743 kmem_alloc_guard_internal(
744 	vm_map_t                map,
745 	vm_size_t               size,
746 	vm_offset_t             mask,
747 	kma_flags_t             flags,
748 	kmem_guard_t            guard,
749 	kern_return_t         (^alloc_pages)(vm_size_t, kma_flags_t, vm_page_t *))
750 {
751 	vm_object_t             object;
752 	vm_offset_t             delta = 0;
753 	vm_map_entry_t          entry = NULL;
754 	vm_map_offset_t         map_addr, fill_start;
755 	vm_map_size_t           map_size, fill_size;
756 	vm_page_t               guard_left = VM_PAGE_NULL;
757 	vm_page_t               guard_right = VM_PAGE_NULL;
758 	vm_page_t               wired_page_list = VM_PAGE_NULL;
759 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
760 	bool                    skip_guards;
761 	kmem_return_t           kmr = { };
762 
763 	assert(kernel_map && map->pmap == kernel_pmap);
764 
765 #if DEBUG || DEVELOPMENT
766 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_START,
767 	    size, 0, 0, 0);
768 #endif
769 
770 
771 	if (size == 0 ||
772 	    (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) ||
773 	    (size < __kmem_guard_size(ANYF(flags)))) {
774 		__kmem_invalid_size_panic(map, size, flags);
775 	}
776 
777 	/*
778 	 * limit the size of a single extent of wired memory
779 	 * to try and limit the damage to the system if
780 	 * too many pages get wired down
781 	 * limit raised to 2GB with 128GB max physical limit,
782 	 * but scaled by installed memory above this
783 	 *
784 	 * Note: kmem_alloc_contig_guard() is immune to this check.
785 	 */
786 	if (__improbable(!(flags & (KMA_VAONLY | KMA_PAGEABLE)) &&
787 	    alloc_pages == NULL &&
788 	    size > MAX(1ULL << 31, sane_size / 64))) {
789 		kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
790 		goto out_error;
791 	}
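	/*
	 * Example: with 128GB of installed memory, sane_size / 64 is 2GB and
	 * matches the 1ULL << 31 floor; on a 256GB machine the per-extent
	 * limit scales up to 4GB.
	 */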
792 
793 #if 136275805
794 	/*
795 	 * XXX: Redundantly check the mapping size here so that failure stack traces
796 	 *      are more useful. This has no functional value but is helpful because
797 	 *      telemetry traps can currently only capture the last five calls and
798 	 *      so we want to trap as shallow as possible in a select few cases
799 	 *      where we anticipate issues.
800 	 *
801 	 *      When telemetry collection is complete, this will be removed.
802 	 */
803 	if (__improbable(!vm_map_is_map_size_valid(
804 		    kernel_map, size, flags & KMA_NOSOFTLIMIT))) {
805 		kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
806 		goto out_error;
807 	}
808 #endif /* 136275805 */
809 
810 	/*
811 	 * Guard pages:
812 	 *
813 	 * Guard pages are implemented as fictitious pages.
814 	 *
815 	 * However, some maps, and some objects are known
816 	 * to manage their memory explicitly, and do not need
817 	 * those to be materialized, which saves memory.
818 	 *
819 	 * By placing guard pages on either end of a stack,
820 	 * they can help detect cases where a thread walks
821 	 * off either end of its stack.
822 	 *
823 	 * They are allocated and set up here and attempts
824 	 * to access those pages are trapped in vm_fault_page().
825 	 *
826 	 * The map_size we were passed may include extra space for
827 	 * guard pages. fill_size represents the actual size to populate.
828 	 * Similarly, fill_start indicates where the actual pages
829 	 * will begin in the range.
830 	 */
831 
832 	map_size   = round_page(size);
833 	fill_start = 0;
834 	fill_size  = map_size - __kmem_guard_size(ANYF(flags));
835 
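	/*
	 * Worked example (sketch): a 4-page request with both KMA_GUARD_FIRST
	 * and KMA_GUARD_LAST keeps map_size at 4 pages, sets fill_size to
	 * 2 pages here, and advances fill_start to PAGE_SIZE below, so only
	 * the two interior pages receive backing memory while the first and
	 * last pages remain guards.
	 */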
836 #if KASAN_CLASSIC
837 	if (flags & KMA_KASAN_GUARD) {
838 		assert((flags & (KMA_GUARD_FIRST | KMA_GUARD_LAST)) == 0);
839 		flags |= KMA_GUARD_FIRST | KMA_GUARD_LAST;
840 		delta     = ptoa(2);
841 		map_size += delta;
842 	}
843 #else
844 	(void)delta;
845 #endif /* KASAN_CLASSIC */
846 
847 	skip_guards = (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) ||
848 	    map->never_faults;
849 
850 	if (flags & KMA_GUARD_FIRST) {
851 		vmk_flags.vmkf_guard_before = true;
852 		fill_start += PAGE_SIZE;
853 	}
854 	if (flags & KMA_NOSOFTLIMIT) {
855 		vmk_flags.vmkf_no_soft_limit = true;
856 	}
857 	if ((flags & KMA_GUARD_FIRST) && !skip_guards) {
858 		guard_left = vm_page_create_guard((flags & KMA_NOPAGEWAIT) == 0);
859 		if (__improbable(guard_left == VM_PAGE_NULL)) {
860 			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
861 			goto out_error;
862 		}
863 	}
864 	if ((flags & KMA_GUARD_LAST) && !skip_guards) {
865 		guard_right = vm_page_create_guard((flags & KMA_NOPAGEWAIT) == 0);
866 		if (__improbable(guard_right == VM_PAGE_NULL)) {
867 			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
868 			goto out_error;
869 		}
870 	}
871 
872 	if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
873 		if (alloc_pages) {
874 			kmr.kmr_return = alloc_pages(fill_size, flags,
875 			    &wired_page_list);
876 		} else {
877 			kmr.kmr_return = vm_page_alloc_list(atop(fill_size), flags,
878 			    &wired_page_list);
879 		}
880 		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
881 			goto out_error;
882 		}
883 	}
884 
885 	/*
886 	 *	Allocate a new object (if necessary).  We must do this before
887 	 *	locking the map, or risk deadlock with the default pager.
888 	 */
889 	if (flags & KMA_KOBJECT) {
890 		{
891 			object = kernel_object_default;
892 		}
893 		vm_object_reference(object);
894 	} else if (flags & KMA_COMPRESSOR) {
895 		object = compressor_object;
896 		vm_object_reference(object);
897 	} else {
898 		object = vm_object_allocate(map_size, map->serial_id);
899 		vm_object_lock(object);
900 		vm_object_set_size(object, map_size, size);
901 		/* stabilize the object to prevent shadowing */
902 		object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
903 		VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
904 		vm_object_unlock(object);
905 	}
906 
907 	if (flags & KMA_LAST_FREE) {
908 		vmk_flags.vmkf_last_free = true;
909 	}
910 	if (flags & KMA_PERMANENT) {
911 		vmk_flags.vmf_permanent = true;
912 	}
913 	kmem_apply_security_policy(map, flags, guard, map_size, mask, &vmk_flags,
914 	    false);
915 
916 	kmr.kmr_return = vm_map_find_space(map, 0, map_size, mask,
917 	    vmk_flags, &entry);
918 	if (__improbable(KERN_SUCCESS != kmr.kmr_return)) {
919 		vm_object_deallocate(object);
920 		goto out_error;
921 	}
922 
923 	map_addr = entry->vme_start;
924 	VME_OBJECT_SET(entry, object, guard.kmg_atomic, guard.kmg_context);
925 	VME_ALIAS_SET(entry, guard.kmg_tag);
926 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
927 		VME_OFFSET_SET(entry, map_addr);
928 	}
929 
930 #if KASAN
931 	if ((flags & KMA_KOBJECT) && guard.kmg_atomic) {
932 		entry->vme_object_or_delta = (-size & PAGE_MASK) + delta;
933 	}
934 #endif /* KASAN */
935 
936 	if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE))) {
937 		entry->wired_count = 1;
938 		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
939 	}
940 
941 	if (guard_left || guard_right || wired_page_list) {
942 		vm_object_offset_t offset = 0ull;
943 
944 		vm_object_lock(object);
945 		vm_map_unlock(map);
946 
947 		if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
948 			offset = map_addr;
949 		}
950 
951 		if (guard_left) {
952 			vm_page_insert(guard_left, object, offset);
953 			guard_left->vmp_busy = FALSE;
954 			guard_left = VM_PAGE_NULL;
955 		}
956 
957 		if (guard_right) {
958 			vm_page_insert(guard_right, object,
959 			    offset + fill_start + fill_size);
960 			guard_right->vmp_busy = FALSE;
961 			guard_right = VM_PAGE_NULL;
962 		}
963 
964 		if (wired_page_list) {
965 			kernel_memory_populate_object_and_unlock(object,
966 			    map_addr + fill_start, offset + fill_start, fill_size,
967 			    wired_page_list, flags, guard.kmg_tag, VM_PROT_DEFAULT,
968 			    __kmem_mapping_type(ANYF(flags)));
969 		} else {
970 			vm_object_unlock(object);
971 		}
972 	} else {
973 		vm_map_unlock(map);
974 	}
975 
976 	/*
977 	 * now that the pages are wired, we no longer have to fear coalesce
978 	 */
979 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
980 		vm_map_simplify(map, map_addr);
981 	}
982 
983 #if DEBUG || DEVELOPMENT
984 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
985 	    atop(fill_size), 0, 0, 0);
986 #endif /* DEBUG || DEVELOPMENT */
987 	kmr.kmr_address = CAST_DOWN(vm_offset_t, map_addr);
988 
989 #if KASAN
990 	if (flags & (KMA_KASAN_GUARD | KMA_PAGEABLE)) {
991 		/*
992 		 * We need to allow the range for pageable memory,
993 		 * or faulting will not be allowed.
994 		 */
995 		kasan_notify_address(map_addr, map_size);
996 	}
997 #endif /* KASAN */
998 #if KASAN_CLASSIC
999 	if (flags & KMA_KASAN_GUARD) {
1000 		kmr.kmr_address += PAGE_SIZE;
1001 		kasan_alloc_large(kmr.kmr_address, size);
1002 	}
1003 #endif /* KASAN_CLASSIC */
1004 #if CONFIG_KERNEL_TAGGING
1005 	if (!(flags & KMA_VAONLY) && (flags & KMA_TAG)) {
1006 		kmr.kmr_ptr = vm_memtag_generate_and_store_tag((caddr_t)kmr.kmr_address + fill_start, fill_size);
1007 		kmr.kmr_ptr = (caddr_t)kmr.kmr_ptr - fill_start;
1008 #if KASAN_TBI
1009 		kasan_tbi_retag_unused_space(kmr.kmr_ptr, map_size, size);
1010 #endif /* KASAN_TBI */
1011 	}
1012 #endif /* CONFIG_KERNEL_TAGGING */
1013 	return kmr;
1014 
1015 out_error:
1016 	if (flags & KMA_NOFAIL) {
1017 		__kmem_failed_panic(map, size, flags, kmr.kmr_return, "alloc");
1018 	}
1019 	if (guard_left) {
1020 		guard_left->vmp_snext = wired_page_list;
1021 		wired_page_list = guard_left;
1022 	}
1023 	if (guard_right) {
1024 		guard_right->vmp_snext = wired_page_list;
1025 		wired_page_list = guard_right;
1026 	}
1027 	if (wired_page_list) {
1028 		vm_page_free_list(wired_page_list, FALSE);
1029 	}
1030 
1031 #if DEBUG || DEVELOPMENT
1032 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
1033 	    0, 0, 0, 0);
1034 #endif /* DEBUG || DEVELOPMENT */
1035 
1036 	return kmr;
1037 }
1038 
1039 kmem_return_t
1040 kmem_alloc_guard(
1041 	vm_map_t        map,
1042 	vm_size_t       size,
1043 	vm_offset_t     mask,
1044 	kma_flags_t     flags,
1045 	kmem_guard_t    guard)
1046 {
1047 	return kmem_alloc_guard_internal(map, size, mask, flags, guard, NULL);
1048 }
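/*
 * Usage sketch (illustrative only; the tag and guard context below are
 * arbitrary assumptions, not values taken from this file):
 *
 *	kmem_guard_t guard = {
 *		.kmg_atomic  = true,
 *		.kmg_tag     = VM_KERN_MEMORY_OSFMK,
 *		.kmg_context = 0x1234,
 *	};
 *	kmem_return_t kmr;
 *
 *	kmr = kmem_alloc_guard(kernel_map, 3 * PAGE_SIZE, 0,
 *	    KMA_KOBJECT | KMA_GUARD_LAST, guard);
 *	if (kmr.kmr_return == KERN_SUCCESS) {
 *		... 2 usable pages at kmr.kmr_address, the 3rd is a guard ...
 *	}
 *
 * The same atomicity, bounds and context must be presented again when
 * the entry is torn down, or the guard validation above panics.
 */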
1049 
1050 kmem_return_t
1051 kmem_alloc_contig_guard(
1052 	vm_map_t                map,
1053 	vm_size_t               size,
1054 	vm_offset_t             mask,
1055 	ppnum_t                 max_pnum,
1056 	ppnum_t                 pnum_mask,
1057 	kma_flags_t             flags,
1058 	kmem_guard_t            guard)
1059 {
1060 	__auto_type alloc_pages = ^(vm_size_t fill_size, kma_flags_t kma_flags, vm_page_t *pages) {
1061 		return cpm_allocate(fill_size, pages, max_pnum, pnum_mask, FALSE, kma_flags);
1062 	};
1063 
1064 	return kmem_alloc_guard_internal(map, size, mask, flags, guard, alloc_pages);
1065 }
1066 
1067 kmem_return_t
1068 kmem_suballoc(
1069 	vm_map_t                parent,
1070 	mach_vm_offset_t       *addr,
1071 	vm_size_t               size,
1072 	vm_map_create_options_t vmc_options,
1073 	int                     vm_flags,
1074 	kms_flags_t             flags,
1075 	vm_tag_t                tag)
1076 {
1077 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
1078 	vm_map_offset_t map_addr = 0;
1079 	kmem_return_t kmr = { };
1080 	vm_map_t map;
1081 
1082 	assert(page_aligned(size));
1083 	assert(parent->pmap == kernel_pmap);
1084 
1085 	vm_map_kernel_flags_set_vmflags(&vmk_flags, vm_flags, tag);
1086 
1087 	if (parent == kernel_map) {
1088 		assert(vmk_flags.vmf_overwrite || (flags & KMS_DATA));
1089 	}
1090 
1091 	if (vmk_flags.vmf_fixed) {
1092 		map_addr = trunc_page(*addr);
1093 	}
1094 
1095 	pmap_reference(vm_map_pmap(parent));
1096 	map = vm_map_create_options(vm_map_pmap(parent), 0, size, vmc_options);
1097 
1098 	/*
1099 	 * 1. vm_map_enter() will consume one ref on success.
1100 	 *
1101 	 * 2. make the entry atomic as kernel submaps should never be split.
1102 	 *
1103 	 * 3. instruct vm_map_enter() that it is a fresh submap
1104 	 *    that needs to be taught its bounds as it inserted.
1105 	 */
1106 	vm_map_reference(map);
1107 
1108 	vmk_flags.vmkf_submap = true;
1109 	if ((flags & KMS_DATA) == 0) {
1110 		/* FIXME: IOKit submaps get fragmented and can't be atomic */
1111 		vmk_flags.vmkf_submap_atomic = true;
1112 	}
1113 	vmk_flags.vmkf_submap_adjust = true;
1114 	if (flags & KMS_LAST_FREE) {
1115 		vmk_flags.vmkf_last_free = true;
1116 	}
1117 	if (flags & KMS_PERMANENT) {
1118 		vmk_flags.vmf_permanent = true;
1119 	}
1120 	if (flags & KMS_DATA) {
1121 		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
1122 	}
1123 	if (flags & KMS_NOSOFTLIMIT) {
1124 		vmk_flags.vmkf_no_soft_limit = true;
1125 	}
1126 
1127 	kmr.kmr_return = vm_map_enter(parent, &map_addr, size, 0,
1128 	    vmk_flags, (vm_object_t)map, 0, FALSE,
1129 	    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
1130 
1131 	if (kmr.kmr_return != KERN_SUCCESS) {
1132 		if (flags & KMS_NOFAIL) {
1133 			panic("kmem_suballoc(map=%p, size=%zd) failed with %d",
1134 			    parent, size, kmr.kmr_return);
1135 		}
1136 		assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1137 		vm_map_deallocate(map);
1138 		vm_map_deallocate(map); /* also removes ref to pmap */
1139 		return kmr;
1140 	}
1141 
1142 	/*
1143 	 * For kmem_suballocs that register a claim and are assigned a range, ensure
1144 	 * that the exact same range is returned.
1145 	 */
1146 	if (*addr != 0 && parent == kernel_map &&
1147 	    startup_phase > STARTUP_SUB_KMEM) {
1148 		assert(CAST_DOWN(vm_offset_t, map_addr) == *addr);
1149 	} else {
1150 		*addr = map_addr;
1151 	}
1152 
1153 	kmr.kmr_submap = map;
1154 	return kmr;
1155 }
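/*
 * Usage sketch (illustrative; the size, tag and create options are
 * assumptions): carving a data submap out of the kernel map and keeping
 * the returned vm_map_t for later mappings.
 *
 *	mach_vm_offset_t addr = 0;
 *	kmem_return_t kmr;
 *
 *	kmr = kmem_suballoc(kernel_map, &addr, 64 << 20,
 *	    VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
 *	    KMS_DATA | KMS_PERMANENT, VM_KERN_MEMORY_IOKIT);
 *	if (kmr.kmr_return == KERN_SUCCESS) {
 *		vm_map_t submap = kmr.kmr_submap;
 *		...
 *	}
 */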
1156 
1157 /*
1158  *	kmem_alloc:
1159  *
1160  *	Allocate wired-down memory in the kernel's address map
1161  *	or a submap.  The memory is not zero-filled.
1162  */
1163 
1164 __exported kern_return_t
1165 kmem_alloc_external(
1166 	vm_map_t        map,
1167 	vm_offset_t     *addrp,
1168 	vm_size_t       size);
1169 kern_return_t
1170 kmem_alloc_external(
1171 	vm_map_t        map,
1172 	vm_offset_t     *addrp,
1173 	vm_size_t       size)
1174 {
1175 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1176 		return kmem_alloc(map, addrp, size, KMA_NONE, vm_tag_bt());
1177 	}
1178 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1179 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1180 }
1181 
1182 
1183 /*
1184  *	kmem_alloc_kobject:
1185  *
1186  *	Allocate wired-down memory in the kernel's address map
1187  *	or a submap.  The memory is not zero-filled.
1188  *
1189  *	The memory is allocated in the kernel_object.
1190  *	It may not be copied with vm_map_copy, and
1191  *	it may not be reallocated with kmem_realloc.
1192  */
1193 
1194 __exported kern_return_t
1195 kmem_alloc_kobject_external(
1196 	vm_map_t        map,
1197 	vm_offset_t     *addrp,
1198 	vm_size_t       size);
1199 kern_return_t
1200 kmem_alloc_kobject_external(
1201 	vm_map_t        map,
1202 	vm_offset_t     *addrp,
1203 	vm_size_t       size)
1204 {
1205 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1206 		return kmem_alloc(map, addrp, size, KMA_KOBJECT, vm_tag_bt());
1207 	}
1208 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1209 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1210 }
1211 
1212 /*
1213  *	kmem_alloc_pageable:
1214  *
1215  *	Allocate pageable memory in the kernel's address map.
1216  */
1217 
1218 __exported kern_return_t
1219 kmem_alloc_pageable_external(
1220 	vm_map_t        map,
1221 	vm_offset_t     *addrp,
1222 	vm_size_t       size);
1223 kern_return_t
1224 kmem_alloc_pageable_external(
1225 	vm_map_t        map,
1226 	vm_offset_t     *addrp,
1227 	vm_size_t       size)
1228 {
1229 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1230 		return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA, vm_tag_bt());
1231 	}
1232 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1233 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1234 }
1235 
1236 static __attribute__((always_inline, warn_unused_result))
1237 kern_return_t
1238 mach_vm_allocate_kernel_sanitize(
1239 	vm_map_t                map,
1240 	mach_vm_offset_ut       addr_u,
1241 	mach_vm_size_ut         size_u,
1242 	vm_map_kernel_flags_t   vmk_flags,
1243 	vm_map_offset_t        *map_addr,
1244 	vm_map_size_t          *map_size)
1245 {
1246 	kern_return_t   result;
1247 	vm_map_offset_t map_end;
1248 
1249 	if (vmk_flags.vmf_fixed) {
1250 		result = vm_sanitize_addr_size(addr_u, size_u,
1251 		    VM_SANITIZE_CALLER_VM_ALLOCATE_FIXED,
1252 		    map,
1253 		    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS | VM_SANITIZE_FLAGS_REALIGN_START,
1254 		    map_addr, &map_end, map_size);
1255 		if (__improbable(result != KERN_SUCCESS)) {
1256 			return result;
1257 		}
1258 	} else {
1259 		*map_addr = 0;
1260 		result = vm_sanitize_size(0, size_u,
1261 		    VM_SANITIZE_CALLER_VM_ALLOCATE_ANYWHERE, map,
1262 		    VM_SANITIZE_FLAGS_SIZE_ZERO_SUCCEEDS,
1263 		    map_size);
1264 		if (__improbable(result != KERN_SUCCESS)) {
1265 			return result;
1266 		}
1267 	}
1268 
1269 	return KERN_SUCCESS;
1270 }
1271 
1272 kern_return_t
1273 mach_vm_allocate_kernel(
1274 	vm_map_t                map,
1275 	mach_vm_offset_ut      *addr_u,
1276 	mach_vm_size_ut         size_u,
1277 	vm_map_kernel_flags_t   vmk_flags)
1278 {
1279 	vm_map_offset_t map_addr;
1280 	vm_map_size_t   map_size;
1281 	kern_return_t   result;
1282 
1283 	if (map == VM_MAP_NULL) {
1284 		ktriage_record(thread_tid(current_thread()),
1285 		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
1286 		    KDBG_TRIAGE_RESERVED,
1287 		    KDBG_TRIAGE_VM_ALLOCATE_KERNEL_BADMAP_ERROR),
1288 		    KERN_INVALID_ARGUMENT /* arg */);
1289 		return KERN_INVALID_ARGUMENT;
1290 	}
1291 
1292 	if (!vm_map_kernel_flags_check_vm_and_kflags(vmk_flags,
1293 	    VM_FLAGS_USER_ALLOCATE)) {
1294 		return KERN_INVALID_ARGUMENT;
1295 	}
1296 
1297 	result = mach_vm_allocate_kernel_sanitize(map,
1298 	    *addr_u,
1299 	    size_u,
1300 	    vmk_flags,
1301 	    &map_addr,
1302 	    &map_size);
1303 	if (__improbable(result != KERN_SUCCESS)) {
1304 		result = vm_sanitize_get_kr(result);
1305 		if (result == KERN_SUCCESS) {
1306 			*addr_u = vm_sanitize_wrap_addr(0);
1307 		} else {
1308 			ktriage_record(thread_tid(current_thread()),
1309 			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
1310 			    KDBG_TRIAGE_RESERVED,
1311 			    KDBG_TRIAGE_VM_ALLOCATE_KERNEL_BADSIZE_ERROR),
1312 			    KERN_INVALID_ARGUMENT /* arg */);
1313 		}
1314 		return result;
1315 	}
1316 
1317 	vm_map_kernel_flags_update_range_id(&vmk_flags, map, map_size);
1318 
1319 	result = vm_map_enter(
1320 		map,
1321 		&map_addr,
1322 		map_size,
1323 		(vm_map_offset_t)0,
1324 		vmk_flags,
1325 		VM_OBJECT_NULL,
1326 		(vm_object_offset_t)0,
1327 		FALSE,
1328 		VM_PROT_DEFAULT,
1329 		VM_PROT_ALL,
1330 		VM_INHERIT_DEFAULT);
1331 
1332 	if (result == KERN_SUCCESS) {
1333 #if KASAN
1334 		if (map->pmap == kernel_pmap) {
1335 			kasan_notify_address(map_addr, map_size);
1336 		}
1337 #endif
1338 		*addr_u = vm_sanitize_wrap_addr(map_addr);
1339 	} else {
1340 		ktriage_record(thread_tid(current_thread()),
1341 		    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
1342 		    KDBG_TRIAGE_RESERVED,
1343 		    KDBG_TRIAGE_VM_ALLOCATE_KERNEL_VMMAPENTER_ERROR),
1344 		    result /* arg */);
1345 	}
1346 	return result;
1347 }
1348 
1349 #pragma mark population
1350 
1351 static void
1352 kernel_memory_populate_pmap_enter(
1353 	vm_object_t             object,
1354 	vm_address_t            addr,
1355 	vm_object_offset_t      offset,
1356 	vm_page_t               mem,
1357 	vm_prot_t               prot,
1358 	int                     pe_flags,
1359 	pmap_mapping_type_t     mapping_type)
1360 {
1361 	kern_return_t   pe_result;
1362 	int             pe_options;
1363 
1364 	if (VMP_ERROR_GET(mem)) {
1365 		panic("VM page %p should not have an error", mem);
1366 	}
1367 
1368 	pe_options = PMAP_OPTIONS_NOWAIT;
1369 	if (object->internal) {
1370 		pe_options |= PMAP_OPTIONS_INTERNAL;
1371 	}
1372 	if (mem->vmp_reusable || object->all_reusable) {
1373 		pe_options |= PMAP_OPTIONS_REUSABLE;
1374 	}
1375 
1376 	pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1377 	    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1378 	    pe_flags, /* wired */ TRUE, pe_options, NULL, mapping_type);
1379 
1380 	if (pe_result == KERN_RESOURCE_SHORTAGE) {
1381 		vm_object_unlock(object);
1382 
1383 		pe_options &= ~PMAP_OPTIONS_NOWAIT;
1384 
1385 		pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1386 		    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1387 		    pe_flags, /* wired */ TRUE, pe_options, NULL, mapping_type);
1388 
1389 		vm_object_lock(object);
1390 	}
1391 
1392 	assert(pe_result == KERN_SUCCESS);
1393 }
1394 
1395 void
1396 kernel_memory_populate_object_and_unlock(
1397 	vm_object_t             object, /* must be locked */
1398 	vm_address_t            addr,
1399 	vm_offset_t             offset,
1400 	vm_size_t               size,
1401 	vm_page_t               page_list,
1402 	kma_flags_t             flags,
1403 	vm_tag_t                tag,
1404 	vm_prot_t               prot,
1405 	pmap_mapping_type_t     mapping_type)
1406 {
1407 	vm_page_t       mem;
1408 	int             pe_flags;
1409 	bool            gobbled_list = page_list && page_list->vmp_gobbled;
1410 
1411 	assert(((flags & KMA_KOBJECT) != 0) == (is_kernel_object(object) != 0));
1412 	assert3u((bool)(flags & KMA_COMPRESSOR), ==, object == compressor_object);
1413 
1414 
1415 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
1416 		assert3u(offset, ==, addr);
1417 	} else {
1418 		/*
1419 		 * kernel_memory_populate_pmap_enter() might drop the object
1420 		 * lock, and the caller might not own a reference anymore
1421 		 * and rely on holding the vm object lock for liveness.
1422 		 */
1423 		vm_object_reference_locked(object);
1424 	}
1425 
1426 	if (flags & KMA_KSTACK) {
1427 		pe_flags = VM_MEM_STACK;
1428 	} else {
1429 		pe_flags = 0;
1430 	}
1431 
1432 
1433 	for (vm_object_offset_t pg_offset = 0;
1434 	    pg_offset < size;
1435 	    pg_offset += PAGE_SIZE_64) {
1436 		if (page_list == NULL) {
1437 			panic("%s: page_list too short", __func__);
1438 		}
1439 
1440 		mem = page_list;
1441 		page_list = mem->vmp_snext;
1442 		mem->vmp_snext = NULL;
1443 
1444 		assert(mem->vmp_wire_count == 0);
1445 		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
1446 		assert(vm_page_is_canonical(mem));
1447 
1448 		if (flags & KMA_COMPRESSOR) {
1449 			mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
1450 			/*
1451 			 * Background processes doing I/O accounting can call
1452 			 * into NVME driver to do some work which results in
1453 			 * an allocation here and so we want to make sure
1454 			 * that the pages used by compressor, regardless of
1455 			 * process context, are never on the special Q.
1456 			 */
1457 			mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
1458 
1459 			vm_page_insert(mem, object, offset + pg_offset);
1460 		} else {
1461 			mem->vmp_q_state = VM_PAGE_IS_WIRED;
1462 			mem->vmp_wire_count = 1;
1463 
1464 
1465 			vm_page_insert_wired(mem, object, offset + pg_offset, tag);
1466 		}
1467 
1468 		mem->vmp_gobbled = false;
1469 		mem->vmp_busy = false;
1470 		mem->vmp_pmapped = true;
1471 		mem->vmp_wpmapped = true;
1472 
1473 		/*
1474 		 * Manual PMAP_ENTER_OPTIONS() with shortcuts
1475 		 * for the kernel and compressor objects.
1476 		 */
1477 		kernel_memory_populate_pmap_enter(object, addr, pg_offset,
1478 		    mem, prot, pe_flags, mapping_type);
1479 
1480 		if (flags & KMA_NOENCRYPT) {
1481 			pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
1482 		}
1483 	}
1484 
1485 	if (page_list) {
1486 		panic("%s: page_list too long", __func__);
1487 	}
1488 
1489 	vm_object_unlock(object);
1490 	if ((flags & (KMA_KOBJECT | KMA_COMPRESSOR)) == 0) {
1491 		vm_object_deallocate(object);
1492 	}
1493 
1494 	/*
1495 	 * Update the accounting:
1496 	 * - the compressor "wired" pages don't really count as wired
1497 	 * - kmem_alloc_contig_guard() gives gobbled pages,
1498 	 *   which already count as wired but need to be ungobbled.
1499 	 */
1500 	if (gobbled_list) {
1501 		vm_page_lockspin_queues();
1502 		if (flags & KMA_COMPRESSOR) {
1503 			vm_page_wire_count -= atop(size);
1504 		}
1505 		vm_page_gobble_count -= atop(size);
1506 		vm_page_unlock_queues();
1507 	} else if ((flags & KMA_COMPRESSOR) == 0) {
1508 		vm_page_lockspin_queues();
1509 		vm_page_wire_count += atop(size);
1510 		vm_page_unlock_queues();
1511 	}
1512 
1513 	if (flags & KMA_KOBJECT) {
1514 		/* vm_page_insert_wired() handles regular objects already */
1515 		vm_tag_update_size(tag, size, NULL);
1516 	}
1517 
1518 #if KASAN
1519 	if (flags & KMA_COMPRESSOR) {
1520 		kasan_notify_address_nopoison(addr, size);
1521 	} else {
1522 		kasan_notify_address(addr, size);
1523 	}
1524 #endif /* KASAN */
1525 }
1526 
1527 
1528 kern_return_t
1529 kernel_memory_populate(
1530 	vm_offset_t     addr,
1531 	vm_size_t       size,
1532 	kma_flags_t     flags,
1533 	vm_tag_t        tag)
1534 {
1535 	kern_return_t   kr = KERN_SUCCESS;
1536 	vm_page_t       page_list = NULL;
1537 	vm_size_t       page_count = atop_64(size);
1538 	vm_object_t     object = __kmem_object(ANYF(flags));
1539 
1540 #if DEBUG || DEVELOPMENT
1541 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_START,
1542 	    size, 0, 0, 0);
1543 #endif /* DEBUG || DEVELOPMENT */
1544 
1545 
1546 	kr = vm_page_alloc_list(page_count, flags, &page_list);
1547 	if (kr == KERN_SUCCESS) {
1548 		vm_object_lock(object);
1549 		kernel_memory_populate_object_and_unlock(object, addr,
1550 		    addr, size, page_list, flags, tag, VM_PROT_DEFAULT,
1551 		    __kmem_mapping_type(ANYF(flags)));
1552 	}
1553 
1554 #if DEBUG || DEVELOPMENT
1555 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
1556 	    page_count, 0, 0, 0);
1557 #endif /* DEBUG || DEVELOPMENT */
1558 	return kr;
1559 }
1560 
1561 void
1562 kernel_memory_depopulate(
1563 	vm_offset_t        addr,
1564 	vm_size_t          size,
1565 	kma_flags_t        flags,
1566 	vm_tag_t           tag)
1567 {
1568 	vm_object_t        object = __kmem_object(ANYF(flags));
1569 	vm_object_offset_t offset = addr;
1570 	vm_page_t          mem;
1571 	vm_page_t          local_freeq = NULL;
1572 	unsigned int       pages_unwired = 0;
1573 
1574 	vm_object_lock(object);
1575 
1576 	pmap_protect(kernel_pmap, offset, offset + size, VM_PROT_NONE);
1577 
1578 	for (vm_object_offset_t pg_offset = 0;
1579 	    pg_offset < size;
1580 	    pg_offset += PAGE_SIZE_64) {
1581 		mem = vm_page_lookup(object, offset + pg_offset);
1582 
1583 		assert(mem);
1584 
1585 		if (flags & KMA_COMPRESSOR) {
1586 			assert(mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
1587 		} else {
1588 			assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
1589 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
1590 			pages_unwired++;
1591 		}
1592 
1593 		mem->vmp_busy = TRUE;
1594 
1595 		assert(mem->vmp_tabled);
1596 		vm_page_remove(mem, TRUE);
1597 		assert(mem->vmp_busy);
1598 
1599 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
1600 
1601 		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
1602 		mem->vmp_snext = local_freeq;
1603 		local_freeq = mem;
1604 	}
1605 
1606 	vm_object_unlock(object);
1607 
1608 	vm_page_free_list(local_freeq, TRUE);
1609 
1610 	if (!(flags & KMA_COMPRESSOR)) {
1611 		vm_page_lockspin_queues();
1612 		vm_page_wire_count -= pages_unwired;
1613 		vm_page_unlock_queues();
1614 	}
1615 
1616 	if (flags & KMA_KOBJECT) {
1617 		/* vm_page_remove() handles regular objects already */
1618 		vm_tag_update_size(tag, -ptoa_64(pages_unwired), NULL);
1619 	}
1620 }
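/*
 * Usage sketch (illustrative; the flags and tag are assumptions):
 * kernel_memory_populate() and kernel_memory_depopulate() pair up to
 * back and then unback one page of a kernel-object VA range that was
 * reserved with KMA_VAONLY:
 *
 *	if (kernel_memory_populate(addr, PAGE_SIZE,
 *	        KMA_KOBJECT, VM_KERN_MEMORY_OSFMK) == KERN_SUCCESS) {
 *		... the page at addr is now wired and mapped ...
 *		kernel_memory_depopulate(addr, PAGE_SIZE,
 *		    KMA_KOBJECT, VM_KERN_MEMORY_OSFMK);
 *	}
 */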
1621 
1622 #pragma mark reallocation
1623 
1624 __abortlike
1625 static void
1626 __kmem_realloc_invalid_object_size_panic(
1627 	vm_map_t                map,
1628 	vm_address_t            address,
1629 	vm_size_t               size,
1630 	vm_map_entry_t          entry)
1631 {
1632 	vm_object_t object  = VME_OBJECT(entry);
1633 	vm_size_t   objsize = __kmem_entry_orig_size(entry);
1634 
1635 	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
1636 	    "object %p has unexpected size %ld",
1637 	    map, (void *)address, (size_t)size, entry, object, objsize);
1638 }
1639 
1640 __abortlike
1641 static void
1642 __kmem_realloc_invalid_pager_panic(
1643 	vm_map_t                map,
1644 	vm_address_t            address,
1645 	vm_size_t               size,
1646 	vm_map_entry_t          entry)
1647 {
1648 	vm_object_t object     = VME_OBJECT(entry);
1649 	memory_object_t pager  = object->pager;
1650 	bool pager_created     = object->pager_created;
1651 	bool pager_initialized = object->pager_initialized;
1652 	bool pager_ready       = object->pager_ready;
1653 
1654 	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
1655 	    "object %p has unexpected pager %p (%d,%d,%d)",
1656 	    map, (void *)address, (size_t)size, entry, object,
1657 	    pager, pager_created, pager_initialized, pager_ready);
1658 }
1659 
1660 static kmem_return_t
1661 kmem_realloc_shrink_guard(
1662 	vm_map_t                map,
1663 	vm_offset_t             req_oldaddr,
1664 	vm_size_t               req_oldsize,
1665 	vm_size_t               req_newsize,
1666 	kmr_flags_t             flags,
1667 	kmem_guard_t            guard,
1668 	vm_map_entry_t          entry)
1669 {
1670 	vmr_flags_t             vmr_flags = VM_MAP_REMOVE_KUNWIRE;
1671 	vm_object_t             object;
1672 	vm_offset_t             delta = 0;
1673 	kmem_return_t           kmr;
1674 	bool                    was_atomic;
1675 	vm_size_t               oldsize = round_page(req_oldsize);
1676 	vm_size_t               newsize = round_page(req_newsize);
1677 	vm_address_t            oldaddr = req_oldaddr;
1678 
1679 #if KASAN_CLASSIC
1680 	if (flags & KMR_KASAN_GUARD) {
1681 		assert((flags & (KMR_GUARD_FIRST | KMR_GUARD_LAST)) == 0);
1682 		flags   |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1683 		oldaddr -= PAGE_SIZE;
1684 		delta    = ptoa(2);
1685 		oldsize += delta;
1686 		newsize += delta;
1687 	}
1688 #endif /* KASAN_CLASSIC */
1689 
1690 	if (flags & KMR_TAG) {
1691 		oldaddr = vm_memtag_canonicalize_kernel(req_oldaddr);
1692 	}
1693 
1694 	vm_map_lock_assert_exclusive(map);
1695 
1696 	if ((flags & KMR_KOBJECT) == 0) {
1697 		object = VME_OBJECT(entry);
1698 		vm_object_reference(object);
1699 	}
1700 
1701 	/*
1702 	 *	Shrinking an atomic entry starts with splitting it,
1703 	 *	and removing the second half.
1704 	 */
1705 	was_atomic = entry->vme_atomic;
1706 	entry->vme_atomic = false;
1707 	vm_map_clip_end(map, entry, entry->vme_start + newsize);
1708 	entry->vme_atomic = was_atomic;
1709 
1710 #if KASAN
1711 	if (entry->vme_kernel_object && was_atomic) {
1712 		entry->vme_object_or_delta = (-req_newsize & PAGE_MASK) + delta;
1713 	}
1714 #if KASAN_CLASSIC
1715 	if (flags & KMR_KASAN_GUARD) {
1716 		kasan_poison_range(oldaddr + newsize, oldsize - newsize,
1717 		    ASAN_VALID);
1718 	}
1719 #endif
1720 #if KASAN_TBI
1721 	if (flags & KMR_TAG) {
1722 		kasan_tbi_mark_free_space((caddr_t)req_oldaddr + newsize, oldsize - newsize);
1723 	}
1724 #endif /* KASAN_TBI */
1725 #endif /* KASAN */
1726 	(void)vm_map_remove_and_unlock(map,
1727 	    oldaddr + newsize, oldaddr + oldsize,
1728 	    vmr_flags, KMEM_GUARD_NONE);
1729 
1730 
1731 	/*
1732 	 *	Lastly, if there are guard pages, deal with them.
1733 	 *
1734 	 *	The kernel object just needs to depopulate,
1735 	 *	regular objects require freeing the last page
1736 	 *	and replacing it with a guard.
1737 	 */
1738 	if (flags & KMR_KOBJECT) {
1739 		if (flags & KMR_GUARD_LAST) {
1740 			kma_flags_t dflags = KMA_KOBJECT;
1741 			kernel_memory_depopulate(oldaddr + newsize - PAGE_SIZE,
1742 			    PAGE_SIZE, dflags, guard.kmg_tag);
1743 		}
1744 	} else {
1745 		vm_page_t guard_right = VM_PAGE_NULL;
1746 		vm_offset_t remove_start = newsize;
1747 
1748 		if (flags & KMR_GUARD_LAST) {
1749 			if (!map->never_faults) {
1750 				guard_right = vm_page_create_guard(true);
1751 			}
1752 			remove_start -= PAGE_SIZE;
1753 		}
1754 
1755 		vm_object_lock(object);
1756 
1757 		if (object->vo_size != oldsize) {
1758 			__kmem_realloc_invalid_object_size_panic(map,
1759 			    req_oldaddr, req_oldsize + delta, entry);
1760 		}
1761 		vm_object_set_size(object, newsize, req_newsize);
1762 
1763 		vm_object_page_remove(object, remove_start, oldsize);
1764 
1765 		if (guard_right) {
1766 			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
1767 			guard_right->vmp_busy = false;
1768 		}
1769 		vm_object_unlock(object);
1770 		vm_object_deallocate(object);
1771 	}
1772 
1773 	kmr.kmr_address = req_oldaddr;
1774 	kmr.kmr_return  = 0;
1775 #if KASAN_CLASSIC
1776 	if (flags & KMA_KASAN_GUARD) {
1777 		kasan_alloc_large(kmr.kmr_address, req_newsize);
1778 	}
1779 #endif /* KASAN_CLASSIC */
1780 #if KASAN_TBI
1781 	if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
1782 		kmr.kmr_ptr = vm_memtag_generate_and_store_tag(kmr.kmr_ptr, req_newsize);
1783 		kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize);
1784 	}
1785 #endif /* KASAN_TBI */
1786 
1787 	return kmr;
1788 }
1789 
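/*
 * kmem_realloc_guard(): grows or shrinks an existing guarded allocation.
 *
 * A zero old address degenerates to kmem_alloc_guard() and a zero new size
 * to kmem_free_guard(); shrinks are handled by kmem_realloc_shrink_guard()
 * above. For a grow, the extra pages and a new range are reserved first,
 * then the old entry is looked up and validated, its pages are moved
 * (kernel object) or kept and supplemented (regular objects) into the new
 * range, the new range is wired, and the old mapping is removed when
 * KMR_FREEOLD is set.
 */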
1790 kmem_return_t
1791 kmem_realloc_guard(
1792 	vm_map_t                map,
1793 	vm_offset_t             req_oldaddr,
1794 	vm_size_t               req_oldsize,
1795 	vm_size_t               req_newsize,
1796 	kmr_flags_t             flags,
1797 	kmem_guard_t            guard)
1798 {
1799 	vm_object_t             object;
1800 	vm_size_t               oldsize;
1801 	vm_size_t               newsize;
1802 	vm_offset_t             delta = 0;
1803 	vm_map_offset_t         oldaddr;
1804 	vm_map_offset_t         newaddr;
1805 	vm_object_offset_t      newoffs;
1806 	vm_map_entry_t          oldentry;
1807 	vm_map_entry_t          newentry;
1808 	vm_page_t               page_list = NULL;
1809 	bool                    needs_wakeup = false;
1810 	kmem_return_t           kmr = { };
1811 	unsigned int            last_timestamp;
1812 	vm_map_kernel_flags_t   vmk_flags = {
1813 		.vmkf_last_free = (bool)(flags & KMR_LAST_FREE),
1814 	};
1815 
1816 	assert(KMEM_REALLOC_FLAGS_VALID(flags));
1817 
1818 	if (!guard.kmg_atomic) {
1819 		if (!(flags & (KMR_DATA | KMR_DATA_SHARED))) {
1820 			__kmem_invalid_arguments_panic("realloc", map, req_oldaddr,
1821 			    req_oldsize, flags);
1822 		}
1823 
1824 		if (flags & KMR_KOBJECT) {
1825 			__kmem_invalid_arguments_panic("realloc", map, req_oldaddr,
1826 			    req_oldsize, flags);
1827 		}
1828 	}
1829 
1830 	if (req_oldaddr == 0ul) {
1831 		return kmem_alloc_guard(map, req_newsize, 0, (kma_flags_t)flags, guard);
1832 	}
1833 
1834 	if (req_newsize == 0ul) {
1835 		kmem_free_guard(map, req_oldaddr, req_oldsize,
1836 		    (kmf_flags_t)flags, guard);
1837 		return kmr;
1838 	}
1839 
1840 	if (req_newsize >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) {
1841 		__kmem_invalid_size_panic(map, req_newsize, flags);
1842 	}
1843 	if (req_newsize < __kmem_guard_size(ANYF(flags))) {
1844 		__kmem_invalid_size_panic(map, req_newsize, flags);
1845 	}
1846 
1847 	oldsize = round_page(req_oldsize);
1848 	newsize = round_page(req_newsize);
1849 	oldaddr = req_oldaddr;
1850 #if KASAN_CLASSIC
1851 	if (flags & KMR_KASAN_GUARD) {
1852 		flags   |= KMR_GUARD_FIRST | KMR_GUARD_LAST;
1853 		oldaddr -= PAGE_SIZE;
1854 		delta    = ptoa(2);
1855 		oldsize += delta;
1856 		newsize += delta;
1857 	}
1858 #endif /* KASAN_CLASSIC */
1859 #if CONFIG_KERNEL_TAGGING
1860 	if (flags & KMR_TAG) {
1861 		vm_memtag_verify_tag(req_oldaddr + __kmem_guard_left(ANYF(flags)));
1862 		oldaddr = vm_memtag_canonicalize_kernel(req_oldaddr);
1863 	}
1864 #endif /* CONFIG_KERNEL_TAGGING */
1865 
1866 #if !KASAN
1867 	/*
1868 	 *	If not on a KASAN variant and no difference in requested size,
1869 	 *  just return.
1870 	 *
1871 	 *	Otherwise we want to validate the size and re-tag for KASAN_TBI.
1872 	 */
1873 	if (oldsize == newsize) {
1874 		kmr.kmr_address = req_oldaddr;
1875 		return kmr;
1876 	}
1877 #endif /* !KASAN */
1878 
1879 	/*
1880 	 *	If we're growing the allocation,
1881 	 *	then reserve the pages we'll need,
1882 	 *	and find a spot for its new place.
1883 	 */
1884 	if (oldsize < newsize) {
1885 #if DEBUG || DEVELOPMENT
1886 		VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1887 		    DBG_VM_KERN_REQUEST, DBG_FUNC_START,
1888 		    newsize - oldsize, 0, 0, 0);
1889 #endif /* DEBUG || DEVELOPMENT */
1890 		kmr.kmr_return = vm_page_alloc_list(atop(newsize - oldsize),
1891 		    (kma_flags_t)flags, &page_list);
1892 		if (kmr.kmr_return == KERN_SUCCESS) {
1893 			kmem_apply_security_policy(map, (kma_flags_t)flags, guard,
1894 			    newsize, 0, &vmk_flags, true);
1895 			kmr.kmr_return = vm_map_find_space(map, 0, newsize, 0,
1896 			    vmk_flags, &newentry);
1897 		}
1898 		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
1899 			if (flags & KMR_REALLOCF) {
1900 				kmem_free_guard(map, req_oldaddr, req_oldsize,
1901 				    flags & (KMF_TAG | KMF_GUARD_FIRST |
1902 				    KMF_GUARD_LAST | KMF_KASAN_GUARD), guard);
1903 			}
1904 			if (page_list) {
1905 				vm_page_free_list(page_list, FALSE);
1906 			}
1907 #if DEBUG || DEVELOPMENT
1908 			VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1909 			    DBG_VM_KERN_REQUEST, DBG_FUNC_END,
1910 			    0, 0, 0, 0);
1911 #endif /* DEBUG || DEVELOPMENT */
1912 			return kmr;
1913 		}
1914 
1915 		/* map is locked */
1916 	} else {
1917 		vm_map_lock(map);
1918 	}
1919 
1920 
1921 	/*
1922 	 *	Locate the entry:
1923 	 *	- wait for it to quiesce.
1924 	 *	- validate its guard,
1925 	 *	- learn its correct tag,
1926 	 */
1927 again:
1928 	if (!vm_map_lookup_entry(map, oldaddr, &oldentry)) {
1929 		__kmem_entry_not_found_panic(map, req_oldaddr);
1930 	}
1931 	if ((flags & KMR_KOBJECT) && oldentry->in_transition) {
1932 		oldentry->needs_wakeup = true;
1933 		vm_map_entry_wait(map, THREAD_UNINT);
1934 		goto again;
1935 	}
1936 	kmem_entry_validate_guard(map, oldentry, oldaddr, oldsize, guard);
1937 	if (!__kmem_entry_validate_object(oldentry, ANYF(flags))) {
1938 		__kmem_entry_validate_object_panic(map, oldentry, ANYF(flags));
1939 	}
1940 	/*
1941 	 *	TODO: We should validate for non atomic entries that the range
1942 	 *	      we are acting on is what we expect here.
1943 	 */
1944 #if KASAN
1945 	if (__kmem_entry_orig_size(oldentry) != req_oldsize) {
1946 		__kmem_realloc_invalid_object_size_panic(map,
1947 		    req_oldaddr, req_oldsize + delta, oldentry);
1948 	}
1949 
1950 	if (oldsize == newsize) {
1951 		kmr.kmr_address = req_oldaddr;
1952 		if (oldentry->vme_kernel_object) {
1953 			oldentry->vme_object_or_delta = delta +
1954 			    (-req_newsize & PAGE_MASK);
1955 		} else {
1956 			object = VME_OBJECT(oldentry);
1957 			vm_object_lock(object);
1958 			vm_object_set_size(object, newsize, req_newsize);
1959 			vm_object_unlock(object);
1960 		}
1961 		vm_map_unlock(map);
1962 
1963 #if KASAN_CLASSIC
1964 		if (flags & KMA_KASAN_GUARD) {
1965 			kasan_alloc_large(kmr.kmr_address, req_newsize);
1966 		}
1967 #endif /* KASAN_CLASSIC */
1968 #if KASAN_TBI
1969 		if ((flags & KMR_TAG) && (flags & KMR_FREEOLD)) {
1970 			kmr.kmr_ptr = vm_memtag_generate_and_store_tag(kmr.kmr_ptr, req_newsize);
1971 			kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize);
1972 		}
1973 #endif /* KASAN_TBI */
1974 		return kmr;
1975 	}
1976 #endif /* KASAN */
1977 
1978 	guard.kmg_tag = VME_ALIAS(oldentry);
1979 
1980 	if (newsize < oldsize) {
1981 		return kmem_realloc_shrink_guard(map, req_oldaddr,
1982 		           req_oldsize, req_newsize, flags, guard, oldentry);
1983 	}
1984 
1985 
1986 	/*
1987 	 *	We are growing the entry
1988 	 *
1989 	 *	For regular objects we use the object `vo_size` updates
1990 	 *	as a guarantee that no 2 kmem_realloc() can happen
1991 	 *	concurrently (by doing it before the map is unlocked).
1992 	 *
1993 	 *	For the kernel object, prevent the entry from being
1994 	 *	reallocated or changed by marking it "in_transition".
1995 	 */
1996 
1997 	object = VME_OBJECT(oldentry);
1998 	vm_object_lock(object);
1999 	vm_object_reference_locked(object);
2000 
2001 	newaddr = newentry->vme_start;
2002 	newoffs = oldsize;
2003 
2004 	VME_OBJECT_SET(newentry, object, guard.kmg_atomic, guard.kmg_context);
2005 	VME_ALIAS_SET(newentry, guard.kmg_tag);
2006 	if (flags & KMR_KOBJECT) {
2007 		oldentry->in_transition = true;
2008 		VME_OFFSET_SET(newentry, newaddr);
2009 		newentry->wired_count = 1;
2010 		vme_btref_consider_and_set(newentry, __builtin_frame_address(0));
2011 		newoffs = newaddr + oldsize;
2012 #if KASAN
2013 		newentry->vme_object_or_delta = delta +
2014 		    (-req_newsize & PAGE_MASK);
2015 #endif /* KASAN */
2016 	} else {
2017 		if (object->pager_created || object->pager) {
2018 			/*
2019 			 * We can't "realloc/grow" the pager, so pageable
2020 			 * allocations should not go through this path.
2021 			 */
2022 			__kmem_realloc_invalid_pager_panic(map,
2023 			    req_oldaddr, req_oldsize + delta, oldentry);
2024 		}
2025 		if (object->vo_size != oldsize) {
2026 			__kmem_realloc_invalid_object_size_panic(map,
2027 			    req_oldaddr, req_oldsize + delta, oldentry);
2028 		}
2029 		vm_object_set_size(object, newsize, req_newsize);
2030 	}
2031 
2032 	last_timestamp = map->timestamp;
2033 	vm_map_unlock(map);
2034 
2035 
2036 	/*
2037 	 *	Now proceed with the population of pages.
2038 	 *
2039 	 *	Kernel objects can use the kmem population helpers.
2040 	 *
2041 	 *	Regular objects will insert pages manually,
2042 	 *	then wire the memory into the new range.
2043 	 */
2044 
2045 	vm_size_t guard_right_size = __kmem_guard_right(ANYF(flags));
2046 
2047 	if (flags & KMR_KOBJECT) {
2048 		pmap_mapping_type_t mapping_type = __kmem_mapping_type(ANYF(flags));
2049 
2050 		pmap_protect(kernel_pmap,
2051 		    oldaddr, oldaddr + oldsize - guard_right_size,
2052 		    VM_PROT_NONE);
2053 
2054 		for (vm_object_offset_t offset = 0;
2055 		    offset < oldsize - guard_right_size;
2056 		    offset += PAGE_SIZE_64) {
2057 			vm_page_t mem;
2058 
2059 			mem = vm_page_lookup(object, oldaddr + offset);
2060 			if (mem == VM_PAGE_NULL) {
2061 				continue;
2062 			}
2063 
2064 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
2065 
2066 			mem->vmp_busy = true;
2067 			vm_page_remove(mem, true);
2068 			vm_page_insert_wired(mem, object, newaddr + offset,
2069 			    guard.kmg_tag);
2070 			mem->vmp_busy = false;
2071 
2072 			kernel_memory_populate_pmap_enter(object, newaddr,
2073 			    offset, mem, VM_PROT_DEFAULT, 0, mapping_type);
2074 		}
2075 
2076 		kernel_memory_populate_object_and_unlock(object,
2077 		    newaddr + oldsize - guard_right_size,
2078 		    newoffs - guard_right_size,
2079 		    newsize - oldsize,
2080 		    page_list, (kma_flags_t)flags,
2081 		    guard.kmg_tag, VM_PROT_DEFAULT, mapping_type);
2082 	} else {
2083 		vm_page_t guard_right = VM_PAGE_NULL;
2084 
2085 		/*
2086 		 *	Note: we are borrowing the new entry reference
2087 		 *	on the object for the duration of this code,
2088 		 *	which works because we keep the object locked
2089 		 *	throughout.
2090 		 */
2091 		if ((flags & KMR_GUARD_LAST) && !map->never_faults) {
2092 			guard_right = vm_page_lookup(object, oldsize - PAGE_SIZE);
2093 			assert(vm_page_is_guard(guard_right));
2094 			guard_right->vmp_busy = true;
2095 			vm_page_remove(guard_right, true);
2096 		}
2097 
2098 		if (flags & KMR_FREEOLD) {
2099 			/*
2100 			 * Freeing the old mapping will make
2101 			 * the old pages become pageable until
2102 			 * the new mapping makes them wired again.
2103 			 * Let's take an extra "wire_count" to
2104 			 * prevent any accidental "page out".
2105 			 * We'll have to undo that after wiring
2106 			 * the new mapping.
2107 			 */
2108 			vm_object_reference_locked(object); /* keep object alive */
2109 			for (vm_object_offset_t offset = 0;
2110 			    offset < oldsize - guard_right_size;
2111 			    offset += PAGE_SIZE_64) {
2112 				vm_page_t mem;
2113 
2114 				mem = vm_page_lookup(object, offset);
2115 				assert(mem != VM_PAGE_NULL);
2116 				assertf(!VM_PAGE_PAGEABLE(mem),
2117 				    "mem %p qstate %d",
2118 				    mem, mem->vmp_q_state);
2119 				if (vm_page_is_guard(mem)) {
2120 					/* guard pages are not wired */
2121 				} else {
2122 					assertf(VM_PAGE_WIRED(mem),
2123 					    "mem %p qstate %d wirecount %d",
2124 					    mem,
2125 					    mem->vmp_q_state,
2126 					    mem->vmp_wire_count);
2127 					assertf(mem->vmp_wire_count >= 1,
2128 					    "mem %p wirecount %d",
2129 					    mem, mem->vmp_wire_count);
2130 					mem->vmp_wire_count++;
2131 				}
2132 			}
2133 		}
2134 
2135 		for (vm_object_offset_t offset = oldsize - guard_right_size;
2136 		    offset < newsize - guard_right_size;
2137 		    offset += PAGE_SIZE_64) {
2138 			vm_page_t mem = page_list;
2139 
2140 			page_list = mem->vmp_snext;
2141 			mem->vmp_snext = VM_PAGE_NULL;
2142 			assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
2143 			assert(!VM_PAGE_PAGEABLE(mem));
2144 
2145 			vm_page_insert(mem, object, offset);
2146 			mem->vmp_busy = false;
2147 		}
2148 
2149 		if (guard_right) {
2150 			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
2151 			guard_right->vmp_busy = false;
2152 		}
2153 
2154 		vm_object_unlock(object);
2155 	}
2156 
2157 	/*
2158 	 *	Mark the entry as idle again,
2159 	 *	and honor KMR_FREEOLD if needed.
2160 	 */
2161 
2162 	vm_map_lock(map);
2163 	if (last_timestamp + 1 != map->timestamp &&
2164 	    !vm_map_lookup_entry(map, oldaddr, &oldentry)) {
2165 		__kmem_entry_not_found_panic(map, req_oldaddr);
2166 	}
2167 
2168 	if (flags & KMR_KOBJECT) {
2169 		assert(oldentry->in_transition);
2170 		oldentry->in_transition = false;
2171 		if (oldentry->needs_wakeup) {
2172 			needs_wakeup = true;
2173 			oldentry->needs_wakeup = false;
2174 		}
2175 	}
2176 
2177 	if (flags & KMR_FREEOLD) {
2178 		vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2179 
2180 #if KASAN_CLASSIC
2181 		if (flags & KMR_KASAN_GUARD) {
2182 			kasan_poison_range(oldaddr, oldsize, ASAN_VALID);
2183 		}
2184 #endif
2185 #if KASAN_TBI
2186 		if (flags & KMR_TAG) {
2187 			kasan_tbi_mark_free_space((caddr_t)req_oldaddr, oldsize);
2188 		}
2189 #endif /* KASAN_TBI */
2190 		if (flags & KMR_GUARD_LAST) {
2191 			vmr_flags |= VM_MAP_REMOVE_NOKUNWIRE_LAST;
2192 		}
2193 		(void)vm_map_remove_and_unlock(map,
2194 		    oldaddr, oldaddr + oldsize,
2195 		    vmr_flags, guard);
2196 	} else {
2197 		vm_map_unlock(map);
2198 	}
2199 
2200 	if ((flags & KMR_KOBJECT) == 0) {
2201 		kern_return_t kr;
2202 		/*
2203 		 * This must happen _after_ we do the KMR_FREEOLD,
2204 		 * because wiring the pages will call into the pmap,
2205 		 * and if the pages are typed XNU_KERNEL_RESTRICTED,
2206 		 * this would cause a second mapping of the page and panic.
2207 		 */
2208 		kr = vm_map_wire_kernel(map,
2209 		    vm_sanitize_wrap_addr(newaddr),
2210 		    vm_sanitize_wrap_addr(newaddr + newsize),
2211 		    vm_sanitize_wrap_prot(VM_PROT_DEFAULT),
2212 		    guard.kmg_tag, FALSE);
2213 		assert(kr == KERN_SUCCESS);
2214 
2215 		if (flags & KMR_FREEOLD) {
2216 			/*
2217 			 * Undo the extra "wiring" we made above
2218 			 * and release the extra reference we took
2219 			 * on the object.
2220 			 */
2221 			vm_object_lock(object);
2222 			for (vm_object_offset_t offset = 0;
2223 			    offset < oldsize - guard_right_size;
2224 			    offset += PAGE_SIZE_64) {
2225 				vm_page_t mem;
2226 
2227 				mem = vm_page_lookup(object, offset);
2228 				assert(mem != VM_PAGE_NULL);
2229 				assertf(!VM_PAGE_PAGEABLE(mem),
2230 				    "mem %p qstate %d",
2231 				    mem, mem->vmp_q_state);
2232 				if (vm_page_is_guard(mem)) {
2233 					/* guard pages are not wired */
2234 				} else {
2235 					assertf(VM_PAGE_WIRED(mem),
2236 					    "mem %p qstate %d wirecount %d",
2237 					    mem,
2238 					    mem->vmp_q_state,
2239 					    mem->vmp_wire_count);
2240 					assertf(mem->vmp_wire_count >= 2,
2241 					    "mem %p wirecount %d",
2242 					    mem, mem->vmp_wire_count);
2243 					mem->vmp_wire_count--;
2244 					assert(VM_PAGE_WIRED(mem));
2245 					assert(mem->vmp_wire_count >= 1);
2246 				}
2247 			}
2248 			vm_object_unlock(object);
2249 			vm_object_deallocate(object); /* release extra ref */
2250 		}
2251 	}
2252 
2253 	if (needs_wakeup) {
2254 		vm_map_entry_wakeup(map);
2255 	}
2256 
2257 #if DEBUG || DEVELOPMENT
2258 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, DBG_VM_KERN_REQUEST, DBG_FUNC_END,
2259 	    atop(newsize - oldsize), 0, 0, 0);
2260 #endif /* DEBUG || DEVELOPMENT */
2261 	kmr.kmr_address = newaddr;
2262 
2263 #if KASAN
2264 	kasan_notify_address(kmr.kmr_address, newsize);
2265 #endif /* KASAN */
2266 #if KASAN_CLASSIC
2267 	if (flags & KMR_KASAN_GUARD) {
2268 		kmr.kmr_address += PAGE_SIZE;
2269 		kasan_alloc_large(kmr.kmr_address, req_newsize);
2270 	}
2271 #endif /* KASAN_CLASSIC */
2272 #if CONFIG_KERNEL_TAGGING
2273 	if (flags & KMR_TAG) {
2274 #if   KASAN_TBI
2275 		/*
2276 		 * Validate the current buffer, then generate a new tag,
2277 		 * even if the address is stable, it's a "new" allocation.
2278 		 */
2279 		__asan_loadN((vm_offset_t)kmr.kmr_address, oldsize);
2280 		kmr.kmr_ptr = vm_memtag_generate_and_store_tag(kmr.kmr_ptr, req_newsize);
2281 		kasan_tbi_retag_unused_space(kmr.kmr_ptr, newsize, req_newsize);
2282 #endif /* KASAN_TBI */
2283 	}
2284 #endif /* CONFIG_KERNEL_TAGGING */
2285 
2286 	return kmr;
2287 }
2288 
2289 #pragma mark map/remap/wire
2290 
2291 kern_return_t
2292 mach_vm_map_kernel(
2293 	vm_map_t                target_map,
2294 	mach_vm_offset_ut      *address,
2295 	mach_vm_size_ut         initial_size,
2296 	mach_vm_offset_ut       mask,
2297 	vm_map_kernel_flags_t   vmk_flags,
2298 	ipc_port_t              port,
2299 	memory_object_offset_ut offset,
2300 	boolean_t               copy,
2301 	vm_prot_ut              cur_protection,
2302 	vm_prot_ut              max_protection,
2303 	vm_inherit_ut           inheritance)
2304 {
2305 	/* range_id is set by vm_map_enter_mem_object */
2306 	return vm_map_enter_mem_object(target_map,
2307 	           address,
2308 	           initial_size,
2309 	           mask,
2310 	           vmk_flags,
2311 	           port,
2312 	           offset,
2313 	           copy,
2314 	           cur_protection,
2315 	           max_protection,
2316 	           inheritance,
2317 	           NULL,
2318 	           0);
2319 }
2320 
2321 kern_return_t
2322 mach_vm_remap_new_kernel(
2323 	vm_map_t                target_map,
2324 	mach_vm_offset_ut      *address,
2325 	mach_vm_size_ut         size,
2326 	mach_vm_offset_ut       mask,
2327 	vm_map_kernel_flags_t   vmk_flags,
2328 	vm_map_t                src_map,
2329 	mach_vm_offset_ut       memory_address,
2330 	boolean_t               copy,
2331 	vm_prot_ut             *cur_protection,   /* IN/OUT */
2332 	vm_prot_ut             *max_protection,   /* IN/OUT */
2333 	vm_inherit_ut           inheritance)
2334 {
2335 	if (!vm_map_kernel_flags_check_vm_and_kflags(vmk_flags,
2336 	    VM_FLAGS_USER_REMAP)) {
2337 		return KERN_INVALID_ARGUMENT;
2338 	}
2339 
2340 
2341 	vmk_flags.vmf_return_data_addr = true;
2342 
2343 	/* range_id is set by vm_map_remap */
2344 	return vm_map_remap(target_map,
2345 	           address,
2346 	           size,
2347 	           mask,
2348 	           vmk_flags,
2349 	           src_map,
2350 	           memory_address,
2351 	           copy,
2352 	           cur_protection,
2353 	           max_protection,
2354 	           inheritance);
2355 }
2356 
2357 #pragma mark free
2358 
2359 #if KASAN
2360 
2361 __abortlike
2362 static void
2363 __kmem_free_invalid_object_size_panic(
2364 	vm_map_t                map,
2365 	vm_address_t            address,
2366 	vm_size_t               size,
2367 	vm_map_entry_t          entry)
2368 {
2369 	vm_object_t object  = VME_OBJECT(entry);
2370 	vm_size_t   objsize = __kmem_entry_orig_size(entry);
2371 
2372 	panic("kmem_free(map=%p, addr=%p, size=%zd, entry=%p): "
2373 	    "object %p has unexpected size %ld",
2374 	    map, (void *)address, (size_t)size, entry, object, objsize);
2375 }
2376 
2377 #endif /* KASAN */
2378 
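/*
 * kmem_free_guard(): removes and unwires the given allocation, validating
 * the entry's guard. KMF_GUESS_SIZE derives the size from the entry (exact
 * under KASAN); the KASAN/tagging variants undo the redzone and tag
 * adjustments made at allocation time. Returns the size actually freed.
 */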
2379 vm_size_t
2380 kmem_free_guard(
2381 	vm_map_t        map,
2382 	vm_offset_t     req_addr,
2383 	vm_size_t       req_size,
2384 	kmf_flags_t     flags,
2385 	kmem_guard_t    guard)
2386 {
2387 	vmr_flags_t     vmr_flags = VM_MAP_REMOVE_KUNWIRE;
2388 	vm_address_t    addr      = req_addr;
2389 	vm_offset_t     delta     = 0;
2390 	vm_size_t       size;
2391 #if KASAN
2392 	vm_map_entry_t  entry;
2393 #endif /* KASAN */
2394 
2395 	assert(map->pmap == kernel_pmap);
2396 
2397 #if KASAN_CLASSIC
2398 	if (flags & KMF_KASAN_GUARD) {
2399 		addr  -= PAGE_SIZE;
2400 		delta  = ptoa(2);
2401 	}
2402 #endif /* KASAN_CLASSIC */
2403 #if CONFIG_KERNEL_TAGGING
2404 	if (flags & KMF_TAG) {
2405 		vm_memtag_verify_tag(req_addr + __kmem_guard_left(ANYF(flags)));
2406 		addr = vm_memtag_canonicalize_kernel(req_addr);
2407 	}
2408 #endif /* CONFIG_KERNEL_TAGGING */
2409 
2410 	if (flags & KMF_GUESS_SIZE) {
2411 		vmr_flags |= VM_MAP_REMOVE_GUESS_SIZE;
2412 		size = PAGE_SIZE;
2413 	} else if (req_size == 0) {
2414 		__kmem_invalid_size_panic(map, req_size, flags);
2415 	} else {
2416 		size = round_page(req_size) + delta;
2417 	}
2418 
2419 	vm_map_lock(map);
2420 
2421 #if KASAN
2422 	if (!vm_map_lookup_entry(map, addr, &entry)) {
2423 		__kmem_entry_not_found_panic(map, req_addr);
2424 	}
2425 	if (flags & KMF_GUESS_SIZE) {
2426 		vmr_flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
2427 		req_size = __kmem_entry_orig_size(entry);
2428 		size = round_page(req_size + delta);
2429 	} else if (guard.kmg_atomic && entry->vme_kernel_object &&
2430 	    __kmem_entry_orig_size(entry) != req_size) {
2431 		/*
2432 		 * We can't make a strict check for regular
2433 		 * VM objects because it could be:
2434 		 *
2435 		 * - the kmem_guard_free() of a kmem_realloc_guard() without
2436 	 * - the kmem_free_guard() of a kmem_realloc_guard() without
2437 		 *
2438 		 * - a submap, in which case there is no "orig size".
2439 		 */
2440 		__kmem_free_invalid_object_size_panic(map,
2441 		    req_addr, req_size + delta, entry);
2442 	}
2443 #endif /* KASAN */
2444 #if KASAN_CLASSIC
2445 	if (flags & KMR_KASAN_GUARD) {
2446 		kasan_poison_range(addr, size, ASAN_VALID);
2447 	}
2448 #endif
2449 #if KASAN_TBI
2450 	if (flags & KMF_TAG) {
2451 		kasan_tbi_mark_free_space((caddr_t)req_addr, size);
2452 	}
2453 #endif /* KASAN_TBI */
2454 
2455 	/*
2456 	 * vm_map_remove_and_unlock is called with VM_MAP_REMOVE_KUNWIRE, which
2457 	 * unwires the kernel mapping. The page won't be mapped any longer so
2458 	 * there is no extra step that is required for memory tagging to "clear"
2459 	 * it -- the page will be later laundered when reused.
2460 	 */
2461 	return vm_map_remove_and_unlock(map, addr, addr + size,
2462 	           vmr_flags, guard).kmr_size - delta;
2463 }
2464 
2465 __exported void
2466 kmem_free_external(
2467 	vm_map_t        map,
2468 	vm_offset_t     addr,
2469 	vm_size_t       size);
2470 void
2471 kmem_free_external(
2472 	vm_map_t        map,
2473 	vm_offset_t     addr,
2474 	vm_size_t       size)
2475 {
2476 	if (size) {
2477 		kmem_free(map, trunc_page(addr), size);
2478 #if MACH_ASSERT
2479 	} else {
2480 		printf("kmem_free(map=%p, addr=%p) called with size=0, lr: %p\n",
2481 		    map, (void *)addr, __builtin_return_address(0));
2482 #endif
2483 	}
2484 }
2485 
2486 #pragma mark kmem metadata
2487 
2488 /*
2489  * Guard objects for kmem pointer allocation:
2490  *
2491  * Guard objects introduce size slabs to kmem pointer allocations that are
2492  * allocated in chunks of n * sizeclass. When an allocation of a specific
2493  * sizeclass is requested, a random slot from [0, n) is returned.
2494  * Allocations are returned from that chunk until m slots are left. The
2495  * remaining m slots are referred to as guard objects. They don't get
2496  * allocated and the chunk is now considered full. When an allocation is
2497  * freed back to the chunk, the free count reaches m + 1 and one slot becomes
2498  * available again for the next allocation of that sizeclass.
2499  *
2500  * Guard objects are intended to make exploitation of use after frees harder
2501  * as allocations that are freed can no longer be reliably reallocated.
2502  * They also make exploitation of OOBs harder as overflowing out of an
2503  * allocation can no longer be safe even with sufficient spraying.
2504  */
2505 
2506 #define KMEM_META_PRIMARY    UINT8_MAX
2507 #define KMEM_META_START     (UINT8_MAX - 1)
2508 #define KMEM_META_FREE      (UINT8_MAX - 2)
2509 #if __ARM_16K_PG__
2510 #define KMEM_MIN_SIZE        PAGE_SIZE
2511 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 16)
2512 #else /* __ARM_16K_PG__ */
2513 /*
2514  * PAGE_SIZE isn't a compile time constant on some arm64 devices. Those
2515  * devices use 4k page size when their RAM is <= 1GB and 16k otherwise.
2516  * Therefore populate sizeclasses from 4k for those devices.
2517  */
2518 #define KMEM_MIN_SIZE       (4 * 1024)
2519 #define KMEM_CHUNK_SIZE_MIN (KMEM_MIN_SIZE * 32)
2520 #endif /* __ARM_16K_PG__ */
2521 #define KMEM_MAX_SIZE       (32ULL << 20)
2522 #define KMEM_START_IDX      (kmem_log2down(KMEM_MIN_SIZE))
2523 #define KMEM_LAST_IDX       (kmem_log2down(KMEM_MAX_SIZE))
2524 #define KMEM_NUM_SIZECLASS  (KMEM_LAST_IDX - KMEM_START_IDX + 1)
2525 #define KMEM_FRONTS         (KMEM_RANGE_ID_NUM_PTR * 2)
2526 #define KMEM_NUM_GUARDS      2
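/*
 * For example, with the !__ARM_16K_PG__ values above (KMEM_MIN_SIZE = 4K,
 * KMEM_CHUNK_SIZE_MIN = 128K, KMEM_MAX_SIZE = 32M) there are
 * KMEM_NUM_SIZECLASS = 25 - 12 + 1 = 14 sizeclasses. kmem_sizeclass_init()
 * below then sizes chunks to span roughly 8 elements: the 4K sizeclass uses
 * 1 chunk of 128K holding 32 slots, the 8K sizeclass holds 16 slots, and
 * every sizeclass from 16K up holds 8 slots per chunk, KMEM_NUM_GUARDS = 2
 * of which are kept unallocated as guards.
 */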
2527 
2528 struct kmem_page_meta {
2529 	union {
2530 		/*
2531 		 * On primary allocated chunk with KMEM_META_PRIMARY marker
2532 		 */
2533 		uint32_t km_bitmap;
2534 		/*
2535 		 * On start and end of free chunk with KMEM_META_FREE marker
2536 		 */
2537 		uint32_t km_free_chunks;
2538 	};
2539 	/*
2540 	 * KMEM_META_PRIMARY: Start meta of allocated chunk
2541 	 * KMEM_META_FREE   : Start and end meta of free chunk
2542 	 * KMEM_META_START  : Meta region start and end
2543 	 */
2544 	uint8_t  km_page_marker;
2545 	uint8_t  km_sizeclass;
2546 	union {
2547 		/*
2548 		 * On primary allocated chunk with KMEM_META_PRIMARY marker
2549 		 */
2550 		uint16_t km_chunk_len;
2551 		/*
2552 		 * On secondary allocated chunks
2553 		 */
2554 		uint16_t km_page_idx;
2555 	};
2556 	LIST_ENTRY(kmem_page_meta) km_link;
2557 } kmem_page_meta_t;
2558 
2559 typedef LIST_HEAD(kmem_list_head, kmem_page_meta) kmem_list_head_t;
2560 struct kmem_sizeclass {
2561 	vm_map_size_t                   ks_size;
2562 	uint32_t                        ks_num_chunk;
2563 	uint32_t                        ks_num_elem;
2564 	crypto_random_ctx_t __zpercpu   ks_rng_ctx;
2565 	kmem_list_head_t                ks_allfree_head[KMEM_FRONTS];
2566 	kmem_list_head_t                ks_partial_head[KMEM_FRONTS];
2567 	kmem_list_head_t                ks_full_head[KMEM_FRONTS];
2568 };
2569 
2570 static struct kmem_sizeclass kmem_size_array[KMEM_NUM_SIZECLASS];
2571 
2572 /*
2573  * Locks to synchronize metadata population
2574  */
2575 static LCK_GRP_DECLARE(kmem_locks_grp, "kmem_locks");
2576 static LCK_MTX_DECLARE(kmem_meta_region_lck, &kmem_locks_grp);
2577 #define kmem_meta_lock()   lck_mtx_lock(&kmem_meta_region_lck)
2578 #define kmem_meta_unlock() lck_mtx_unlock(&kmem_meta_region_lck)
2579 
2580 static SECURITY_READ_ONLY_LATE(struct mach_vm_range)
2581 kmem_meta_range[KMEM_RANGE_ID_NUM_PTR + 1];
2582 static SECURITY_READ_ONLY_LATE(struct kmem_page_meta *)
2583 kmem_meta_base[KMEM_RANGE_ID_NUM_PTR + 1];
2584 /*
2585  * Keeps track of metadata high water mark for each front
2586  */
2587 static struct kmem_page_meta *kmem_meta_hwm[KMEM_FRONTS];
2588 static SECURITY_READ_ONLY_LATE(vm_map_t)
2589 kmem_meta_map[KMEM_RANGE_ID_NUM_PTR + 1];
2590 static vm_map_size_t kmem_meta_size;
2591 
2592 static uint32_t
2593 kmem_get_front(
2594 	kmem_range_id_t         range_id,
2595 	bool                    from_right)
2596 {
2597 	assert((range_id >= KMEM_RANGE_ID_FIRST) &&
2598 	    (range_id <= KMEM_RANGE_ID_NUM_PTR));
2599 	return (range_id - KMEM_RANGE_ID_FIRST) * 2 + from_right;
2600 }
2601 
2602 static inline uint32_t
2603 kmem_slot_idx_to_bit(
2604 	uint32_t                slot_idx,
2605 	uint32_t                size_idx __unused)
2606 {
2607 	assert(slot_idx < kmem_size_array[size_idx].ks_num_elem);
2608 	return 1ull << slot_idx;
2609 }
2610 
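/*
 * Map a size to the smallest sizeclass that fits it: sizes round up to the
 * next power of two, and exact powers of two map to themselves. E.g. with
 * KMEM_START_IDX = 12 (4K minimum), 4096 -> idx 0 (4K), 4097..8192 -> idx 1
 * (8K), 12000 -> idx 2 (16K). kmem_get_size_from_idx() is the inverse.
 */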
2611 static uint32_t
2612 kmem_get_idx_from_size(vm_map_size_t size)
2613 {
2614 	assert(size >= KMEM_MIN_SIZE && size <= KMEM_MAX_SIZE);
2615 	return kmem_log2down(size - 1) - KMEM_START_IDX + 1;
2616 }
2617 
2618 __abortlike
2619 static void
2620 kmem_invalid_size_idx(uint32_t idx)
2621 {
2622 	panic("Invalid sizeclass idx %u", idx);
2623 }
2624 
2625 static vm_map_size_t
2626 kmem_get_size_from_idx(uint32_t idx)
2627 {
2628 	if (__improbable(idx >= KMEM_NUM_SIZECLASS)) {
2629 		kmem_invalid_size_idx(idx);
2630 	}
2631 	return 1ul << (idx + KMEM_START_IDX);
2632 }
2633 
2634 static inline uint16_t
2635 kmem_get_page_idx(struct kmem_page_meta *meta)
2636 {
2637 	uint8_t page_marker = meta->km_page_marker;
2638 
2639 	return (page_marker == KMEM_META_PRIMARY) ? 0 : meta->km_page_idx;
2640 }
2641 
2642 __abortlike
2643 static void
2644 kmem_invalid_chunk_len(struct kmem_page_meta *meta)
2645 {
2646 	panic("Reading free chunks for meta %p where marker != KMEM_META_PRIMARY",
2647 	    meta);
2648 }
2649 
2650 static inline uint16_t
2651 kmem_get_chunk_len(struct kmem_page_meta *meta)
2652 {
2653 	if (__improbable(meta->km_page_marker != KMEM_META_PRIMARY)) {
2654 		kmem_invalid_chunk_len(meta);
2655 	}
2656 
2657 	return meta->km_chunk_len;
2658 }
2659 
2660 __abortlike
2661 static void
2662 kmem_invalid_free_chunk_len(struct kmem_page_meta *meta)
2663 {
2664 	panic("Reading free chunks for meta %p where marker != KMEM_META_FREE",
2665 	    meta);
2666 }
2667 
2668 static inline uint32_t
2669 kmem_get_free_chunk_len(struct kmem_page_meta *meta)
2670 {
2671 	if (__improbable(meta->km_page_marker != KMEM_META_FREE)) {
2672 		kmem_invalid_free_chunk_len(meta);
2673 	}
2674 
2675 	return meta->km_free_chunks;
2676 }
2677 
2678 /*
2679  * Return the metadata corresponding to the specified address
2680  */
2681 static struct kmem_page_meta *
2682 kmem_addr_to_meta(
2683 	vm_map_offset_t         addr,
2684 	vm_map_range_id_t       range_id,
2685 	vm_map_offset_t        *range_start,
2686 	uint64_t               *meta_idx)
2687 {
2688 	struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
2689 
2690 	*range_start = kmem_ranges[range_id].min_address;
2691 	*meta_idx = (addr - *range_start) / KMEM_CHUNK_SIZE_MIN;
2692 	return VM_FAR_ADD_PTR_UNBOUNDED(meta_base, *meta_idx);
2693 }
2694 
2695 /*
2696  * Return the metadata start of the chunk that the address belongs to
2697  */
2698 static struct kmem_page_meta *
2699 kmem_addr_to_meta_start(
2700 	vm_address_t            addr,
2701 	vm_map_range_id_t       range_id,
2702 	vm_map_offset_t        *chunk_start)
2703 {
2704 	vm_map_offset_t range_start;
2705 	uint64_t meta_idx;
2706 	struct kmem_page_meta *meta;
2707 
2708 	meta = kmem_addr_to_meta(addr, range_id, &range_start, &meta_idx);
2709 	meta_idx -= kmem_get_page_idx(meta);
2710 	meta = VM_FAR_ADD_PTR_UNBOUNDED(meta, -(ptrdiff_t)kmem_get_page_idx(meta));
2711 	assert(meta->km_page_marker == KMEM_META_PRIMARY);
2712 	*chunk_start = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN);
2713 	return meta;
2714 }
2715 
2716 __startup_func
2717 static void
2718 kmem_init_meta_front(
2719 	struct kmem_page_meta  *meta,
2720 	kmem_range_id_t         range_id,
2721 	bool                    from_right)
2722 {
2723 	kernel_memory_populate(trunc_page((vm_map_offset_t) meta), PAGE_SIZE,
2724 	    KMA_KOBJECT | KMA_ZERO | KMA_NOFAIL, VM_KERN_MEMORY_OSFMK);
2725 	meta->km_page_marker = KMEM_META_START;
2726 	if (!from_right) {
2727 		meta++;
2728 		kmem_meta_base[range_id] = meta;
2729 	}
2730 	kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta;
2731 }
2732 
2733 __startup_func
2734 static void
2735 kmem_metadata_init(void)
2736 {
2737 	for (kmem_range_id_t i = KMEM_RANGE_ID_FIRST; i <= kmem_ptr_ranges; i++) {
2738 		vm_map_offset_t addr = kmem_meta_range[i].min_address;
2739 		struct kmem_page_meta *meta;
2740 		uint64_t meta_idx;
2741 
2742 		vm_map_will_allocate_early_map(&kmem_meta_map[i]);
2743 		kmem_meta_map[i] = kmem_suballoc(kernel_map, &addr, kmem_meta_size,
2744 		    VM_MAP_CREATE_NEVER_FAULTS | VM_MAP_CREATE_DISABLE_HOLELIST,
2745 		    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
2746 		    KMS_PERMANENT | KMS_NOFAIL | KMS_NOSOFTLIMIT,
2747 		    VM_KERN_MEMORY_OSFMK).kmr_submap;
2748 
2749 		kmem_meta_range[i].min_address = addr;
2750 		kmem_meta_range[i].max_address = addr + kmem_meta_size;
2751 
2752 		meta = (struct kmem_page_meta *) kmem_meta_range[i].min_address;
2753 		kmem_init_meta_front(meta, i, 0);
2754 
2755 		meta = kmem_addr_to_meta(kmem_ranges[i].max_address, i, &addr,
2756 		    &meta_idx);
2757 		kmem_init_meta_front(meta, i, 1);
2758 	}
2759 }
2760 
2761 __startup_func
2762 static void
2763 kmem_init_front_head(
2764 	struct kmem_sizeclass  *ks,
2765 	uint32_t                front)
2766 {
2767 	LIST_INIT(&ks->ks_allfree_head[front]);
2768 	LIST_INIT(&ks->ks_partial_head[front]);
2769 	LIST_INIT(&ks->ks_full_head[front]);
2770 }
2771 
2772 __startup_func
2773 static void
2774 kmem_sizeclass_init(void)
2775 {
2776 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2777 		struct kmem_sizeclass *ks = &kmem_size_array[i];
2778 		kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST;
2779 
2780 		ks->ks_size = kmem_get_size_from_idx(i);
2781 		ks->ks_num_chunk = roundup(8 * ks->ks_size, KMEM_CHUNK_SIZE_MIN) /
2782 		    KMEM_CHUNK_SIZE_MIN;
2783 		ks->ks_num_elem = (ks->ks_num_chunk * KMEM_CHUNK_SIZE_MIN) / ks->ks_size;
2784 		assert(ks->ks_num_elem <=
2785 		    (sizeof(((struct kmem_page_meta *)0)->km_bitmap) * 8));
2786 		for (; range_id <= KMEM_RANGE_ID_NUM_PTR; range_id++) {
2787 			kmem_init_front_head(ks, kmem_get_front(range_id, 0));
2788 			kmem_init_front_head(ks, kmem_get_front(range_id, 1));
2789 		}
2790 	}
2791 }
2792 
2793 /*
2794  * This is done during EARLY_BOOT as it needs the corecrypto module to be
2795  * set up.
2796  */
2797 __startup_func
2798 static void
2799 kmem_crypto_init(void)
2800 {
2801 	vm_size_t ctx_size = crypto_random_kmem_ctx_size();
2802 
2803 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
2804 		struct kmem_sizeclass *ks = &kmem_size_array[i];
2805 
2806 		ks->ks_rng_ctx = zalloc_percpu_permanent(ctx_size, ZALIGN_PTR);
2807 		zpercpu_foreach(ctx, ks->ks_rng_ctx) {
2808 			crypto_random_kmem_init(ctx);
2809 		}
2810 	}
2811 }
2812 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, kmem_crypto_init);
2813 
2814 __abortlike
2815 static void
2816 kmem_validate_slot_panic(
2817 	vm_map_offset_t         addr,
2818 	struct kmem_page_meta  *meta,
2819 	uint32_t                slot_idx,
2820 	uint32_t                size_idx)
2821 {
2822 	if (meta->km_page_marker != KMEM_META_PRIMARY) {
2823 		panic("Metadata (%p) for addr (%p) not primary", meta, (void *)addr);
2824 	}
2825 	if (meta->km_sizeclass != size_idx) {
2826 		panic("Metadata's (%p) sizeclass (%u != %u) changed during deletion",
2827 		    meta, meta->km_sizeclass, size_idx);
2828 	}
2829 	panic("Double free detected: Slot (%u) in meta (%p) for addr %p marked free",
2830 	    slot_idx, meta, (void *)addr);
2831 }
2832 
2833 __abortlike
2834 static void
2835 kmem_invalid_slot_for_addr(
2836 	mach_vm_range_t         slot,
2837 	vm_map_offset_t         start,
2838 	vm_map_offset_t         end)
2839 {
2840 	panic("Invalid kmem ptr slot [%p:%p] for allocation [%p:%p]",
2841 	    (void *)slot->min_address, (void *)slot->max_address,
2842 	    (void *)start, (void *)end);
2843 }
2844 
2845 void
2846 kmem_validate_slot(
2847 	vm_map_offset_t         addr,
2848 	struct kmem_page_meta  *meta,
2849 	uint32_t                size_idx,
2850 	uint32_t                slot_idx)
2851 {
2852 	if ((meta->km_page_marker != KMEM_META_PRIMARY) ||
2853 	    (meta->km_sizeclass != size_idx) ||
2854 	    ((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) != 0)) {
2855 		kmem_validate_slot_panic(addr, meta, size_idx, slot_idx);
2856 	}
2857 }
2858 
2859 static void
2860 kmem_validate_slot_initial(
2861 	mach_vm_range_t         slot,
2862 	vm_map_offset_t         start,
2863 	vm_map_offset_t         end,
2864 	struct kmem_page_meta  *meta,
2865 	uint32_t                size_idx,
2866 	uint32_t                slot_idx)
2867 {
2868 	if ((slot->min_address == 0) || (slot->max_address == 0) ||
2869 	    (start < slot->min_address) || (start >= slot->max_address) ||
2870 	    (end > slot->max_address)) {
2871 		kmem_invalid_slot_for_addr(slot, start, end);
2872 	}
2873 
2874 	kmem_validate_slot(start, meta, size_idx, slot_idx);
2875 }
2876 
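/*
 * Map an allocation [start, end) back to its chunk metadata and slot:
 * returns the slot index, the primary metadata, the sizeclass index and the
 * slot's address range, panicking if the address does not fall within a
 * live slot of that chunk.
 */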
2877 uint32_t
2878 kmem_addr_get_slot_idx(
2879 	vm_map_offset_t         start,
2880 	vm_map_offset_t         end,
2881 	vm_map_range_id_t       range_id,
2882 	struct kmem_page_meta **meta,
2883 	uint32_t               *size_idx,
2884 	mach_vm_range_t         slot)
2885 {
2886 	vm_map_offset_t chunk_start;
2887 	vm_map_size_t slot_size;
2888 	uint32_t slot_idx;
2889 
2890 	*meta = kmem_addr_to_meta_start(start, range_id, &chunk_start);
2891 	*size_idx = (*meta)->km_sizeclass;
2892 	slot_size = kmem_get_size_from_idx(*size_idx);
2893 	slot_idx = (start - chunk_start) / slot_size;
2894 	slot->min_address = chunk_start + slot_idx * slot_size;
2895 	slot->max_address = slot->min_address + slot_size;
2896 
2897 	kmem_validate_slot_initial(slot, start, end, *meta, *size_idx, slot_idx);
2898 
2899 	return slot_idx;
2900 }
2901 
2902 static bool
2903 kmem_populate_needed(vm_offset_t from, vm_offset_t to)
2904 {
2905 #if KASAN
2906 #pragma unused(from, to)
2907 	return true;
2908 #else
2909 	vm_offset_t page_addr = trunc_page(from);
2910 
2911 	for (; page_addr < to; page_addr += PAGE_SIZE) {
2912 		/*
2913 		 * This can race with another thread doing a populate on the same metadata
2914 		 * page, where we see an updated pmap but unmapped KASan shadow, causing a
2915 		 * fault in the shadow when we first access the metadata page. Avoid this
2916 		 * by always synchronizing on the kmem_meta_lock with KASan.
2917 		 */
2918 		if (!pmap_find_phys(kernel_pmap, page_addr)) {
2919 			return true;
2920 		}
2921 	}
2922 
2923 	return false;
2924 #endif /* !KASAN */
2925 }
2926 
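/*
 * Populate the metadata pages backing [from, to). Called with the kernel
 * map locked: the map lock is dropped while pages are populated under
 * kmem_meta_lock (retrying with VM_PAGE_WAIT() when KMA_NOPAGEWAIT fails),
 * and re-taken before returning.
 */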
2927 static void
2928 kmem_populate_meta_locked(vm_offset_t from, vm_offset_t to)
2929 {
2930 	vm_offset_t page_addr = trunc_page(from);
2931 
2932 	vm_map_unlock(kernel_map);
2933 
2934 	for (; page_addr < to; page_addr += PAGE_SIZE) {
2935 		for (;;) {
2936 			kern_return_t ret = KERN_SUCCESS;
2937 
2938 			/*
2939 			 * All updates to kmem metadata are done under the kmem_meta_lock
2940 			 */
2941 			kmem_meta_lock();
2942 			if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
2943 				ret = kernel_memory_populate(page_addr,
2944 				    PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
2945 				    VM_KERN_MEMORY_OSFMK);
2946 			}
2947 			kmem_meta_unlock();
2948 
2949 			if (ret == KERN_SUCCESS) {
2950 				break;
2951 			}
2952 
2953 			/*
2954 			 * We can't pass KMA_NOPAGEWAIT under a global lock as it leads
2955 			 * to bad system deadlocks, so if the allocation failed,
2956 			 * we need to do the VM_PAGE_WAIT() outside of the lock.
2957 			 */
2958 			VM_PAGE_WAIT();
2959 		}
2960 	}
2961 
2962 	vm_map_lock(kernel_map);
2963 }
2964 
2965 __abortlike
2966 static void
2967 kmem_invalid_meta_panic(
2968 	struct kmem_page_meta  *meta,
2969 	uint32_t                slot_idx,
2970 	struct kmem_sizeclass   sizeclass)
2971 {
2972 	uint32_t size_idx = kmem_get_idx_from_size(sizeclass.ks_size);
2973 
2974 	if (slot_idx >= sizeclass.ks_num_elem) {
2975 		panic("Invalid slot idx %u [0:%u] for meta %p", slot_idx,
2976 		    sizeclass.ks_num_elem, meta);
2977 	}
2978 	if (meta->km_sizeclass != size_idx) {
2979 		panic("Invalid size_idx (%u != %u) in meta %p", size_idx,
2980 		    meta->km_sizeclass, meta);
2981 	}
2982 	panic("page_marker %u not primary in meta %p", meta->km_page_marker, meta);
2983 }
2984 
2985 __abortlike
2986 static void
2987 kmem_slot_has_entry_panic(
2988 	vm_map_entry_t          entry,
2989 	vm_map_offset_t         addr)
2990 {
2991 	panic("Entry (%p) already exists for addr (%p) being returned",
2992 	    entry, (void *)addr);
2993 }
2994 
2995 __abortlike
2996 static void
2997 kmem_slot_not_found(
2998 	struct kmem_page_meta  *meta,
2999 	uint32_t                slot_idx)
3000 {
3001 	panic("%uth free slot not found for meta %p bitmap %u", slot_idx, meta,
3002 	    meta->km_bitmap);
3003 }
3004 
3005 /*
3006  * Returns a 16bit random number between 0 and
3007  * upper_limit (inclusive)
3008  */
3009 __startup_func
3010 uint16_t
3011 kmem_get_random16(
3012 	uint16_t                upper_limit)
3013 {
3014 	static uint64_t random_entropy;
3015 	assert(upper_limit < UINT16_MAX);
3016 	if (random_entropy == 0) {
3017 		random_entropy = early_random();
3018 	}
3019 	uint32_t result = random_entropy & UINT32_MAX;
3020 	random_entropy >>= 32;
3021 	return (uint16_t)(result % (upper_limit + 1));
3022 }
3023 
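/*
 * Return the bit position of the n-th (0-based) set bit in the slot bitmap,
 * i.e. the n-th free slot. The loop walks alternating runs of clear
 * (allocated) and set (free) bits with __builtin_ctz(); once the current
 * run of free slots covers n, the answer is the clear bits skipped so far
 * plus n.
 */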
3024 static uint32_t
3025 kmem_get_nth_free_slot(
3026 	struct kmem_page_meta  *meta,
3027 	uint32_t                n,
3028 	uint32_t                bitmap)
3029 {
3030 	uint32_t zeros_seen = 0, ones_seen = 0;
3031 
3032 	while (bitmap) {
3033 		uint32_t count = __builtin_ctz(bitmap);
3034 
3035 		zeros_seen += count;
3036 		bitmap >>= count;
3037 		if (__probable(~bitmap)) {
3038 			count = __builtin_ctz(~bitmap);
3039 		} else {
3040 			count = 32;
3041 		}
3042 		if (count + ones_seen > n) {
3043 			return zeros_seen + n;
3044 		}
3045 		ones_seen += count;
3046 		bitmap >>= count;
3047 	}
3048 
3049 	kmem_slot_not_found(meta, n);
3050 }
3051 
3052 
3053 static uint32_t
3054 kmem_get_next_slot(
3055 	struct kmem_page_meta  *meta,
3056 	struct kmem_sizeclass   sizeclass,
3057 	uint32_t                bitmap)
3058 {
3059 	uint32_t num_slots = __builtin_popcount(bitmap);
3060 	uint64_t slot_idx = 0;
3061 
3062 	assert(num_slots > 0);
3063 	if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
3064 		/*
3065 		 * Use early random prior to early boot as the ks_rng_ctx requires
3066 		 * the corecrypto module to be setup before it is initialized and
3067 		 * used.
3068 		 *
3069 		 * num_slots can't be 0 as we take this path when we have more than
3070 		 * one slot left.
3071 		 */
3072 		slot_idx = kmem_get_random16((uint16_t)num_slots - 1);
3073 	} else {
3074 		crypto_random_uniform(zpercpu_get(sizeclass.ks_rng_ctx), num_slots,
3075 		    &slot_idx);
3076 	}
3077 
3078 	return kmem_get_nth_free_slot(meta, slot_idx, bitmap);
3079 }
3080 
3081 /*
3082  * Returns an unallocated slot from the given metadata
3083  */
3084 static vm_map_offset_t
3085 kmem_get_addr_from_meta(
3086 	struct kmem_page_meta  *meta,
3087 	vm_map_range_id_t       range_id,
3088 	struct kmem_sizeclass   sizeclass,
3089 	vm_map_entry_t         *entry)
3090 {
3091 	vm_map_offset_t addr;
3092 	vm_map_size_t size = sizeclass.ks_size;
3093 	uint32_t size_idx = kmem_get_idx_from_size(size);
3094 	uint64_t meta_idx = meta - kmem_meta_base[range_id];
3095 	mach_vm_offset_t range_start = kmem_ranges[range_id].min_address;
3096 	uint32_t slot_bit;
3097 	uint32_t slot_idx = kmem_get_next_slot(meta, sizeclass, meta->km_bitmap);
3098 
3099 	if ((slot_idx >= sizeclass.ks_num_elem) ||
3100 	    (meta->km_sizeclass != size_idx) ||
3101 	    (meta->km_page_marker != KMEM_META_PRIMARY)) {
3102 		kmem_invalid_meta_panic(meta, slot_idx, sizeclass);
3103 	}
3104 
3105 	slot_bit = kmem_slot_idx_to_bit(slot_idx, size_idx);
3106 	meta->km_bitmap &= ~slot_bit;
3107 
3108 	addr = range_start + (meta_idx * KMEM_CHUNK_SIZE_MIN) + (slot_idx * size);
3109 	assert(kmem_range_contains_fully(range_id, addr, size));
3110 	if (vm_map_lookup_entry(kernel_map, addr, entry)) {
3111 		kmem_slot_has_entry_panic(*entry, addr);
3112 	}
3113 	if ((*entry != vm_map_to_entry(kernel_map)) &&
3114 	    ((*entry)->vme_next != vm_map_to_entry(kernel_map)) &&
3115 	    ((*entry)->vme_next->vme_start < (addr + size))) {
3116 		kmem_slot_has_entry_panic(*entry, addr);
3117 	}
3118 	return addr;
3119 }
3120 
3121 __abortlike
3122 static void
3123 kmem_range_out_of_va(
3124 	kmem_range_id_t         range_id,
3125 	uint32_t                num_chunks)
3126 {
3127 	panic("No more VA to allocate %u chunks in range %u", num_chunks, range_id);
3128 }
3129 
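/*
 * Initialize the metadata of a freshly allocated chunk: the first meta
 * becomes the primary (full free bitmap, chunk length, KMEM_META_PRIMARY),
 * and the remaining metas of the chunk record their page index back to it.
 */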
3130 static void
3131 kmem_init_allocated_chunk(
3132 	struct kmem_page_meta  *meta,
3133 	struct kmem_sizeclass   sizeclass,
3134 	uint32_t                size_idx)
3135 {
3136 	uint32_t meta_num = sizeclass.ks_num_chunk;
3137 	uint32_t num_elem = sizeclass.ks_num_elem;
3138 
3139 	meta->km_bitmap = (1ull << num_elem) - 1;
3140 	meta->km_chunk_len = (uint16_t)meta_num;
3141 	assert(LIST_NEXT(meta, km_link) == NULL);
3142 	assert(meta->km_link.le_prev == NULL);
3143 	meta->km_sizeclass = (uint8_t)size_idx;
3144 	meta->km_page_marker = KMEM_META_PRIMARY;
3145 	meta++;
3146 	for (uint32_t i = 1; i < meta_num; i++) {
3147 		meta->km_page_idx = (uint16_t)i;
3148 		meta->km_sizeclass = (uint8_t)size_idx;
3149 		meta->km_page_marker = 0;
3150 		meta->km_bitmap = 0;
3151 		meta++;
3152 	}
3153 }
3154 
3155 static uint32_t
3156 kmem_get_additional_meta(
3157 	struct kmem_page_meta  *meta,
3158 	uint32_t                meta_req,
3159 	bool                    from_right,
3160 	struct kmem_page_meta **adj_free_meta)
3161 {
3162 	struct kmem_page_meta *meta_prev = from_right ? meta : (meta - 1);
3163 
3164 	if (meta_prev->km_page_marker == KMEM_META_FREE) {
3165 		uint32_t chunk_len = kmem_get_free_chunk_len(meta_prev);
3166 
3167 		*adj_free_meta = from_right ? meta_prev : (meta_prev - chunk_len + 1);
3168 		meta_req -= chunk_len;
3169 	} else {
3170 		*adj_free_meta = NULL;
3171 	}
3172 
3173 	return meta_req;
3174 }
3175 
3176 
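/*
 * Carve a new chunk for the given sizeclass out of a range's VA high water
 * mark, coalescing with an adjacent free chunk when possible. The map lock
 * may be dropped to populate metadata pages; if the high water marks or the
 * adjacent free chunk changed in the meantime, NULL is returned so the
 * caller re-evaluates the free lists.
 */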
3177 static struct kmem_page_meta *
3178 kmem_get_new_chunk(
3179 	vm_map_range_id_t       range_id,
3180 	bool                    from_right,
3181 	uint32_t                size_idx)
3182 {
3183 	struct kmem_sizeclass sizeclass = kmem_size_array[size_idx];
3184 	struct kmem_page_meta *start, *end, *meta_update;
3185 	struct kmem_page_meta *adj_free_meta = NULL;
3186 	uint32_t meta_req = sizeclass.ks_num_chunk;
3187 
3188 	for (;;) {
3189 		struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3190 		struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3191 		struct kmem_page_meta *meta;
3192 		vm_offset_t start_addr, end_addr;
3193 		uint32_t meta_num;
3194 
3195 		meta = from_right ? metab : metaf;
3196 		meta_num = kmem_get_additional_meta(meta, meta_req, from_right,
3197 		    &adj_free_meta);
3198 
3199 		if (metaf + meta_num >= metab) {
3200 			kmem_range_out_of_va(range_id, meta_num);
3201 		}
3202 
3203 		start = from_right ? (metab - meta_num) : metaf;
3204 		end = from_right ? metab : (metaf + meta_num);
3205 
3206 		start_addr = (vm_offset_t)start;
3207 		end_addr   = (vm_offset_t)end;
3208 
3209 		/*
3210 		 * If the new high watermark stays on the same page,
3211 		 * no need to populate and drop the lock.
3212 		 */
3213 		if (!page_aligned(from_right ? end_addr : start_addr) &&
3214 		    trunc_page(start_addr) == trunc_page(end_addr - 1)) {
3215 			break;
3216 		}
3217 		if (!kmem_populate_needed(start_addr, end_addr)) {
3218 			break;
3219 		}
3220 
3221 		kmem_populate_meta_locked(start_addr, end_addr);
3222 
3223 		/*
3224 		 * Since we dropped the lock, reassess conditions still hold:
3225 		 * - the HWM we are changing must not have moved
3226 		 * - the other HWM must not intersect with ours
3227 		 * - in case of coalescing, the adjacent free meta must still
3228 		 *   be free and of the same size.
3229 		 *
3230 		 * If we failed to grow, reevaluate whether freelists have
3231 		 * entries now by returning NULL.
3232 		 */
3233 		metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3234 		metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3235 		if (meta != (from_right ? metab : metaf)) {
3236 			return NULL;
3237 		}
3238 		if (metaf + meta_num >= metab) {
3239 			kmem_range_out_of_va(range_id, meta_num);
3240 		}
3241 		if (adj_free_meta) {
3242 			if (adj_free_meta->km_page_marker != KMEM_META_FREE ||
3243 			    kmem_get_free_chunk_len(adj_free_meta) !=
3244 			    meta_req - meta_num) {
3245 				return NULL;
3246 			}
3247 		}
3248 
3249 		break;
3250 	}
3251 
3252 	/*
3253 	 * If there is an adjacent free chunk remove it from free list
3254 	 */
3255 	if (adj_free_meta) {
3256 		LIST_REMOVE(adj_free_meta, km_link);
3257 		LIST_NEXT(adj_free_meta, km_link) = NULL;
3258 		adj_free_meta->km_link.le_prev = NULL;
3259 	}
3260 
3261 	/*
3262 	 * Update hwm
3263 	 */
3264 	meta_update = from_right ? start : end;
3265 	kmem_meta_hwm[kmem_get_front(range_id, from_right)] = meta_update;
3266 
3267 	/*
3268 	 * Initialize metadata
3269 	 */
3270 	start = from_right ? start : (end - meta_req);
3271 	kmem_init_allocated_chunk(start, sizeclass, size_idx);
3272 
3273 	return start;
3274 }
3275 
3276 static void
3277 kmem_requeue_meta(
3278 	struct kmem_page_meta  *meta,
3279 	struct kmem_list_head  *head)
3280 {
3281 	LIST_REMOVE(meta, km_link);
3282 	LIST_INSERT_HEAD(head, meta, km_link);
3283 }
3284 
3285 /*
3286  * Return corresponding sizeclass to stash free chunks in
3287  */
3288 __abortlike
3289 static void
3290 kmem_invalid_chunk_num(uint32_t chunks)
3291 {
3292 	panic("Invalid number of chunks %u\n", chunks);
3293 }
3294 
3295 static uint32_t
3296 kmem_get_size_idx_for_chunks(uint32_t chunks)
3297 {
3298 	for (uint32_t i = KMEM_NUM_SIZECLASS - 1; i > 0; i--) {
3299 		if (chunks >= kmem_size_array[i].ks_num_chunk) {
3300 			return i;
3301 		}
3302 	}
3303 	kmem_invalid_chunk_num(chunks);
3304 }
3305 
3306 static void
3307 kmem_clear_meta_range(struct kmem_page_meta *meta, uint32_t count)
3308 {
3309 	bzero(meta, count * sizeof(struct kmem_page_meta));
3310 }
3311 
3312 static void
3313 kmem_check_meta_range_is_clear(struct kmem_page_meta *meta, uint32_t count)
3314 {
3315 #if MACH_ASSERT
3316 	size_t size = count * sizeof(struct kmem_page_meta);
3317 
3318 	assert(memcmp_zero_ptr_aligned(meta, size) == 0);
3319 #else
3320 #pragma unused(meta, count)
3321 #endif
3322 }
3323 
3324 /*!
3325  * @function kmem_init_free_chunk()
3326  *
3327  * @discussion
3328  * This function prepares a range of chunks to be put on a free list.
3329  * The first and last metadata might be dirty, but the "inner" ones
3330  * must be zero filled by the caller prior to calling this function.
3331  */
3332 static void
3333 kmem_init_free_chunk(
3334 	struct kmem_page_meta  *meta,
3335 	uint32_t                num_chunks,
3336 	uint32_t                front)
3337 {
3338 	struct kmem_sizeclass *sizeclass;
3339 	uint32_t size_idx = kmem_get_size_idx_for_chunks(num_chunks);
3340 
3341 	if (num_chunks > 2) {
3342 		kmem_check_meta_range_is_clear(meta + 1, num_chunks - 2);
3343 	}
3344 
3345 	meta[0] = (struct kmem_page_meta){
3346 		.km_free_chunks = num_chunks,
3347 		.km_page_marker = KMEM_META_FREE,
3348 		.km_sizeclass   = (uint8_t)size_idx,
3349 	};
3350 	if (num_chunks > 1) {
3351 		meta[num_chunks - 1] = (struct kmem_page_meta){
3352 			.km_free_chunks = num_chunks,
3353 			.km_page_marker = KMEM_META_FREE,
3354 			.km_sizeclass   = (uint8_t)size_idx,
3355 		};
3356 	}
3357 
3358 	sizeclass = &kmem_size_array[size_idx];
3359 	LIST_INSERT_HEAD(&sizeclass->ks_allfree_head[front], meta, km_link);
3360 }
3361 
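/*
 * Pop a fully free chunk large enough for the requested sizeclass,
 * searching this sizeclass's free list first and then larger ones; any
 * excess chunks are split off and returned to the appropriate free list
 * before the chunk is initialized for the requested sizeclass. Returns
 * NULL if no free chunk is available.
 */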
3362 static struct kmem_page_meta *
3363 kmem_get_free_chunk_from_list(
3364 	struct kmem_sizeclass  *org_sizeclass,
3365 	uint32_t                size_idx,
3366 	uint32_t                front)
3367 {
3368 	struct kmem_sizeclass *sizeclass;
3369 	uint32_t num_chunks = org_sizeclass->ks_num_chunk;
3370 	struct kmem_page_meta *meta;
3371 	uint32_t idx = size_idx;
3372 
3373 	while (idx < KMEM_NUM_SIZECLASS) {
3374 		sizeclass = &kmem_size_array[idx];
3375 		meta = LIST_FIRST(&sizeclass->ks_allfree_head[front]);
3376 		if (meta) {
3377 			break;
3378 		}
3379 		idx++;
3380 	}
3381 
3382 	/*
3383 	 * Trim the run if it is larger than needed
3384 	 */
3385 	if (meta) {
3386 		uint32_t num_chunks_free = kmem_get_free_chunk_len(meta);
3387 
3388 		assert(meta->km_page_marker == KMEM_META_FREE);
3389 		LIST_REMOVE(meta, km_link);
3390 		LIST_NEXT(meta, km_link) = NULL;
3391 		meta->km_link.le_prev = NULL;
3392 		if (num_chunks_free > num_chunks) {
3393 			num_chunks_free -= num_chunks;
3394 			kmem_init_free_chunk(meta + num_chunks, num_chunks_free, front);
3395 		}
3396 
3397 		kmem_init_allocated_chunk(meta, *org_sizeclass, size_idx);
3398 	}
3399 
3400 	return meta;
3401 }
3402 
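/*
 * Find virtual address space for an allocation of the given size in the
 * given range: reuse a partially filled chunk when possible, otherwise carve
 * a chunk out of the free lists, and as a last resort grow the metadata
 * high-water mark via kmem_get_new_chunk().
 */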
3403 kern_return_t
3404 kmem_locate_space(
3405 	vm_map_size_t           size,
3406 	vm_map_range_id_t       range_id,
3407 	bool                    from_right,
3408 	vm_map_offset_t        *start_inout,
3409 	vm_map_entry_t         *entry_out)
3410 {
3411 	vm_map_entry_t entry;
3412 	uint32_t size_idx = kmem_get_idx_from_size(size);
3413 	uint32_t front = kmem_get_front(range_id, from_right);
3414 	struct kmem_sizeclass *sizeclass = &kmem_size_array[size_idx];
3415 	struct kmem_page_meta *meta;
3416 
3417 	assert(size <= sizeclass->ks_size);
3418 again:
3419 	if ((meta = LIST_FIRST(&sizeclass->ks_partial_head[front])) != NULL) {
3420 		*start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3421 		/*
3422 		 * Requeue to full if necessary
3423 		 */
3424 		assert(meta->km_page_marker == KMEM_META_PRIMARY);
3425 		if (__builtin_popcount(meta->km_bitmap) == KMEM_NUM_GUARDS) {
3426 			kmem_requeue_meta(meta, &sizeclass->ks_full_head[front]);
3427 		}
3428 	} else if ((meta = kmem_get_free_chunk_from_list(sizeclass, size_idx,
3429 	    front)) != NULL) {
3430 		*start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3431 		/*
3432 		 * Queue to partial
3433 		 */
3434 		assert(meta->km_page_marker == KMEM_META_PRIMARY);
3435 		assert(__builtin_popcount(meta->km_bitmap) > KMEM_NUM_GUARDS);
3436 		LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link);
3437 	} else {
3438 		meta = kmem_get_new_chunk(range_id, from_right, size_idx);
3439 		if (meta == NULL) {
3440 			goto again;
3441 		}
3442 		*start_inout = kmem_get_addr_from_meta(meta, range_id, *sizeclass, &entry);
3443 		assert(meta->km_page_marker == KMEM_META_PRIMARY);
3444 		LIST_INSERT_HEAD(&sizeclass->ks_partial_head[front], meta, km_link);
3445 	}
3446 
3447 	if (entry_out) {
3448 		*entry_out = entry;
3449 	}
3450 
3451 	return KERN_SUCCESS;
3452 }
3453 
3454 /*
3455  * Determine whether the given metadata was allocated from the right
3456  */
3457 static bool
3458 kmem_meta_is_from_right(
3459 	kmem_range_id_t         range_id,
3460 	struct kmem_page_meta  *meta)
3461 {
3462 	struct kmem_page_meta *metaf = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3463 	__assert_only struct kmem_page_meta *metab = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3464 	struct kmem_page_meta *meta_base = kmem_meta_base[range_id];
3465 	struct kmem_page_meta *meta_end;
3466 
3467 	meta_end = (struct kmem_page_meta *)kmem_meta_range[range_id].max_address;
3468 
3469 	if ((meta >= meta_base) && (meta < metaf)) {
3470 		return false;
3471 	}
3472 
3473 	assert(meta >= metab && meta < meta_end);
3474 	return true;
3475 }
3476 
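/*
 * Return a fully free chunk run to the free lists, coalescing it with any
 * adjacent free runs on the same front (bounded by the metadata high-water
 * marks) before re-initializing the merged run.
 */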
3477 static void
3478 kmem_free_chunk(
3479 	kmem_range_id_t         range_id,
3480 	struct kmem_page_meta  *meta,
3481 	bool                    from_right)
3482 {
3483 	struct kmem_page_meta *meta_coalesce = meta - 1;
3484 	struct kmem_page_meta *meta_start = meta;
3485 	uint32_t num_chunks = kmem_get_chunk_len(meta);
3486 	uint32_t add_chunks;
3487 	struct kmem_page_meta *meta_end = meta + num_chunks;
3488 	struct kmem_page_meta *meta_hwm_l, *meta_hwm_r;
3489 	uint32_t front = kmem_get_front(range_id, from_right);
3490 
3491 	meta_hwm_l = kmem_meta_hwm[kmem_get_front(range_id, 0)];
3492 	meta_hwm_r = kmem_meta_hwm[kmem_get_front(range_id, 1)];
3493 
3494 	LIST_REMOVE(meta, km_link);
3495 	kmem_clear_meta_range(meta, num_chunks);
3496 
3497 	/*
3498 	 * Coalesce left
3499 	 */
3500 	if (((from_right && (meta_coalesce >= meta_hwm_r)) || !from_right) &&
3501 	    (meta_coalesce->km_page_marker == KMEM_META_FREE)) {
3502 		meta_start = meta_coalesce - kmem_get_free_chunk_len(meta_coalesce) + 1;
3503 		add_chunks = kmem_get_free_chunk_len(meta_start);
3504 		num_chunks += add_chunks;
3505 		LIST_REMOVE(meta_start, km_link);
3506 		kmem_clear_meta_range(meta_start + add_chunks - 1, 1);
3507 	}
3508 
3509 	/*
3510 	 * Coalesce right
3511 	 */
3512 	if (((!from_right && (meta_end < meta_hwm_l)) || from_right) &&
3513 	    (meta_end->km_page_marker == KMEM_META_FREE)) {
3514 		add_chunks = kmem_get_free_chunk_len(meta_end);
3515 		LIST_REMOVE(meta_end, km_link);
3516 		kmem_clear_meta_range(meta_end, 1);
3517 		meta_end = meta_end + add_chunks;
3518 		num_chunks += add_chunks;
3519 	}
3520 
3521 	kmem_init_free_chunk(meta_start, num_chunks, front);
3522 }
3523 
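/*
 * Mark a single slot free in its chunk's bitmap.  If the chunk becomes
 * entirely free it is returned via kmem_free_chunk(); if it was previously
 * full it is moved back to the partial list.
 */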
3524 static void
3525 kmem_free_slot(
3526 	kmem_range_id_t         range_id,
3527 	mach_vm_range_t         slot)
3528 {
3529 	struct kmem_page_meta *meta;
3530 	vm_map_offset_t chunk_start;
3531 	uint32_t size_idx, chunk_elem, slot_idx, num_elem;
3532 	struct kmem_sizeclass *sizeclass;
3533 	vm_map_size_t slot_size;
3534 
3535 	meta = kmem_addr_to_meta_start(slot->min_address, range_id, &chunk_start);
3536 	size_idx = meta->km_sizeclass;
3537 	slot_size = kmem_get_size_from_idx(size_idx);
3538 	slot_idx = (slot->min_address - chunk_start) / slot_size;
3539 	assert((meta->km_bitmap & kmem_slot_idx_to_bit(slot_idx, size_idx)) == 0);
3540 	meta->km_bitmap |= kmem_slot_idx_to_bit(slot_idx, size_idx);
3541 
3542 	sizeclass = &kmem_size_array[size_idx];
3543 	chunk_elem = sizeclass->ks_num_elem;
3544 	num_elem = __builtin_popcount(meta->km_bitmap);
3545 
3546 	if (num_elem == chunk_elem) {
3547 		/*
3548 		 * If the entire chunk is empty, add it to the empty list
3549 		 */
3550 		bool from_right = kmem_meta_is_from_right(range_id, meta);
3551 
3552 		kmem_free_chunk(range_id, meta, from_right);
3553 	} else if (num_elem == KMEM_NUM_GUARDS + 1) {
3554 		/*
3555 		 * If we freed a slot in a full chunk, move it to the partial list
3556 		 */
3557 		uint32_t front = kmem_get_front(range_id,
3558 		    kmem_meta_is_from_right(range_id, meta));
3559 
3560 		kmem_requeue_meta(meta, &sizeclass->ks_partial_head[front]);
3561 	}
3562 }
3563 
3564 void
3565 kmem_free_space(
3566 	vm_map_offset_t         start,
3567 	vm_map_offset_t         end,
3568 	vm_map_range_id_t       range_id,
3569 	mach_vm_range_t         slot)
3570 {
3571 	bool entry_present = false;
3572 	vm_map_entry_t prev_entry;
3573 	vm_map_entry_t next_entry;
3574 
3575 	if ((slot->min_address == start) && (slot->max_address == end)) {
3576 		/*
3577 		 * Entire slot is being freed at once
3578 		 */
3579 		return kmem_free_slot(range_id, slot);
3580 	}
3581 
3582 	entry_present = vm_map_lookup_entry(kernel_map, start, &prev_entry);
3583 	assert(!entry_present);
3584 	next_entry = prev_entry->vme_next;
3585 
3586 	if (((prev_entry == vm_map_to_entry(kernel_map) ||
3587 	    prev_entry->vme_end <= slot->min_address)) &&
3588 	    (next_entry == vm_map_to_entry(kernel_map) ||
3589 	    (next_entry->vme_start >= slot->max_address))) {
3590 		/*
3591 		 * Free entire slot
3592 		 */
3593 		kmem_free_slot(range_id, slot);
3594 	}
3595 }
3596 
3597 #pragma mark kmem init
3598 
3599 /*
3600  * The default percentage of memory that can be mlocked is scaled based on the total
3601  * amount of memory in the system. These percentages are calculated
3602  * offline and stored in this table. We index this table by
3603  * log2(max_mem) - VM_USER_WIREABLE_MIN_CONFIG. We clamp this index in the range
3604  * [0, sizeof(wire_limit_percents) / sizeof(vm_map_size_t))
3605  *
3606  * Note that these values were picked for mac.
3607  * If we ever have very large memory config arm devices, we may want to revisit
3608  * since the kernel overhead is smaller there due to the larger page size.
3609  */
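/*
 * Worked example (illustrative): on a 16GiB machine, log2(max_mem) is 34,
 * so the table index is 34 - VM_USER_WIREABLE_MIN_CONFIG = 2, selecting
 * 80% with CONFIG_JETSAM and 76% otherwise.
 */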
3610 
3611 /* Start scaling iff we're managing > 2^32 = 4GB of RAM. */
3612 #define VM_USER_WIREABLE_MIN_CONFIG 32
3613 #if CONFIG_JETSAM
3614 /* Systems with jetsam can wire a bit more b/c the system can relieve wired
3615  * pressure.
3616  */
3617 static vm_map_size_t wire_limit_percents[] =
3618 { 80, 80, 80, 80, 82, 85, 88, 91, 94, 97};
3619 #else
3620 static vm_map_size_t wire_limit_percents[] =
3621 { 70, 73, 76, 79, 82, 85, 88, 91, 94, 97};
3622 #endif /* CONFIG_JETSAM */
3623 
3624 /* Set limit to 95% of DRAM if serverperfmode=1 */
3625 #define VM_USER_SERVERPERF_WIRE_LIMIT_PERCENT 95
3626 /* Use special serverperfmode behavior iff DRAM > 2^35 = 32GiB of RAM. */
3627 #define VM_USER_SERVERPERF_WIREABLE_MIN_CONFIG 35
3628 
3629 /*
3630  * Sets the default global user wire limit which limits the amount of
3631  * memory that can be locked via mlock() based on the above algorithm.
3632  * This can be overridden via a sysctl.
3633  */
3634 static void
3635 kmem_set_user_wire_limits(void)
3636 {
3637 	uint64_t available_mem_log;
3638 	uint64_t max_wire_percent;
3639 	size_t wire_limit_percents_length = sizeof(wire_limit_percents) /
3640 	    sizeof(vm_map_size_t);
3641 	vm_map_size_t limit;
3642 	uint64_t config_memsize = max_mem;
3643 #if defined(XNU_TARGET_OS_OSX)
3644 	config_memsize = max_mem_actual;
3645 #endif /* defined(XNU_TARGET_OS_OSX) */
3646 
3647 	available_mem_log = bit_floor(config_memsize);
3648 
3649 	if (serverperfmode &&
3650 	    (available_mem_log >= VM_USER_SERVERPERF_WIREABLE_MIN_CONFIG)) {
3651 		max_wire_percent = VM_USER_SERVERPERF_WIRE_LIMIT_PERCENT;
3652 	} else {
3653 		if (available_mem_log < VM_USER_WIREABLE_MIN_CONFIG) {
3654 			available_mem_log = 0;
3655 		} else {
3656 			available_mem_log -= VM_USER_WIREABLE_MIN_CONFIG;
3657 		}
3658 		if (available_mem_log >= wire_limit_percents_length) {
3659 			available_mem_log = wire_limit_percents_length - 1;
3660 		}
3661 		max_wire_percent = wire_limit_percents[available_mem_log];
3662 	}
3663 
3664 	limit = config_memsize * max_wire_percent / 100;
3665 	/* Cap the number of non lockable bytes at VM_NOT_USER_WIREABLE_MAX */
3666 	if (config_memsize - limit > VM_NOT_USER_WIREABLE_MAX) {
3667 		limit = config_memsize - VM_NOT_USER_WIREABLE_MAX;
3668 	}
3669 
3670 	vm_global_user_wire_limit = limit;
3671 	/* the default per task limit is the same as the global limit */
3672 	vm_per_task_user_wire_limit = limit;
3673 	vm_add_wire_count_over_global_limit = 0;
3674 	vm_add_wire_count_over_user_limit = 0;
3675 }
3676 
3677 #define KMEM_MAX_CLAIMS 50
3678 __startup_data
3679 struct kmem_range_startup_spec kmem_claims[KMEM_MAX_CLAIMS] = {};
3680 
3681 #if !MACH_ASSERT
3682 __startup_data
3683 #endif /* !MACH_ASSERT */
3684 uint32_t kmem_claim_count = 0;
3685 
3686 #if MACH_ASSERT
3687 /**
3688  * Save off some minimal information about the ranges for consumption by
3689  * post-lockdown tests.
3690  */
3691 static struct mach_vm_range kmem_test_saved_ranges[KMEM_MAX_CLAIMS];
3692 #endif /* MACH_ASSERT */
3693 
3694 /**
3695  * For a requested claim size (i.e. kc_size), get the number of bytes which
3696  * should actually be allocated for a region in order to be able to properly
3697  * provide the requested size (the allocation size).
3698  *
3699  * This allocation size is always greater or equal to the claim size. It can,
3700  * for example, include additional space as required by the kernel memory
3701  * configuration.
3702  *
3703  * @param known_last Is the claim in question known to be the last region after
3704  * all placing has completed? The size for a known_last allocation is always
3705  * less than or equal to a non-known_last allocation of the same size.
3706  */
3707 __startup_func
3708 static vm_map_size_t
3709 kmem_claim_to_allocation_size(vm_map_size_t claim_size, bool known_last)
3710 {
3711 	(void)known_last;
3712 	/*
3713 	 * Allocation size and claim size are identical.
3714 	 */
3715 	return claim_size;
3716 }
3717 
3718 /**
3719  * Compute the largest claim which can be made from a given allocation size.
3720  */
3721 static vm_map_size_t
3722 kmem_allocation_to_claim_size(vm_map_size_t allocation_size)
3723 {
3724 	/*
3725 	 * Allocation size and claim size are identical.
3726 	 */
3727 	return allocation_size;
3728 }
3729 
3730 __startup_func
3731 void
3732 kmem_range_startup_init(
3733 	struct kmem_range_startup_spec *sp)
3734 {
3735 	assert(kmem_claim_count < KMEM_MAX_CLAIMS - KMEM_RANGE_COUNT);
3736 	if (sp->kc_calculate_sz) {
3737 		sp->kc_size = (sp->kc_calculate_sz)();
3738 	}
3739 	if (sp->kc_size) {
3740 		kmem_claims[kmem_claim_count] = *sp;
3741 		kmem_claim_count++;
3742 	}
3743 }
3744 
3745 static vm_offset_t
3746 kmem_fuzz_start(void)
3747 {
3748 	vm_offset_t kmapoff_kaddr = 0;
3749 	uint32_t kmapoff_pgcnt;
3750 
3751 	kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */
3752 
3753 	vm_map_size_t kmapoff_size = ptoa(kmapoff_pgcnt);
3754 
3755 	kmem_alloc(kernel_map, &kmapoff_kaddr, kmapoff_size,
3756 	    KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY,
3757 	    VM_KERN_MEMORY_OSFMK);
3758 
3759 
3760 	return kmapoff_kaddr + kmapoff_size;
3761 }
3762 
3763 /*
3764  * Generate a randomly shuffled array of indices from 0 to count - 1
3765  */
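/*
 * This is the "inside-out" variant of the Fisher-Yates shuffle: for each i,
 * a position j is drawn from [0, i], the value at j is moved to i, and i is
 * written at j, which yields a uniformly random permutation (assuming
 * kmem_get_random16(i) is uniform over [0, i]).
 */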
3766 __startup_func
3767 void
3768 kmem_shuffle(
3769 	uint16_t       *shuffle_buf,
3770 	uint16_t        count)
3771 {
3772 	for (uint16_t i = 0; i < count; i++) {
3773 		uint16_t j = kmem_get_random16(i);
3774 		if (j != i) {
3775 			shuffle_buf[i] = shuffle_buf[j];
3776 		}
3777 		shuffle_buf[j] = i;
3778 	}
3779 }
3780 
3781 __startup_func
3782 static void
3783 kmem_shuffle_claims(void)
3784 {
3785 	uint16_t shuffle_buf[KMEM_MAX_CLAIMS] = {};
3786 	uint16_t limit = (uint16_t)kmem_claim_count;
3787 
3788 	kmem_shuffle(&shuffle_buf[0], limit);
3789 	for (uint16_t i = 0; i < limit; i++) {
3790 		struct kmem_range_startup_spec tmp = kmem_claims[i];
3791 		kmem_claims[i] = kmem_claims[shuffle_buf[i]];
3792 		kmem_claims[shuffle_buf[i]] = tmp;
3793 	}
3794 }
3795 
3796 __startup_func
3797 static void
3798 kmem_readjust_ranges(
3799 	uint32_t        cur_idx)
3800 {
3801 	assert(cur_idx != 0);
3802 	uint32_t j = cur_idx - 1, random;
3803 	struct kmem_range_startup_spec sp = kmem_claims[cur_idx];
3804 	struct mach_vm_range *sp_range = sp.kc_range;
3805 	/*
3806 	 * Even if sp is currently last, it will never be last after it is moved.
3807 	 * As such, we want to bump other claims over it and include any necessary
3808 	 * padding for a non-last claim.
3809 	 *
3810 	 * While changing which claim is last can impact the total VA usage, since a
3811 	 * known_last allocation size is guaranteed to always be less-than-or-equal
3812 	 * to a non-known_last allocation (which is used for pre-placement sizing),
3813 	 * we will always have enough space so long as the pre-placement sizing had
3814 	 * enough space.
3815 	 */
3816 	vm_map_offset_t sp_allocation_size =
3817 	    kmem_claim_to_allocation_size(sp.kc_size, /* known_last */ false);
3818 
3819 	/*
3820 	 * Find max index where restriction is met
3821 	 */
3822 	for (; j > 0; j--) {
3823 		struct kmem_range_startup_spec spj = kmem_claims[j];
3824 		vm_map_offset_t max_start = spj.kc_range->min_address;
3825 		if (spj.kc_flags & KC_NO_MOVE) {
3826 			panic("kmem_range_init: Can't scramble with multiple constraints");
3827 		}
3828 		if (max_start <= sp_range->min_address) {
3829 			break;
3830 		}
3831 	}
3832 
3833 	/*
3834 	 * Pick a random index from 0 to max index and shift claims to the right
3835 	 * to make room for restricted claim
3836 	 */
3837 	random = kmem_get_random16((uint16_t)j);
3838 	assert(random <= j);
3839 
3840 	sp_range->min_address = kmem_claims[random].kc_range->min_address;
3841 	sp_range->max_address = sp_range->min_address + sp.kc_size;
3842 
3843 	for (j = cur_idx - 1; j >= random && j != UINT32_MAX; j--) {
3844 		struct kmem_range_startup_spec spj = kmem_claims[j];
3845 		struct mach_vm_range *range = spj.kc_range;
3846 		range->min_address += sp_allocation_size;
3847 		range->max_address += sp_allocation_size;
3848 		kmem_claims[j + 1] = spj;
3849 	}
3850 
3851 	sp.kc_flags |= KC_NO_MOVE;
3852 	kmem_claims[random] = sp;
3853 }
3854 
3855 __startup_func
3856 static void
3857 kmem_add_ptr_claims(void)
3858 {
3859 	uint64_t kmem_meta_num, kmem_ptr_chunks;
3860 	vm_map_size_t org_ptr_range_size __assert_only;
3861 
3862 	org_ptr_range_size = ptr_range_size;
3863 
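	/*
	 * Split the claimed VA between chunks and their metadata: each
	 * KMEM_CHUNK_SIZE_MIN of chunk VA needs one struct kmem_page_meta of
	 * metadata, so after reserving a page of slack the range is scaled by
	 * KMEM_CHUNK_SIZE_MIN / (KMEM_CHUNK_SIZE_MIN + sizeof(meta)) and then
	 * rounded down to a whole number of chunks.
	 */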
3864 	ptr_range_size -= PAGE_SIZE;
3865 	ptr_range_size *= KMEM_CHUNK_SIZE_MIN;
3866 	ptr_range_size /= (KMEM_CHUNK_SIZE_MIN + sizeof(struct kmem_page_meta));
3867 
3868 	kmem_ptr_chunks = ptr_range_size / KMEM_CHUNK_SIZE_MIN;
3869 	ptr_range_size = kmem_ptr_chunks * KMEM_CHUNK_SIZE_MIN;
3870 
3871 	kmem_meta_num = kmem_ptr_chunks + 2;
3872 	kmem_meta_size = round_page(kmem_meta_num * sizeof(struct kmem_page_meta));
3873 
3874 	assert(kmem_meta_size + ptr_range_size <= org_ptr_range_size);
3875 	/*
3876 	 * Add claims for kmem's ranges
3877 	 */
3878 	for (uint32_t i = 0; i < kmem_ptr_ranges; i++) {
3879 		struct kmem_range_startup_spec kmem_spec = {
3880 			.kc_name = "kmem_ptr_range",
3881 			.kc_range = &kmem_ranges[KMEM_RANGE_ID_PTR_0 + i],
3882 			.kc_size = ptr_range_size,
3883 			.kc_flags = KC_NO_ENTRY,
3884 		};
3885 		kmem_claims[kmem_claim_count++] = kmem_spec;
3886 
3887 		struct kmem_range_startup_spec kmem_meta_spec = {
3888 			.kc_name = "kmem_ptr_range_meta",
3889 			.kc_range = &kmem_meta_range[KMEM_RANGE_ID_PTR_0 + i],
3890 			.kc_size = kmem_meta_size,
3891 			.kc_flags = KC_NONE,
3892 		};
3893 		kmem_claims[kmem_claim_count++] = kmem_meta_spec;
3894 	}
3895 }
3896 
3897 __startup_func
3898 static void
3899 kmem_add_extra_claims(void)
3900 {
3901 	vm_map_size_t largest_free_size = 0, total_claims = 0;
3902 	vm_map_size_t sane_sprayqtn_size = 0, sprayqtn_allocation_size = 0;
3903 	vm_map_size_t ptr_total_allocation_size = 0;
3904 
3905 	vm_map_sizes(kernel_map, NULL, NULL, &largest_free_size);
3906 	largest_free_size = trunc_page(largest_free_size);
3907 
3908 	/*
3909 	 * kasan and configs w/o *TRR need to have just one ptr range due to
3910 	 * resource constraints.
3911 	 */
3912 #if !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
3913 	kmem_ptr_ranges = 1;
3914 #endif
3915 	/*
3916 	 * Determine size of data and pointer kmem_ranges
3917 	 */
3918 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
3919 		struct kmem_range_startup_spec sp_i = kmem_claims[i];
3920 
3921 		total_claims += kmem_claim_to_allocation_size(
3922 			sp_i.kc_size, /* known_last */ false);
3923 	}
3924 	assert((total_claims & PAGE_MASK) == 0);
3925 
3926 
3927 	largest_free_size -= total_claims;
3928 
3929 	/*
3930 	 * Use half the total available VA for all pointer allocations (this
3931 	 * includes the kmem_sprayqtn range). Given that we have 4 total
3932 	 * ranges, divide the available VA by 8.
3933 	 */
3934 	ptr_range_size = largest_free_size / ((kmem_ptr_ranges + 1) * 2);
3935 
3936 	sprayqtn_range_size = ptr_range_size;
3937 	sane_sprayqtn_size = kmem_claim_to_allocation_size(
3938 		/* claim_size */ sane_size / 2, /* known_last */ false);
3939 	if (sprayqtn_range_size > sane_sprayqtn_size) {
3940 		vm_map_size_t sprayqtn_extra;
3941 
3942 		/*
3943 		 * Spray quarantine doesn't need that much space.
3944 		 * Shrink it to something reasonable and equally share the leftover VA
3945 		 * with the other pointer ranges.
3946 		 */
3947 		sprayqtn_extra = sprayqtn_range_size - sane_sprayqtn_size;
3948 		sprayqtn_range_size -= sprayqtn_extra;
3949 		ptr_range_size += sprayqtn_extra / kmem_ptr_ranges;
3950 	}
3951 
3952 	ptr_range_size = round_page(ptr_range_size);
3953 	sprayqtn_range_size = round_page(sprayqtn_range_size);
3954 
3955 	/* Less any necessary allocation padding... */
3956 	ptr_range_size = kmem_allocation_to_claim_size(ptr_range_size);
3957 	sprayqtn_range_size = kmem_allocation_to_claim_size(sprayqtn_range_size);
3958 
3959 	/*
3960 	 * Add the pointer and metadata claims
3961 	 * Note: this call modifies ptr_range_size and may, depending on the padding
3962 	 * requirements, slightly increase or decrease the overall allocation size
3963 	 * of the pointer+metadata region.
3964 	 */
3965 	kmem_add_ptr_claims();
3966 
3967 	sprayqtn_allocation_size = kmem_claim_to_allocation_size(
3968 		sprayqtn_range_size, /* known_last */ false);
3969 	ptr_total_allocation_size =
3970 	    (kmem_claim_to_allocation_size(ptr_range_size, /* known_last */ false) +
3971 	    kmem_claim_to_allocation_size(kmem_meta_size, /* known_last */ false)) *
3972 	    kmem_ptr_ranges;
3973 
3974 	/*
3975 	 * Check: spray and ptr_range are minimally valid.
3976 	 * This is a useful assert as it should catch us if we were to end up with a
3977 	 * "negative" (or extremely large) data_range_size.
3978 	 */
3979 	assert(sprayqtn_allocation_size + ptr_total_allocation_size < largest_free_size);
3980 
3981 	/*
3982 	 * Finally, give any remaining allocable space to the data region.
3983 	 */
3984 	data_range_size = largest_free_size - sprayqtn_allocation_size -
3985 	    ptr_total_allocation_size;
3986 
3987 	/* Less any necessary allocation padding... */
3988 	data_range_size = kmem_allocation_to_claim_size(data_range_size);
3989 
3990 	/* Check: our allocations should all still fit in the free space */
3991 	assert(sprayqtn_allocation_size + ptr_total_allocation_size +
3992 	    kmem_claim_to_allocation_size(data_range_size, /* known_last */ false) <=
3993 	    largest_free_size);
3994 
3995 	struct kmem_range_startup_spec kmem_spec_sprayqtn = {
3996 		.kc_name = "kmem_sprayqtn_range",
3997 		.kc_range = &kmem_ranges[KMEM_RANGE_ID_SPRAYQTN],
3998 		.kc_size = sprayqtn_range_size,
3999 		.kc_flags = KC_NO_ENTRY,
4000 	};
4001 	kmem_claims[kmem_claim_count++] = kmem_spec_sprayqtn;
4002 
4003 	struct kmem_range_startup_spec kmem_spec_data = {
4004 		.kc_name = "kmem_data_range",
4005 		.kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA],
4006 		.kc_size = data_range_size,
4007 		.kc_flags = KC_NO_ENTRY,
4008 	};
4009 	kmem_claims[kmem_claim_count++] = kmem_spec_data;
4010 }
4011 
4012 __startup_func
4013 static void
4014 kmem_scramble_ranges(void)
4015 {
4016 	vm_map_offset_t va_alloc_head = 0;
4017 
4018 	/*
4019 	 * Initialize the KMEM_RANGE_ID_NONE range to use the entire map so that
4020 	 * the vm can find the requested ranges.
4021 	 */
4022 	kmem_ranges[KMEM_RANGE_ID_NONE].min_address = MAX(kernel_map->min_offset,
4023 	    VM_MAP_PAGE_SIZE(kernel_map));
4024 	kmem_ranges[KMEM_RANGE_ID_NONE].max_address = kernel_map->max_offset;
4025 
4026 	/*
4027 	 * Allocate the g_kext_map prior to randomizing the remaining submaps, as
4028 	 * this map is 2G in size and starts at the end of kernel_text on x86; it
4029 	 * could otherwise overflow into the heap.
4030 	 */
4031 	kext_alloc_init();
4032 
4033 	/*
4034 	 * Eat a random amount of kernel_map to fuzz subsequent heap, zone and
4035 	 * stack addresses. (With a 4K page and 9 bits of randomness, this
4036 	 * eats about 2M of VA from the map)
4037 	 *
4038 	 * Note that we always need to slide by at least one page because the VM
4039 	 * pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a base
4040 	 * do not admit this address to be part of any zone submap.
4041 	 */
4042 	va_alloc_head = kmem_fuzz_start();
4043 
4044 	/*
4045 	 * Add claims for ptr and data kmem_ranges
4046 	 */
4047 	kmem_add_extra_claims();
4048 
4049 	/*
4050 	 * Minimally verify that our placer will be able to resolve the constraints
4051 	 * of all claims
4052 	 */
4053 	bool has_min_address = false;
4054 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
4055 		struct kmem_range_startup_spec sp_i = kmem_claims[i];
4056 
4057 		/* Verify that we have only one claim with a min address constraint */
4058 		if (sp_i.kc_range->min_address) {
4059 			if (has_min_address) {
4060 				panic("Cannot place with multiple min_address constraints");
4061 			} else {
4062 				has_min_address = true;
4063 			}
4064 		}
4065 
4066 		if (sp_i.kc_range->max_address) {
4067 			panic("Cannot place with a max_address constraint");
4068 		}
4069 	}
4070 
4071 
4072 	/*
4073 	 * Shuffle registered claims
4074 	 */
4075 	assert(kmem_claim_count < UINT16_MAX);
4076 	kmem_shuffle_claims();
4077 
4078 	/*
4079 	 * Apply restrictions and determine range for each claim
4080 	 */
4081 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
4082 		struct kmem_range_startup_spec sp = kmem_claims[i];
4083 		struct mach_vm_range *sp_range = sp.kc_range;
4084 
4085 		/*
4086 		 * Find space using the allocation size (rather than the claim size) in
4087 		 * order to ensure we provide any applicable padding.
4088 		 */
4089 		bool is_last = (i == kmem_claim_count - 1);
4090 		vm_map_offset_t sp_allocation_size =
4091 		    kmem_claim_to_allocation_size(sp.kc_size, is_last);
4092 
4093 		if (vm_map_locate_space_anywhere(kernel_map, sp_allocation_size, 0,
4094 		    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vmkf_no_soft_limit = true),
4095 		    &va_alloc_head, NULL) != KERN_SUCCESS) {
4096 			panic("kmem_range_init: vm_map_locate_space failing for claim %s, "
4097 			    "size 0x%llx",
4098 			    sp.kc_name, sp_allocation_size);
4099 		}
4100 
4101 		/*
4102 		 * Re-adjust ranges if restriction not met
4103 		 */
4104 		if (sp_range->min_address && va_alloc_head > sp_range->min_address) {
4105 			kmem_readjust_ranges(i);
4106 		} else {
4107 			/*
4108 			 * Though the actual allocated space may be larger, provide only the
4109 			 * size requested by the original claim.
4110 			 */
4111 			sp_range->min_address = va_alloc_head;
4112 			sp_range->max_address = va_alloc_head + sp.kc_size;
4113 		}
4114 
4115 		va_alloc_head += sp_allocation_size;
4116 	}
4117 
4118 	/*
4119 	 * We have settled on the ranges, now create temporary entries for the
4120 	 * claims
4121 	 */
4122 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
4123 		struct kmem_range_startup_spec sp = kmem_claims[i];
4124 		bool is_last = (i == kmem_claim_count - 1);
4125 		vm_map_offset_t sp_allocation_size =
4126 		    kmem_claim_to_allocation_size(sp.kc_size, is_last);
4127 		vm_map_entry_t entry = NULL;
4128 		if (sp.kc_flags & KC_NO_ENTRY) {
4129 			continue;
4130 		}
4131 
4132 
4133 		/*
4134 		 * We reserve the full allocation size (rather than the claim size) so
4135 		 * that nothing ends up placed in the padding space (if applicable).
4136 		 */
4137 		if (vm_map_find_space(kernel_map, sp.kc_range->min_address,
4138 		    sp_allocation_size, 0,
4139 		    VM_MAP_KERNEL_FLAGS_ANYWHERE(.vmkf_no_soft_limit = true),
4140 		    &entry) != KERN_SUCCESS) {
4141 			panic("kmem_range_init: vm_map_find_space failing for claim %s",
4142 			    sp.kc_name);
4143 		}
4144 		vm_object_reference(kernel_object_default);
4145 		VME_OBJECT_SET(entry, kernel_object_default, false, 0);
4146 		VME_OFFSET_SET(entry, entry->vme_start);
4147 		vm_map_unlock(kernel_map);
4148 	}
4149 
4150 	/*
4151 	 * Now that we are done assigning all the ranges, reset
4152 	 * kmem_ranges[KMEM_RANGE_ID_NONE]
4153 	 */
4154 	kmem_ranges[KMEM_RANGE_ID_NONE] = (struct mach_vm_range) {};
4155 
4156 #if DEBUG || DEVELOPMENT
4157 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
4158 		struct kmem_range_startup_spec sp = kmem_claims[i];
4159 
4160 		printf("%-24s: %p - %p (%u%c)\n", sp.kc_name,
4161 		    (void *)sp.kc_range->min_address,
4162 		    (void *)sp.kc_range->max_address,
4163 		    mach_vm_size_pretty(sp.kc_size),
4164 		    mach_vm_size_unit(sp.kc_size));
4165 	}
4166 #endif /* DEBUG || DEVELOPMENT */
4167 
4168 #if MACH_ASSERT
4169 	/*
4170 	 * Since many parts of the claim infrastructure are marked as startup data
4171 	 * (and are thus unavailable post-lockdown), save off information our tests
4172 	 * need now.
4173 	 */
4174 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
4175 		kmem_test_saved_ranges[i] = *(kmem_claims[i].kc_range);
4176 	}
4177 #endif /* MACH_ASSERT */
4178 }
4179 
4180 __startup_func
4181 static void
4182 kmem_range_init(void)
4183 {
4184 	vm_size_t range_adjustment;
4185 
4186 	kmem_scramble_ranges();
4187 
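	/*
	 * kmem_large_ranges for the sprayqtn and data ranges cover only the
	 * upper 7/8ths of each range (range_adjustment below is 1/8th of the
	 * range size), steering larger allocations away from the range start.
	 */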
4188 	range_adjustment = sprayqtn_range_size >> 3;
4189 	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address =
4190 	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].min_address + range_adjustment;
4191 	kmem_large_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address =
4192 	    kmem_ranges[KMEM_RANGE_ID_SPRAYQTN].max_address;
4193 
4194 	range_adjustment = data_range_size >> 3;
4195 	kmem_large_ranges[KMEM_RANGE_ID_DATA].min_address =
4196 	    kmem_ranges[KMEM_RANGE_ID_DATA].min_address + range_adjustment;
4197 	kmem_large_ranges[KMEM_RANGE_ID_DATA].max_address =
4198 	    kmem_ranges[KMEM_RANGE_ID_DATA].max_address;
4199 
4200 	pmap_init();
4201 	kmem_metadata_init();
4202 	kmem_sizeclass_init();
4203 
4204 #if DEBUG || DEVELOPMENT
4205 	for (kmem_range_id_t i = 1; i < KMEM_RANGE_COUNT; i++) {
4206 		vm_size_t range_size = mach_vm_range_size(&kmem_large_ranges[i]);
4207 		printf("kmem_large_ranges[%d]    : %p - %p (%u%c)\n", i,
4208 		    (void *)kmem_large_ranges[i].min_address,
4209 		    (void *)kmem_large_ranges[i].max_address,
4210 		    mach_vm_size_pretty(range_size),
4211 		    mach_vm_size_unit(range_size));
4212 	}
4213 #endif
4214 }
4215 STARTUP(KMEM, STARTUP_RANK_THIRD, kmem_range_init);
4216 
4217 #if DEBUG || DEVELOPMENT
4218 __startup_func
4219 static void
4220 kmem_log_init(void)
4221 {
4222 	/*
4223 	 * The log can only be created after the kmem subsystem is initialized, as
4224 	 * btlog creation uses kmem
4225 	 */
4226 	kmem_outlier_log = btlog_create(BTLOG_LOG, KMEM_OUTLIER_LOG_SIZE, 0);
4227 }
4228 STARTUP(ZALLOC, STARTUP_RANK_FIRST, kmem_log_init);
4229 
4230 kmem_gobj_stats
4231 kmem_get_gobj_stats(void)
4232 {
4233 	kmem_gobj_stats stats = {};
4234 
4235 	vm_map_lock(kernel_map);
4236 	for (uint8_t i = 0; i < kmem_ptr_ranges; i++) {
4237 		kmem_range_id_t range_id = KMEM_RANGE_ID_FIRST + i;
4238 		struct mach_vm_range range = kmem_ranges[range_id];
4239 		struct kmem_page_meta *meta = kmem_meta_hwm[kmem_get_front(range_id, 0)];
4240 		struct kmem_page_meta *meta_end;
4241 		uint64_t meta_idx = meta - kmem_meta_base[range_id];
4242 		vm_map_size_t used = 0, va = 0, meta_sz = 0, pte_sz = 0;
4243 		vm_map_offset_t addr;
4244 		vm_map_entry_t entry;
4245 
4246 		/*
4247 		 * Left front
4248 		 */
4249 		va = (meta_idx * KMEM_CHUNK_SIZE_MIN);
4250 		meta_sz = round_page(meta_idx * sizeof(struct kmem_page_meta));
4251 
4252 		/*
4253 		 * Right front
4254 		 */
4255 		meta = kmem_meta_hwm[kmem_get_front(range_id, 1)];
4256 		meta_end = kmem_addr_to_meta(range.max_address, range_id, &addr,
4257 		    &meta_idx);
4258 		meta_idx = meta_end - meta;
4259 		meta_sz += round_page(meta_idx * sizeof(struct kmem_page_meta));
4260 		va += (meta_idx * KMEM_CHUNK_SIZE_MIN);
4261 
4262 		/*
4263 		 * Compute VA allocated in entire range
4264 		 */
4265 		if (vm_map_lookup_entry(kernel_map, range.min_address, &entry) == false) {
4266 			entry = entry->vme_next;
4267 		}
4268 		while (entry != vm_map_to_entry(kernel_map) &&
4269 		    entry->vme_start < range.max_address) {
4270 			used += (entry->vme_end - entry->vme_start);
4271 			entry = entry->vme_next;
4272 		}
4273 
4274 		pte_sz = round_page(atop(va - used) * 8);
4275 
4276 		stats.total_used += used;
4277 		stats.total_va += va;
4278 		stats.pte_sz += pte_sz;
4279 		stats.meta_sz += meta_sz;
4280 	}
4281 	vm_map_unlock(kernel_map);
4282 
4283 	return stats;
4284 }
4285 
4286 #endif /* DEBUG || DEVELOPMENT */
4287 
4288 /*
4289  *	kmem_init:
4290  *
4291  *	Initialize the kernel's virtual memory map, taking
4292  *	into account all memory allocated up to this time.
4293  */
4294 __startup_func
4295 void
4296 kmem_init(
4297 	vm_offset_t     start,
4298 	vm_offset_t     end)
4299 {
4300 	vm_map_offset_t map_start;
4301 	vm_map_offset_t map_end;
4302 
4303 	map_start = vm_map_trunc_page(start,
4304 	    VM_MAP_PAGE_MASK(kernel_map));
4305 	map_end = vm_map_round_page(end,
4306 	    VM_MAP_PAGE_MASK(kernel_map));
4307 
4308 	vm_map_will_allocate_early_map(&kernel_map);
4309 #if defined(__arm64__)
4310 	kernel_map = vm_map_create_options(pmap_kernel(),
4311 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS,
4312 	    VM_MAX_KERNEL_ADDRESS,
4313 	    VM_MAP_CREATE_DEFAULT);
4314 	/*
4315 	 *	Reserve virtual memory allocated up to this time.
4316 	 */
4317 	{
4318 		unsigned int    region_select = 0;
4319 		vm_map_offset_t region_start;
4320 		vm_map_size_t   region_size;
4321 		vm_map_offset_t map_addr;
4322 		kern_return_t kr;
4323 
4324 		while (pmap_virtual_region(region_select, &region_start, &region_size)) {
4325 			map_addr = region_start;
4326 			kr = vm_map_enter(kernel_map, &map_addr,
4327 			    vm_map_round_page(region_size,
4328 			    VM_MAP_PAGE_MASK(kernel_map)),
4329 			    (vm_map_offset_t) 0,
4330 			    VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(
4331 				    .vmkf_no_pmap_check = true,
4332 				    .vmkf_no_soft_limit = true),
4333 			    VM_OBJECT_NULL,
4334 			    (vm_object_offset_t) 0, FALSE, VM_PROT_NONE, VM_PROT_NONE,
4335 			    VM_INHERIT_DEFAULT);
4336 
4337 			if (kr != KERN_SUCCESS) {
4338 				panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
4339 				    (uint64_t) start, (uint64_t) end, (uint64_t) region_start,
4340 				    (uint64_t) region_size, kr);
4341 			}
4342 
4343 			region_select++;
4344 		}
4345 	}
4346 #else
4347 	kernel_map = vm_map_create_options(pmap_kernel(),
4348 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS, map_end,
4349 	    VM_MAP_CREATE_DEFAULT);
4350 	/*
4351 	 *	Reserve virtual memory allocated up to this time.
4352 	 */
4353 	if (start != VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
4354 		vm_map_offset_t map_addr;
4355 		kern_return_t kr;
4356 
4357 		map_addr = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
4358 		kr = vm_map_enter(kernel_map,
4359 		    &map_addr,
4360 		    (vm_map_size_t)(map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
4361 		    (vm_map_offset_t) 0,
4362 		    VM_MAP_KERNEL_FLAGS_FIXED(.vmkf_no_pmap_check = true),
4363 		    VM_OBJECT_NULL,
4364 		    (vm_object_offset_t) 0, FALSE,
4365 		    VM_PROT_NONE, VM_PROT_NONE,
4366 		    VM_INHERIT_DEFAULT);
4367 
4368 		if (kr != KERN_SUCCESS) {
4369 			panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
4370 			    (uint64_t) start, (uint64_t) end,
4371 			    (uint64_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS,
4372 			    (uint64_t) (map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
4373 			    kr);
4374 		}
4375 	}
4376 #endif
4377 
4378 	kmem_set_user_wire_limits();
4379 }
4380 
4381 
4382 #pragma mark map copyio
4383 static inline void
4384 current_thread_set_sec_override(bool val)
4385 {
4386 #pragma unused(val)
4387 }
4388 
4389 /*
4390  * Note: semantic types aren't used as `copyio` already validates.
4391  */
4392 
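/*
 * Each helper below handles three cases: the kernel pmap (direct memcpy),
 * the current map (plain copyin/copyout), and a foreign map, which takes a
 * map reference and temporarily switches address spaces with
 * vm_map_switch_to() around the copy.
 */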
4393 kern_return_t
4394 copyinmap(
4395 	vm_map_t                map,
4396 	vm_map_offset_t         fromaddr,
4397 	void                   *todata,
4398 	vm_size_t               length)
4399 {
4400 	kern_return_t kr = KERN_SUCCESS;
4401 	vm_map_switch_context_t switch_ctx;
4402 
4403 	if (vm_map_pmap(map) == pmap_kernel()) {
4404 		/* assume a correct copy */
4405 		memcpy(todata, CAST_DOWN(void *, fromaddr), length);
4406 	} else if (current_map() == map) {
4407 		if (copyin(fromaddr, todata, length) != 0) {
4408 			kr = KERN_INVALID_ADDRESS;
4409 		}
4410 	} else {
4411 		vm_map_reference(map);
4412 		current_thread_set_sec_override(true);
4413 		switch_ctx = vm_map_switch_to(map);
4414 		if (copyin(fromaddr, todata, length) != 0) {
4415 			kr = KERN_INVALID_ADDRESS;
4416 		}
4417 		current_thread_set_sec_override(false);
4418 		vm_map_switch_back(switch_ctx);
4419 		vm_map_deallocate(map);
4420 	}
4421 	return kr;
4422 }
4423 
4424 kern_return_t
4425 copyoutmap(
4426 	vm_map_t                map,
4427 	void                   *fromdata,
4428 	vm_map_address_t        toaddr,
4429 	vm_size_t               length)
4430 {
4431 	kern_return_t kr = KERN_SUCCESS;
4432 	vm_map_switch_context_t switch_ctx;
4433 
4434 	if (vm_map_pmap(map) == pmap_kernel()) {
4435 		/* assume a correct copy */
4436 		memcpy(CAST_DOWN(void *, toaddr), fromdata, length);
4437 	} else if (current_map() == map) {
4438 		if (copyout(fromdata, toaddr, length) != 0) {
4439 			ktriage_record(thread_tid(current_thread()),
4440 			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
4441 			    KDBG_TRIAGE_RESERVED,
4442 			    KDBG_TRIAGE_VM_COPYOUTMAP_SAMEMAP_ERROR),
4443 			    KERN_INVALID_ADDRESS /* arg */);
4444 			kr = KERN_INVALID_ADDRESS;
4445 		}
4446 	} else {
4447 		vm_map_reference(map);
4448 		current_thread_set_sec_override(true);
4449 		switch_ctx = vm_map_switch_to(map);
4450 		if (copyout(fromdata, toaddr, length) != 0) {
4451 			ktriage_record(thread_tid(current_thread()),
4452 			    KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM,
4453 			    KDBG_TRIAGE_RESERVED,
4454 			    KDBG_TRIAGE_VM_COPYOUTMAP_DIFFERENTMAP_ERROR),
4455 			    KERN_INVALID_ADDRESS /* arg */);
4456 			kr = KERN_INVALID_ADDRESS;
4457 		}
4458 		current_thread_set_sec_override(false);
4459 		vm_map_switch_back(switch_ctx);
4460 		vm_map_deallocate(map);
4461 	}
4462 	return kr;
4463 }
4464 
4465 kern_return_t
4466 copyoutmap_atomic32(
4467 	vm_map_t                map,
4468 	uint32_t                value,
4469 	vm_map_address_t        toaddr)
4470 {
4471 	kern_return_t kr = KERN_SUCCESS;
4472 	vm_map_switch_context_t switch_ctx;
4473 
4474 	if (vm_map_pmap(map) == pmap_kernel()) {
4475 		/* assume a correct toaddr */
4476 		*(uint32_t *)toaddr = value;
4477 	} else if (current_map() == map) {
4478 		if (copyout_atomic32(value, toaddr) != 0) {
4479 			kr = KERN_INVALID_ADDRESS;
4480 		}
4481 	} else {
4482 		vm_map_reference(map);
4483 		current_thread_set_sec_override(true);
4484 		switch_ctx = vm_map_switch_to(map);
4485 		if (copyout_atomic32(value, toaddr) != 0) {
4486 			kr = KERN_INVALID_ADDRESS;
4487 		}
4488 		current_thread_set_sec_override(false);
4489 		vm_map_switch_back(switch_ctx);
4490 		vm_map_deallocate(map);
4491 	}
4492 	return kr;
4493 }
4494 
4495 kern_return_t
4496 copyoutmap_atomic64(
4497 	vm_map_t                map,
4498 	uint64_t                value,
4499 	vm_map_address_t        toaddr)
4500 {
4501 	kern_return_t kr = KERN_SUCCESS;
4502 	vm_map_switch_context_t switch_ctx;
4503 
4504 	if (vm_map_pmap(map) == pmap_kernel()) {
4505 		/* assume a correct toaddr */
4506 		*(uint64_t *)toaddr = value;
4507 	} else if (current_map() == map) {
4508 		if (copyout_atomic64(value, toaddr) != 0) {
4509 			kr = KERN_INVALID_ADDRESS;
4510 		}
4511 	} else {
4512 		vm_map_reference(map);
4513 		current_thread_set_sec_override(true);
4514 		switch_ctx = vm_map_switch_to(map);
4515 		if (copyout_atomic64(value, toaddr) != 0) {
4516 			kr = KERN_INVALID_ADDRESS;
4517 		}
4518 		current_thread_set_sec_override(false);
4519 		vm_map_switch_back(switch_ctx);
4520 		vm_map_deallocate(map);
4521 	}
4522 	return kr;
4523 }
4524 
4525 
4526 #pragma mark pointer obfuscation / packing
4527 
4528 /*
4529  *
4530  *	The following two functions are to be used when exposing kernel
4531  *	addresses to userspace via any of the various debug or info
4532  *	facilities that exist. These are basically the same as VM_KERNEL_ADDRPERM()
4533  *	and VM_KERNEL_UNSLIDE_OR_PERM() except they use a different random seed and
4534  *	are exported to KEXTs.
4535  *
4536  *	NOTE: USE THE MACRO VERSIONS OF THESE FUNCTIONS (in vm_param.h) FROM WITHIN THE KERNEL
4537  */
4538 
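/*
 * Addresses are hashed as the first vm_offset_t worth of
 * SHA256(salt || stripped address); addresses within the slid kernel text
 * are unslid instead, presumably so they remain useful for symbolication.
 */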
4539 vm_offset_t
4540 vm_kernel_addrhash_internal(vm_offset_t addr, uint64_t salt)
4541 {
4542 	assert(salt != 0);
4543 
4544 	if (addr == 0) {
4545 		return 0ul;
4546 	}
4547 
4548 	if (VM_KERNEL_IS_SLID(addr)) {
4549 		return VM_KERNEL_UNSLIDE(addr);
4550 	}
4551 
4552 	addr = VM_KERNEL_STRIP_PTR(addr);
4553 
4554 	vm_offset_t sha_digest[SHA256_DIGEST_LENGTH / sizeof(vm_offset_t)];
4555 	SHA256_CTX sha_ctx;
4556 
4557 	SHA256_Init(&sha_ctx);
4558 	SHA256_Update(&sha_ctx, &salt, sizeof(salt));
4559 	SHA256_Update(&sha_ctx, &addr, sizeof(addr));
4560 	SHA256_Final(sha_digest, &sha_ctx);
4561 
4562 	return sha_digest[0];
4563 }
4564 
4565 __exported vm_offset_t
4566 vm_kernel_addrhash_external(vm_offset_t addr);
4567 vm_offset_t
4568 vm_kernel_addrhash_external(vm_offset_t addr)
4569 {
4570 	return vm_kernel_addrhash_internal(addr, vm_kernel_addrhash_salt_ext);
4571 }
4572 
4573 void
4574 vm_kernel_addrhide(
4575 	vm_offset_t addr,
4576 	vm_offset_t *hide_addr)
4577 {
4578 	*hide_addr = VM_KERNEL_ADDRHIDE(addr);
4579 }
4580 
4581 void
4582 vm_kernel_addrperm_external(
4583 	vm_offset_t addr,
4584 	vm_offset_t *perm_addr)
4585 {
4586 	addr = VM_KERNEL_STRIP_UPTR(addr);
4587 
4588 	if (VM_KERNEL_IS_SLID(addr)) {
4589 		*perm_addr = VM_KERNEL_UNSLIDE(addr);
4590 	} else if (VM_KERNEL_ADDRESS(addr)) {
4591 		*perm_addr = ML_ADDRPERM(addr, vm_kernel_addrperm_ext);
4592 	} else {
4593 		*perm_addr = addr;
4594 	}
4595 }
4596 
4597 void
4598 vm_kernel_unslide_or_perm_external(
4599 	vm_offset_t addr,
4600 	vm_offset_t *up_addr)
4601 {
4602 	vm_kernel_addrperm_external(addr, up_addr);
4603 }
4604 
4605 void
4606 vm_packing_pointer_invalid(vm_offset_t ptr, vm_packing_params_t params)
4607 {
4608 	if (ptr & ((1ul << params.vmpp_shift) - 1)) {
4609 		panic("pointer %p can't be packed: low %d bits aren't 0",
4610 		    (void *)ptr, params.vmpp_shift);
4611 	} else if (ptr <= params.vmpp_base) {
4612 		panic("pointer %p can't be packed: below base %p",
4613 		    (void *)ptr, (void *)params.vmpp_base);
4614 	} else {
4615 		panic("pointer %p can't be packed: maximum encodable pointer is %p",
4616 		    (void *)ptr, (void *)vm_packing_max_packable(params));
4617 	}
4618 }
4619 
4620 void
4621 vm_packing_verify_range(
4622 	const char *subsystem,
4623 	vm_offset_t min_address,
4624 	vm_offset_t max_address,
4625 	vm_packing_params_t params)
4626 {
4627 	if (min_address > max_address) {
4628 		panic("%s: %s range invalid min:%p > max:%p",
4629 		    __func__, subsystem, (void *)min_address, (void *)max_address);
4630 	}
4631 
4632 	if (!params.vmpp_base_relative) {
4633 		return;
4634 	}
4635 
4636 	if (min_address <= params.vmpp_base) {
4637 		panic("%s: %s range invalid min:%p <= base:%p",
4638 		    __func__, subsystem, (void *)min_address, (void *)params.vmpp_base);
4639 	}
4640 
4641 	if (max_address > vm_packing_max_packable(params)) {
4642 		panic("%s: %s range invalid max:%p >= max packable:%p",
4643 		    __func__, subsystem, (void *)max_address,
4644 		    (void *)vm_packing_max_packable(params));
4645 	}
4646 }
4647 
4648 #pragma mark tests
4649 #if MACH_ASSERT
4650 #include <sys/errno.h>
4651 
4652 static void
4653 kmem_test_for_entry(
4654 	vm_map_t                map,
4655 	vm_offset_t             addr,
4656 	void                  (^block)(vm_map_entry_t))
4657 {
4658 	vm_map_entry_t entry;
4659 
4660 	vm_map_lock(map);
4661 	block(vm_map_lookup_entry(map, addr, &entry) ? entry : NULL);
4662 	vm_map_unlock(map);
4663 }
4664 
4665 #define kmem_test_assert_map(map, pg, entries) ({ \
4666 	assert3u((map)->size, ==, ptoa(pg)); \
4667 	assert3u((map)->hdr.nentries, ==, entries); \
4668 })
4669 
4670 static bool
4671 can_write_at(vm_offset_t offs, uint32_t page)
4672 {
4673 	static const int zero;
4674 
4675 	return verify_write(&zero, (void *)(offs + ptoa(page) + 128), 1) == 0;
4676 }
4677 #define assert_writeable(offs, page) \
4678 	assertf(can_write_at(offs, page), \
4679 	    "can write at %p + ptoa(%d)", (void *)offs, page)
4680 
4681 #define assert_faults(offs, page) \
4682 	assertf(!can_write_at(offs, page), \
4683 	    "can write at %p + ptoa(%d)", (void *)offs, page)
4684 
4685 #define peek(offs, page) \
4686 	(*(uint32_t *)((offs) + ptoa(page)))
4687 
4688 #define poke(offs, page, v) \
4689 	(*(uint32_t *)((offs) + ptoa(page)) = (v))
4690 
4691 #if CONFIG_SPTM
4692 __attribute__((noinline))
4693 static void
4694 kmem_test_verify_type_policy(vm_offset_t addr, kmem_flags_t flags)
4695 {
4696 	extern bool use_xnu_restricted;
4697 	pmap_mapping_type_t expected_type = PMAP_MAPPING_TYPE_RESTRICTED;
4698 
4699 	/* Explicitly state the expected policy */
4700 	if (flags & (KMEM_DATA | KMEM_COMPRESSOR | KMEM_DATA_SHARED)) {
4701 		expected_type = PMAP_MAPPING_TYPE_DEFAULT;
4702 	}
4703 
4704 	/* If X_K_R is disabled, DEFAULT is the only possible mapping */
4705 	if (!use_xnu_restricted) {
4706 		expected_type = PMAP_MAPPING_TYPE_DEFAULT;
4707 	}
4708 
4709 	/* Verify if derived correctly */
4710 	assert3u(expected_type, ==, __kmem_mapping_type(flags));
4711 
4712 	pmap_paddr_t pa = kvtophys(addr);
4713 	if (pa == 0) {
4714 		return;
4715 	}
4716 
4717 	/* Verify if the mapped address actually got the expected type */
4718 	assert3u(expected_type, ==, sptm_get_frame_type(pa));
4719 }
4720 #endif /* CONFIG_SPTM */
4721 
4722 __attribute__((noinline))
4723 static void
4724 kmem_alloc_basic_test(vm_map_t map)
4725 {
4726 	kmem_guard_t guard = {
4727 		.kmg_tag = VM_KERN_MEMORY_DIAG,
4728 	};
4729 	vm_offset_t addr;
4730 
4731 	/*
4732 	 * Test wired basics:
4733 	 * - KMA_KOBJECT
4734 	 * - KMA_GUARD_FIRST, KMA_GUARD_LAST
4735 	 * - allocation alignment
4736 	 */
4737 	addr = kmem_alloc_guard(map, ptoa(10), ptoa(2) - 1,
4738 	    KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard).kmr_address;
4739 	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | GF | GL)", map);
4740 	assert3u((addr + PAGE_SIZE) % ptoa(2), ==, 0);
4741 	kmem_test_assert_map(map, 10, 1);
4742 
4743 	kmem_test_for_entry(map, addr, ^(__assert_only vm_map_entry_t e){
4744 		assertf(e, "unable to find address %p in map %p", (void *)addr, map);
4745 		assert(e->vme_kernel_object);
4746 		assert(!e->vme_atomic);
4747 		assert3u(e->vme_start, <=, addr);
4748 		assert3u(addr + ptoa(10), <=, e->vme_end);
4749 	});
4750 
4751 	assert_faults(addr, 0);
4752 	for (int i = 1; i < 9; i++) {
4753 		assert_writeable(addr, i);
4754 	}
4755 	assert_faults(addr, 9);
4756 
4757 	kmem_free(map, addr, ptoa(10));
4758 	kmem_test_assert_map(map, 0, 0);
4759 
4760 	/*
4761 	 * Test pageable basics.
4762 	 */
4763 	addr = kmem_alloc_guard(map, ptoa(10), 0,
4764 	    KMA_PAGEABLE, guard).kmr_address;
4765 	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | PG)", map);
4766 	kmem_test_assert_map(map, 10, 1);
4767 
4768 	for (int i = 0; i < 9; i++) {
4769 		assert_faults(addr, i);
4770 		poke(addr, i, 42);
4771 		assert_writeable(addr, i);
4772 	}
4773 
4774 	kmem_free_guard(map, addr, ptoa(10),
4775 	    KMF_GUARD_FIRST | KMF_GUARD_LAST, guard);
4776 	kmem_test_assert_map(map, 0, 0);
4777 }
4778 
4779 __attribute__((noinline))
4780 static void
4781 kmem_realloc_basic_test(vm_map_t map, kmr_flags_t kind)
4782 {
4783 	kmem_guard_t guard = {
4784 		.kmg_atomic  = !(kind & KMR_DATA),
4785 		.kmg_tag     = VM_KERN_MEMORY_DIAG,
4786 		.kmg_context = 0xefface,
4787 	};
4788 	vm_offset_t addr, newaddr;
4789 	const int N = 10;
4790 
4791 	/*
4792 	 *	This isn't something kmem_realloc_guard() _needs_ to do,
4793 	 *	we could conceive an implementation where it grows in place
4794 	 *	if there's space after it.
4795 	 *
4796 	 *	However, this is what the implementation does today.
4797 	 */
4798 	bool realloc_growth_changes_address = true;
4799 	bool GF = (kind & KMR_GUARD_FIRST);
4800 	bool GL = (kind & KMR_GUARD_LAST);
4801 
4802 	/*
4803 	 *	Initial N page allocation
4804 	 */
4805 	addr = kmem_alloc_guard(map, ptoa(N), 0,
4806 	    (kind & ~KMEM_FREEOLD) | KMA_ZERO, guard).kmr_address;
4807 	assert3u(addr, !=, 0);
4808 
4809 	kmem_test_assert_map(map, N, 1);
4810 	for (int pg = GF; pg < N - GL; pg++) {
4811 		poke(addr, pg, 42 + pg);
4812 	}
4813 	for (int pg = N - GL; pg < N; pg++) {
4814 		assert_faults(addr, pg);
4815 	}
4816 
4817 #if CONFIG_SPTM
4818 	kmem_test_verify_type_policy(addr, ANYF(kind));
4819 #endif /* CONFIG_SPTM */
4820 	/*
4821 	 *	Grow to N + 3 pages
4822 	 */
4823 	newaddr = kmem_realloc_guard(map, addr, ptoa(N), ptoa(N + 3),
4824 	    kind | KMR_ZERO, guard).kmr_address;
4825 	assert3u(newaddr, !=, 0);
4826 	if (realloc_growth_changes_address) {
4827 		assert3u(addr, !=, newaddr);
4828 	}
4829 	if ((kind & KMR_FREEOLD) || (addr == newaddr)) {
4830 		kmem_test_assert_map(map, N + 3, 1);
4831 	} else {
4832 		kmem_test_assert_map(map, 2 * N + 3, 2);
4833 	}
4834 	for (int pg = GF; pg < N - GL; pg++) {
4835 		assert3u(peek(newaddr, pg), ==, 42 + pg);
4836 	}
4837 	if ((kind & KMR_FREEOLD) == 0) {
4838 		for (int pg = GF; pg < N - GL; pg++) {
4839 			assert3u(peek(addr, pg), ==, 42 + pg);
4840 		}
4841 		/* check for true sharing */
4842 		poke(addr + 16, 0, 1234);
4843 		assert3u(peek(newaddr + 16, 0), ==, 1234);
4844 		kmem_free_guard(map, addr, ptoa(N),
4845 		    kind & (KMF_TAG | KMF_GUARD_FIRST | KMF_GUARD_LAST), guard);
4846 		kmem_test_assert_map(map, N + 3, 1);
4847 	}
4848 	if (addr != newaddr) {
4849 		for (int pg = GF; pg < N - GL; pg++) {
4850 			assert_faults(addr, pg);
4851 		}
4852 	}
4853 	for (int pg = N - GL; pg < N + 3 - GL; pg++) {
4854 		assert3u(peek(newaddr, pg), ==, 0);
4855 	}
4856 	for (int pg = N + 3 - GL; pg < N + 3; pg++) {
4857 		assert_faults(newaddr, pg);
4858 	}
4859 	addr = newaddr;
4860 
4861 
4862 	/*
4863 	 *	Shrink to N - 2 pages
4864 	 */
4865 	newaddr = kmem_realloc_guard(map, addr, ptoa(N + 3), ptoa(N - 2),
4866 	    kind | KMR_ZERO, guard).kmr_address;
4867 	assert3u(map->size, ==, ptoa(N - 2));
4868 	assert3u(newaddr, ==, addr);
4869 	kmem_test_assert_map(map, N - 2, 1);
4870 
4871 	for (int pg = GF; pg < N - 2 - GL; pg++) {
4872 		assert3u(peek(addr, pg), ==, 42 + pg);
4873 	}
4874 	for (int pg = N - 2 - GL; pg < N + 3; pg++) {
4875 		assert_faults(addr, pg);
4876 	}
4877 
4878 	kmem_free_guard(map, addr, ptoa(N - 2),
4879 	    kind & (KMF_TAG | KMF_GUARD_FIRST | KMF_GUARD_LAST), guard);
4880 	kmem_test_assert_map(map, 0, 0);
4881 }
4882 
4883 static int
4884 kmem_basic_test(__unused int64_t in, int64_t *out)
4885 {
4886 	mach_vm_offset_t addr;
4887 	vm_map_t map;
4888 
4889 	printf("%s: test running\n", __func__);
4890 
4891 	map = kmem_suballoc(kernel_map, &addr, 64U << 20,
4892 	        VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
4893 	        KMS_NOFAIL | KMS_DATA, VM_KERN_MEMORY_DIAG).kmr_submap;
4894 
4895 	printf("%s: kmem_alloc ...\n", __func__);
4896 	kmem_alloc_basic_test(map);
4897 	printf("%s:     PASS\n", __func__);
4898 
4899 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
4900 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD);
4901 	printf("%s:     PASS\n", __func__);
4902 
4903 	printf("%s: kmem_realloc (KMR_FREEOLD) ...\n", __func__);
4904 	kmem_realloc_basic_test(map, KMR_FREEOLD);
4905 	printf("%s:     PASS\n", __func__);
4906 
4907 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
4908 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST);
4909 	printf("%s:     PASS\n", __func__);
4910 
4911 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
4912 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
4913 	printf("%s:     PASS\n", __func__);
4914 
4915 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
4916 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
4917 	printf("%s:     PASS\n", __func__);
4918 
4919 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_FIRST) ...\n", __func__);
4920 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_FIRST);
4921 	printf("%s:     PASS\n", __func__);
4922 
4923 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
4924 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_LAST);
4925 	printf("%s:     PASS\n", __func__);
4926 
4927 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST) ...\n", __func__);
4928 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_FIRST | KMR_GUARD_LAST);
4929 	printf("%s:     PASS\n", __func__);
4930 
4931 
4932 	/* using KMR_DATA signals to test the non-atomic realloc path */
4933 	printf("%s: kmem_realloc (KMR_DATA | KMR_FREEOLD) ...\n", __func__);
4934 	kmem_realloc_basic_test(map, KMR_DATA | KMR_FREEOLD);
4935 	printf("%s:     PASS\n", __func__);
4936 
4937 	printf("%s: kmem_realloc (KMR_DATA) ...\n", __func__);
4938 	kmem_realloc_basic_test(map, KMR_DATA);
4939 	printf("%s:     PASS\n", __func__);
4940 
4941 	/* test KMR_DATA_SHARED for the new shared kheap */
4942 	printf("%s: kmem_realloc (KMR_DATA_SHARED) ...\n", __func__);
4943 	kmem_realloc_basic_test(map, KMR_DATA_SHARED);
4944 	printf("%s:     PASS\n", __func__);
4945 
4946 	kmem_free_guard(kernel_map, addr, 64U << 20, KMF_NONE, KMEM_GUARD_SUBMAP);
4947 	vm_map_deallocate(map);
4948 
4949 	printf("%s: test passed\n", __func__);
4950 	*out = 1;
4951 	return 0;
4952 }
4953 SYSCTL_TEST_REGISTER(kmem_basic, kmem_basic_test);
4954 
4955 static void
4956 kmem_test_get_size_idx_for_chunks(uint32_t chunks)
4957 {
4958 	__assert_only uint32_t idx = kmem_get_size_idx_for_chunks(chunks);
4959 
4960 	assert(chunks >= kmem_size_array[idx].ks_num_chunk);
4961 }
4962 
4963 __attribute__((noinline))
4964 static void
4965 kmem_test_get_size_idx_for_all_chunks()
4966 {
4967 	for (uint32_t i = 0; i < KMEM_NUM_SIZECLASS; i++) {
4968 		uint32_t chunks = kmem_size_array[i].ks_num_chunk;
4969 
4970 		if (chunks != 1) {
4971 			kmem_test_get_size_idx_for_chunks(chunks - 1);
4972 		}
4973 		kmem_test_get_size_idx_for_chunks(chunks);
4974 		kmem_test_get_size_idx_for_chunks(chunks + 1);
4975 	}
4976 }
4977 
4978 static int
4979 kmem_guard_obj_test(__unused int64_t in, int64_t *out)
4980 {
4981 	printf("%s: test running\n", __func__);
4982 
4983 	printf("%s: kmem_get_size_idx_for_chunks\n", __func__);
4984 	kmem_test_get_size_idx_for_all_chunks();
4985 	printf("%s:     PASS\n", __func__);
4986 
4987 	printf("%s: test passed\n", __func__);
4988 	*out = 1;
4989 	return 0;
4990 }
4991 SYSCTL_TEST_REGISTER(kmem_guard_obj, kmem_guard_obj_test);
4992 
4993 
4994 #endif /* MACH_ASSERT */
4995