/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */
/*
 */
/*
 *	File:	vm/vm_kern.c
 *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
 *	Date:	1985
 *
 *	Kernel memory management.
 */

#include <mach/kern_return.h>
#include <mach/vm_param.h>
#include <kern/assert.h>
#include <kern/thread.h>
#include <vm/vm_kern.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_compressor.h>
#include <vm/vm_pageout.h>
#include <vm/vm_init.h>
#include <vm/vm_fault.h>
#include <kern/misc_protos.h>
#include <vm/cpm.h>
#include <kern/ledger.h>
#include <kern/bits.h>
#include <kern/startup.h>

#include <string.h>

#include <libkern/OSDebug.h>
#include <libkern/crypto/sha2.h>
#include <libkern/section_keywords.h>
#include <sys/kdebug.h>

#include <san/kasan.h>
#include <kern/kext_alloc.h>
#include <kern/backtrace.h>
#include <os/hash.h>
#include <kern/zalloc_internal.h>

/*
 *	Variables exported by this module.
 */

SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map;
SECURITY_READ_ONLY_LATE(struct kmem_range) kmem_ranges[KMEM_RANGE_COUNT] = {};
#if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
__startup_data
vm_map_size_t data_range_size, ptr_range_size;
SECURITY_READ_ONLY_LATE(struct kmem_range)
kmem_large_ranges[KMEM_RANGE_COUNT] = {};
#endif
TUNABLE(uint32_t, kmem_ptr_ranges, "kmem_ptr_ranges", 2);

#pragma mark helpers

__attribute__((overloadable))
__header_always_inline kmem_flags_t
ANYF(kma_flags_t flags)
{
	return (kmem_flags_t)flags;
}

__attribute__((overloadable))
__header_always_inline kmem_flags_t
ANYF(kmr_flags_t flags)
{
	return (kmem_flags_t)flags;
}

__attribute__((overloadable))
__header_always_inline kmem_flags_t
ANYF(kmf_flags_t flags)
{
	return (kmem_flags_t)flags;
}

__abortlike
static void
__kmem_invalid_size_panic(
	vm_map_t        map,
	vm_size_t       size,
	uint32_t        flags)
{
	panic("kmem(map=%p, flags=0x%x): invalid size %zd",
	    map, flags, (size_t)size);
}

__abortlike
static void
__kmem_invalid_arguments_panic(
	const char     *what,
	vm_map_t        map,
	vm_address_t    address,
	vm_size_t       size,
	uint32_t        flags)
{
	panic("kmem_%s(map=%p, addr=%p, size=%zd, flags=0x%x): "
	    "invalid arguments passed",
	    what, map, (void *)address, (size_t)size, flags);
}

__abortlike
static void
__kmem_failed_panic(
	vm_map_t        map,
	vm_size_t       size,
	uint32_t        flags,
	kern_return_t   kr,
	const char     *what)
{
	panic("kmem_%s(%p, %zd, 0x%x): failed with %d",
	    what, map, (size_t)size, flags, kr);
}

__abortlike
static void
__kmem_entry_not_found_panic(
	vm_map_t        map,
	vm_offset_t     addr)
{
	panic("kmem(map=%p) no entry found at %p", map, (void *)addr);
}

__abortlike
static void
__kmem_invalid_object_panic(uint32_t flags)
{
	if (flags == 0) {
		panic("KMEM_KOBJECT or KMEM_COMPRESSOR is required");
	}
	panic("more than one of KMEM_KOBJECT or KMEM_COMPRESSOR specified");
}

static inline vm_object_t
__kmem_object(kmem_flags_t flags)
{
	flags &= (KMEM_KOBJECT | KMEM_COMPRESSOR);
	if (flags == 0 || (flags & (flags - 1))) {
		__kmem_invalid_object_panic(flags);
	}

	return (flags & KMEM_KOBJECT) ? kernel_object : compressor_object;
}
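
/*
 * Note on __kmem_object() above: `flags & (flags - 1)` is the classic
 * power-of-two test.  It clears the lowest set bit, so the expression is
 * non-zero iff more than one bit survived the mask; combined with the
 * `flags == 0` check, exactly one of KMEM_KOBJECT or KMEM_COMPRESSOR must
 * have been passed.  For example, passing both flags leaves two bits set
 * and trips __kmem_invalid_object_panic().
 */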

static inline vm_size_t
__kmem_guard_left(kmem_flags_t flags)
{
	return (flags & KMEM_GUARD_FIRST) ? PAGE_SIZE : 0;
}

static inline vm_size_t
__kmem_guard_right(kmem_flags_t flags)
{
	return (flags & KMEM_GUARD_LAST) ? PAGE_SIZE : 0;
}

static inline vm_size_t
__kmem_guard_size(kmem_flags_t flags)
{
	return __kmem_guard_left(flags) + __kmem_guard_right(flags);
}


#pragma mark kmem range methods

__attribute__((overloadable))
__header_always_inline bool
kmem_range_contains(const struct kmem_range *r, vm_offset_t addr)
{
	vm_offset_t rmin, rmax;

#if CONFIG_KERNEL_TBI
	addr = VM_KERNEL_TBI_FILL(addr);
#endif /* CONFIG_KERNEL_TBI */

	/*
	 * The `&` is not a typo: we really expect the check to pass,
	 * so encourage the compiler to eagerly load and test without branches
	 */
	kmem_range_load(r, rmin, rmax);
	return (addr >= rmin) & (addr < rmax);
}

__attribute__((overloadable))
__header_always_inline bool
kmem_range_contains(const struct kmem_range *r, vm_offset_t addr, vm_offset_t size)
{
	vm_offset_t rmin, rmax;

#if CONFIG_KERNEL_TBI
	addr = VM_KERNEL_TBI_FILL(addr);
#endif /* CONFIG_KERNEL_TBI */

	/*
	 * The `&` is not a typo: we really expect the check to pass,
	 * so encourage the compiler to eagerly load and test without branches
	 */
	kmem_range_load(r, rmin, rmax);
	return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
}
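
/*
 * Note on the size-taking overload above: the seemingly redundant
 * `addr + size >= rmin` term also rejects ranges where `addr + size`
 * wraps around the address space, so a huge `size` that overflows to a
 * small value cannot sneak past the `addr + size <= rmax` upper-bound
 * check.
 */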

__header_always_inline vm_size_t
kmem_range_size(const struct kmem_range *r)
{
	vm_offset_t rmin, rmax;

	kmem_range_load(r, rmin, rmax);
	return rmax - rmin;
}

bool
kmem_range_id_contains(kmem_range_id_t range_id, vm_map_offset_t addr,
    vm_map_size_t size)
{
	return kmem_range_contains(&kmem_ranges[range_id], addr, size);
}

kmem_range_id_t
kmem_addr_get_range(vm_map_offset_t addr, vm_map_size_t size)
{
	kmem_range_id_t range_id = 0;
	for (; range_id < KMEM_RANGE_COUNT; range_id++) {
		if (kmem_range_id_contains(range_id, addr, size)) {
			break;
		}
	}
	return range_id;
}
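
/*
 * Usage sketch (illustrative, not part of this file): because the loop
 * above falls through without a `break` when nothing matches,
 * kmem_addr_get_range() returns KMEM_RANGE_COUNT for addresses outside
 * every range, which callers can use as a "not a kmem address" test:
 *
 *	kmem_range_id_t id = kmem_addr_get_range(addr, size);
 *	if (id == KMEM_RANGE_COUNT) {
 *		// [addr, addr + size) is not covered by any kmem range
 *	}
 */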


#pragma mark entry parameters


__abortlike
static void
__kmem_entry_validate_panic(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_offset_t     addr,
	vm_size_t       size,
	uint32_t        flags,
	kmem_guard_t    guard)
{
	const char *what = "???";

	if (entry->vme_atomic != guard.kmg_atomic) {
		what = "atomicity";
	} else if (entry->is_sub_map != guard.kmg_submap) {
		what = "objectness";
	} else if (addr != entry->vme_start) {
		what = "left bound";
	} else if ((flags & KMF_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
		what = "right bound";
#if __LP64__
	} else if (guard.kmg_context != entry->vme_context) {
		what = "guard";
#endif
	}

	panic("kmem(map=%p, addr=%p, size=%zd, flags=0x%x): "
	    "entry:%p %s mismatch guard(0x%08x)",
	    map, (void *)addr, (size_t)size, flags, entry,
	    what, guard.kmg_context);
}

static bool
__kmem_entry_validate_guard(
	vm_map_entry_t  entry,
	vm_offset_t     addr,
	vm_size_t       size,
	kmem_flags_t    flags,
	kmem_guard_t    guard)
{
	if (entry->vme_atomic != guard.kmg_atomic) {
		return false;
	}

	if (!guard.kmg_atomic) {
		return true;
	}

	if (entry->is_sub_map != guard.kmg_submap) {
		return false;
	}

	if (addr != entry->vme_start) {
		return false;
	}

	if ((flags & KMEM_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
		return false;
	}

#if __LP64__
	if (!guard.kmg_submap && guard.kmg_context != entry->vme_context) {
		return false;
	}
#endif

	return true;
}

void
kmem_entry_validate_guard(
	vm_map_t        map,
	vm_map_entry_t  entry,
	vm_offset_t     addr,
	vm_size_t       size,
	kmem_guard_t    guard)
{
	if (!__kmem_entry_validate_guard(entry, addr, size, KMEM_NONE, guard)) {
		__kmem_entry_validate_panic(map, entry, addr, size, KMEM_NONE, guard);
	}
}

__abortlike
static void
__kmem_entry_validate_object_panic(
	vm_map_t        map,
	vm_map_entry_t  entry,
	kmem_flags_t    flags)
{
	const char *what;
	const char *verb;

	if (entry->is_sub_map) {
		panic("kmem(map=%p) entry %p is a submap", map, entry);
	}

	if (flags & KMEM_KOBJECT) {
		what = "kernel";
		verb = "isn't";
	} else if (flags & KMEM_COMPRESSOR) {
		what = "compressor";
		verb = "isn't";
	} else if (entry->vme_kernel_object) {
		what = "kernel";
		verb = "is unexpectedly";
	} else {
		what = "compressor";
		verb = "is unexpectedly";
	}

	panic("kmem(map=%p, flags=0x%x): entry %p %s for the %s object",
	    map, flags, entry, verb, what);
}

static bool
__kmem_entry_validate_object(
	vm_map_entry_t  entry,
	kmem_flags_t    flags)
{
	if (entry->is_sub_map) {
		return false;
	}
	if ((bool)(flags & KMEM_KOBJECT) != entry->vme_kernel_object) {
		return false;
	}

	return (bool)(flags & KMEM_COMPRESSOR) ==
	       (VME_OBJECT(entry) == compressor_object);
}

vm_size_t
kmem_size_guard(
	vm_map_t        map,
	vm_offset_t     addr,
	kmem_guard_t    guard)
{
	kmem_flags_t flags = KMEM_GUESS_SIZE;
	vm_map_entry_t entry;
	vm_size_t size;

	vm_map_lock_read(map);

	if (!vm_map_lookup_entry(map, addr, &entry)) {
		__kmem_entry_not_found_panic(map, addr);
	}

	if (!__kmem_entry_validate_guard(entry, addr, 0, flags, guard)) {
		__kmem_entry_validate_panic(map, entry, addr, 0, flags, guard);
	}

	size = (vm_size_t)(entry->vme_end - entry->vme_start);

	vm_map_unlock_read(map);

	return size;
}
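
/*
 * Usage sketch (illustrative; the kmem_guard_t fields are the ones this
 * file already uses, but the initializer below is hypothetical): an owner
 * of an atomic allocation can recover its rounded size from the map, for
 * instance to later pass an exact size to kmem_free_guard():
 *
 *	kmem_guard_t g = {
 *		.kmg_atomic  = 1,
 *		.kmg_tag     = tag,
 *		.kmg_context = ctx,
 *	};
 *	vm_size_t sz = kmem_size_guard(kernel_map, addr, g);
 *	kmem_free_guard(kernel_map, addr, sz, KMF_NONE, g);
 */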

#if ZSECURITY_CONFIG(KALLOC_TYPE) && ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
static inline uint16_t
kmem_hash_backtrace(
	void                     *fp)
{
	uint64_t  bt_count;
	uintptr_t bt[8] = {};

	struct backtrace_control ctl = {
		.btc_frame_addr = (uintptr_t)fp,
	};

	bt_count = backtrace(bt, sizeof(bt) / sizeof(bt[0]), &ctl, NULL);
	return (uint16_t) os_hash_jenkins(bt, bt_count * sizeof(bt[0]));
}
#endif
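
/*
 * kmem_hash_backtrace() captures up to 8 kernel return addresses starting
 * at the caller-provided frame pointer and folds them through a Jenkins
 * hash.  The 16-bit result seeds the range/direction choice in
 * kmem_apply_security_policy() below, so distinct allocation sites tend
 * to be scattered across distinct pointer ranges.
 */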

static_assert(KMEM_RANGE_ID_DATA - 1 <= KMEM_RANGE_MASK,
    "Insufficient bits to represent ptr ranges");

kmem_range_id_t
kmem_adjust_range_id(
	uint32_t                  hash)
{
#if ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
	return (kmem_range_id_t) (KMEM_RANGE_ID_PTR_0 +
	       (hash & KMEM_RANGE_MASK) % kmem_ptr_ranges);
#else
	(void)hash;
	return KMEM_RANGE_ID_PTR_0;
#endif
}

static void
kmem_apply_security_policy(
	vm_map_t                  map,
	kma_flags_t               kma_flags,
	kmem_guard_t              guard,
	vm_map_kernel_flags_t    *vmk_flags,
	bool                      assert_dir __unused)
{
	kmem_range_id_t range_id;
	bool direction;
	uint16_t type_hash = guard.kmg_type_hash;

	if (startup_phase < STARTUP_SUB_KMEM || map != kernel_map) {
		return;
	}

	/*
	 * When ZSECURITY_CONFIG(KALLOC_TYPE) is enabled, a non-zero type-hash
	 * must be passed by krealloc_type.
	 */
#if (DEBUG || DEVELOPMENT) && ZSECURITY_CONFIG(KALLOC_TYPE)
	if (assert_dir && !(kma_flags & KMA_DATA)) {
		assert(type_hash != 0);
	}
#endif

	if (kma_flags & KMA_DATA) {
		range_id = KMEM_RANGE_ID_DATA;
		/*
		 * As an optimization in KMA_DATA to avoid fragmentation,
		 * allocate static carveouts at the end of the DATA range.
		 */
		direction = (bool)(kma_flags & KMA_PERMANENT);
	} else if (type_hash) {
		range_id = type_hash & KMEM_RANGE_MASK;
		direction = type_hash & KMEM_DIRECTION_MASK;
	} else {
#if ZSECURITY_CONFIG(KALLOC_TYPE) && ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
		type_hash = (uint16_t) kmem_hash_backtrace(__builtin_frame_address(0));
#endif
		/*
		 * Range id needs to correspond to one of the PTR ranges
		 */
		range_id = kmem_adjust_range_id(type_hash);
		direction = type_hash & KMEM_DIRECTION_MASK;
	}

	vmk_flags->vmkf_range_id = range_id;
	vmk_flags->vmkf_last_free = direction;
}
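
/*
 * Sketch of the policy above (illustrative): a 16-bit type hash encodes
 * both the target range (its low bits, selected by KMEM_RANGE_MASK) and
 * the fill direction (the KMEM_DIRECTION_MASK bit):
 *
 *	range_id  = type_hash & KMEM_RANGE_MASK;      // which PTR range
 *	direction = type_hash & KMEM_DIRECTION_MASK;  // bottom-up vs top-down
 *
 * KMA_DATA short-circuits all of that: every data allocation lands in the
 * shared KMEM_RANGE_ID_DATA range, with KMA_PERMANENT carveouts packed at
 * the far end of it to reduce fragmentation.
 */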

#pragma mark allocation

kern_return_t
kmem_alloc_contig(
	vm_map_t                map,
	vm_offset_t             *addrp,
	vm_size_t               size,
	vm_offset_t             mask,
	ppnum_t                 max_pnum,
	ppnum_t                 pnum_mask,
	kma_flags_t             flags,
	vm_tag_t                tag)
{
	vm_object_t             object;
	vm_object_offset_t      offset;
	vm_map_offset_t         map_addr;
	vm_map_offset_t         map_mask;
	vm_map_size_t           map_size, i;
	vm_map_entry_t          entry;
	vm_page_t               m, pages;
	kern_return_t           kr;
	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;

	assert(VM_KERN_MEMORY_NONE != tag);
	assert(map);
	assert3u(flags & ~KMEM_ALLOC_CONTIG_FLAGS, ==, 0);

	map_size = vm_map_round_page(size, VM_MAP_PAGE_MASK(map));
	map_mask = (vm_map_offset_t)mask;

	/* Check for zero allocation size (either directly or via overflow) */
	if (map_size == 0) {
		*addrp = 0;
		return KERN_INVALID_ARGUMENT;
	}

	/*
	 *	Allocate a new object (if necessary) and the reference we
	 *	will be donating to the map entry.  We must do this before
	 *	locking the map, or risk deadlock with the default pager.
	 */
	if ((flags & KMA_KOBJECT) != 0) {
		object = kernel_object;
		vm_object_reference(object);
	} else {
		object = vm_object_allocate(map_size);
	}
	if (flags & KMA_PERMANENT) {
		vmk_flags.vmkf_permanent = true;
	}
	kmem_apply_security_policy(map, flags, KMEM_GUARD_NONE, &vmk_flags, false);

	kr = vm_map_find_space(map, 0, map_size, map_mask,
	    vmk_flags, &entry);
	if (KERN_SUCCESS != kr) {
		vm_object_deallocate(object);
		return kr;
	}

	map_addr = entry->vme_start;
	if (object == kernel_object) {
		offset = map_addr;
	} else {
		offset = 0;
	}
	VME_OBJECT_SET(entry, object, false, 0);
	VME_OFFSET_SET(entry, offset);
	VME_ALIAS_SET(entry, tag);

	/* Take an extra object ref in case the map entry gets deleted */
	vm_object_reference(object);
	vm_map_unlock(map);

	kr = cpm_allocate(CAST_DOWN(vm_size_t, map_size), &pages, max_pnum, pnum_mask, FALSE, flags);

	if (kr != KERN_SUCCESS) {
		vm_map_remove(map,
		    vm_map_trunc_page(map_addr,
		    VM_MAP_PAGE_MASK(map)),
		    vm_map_round_page(map_addr + map_size,
		    VM_MAP_PAGE_MASK(map)));
		vm_object_deallocate(object);
		*addrp = 0;
		return kr;
	}

	if (flags & KMA_ZERO) {
		for (m = pages; m; m = NEXT_PAGE(m)) {
			vm_page_zero_fill(m);
		}
	}


	vm_object_lock(object);
	for (i = 0; i < map_size; i += PAGE_SIZE) {
		m = pages;
		pages = NEXT_PAGE(m);
		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
		m->vmp_busy = FALSE;
		vm_page_insert(m, object, offset + i);
	}
	vm_object_unlock(object);

	kr = vm_map_wire_kernel(map,
	    vm_map_trunc_page(map_addr,
	    VM_MAP_PAGE_MASK(map)),
	    vm_map_round_page(map_addr + map_size,
	    VM_MAP_PAGE_MASK(map)),
	    VM_PROT_DEFAULT, tag,
	    FALSE);

	if (kr != KERN_SUCCESS) {
		if (object == kernel_object) {
			vm_object_lock(object);
			vm_object_page_remove(object, offset, offset + map_size);
			vm_object_unlock(object);
		}
		vm_map_remove(map,
		    vm_map_trunc_page(map_addr,
		    VM_MAP_PAGE_MASK(map)),
		    vm_map_round_page(map_addr + map_size,
		    VM_MAP_PAGE_MASK(map)));
		vm_object_deallocate(object);
		return kr;
	}
	vm_object_deallocate(object);

	if (object == kernel_object) {
		vm_map_simplify(map, map_addr);
		vm_tag_update_size(tag, map_size);
	}
	*addrp = (vm_offset_t) map_addr;
	assert((vm_map_offset_t) *addrp == map_addr);

	return KERN_SUCCESS;
}
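
/*
 * Usage sketch (illustrative; the parameter values are hypothetical):
 * allocate 16 wired, zeroed, physically contiguous pages whose physical
 * page numbers all fall below 4GB, e.g. for a 32-bit DMA engine:
 *
 *	vm_offset_t addr;
 *	kern_return_t kr;
 *
 *	kr = kmem_alloc_contig(kernel_map, &addr, 16 * PAGE_SIZE,
 *	    0,                  // no extra alignment mask
 *	    atop(1ULL << 32),   // max physical page number
 *	    0,                  // no page-number mask
 *	    KMA_ZERO, VM_KERN_MEMORY_IOKIT);
 */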

kmem_return_t
kmem_alloc_guard(
	vm_map_t        map,
	vm_size_t       size,
	vm_offset_t     mask,
	kma_flags_t     flags,
	kmem_guard_t    guard)
{
	vm_object_t             object;
	vm_map_entry_t          entry = NULL;
	vm_map_offset_t         map_addr, fill_start;
	vm_map_size_t           map_size, fill_size;
	vm_page_t               guard_left = VM_PAGE_NULL;
	vm_page_t               guard_right = VM_PAGE_NULL;
	vm_page_t               wired_page_list = VM_PAGE_NULL;
	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	bool                    need_guards;
	kmem_return_t           kmr = { };

	assert(kernel_map && map->pmap == kernel_pmap);

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
	    size, 0, 0, 0);
#endif

	if (size == 0 ||
	    (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) ||
	    (size < __kmem_guard_size(ANYF(flags)))) {
		__kmem_invalid_size_panic(map, size, flags);
	}

	/*
	 * Limit the size of a single extent of wired memory, to try to
	 * limit the damage to the system if too many pages get wired down.
	 * The limit is raised to 2GB with a 128GB max physical limit,
	 * but scaled by installed memory above this.
	 */
	if (__improbable(!(flags & (KMA_VAONLY | KMA_PAGEABLE)) &&
	    size > MAX(1ULL << 31, sane_size / 64))) {
		kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
		goto out_error;
	}

	/*
	 * Guard pages:
	 *
	 * Guard pages are implemented as fictitious pages.
	 *
	 * However, some maps, and some objects are known
	 * to manage their memory explicitly, and do not need
	 * those to be materialized, which saves memory.
	 *
	 * By placing guard pages on either end of a stack,
	 * they can help detect cases where a thread walks
	 * off either end of its stack.
	 *
	 * The size we were passed may include extra space for
	 * guard pages. fill_size represents the actual size to populate.
	 * Similarly, fill_start indicates where the actual pages
	 * will begin in the range.
	 *
	 * Guard pages are allocated and set up here; attempts
	 * to access them are trapped in vm_fault_page().
	 */

	map_size   = round_page(size);
	fill_start = 0;
	fill_size  = map_size - __kmem_guard_size(ANYF(flags));

	need_guards = flags & (KMA_KOBJECT | KMA_COMPRESSOR) ||
	    !map->never_faults;

	if (flags & KMA_GUARD_FIRST) {
		vmk_flags.vmkf_guard_before = true;
		fill_start += PAGE_SIZE;
	}
	if ((flags & KMA_GUARD_FIRST) && need_guards) {
		guard_left = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
		if (__improbable(guard_left == VM_PAGE_NULL)) {
			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
			goto out_error;
		}
	}
	if ((flags & KMA_GUARD_LAST) && need_guards) {
		guard_right = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
		if (__improbable(guard_right == VM_PAGE_NULL)) {
			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
			goto out_error;
		}
	}

	if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
		kmr.kmr_return = vm_page_alloc_list(atop(fill_size), flags,
		    &wired_page_list);
		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
			goto out_error;
		}
	}

	/*
	 *	Allocate a new object (if necessary).  We must do this before
	 *	locking the map, or risk deadlock with the default pager.
	 */
	if (flags & KMA_KOBJECT) {
		object = kernel_object;
		vm_object_reference(object);
	} else if (flags & KMA_COMPRESSOR) {
		object = compressor_object;
		vm_object_reference(object);
	} else {
		object = vm_object_allocate(map_size);
	}

	if (flags & KMA_LAST_FREE) {
		vmk_flags.vmkf_last_free = true;
	}
	if (flags & KMA_PERMANENT) {
		vmk_flags.vmkf_permanent = true;
	}
	kmem_apply_security_policy(map, flags, guard, &vmk_flags, false);

	kmr.kmr_return = vm_map_find_space(map, 0, map_size, mask,
	    vmk_flags, &entry);
	if (__improbable(KERN_SUCCESS != kmr.kmr_return)) {
		vm_object_deallocate(object);
		goto out_error;
	}

	map_addr = entry->vme_start;
	VME_OBJECT_SET(entry, object, guard.kmg_atomic, guard.kmg_context);
	VME_ALIAS_SET(entry, guard.kmg_tag);
	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
		VME_OFFSET_SET(entry, map_addr);
	} else {
		vm_object_reference(object);
	}

	if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE))) {
		entry->wired_count = 1;
	}

	if (guard_left || guard_right || wired_page_list) {
		vm_object_offset_t offset = 0ull;

		vm_object_lock(object);
		vm_map_unlock(map);

		if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
			offset = map_addr;
		}

		if (guard_left) {
			vm_page_insert(guard_left, object, offset);
			guard_left->vmp_busy = FALSE;
			guard_left = VM_PAGE_NULL;
		}

		if (guard_right) {
			vm_page_insert(guard_right, object,
			    offset + fill_start + fill_size);
			guard_right->vmp_busy = FALSE;
			guard_right = VM_PAGE_NULL;
		}

		if (wired_page_list) {
			kernel_memory_populate_object_and_unlock(object,
			    map_addr + fill_start, offset + fill_start, fill_size,
			    wired_page_list, flags, guard.kmg_tag, VM_PROT_DEFAULT);
		} else {
			vm_object_unlock(object);
		}
	} else {
		vm_map_unlock(map);
	}

#if KASAN
	if (flags & KMA_PAGEABLE) {
		/*
		 * We need to mark the range as allowed for pageable
		 * memory, or faulting in it will not be permitted.
		 */
		kasan_notify_address(map_addr, map_size);
	}
#endif
	/*
	 * now that the pages are wired, we no longer have to fear coalescing
	 */
	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
		vm_map_simplify(map, map_addr);
	} else {
		vm_object_deallocate(object);
	}

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
	    atop(fill_size), 0, 0, 0);
#endif
	kmr.kmr_address = CAST_DOWN(vm_offset_t, map_addr);
	return kmr;

out_error:
	if (flags & KMA_NOFAIL) {
		__kmem_failed_panic(map, size, flags, kmr.kmr_return, "alloc");
	}
	if (guard_left) {
		guard_left->vmp_snext = wired_page_list;
		wired_page_list = guard_left;
	}
	if (guard_right) {
		guard_right->vmp_snext = wired_page_list;
		wired_page_list = guard_right;
	}
	if (wired_page_list) {
		vm_page_free_list(wired_page_list, FALSE);
	}

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
	    0, 0, 0, 0);
#endif

	return kmr;
}
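
/*
 * Usage sketch (illustrative): allocate a wired buffer bracketed by guard
 * pages.  Note that `size` includes the guards, so the usable region
 * starts one page in and is two pages shorter than what was asked for:
 *
 *	kmem_guard_t guard = {
 *		.kmg_atomic = 1,
 *		.kmg_tag    = VM_KERN_MEMORY_STACK,
 *	};
 *	kmem_return_t kmr;
 *
 *	kmr = kmem_alloc_guard(kernel_map, 6 * PAGE_SIZE, 0,
 *	    KMA_KOBJECT | KMA_ZERO | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard);
 *	if (kmr.kmr_return == KERN_SUCCESS) {
 *		// usable: [kmr.kmr_address + PAGE_SIZE,
 *		//          kmr.kmr_address + 5 * PAGE_SIZE)
 *	}
 */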

kmem_return_t
kmem_suballoc(
	vm_map_t                parent,
	vm_offset_t             *addr,
	vm_size_t               size,
	vm_map_create_options_t vmc_options,
	int                     vm_flags,
	kms_flags_t             flags,
	vm_tag_t                tag)
{
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vm_map_offset_t map_addr = 0;
	kmem_return_t kmr = { };
	vm_map_t map;

	assert(page_aligned(size));
	assert(parent->pmap == kernel_pmap);

#if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
	if (parent == kernel_map) {
		assert((vm_flags & VM_FLAGS_FIXED_RANGE_SUBALLOC) ||
		    (flags & KMS_DATA));
	}
#endif

	if ((vm_flags & VM_FLAGS_ANYWHERE) == 0) {
		map_addr = trunc_page(*addr);
	}

	pmap_reference(vm_map_pmap(parent));
	map = vm_map_create_options(vm_map_pmap(parent), 0, size, vmc_options);

	/*
	 * 1. vm_map_enter() will consume one ref on success.
	 *
	 * 2. make the entry atomic as kernel submaps should never be split.
	 *
	 * 3. instruct vm_map_enter() that it is a fresh submap
	 *    that needs to be taught its bounds as it is inserted.
	 */
	vm_map_reference(map);
	vmk_flags.vmkf_submap = true;
	if ((flags & KMS_DATA) == 0) {
		/* FIXME: IOKit submaps get fragmented and can't be atomic */
		vmk_flags.vmkf_submap_atomic = true;
	}
	vmk_flags.vmkf_submap_adjust = true;
	if (flags & KMS_LAST_FREE) {
		vmk_flags.vmkf_last_free = true;
	}
	if (flags & KMS_PERMANENT) {
		vmk_flags.vmkf_permanent = true;
	}
	if (flags & KMS_DATA) {
		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
	}

	kmr.kmr_return = vm_map_enter(parent, &map_addr, size, 0,
	    vm_flags, vmk_flags, tag, (vm_object_t)map, 0, FALSE,
	    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);

	if (kmr.kmr_return != KERN_SUCCESS) {
		if (flags & KMS_NOFAIL) {
			panic("kmem_suballoc(map=%p, size=%zd) failed with %d",
			    parent, (size_t)size, kmr.kmr_return);
		}
		assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
		vm_map_deallocate(map);
		vm_map_deallocate(map); /* also removes ref to pmap */
		return kmr;
	}

	/*
	 * For kmem_suballocs that register a claim and are assigned a range, ensure
	 * that the exact same range is returned.
	 */
	if (*addr != 0 && parent == kernel_map &&
	    startup_phase > STARTUP_SUB_KMEM) {
		assert(CAST_DOWN(vm_offset_t, map_addr) == *addr);
	} else {
		*addr = CAST_DOWN(vm_offset_t, map_addr);
	}

	kmr.kmr_submap = map;
	return kmr;
}
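
/*
 * Usage sketch (illustrative; the vmc_options value is hypothetical):
 * carve a pageable data submap out of the kernel map.  KMS_DATA places it
 * in the shared data range and skips the atomic-entry optimization:
 *
 *	vm_offset_t base = 0;
 *	kmem_return_t kmr;
 *
 *	kmr = kmem_suballoc(kernel_map, &base, 64 << 20,
 *	    VM_MAP_CREATE_PAGEABLE, VM_FLAGS_ANYWHERE,
 *	    KMS_DATA | KMS_PERMANENT, VM_KERN_MEMORY_KALLOC);
 *	if (kmr.kmr_return == KERN_SUCCESS) {
 *		vm_map_t submap = kmr.kmr_submap;
 *		// [base, base + 64MB) in kernel_map now routes to submap
 *	}
 */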

/*
 *	kmem_alloc:
 *
 *	Allocate wired-down memory in the kernel's address map
 *	or a submap.  The memory is not zero-filled.
 */

__exported kern_return_t
kmem_alloc_external(
	vm_map_t        map,
	vm_offset_t     *addrp,
	vm_size_t       size);
kern_return_t
kmem_alloc_external(
	vm_map_t        map,
	vm_offset_t     *addrp,
	vm_size_t       size)
{
	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
		return kmem_alloc(map, addrp, size, KMA_NONE, vm_tag_bt());
	}
	/* Maintain ABI compatibility: invalid sizes used to be allowed */
	return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
}


/*
 *	kmem_alloc_kobject:
 *
 *	Allocate wired-down memory in the kernel's address map
 *	or a submap.  The memory is not zero-filled.
 *
 *	The memory is allocated in the kernel_object.
 *	It may not be copied with vm_map_copy, and
 *	it may not be reallocated with kmem_realloc.
 */

__exported kern_return_t
kmem_alloc_kobject_external(
	vm_map_t        map,
	vm_offset_t     *addrp,
	vm_size_t       size);
kern_return_t
kmem_alloc_kobject_external(
	vm_map_t        map,
	vm_offset_t     *addrp,
	vm_size_t       size)
{
	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
		return kmem_alloc(map, addrp, size, KMA_KOBJECT, vm_tag_bt());
	}
	/* Maintain ABI compatibility: invalid sizes used to be allowed */
	return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
}

/*
 *	kmem_alloc_pageable:
 *
 *	Allocate pageable memory in the kernel's address map.
 */

__exported kern_return_t
kmem_alloc_pageable_external(
	vm_map_t        map,
	vm_offset_t     *addrp,
	vm_size_t       size);
kern_return_t
kmem_alloc_pageable_external(
	vm_map_t        map,
	vm_offset_t     *addrp,
	vm_size_t       size)
{
	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
		return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA, vm_tag_bt());
	}
	/* Maintain ABI compatibility: invalid sizes used to be allowed */
	return size ? KERN_NO_SPACE : KERN_INVALID_ARGUMENT;
}


#pragma mark population

static void
kernel_memory_populate_pmap_enter(
	vm_object_t             object,
	vm_address_t            addr,
	vm_object_offset_t      offset,
	vm_page_t               mem,
	vm_prot_t               prot,
	int                     pe_flags)
{
	kern_return_t   pe_result;
	int             pe_options;

	PMAP_ENTER_CHECK(kernel_pmap, mem);

	pe_options = PMAP_OPTIONS_NOWAIT;
	if (object->internal) {
		pe_options |= PMAP_OPTIONS_INTERNAL;
	}
	if (mem->vmp_reusable || object->all_reusable) {
		pe_options |= PMAP_OPTIONS_REUSABLE;
	}

	pe_result = pmap_enter_options(kernel_pmap, addr + offset,
	    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
	    pe_flags, /* wired */ TRUE, pe_options, NULL);

	if (pe_result == KERN_RESOURCE_SHORTAGE) {
		vm_object_unlock(object);

		pe_options &= ~PMAP_OPTIONS_NOWAIT;

		pe_result = pmap_enter_options(kernel_pmap, addr + offset,
		    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
		    pe_flags, /* wired */ TRUE, pe_options, NULL);

		vm_object_lock(object);
	}

	assert(pe_result == KERN_SUCCESS);
}

void
kernel_memory_populate_object_and_unlock(
	vm_object_t     object, /* must be locked */
	vm_address_t    addr,
	vm_offset_t     offset,
	vm_size_t       size,
	vm_page_t       page_list,
	kma_flags_t     flags,
	vm_tag_t        tag,
	vm_prot_t       prot)
{
	vm_page_t       mem;
	int             pe_flags;

	assert3u((bool)(flags & KMA_KOBJECT), ==, object == kernel_object);
	assert3u((bool)(flags & KMA_COMPRESSOR), ==, object == compressor_object);
	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
		assert3u(offset, ==, addr);
	}

	if (flags & KMA_KSTACK) {
		pe_flags = VM_MEM_STACK;
	} else {
		pe_flags = 0;
	}

	for (vm_object_offset_t pg_offset = 0;
	    pg_offset < size;
	    pg_offset += PAGE_SIZE_64) {
		if (page_list == NULL) {
			panic("%s: page_list too short", __func__);
		}

		mem = page_list;
		page_list = mem->vmp_snext;
		mem->vmp_snext = NULL;

		assert(mem->vmp_wire_count == 0);
		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);

		if (flags & KMA_COMPRESSOR) {
			mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;

			vm_page_insert(mem, object, offset + pg_offset);
		} else {
			mem->vmp_q_state = VM_PAGE_IS_WIRED;
			mem->vmp_wire_count = 1;

			vm_page_insert_wired(mem, object, offset + pg_offset, tag);
		}

		mem->vmp_busy = false;
		mem->vmp_pmapped = true;
		mem->vmp_wpmapped = true;

		/*
		 * Manual PMAP_ENTER_OPTIONS() with shortcuts
		 * for the kernel and compressor objects.
		 */

		kernel_memory_populate_pmap_enter(object, addr, pg_offset,
		    mem, prot, pe_flags);

		if (flags & KMA_NOENCRYPT) {
			pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
		}
	}

	if (page_list) {
		panic("%s: page_list too long", __func__);
	}

	vm_object_unlock(object);

	if (!(flags & KMA_COMPRESSOR)) {
		vm_page_lockspin_queues();
		vm_page_wire_count += atop(size);
		vm_page_unlock_queues();
	}

	if (flags & KMA_KOBJECT) {
		/* vm_page_insert_wired() handles regular objects already */
		vm_tag_update_size(tag, size);
	}

#if KASAN
	if (flags & KMA_COMPRESSOR) {
		kasan_notify_address_nopoison(addr, size);
	} else {
		kasan_notify_address(addr, size);
	}
#endif
}


kern_return_t
kernel_memory_populate(
	vm_offset_t     addr,
	vm_size_t       size,
	kma_flags_t     flags,
	vm_tag_t        tag)
{
	kern_return_t   kr = KERN_SUCCESS;
	vm_page_t       page_list = NULL;
	vm_size_t       page_count = atop_64(size);
	vm_object_t     object = __kmem_object(ANYF(flags));

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
	    size, 0, 0, 0);
#endif

	kr = vm_page_alloc_list(page_count, flags, &page_list);
	if (kr == KERN_SUCCESS) {
		vm_object_lock(object);
		kernel_memory_populate_object_and_unlock(object, addr,
		    addr, size, page_list, flags, tag, VM_PROT_DEFAULT);
	}

#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
	    page_count, 0, 0, 0);
#endif
	return kr;
}

void
kernel_memory_depopulate(
	vm_offset_t        addr,
	vm_size_t          size,
	kma_flags_t        flags,
	vm_tag_t           tag)
{
	vm_object_t        object = __kmem_object(ANYF(flags));
	vm_object_offset_t offset = addr;
	vm_page_t          mem;
	vm_page_t          local_freeq = NULL;
	unsigned int       pages_unwired = 0;

	vm_object_lock(object);

	pmap_protect(kernel_pmap, offset, offset + size, VM_PROT_NONE);

	for (vm_object_offset_t pg_offset = 0;
	    pg_offset < size;
	    pg_offset += PAGE_SIZE_64) {
		mem = vm_page_lookup(object, offset + pg_offset);

		assert(mem);

		if (flags & KMA_COMPRESSOR) {
			assert(mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
		} else {
			assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
			pages_unwired++;
		}

		mem->vmp_busy = TRUE;

		assert(mem->vmp_tabled);
		vm_page_remove(mem, TRUE);
		assert(mem->vmp_busy);

		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);

		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
		mem->vmp_snext = local_freeq;
		local_freeq = mem;
	}

	vm_object_unlock(object);

	vm_page_free_list(local_freeq, TRUE);

	if (!(flags & KMA_COMPRESSOR)) {
		vm_page_lockspin_queues();
		vm_page_wire_count -= pages_unwired;
		vm_page_unlock_queues();
	}

	if (flags & KMA_KOBJECT) {
		/* vm_page_remove() handles regular objects already */
		vm_tag_update_size(tag, -ptoa_64(pages_unwired));
	}
}
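
/*
 * Usage sketch (illustrative): a VA-only kernel-object range can have
 * physical pages committed and released on demand with the pair above:
 *
 *	vm_offset_t addr;
 *
 *	kmem_alloc(kernel_map, &addr, size,
 *	    KMA_NOFAIL | KMA_KOBJECT | KMA_VAONLY, tag);
 *	kernel_memory_populate(addr, size, KMA_KOBJECT, tag);    // wire pages in
 *	// ... use [addr, addr + size) ...
 *	kernel_memory_depopulate(addr, size, KMA_KOBJECT, tag);  // free them again
 */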

#pragma mark reallocation

__abortlike
static void
__kmem_realloc_invalid_object_size_panic(
	vm_map_t                map,
	vm_address_t            address,
	vm_size_t               size,
	vm_map_entry_t          entry,
	vm_object_t             object)
{
	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
	    "object %p has unexpected size %lld",
	    map, (void *)address, (size_t)size, entry, object, object->vo_size);
}

static kmem_return_t
kmem_realloc_shrink_guard(
	vm_map_t                map,
	vm_offset_t             oldaddr,
	vm_size_t               oldsize,
	vm_size_t               newsize,
	kmr_flags_t             flags,
	kmem_guard_t            guard,
	vm_map_entry_t          entry)
{
	vm_object_t             object;
	kmem_return_t           kmr = { .kmr_address = oldaddr };
	bool                    was_atomic;

	vm_map_lock_assert_exclusive(map);

	if ((flags & KMR_KOBJECT) == 0) {
		object = VME_OBJECT(entry);
		vm_object_reference(object);
	}

	/*
	 *	Shrinking an atomic entry starts with splitting it,
	 *	and removing the second half.
	 */
	was_atomic = entry->vme_atomic;
	entry->vme_atomic = false;
	vm_map_clip_end(map, entry, entry->vme_start + newsize);
	entry->vme_atomic = was_atomic;

	(void)vm_map_remove_and_unlock(map,
	    oldaddr + newsize, oldaddr + oldsize,
	    VM_MAP_REMOVE_KUNWIRE, KMEM_GUARD_NONE);


	/*
	 *	Lastly, if there are guard pages, deal with them.
	 *
	 *	The kernel object just needs to depopulate,
	 *	regular objects require freeing the last page
	 *	and replacing it with a guard.
	 */
	if (flags & KMR_KOBJECT) {
		if (flags & KMR_GUARD_LAST) {
			kernel_memory_depopulate(oldaddr + newsize - PAGE_SIZE,
			    PAGE_SIZE, KMA_KOBJECT, guard.kmg_tag);
		}
	} else {
		vm_page_t guard_right = VM_PAGE_NULL;
		vm_offset_t remove_start = newsize;

		if (flags & KMR_GUARD_LAST) {
			guard_right = vm_page_grab_guard(true);
			remove_start -= PAGE_SIZE;
		}

		vm_object_lock(object);

		if (object->vo_size != oldsize) {
			__kmem_realloc_invalid_object_size_panic(map,
			    oldaddr, oldsize, entry, object);
		}
		object->vo_size = newsize;

		vm_object_page_remove(object, remove_start, oldsize);

		if (flags & KMR_GUARD_LAST) {
			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
			guard_right->vmp_busy = false;
		}
		vm_object_unlock(object);
		vm_object_deallocate(object);
	}

	return kmr;
}

kmem_return_t
kmem_realloc_guard(
	vm_map_t                map,
	vm_offset_t             oldaddr,
	vm_size_t               oldsize,
	vm_size_t               newsize,
	kmr_flags_t             flags,
	kmem_guard_t            guard)
{
	vm_object_t             object;
	vm_map_offset_t         newaddr;
	vm_object_offset_t      newoffs;
	vm_map_entry_t          oldentry;
	vm_map_entry_t          newentry;
	vm_page_t               page_list = NULL;
	bool                    needs_wakeup = false;
	kmem_return_t           kmr = { };
	unsigned int            last_timestamp;
	vm_map_kernel_flags_t   vmk_flags = {
		.vmkf_last_free = (bool)(flags & KMR_LAST_FREE),
	};

	assert(KMEM_REALLOC_FLAGS_VALID(flags));
	if (!guard.kmg_atomic && (flags & (KMR_DATA | KMR_KOBJECT)) != KMR_DATA) {
		__kmem_invalid_arguments_panic("realloc", map, oldaddr,
		    oldsize, flags);
	}

	if (oldaddr == 0ul) {
		return kmem_alloc_guard(map, newsize, 0, (kma_flags_t)flags, guard);
	}

	if (newsize == 0ul) {
		kmem_free_guard(map, oldaddr, oldsize, KMF_NONE, guard);
		return kmr;
	}

	if (newsize >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) {
		__kmem_invalid_size_panic(map, newsize, flags);
	}
	if (newsize < __kmem_guard_size(ANYF(flags))) {
		__kmem_invalid_size_panic(map, newsize, flags);
	}

	oldsize = round_page(oldsize);
	newsize = round_page(newsize);

	if (oldsize == newsize) {
		kmr.kmr_address = oldaddr;
		return kmr;
	}

	/*
	 *	If we're growing the allocation,
	 *	then reserve the pages we'll need,
	 *	and find a spot for its new place.
	 */
	if (oldsize < newsize) {
#if DEBUG || DEVELOPMENT
		VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
		    VM_KERN_REQUEST, DBG_FUNC_START,
		    newsize - oldsize, 0, 0, 0);
#endif
		kmr.kmr_return = vm_page_alloc_list(atop(newsize - oldsize),
		    (kma_flags_t)flags, &page_list);
		if (kmr.kmr_return == KERN_SUCCESS) {
			kmem_apply_security_policy(map, (kma_flags_t)flags, guard,
			    &vmk_flags, true);
			kmr.kmr_return = vm_map_find_space(map, 0, newsize, 0,
			    vmk_flags, &newentry);
		}
		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
			if (flags & KMR_REALLOCF) {
				kmem_free_guard(map, oldaddr, oldsize,
				    KMF_NONE, guard);
			}
			if (page_list) {
				vm_page_free_list(page_list, FALSE);
			}
#if DEBUG || DEVELOPMENT
			VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
			    VM_KERN_REQUEST, DBG_FUNC_END,
			    0, 0, 0, 0);
#endif
			return kmr;
		}

		/* map is locked */
	} else {
		vm_map_lock(map);
	}


	/*
	 *	Locate the entry:
	 *	- wait for it to quiesce,
	 *	- validate its guard,
	 *	- learn its correct tag.
	 */
again:
	if (!vm_map_lookup_entry(map, oldaddr, &oldentry)) {
		__kmem_entry_not_found_panic(map, oldaddr);
	}
	if ((flags & KMR_KOBJECT) && oldentry->in_transition) {
		oldentry->needs_wakeup = true;
		vm_map_entry_wait(map, THREAD_UNINT);
		goto again;
	}
	kmem_entry_validate_guard(map, oldentry, oldaddr, oldsize, guard);
	if (!__kmem_entry_validate_object(oldentry, ANYF(flags))) {
		__kmem_entry_validate_object_panic(map, oldentry, ANYF(flags));
	}
	/*
	 *	TODO: We should validate for non-atomic entries that the range
	 *	      we are acting on is what we expect here.
	 */

	guard.kmg_tag = VME_ALIAS(oldentry);

	if (newsize < oldsize) {
		return kmem_realloc_shrink_guard(map, oldaddr, oldsize, newsize,
		           flags, guard, oldentry);
	}

	/*
	 *	We are growing the entry.
	 *
	 *	For regular objects we use the object `vo_size` updates
	 *	as a guarantee that no two kmem_realloc() can happen
	 *	concurrently (by doing it before the map is unlocked).
	 *
	 *	For the kernel object, prevent the entry from being
	 *	reallocated or changed by marking it "in_transition".
	 */

	object = VME_OBJECT(oldentry);
	vm_object_lock(object);
	vm_object_reference_locked(object);

	newaddr = newentry->vme_start;
	newoffs = oldsize;

	VME_OBJECT_SET(newentry, object, guard.kmg_atomic, guard.kmg_context);
	VME_ALIAS_SET(newentry, guard.kmg_tag);
	if (flags & KMR_KOBJECT) {
		oldentry->in_transition = true;
		VME_OFFSET_SET(newentry, newaddr);
		newentry->wired_count = 1;
		newoffs = newaddr + oldsize;
	} else {
		if (object->vo_size != oldsize) {
			__kmem_realloc_invalid_object_size_panic(map,
			    oldaddr, oldsize, oldentry, object);
		}
		object->vo_size = newsize;
	}

	last_timestamp = map->timestamp;
	vm_map_unlock(map);


	/*
	 *	Now proceed with the population of pages.
	 *
	 *	Kernel objects can use the kmem population helpers.
	 *
	 *	Regular objects will insert pages manually,
	 *	then wire the memory into the new range.
	 */

	vm_size_t guard_right_size = __kmem_guard_right(ANYF(flags));

	if (flags & KMR_KOBJECT) {
		assert(flags & KMR_FREEOLD);

		pmap_protect(kernel_pmap,
		    oldaddr, oldaddr + oldsize - guard_right_size,
		    VM_PROT_NONE);

		for (vm_object_offset_t offset = 0;
		    offset < oldsize - guard_right_size;
		    offset += PAGE_SIZE_64) {
			vm_page_t mem;

			mem = vm_page_lookup(object, oldaddr + offset);
			if (mem == VM_PAGE_NULL) {
				continue;
			}

			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));

			mem->vmp_busy = true;
			vm_page_remove(mem, true);
			vm_page_insert_wired(mem, object, newaddr + offset,
			    guard.kmg_tag);
			mem->vmp_busy = false;

			kernel_memory_populate_pmap_enter(object, newaddr,
			    offset, mem, VM_PROT_DEFAULT, 0);
		}

		kernel_memory_populate_object_and_unlock(object,
		    newaddr + oldsize - guard_right_size,
		    newoffs - guard_right_size,
		    newsize - oldsize,
		    page_list, (kma_flags_t)flags,
		    guard.kmg_tag, VM_PROT_DEFAULT);
	} else {
		vm_page_t guard_right = VM_PAGE_NULL;
		kern_return_t kr;

		/*
		 *	Note: we are borrowing the new entry reference
		 *	on the object for the duration of this code,
		 *	which works because we keep the object locked
		 *	throughout.
		 */
		if ((flags & KMR_GUARD_LAST) && !map->never_faults) {
			guard_right = vm_page_lookup(object, oldsize - PAGE_SIZE);
			assert(guard_right->vmp_fictitious);
			guard_right->vmp_busy = true;
			vm_page_remove(guard_right, true);
		}

		for (vm_object_offset_t offset = oldsize - guard_right_size;
		    offset < newsize - guard_right_size;
		    offset += PAGE_SIZE_64) {
			vm_page_t mem = page_list;

			page_list = mem->vmp_snext;
			mem->vmp_snext = VM_PAGE_NULL;

			vm_page_insert(mem, object, offset);
			mem->vmp_busy = false;
		}

		if (guard_right) {
			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
			guard_right->vmp_busy = false;
		}

		vm_object_unlock(object);

		kr = vm_map_wire_kernel(map, newaddr, newaddr + newsize,
		    VM_PROT_DEFAULT, guard.kmg_tag, FALSE);
		assert(kr == KERN_SUCCESS);
	}

#if KASAN
	kasan_notify_address(newaddr, newsize);
#endif


	/*
	 *	Mark the entry as idle again,
	 *	and honor KMR_FREEOLD if needed.
	 */

	vm_map_lock(map);
	if (last_timestamp + 1 != map->timestamp &&
	    !vm_map_lookup_entry(map, oldaddr, &oldentry)) {
		__kmem_entry_not_found_panic(map, oldaddr);
	}

	if (flags & KMR_KOBJECT) {
		assert(oldentry->in_transition);
		oldentry->in_transition = false;
		if (oldentry->needs_wakeup) {
			needs_wakeup = true;
			oldentry->needs_wakeup = false;
		}
	}

	if (flags & KMR_FREEOLD) {
		(void)vm_map_remove_and_unlock(map,
		    oldaddr, oldaddr + oldsize,
		    VM_MAP_REMOVE_KUNWIRE, guard);
	} else {
		vm_map_unlock(map);
	}

	if (needs_wakeup) {
		vm_map_entry_wakeup(map);
	}


#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
	    atop(newsize - oldsize), 0, 0, 0);
#endif
	kmr.kmr_address = newaddr;
	return kmr;
}

#pragma mark free

vm_size_t
kmem_free_guard(
	vm_map_t        map,
	vm_offset_t     addr,
	vm_size_t       size,
	kmf_flags_t     flags,
	kmem_guard_t    guard)
{
	vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;

	assert(addr >= VM_MIN_KERNEL_AND_KEXT_ADDRESS);
	assert(map->pmap == kernel_pmap);

	if (flags & KMF_GUESS_SIZE) {
		vmr_flags |= VM_MAP_REMOVE_GUESS_SIZE;
		size = PAGE_SIZE;
	} else if (size == 0) {
		__kmem_invalid_size_panic(map, size, flags);
	} else {
		size = round_page(size);
	}

	return vm_map_remove_guard(map, addr, addr + size,
	           vmr_flags, guard).kmr_size;
}

__exported void
kmem_free_external(
	vm_map_t        map,
	vm_offset_t     addr,
	vm_size_t       size);
void
kmem_free_external(
	vm_map_t        map,
	vm_offset_t     addr,
	vm_size_t       size)
{
	if (size) {
		kmem_free(map, trunc_page(addr), size);
#if MACH_ASSERT
	} else {
		printf("kmem_free(map=%p, addr=%p) called with size=0, lr: %p\n",
		    map, (void *)addr, __builtin_return_address(0));
#endif
	}
}


#pragma mark kmem init

/*
 * The default percentage of memory that can be mlocked is scaled based on the total
 * amount of memory in the system. These percentages are calculated
 * offline and stored in this table. We index this table by
 * log2(max_mem) - VM_USER_WIREABLE_MIN_CONFIG. We clamp this index to the range
 * [0, sizeof(wire_limit_percents) / sizeof(vm_map_size_t)).
 *
 * Note that these values were picked for mac.
 * If we ever have very large memory config arm devices, we may want to revisit
 * since the kernel overhead is smaller there due to the larger page size.
 */

/* Start scaling iff we're managing > 2^32 = 4GB of RAM. */
#define VM_USER_WIREABLE_MIN_CONFIG 32
#if CONFIG_JETSAM
/* Systems with jetsam can wire a bit more because the system can relieve wired
 * pressure.
 */
static vm_map_size_t wire_limit_percents[] =
{ 80, 80, 80, 80, 82, 85, 88, 91, 94, 97};
#else
static vm_map_size_t wire_limit_percents[] =
{ 70, 73, 76, 79, 82, 85, 88, 91, 94, 97};
#endif /* CONFIG_JETSAM */
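
/*
 * Worked example (assuming, as the code below does, that bit_floor()
 * returns floor(log2(x))): a 16GB non-jetsam config has log2(16GB) = 34,
 * so the table index is 34 - VM_USER_WIREABLE_MIN_CONFIG = 2, the limit
 * is wire_limit_percents[2] = 76% of 16GB, i.e. roughly 12.2GB lockable.
 * The VM_NOT_USER_WIREABLE_MAX cap applied afterwards can only raise the
 * limit, on configs large enough that the 3-30% remainder would exceed
 * the cap.
 */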
1755 
1756 /*
1757  * Sets the default global user wire limit which limits the amount of
1758  * memory that can be locked via mlock() based on the above algorithm..
1759  * This can be overridden via a sysctl.
1760  */
1761 static void
kmem_set_user_wire_limits(void)1762 kmem_set_user_wire_limits(void)
1763 {
1764 	uint64_t available_mem_log;
1765 	uint64_t max_wire_percent;
1766 	size_t wire_limit_percents_length = sizeof(wire_limit_percents) /
1767 	    sizeof(vm_map_size_t);
1768 	vm_map_size_t limit;
1769 	uint64_t config_memsize = max_mem;
1770 #if defined(XNU_TARGET_OS_OSX)
1771 	config_memsize = max_mem_actual;
1772 #endif /* defined(XNU_TARGET_OS_OSX) */
1773 
1774 	available_mem_log = bit_floor(config_memsize);
1775 
1776 	if (available_mem_log < VM_USER_WIREABLE_MIN_CONFIG) {
1777 		available_mem_log = 0;
1778 	} else {
1779 		available_mem_log -= VM_USER_WIREABLE_MIN_CONFIG;
1780 	}
1781 	if (available_mem_log >= wire_limit_percents_length) {
1782 		available_mem_log = wire_limit_percents_length - 1;
1783 	}
1784 	max_wire_percent = wire_limit_percents[available_mem_log];
1785 
1786 	limit = config_memsize * max_wire_percent / 100;
1787 	/* Cap the number of non lockable bytes at VM_NOT_USER_WIREABLE_MAX */
1788 	if (config_memsize - limit > VM_NOT_USER_WIREABLE_MAX) {
1789 		limit = config_memsize - VM_NOT_USER_WIREABLE_MAX;
1790 	}
1791 
1792 	vm_global_user_wire_limit = limit;
1793 	/* the default per task limit is the same as the global limit */
1794 	vm_per_task_user_wire_limit = limit;
1795 	vm_add_wire_count_over_global_limit = 0;
1796 	vm_add_wire_count_over_user_limit = 0;
1797 }
1798 
1799 #define KMEM_MAX_CLAIMS 50
1800 __startup_data
1801 struct kmem_range_startup_spec kmem_claims[KMEM_MAX_CLAIMS] = {};
1802 __startup_data
1803 uint32_t kmem_claim_count = 0;
1804 
1805 __startup_func
1806 void
1807 kmem_range_startup_init(
1808 	struct kmem_range_startup_spec *sp)
1809 {
1810 	assert(kmem_claim_count < KMEM_MAX_CLAIMS - KMEM_RANGE_COUNT);
1811 	if (sp->kc_calculate_sz) {
1812 		sp->kc_size = (sp->kc_calculate_sz)();
1813 	}
1814 	if (sp->kc_size) {
1815 		kmem_claims[kmem_claim_count] = *sp;
1816 		kmem_claim_count++;
1817 	}
1818 }
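/*
 * Illustrative registration sketch (hypothetical claim; "my_range",
 * "my_spec", and the 1GB size are made-up, but the spec fields are the ones
 * consumed above). A subsystem claims a randomized slice of the kernel map
 * with something like:
 *
 *	static struct kmem_range my_range;
 *	__startup_data
 *	static struct kmem_range_startup_spec my_spec = {
 *		.kc_name  = "my_subsystem_range",
 *		.kc_range = &my_range,
 *		.kc_size  = (1ull << 30),
 *	};
 *
 * and arranges, via the startup machinery, for
 * kmem_range_startup_init(&my_spec) to run before kmem_range_init().
 * Alternatively, kc_calculate_sz may point at a function that computes the
 * size at boot; specs whose resulting kc_size is 0 are dropped.
 */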
1819 
1820 static vm_offset_t
1821 kmem_fuzz_start(void)
1822 {
1823 	vm_offset_t kmapoff_kaddr = 0;
1824 	uint32_t kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */
1825 	vm_map_size_t kmapoff_size = ptoa(kmapoff_pgcnt);
1826 
1827 	kmem_alloc(kernel_map, &kmapoff_kaddr, kmapoff_size,
1828 	    KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY,
1829 	    VM_KERN_MEMORY_OSFMK);
1830 	return kmapoff_kaddr + kmapoff_size;
1831 }
1832 
1833 /*
1834  * Returns a 16-bit random number between 0 and
1835  * upper_limit (inclusive).
1836  */
1837 __startup_func
1838 uint16_t
1839 kmem_get_random16(
1840 	uint16_t        upper_limit)
1841 {
1842 	static uint64_t random_entropy;
1843 	assert(upper_limit < UINT16_MAX);
1844 	if (random_entropy == 0) {
1845 		random_entropy = early_random();
1846 	}
1847 	uint32_t result = random_entropy & UINT32_MAX;
1848 	random_entropy >>= 32;
1849 	return (uint16_t)(result % (upper_limit + 1));
1850 }
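/*
 * One early_random() draw provides 64 bits; each call consumes the low 32,
 * so a draw typically serves two calls before the pool refills. For example,
 * kmem_get_random16(9) yields a value in [0, 9] by modulo reduction (the
 * slight modulo bias is acceptable for layout randomization).
 */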
1851 
1852 /*
1853  * Generate a randomly shuffled array of indices from 0 to count - 1
1854  */
1855 __startup_func
1856 void
1857 kmem_shuffle(
1858 	uint16_t       *shuffle_buf,
1859 	uint16_t        count)
1860 {
1861 	for (uint16_t i = 0; i < count; i++) {
1862 		uint16_t j = kmem_get_random16(i);
1863 		if (j != i) {
1864 			shuffle_buf[i] = shuffle_buf[j];
1865 		}
1866 		shuffle_buf[j] = i;
1867 	}
1868 }
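/*
 * This is the "inside-out" variant of the Fisher-Yates shuffle: it writes
 * the permutation directly into shuffle_buf with no separate initialization
 * pass. Illustrative sketch (hypothetical buffer):
 *
 *	uint16_t order[8];
 *	kmem_shuffle(order, 8);
 *	(order[] now holds a random permutation of 0..7)
 */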
1869 
1870 #if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
1871 __startup_func
1872 static void
1873 kmem_shuffle_claims(void)
1874 {
1875 	uint16_t shuffle_buf[KMEM_MAX_CLAIMS] = {};
1876 	kmem_shuffle(&shuffle_buf[0], (uint16_t)kmem_claim_count);
1877 	for (uint16_t i = 0; i < kmem_claim_count; i++) {
1878 		struct kmem_range_startup_spec tmp = kmem_claims[i];
1879 		kmem_claims[i] = kmem_claims[shuffle_buf[i]];
1880 		kmem_claims[shuffle_buf[i]] = tmp;
1881 	}
1882 }
1883 
1884 __startup_func
1885 static void
1886 kmem_readjust_ranges(
1887 	uint32_t        cur_idx)
1888 {
1889 	assert(cur_idx != 0);
1890 	uint32_t j = cur_idx - 1, random;
1891 	struct kmem_range_startup_spec sp = kmem_claims[cur_idx];
1892 	struct kmem_range *sp_range = sp.kc_range;
1893 
1894 	/*
1895 	 * Find max index where restriction is met
1896 	 */
1897 	for (; j > 0; j--) {
1898 		struct kmem_range_startup_spec spj = kmem_claims[j];
1899 		vm_map_offset_t max_start = spj.kc_range->min_address;
1900 		if (spj.kc_flags & KC_NO_MOVE) {
1901 			panic("kmem_range_init: Can't scramble with multiple constraints");
1902 		}
1903 		if (max_start <= sp_range->min_address) {
1904 			break;
1905 		}
1906 	}
1907 
1908 	/*
1909 	 * Pick a random index from 0 to max index and shift claims to the right
1910 	 * to make room for restricted claim
1911 	 */
1912 	random = kmem_get_random16((uint16_t)j);
1913 	assert(random <= j);
1914 
1915 	sp_range->min_address = kmem_claims[random].kc_range->min_address;
1916 	sp_range->max_address = sp_range->min_address + sp.kc_size;
1917 
1918 	for (j = cur_idx - 1; j >= random && j != UINT32_MAX; j--) {
1919 		struct kmem_range_startup_spec spj = kmem_claims[j];
1920 		struct kmem_range *range = spj.kc_range;
1921 		range->min_address += sp.kc_size;
1922 		range->max_address += sp.kc_size;
1923 		kmem_claims[j + 1] = spj;
1924 	}
1925 
1926 	sp.kc_flags = KC_NO_MOVE;
1927 	kmem_claims[random] = sp;
1928 }
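/*
 * Worked example (hypothetical layout): suppose claims were placed left to
 * right as [A][B][C], and claim D (cur_idx) is restricted to start at or
 * below B's start. The loop above walks back to the highest index j that
 * still satisfies D, a random slot in [0, j] is chosen (say B's), D takes
 * over B's start address, and B and C each slide right by D's size while
 * keeping their relative order. D is then pinned with KC_NO_MOVE so a later
 * readjustment cannot displace it (two such constraints panic instead).
 */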
1929 
1930 __startup_func
1931 static void
1932 kmem_add_extra_claims(void)
1933 {
1934 	vm_map_size_t largest_free_size = 0, total_claims = 0;
1935 
1936 	vm_map_sizes(kernel_map, NULL, NULL, &largest_free_size);
1937 	largest_free_size = trunc_page(largest_free_size);
1938 
1939 	/*
1940 	 * Determine size of data and pointer kmem_ranges
1941 	 */
1942 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
1943 		total_claims += kmem_claims[i].kc_size;
1944 	}
1945 	assert((total_claims & PAGE_MASK) == 0);
1946 	largest_free_size -= total_claims;
1947 
1948 	/*
1949 	 * kasan and configs w/o *TRR need to have just one ptr range due to
1950 	 * resource constraints.
1951 	 */
1952 #if !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
1953 	kmem_ptr_ranges = 1;
1954 #endif
1955 
1956 	ptr_range_size = round_page(largest_free_size /
1957 	    (kmem_ptr_ranges * 3));
1958 	data_range_size = largest_free_size -
1959 	    (ptr_range_size * kmem_ptr_ranges);
1960 
1961 
1962 	/*
1963 	 * Add claims for data and pointer
1964 	 */
1965 	struct kmem_range_startup_spec kmem_spec_data = {
1966 		.kc_name = "kmem_data_range",
1967 		.kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA],
1968 		.kc_size = data_range_size,
1969 		.kc_flags = KC_NO_ENTRY,
1970 	};
1971 	kmem_claims[kmem_claim_count++] = kmem_spec_data;
1972 
1973 	for (uint32_t i = 0; i < kmem_ptr_ranges; i++) {
1974 		struct kmem_range_startup_spec kmem_spec_ptr = {
1975 			.kc_name = "kmem_ptr_range",
1976 			.kc_range = &kmem_ranges[KMEM_RANGE_ID_PTR_0 + i],
1977 			.kc_size = ptr_range_size,
1978 			.kc_flags = KC_NO_ENTRY,
1979 		};
1980 		kmem_claims[kmem_claim_count++] = kmem_spec_ptr;
1981 	}
1982 }
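/*
 * Sizing example (hypothetical numbers): with 96GB of free VA remaining
 * after the registered claims and kmem_ptr_ranges == 4, each pointer range
 * gets round_page(96GB / 12) == 8GB and the data range receives the other
 * 64GB; pointer ranges collectively take a third of the space and data
 * takes the rest.
 */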
1983 
1984 __startup_func
1985 static void
1986 kmem_scramble_ranges(void)
1987 {
1988 	vm_map_offset_t start = 0;
1989 
1990 	/*
1991 	 * Initialize the KMEM_RANGE_ID_NONE range to use the entire map so that
1992 	 * the vm can find the requested ranges.
1993 	 */
1994 	kmem_ranges[KMEM_RANGE_ID_NONE].min_address = MAX(kernel_map->min_offset,
1995 	    VM_MAP_PAGE_SIZE(kernel_map));
1996 	kmem_ranges[KMEM_RANGE_ID_NONE].max_address = kernel_map->max_offset;
1997 
1998 	/*
1999 	 * Allocate the g_kext_map prior to randomizing the remaining submaps:
2000 	 * this map is 2G in size and starts at the end of kernel_text on x86,
2001 	 * so it could otherwise overflow into the heap.
2002 	 */
2003 	kext_alloc_init();
2004 
2005 	/*
2006 	 * Eat a random amount of kernel_map to fuzz subsequent heap, zone and
2007 	 * stack addresses. (With a 4K page and 9 bits of randomness, this
2008 	 * eats about 2M of VA from the map)
2009 	 *
2010 	 * Note that we always need to slide by at least one page because the VM
2011 	 * pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a base
2012 	 * do not admit this address to be part of any zone submap.
2013 	 */
2014 	start = kmem_fuzz_start();
2015 
2016 	/*
2017 	 * Add claims for ptr and data kmem_ranges
2018 	 */
2019 	kmem_add_extra_claims();
2020 
2021 	/*
2022 	 * Shuffle registered claims
2023 	 */
2024 	assert(kmem_claim_count < UINT16_MAX);
2025 	kmem_shuffle_claims();
2026 
2027 	/*
2028 	 * Apply restrictions and determine range for each claim
2029 	 */
2030 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
2031 		vm_map_offset_t end = 0;
2032 		struct kmem_range_startup_spec sp = kmem_claims[i];
2033 		struct kmem_range *sp_range = sp.kc_range;
2034 		if (vm_map_locate_space(kernel_map, sp.kc_size, 0,
2035 		    VM_MAP_KERNEL_FLAGS_NONE, &start, NULL) != KERN_SUCCESS) {
2036 			panic("kmem_range_init: vm_map_locate_space failing for claim %s",
2037 			    sp.kc_name);
2038 		}
2039 
2040 		end = start + sp.kc_size;
2041 		/*
2042 		 * Re-adjust ranges if restriction not met
2043 		 */
2044 		if (sp_range->min_address && start > sp_range->min_address) {
2045 			kmem_readjust_ranges(i);
2046 		} else {
2047 			sp_range->min_address = start;
2048 			sp_range->max_address = end;
2049 		}
2050 		start = end;
2051 	}
2052 
2053 	/*
2054 	 * We have settled on the ranges; now create temporary entries for the
2055 	 * claims
2056 	 */
2057 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
2058 		struct kmem_range_startup_spec sp = kmem_claims[i];
2059 		vm_map_entry_t entry = NULL;
2060 		if (sp.kc_flags & KC_NO_ENTRY) {
2061 			continue;
2062 		}
2063 		if (vm_map_find_space(kernel_map, sp.kc_range->min_address, sp.kc_size, 0,
2064 		    VM_MAP_KERNEL_FLAGS_NONE, &entry) != KERN_SUCCESS) {
2065 			panic("kmem_range_init: vm_map_find_space failing for claim %s",
2066 			    sp.kc_name);
2067 		}
2068 		vm_object_reference(kernel_object);
2069 		VME_OBJECT_SET(entry, kernel_object, false, 0);
2070 		VME_OFFSET_SET(entry, entry->vme_start);
2071 		vm_map_unlock(kernel_map);
2072 	}
2073 	/*
2074 	 * Now that we are done assigning all the ranges, reset
2075 	 * kmem_ranges[KMEM_RANGE_ID_NONE]
2076 	 */
2077 	kmem_ranges[KMEM_RANGE_ID_NONE] = (struct kmem_range) {};
2078 
2079 #if DEBUG || DEVELOPMENT
2080 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
2081 		struct kmem_range_startup_spec sp = kmem_claims[i];
2082 		const char *size_str = "K";
2083 		uint32_t shift = 10;
2084 		if (sp.kc_size >> 30) {
2085 			size_str = "G";
2086 			shift = 30;
2087 		} else if (sp.kc_size >> 20) {
2088 			size_str = "M";
2089 			shift = 20;
2090 		}
2091 		printf("%-24s: %p - %p (%llu%s)\n", sp.kc_name,
2092 		    (void *)sp.kc_range->min_address, (void *)sp.kc_range->max_address,
2093 		    sp.kc_size >> shift, size_str);
2094 	}
2095 #endif /* DEBUG || DEVELOPMENT */
2096 }
2097 
2098 __startup_func
2099 static void
2100 kmem_range_init(void)
2101 {
2102 	kmem_scramble_ranges();
2103 
2104 	/* Initialize kmem_large_ranges. Skip 1/16th of the range size on either
2105 	 * side for ptr ranges, and 1/8th only from the left for data, as we use
2106 	 * a single front for data.
2107 	 */
2108 	vm_size_t range_adjustment = ptr_range_size >> 4;
2109 	for (kmem_range_id_t i = 0; i < kmem_ptr_ranges; i++) {
2110 		kmem_large_ranges[KMEM_RANGE_ID_PTR_0 + i].min_address =
2111 		    kmem_ranges[KMEM_RANGE_ID_PTR_0 + i].min_address + range_adjustment;
2112 		kmem_large_ranges[KMEM_RANGE_ID_PTR_0 + i].max_address =
2113 		    kmem_ranges[KMEM_RANGE_ID_PTR_0 + i].max_address - range_adjustment;
2114 	}
2115 	range_adjustment = data_range_size >> 3;
2116 	kmem_large_ranges[KMEM_RANGE_ID_DATA].min_address =
2117 	    kmem_ranges[KMEM_RANGE_ID_DATA].min_address + range_adjustment;
2118 	kmem_large_ranges[KMEM_RANGE_ID_DATA].max_address =
2119 	    kmem_ranges[KMEM_RANGE_ID_DATA].max_address;
2120 
2121 #if DEBUG || DEVELOPMENT
2122 	for (kmem_range_id_t i = 0; i < KMEM_RANGE_COUNT; i++) {
2123 		printf("kmem_large_ranges[%d]    : %p - %p\n", i,
2124 		    (void *)kmem_large_ranges[i].min_address,
2125 		    (void *)kmem_large_ranges[i].max_address);
2126 	}
2127 #endif
2128 }
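/*
 * Continuing the sizing example above: an 8GB pointer range donates 512MB
 * (1/16th) at each end to small allocations, leaving a 7GB interior for
 * large ones, while a 64GB data range gives up 8GB (1/8th) at its low end
 * only, since data is allocated from a single front.
 */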
2129 #else /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
2130 __startup_func
2131 static void
2132 kmem_range_init(void)
2133 {
2134 	for (kmem_range_id_t i = 0; i < KMEM_RANGE_COUNT; i++) {
2135 		kmem_ranges[i].min_address = kernel_map->min_offset;
2136 		kmem_ranges[i].max_address = kernel_map->max_offset;
2137 	}
2138 	kext_alloc_init();
2139 	kmem_fuzz_start();
2140 }
2141 #endif
2142 STARTUP(KMEM, STARTUP_RANK_THIRD, kmem_range_init);
2143 
2144 /*
2145  *	kmem_init:
2146  *
2147  *	Initialize the kernel's virtual memory map, taking
2148  *	into account all memory allocated up to this time.
2149  */
2150 __startup_func
2151 void
2152 kmem_init(
2153 	vm_offset_t     start,
2154 	vm_offset_t     end)
2155 {
2156 	vm_map_offset_t map_start;
2157 	vm_map_offset_t map_end;
2158 	vm_map_kernel_flags_t vmk_flags;
2159 
2160 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
2161 	vmk_flags.vmkf_permanent = TRUE;
2162 	vmk_flags.vmkf_no_pmap_check = TRUE;
2163 
2164 	map_start = vm_map_trunc_page(start,
2165 	    VM_MAP_PAGE_MASK(kernel_map));
2166 	map_end = vm_map_round_page(end,
2167 	    VM_MAP_PAGE_MASK(kernel_map));
2168 
2169 	vm_map_will_allocate_early_map(&kernel_map);
2170 #if     defined(__arm__) || defined(__arm64__)
2171 	kernel_map = vm_map_create_options(pmap_kernel(),
2172 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS,
2173 	    VM_MAX_KERNEL_ADDRESS,
2174 	    VM_MAP_CREATE_DEFAULT);
2175 	/*
2176 	 *	Reserve virtual memory allocated up to this time.
2177 	 */
2178 	{
2179 		unsigned int    region_select = 0;
2180 		vm_map_offset_t region_start;
2181 		vm_map_size_t   region_size;
2182 		vm_map_offset_t map_addr;
2183 		kern_return_t kr;
2184 
2185 		while (pmap_virtual_region(region_select, &region_start, &region_size)) {
2186 			map_addr = region_start;
2187 			kr = vm_map_enter(kernel_map, &map_addr,
2188 			    vm_map_round_page(region_size,
2189 			    VM_MAP_PAGE_MASK(kernel_map)),
2190 			    (vm_map_offset_t) 0,
2191 			    VM_FLAGS_FIXED,
2192 			    vmk_flags,
2193 			    VM_KERN_MEMORY_NONE,
2194 			    VM_OBJECT_NULL,
2195 			    (vm_object_offset_t) 0, FALSE, VM_PROT_NONE, VM_PROT_NONE,
2196 			    VM_INHERIT_DEFAULT);
2197 
2198 			if (kr != KERN_SUCCESS) {
2199 				panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
2200 				    (uint64_t) start, (uint64_t) end, (uint64_t) region_start,
2201 				    (uint64_t) region_size, kr);
2202 			}
2203 
2204 			region_select++;
2205 		}
2206 	}
2207 #else
2208 	kernel_map = vm_map_create_options(pmap_kernel(),
2209 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS, map_end,
2210 	    VM_MAP_CREATE_DEFAULT);
2211 	/*
2212 	 *	Reserve virtual memory allocated up to this time.
2213 	 */
2214 	if (start != VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
2215 		vm_map_offset_t map_addr;
2216 		kern_return_t kr;
2217 
2218 		vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
2219 		vmk_flags.vmkf_no_pmap_check = TRUE;
2220 
2221 		map_addr = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
2222 		kr = vm_map_enter(kernel_map,
2223 		    &map_addr,
2224 		    (vm_map_size_t)(map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
2225 		    (vm_map_offset_t) 0,
2226 		    VM_FLAGS_FIXED,
2227 		    vmk_flags,
2228 		    VM_KERN_MEMORY_NONE,
2229 		    VM_OBJECT_NULL,
2230 		    (vm_object_offset_t) 0, FALSE,
2231 		    VM_PROT_NONE, VM_PROT_NONE,
2232 		    VM_INHERIT_DEFAULT);
2233 
2234 		if (kr != KERN_SUCCESS) {
2235 			panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
2236 			    (uint64_t) start, (uint64_t) end,
2237 			    (uint64_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS,
2238 			    (uint64_t) (map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
2239 			    kr);
2240 		}
2241 	}
2242 #endif
2243 
2244 	kmem_set_user_wire_limits();
2245 }
2246 
2247 
2248 #pragma mark map copyio
2249 
2250 /*
2251  *	Routine:	copyinmap
2252  *	Purpose:
2253  *		Like copyin, except that fromaddr is an address
2254  *		in the specified VM map.  This implementation
2255  *		is incomplete; it handles the current user map
2256  *		and the kernel map/submaps.
2257  */
2258 kern_return_t
2259 copyinmap(
2260 	vm_map_t                map,
2261 	vm_map_offset_t         fromaddr,
2262 	void                    *todata,
2263 	vm_size_t               length)
2264 {
2265 	kern_return_t   kr = KERN_SUCCESS;
2266 	vm_map_t oldmap;
2267 
2268 	if (vm_map_pmap(map) == pmap_kernel()) {
2269 		/* assume a correct copy */
2270 		memcpy(todata, CAST_DOWN(void *, fromaddr), length);
2271 	} else if (current_map() == map) {
2272 		if (copyin(fromaddr, todata, length) != 0) {
2273 			kr = KERN_INVALID_ADDRESS;
2274 		}
2275 	} else {
2276 		vm_map_reference(map);
2277 		oldmap = vm_map_switch(map);
2278 		if (copyin(fromaddr, todata, length) != 0) {
2279 			kr = KERN_INVALID_ADDRESS;
2280 		}
2281 		vm_map_switch(oldmap);
2282 		vm_map_deallocate(map);
2283 	}
2284 	return kr;
2285 }
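/*
 * Illustrative usage sketch (the map handle and user address are
 * hypothetical): reading a 32-bit word from another task's map while in
 * kernel context:
 *
 *	uint32_t word;
 *	kern_return_t kr;
 *
 *	kr = copyinmap(task_map, user_va, &word, sizeof(word));
 *	if (kr != KERN_SUCCESS) {
 *		... handle the faulting address ...
 *	}
 *
 * In the slow path, the temporary vm_map_switch() makes copyin() operate on
 * the target map, and the extra reference keeps that map alive for the
 * duration of the copy.
 */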
2286 
2287 /*
2288  *	Routine:	copyoutmap
2289  *	Purpose:
2290  *		Like copyout, except that toaddr is an address
2291  *		in the specified VM map.
2292  */
2293 kern_return_t
2294 copyoutmap(
2295 	vm_map_t                map,
2296 	void                    *fromdata,
2297 	vm_map_address_t        toaddr,
2298 	vm_size_t               length)
2299 {
2300 	kern_return_t   kr = KERN_SUCCESS;
2301 	vm_map_t        oldmap;
2302 
2303 	if (vm_map_pmap(map) == pmap_kernel()) {
2304 		/* assume a correct copy */
2305 		memcpy(CAST_DOWN(void *, toaddr), fromdata, length);
2306 	} else if (current_map() == map) {
2307 		if (copyout(fromdata, toaddr, length) != 0) {
2308 			kr = KERN_INVALID_ADDRESS;
2309 		}
2310 	} else {
2311 		vm_map_reference(map);
2312 		oldmap = vm_map_switch(map);
2313 		if (copyout(fromdata, toaddr, length) != 0) {
2314 			kr = KERN_INVALID_ADDRESS;
2315 		}
2316 		vm_map_switch(oldmap);
2317 		vm_map_deallocate(map);
2318 	}
2319 	return kr;
2320 }
2321 
2322 /*
2323  *	Routine:	copyoutmap_atomic{32, 64}
2324  *	Purpose:
2325  *		Like copyoutmap, except that the operation is atomic.
2326  *		Takes a value rather than a *fromdata pointer.
2327  */
2328 kern_return_t
2329 copyoutmap_atomic32(
2330 	vm_map_t                map,
2331 	uint32_t                value,
2332 	vm_map_address_t        toaddr)
2333 {
2334 	kern_return_t   kr = KERN_SUCCESS;
2335 	vm_map_t        oldmap;
2336 
2337 	if (vm_map_pmap(map) == pmap_kernel()) {
2338 		/* assume a correct toaddr */
2339 		*(uint32_t *)toaddr = value;
2340 	} else if (current_map() == map) {
2341 		if (copyout_atomic32(value, toaddr) != 0) {
2342 			kr = KERN_INVALID_ADDRESS;
2343 		}
2344 	} else {
2345 		vm_map_reference(map);
2346 		oldmap = vm_map_switch(map);
2347 		if (copyout_atomic32(value, toaddr) != 0) {
2348 			kr = KERN_INVALID_ADDRESS;
2349 		}
2350 		vm_map_switch(oldmap);
2351 		vm_map_deallocate(map);
2352 	}
2353 	return kr;
2354 }
2355 
2356 kern_return_t
2357 copyoutmap_atomic64(
2358 	vm_map_t                map,
2359 	uint64_t                value,
2360 	vm_map_address_t        toaddr)
2361 {
2362 	kern_return_t   kr = KERN_SUCCESS;
2363 	vm_map_t        oldmap;
2364 
2365 	if (vm_map_pmap(map) == pmap_kernel()) {
2366 		/* assume a correct toaddr */
2367 		*(uint64_t *)toaddr = value;
2368 	} else if (current_map() == map) {
2369 		if (copyout_atomic64(value, toaddr) != 0) {
2370 			kr = KERN_INVALID_ADDRESS;
2371 		}
2372 	} else {
2373 		vm_map_reference(map);
2374 		oldmap = vm_map_switch(map);
2375 		if (copyout_atomic64(value, toaddr) != 0) {
2376 			kr = KERN_INVALID_ADDRESS;
2377 		}
2378 		vm_map_switch(oldmap);
2379 		vm_map_deallocate(map);
2380 	}
2381 	return kr;
2382 }
2383 
2384 
2385 #pragma mark pointer obfuscation / packing
2386 
2387 /*
2388  *
2389  *	The following two functions are to be used when exposing kernel
2390  *	addresses to userspace via any of the various debug or info
2391  *	facilities that exist. These are basically the same as VM_KERNEL_ADDRPERM()
2392  *	and VM_KERNEL_UNSLIDE_OR_PERM() except they use a different random seed and
2393  *	are exported to KEXTs.
2394  *
2395  *	NOTE: USE THE MACRO VERSIONS OF THESE FUNCTIONS (in vm_param.h) FROM WITHIN THE KERNEL
2396  */
2397 
2398 vm_offset_t
2399 vm_kernel_addrhash_internal(vm_offset_t addr, uint64_t salt)
2400 {
2401 	assert(salt != 0);
2402 
2403 	if (addr == 0) {
2404 		return 0ul;
2405 	}
2406 
2407 	if (VM_KERNEL_IS_SLID(addr)) {
2408 		return VM_KERNEL_UNSLIDE(addr);
2409 	}
2410 
2411 	vm_offset_t sha_digest[SHA256_DIGEST_LENGTH / sizeof(vm_offset_t)];
2412 	SHA256_CTX sha_ctx;
2413 
2414 	SHA256_Init(&sha_ctx);
2415 	SHA256_Update(&sha_ctx, &salt, sizeof(salt));
2416 	SHA256_Update(&sha_ctx, &addr, sizeof(addr));
2417 	SHA256_Final(sha_digest, &sha_ctx);
2418 
2419 	return sha_digest[0];
2420 }
2421 
2422 __exported vm_offset_t
2423 vm_kernel_addrhash_external(vm_offset_t addr);
2424 vm_offset_t
2425 vm_kernel_addrhash_external(vm_offset_t addr)
2426 {
2427 	return vm_kernel_addrhash_internal(addr, vm_kernel_addrhash_salt_ext);
2428 }
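/*
 * Illustrative kext-facing sketch (the "obj" pointer is hypothetical): a
 * driver that wants to log a heap pointer without disclosing it can pass it
 * through the external hash, which salts and SHA-256 hashes the address and
 * truncates the digest to one vm_offset_t:
 *
 *	vm_offset_t cookie = vm_kernel_addrhash_external((vm_offset_t)obj);
 *	printf("obj cookie: %p\n", (void *)cookie);
 *
 * Slid text/data addresses are unslid rather than hashed, so static
 * addresses remain symbolicatable.
 */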
2429 
2430 void
2431 vm_kernel_addrhide(
2432 	vm_offset_t addr,
2433 	vm_offset_t *hide_addr)
2434 {
2435 	*hide_addr = VM_KERNEL_ADDRHIDE(addr);
2436 }
2437 
2438 /*
2439  *	vm_kernel_addrperm_external:
2440  *	vm_kernel_unslide_or_perm_external:
2441  *
2442  *	Use these macros when exposing an address to userspace that could come from
2443  *	either kernel text/data *or* the heap.
2444  */
2445 void
2446 vm_kernel_addrperm_external(
2447 	vm_offset_t addr,
2448 	vm_offset_t *perm_addr)
2449 {
2450 	if (VM_KERNEL_IS_SLID(addr)) {
2451 		*perm_addr = VM_KERNEL_UNSLIDE(addr);
2452 	} else if (VM_KERNEL_ADDRESS(addr)) {
2453 		*perm_addr = addr + vm_kernel_addrperm_ext;
2454 	} else {
2455 		*perm_addr = addr;
2456 	}
2457 }
2458 
2459 void
2460 vm_kernel_unslide_or_perm_external(
2461 	vm_offset_t addr,
2462 	vm_offset_t *up_addr)
2463 {
2464 	vm_kernel_addrperm_external(addr, up_addr);
2465 }
2466 
2467 void
2468 vm_packing_pointer_invalid(vm_offset_t ptr, vm_packing_params_t params)
2469 {
2470 	if (ptr & ((1ul << params.vmpp_shift) - 1)) {
2471 		panic("pointer %p can't be packed: low %d bits aren't 0",
2472 		    (void *)ptr, params.vmpp_shift);
2473 	} else if (ptr <= params.vmpp_base) {
2474 		panic("pointer %p can't be packed: below base %p",
2475 		    (void *)ptr, (void *)params.vmpp_base);
2476 	} else {
2477 		panic("pointer %p can't be packed: maximum encodable pointer is %p",
2478 		    (void *)ptr, (void *)vm_packing_max_packable(params));
2479 	}
2480 }
2481 
2482 void
2483 vm_packing_verify_range(
2484 	const char *subsystem,
2485 	vm_offset_t min_address,
2486 	vm_offset_t max_address,
2487 	vm_packing_params_t params)
2488 {
2489 	if (min_address > max_address) {
2490 		panic("%s: %s range invalid min:%p > max:%p",
2491 		    __func__, subsystem, (void *)min_address, (void *)max_address);
2492 	}
2493 
2494 	if (!params.vmpp_base_relative) {
2495 		return;
2496 	}
2497 
2498 	if (min_address <= params.vmpp_base) {
2499 		panic("%s: %s range invalid min:%p <= base:%p",
2500 		    __func__, subsystem, (void *)min_address, (void *)params.vmpp_base);
2501 	}
2502 
2503 	if (max_address > vm_packing_max_packable(params)) {
2504 		panic("%s: %s range invalid max:%p > max packable:%p",
2505 		    __func__, subsystem, (void *)max_address,
2506 		    (void *)vm_packing_max_packable(params));
2507 	}
2508 }
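/*
 * Worked example (hypothetical parameters): with vmpp_shift == 6, a packed
 * pointer must be 64-byte aligned, and with 32 bits of packed storage a
 * base-relative scheme can encode addresses in
 * (vmpp_base, vmpp_base + ((2^32 - 1) << 6)], roughly a 256GB window above
 * the base. vm_packing_max_packable() reports that upper bound, and the
 * checks above panic on anything unaligned, at or below the base, or past
 * the maximum.
 */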
2509 
2510 #pragma mark tests
2511 #if DEBUG || DEVELOPMENT
2512 #include <sys/errno.h>
2513 
2514 static void
2515 kmem_test_for_entry(
2516 	vm_map_t                map,
2517 	vm_offset_t             addr,
2518 	void                  (^block)(vm_map_entry_t))
2519 {
2520 	vm_map_entry_t entry;
2521 
2522 	vm_map_lock(map);
2523 	block(vm_map_lookup_entry(map, addr, &entry) ? entry : NULL);
2524 	vm_map_unlock(map);
2525 }
2526 
2527 #define kmem_test_assert_map(map, pg, entries) ({ \
2528 	assert3u((map)->size, ==, ptoa(pg)); \
2529 	assert3u((map)->hdr.nentries, ==, entries); \
2530 })
2531 
2532 static bool
2533 can_write_at(vm_offset_t offs, uint32_t page)
2534 {
2535 	static const int zero;
2536 
2537 	return verify_write(&zero, (void *)(offs + ptoa(page) + 128), 1) == 0;
2538 }
2539 #define assert_writeable(offs, page) \
2540 	assertf(can_write_at(offs, page), \
2541 	    "can write at %p + ptoa(%d)", (void *)offs, page)
2542 
2543 #define assert_faults(offs, page) \
2544 	assertf(!can_write_at(offs, page), \
2545 	    "can write at %p + ptoa(%d)", (void *)offs, page)
2546 
2547 #define peek(offs, page) \
2548 	(*(uint32_t *)((offs) + ptoa(page)))
2549 
2550 #define poke(offs, page, v) \
2551 	(*(uint32_t *)((offs) + ptoa(page)) = (v))
2552 
2553 __attribute__((noinline))
2554 static void
2555 kmem_alloc_basic_test(vm_map_t map)
2556 {
2557 	kmem_guard_t guard = {
2558 		.kmg_tag = VM_KERN_MEMORY_DIAG,
2559 	};
2560 	vm_offset_t addr;
2561 
2562 	/*
2563 	 * Test wired basics:
2564 	 * - KMA_KOBJECT
2565 	 * - KMA_GUARD_FIRST, KMA_GUARD_LAST
2566 	 * - allocation alignment
2567 	 */
2568 	addr = kmem_alloc_guard(map, ptoa(10), ptoa(2) - 1,
2569 	    KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard).kmr_address;
2570 	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | GF | GL)", map);
2571 	assert3u((addr + PAGE_SIZE) % ptoa(2), ==, 0);
2572 	kmem_test_assert_map(map, 10, 1);
2573 
2574 	kmem_test_for_entry(map, addr, ^(vm_map_entry_t e){
2575 		assertf(e, "unable to find address %p in map %p", (void *)addr, map);
2576 		assert(e->vme_kernel_object);
2577 		assert(!e->vme_atomic);
2578 		assert3u(e->vme_start, <=, addr);
2579 		assert3u(addr + ptoa(10), <=, e->vme_end);
2580 	});
2581 
2582 	assert_faults(addr, 0);
2583 	for (int i = 1; i < 9; i++) {
2584 		assert_writeable(addr, i);
2585 	}
2586 	assert_faults(addr, 9);
2587 
2588 	kmem_free(map, addr, ptoa(10));
2589 	kmem_test_assert_map(map, 0, 0);
2590 
2591 	/*
2592 	 * Test pageable basics.
2593 	 */
2594 	addr = kmem_alloc_guard(map, ptoa(10), 0,
2595 	    KMA_PAGEABLE, guard).kmr_address;
2596 	assertf(addr != 0ull, "kma(%p, 10p, 0, PG)", map);
2597 	kmem_test_assert_map(map, 10, 1);
2598 
2599 	for (int i = 0; i < 9; i++) {
2600 		assert_faults(addr, i);
2601 		poke(addr, i, 42);
2602 		assert_writeable(addr, i);
2603 	}
2604 
2605 	kmem_free(map, addr, ptoa(10));
2606 	kmem_test_assert_map(map, 0, 0);
2607 }
2608 
2609 __attribute__((noinline))
2610 static void
2611 kmem_realloc_basic_test(vm_map_t map, kmr_flags_t kind)
2612 {
2613 	kmem_guard_t guard = {
2614 		.kmg_atomic  = !(kind & KMR_DATA),
2615 		.kmg_tag     = VM_KERN_MEMORY_DIAG,
2616 		.kmg_context = 0xefface,
2617 	};
2618 	vm_offset_t addr, newaddr;
2619 	const int N = 10;
2620 
2621 	/*
2622 	 *	This isn't something kmem_realloc_guard() _needs_ to do,
2623 	 *	we could conceive an implementation where it grows in place
2624 	 *	if there's space after it.
2625 	 *
2626 	 *	However, this is what the implementation does today.
2627 	 */
2628 	bool realloc_growth_changes_address = true;
2629 	bool GL = (kind & KMR_GUARD_LAST);
2630 
2631 	/*
2632 	 *	Initial N page allocation
2633 	 */
2634 	addr = kmem_alloc_guard(map, ptoa(N), 0,
2635 	    (kind & (KMA_KOBJECT | KMA_GUARD_LAST)) | KMA_ZERO,
2636 	    guard).kmr_address;
2637 	assert3u(addr, !=, 0);
2638 	kmem_test_assert_map(map, N, 1);
2639 	for (int pg = 0; pg < N - GL; pg++) {
2640 		poke(addr, pg, 42 + pg);
2641 	}
2642 	for (int pg = N - GL; pg < N; pg++) {
2643 		assert_faults(addr, pg);
2644 	}
2645 
2646 
2647 	/*
2648 	 *	Grow to N + 3 pages
2649 	 */
2650 	newaddr = kmem_realloc_guard(map, addr, ptoa(N), ptoa(N + 3),
2651 	    kind | KMR_ZERO, guard).kmr_address;
2652 	assert3u(newaddr, !=, 0);
2653 	if (realloc_growth_changes_address) {
2654 		assert3u(addr, !=, newaddr);
2655 	}
2656 	if ((kind & KMR_FREEOLD) || (addr == newaddr)) {
2657 		kmem_test_assert_map(map, N + 3, 1);
2658 	} else {
2659 		kmem_test_assert_map(map, 2 * N + 3, 2);
2660 	}
2661 	for (int pg = 0; pg < N - GL; pg++) {
2662 		assert3u(peek(newaddr, pg), ==, 42 + pg);
2663 	}
2664 	if ((kind & KMR_FREEOLD) == 0) {
2665 		for (int pg = 0; pg < N - GL; pg++) {
2666 			assert3u(peek(addr, pg), ==, 42 + pg);
2667 		}
2668 		/* check for true sharing: both mappings must alias the same pages */
2669 		poke(addr + 16, 0, 1234);
2670 		assert3u(peek(newaddr + 16, 0), ==, 1234);
2671 		kmem_free_guard(map, addr, ptoa(N), KMF_NONE, guard);
2672 		kmem_test_assert_map(map, N + 3, 1);
2673 	}
2674 	if (addr != newaddr) {
2675 		for (int pg = 0; pg < N - GL; pg++) {
2676 			assert_faults(addr, pg);
2677 		}
2678 	}
2679 	for (int pg = N - GL; pg < N + 3 - GL; pg++) {
2680 		assert3u(peek(newaddr, pg), ==, 0);
2681 	}
2682 	for (int pg = N + 3 - GL; pg < N + 3; pg++) {
2683 		assert_faults(newaddr, pg);
2684 	}
2685 	addr = newaddr;
2686 
2687 
2688 	/*
2689 	 *	Shrink to N - 2 pages
2690 	 */
2691 	newaddr = kmem_realloc_guard(map, addr, ptoa(N + 3), ptoa(N - 2),
2692 	    kind | KMR_ZERO, guard).kmr_address;
2693 	assert3u(map->size, ==, ptoa(N - 2));
2694 	assert3u(newaddr, ==, addr);
2695 	kmem_test_assert_map(map, N - 2, 1);
2696 
2697 	for (int pg = 0; pg < N - 2 - GL; pg++) {
2698 		assert3u(peek(addr, pg), ==, 42 + pg);
2699 	}
2700 	for (int pg = N - 2 - GL; pg < N + 3; pg++) {
2701 		assert_faults(addr, pg);
2702 	}
2703 
2704 	kmem_free_guard(map, addr, ptoa(N - 2), KMF_NONE, guard);
2705 	kmem_test_assert_map(map, 0, 0);
2706 }
2707 
2708 static int
2709 kmem_basic_test(__unused int64_t in, int64_t *out)
2710 {
2711 	vm_offset_t addr;
2712 	vm_map_t map;
2713 
2714 	printf("%s: test running\n", __func__);
2715 
2716 	map = kmem_suballoc(kernel_map, &addr, 64U << 20,
2717 	        VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
2718 	        KMS_NOFAIL | KMS_DATA, VM_KERN_MEMORY_DIAG).kmr_submap;
2719 
2720 	printf("%s: kmem_alloc ...\n", __func__);
2721 	kmem_alloc_basic_test(map);
2722 	printf("%s:     PASS\n", __func__);
2723 
2724 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
2725 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD);
2726 	printf("%s:     PASS\n", __func__);
2727 
2728 	printf("%s: kmem_realloc (KMR_FREEOLD) ...\n", __func__);
2729 	kmem_realloc_basic_test(map, KMR_FREEOLD);
2730 	printf("%s:     PASS\n", __func__);
2731 
2732 	printf("%s: kmem_realloc (KMR_NONE) ...\n", __func__);
2733 	kmem_realloc_basic_test(map, KMR_NONE);
2734 	printf("%s:     PASS\n", __func__);
2735 
2736 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
2737 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
2738 	printf("%s:     PASS\n", __func__);
2739 
2740 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
2741 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_LAST);
2742 	printf("%s:     PASS\n", __func__);
2743 
2744 	printf("%s: kmem_realloc (KMR_GUARD_LAST) ...\n", __func__);
2745 	kmem_realloc_basic_test(map, KMR_GUARD_LAST);
2746 	printf("%s:     PASS\n", __func__);
2747 
2748 	/* using KMR_DATA signals to test the non-atomic realloc path */
2749 	printf("%s: kmem_realloc (KMR_DATA | KMR_FREEOLD) ...\n", __func__);
2750 	kmem_realloc_basic_test(map, KMR_DATA | KMR_FREEOLD);
2751 	printf("%s:     PASS\n", __func__);
2752 
2753 	printf("%s: kmem_realloc (KMR_DATA) ...\n", __func__);
2754 	kmem_realloc_basic_test(map, KMR_DATA);
2755 	printf("%s:     PASS\n", __func__);
2756 
2757 	kmem_free_guard(kernel_map, addr, 64U << 20, KMF_NONE, KMEM_GUARD_SUBMAP);
2758 	vm_map_deallocate(map);
2759 
2760 	printf("%s: test passed\n", __func__);
2761 	*out = 1;
2762 	return 0;
2763 }
2764 SYSCTL_TEST_REGISTER(kmem_basic, kmem_basic_test);
2765 #endif /* DEBUG || DEVELOPMENT */
2766