xref: /xnu-8792.61.2/osfmk/vm/vm_kern.c (revision 42e220869062b56f8d7d0726fd4c88954f87902c)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	vm/vm_kern.c
60  *	Author:	Avadis Tevanian, Jr., Michael Wayne Young
61  *	Date:	1985
62  *
63  *	Kernel memory management.
64  */
65 
66 #include <mach/kern_return.h>
67 #include <mach/vm_param.h>
68 #include <kern/assert.h>
69 #include <kern/thread.h>
70 #include <vm/vm_kern.h>
71 #include <vm/vm_map_internal.h>
72 #include <vm/vm_object.h>
73 #include <vm/vm_page.h>
74 #include <vm/vm_compressor.h>
75 #include <vm/vm_pageout.h>
76 #include <vm/vm_init.h>
77 #include <vm/vm_fault.h>
78 #include <kern/misc_protos.h>
79 #include <vm/cpm.h>
80 #include <kern/ledger.h>
81 #include <kern/bits.h>
82 #include <kern/startup.h>
83 
84 #include <string.h>
85 
86 #include <libkern/OSDebug.h>
87 #include <libkern/crypto/sha2.h>
88 #include <libkern/section_keywords.h>
89 #include <sys/kdebug.h>
90 
91 #include <san/kasan.h>
92 #include <kern/kext_alloc.h>
93 #include <kern/backtrace.h>
94 #include <os/hash.h>
95 #include <kern/zalloc_internal.h>
96 
97 /*
98  *	Variables exported by this module.
99  */
100 
101 SECURITY_READ_ONLY_LATE(vm_map_t) kernel_map;
102 SECURITY_READ_ONLY_LATE(struct mach_vm_range) kmem_ranges[KMEM_RANGE_COUNT] = {};
103 TUNABLE(uint32_t, kmem_ptr_ranges, "kmem_ptr_ranges", 2);
104 #if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
105 __startup_data
106 vm_map_size_t data_range_size, ptr_range_size;
107 SECURITY_READ_ONLY_LATE(struct mach_vm_range)
108 kmem_large_ranges[KMEM_RANGE_COUNT] = {};
109 #endif
110 
111 #pragma mark helpers
112 
113 __attribute__((overloadable))
114 __header_always_inline kmem_flags_t
115 ANYF(kma_flags_t flags)
116 {
117 	return (kmem_flags_t)flags;
118 }
119 
120 __attribute__((overloadable))
121 __header_always_inline kmem_flags_t
122 ANYF(kmr_flags_t flags)
123 {
124 	return (kmem_flags_t)flags;
125 }
126 
127 __attribute__((overloadable))
128 __header_always_inline kmem_flags_t
129 ANYF(kmf_flags_t flags)
130 {
131 	return (kmem_flags_t)flags;
132 }
133 
134 __abortlike
135 static void
136 __kmem_invalid_size_panic(
137 	vm_map_t        map,
138 	vm_size_t       size,
139 	uint32_t        flags)
140 {
141 	panic("kmem(map=%p, flags=0x%x): invalid size %zd",
142 	    map, flags, (size_t)size);
143 }
144 
145 __abortlike
146 static void
147 __kmem_invalid_arguments_panic(
148 	const char     *what,
149 	vm_map_t        map,
150 	vm_address_t    address,
151 	vm_size_t       size,
152 	uint32_t        flags)
153 {
154 	panic("kmem_%s(map=%p, addr=%p, size=%zd, flags=0x%x): "
155 	    "invalid arguments passed",
156 	    what, map, (void *)address, (size_t)size, flags);
157 }
158 
159 __abortlike
160 static void
161 __kmem_failed_panic(
162 	vm_map_t        map,
163 	vm_size_t       size,
164 	uint32_t        flags,
165 	kern_return_t   kr,
166 	const char     *what)
167 {
168 	panic("kmem_%s(%p, %zd, 0x%x): failed with %d",
169 	    what, map, (size_t)size, flags, kr);
170 }
171 
172 __abortlike
173 static void
174 __kmem_entry_not_found_panic(
175 	vm_map_t        map,
176 	vm_offset_t     addr)
177 {
178 	panic("kmem(map=%p) no entry found at %p", map, (void *)addr);
179 }
180 
181 __abortlike
182 static void
183 __kmem_invalid_object_panic(uint32_t flags)
184 {
185 	if (flags == 0) {
186 		panic("KMEM_KOBJECT or KMEM_COMPRESSOR is required");
187 	}
188 	panic("more than one of KMEM_KOBJECT or KMEM_COMPRESSOR specified");
189 }
190 
191 static inline vm_object_t
192 __kmem_object(kmem_flags_t flags)
193 {
194 	flags &= (KMEM_KOBJECT | KMEM_COMPRESSOR);
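	/*
	 * (flags & (flags - 1)) is non-zero whenever more than one bit is set,
	 * so the check below panics unless exactly one of KMEM_KOBJECT or
	 * KMEM_COMPRESSOR was requested.
	 */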
195 	if (flags == 0 || (flags & (flags - 1))) {
196 		__kmem_invalid_object_panic(flags);
197 	}
198 
199 	return (flags & KMEM_KOBJECT) ? kernel_object : compressor_object;
200 }
201 
202 static inline vm_size_t
203 __kmem_guard_left(kmem_flags_t flags)
204 {
205 	return (flags & KMEM_GUARD_FIRST) ? PAGE_SIZE : 0;
206 }
207 
208 static inline vm_size_t
209 __kmem_guard_right(kmem_flags_t flags)
210 {
211 	return (flags & KMEM_GUARD_LAST) ? PAGE_SIZE : 0;
212 }
213 
214 static inline vm_size_t
215 __kmem_guard_size(kmem_flags_t flags)
216 {
217 	return __kmem_guard_left(flags) + __kmem_guard_right(flags);
218 }
219 
220 
221 #pragma mark kmem range methods
222 
223 #if __arm64__
224 // <rdar://problem/48304934> arm64 doesn't use ldp when I'd expect it to
225 #define mach_vm_range_load(r, r_min, r_max) \
226 	asm("ldp %[rmin], %[rmax], [%[range]]" \
227 	    : [rmin] "=r"(r_min), [rmax] "=r"(r_max) \
228 	    : [range] "r"(r), "m"((r)->min_address), "m"((r)->max_address))
229 #else
230 #define mach_vm_range_load(r, rmin, rmax) \
231 	({ rmin = (r)->min_address; rmax = (r)->max_address; })
232 #endif
233 
234 __abortlike
235 static void
236 __mach_vm_range_overflow(
237 	mach_vm_offset_t        addr,
238 	mach_vm_offset_t        size)
239 {
240 	panic("invalid vm range: [0x%llx, 0x%llx + 0x%llx) wraps around",
241 	    addr, addr, size);
242 }
243 
244 __abortlike
245 static void
246 __mach_vm_range_invalid(
247 	mach_vm_offset_t        min_address,
248 	mach_vm_offset_t        max_address)
249 {
250 	panic("invalid vm range: [0x%llx, 0x%llx) wraps around",
251 	    min_address, max_address);
252 }
253 
254 __header_always_inline mach_vm_size_t
255 mach_vm_range_size(const struct mach_vm_range *r)
256 {
257 	mach_vm_offset_t rmin, rmax;
258 
259 	mach_vm_range_load(r, rmin, rmax);
260 	return rmax - rmin;
261 }
262 
263 __attribute__((overloadable))
264 __header_always_inline bool
265 mach_vm_range_contains(const struct mach_vm_range *r, mach_vm_offset_t addr)
266 {
267 	mach_vm_offset_t rmin, rmax;
268 
269 #if CONFIG_KERNEL_TBI
270 	if (VM_KERNEL_ADDRESS(addr)) {
271 		addr = VM_KERNEL_TBI_FILL(addr);
272 	}
273 #endif /* CONFIG_KERNEL_TBI */
274 
275 	/*
276 	 * The `&` is not a typo: we really expect the check to pass,
277 	 * so encourage the compiler to eagerly load and test without branches
278 	 */
279 	mach_vm_range_load(r, rmin, rmax);
280 	return (addr >= rmin) & (addr < rmax);
281 }
282 
283 __attribute__((overloadable))
284 __header_always_inline bool
285 mach_vm_range_contains(
286 	const struct mach_vm_range *r,
287 	mach_vm_offset_t        addr,
288 	mach_vm_offset_t        size)
289 {
290 	mach_vm_offset_t rmin, rmax;
291 
292 #if CONFIG_KERNEL_TBI
293 	if (VM_KERNEL_ADDRESS(addr)) {
294 		addr = VM_KERNEL_TBI_FILL(addr);
295 	}
296 #endif /* CONFIG_KERNEL_TBI */
297 
298 	/*
299 	 * The `&` is not a typo: we really expect the check to pass,
300 	 * so encourage the compiler to eagerly load and test without branches
301 	 */
302 	mach_vm_range_load(r, rmin, rmax);
303 	return (addr >= rmin) & (addr + size >= rmin) & (addr + size <= rmax);
304 }
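
/*
 * A note on the arithmetic above (illustrative, assuming a non-zero rmin as
 * kernel ranges have): if `addr + size` overflows and wraps to a small value,
 * the `addr + size >= rmin` term fails, so an overflowing size can never
 * appear to fit inside the range.
 */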
305 
306 __attribute__((overloadable))
307 __header_always_inline bool
308 mach_vm_range_intersects(
309 	const struct mach_vm_range *r1,
310 	const struct mach_vm_range *r2)
311 {
312 	mach_vm_offset_t r1_min, r1_max;
313 	mach_vm_offset_t r2_min, r2_max;
314 
315 	mach_vm_range_load(r1, r1_min, r1_max);
316 	r2_min = r2->min_address;
317 	r2_max = r2->max_address;
318 
319 	if (r1_min > r1_max) {
320 		__mach_vm_range_invalid(r1_min, r1_max);
321 	}
322 
323 	if (r2_min > r2_max) {
324 		__mach_vm_range_invalid(r2_min, r2_max);
325 	}
326 
327 	return r1_max > r2_min && r1_min < r2_max;
328 }
329 
330 __attribute__((overloadable))
331 __header_always_inline bool
332 mach_vm_range_intersects(
333 	const struct mach_vm_range *r1,
334 	mach_vm_offset_t        addr,
335 	mach_vm_offset_t        size)
336 {
337 	struct mach_vm_range r2;
338 
339 #if CONFIG_KERNEL_TBI
340 	addr = VM_KERNEL_STRIP_UPTR(addr);
341 #endif /* CONFIG_KERNEL_TBI */
342 	r2.min_address = addr;
343 	if (os_add_overflow(addr, size, &r2.max_address)) {
344 		__mach_vm_range_overflow(addr, size);
345 	}
346 
347 	return mach_vm_range_intersects(r1, &r2);
348 }
349 
350 bool
351 kmem_range_id_contains(
352 	kmem_range_id_t         range_id,
353 	vm_map_offset_t         addr,
354 	vm_map_size_t           size)
355 {
356 	return mach_vm_range_contains(&kmem_ranges[range_id], addr, size);
357 }
358 
359 vm_map_size_t
360 kmem_range_id_size(kmem_range_id_t range_id)
361 {
362 	return mach_vm_range_size(&kmem_ranges[range_id]);
363 }
364 
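/*
 * Returns the id of the first range whose [min, max) interval contains
 * [addr, addr + size).  If no range matches, the loop runs off the end and
 * the function returns KMEM_RANGE_COUNT, which a caller can treat as
 * "not in any kmem range".
 */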
365 kmem_range_id_t
366 kmem_addr_get_range(vm_map_offset_t addr, vm_map_size_t size)
367 {
368 	kmem_range_id_t range_id = 0;
369 	for (; range_id < KMEM_RANGE_COUNT; range_id++) {
370 		if (kmem_range_id_contains(range_id, addr, size)) {
371 			break;
372 		}
373 	}
374 	return range_id;
375 }
376 
377 
378 #pragma mark entry parameters
379 
380 
381 __abortlike
382 static void
383 __kmem_entry_validate_panic(
384 	vm_map_t        map,
385 	vm_map_entry_t  entry,
386 	vm_offset_t     addr,
387 	vm_size_t       size,
388 	uint32_t        flags,
389 	kmem_guard_t    guard)
390 {
391 	const char *what = "???";
392 
393 	if (entry->vme_atomic != guard.kmg_atomic) {
394 		what = "atomicity";
395 	} else if (entry->is_sub_map != guard.kmg_submap) {
396 		what = "objectness";
397 	} else if (addr != entry->vme_start) {
398 		what = "left bound";
399 	} else if ((flags & KMF_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
400 		what = "right bound";
401 #if __LP64__
402 	} else if (guard.kmg_context != entry->vme_context) {
403 		what = "guard";
404 #endif
405 	}
406 
407 	panic("kmem(map=%p, addr=%p, size=%zd, flags=0x%x): "
408 	    "entry:%p %s mismatch guard(0x%08x)",
409 	    map, (void *)addr, size, flags, entry,
410 	    what, guard.kmg_context);
411 }
412 
413 static bool
414 __kmem_entry_validate_guard(
415 	vm_map_entry_t  entry,
416 	vm_offset_t     addr,
417 	vm_size_t       size,
418 	kmem_flags_t    flags,
419 	kmem_guard_t    guard)
420 {
421 	if (entry->vme_atomic != guard.kmg_atomic) {
422 		return false;
423 	}
424 
425 	if (!guard.kmg_atomic) {
426 		return true;
427 	}
428 
429 	if (entry->is_sub_map != guard.kmg_submap) {
430 		return false;
431 	}
432 
433 	if (addr != entry->vme_start) {
434 		return false;
435 	}
436 
437 	if ((flags & KMEM_GUESS_SIZE) == 0 && addr + size != entry->vme_end) {
438 		return false;
439 	}
440 
441 #if __LP64__
442 	if (!guard.kmg_submap && guard.kmg_context != entry->vme_context) {
443 		return false;
444 	}
445 #endif
446 
447 	return true;
448 }
449 
450 void
451 kmem_entry_validate_guard(
452 	vm_map_t        map,
453 	vm_map_entry_t  entry,
454 	vm_offset_t     addr,
455 	vm_size_t       size,
456 	kmem_guard_t    guard)
457 {
458 	if (!__kmem_entry_validate_guard(entry, addr, size, KMEM_NONE, guard)) {
459 		__kmem_entry_validate_panic(map, entry, addr, size, KMEM_NONE, guard);
460 	}
461 }
462 
463 __abortlike
464 static void
465 __kmem_entry_validate_object_panic(
466 	vm_map_t        map,
467 	vm_map_entry_t  entry,
468 	kmem_flags_t    flags)
469 {
470 	const char *what;
471 	const char *verb;
472 
473 	if (entry->is_sub_map) {
474 		panic("kmem(map=%p) entry %p is a submap", map, entry);
475 	}
476 
477 	if (flags & KMEM_KOBJECT) {
478 		what = "kernel";
479 		verb = "isn't";
480 	} else if (flags & KMEM_COMPRESSOR) {
481 		what = "compressor";
482 		verb = "isn't";
483 	} else if (entry->vme_kernel_object) {
484 		what = "kernel";
485 		verb = "is unexpectedly";
486 	} else {
487 		what = "compressor";
488 		verb = "is unexpectedly";
489 	}
490 
491 	panic("kmem(map=%p, flags=0x%x): entry %p %s for the %s object",
492 	    map, flags, entry, verb, what);
493 }
494 
495 static bool
496 __kmem_entry_validate_object(
497 	vm_map_entry_t  entry,
498 	kmem_flags_t    flags)
499 {
500 	if (entry->is_sub_map) {
501 		return false;
502 	}
503 	if ((bool)(flags & KMEM_KOBJECT) != entry->vme_kernel_object) {
504 		return false;
505 	}
506 
507 	return (bool)(flags & KMEM_COMPRESSOR) ==
508 	       (VME_OBJECT(entry) == compressor_object);
509 }
510 
511 vm_size_t
512 kmem_size_guard(
513 	vm_map_t        map,
514 	vm_offset_t     addr,
515 	kmem_guard_t    guard)
516 {
517 	kmem_flags_t flags = KMEM_GUESS_SIZE;
518 	vm_map_entry_t entry;
519 	vm_size_t size;
520 
521 	vm_map_lock_read(map);
522 
523 	if (!vm_map_lookup_entry(map, addr, &entry)) {
524 		__kmem_entry_not_found_panic(map, addr);
525 	}
526 
527 	if (!__kmem_entry_validate_guard(entry, addr, 0, flags, guard)) {
528 		__kmem_entry_validate_panic(map, entry, addr, 0, flags, guard);
529 	}
530 
531 	size = (vm_size_t)(entry->vme_end - entry->vme_start);
532 
533 	vm_map_unlock_read(map);
534 
535 	return size;
536 }
537 
538 #if ZSECURITY_CONFIG(KALLOC_TYPE)
539 static inline uint16_t
540 kmem_hash_backtrace(
541 	void                     *fp)
542 {
543 	uint64_t  bt_count;
544 	uintptr_t bt[8] = {};
545 
546 	struct backtrace_control ctl = {
547 		.btc_frame_addr = (uintptr_t)fp,
548 	};
549 
550 	bt_count = backtrace(bt, sizeof(bt) / sizeof(bt[0]), &ctl, NULL);
551 	return (uint16_t) os_hash_jenkins(bt, bt_count * sizeof(bt[0]));
552 }
553 #endif
554 
555 static_assert(KMEM_RANGE_ID_DATA - 1 <= KMEM_RANGE_MASK,
556     "Insufficient bits to represent ptr ranges");
557 
558 kmem_range_id_t
559 kmem_adjust_range_id(
560 	uint32_t                  hash)
561 {
562 	return (kmem_range_id_t) (KMEM_RANGE_ID_PTR_0 +
563 	       (hash & KMEM_RANGE_MASK) % kmem_ptr_ranges);
564 }
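
/*
 * Illustrative example, assuming the default kmem_ptr_ranges of 2 (see the
 * TUNABLE above): the masked hash is reduced modulo 2, so even values map to
 * KMEM_RANGE_ID_PTR_0 and odd values to the following pointer range,
 * spreading typed allocations across the pointer ranges.
 */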
565 
566 static void
567 kmem_apply_security_policy(
568 	vm_map_t                  map,
569 	kma_flags_t               kma_flags,
570 	kmem_guard_t              guard,
571 	vm_map_kernel_flags_t    *vmk_flags,
572 	bool                      assert_dir __unused)
573 {
574 	kmem_range_id_t range_id;
575 	bool direction;
576 	uint16_t type_hash = guard.kmg_type_hash;
577 
578 	if (startup_phase < STARTUP_SUB_KMEM || map != kernel_map) {
579 		return;
580 	}
581 
582 	/*
583 	 * When ZSECURITY_CONFIG(KALLOC_TYPE) is enabled, a non-zero type-hash
584 	 * must be passed by krealloc_type
585 	 */
586 #if (DEBUG || DEVELOPMENT) && ZSECURITY_CONFIG(KALLOC_TYPE)
587 	if (assert_dir && !(kma_flags & KMA_DATA)) {
588 		assert(type_hash != 0);
589 	}
590 #endif
591 
592 	if (kma_flags & KMA_DATA) {
593 		range_id = KMEM_RANGE_ID_DATA;
594 		/*
595 		 * As an optimization in KMA_DATA to avoid fragmentation,
596 		 * allocate static carveouts at the end of the DATA range.
597 		 */
598 		direction = (bool)(kma_flags & KMA_PERMANENT);
599 	} else if (type_hash) {
600 		range_id = type_hash & KMEM_RANGE_MASK;
601 		direction = type_hash & KMEM_DIRECTION_MASK;
602 	} else {
603 #if ZSECURITY_CONFIG(KALLOC_TYPE)
604 		type_hash = (uint16_t) kmem_hash_backtrace(__builtin_frame_address(0));
605 #endif
606 		/*
607 		 * Range id needs to correspond to one of the PTR ranges
608 		 */
609 		range_id = kmem_adjust_range_id(type_hash);
610 		direction = type_hash & KMEM_DIRECTION_MASK;
611 	}
612 
613 	vmk_flags->vmkf_range_id = range_id;
614 	vmk_flags->vmkf_last_free = direction;
615 }
616 
617 #pragma mark allocation
618 
619 kern_return_t
620 kmem_alloc_contig(
621 	vm_map_t                map,
622 	vm_offset_t             *addrp,
623 	vm_size_t               size,
624 	vm_offset_t             mask,
625 	ppnum_t                 max_pnum,
626 	ppnum_t                 pnum_mask,
627 	kma_flags_t             flags,
628 	vm_tag_t                tag)
629 {
630 	vm_object_t             object;
631 	vm_object_offset_t      offset;
632 	vm_map_offset_t         map_addr;
633 	vm_map_offset_t         map_mask;
634 	vm_map_size_t           map_size, i;
635 	vm_map_entry_t          entry;
636 	vm_page_t               m, pages;
637 	kern_return_t           kr;
638 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
639 
640 	assert(VM_KERN_MEMORY_NONE != tag);
641 	assert(map);
642 	assert3u(flags & ~KMEM_ALLOC_CONTIG_FLAGS, ==, 0);
643 
644 	map_size = vm_map_round_page(size, VM_MAP_PAGE_MASK(map));
645 	map_mask = (vm_map_offset_t)mask;
646 
647 	/* Check for zero allocation size (either directly or via overflow) */
648 	if (map_size == 0) {
649 		*addrp = 0;
650 		return KERN_INVALID_ARGUMENT;
651 	}
652 
653 	/*
654 	 *	Allocate a new object (if necessary) and the reference we
655 	 *	will be donating to the map entry.  We must do this before
656 	 *	locking the map, or risk deadlock with the default pager.
657 	 */
658 	if ((flags & KMA_KOBJECT) != 0) {
659 		object = kernel_object;
660 		vm_object_reference(object);
661 	} else {
662 		object = vm_object_allocate(map_size);
663 		/* stabilize the object to prevent shadowing */
664 		object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
665 		object->true_share = TRUE;
666 	}
667 	if (flags & KMA_PERMANENT) {
668 		vmk_flags.vmkf_permanent = true;
669 	}
670 	kmem_apply_security_policy(map, flags, KMEM_GUARD_NONE, &vmk_flags, false);
671 
672 	kr = vm_map_find_space(map, 0, map_size, map_mask,
673 	    vmk_flags, &entry);
674 	if (KERN_SUCCESS != kr) {
675 		vm_object_deallocate(object);
676 		return kr;
677 	}
678 
679 	map_addr = entry->vme_start;
680 	if (object == kernel_object) {
681 		offset = map_addr;
682 	} else {
683 		offset = 0;
684 	}
685 	VME_OBJECT_SET(entry, object, false, 0);
686 	VME_OFFSET_SET(entry, offset);
687 	VME_ALIAS_SET(entry, tag);
688 
689 	/* Take an extra object ref in case the map entry gets deleted */
690 	vm_object_reference(object);
691 	vm_map_unlock(map);
692 
693 	kr = cpm_allocate(CAST_DOWN(vm_size_t, map_size), &pages, max_pnum, pnum_mask, FALSE, flags);
694 
695 	if (kr != KERN_SUCCESS) {
696 		vm_map_remove(map,
697 		    vm_map_trunc_page(map_addr,
698 		    VM_MAP_PAGE_MASK(map)),
699 		    vm_map_round_page(map_addr + map_size,
700 		    VM_MAP_PAGE_MASK(map)));
701 		vm_object_deallocate(object);
702 		*addrp = 0;
703 		return kr;
704 	}
705 
706 	if (flags & KMA_ZERO) {
707 		for (m = pages; m; m = NEXT_PAGE(m)) {
708 			vm_page_zero_fill(m);
709 		}
710 	}
711 
712 
713 	vm_object_lock(object);
714 	for (i = 0; i < map_size; i += PAGE_SIZE) {
715 		m = pages;
716 		pages = NEXT_PAGE(m);
717 		*(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
718 		m->vmp_busy = FALSE;
719 		vm_page_insert(m, object, offset + i);
720 	}
721 	vm_object_unlock(object);
722 
723 	kr = vm_map_wire_kernel(map,
724 	    vm_map_trunc_page(map_addr,
725 	    VM_MAP_PAGE_MASK(map)),
726 	    vm_map_round_page(map_addr + map_size,
727 	    VM_MAP_PAGE_MASK(map)),
728 	    VM_PROT_DEFAULT, tag,
729 	    FALSE);
730 
731 	if (kr != KERN_SUCCESS) {
732 		if (object == kernel_object) {
733 			vm_object_lock(object);
734 			vm_object_page_remove(object, offset, offset + map_size);
735 			vm_object_unlock(object);
736 		}
737 		vm_map_remove(map,
738 		    vm_map_trunc_page(map_addr,
739 		    VM_MAP_PAGE_MASK(map)),
740 		    vm_map_round_page(map_addr + map_size,
741 		    VM_MAP_PAGE_MASK(map)));
742 		vm_object_deallocate(object);
743 		return kr;
744 	}
745 	vm_object_deallocate(object);
746 
747 	if (object == kernel_object) {
748 		vm_map_simplify(map, map_addr);
749 		vm_tag_update_size(tag, map_size);
750 	}
751 	*addrp = (vm_offset_t) map_addr;
752 	assert((vm_map_offset_t) *addrp == map_addr);
753 
754 	return KERN_SUCCESS;
755 }
756 
757 kmem_return_t
758 kmem_alloc_guard(
759 	vm_map_t        map,
760 	vm_size_t       size,
761 	vm_offset_t     mask,
762 	kma_flags_t     flags,
763 	kmem_guard_t    guard)
764 {
765 	vm_object_t             object;
766 	vm_map_entry_t          entry = NULL;
767 	vm_map_offset_t         map_addr, fill_start;
768 	vm_map_size_t           map_size, fill_size;
769 	vm_page_t               guard_left = VM_PAGE_NULL;
770 	vm_page_t               guard_right = VM_PAGE_NULL;
771 	vm_page_t               wired_page_list = VM_PAGE_NULL;
772 	vm_map_kernel_flags_t   vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
773 	bool                    skip_guards;
774 	kmem_return_t           kmr = { };
775 
776 	assert(kernel_map && map->pmap == kernel_pmap);
777 
778 #if DEBUG || DEVELOPMENT
779 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
780 	    size, 0, 0, 0);
781 #endif
782 
783 	if (size == 0 ||
784 	    (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) ||
785 	    (size < __kmem_guard_size(ANYF(flags)))) {
786 		__kmem_invalid_size_panic(map, size, flags);
787 	}
788 
789 	/*
790 	 * limit the size of a single extent of wired memory
791 	 * to try and limit the damage to the system if
792 	 * too many pages get wired down
793 	 * limit raised to 2GB with 128GB max physical limit,
794 	 * but scaled by installed memory above this
795 	 */
796 	if (__improbable(!(flags & (KMA_VAONLY | KMA_PAGEABLE)) &&
797 	    size > MAX(1ULL << 31, sane_size / 64))) {
798 		kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
799 		goto out_error;
800 	}
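
	/*
	 * Illustrative arithmetic for the cap above: sane_size / 64 is roughly
	 * 2GB on a 128GB configuration (matching the 1ULL << 31 floor) and
	 * grows past that, e.g. roughly 4GB with 256GB of memory installed.
	 */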
801 
802 	/*
803 	 * Guard pages:
804 	 *
805 	 * Guard pages are implemented as fictitious pages.
806 	 *
807 	 * However, some maps, and some objects are known
808 	 * to manage their memory explicitly, and do not need
809 	 * those to be materialized, which saves memory.
810 	 *
811 	 * By placing guard pages on either end of a stack,
812 	 * they can help detect cases where a thread walks
813 	 * off either end of its stack.
814 	 *
815 	 * They are allocated and set up here and attempts
816 	 * to access those pages are trapped in vm_fault_page().
817 	 *
818 	 * The map_size we were passed may include extra space for
819 	 * guard pages. fill_size represents the actual size to populate.
820 	 * Similarly, fill_start indicates where the actual pages
821 	 * will begin in the range.
822 	 */
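
	/*
	 * Illustrative example: a KMA_GUARD_FIRST | KMA_GUARD_LAST request
	 * rounds up to map_size bytes of VA whose first and last pages hold
	 * guards; fill_start becomes PAGE_SIZE and fill_size becomes
	 * map_size - 2 * PAGE_SIZE, the span actually backed with pages below.
	 */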
823 
824 	map_size   = round_page(size);
825 	fill_start = 0;
826 	fill_size  = map_size - __kmem_guard_size(ANYF(flags));
827 
828 	skip_guards = (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) ||
829 	    map->never_faults;
830 
831 	if (flags & KMA_GUARD_FIRST) {
832 		vmk_flags.vmkf_guard_before = true;
833 		fill_start += PAGE_SIZE;
834 	}
835 	if ((flags & KMA_GUARD_FIRST) && !skip_guards) {
836 		guard_left = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
837 		if (__improbable(guard_left == VM_PAGE_NULL)) {
838 			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
839 			goto out_error;
840 		}
841 	}
842 	if ((flags & KMA_GUARD_LAST) && !skip_guards) {
843 		guard_right = vm_page_grab_guard((flags & KMA_NOPAGEWAIT) == 0);
844 		if (__improbable(guard_right == VM_PAGE_NULL)) {
845 			kmr.kmr_return = KERN_RESOURCE_SHORTAGE;
846 			goto out_error;
847 		}
848 	}
849 
850 	if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
851 		kmr.kmr_return = vm_page_alloc_list(atop(fill_size), flags,
852 		    &wired_page_list);
853 		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
854 			goto out_error;
855 		}
856 	}
857 
858 	/*
859 	 *	Allocate a new object (if necessary).  We must do this before
860 	 *	locking the map, or risk deadlock with the default pager.
861 	 */
862 	if (flags & KMA_KOBJECT) {
863 		object = kernel_object;
864 		vm_object_reference(object);
865 	} else if (flags & KMA_COMPRESSOR) {
866 		object = compressor_object;
867 		vm_object_reference(object);
868 	} else {
869 		object = vm_object_allocate(map_size);
870 		/* stabilize the object to prevent shadowing */
871 		object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
872 		object->true_share = TRUE;
873 	}
874 
875 	if (flags & KMA_LAST_FREE) {
876 		vmk_flags.vmkf_last_free = true;
877 	}
878 	if (flags & KMA_PERMANENT) {
879 		vmk_flags.vmkf_permanent = true;
880 	}
881 	kmem_apply_security_policy(map, flags, guard, &vmk_flags, false);
882 
883 	kmr.kmr_return = vm_map_find_space(map, 0, map_size, mask,
884 	    vmk_flags, &entry);
885 	if (__improbable(KERN_SUCCESS != kmr.kmr_return)) {
886 		vm_object_deallocate(object);
887 		goto out_error;
888 	}
889 
890 	map_addr = entry->vme_start;
891 	VME_OBJECT_SET(entry, object, guard.kmg_atomic, guard.kmg_context);
892 	VME_ALIAS_SET(entry, guard.kmg_tag);
893 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
894 		VME_OFFSET_SET(entry, map_addr);
895 	} else {
896 		vm_object_reference(object);
897 	}
898 
899 	if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE))) {
900 		entry->wired_count = 1;
901 	}
902 
903 	if (guard_left || guard_right || wired_page_list) {
904 		vm_object_offset_t offset = 0ull;
905 
906 		vm_object_lock(object);
907 		vm_map_unlock(map);
908 
909 		if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
910 			offset = map_addr;
911 		}
912 
913 		if (guard_left) {
914 			vm_page_insert(guard_left, object, offset);
915 			guard_left->vmp_busy = FALSE;
916 			guard_left = VM_PAGE_NULL;
917 		}
918 
919 		if (guard_right) {
920 			vm_page_insert(guard_right, object,
921 			    offset + fill_start + fill_size);
922 			guard_right->vmp_busy = FALSE;
923 			guard_right = VM_PAGE_NULL;
924 		}
925 
926 		if (wired_page_list) {
927 			kernel_memory_populate_object_and_unlock(object,
928 			    map_addr + fill_start, offset + fill_start, fill_size,
929 			    wired_page_list, flags, guard.kmg_tag, VM_PROT_DEFAULT);
930 		} else {
931 			vm_object_unlock(object);
932 		}
933 	} else {
934 		vm_map_unlock(map);
935 	}
936 
937 #if KASAN
938 	if (flags & KMA_PAGEABLE) {
939 		/*
940 		 * We need to allow the range for pageable memory,
941 		 * or faulting will not be allowed.
942 		 */
943 		kasan_notify_address(map_addr, map_size);
944 	}
945 #endif
946 	/*
947 	 * now that the pages are wired, we no longer have to fear coalescing
948 	 */
949 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
950 		vm_map_simplify(map, map_addr);
951 	} else {
952 		vm_object_deallocate(object);
953 	}
954 
955 #if DEBUG || DEVELOPMENT
956 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
957 	    atop(fill_size), 0, 0, 0);
958 #endif
959 	kmr.kmr_address = CAST_DOWN(vm_offset_t, map_addr);
960 	return kmr;
961 
962 out_error:
963 	if (flags & KMA_NOFAIL) {
964 		__kmem_failed_panic(map, size, flags, kmr.kmr_return, "alloc");
965 	}
966 	if (guard_left) {
967 		guard_left->vmp_snext = wired_page_list;
968 		wired_page_list = guard_left;
969 	}
970 	if (guard_right) {
971 		guard_right->vmp_snext = wired_page_list;
972 		wired_page_list = guard_right;
973 	}
974 	if (wired_page_list) {
975 		vm_page_free_list(wired_page_list, FALSE);
976 	}
977 
978 #if DEBUG || DEVELOPMENT
979 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
980 	    0, 0, 0, 0);
981 #endif
982 
983 	return kmr;
984 }
985 
986 kmem_return_t
987 kmem_suballoc(
988 	vm_map_t                parent,
989 	mach_vm_offset_t       *addr,
990 	vm_size_t               size,
991 	vm_map_create_options_t vmc_options,
992 	int                     vm_flags,
993 	kms_flags_t             flags,
994 	vm_tag_t                tag)
995 {
996 	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
997 	vm_map_offset_t map_addr = 0;
998 	kmem_return_t kmr = { };
999 	vm_map_t map;
1000 
1001 	assert(page_aligned(size));
1002 	assert(parent->pmap == kernel_pmap);
1003 
1004 #if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
1005 	if (parent == kernel_map) {
1006 		assert((vm_flags & VM_FLAGS_FIXED_RANGE_SUBALLOC) ||
1007 		    (flags & KMS_DATA));
1008 	}
1009 #endif /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
1010 
1011 	if ((vm_flags & VM_FLAGS_ANYWHERE) == 0) {
1012 		map_addr = trunc_page(*addr);
1013 	}
1014 
1015 	pmap_reference(vm_map_pmap(parent));
1016 	map = vm_map_create_options(vm_map_pmap(parent), 0, size, vmc_options);
1017 
1018 	/*
1019 	 * 1. vm_map_enter() will consume one ref on success.
1020 	 *
1021 	 * 2. make the entry atomic as kernel submaps should never be split.
1022 	 *
1023 	 * 3. instruct vm_map_enter() that it is a fresh submap
1024 	 *    that needs to be taught its bounds as it is inserted.
1025 	 */
1026 	vm_map_reference(map);
1027 	vmk_flags.vmkf_submap = true;
1028 	if ((flags & KMS_DATA) == 0) {
1029 		/* FIXME: IOKit submaps get fragmented and can't be atomic */
1030 		vmk_flags.vmkf_submap_atomic = true;
1031 	}
1032 	vmk_flags.vmkf_submap_adjust = true;
1033 	if (flags & KMS_LAST_FREE) {
1034 		vmk_flags.vmkf_last_free = true;
1035 	}
1036 	if (flags & KMS_PERMANENT) {
1037 		vmk_flags.vmkf_permanent = true;
1038 	}
1039 	if (flags & KMS_DATA) {
1040 		vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
1041 	}
1042 
1043 	kmr.kmr_return = vm_map_enter(parent, &map_addr, size, 0,
1044 	    vm_flags, vmk_flags, tag, (vm_object_t)map, 0, FALSE,
1045 	    VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
1046 
1047 	if (kmr.kmr_return != KERN_SUCCESS) {
1048 		if (flags & KMS_NOFAIL) {
1049 			panic("kmem_suballoc(map=%p, size=%zd) failed with %d",
1050 			    parent, size, kmr.kmr_return);
1051 		}
1052 		assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1053 		vm_map_deallocate(map);
1054 		vm_map_deallocate(map); /* also removes ref to pmap */
1055 		return kmr;
1056 	}
1057 
1058 	/*
1059 	 * For kmem_suballocs that register a claim and are assigned a range, ensure
1060 	 * that the exact same range is returned.
1061 	 */
1062 	if (*addr != 0 && parent == kernel_map &&
1063 	    startup_phase > STARTUP_SUB_KMEM) {
1064 		assert(CAST_DOWN(vm_offset_t, map_addr) == *addr);
1065 	} else {
1066 		*addr = map_addr;
1067 	}
1068 
1069 	kmr.kmr_submap = map;
1070 	return kmr;
1071 }
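
/*
 * A minimal usage sketch (illustrative only; the create option, vm_flags and
 * tag below are assumptions, not requirements taken from this file):
 *
 *	mach_vm_offset_t subaddr = 0;
 *	vm_size_t size = 16 * PAGE_SIZE;
 *	kmem_return_t kmr;
 *
 *	kmr = kmem_suballoc(kernel_map, &subaddr, size, VM_MAP_CREATE_DEFAULT,
 *	    VM_FLAGS_ANYWHERE, KMS_PERMANENT | KMS_DATA, VM_KERN_MEMORY_OSFMK);
 *	if (kmr.kmr_return == KERN_SUCCESS) {
 *		... kmr.kmr_submap is the new submap, subaddr its base ...
 *	}
 */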
1072 
1073 /*
1074  *	kmem_alloc:
1075  *
1076  *	Allocate wired-down memory in the kernel's address map
1077  *	or a submap.  The memory is not zero-filled.
1078  */
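
/*
 * A minimal usage sketch (illustrative only; the flags and tag are
 * assumptions, not taken from this file):
 *
 *	vm_offset_t addr;
 *	kern_return_t kr;
 *
 *	kr = kmem_alloc(kernel_map, &addr, PAGE_SIZE, KMA_ZERO,
 *	    VM_KERN_MEMORY_OSFMK);
 *	if (kr == KERN_SUCCESS) {
 *		... use the allocation ...
 *		kmem_free(kernel_map, addr, PAGE_SIZE);
 *	}
 */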
1079 
1080 __exported kern_return_t
1081 kmem_alloc_external(
1082 	vm_map_t        map,
1083 	vm_offset_t     *addrp,
1084 	vm_size_t       size);
1085 kern_return_t
1086 kmem_alloc_external(
1087 	vm_map_t        map,
1088 	vm_offset_t     *addrp,
1089 	vm_size_t       size)
1090 {
1091 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1092 		return kmem_alloc(map, addrp, size, KMA_NONE, vm_tag_bt());
1093 	}
1094 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1095 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1096 }
1097 
1098 
1099 /*
1100  *	kmem_alloc_kobject:
1101  *
1102  *	Allocate wired-down memory in the kernel's address map
1103  *	or a submap.  The memory is not zero-filled.
1104  *
1105  *	The memory is allocated in the kernel_object.
1106  *	It may not be copied with vm_map_copy, and
1107  *	it may not be reallocated with kmem_realloc.
1108  */
1109 
1110 __exported kern_return_t
1111 kmem_alloc_kobject_external(
1112 	vm_map_t        map,
1113 	vm_offset_t     *addrp,
1114 	vm_size_t       size);
1115 kern_return_t
1116 kmem_alloc_kobject_external(
1117 	vm_map_t        map,
1118 	vm_offset_t     *addrp,
1119 	vm_size_t       size)
1120 {
1121 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1122 		return kmem_alloc(map, addrp, size, KMA_KOBJECT, vm_tag_bt());
1123 	}
1124 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1125 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1126 }
1127 
1128 /*
1129  *	kmem_alloc_pageable:
1130  *
1131  *	Allocate pageable memory in the kernel's address map.
1132  */
1133 
1134 __exported kern_return_t
1135 kmem_alloc_pageable_external(
1136 	vm_map_t        map,
1137 	vm_offset_t     *addrp,
1138 	vm_size_t       size);
1139 kern_return_t
1140 kmem_alloc_pageable_external(
1141 	vm_map_t        map,
1142 	vm_offset_t     *addrp,
1143 	vm_size_t       size)
1144 {
1145 	if (size && (size >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) == 0) {
1146 		return kmem_alloc(map, addrp, size, KMA_PAGEABLE | KMA_DATA, vm_tag_bt());
1147 	}
1148 	/* Maintain ABI compatibility: invalid sizes used to be allowed */
1149 	return size ? KERN_NO_SPACE: KERN_INVALID_ARGUMENT;
1150 }
1151 
1152 
1153 #pragma mark population
1154 
1155 static void
1156 kernel_memory_populate_pmap_enter(
1157 	vm_object_t             object,
1158 	vm_address_t            addr,
1159 	vm_object_offset_t      offset,
1160 	vm_page_t               mem,
1161 	vm_prot_t               prot,
1162 	int                     pe_flags)
1163 {
1164 	kern_return_t   pe_result;
1165 	int             pe_options;
1166 
1167 	PMAP_ENTER_CHECK(kernel_pmap, mem);
1168 
1169 	pe_options = PMAP_OPTIONS_NOWAIT;
1170 	if (object->internal) {
1171 		pe_options |= PMAP_OPTIONS_INTERNAL;
1172 	}
1173 	if (mem->vmp_reusable || object->all_reusable) {
1174 		pe_options |= PMAP_OPTIONS_REUSABLE;
1175 	}
1176 
1177 	pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1178 	    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1179 	    pe_flags, /* wired */ TRUE, pe_options, NULL);
1180 
1181 	if (pe_result == KERN_RESOURCE_SHORTAGE) {
1182 		vm_object_unlock(object);
1183 
1184 		pe_options &= ~PMAP_OPTIONS_NOWAIT;
1185 
1186 		pe_result = pmap_enter_options(kernel_pmap, addr + offset,
1187 		    VM_PAGE_GET_PHYS_PAGE(mem), prot, VM_PROT_NONE,
1188 		    pe_flags, /* wired */ TRUE, pe_options, NULL);
1189 
1190 		vm_object_lock(object);
1191 	}
1192 
1193 	assert(pe_result == KERN_SUCCESS);
1194 }
1195 
1196 void
1197 kernel_memory_populate_object_and_unlock(
1198 	vm_object_t     object, /* must be locked */
1199 	vm_address_t    addr,
1200 	vm_offset_t     offset,
1201 	vm_size_t       size,
1202 	vm_page_t       page_list,
1203 	kma_flags_t     flags,
1204 	vm_tag_t        tag,
1205 	vm_prot_t       prot)
1206 {
1207 	vm_page_t       mem;
1208 	int             pe_flags;
1209 
1210 	assert3u((bool)(flags & KMA_KOBJECT), ==, object == kernel_object);
1211 	assert3u((bool)(flags & KMA_COMPRESSOR), ==, object == compressor_object);
1212 	if (flags & (KMA_KOBJECT | KMA_COMPRESSOR)) {
1213 		assert3u(offset, ==, addr);
1214 	}
1215 
1216 	if (flags & KMA_KSTACK) {
1217 		pe_flags = VM_MEM_STACK;
1218 	} else {
1219 		pe_flags = 0;
1220 	}
1221 
1222 	for (vm_object_offset_t pg_offset = 0;
1223 	    pg_offset < size;
1224 	    pg_offset += PAGE_SIZE_64) {
1225 		if (page_list == NULL) {
1226 			panic("%s: page_list too short", __func__);
1227 		}
1228 
1229 		mem = page_list;
1230 		page_list = mem->vmp_snext;
1231 		mem->vmp_snext = NULL;
1232 
1233 		assert(mem->vmp_wire_count == 0);
1234 		assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
1235 
1236 		if (flags & KMA_COMPRESSOR) {
1237 			mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
1238 			/*
1239 			 * Background processes doing I/O accounting can call
1240 			 * into NVME driver to do some work which results in
1241 			 * an allocation here and so we want to make sure
1242 			 * that the pages used by compressor, regardless of
1243 			 * process context, are never on the special Q.
1244 			 */
1245 			mem->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
1246 
1247 			vm_page_insert(mem, object, offset + pg_offset);
1248 		} else {
1249 			mem->vmp_q_state = VM_PAGE_IS_WIRED;
1250 			mem->vmp_wire_count = 1;
1251 
1252 			vm_page_insert_wired(mem, object, offset + pg_offset, tag);
1253 		}
1254 
1255 		mem->vmp_busy = false;
1256 		mem->vmp_pmapped = true;
1257 		mem->vmp_wpmapped = true;
1258 
1259 		/*
1260 		 * Manual PMAP_ENTER_OPTIONS() with shortcuts
1261 		 * for the kernel and compressor objects.
1262 		 */
1263 
1264 		kernel_memory_populate_pmap_enter(object, addr, pg_offset,
1265 		    mem, prot, pe_flags);
1266 
1267 		if (flags & KMA_NOENCRYPT) {
1268 			pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
1269 		}
1270 	}
1271 
1272 	if (page_list) {
1273 		panic("%s: page_list too long", __func__);
1274 	}
1275 
1276 	vm_object_unlock(object);
1277 
1278 	if (!(flags & KMA_COMPRESSOR)) {
1279 		vm_page_lockspin_queues();
1280 		vm_page_wire_count += atop(size);
1281 		vm_page_unlock_queues();
1282 	}
1283 
1284 	if (flags & KMA_KOBJECT) {
1285 		/* vm_page_insert_wired() handles regular objects already */
1286 		vm_tag_update_size(tag, size);
1287 	}
1288 
1289 #if KASAN
1290 	if (flags & KMA_COMPRESSOR) {
1291 		kasan_notify_address_nopoison(addr, size);
1292 	} else {
1293 		kasan_notify_address(addr, size);
1294 	}
1295 #endif
1296 }
1297 
1298 
1299 kern_return_t
1300 kernel_memory_populate(
1301 	vm_offset_t     addr,
1302 	vm_size_t       size,
1303 	kma_flags_t     flags,
1304 	vm_tag_t        tag)
1305 {
1306 	kern_return_t   kr = KERN_SUCCESS;
1307 	vm_page_t       page_list = NULL;
1308 	vm_size_t       page_count = atop_64(size);
1309 	vm_object_t     object = __kmem_object(ANYF(flags));
1310 
1311 #if DEBUG || DEVELOPMENT
1312 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
1313 	    size, 0, 0, 0);
1314 #endif
1315 
1316 	kr = vm_page_alloc_list(page_count, flags, &page_list);
1317 	if (kr == KERN_SUCCESS) {
1318 		vm_object_lock(object);
1319 		kernel_memory_populate_object_and_unlock(object, addr,
1320 		    addr, size, page_list, flags, tag, VM_PROT_DEFAULT);
1321 	}
1322 
1323 #if DEBUG || DEVELOPMENT
1324 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
1325 	    page_count, 0, 0, 0);
1326 #endif
1327 	return kr;
1328 }
1329 
1330 void
1331 kernel_memory_depopulate(
1332 	vm_offset_t        addr,
1333 	vm_size_t          size,
1334 	kma_flags_t        flags,
1335 	vm_tag_t           tag)
1336 {
1337 	vm_object_t        object = __kmem_object(ANYF(flags));
1338 	vm_object_offset_t offset = addr;
1339 	vm_page_t          mem;
1340 	vm_page_t          local_freeq = NULL;
1341 	unsigned int       pages_unwired = 0;
1342 
1343 	vm_object_lock(object);
1344 
1345 	pmap_protect(kernel_pmap, offset, offset + size, VM_PROT_NONE);
1346 
1347 	for (vm_object_offset_t pg_offset = 0;
1348 	    pg_offset < size;
1349 	    pg_offset += PAGE_SIZE_64) {
1350 		mem = vm_page_lookup(object, offset + pg_offset);
1351 
1352 		assert(mem);
1353 
1354 		if (flags & KMA_COMPRESSOR) {
1355 			assert(mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
1356 		} else {
1357 			assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
1358 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
1359 			pages_unwired++;
1360 		}
1361 
1362 		mem->vmp_busy = TRUE;
1363 
1364 		assert(mem->vmp_tabled);
1365 		vm_page_remove(mem, TRUE);
1366 		assert(mem->vmp_busy);
1367 
1368 		assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
1369 
1370 		mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
1371 		mem->vmp_snext = local_freeq;
1372 		local_freeq = mem;
1373 	}
1374 
1375 	vm_object_unlock(object);
1376 
1377 	vm_page_free_list(local_freeq, TRUE);
1378 
1379 	if (!(flags & KMA_COMPRESSOR)) {
1380 		vm_page_lockspin_queues();
1381 		vm_page_wire_count -= pages_unwired;
1382 		vm_page_unlock_queues();
1383 	}
1384 
1385 	if (flags & KMA_KOBJECT) {
1386 		/* vm_page_remove() handles regular objects already */
1387 		vm_tag_update_size(tag, -ptoa_64(pages_unwired));
1388 	}
1389 }
1390 
1391 #pragma mark reallocation
1392 
1393 __abortlike
1394 static void
1395 __kmem_realloc_invalid_object_size_panic(
1396 	vm_map_t                map,
1397 	vm_address_t            address,
1398 	vm_size_t               size,
1399 	vm_map_entry_t          entry,
1400 	vm_object_t             object)
1401 {
1402 	panic("kmem_realloc(map=%p, addr=%p, size=%zd, entry=%p): "
1403 	    "object %p has unexpected size %lld",
1404 	    map, (void *)address, (size_t)size, entry, object, object->vo_size);
1405 }
1406 
1407 static kmem_return_t
1408 kmem_realloc_shrink_guard(
1409 	vm_map_t                map,
1410 	vm_offset_t             oldaddr,
1411 	vm_size_t               oldsize,
1412 	vm_size_t               newsize,
1413 	kmr_flags_t             flags,
1414 	kmem_guard_t            guard,
1415 	vm_map_entry_t          entry)
1416 {
1417 	vm_object_t             object;
1418 	kmem_return_t           kmr = { .kmr_address = oldaddr };
1419 	bool                    was_atomic;
1420 
1421 	vm_map_lock_assert_exclusive(map);
1422 
1423 	if ((flags & KMR_KOBJECT) == 0) {
1424 		object = VME_OBJECT(entry);
1425 		vm_object_reference(object);
1426 	}
1427 
1428 	/*
1429 	 *	Shrinking an atomic entry starts with splitting it,
1430 	 *	and removing the second half.
1431 	 */
1432 	was_atomic = entry->vme_atomic;
1433 	entry->vme_atomic = false;
1434 	vm_map_clip_end(map, entry, entry->vme_start + newsize);
1435 	entry->vme_atomic = was_atomic;
1436 
1437 	(void)vm_map_remove_and_unlock(map,
1438 	    oldaddr + newsize, oldaddr + oldsize,
1439 	    VM_MAP_REMOVE_KUNWIRE, KMEM_GUARD_NONE);
1440 
1441 
1442 	/*
1443 	 *	Lastly, if there are guard pages, deal with them.
1444 	 *
1445 	 *	The kernel object just needs to depopulate,
1446 	 *	regular objects require freeing the last page
1447 	 *	and replacing it with a guard.
1448 	 */
1449 	if (flags & KMR_KOBJECT) {
1450 		if (flags & KMR_GUARD_LAST) {
1451 			kernel_memory_depopulate(oldaddr + newsize - PAGE_SIZE,
1452 			    PAGE_SIZE, KMA_KOBJECT, guard.kmg_tag);
1453 		}
1454 	} else {
1455 		vm_page_t guard_right = VM_PAGE_NULL;
1456 		vm_offset_t remove_start = newsize;
1457 
1458 		if (flags & KMR_GUARD_LAST) {
1459 			if (!map->never_faults) {
1460 				guard_right = vm_page_grab_guard(true);
1461 			}
1462 			remove_start -= PAGE_SIZE;
1463 		}
1464 
1465 		vm_object_lock(object);
1466 
1467 		if (object->vo_size != oldsize) {
1468 			__kmem_realloc_invalid_object_size_panic(map,
1469 			    oldaddr, oldsize, entry, object);
1470 		}
1471 		object->vo_size = newsize;
1472 
1473 		vm_object_page_remove(object, remove_start, oldsize);
1474 
1475 		if (guard_right) {
1476 			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
1477 			guard_right->vmp_busy = false;
1478 		}
1479 		vm_object_unlock(object);
1480 		vm_object_deallocate(object);
1481 	}
1482 
1483 	return kmr;
1484 }
1485 
1486 kmem_return_t
1487 kmem_realloc_guard(
1488 	vm_map_t                map,
1489 	vm_offset_t             oldaddr,
1490 	vm_size_t               oldsize,
1491 	vm_size_t               newsize,
1492 	kmr_flags_t             flags,
1493 	kmem_guard_t            guard)
1494 {
1495 	vm_object_t             object;
1496 	vm_map_offset_t         newaddr;
1497 	vm_object_offset_t      newoffs;
1498 	vm_map_entry_t          oldentry;
1499 	vm_map_entry_t          newentry;
1500 	vm_page_t               page_list = NULL;
1501 	bool                    needs_wakeup = false;
1502 	kmem_return_t           kmr = { };
1503 	unsigned int            last_timestamp;
1504 	vm_map_kernel_flags_t   vmk_flags = {
1505 		.vmkf_last_free = (bool)(flags & KMR_LAST_FREE),
1506 	};
1507 
1508 	assert(KMEM_REALLOC_FLAGS_VALID(flags));
1509 	if (!guard.kmg_atomic && (flags & (KMR_DATA | KMR_KOBJECT)) != KMR_DATA) {
1510 		__kmem_invalid_arguments_panic("realloc", map, oldaddr,
1511 		    oldsize, flags);
1512 	}
1513 
1514 	if (oldaddr == 0ul) {
1515 		return kmem_alloc_guard(map, newsize, 0, (kma_flags_t)flags, guard);
1516 	}
1517 
1518 	if (newsize == 0ul) {
1519 		kmem_free_guard(map, oldaddr, oldsize, KMF_NONE, guard);
1520 		return kmr;
1521 	}
1522 
1523 	if (newsize >> VM_KERNEL_POINTER_SIGNIFICANT_BITS) {
1524 		__kmem_invalid_size_panic(map, newsize, flags);
1525 	}
1526 	if (newsize < __kmem_guard_size(ANYF(flags))) {
1527 		__kmem_invalid_size_panic(map, newsize, flags);
1528 	}
1529 
1530 	oldsize = round_page(oldsize);
1531 	newsize = round_page(newsize);
1532 
1533 	if (oldsize == newsize) {
1534 		kmr.kmr_address = oldaddr;
1535 		return kmr;
1536 	}
1537 
1538 	/*
1539 	 *	If we're growing the allocation,
1540 	 *	then reserve the pages we'll need,
1541 	 *	and find a spot for its new place.
1542 	 */
1543 	if (oldsize < newsize) {
1544 #if DEBUG || DEVELOPMENT
1545 		VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1546 		    VM_KERN_REQUEST, DBG_FUNC_START,
1547 		    newsize - oldsize, 0, 0, 0);
1548 #endif
1549 		kmr.kmr_return = vm_page_alloc_list(atop(newsize - oldsize),
1550 		    (kma_flags_t)flags, &page_list);
1551 		if (kmr.kmr_return == KERN_SUCCESS) {
1552 			kmem_apply_security_policy(map, (kma_flags_t)flags, guard,
1553 			    &vmk_flags, true);
1554 			kmr.kmr_return = vm_map_find_space(map, 0, newsize, 0,
1555 			    vmk_flags, &newentry);
1556 		}
1557 		if (__improbable(kmr.kmr_return != KERN_SUCCESS)) {
1558 			if (flags & KMR_REALLOCF) {
1559 				kmem_free_guard(map, oldaddr, oldsize,
1560 				    KMF_NONE, guard);
1561 			}
1562 			if (page_list) {
1563 				vm_page_free_list(page_list, FALSE);
1564 			}
1565 #if DEBUG || DEVELOPMENT
1566 			VM_DEBUG_CONSTANT_EVENT(vm_kern_request,
1567 			    VM_KERN_REQUEST, DBG_FUNC_END,
1568 			    0, 0, 0, 0);
1569 #endif
1570 			return kmr;
1571 		}
1572 
1573 		/* map is locked */
1574 	} else {
1575 		vm_map_lock(map);
1576 	}
1577 
1578 
1579 	/*
1580 	 *	Locate the entry:
1581 	 *	- wait for it to quiesce.
1582 	 *	- validate its guard,
1583 	 *	- learn its correct tag,
1584 	 */
1585 again:
1586 	if (!vm_map_lookup_entry(map, oldaddr, &oldentry)) {
1587 		__kmem_entry_not_found_panic(map, oldaddr);
1588 	}
1589 	if ((flags & KMR_KOBJECT) && oldentry->in_transition) {
1590 		oldentry->needs_wakeup = true;
1591 		vm_map_entry_wait(map, THREAD_UNINT);
1592 		goto again;
1593 	}
1594 	kmem_entry_validate_guard(map, oldentry, oldaddr, oldsize, guard);
1595 	if (!__kmem_entry_validate_object(oldentry, ANYF(flags))) {
1596 		__kmem_entry_validate_object_panic(map, oldentry, ANYF(flags));
1597 	}
1598 	/*
1599 	 *	TODO: We should validate for non-atomic entries that the range
1600 	 *	      we are acting on is what we expect here.
1601 	 */
1602 
1603 	guard.kmg_tag = VME_ALIAS(oldentry);
1604 
1605 	if (newsize < oldsize) {
1606 		return kmem_realloc_shrink_guard(map, oldaddr, oldsize, newsize,
1607 		           flags, guard, oldentry);
1608 	}
1609 
1610 	/*
1611 	 *	We are growing the entry
1612 	 *
1613 	 *	For regular objects we use the object `vo_size` updates
1614 	 *	as a guarantee that no 2 kmem_realloc() can happen
1615 	 *	concurrently (by doing it before the map is unlocked).
1616 	 *
1617 	 *	For the kernel object, prevent the entry from being
1618 	 *	reallocated or changed by marking it "in_transition".
1619 	 */
1620 
1621 	object = VME_OBJECT(oldentry);
1622 	vm_object_lock(object);
1623 	vm_object_reference_locked(object);
1624 
1625 	newaddr = newentry->vme_start;
1626 	newoffs = oldsize;
1627 
1628 	VME_OBJECT_SET(newentry, object, guard.kmg_atomic, guard.kmg_context);
1629 	VME_ALIAS_SET(newentry, guard.kmg_tag);
1630 	if (flags & KMR_KOBJECT) {
1631 		oldentry->in_transition = true;
1632 		VME_OFFSET_SET(newentry, newaddr);
1633 		newentry->wired_count = 1;
1634 		newoffs = newaddr + oldsize;
1635 	} else {
1636 		if (object->vo_size != oldsize) {
1637 			__kmem_realloc_invalid_object_size_panic(map,
1638 			    oldaddr, oldsize, oldentry, object);
1639 		}
1640 		object->vo_size = newsize;
1641 	}
1642 
1643 	last_timestamp = map->timestamp;
1644 	vm_map_unlock(map);
1645 
1646 
1647 	/*
1648 	 *	Now proceed with the population of pages.
1649 	 *
1650 	 *	Kernel objects can use the kmem population helpers.
1651 	 *
1652 	 *	Regular objects will insert pages manually,
1653 	 *	then wire the memory into the new range.
1654 	 */
1655 
1656 	vm_size_t guard_right_size = __kmem_guard_right(ANYF(flags));
1657 
1658 	if (flags & KMR_KOBJECT) {
1659 		assert(flags & KMR_FREEOLD);
1660 
1661 		pmap_protect(kernel_pmap,
1662 		    oldaddr, oldaddr + oldsize - guard_right_size,
1663 		    VM_PROT_NONE);
1664 
1665 		for (vm_object_offset_t offset = 0;
1666 		    offset < oldsize - guard_right_size;
1667 		    offset += PAGE_SIZE_64) {
1668 			vm_page_t mem;
1669 
1670 			mem = vm_page_lookup(object, oldaddr + offset);
1671 			if (mem == VM_PAGE_NULL) {
1672 				continue;
1673 			}
1674 
1675 			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
1676 
1677 			mem->vmp_busy = true;
1678 			vm_page_remove(mem, true);
1679 			vm_page_insert_wired(mem, object, newaddr + offset,
1680 			    guard.kmg_tag);
1681 			mem->vmp_busy = false;
1682 
1683 			kernel_memory_populate_pmap_enter(object, newaddr,
1684 			    offset, mem, VM_PROT_DEFAULT, 0);
1685 		}
1686 
1687 		kernel_memory_populate_object_and_unlock(object,
1688 		    newaddr + oldsize - guard_right_size,
1689 		    newoffs - guard_right_size,
1690 		    newsize - oldsize,
1691 		    page_list, (kma_flags_t)flags,
1692 		    guard.kmg_tag, VM_PROT_DEFAULT);
1693 	} else {
1694 		vm_page_t guard_right = VM_PAGE_NULL;
1695 		kern_return_t kr;
1696 
1697 		/*
1698 		 *	Note: we are borrowing the new entry reference
1699 		 *	on the object for the duration of this code,
1700 		 *	which works because we keep the object locked
1701 		 *	throughout.
1702 		 */
1703 		if ((flags & KMR_GUARD_LAST) && !map->never_faults) {
1704 			guard_right = vm_page_lookup(object, oldsize - PAGE_SIZE);
1705 			assert(guard_right->vmp_fictitious);
1706 			guard_right->vmp_busy = true;
1707 			vm_page_remove(guard_right, true);
1708 		}
1709 
1710 		for (vm_object_offset_t offset = oldsize - guard_right_size;
1711 		    offset < newsize - guard_right_size;
1712 		    offset += PAGE_SIZE_64) {
1713 			vm_page_t mem = page_list;
1714 
1715 			page_list = mem->vmp_snext;
1716 			mem->vmp_snext = VM_PAGE_NULL;
1717 
1718 			vm_page_insert(mem, object, offset);
1719 			mem->vmp_busy = false;
1720 		}
1721 
1722 		if (guard_right) {
1723 			vm_page_insert(guard_right, object, newsize - PAGE_SIZE);
1724 			guard_right->vmp_busy = false;
1725 		}
1726 
1727 		vm_object_unlock(object);
1728 
1729 		kr = vm_map_wire_kernel(map, newaddr, newaddr + newsize,
1730 		    VM_PROT_DEFAULT, guard.kmg_tag, FALSE);
1731 		assert(kr == KERN_SUCCESS);
1732 	}
1733 
1734 #if KASAN
1735 	kasan_notify_address(newaddr, newsize);
1736 #endif
1737 
1738 
1739 	/*
1740 	 *	Mark the entry as idle again,
1741 	 *	and honor KMR_FREEOLD if needed.
1742 	 */
1743 
1744 	vm_map_lock(map);
1745 	if (last_timestamp + 1 != map->timestamp &&
1746 	    !vm_map_lookup_entry(map, oldaddr, &oldentry)) {
1747 		__kmem_entry_not_found_panic(map, oldaddr);
1748 	}
1749 
1750 	if (flags & KMR_KOBJECT) {
1751 		assert(oldentry->in_transition);
1752 		oldentry->in_transition = false;
1753 		if (oldentry->needs_wakeup) {
1754 			needs_wakeup = true;
1755 			oldentry->needs_wakeup = false;
1756 		}
1757 	}
1758 
1759 	if (flags & KMR_FREEOLD) {
1760 		(void)vm_map_remove_and_unlock(map,
1761 		    oldaddr, oldaddr + oldsize,
1762 		    VM_MAP_REMOVE_KUNWIRE, guard);
1763 	} else {
1764 		vm_map_unlock(map);
1765 	}
1766 
1767 	if (needs_wakeup) {
1768 		vm_map_entry_wakeup(map);
1769 	}
1770 
1771 
1772 #if DEBUG || DEVELOPMENT
1773 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
1774 	    atop(newsize - oldsize), 0, 0, 0);
1775 #endif
1776 	kmr.kmr_address = newaddr;
1777 	return kmr;
1778 }
1779 
1780 
1781 #pragma mark free
1782 
1783 vm_size_t
1784 kmem_free_guard(
1785 	vm_map_t        map,
1786 	vm_offset_t     addr,
1787 	vm_size_t       size,
1788 	kmf_flags_t     flags,
1789 	kmem_guard_t    guard)
1790 {
1791 	vmr_flags_t vmr_flags = VM_MAP_REMOVE_KUNWIRE;
1792 
1793 	assert(addr >= VM_MIN_KERNEL_AND_KEXT_ADDRESS);
1794 	assert(map->pmap == kernel_pmap);
1795 
1796 	if (flags & KMF_GUESS_SIZE) {
1797 		vmr_flags |= VM_MAP_REMOVE_GUESS_SIZE;
1798 		size = PAGE_SIZE;
1799 	} else if (size == 0) {
1800 		__kmem_invalid_size_panic(map, size, flags);
1801 	} else {
1802 		size = round_page(size);
1803 	}
1804 
1805 	return vm_map_remove_guard(map, addr, addr + size,
1806 	           vmr_flags, guard).kmr_size;
1807 }
1808 
1809 __exported void
1810 kmem_free_external(
1811 	vm_map_t        map,
1812 	vm_offset_t     addr,
1813 	vm_size_t       size);
1814 void
1815 kmem_free_external(
1816 	vm_map_t        map,
1817 	vm_offset_t     addr,
1818 	vm_size_t       size)
1819 {
1820 	if (size) {
1821 		kmem_free(map, trunc_page(addr), size);
1822 #if MACH_ASSERT
1823 	} else {
1824 		printf("kmem_free(map=%p, addr=%p) called with size=0, lr: %p\n",
1825 		    map, (void *)addr, __builtin_return_address(0));
1826 #endif
1827 	}
1828 }
1829 
1830 
1831 #pragma mark kmem init
1832 
1833 /*
1834  * The default percentage of memory that can be mlocked is scaled based on the total
1835  * amount of memory in the system. These percentages are calculated
1836  * offline and stored in this table. We index this table by
1837  * log2(max_mem) - VM_USER_WIREABLE_MIN_CONFIG. We clamp this index in the range
1838  * [0, sizeof(wire_limit_percents) / sizeof(vm_map_size_t))
1839  *
1840  * Note that these values were picked for mac.
1841  * If we ever have very large memory config arm devices, we may want to revisit
1842  * since the kernel overhead is smaller there due to the larger page size.
1843  */
1844 
1845 /* Start scaling iff we're managing > 2^32 = 4GB of RAM. */
1846 #define VM_USER_WIREABLE_MIN_CONFIG 32
1847 #if CONFIG_JETSAM
1848 /* Systems with jetsam can wire a bit more b/c the system can relieve wired
1849  * pressure.
1850  */
1851 static vm_map_size_t wire_limit_percents[] =
1852 { 80, 80, 80, 80, 82, 85, 88, 91, 94, 97};
1853 #else
1854 static vm_map_size_t wire_limit_percents[] =
1855 { 70, 73, 76, 79, 82, 85, 88, 91, 94, 97};
1856 #endif /* CONFIG_JETSAM */
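/*
 * Worked example (illustrative, assuming bit_floor() below yields
 * floor(log2(config_memsize))): a 16GB config gives 34 - 32 = 2 as the
 * table index, i.e. a default mlock() ceiling of 76% of memory (80% with
 * CONFIG_JETSAM), before the VM_NOT_USER_WIREABLE_MAX cap is applied in
 * kmem_set_user_wire_limits().
 */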
1857 
1858 /*
1859  * Sets the default global user wire limit which limits the amount of
1860  * memory that can be locked via mlock() based on the above algorithm.
1861  * This can be overridden via a sysctl.
1862  */
1863 static void
1864 kmem_set_user_wire_limits(void)
1865 {
1866 	uint64_t available_mem_log;
1867 	uint64_t max_wire_percent;
1868 	size_t wire_limit_percents_length = sizeof(wire_limit_percents) /
1869 	    sizeof(vm_map_size_t);
1870 	vm_map_size_t limit;
1871 	uint64_t config_memsize = max_mem;
1872 #if defined(XNU_TARGET_OS_OSX)
1873 	config_memsize = max_mem_actual;
1874 #endif /* defined(XNU_TARGET_OS_OSX) */
1875 
1876 	available_mem_log = bit_floor(config_memsize);
1877 
1878 	if (available_mem_log < VM_USER_WIREABLE_MIN_CONFIG) {
1879 		available_mem_log = 0;
1880 	} else {
1881 		available_mem_log -= VM_USER_WIREABLE_MIN_CONFIG;
1882 	}
1883 	if (available_mem_log >= wire_limit_percents_length) {
1884 		available_mem_log = wire_limit_percents_length - 1;
1885 	}
1886 	max_wire_percent = wire_limit_percents[available_mem_log];
1887 
1888 	limit = config_memsize * max_wire_percent / 100;
1889 	/* Cap the number of non lockable bytes at VM_NOT_USER_WIREABLE_MAX */
1890 	if (config_memsize - limit > VM_NOT_USER_WIREABLE_MAX) {
1891 		limit = config_memsize - VM_NOT_USER_WIREABLE_MAX;
1892 	}
1893 
1894 	vm_global_user_wire_limit = limit;
1895 	/* the default per task limit is the same as the global limit */
1896 	vm_per_task_user_wire_limit = limit;
1897 	vm_add_wire_count_over_global_limit = 0;
1898 	vm_add_wire_count_over_user_limit = 0;
1899 }
1900 
1901 #define KMEM_MAX_CLAIMS 50
1902 __startup_data
1903 struct kmem_range_startup_spec kmem_claims[KMEM_MAX_CLAIMS] = {};
1904 __startup_data
1905 uint32_t kmem_claim_count = 0;
1906 
1907 __startup_func
1908 void
1909 kmem_range_startup_init(
1910 	struct kmem_range_startup_spec *sp)
1911 {
1912 	assert(kmem_claim_count < KMEM_MAX_CLAIMS - KMEM_RANGE_COUNT);
1913 	if (sp->kc_calculate_sz) {
1914 		sp->kc_size = (sp->kc_calculate_sz)();
1915 	}
1916 	if (sp->kc_size) {
1917 		kmem_claims[kmem_claim_count] = *sp;
1918 		kmem_claim_count++;
1919 	}
1920 }
1921 
1922 #if ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
1923 static vm_offset_t
1924 kmem_fuzz_start(void)
1925 {
1926 	vm_offset_t kmapoff_kaddr = 0;
1927 	uint32_t kmapoff_pgcnt = (early_random() & 0x1ff) + 1; /* 9 bits */
1928 	vm_map_size_t kmapoff_size = ptoa(kmapoff_pgcnt);
1929 
1930 	kmem_alloc(kernel_map, &kmapoff_kaddr, kmapoff_size,
1931 	    KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_VAONLY,
1932 	    VM_KERN_MEMORY_OSFMK);
1933 	return kmapoff_kaddr + kmapoff_size;
1934 }
1935 
1936 /*
1937  * Returns a 16-bit random number between 0 and
1938  * upper_limit (inclusive)
1939  */
1940 __startup_func
1941 uint16_t
1942 kmem_get_random16(
1943 	uint16_t        upper_limit)
1944 {
1945 	static uint64_t random_entropy;
1946 	assert(upper_limit < UINT16_MAX);
1947 	if (random_entropy == 0) {
1948 		random_entropy = early_random();
1949 	}
1950 	uint32_t result = random_entropy & UINT32_MAX;
1951 	random_entropy >>= 32;
1952 	return (uint16_t)(result % (upper_limit + 1));
1953 }
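/*
 * Illustrative note: the cached entropy is consumed 32 bits per call and
 * refilled from early_random() once exhausted, so e.g. kmem_get_random16(7)
 * returns a value in [0, 7] without requiring a fresh early_random() draw
 * on every call.
 */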
1954 
1955 /*
1956  * Generate a randomly shuffled array of indices from 0 to count - 1
1957  */
1958 __startup_func
1959 void
1960 kmem_shuffle(
1961 	uint16_t       *shuffle_buf,
1962 	uint16_t        count)
1963 {
1964 	for (uint16_t i = 0; i < count; i++) {
1965 		uint16_t j = kmem_get_random16(i);
1966 		if (j != i) {
1967 			shuffle_buf[i] = shuffle_buf[j];
1968 		}
1969 		shuffle_buf[j] = i;
1970 	}
1971 }
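/*
 * The loop above is the "inside-out" Fisher-Yates construction: iteration i
 * writes value i into a random slot j in [0, i] and moves the previous
 * occupant of j up to slot i, yielding a permutation of 0..count-1 that is
 * uniform to the extent kmem_get_random16() is. Illustrative use (sketch):
 *
 *	uint16_t order[KMEM_MAX_CLAIMS];
 *	kmem_shuffle(order, 8);	// order[0..7] now holds 0..7 in random order
 */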
1972 
1973 __startup_func
1974 static void
1975 kmem_shuffle_claims(void)
1976 {
1977 	uint16_t shuffle_buf[KMEM_MAX_CLAIMS] = {};
1978 	uint16_t limit = (uint16_t)kmem_claim_count;
1979 
1980 	kmem_shuffle(&shuffle_buf[0], limit);
1981 	for (uint16_t i = 0; i < limit; i++) {
1982 		struct kmem_range_startup_spec tmp = kmem_claims[i];
1983 		kmem_claims[i] = kmem_claims[shuffle_buf[i]];
1984 		kmem_claims[shuffle_buf[i]] = tmp;
1985 	}
1986 }
1987 __startup_func
1988 static void
1989 kmem_readjust_ranges(
1990 	uint32_t        cur_idx)
1991 {
1992 	assert(cur_idx != 0);
1993 	uint32_t j = cur_idx - 1, random;
1994 	struct kmem_range_startup_spec sp = kmem_claims[cur_idx];
1995 	struct mach_vm_range *sp_range = sp.kc_range;
1996 
1997 	/*
1998 	 * Find max index where restriction is met
1999 	 */
2000 	for (; j > 0; j--) {
2001 		struct kmem_range_startup_spec spj = kmem_claims[j];
2002 		vm_map_offset_t max_start = spj.kc_range->min_address;
2003 		if (spj.kc_flags & KC_NO_MOVE) {
2004 			panic("kmem_range_init: Can't scramble with multiple constraints");
2005 		}
2006 		if (max_start <= sp_range->min_address) {
2007 			break;
2008 		}
2009 	}
2010 
2011 	/*
2012 	 * Pick a random index from 0 to max index and shift claims to the right
2013 	 * to make room for restricted claim
2014 	 */
2015 	random = kmem_get_random16((uint16_t)j);
2016 	assert(random <= j);
2017 
2018 	sp_range->min_address = kmem_claims[random].kc_range->min_address;
2019 	sp_range->max_address = sp_range->min_address + sp.kc_size;
2020 
2021 	for (j = cur_idx - 1; j >= random && j != UINT32_MAX; j--) {
2022 		struct kmem_range_startup_spec spj = kmem_claims[j];
2023 		struct mach_vm_range *range = spj.kc_range;
2024 		range->min_address += sp.kc_size;
2025 		range->max_address += sp.kc_size;
2026 		kmem_claims[j + 1] = spj;
2027 	}
2028 
2029 	sp.kc_flags = KC_NO_MOVE;
2030 	kmem_claims[random] = sp;
2031 }
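/*
 * Illustrative sketch of the readjustment above: when the current claim's
 * address restriction is violated, it is moved into a random earlier slot
 * whose assigned start address still satisfies the restriction, and every
 * claim between that slot and cur_idx is shifted up by the claim's size so
 * the layout remains contiguous and non-overlapping.
 */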
2032 
2033 __startup_func
2034 static void
2035 kmem_add_extra_claims(void)
2036 {
2037 	vm_map_size_t largest_free_size = 0, total_claims = 0;
2038 
2039 	vm_map_sizes(kernel_map, NULL, NULL, &largest_free_size);
2040 	largest_free_size = trunc_page(largest_free_size);
2041 
2042 	/*
2043 	 * Determine size of data and pointer kmem_ranges
2044 	 */
2045 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
2046 		total_claims += kmem_claims[i].kc_size;
2047 	}
2048 	assert((total_claims & PAGE_MASK) == 0);
2049 	largest_free_size -= total_claims;
2050 
2051 	/*
2052 	 * kasan and configs w/o *TRR need to have just one ptr range due to
2053 	 * resource constraints.
2054 	 */
2055 #if !ZSECURITY_CONFIG(KERNEL_PTR_SPLIT)
2056 	kmem_ptr_ranges = 1;
2057 #endif
2058 
2059 	ptr_range_size = round_page(largest_free_size /
2060 	    (kmem_ptr_ranges * 3));
2061 	data_range_size = largest_free_size -
2062 	    (ptr_range_size * kmem_ptr_ranges);
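	/*
	 * Illustrative arithmetic: the pointer ranges collectively take about a
	 * third of the largest free region (ptr_range_size each, kmem_ptr_ranges
	 * of them), leaving roughly the remaining two thirds for the single data
	 * range.
	 */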
2063 
2064 
2065 	/*
2066 	 * Add claims for data and pointer
2067 	 */
2068 	struct kmem_range_startup_spec kmem_spec_data = {
2069 		.kc_name = "kmem_data_range",
2070 		.kc_range = &kmem_ranges[KMEM_RANGE_ID_DATA],
2071 		.kc_size = data_range_size,
2072 		.kc_flags = KC_NO_ENTRY,
2073 	};
2074 	kmem_claims[kmem_claim_count++] = kmem_spec_data;
2075 
2076 	for (uint32_t i = 0; i < kmem_ptr_ranges; i++) {
2077 		struct kmem_range_startup_spec kmem_spec_ptr = {
2078 			.kc_name = "kmem_ptr_range",
2079 			.kc_range = &kmem_ranges[KMEM_RANGE_ID_PTR_0 + i],
2080 			.kc_size = ptr_range_size,
2081 			.kc_flags = KC_NO_ENTRY,
2082 		};
2083 		kmem_claims[kmem_claim_count++] = kmem_spec_ptr;
2084 	}
2085 }
2086 
2087 __startup_func
2088 static void
2089 kmem_scramble_ranges(void)
2090 {
2091 	vm_map_offset_t start = 0;
2092 
2093 	/*
2094 	 * Initialize KMEM_RANGE_ID_NONE range to use the entire map so that
2095 	 * the vm can find the requested ranges.
2096 	 */
2097 	kmem_ranges[KMEM_RANGE_ID_NONE].min_address = MAX(kernel_map->min_offset,
2098 	    VM_MAP_PAGE_SIZE(kernel_map));
2099 	kmem_ranges[KMEM_RANGE_ID_NONE].max_address = kernel_map->max_offset;
2100 
2101 	/*
2102 	 * Allocate the g_kext_map prior to randomizing the remaining submaps, as
2103 	 * this map is 2G in size and starts at the end of kernel_text on x86; it
2104 	 * could otherwise overflow into the heap.
2105 	 */
2106 	kext_alloc_init();
2107 
2108 	/*
2109 	 * Eat a random amount of kernel_map to fuzz subsequent heap, zone and
2110 	 * stack addresses. (With a 4K page and 9 bits of randomness, this
2111 	 * eats about 2M of VA from the map)
2112 	 *
2113 	 * Note that we always need to slide by at least one page because the VM
2114 	 * pointer packing schemes using KERNEL_PMAP_HEAP_RANGE_START as a base
2115 	 * do not admit this address to be part of any zone submap.
2116 	 */
2117 	start = kmem_fuzz_start();
2118 
2119 	/*
2120 	 * Add claims for ptr and data kmem_ranges
2121 	 */
2122 	kmem_add_extra_claims();
2123 
2124 	/*
2125 	 * Shuffle registered claims
2126 	 */
2127 	assert(kmem_claim_count < UINT16_MAX);
2128 	kmem_shuffle_claims();
2129 
2130 	/*
2131 	 * Apply restrictions and determine range for each claim
2132 	 */
2133 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
2134 		vm_map_offset_t end = 0;
2135 		struct kmem_range_startup_spec sp = kmem_claims[i];
2136 		struct mach_vm_range *sp_range = sp.kc_range;
2137 		if (vm_map_locate_space(kernel_map, sp.kc_size, 0,
2138 		    VM_MAP_KERNEL_FLAGS_NONE, &start, NULL) != KERN_SUCCESS) {
2139 			panic("kmem_range_init: vm_map_locate_space failing for claim %s",
2140 			    sp.kc_name);
2141 		}
2142 
2143 		end = start + sp.kc_size;
2144 		/*
2145 		 * Re-adjust ranges if restriction not met
2146 		 */
2147 		if (sp_range->min_address && start > sp_range->min_address) {
2148 			kmem_readjust_ranges(i);
2149 		} else {
2150 			sp_range->min_address = start;
2151 			sp_range->max_address = end;
2152 		}
2153 		start = end;
2154 	}
2155 
2156 	/*
2157 	 * We have settled on the ranges, now create temporary entries for the
2158 	 * claims
2159 	 */
2160 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
2161 		struct kmem_range_startup_spec sp = kmem_claims[i];
2162 		vm_map_entry_t entry = NULL;
2163 		if (sp.kc_flags & KC_NO_ENTRY) {
2164 			continue;
2165 		}
2166 		if (vm_map_find_space(kernel_map, sp.kc_range->min_address, sp.kc_size, 0,
2167 		    VM_MAP_KERNEL_FLAGS_NONE, &entry) != KERN_SUCCESS) {
2168 			panic("kmem_range_init: vm_map_find_space failing for claim %s",
2169 			    sp.kc_name);
2170 		}
2171 		vm_object_reference(kernel_object);
2172 		VME_OBJECT_SET(entry, kernel_object, false, 0);
2173 		VME_OFFSET_SET(entry, entry->vme_start);
2174 		vm_map_unlock(kernel_map);
2175 	}
2176 	/*
2177 	 * Now that we are done assigning all the ranges, reset
2178 	 * kmem_ranges[KMEM_RANGE_ID_NONE]
2179 	 */
2180 	kmem_ranges[KMEM_RANGE_ID_NONE] = (struct mach_vm_range) {};
2181 
2182 #if DEBUG || DEVELOPMENT
2183 	for (uint32_t i = 0; i < kmem_claim_count; i++) {
2184 		struct kmem_range_startup_spec sp = kmem_claims[i];
2185 
2186 		printf("%-24s: %p - %p (%u%c)\n", sp.kc_name,
2187 		    (void *)sp.kc_range->min_address,
2188 		    (void *)sp.kc_range->max_address,
2189 		    mach_vm_size_pretty(sp.kc_size),
2190 		    mach_vm_size_unit(sp.kc_size));
2191 	}
2192 #endif /* DEBUG || DEVELOPMENT */
2193 }
2194 
2195 __startup_func
2196 static void
2197 kmem_range_init(void)
2198 {
2199 	kmem_scramble_ranges();
2200 
2201 	/* Initialize kmem_large_ranges. Skip 1/16th of the range size on either
2202 	 * side for ptr ranges, and 1/8th only from the left for data, as we use
2203 	 * a single front for data.
2204 	 */
2205 	vm_size_t range_adjustment = ptr_range_size >> 4;
2206 	for (kmem_range_id_t i = 0; i < kmem_ptr_ranges; i++) {
2207 		kmem_large_ranges[KMEM_RANGE_ID_PTR_0 + i].min_address =
2208 		    kmem_ranges[KMEM_RANGE_ID_PTR_0 + i].min_address + range_adjustment;
2209 		kmem_large_ranges[KMEM_RANGE_ID_PTR_0 + i].max_address =
2210 		    kmem_ranges[KMEM_RANGE_ID_PTR_0 + i].max_address - range_adjustment;
2211 	}
2212 	range_adjustment = data_range_size >> 3;
2213 	kmem_large_ranges[KMEM_RANGE_ID_DATA].min_address =
2214 	    kmem_ranges[KMEM_RANGE_ID_DATA].min_address + range_adjustment;
2215 	kmem_large_ranges[KMEM_RANGE_ID_DATA].max_address =
2216 	    kmem_ranges[KMEM_RANGE_ID_DATA].max_address;
2217 
2218 #if DEBUG || DEVELOPMENT
2219 	for (kmem_range_id_t i = 0; i < KMEM_RANGE_COUNT; i++) {
2220 		vm_size_t range_size = mach_vm_range_size(&kmem_large_ranges[i]);
2221 		printf("kmem_large_ranges[%d]    : %p - %p (%u%c)\n", i,
2222 		    (void *)kmem_large_ranges[i].min_address,
2223 		    (void *)kmem_large_ranges[i].max_address,
2224 		    mach_vm_size_pretty(range_size),
2225 		    mach_vm_size_unit(range_size));
2226 	}
2227 #endif
2228 }
2229 #else /* ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
2230 __startup_func
2231 static void
2232 kmem_range_init(void)
2233 {
2234 	for (kmem_range_id_t i = 0; i < KMEM_RANGE_COUNT; i++) {
2235 		kmem_ranges[i].min_address = kernel_map->min_offset;
2236 		kmem_ranges[i].max_address = kernel_map->max_offset;
2237 	}
2238 	kext_alloc_init();
2239 	kmem_fuzz_start();
2240 }
2241 #endif
2242 STARTUP(KMEM, STARTUP_RANK_THIRD, kmem_range_init);
2243 
2244 /*
2245  *	kmem_init:
2246  *
2247  *	Initialize the kernel's virtual memory map, taking
2248  *	into account all memory allocated up to this time.
2249  */
2250 __startup_func
2251 void
2252 kmem_init(
2253 	vm_offset_t     start,
2254 	vm_offset_t     end)
2255 {
2256 	vm_map_offset_t map_start;
2257 	vm_map_offset_t map_end;
2258 	vm_map_kernel_flags_t vmk_flags;
2259 
2260 	vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
2261 	vmk_flags.vmkf_permanent = TRUE;
2262 	vmk_flags.vmkf_no_pmap_check = TRUE;
2263 
2264 	map_start = vm_map_trunc_page(start,
2265 	    VM_MAP_PAGE_MASK(kernel_map));
2266 	map_end = vm_map_round_page(end,
2267 	    VM_MAP_PAGE_MASK(kernel_map));
2268 
2269 	vm_map_will_allocate_early_map(&kernel_map);
2270 #if defined(__arm64__)
2271 	kernel_map = vm_map_create_options(pmap_kernel(),
2272 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS,
2273 	    VM_MAX_KERNEL_ADDRESS,
2274 	    VM_MAP_CREATE_DEFAULT);
2275 	/*
2276 	 *	Reserve virtual memory allocated up to this time.
2277 	 */
2278 	{
2279 		unsigned int    region_select = 0;
2280 		vm_map_offset_t region_start;
2281 		vm_map_size_t   region_size;
2282 		vm_map_offset_t map_addr;
2283 		kern_return_t kr;
2284 
2285 		while (pmap_virtual_region(region_select, &region_start, &region_size)) {
2286 			map_addr = region_start;
2287 			kr = vm_map_enter(kernel_map, &map_addr,
2288 			    vm_map_round_page(region_size,
2289 			    VM_MAP_PAGE_MASK(kernel_map)),
2290 			    (vm_map_offset_t) 0,
2291 			    VM_FLAGS_FIXED,
2292 			    vmk_flags,
2293 			    VM_KERN_MEMORY_NONE,
2294 			    VM_OBJECT_NULL,
2295 			    (vm_object_offset_t) 0, FALSE, VM_PROT_NONE, VM_PROT_NONE,
2296 			    VM_INHERIT_DEFAULT);
2297 
2298 			if (kr != KERN_SUCCESS) {
2299 				panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
2300 				    (uint64_t) start, (uint64_t) end, (uint64_t) region_start,
2301 				    (uint64_t) region_size, kr);
2302 			}
2303 
2304 			region_select++;
2305 		}
2306 	}
2307 #else
2308 	kernel_map = vm_map_create_options(pmap_kernel(),
2309 	    VM_MIN_KERNEL_AND_KEXT_ADDRESS, map_end,
2310 	    VM_MAP_CREATE_DEFAULT);
2311 	/*
2312 	 *	Reserve virtual memory allocated up to this time.
2313 	 */
2314 	if (start != VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
2315 		vm_map_offset_t map_addr;
2316 		kern_return_t kr;
2317 
2318 		vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
2319 		vmk_flags.vmkf_no_pmap_check = TRUE;
2320 
2321 		map_addr = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
2322 		kr = vm_map_enter(kernel_map,
2323 		    &map_addr,
2324 		    (vm_map_size_t)(map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
2325 		    (vm_map_offset_t) 0,
2326 		    VM_FLAGS_FIXED,
2327 		    vmk_flags,
2328 		    VM_KERN_MEMORY_NONE,
2329 		    VM_OBJECT_NULL,
2330 		    (vm_object_offset_t) 0, FALSE,
2331 		    VM_PROT_NONE, VM_PROT_NONE,
2332 		    VM_INHERIT_DEFAULT);
2333 
2334 		if (kr != KERN_SUCCESS) {
2335 			panic("kmem_init(0x%llx,0x%llx): vm_map_enter(0x%llx,0x%llx) error 0x%x",
2336 			    (uint64_t) start, (uint64_t) end,
2337 			    (uint64_t) VM_MIN_KERNEL_AND_KEXT_ADDRESS,
2338 			    (uint64_t) (map_start - VM_MIN_KERNEL_AND_KEXT_ADDRESS),
2339 			    kr);
2340 		}
2341 	}
2342 #endif
2343 
2344 	kmem_set_user_wire_limits();
2345 }
2346 
2347 
2348 #pragma mark map copyio
2349 
2350 /*
2351  *	Routine:	copyinmap
2352  *	Purpose:
2353  *		Like copyin, except that fromaddr is an address
2354  *		in the specified VM map.  This implementation
2355  *		is incomplete; it handles the current user map
2356  *		and the kernel map/submaps.
2357  */
2358 kern_return_t
2359 copyinmap(
2360 	vm_map_t                map,
2361 	vm_map_offset_t         fromaddr,
2362 	void                    *todata,
2363 	vm_size_t               length)
2364 {
2365 	kern_return_t   kr = KERN_SUCCESS;
2366 	vm_map_t oldmap;
2367 
2368 	if (vm_map_pmap(map) == pmap_kernel()) {
2369 		/* assume a correct copy */
2370 		memcpy(todata, CAST_DOWN(void *, fromaddr), length);
2371 	} else if (current_map() == map) {
2372 		if (copyin(fromaddr, todata, length) != 0) {
2373 			kr = KERN_INVALID_ADDRESS;
2374 		}
2375 	} else {
2376 		vm_map_reference(map);
2377 		oldmap = vm_map_switch(map);
2378 		if (copyin(fromaddr, todata, length) != 0) {
2379 			kr = KERN_INVALID_ADDRESS;
2380 		}
2381 		vm_map_switch(oldmap);
2382 		vm_map_deallocate(map);
2383 	}
2384 	return kr;
2385 }
2386 
2387 /*
2388  *	Routine:	copyoutmap
2389  *	Purpose:
2390  *		Like copyout, except that toaddr is an address
2391  *		in the specified VM map.
2392  */
2393 kern_return_t
2394 copyoutmap(
2395 	vm_map_t                map,
2396 	void                    *fromdata,
2397 	vm_map_address_t        toaddr,
2398 	vm_size_t               length)
2399 {
2400 	kern_return_t   kr = KERN_SUCCESS;
2401 	vm_map_t        oldmap;
2402 
2403 	if (vm_map_pmap(map) == pmap_kernel()) {
2404 		/* assume a correct copy */
2405 		memcpy(CAST_DOWN(void *, toaddr), fromdata, length);
2406 	} else if (current_map() == map) {
2407 		if (copyout(fromdata, toaddr, length) != 0) {
2408 			kr = KERN_INVALID_ADDRESS;
2409 		}
2410 	} else {
2411 		vm_map_reference(map);
2412 		oldmap = vm_map_switch(map);
2413 		if (copyout(fromdata, toaddr, length) != 0) {
2414 			kr = KERN_INVALID_ADDRESS;
2415 		}
2416 		vm_map_switch(oldmap);
2417 		vm_map_deallocate(map);
2418 	}
2419 	return kr;
2420 }
2421 
2422 /*
2423  *	Routine:	copyoutmap_atomic{32, 64}
2424  *	Purpose:
2425  *		Like copyoutmap, except that the operation is atomic.
2426  *      Takes a value rather than a *fromdata pointer.
2427  */
2428 kern_return_t
2429 copyoutmap_atomic32(
2430 	vm_map_t                map,
2431 	uint32_t                value,
2432 	vm_map_address_t        toaddr)
2433 {
2434 	kern_return_t   kr = KERN_SUCCESS;
2435 	vm_map_t        oldmap;
2436 
2437 	if (vm_map_pmap(map) == pmap_kernel()) {
2438 		/* assume a correct toaddr */
2439 		*(uint32_t *)toaddr = value;
2440 	} else if (current_map() == map) {
2441 		if (copyout_atomic32(value, toaddr) != 0) {
2442 			kr = KERN_INVALID_ADDRESS;
2443 		}
2444 	} else {
2445 		vm_map_reference(map);
2446 		oldmap = vm_map_switch(map);
2447 		if (copyout_atomic32(value, toaddr) != 0) {
2448 			kr = KERN_INVALID_ADDRESS;
2449 		}
2450 		vm_map_switch(oldmap);
2451 		vm_map_deallocate(map);
2452 	}
2453 	return kr;
2454 }
2455 
2456 kern_return_t
2457 copyoutmap_atomic64(
2458 	vm_map_t                map,
2459 	uint64_t                value,
2460 	vm_map_address_t        toaddr)
2461 {
2462 	kern_return_t   kr = KERN_SUCCESS;
2463 	vm_map_t        oldmap;
2464 
2465 	if (vm_map_pmap(map) == pmap_kernel()) {
2466 		/* assume a correct toaddr */
2467 		*(uint64_t *)toaddr = value;
2468 	} else if (current_map() == map) {
2469 		if (copyout_atomic64(value, toaddr) != 0) {
2470 			kr = KERN_INVALID_ADDRESS;
2471 		}
2472 	} else {
2473 		vm_map_reference(map);
2474 		oldmap = vm_map_switch(map);
2475 		if (copyout_atomic64(value, toaddr) != 0) {
2476 			kr = KERN_INVALID_ADDRESS;
2477 		}
2478 		vm_map_switch(oldmap);
2479 		vm_map_deallocate(map);
2480 	}
2481 	return kr;
2482 }
2483 
2484 
2485 #pragma mark pointer obfuscation / packing
2486 
2487 /*
2488  *
2489  *	The following two functions are to be used when exposing kernel
2490  *	addresses to userspace via any of the various debug or info
2491  *	facilities that exist. These are basically the same as VM_KERNEL_ADDRPERM()
2492  *	and VM_KERNEL_UNSLIDE_OR_PERM() except they use a different random seed and
2493  *	are exported to KEXTs.
2494  *
2495  *	NOTE: USE THE MACRO VERSIONS OF THESE FUNCTIONS (in vm_param.h) FROM WITHIN THE KERNEL
2496  */
2497 
2498 vm_offset_t
2499 vm_kernel_addrhash_internal(vm_offset_t addr, uint64_t salt)
2500 {
2501 	assert(salt != 0);
2502 
2503 	if (addr == 0) {
2504 		return 0ul;
2505 	}
2506 
2507 	if (VM_KERNEL_IS_SLID(addr)) {
2508 		return VM_KERNEL_UNSLIDE(addr);
2509 	}
2510 
2511 	vm_offset_t sha_digest[SHA256_DIGEST_LENGTH / sizeof(vm_offset_t)];
2512 	SHA256_CTX sha_ctx;
2513 
2514 	SHA256_Init(&sha_ctx);
2515 	SHA256_Update(&sha_ctx, &salt, sizeof(salt));
2516 	SHA256_Update(&sha_ctx, &addr, sizeof(addr));
2517 	SHA256_Final(sha_digest, &sha_ctx);
2518 
2519 	return sha_digest[0];
2520 }
2521 
2522 __exported vm_offset_t
2523 vm_kernel_addrhash_external(vm_offset_t addr);
2524 vm_offset_t
2525 vm_kernel_addrhash_external(vm_offset_t addr)
2526 {
2527 	return vm_kernel_addrhash_internal(addr, vm_kernel_addrhash_salt_ext);
2528 }
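/*
 * Illustrative usage sketch (hypothetical kext logging call):
 *
 *	printf("obj %p\n", (void *)vm_kernel_addrhash_external(addr));
 *
 * heap addresses are hashed with the external salt before leaving the
 * kernel, while slid text/data addresses are returned unslid instead.
 */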
2529 
2530 void
2531 vm_kernel_addrhide(
2532 	vm_offset_t addr,
2533 	vm_offset_t *hide_addr)
2534 {
2535 	*hide_addr = VM_KERNEL_ADDRHIDE(addr);
2536 }
2537 
2538 /*
2539  *	vm_kernel_addrperm_external:
2540  *	vm_kernel_unslide_or_perm_external:
2541  *
2542  *	Use these macros when exposing an address to userspace that could come from
2543  *	either kernel text/data *or* the heap.
2544  */
2545 void
2546 vm_kernel_addrperm_external(
2547 	vm_offset_t addr,
2548 	vm_offset_t *perm_addr)
2549 {
2550 	if (VM_KERNEL_IS_SLID(addr)) {
2551 		*perm_addr = VM_KERNEL_UNSLIDE(addr);
2552 	} else if (VM_KERNEL_ADDRESS(addr)) {
2553 		*perm_addr = addr + vm_kernel_addrperm_ext;
2554 	} else {
2555 		*perm_addr = addr;
2556 	}
2557 }
2558 
2559 void
2560 vm_kernel_unslide_or_perm_external(
2561 	vm_offset_t addr,
2562 	vm_offset_t *up_addr)
2563 {
2564 	vm_kernel_addrperm_external(addr, up_addr);
2565 }
2566 
2567 void
2568 vm_packing_pointer_invalid(vm_offset_t ptr, vm_packing_params_t params)
2569 {
2570 	if (ptr & ((1ul << params.vmpp_shift) - 1)) {
2571 		panic("pointer %p can't be packed: low %d bits aren't 0",
2572 		    (void *)ptr, params.vmpp_shift);
2573 	} else if (ptr <= params.vmpp_base) {
2574 		panic("pointer %p can't be packed: below base %p",
2575 		    (void *)ptr, (void *)params.vmpp_base);
2576 	} else {
2577 		panic("pointer %p can't be packed: maximum encodable pointer is %p",
2578 		    (void *)ptr, (void *)vm_packing_max_packable(params));
2579 	}
2580 }
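/*
 * Illustrative recap of the constraints checked above: a packable pointer
 * must be aligned to (1 << vmpp_shift), lie strictly above vmpp_base, and
 * not exceed vm_packing_max_packable(params); any other value lands in one
 * of the panics above.
 */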
2581 
2582 void
2583 vm_packing_verify_range(
2584 	const char *subsystem,
2585 	vm_offset_t min_address,
2586 	vm_offset_t max_address,
2587 	vm_packing_params_t params)
2588 {
2589 	if (min_address > max_address) {
2590 		panic("%s: %s range invalid min:%p > max:%p",
2591 		    __func__, subsystem, (void *)min_address, (void *)max_address);
2592 	}
2593 
2594 	if (!params.vmpp_base_relative) {
2595 		return;
2596 	}
2597 
2598 	if (min_address <= params.vmpp_base) {
2599 		panic("%s: %s range invalid min:%p <= base:%p",
2600 		    __func__, subsystem, (void *)min_address, (void *)params.vmpp_base);
2601 	}
2602 
2603 	if (max_address > vm_packing_max_packable(params)) {
2604 		panic("%s: %s range invalid max:%p >= max packable:%p",
2605 		    __func__, subsystem, (void *)max_address,
2606 		    (void *)vm_packing_max_packable(params));
2607 	}
2608 }
2609 
2610 #pragma mark tests
2611 #if DEBUG || DEVELOPMENT
2612 #include <sys/errno.h>
2613 
2614 static void
2615 kmem_test_for_entry(
2616 	vm_map_t                map,
2617 	vm_offset_t             addr,
2618 	void                  (^block)(vm_map_entry_t))
2619 {
2620 	vm_map_entry_t entry;
2621 
2622 	vm_map_lock(map);
2623 	block(vm_map_lookup_entry(map, addr, &entry) ? entry : NULL);
2624 	vm_map_unlock(map);
2625 }
2626 
2627 #define kmem_test_assert_map(map, pg, entries) ({ \
2628 	assert3u((map)->size, ==, ptoa(pg)); \
2629 	assert3u((map)->hdr.nentries, ==, entries); \
2630 })
2631 
2632 static bool
2633 can_write_at(vm_offset_t offs, uint32_t page)
2634 {
2635 	static const int zero;
2636 
2637 	return verify_write(&zero, (void *)(offs + ptoa(page) + 128), 1) == 0;
2638 }
2639 #define assert_writeable(offs, page) \
2640 	assertf(can_write_at(offs, page), \
2641 	    "can write at %p + ptoa(%d)", (void *)offs, page)
2642 
2643 #define assert_faults(offs, page) \
2644 	assertf(!can_write_at(offs, page), \
2645 	    "can write at %p + ptoa(%d)", (void *)offs, page)
2646 
2647 #define peek(offs, page) \
2648 	(*(uint32_t *)((offs) + ptoa(page)))
2649 
2650 #define poke(offs, page, v) \
2651 	(*(uint32_t *)((offs) + ptoa(page)) = (v))
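/*
 * Illustrative summary of the helpers above: can_write_at() probes a page
 * through verify_write(), which the tests rely on to report failure rather
 * than crash when the target page cannot be written, so assert_writeable()
 * and assert_faults() can distinguish mapped pages from guard or
 * unpopulated ones; peek()/poke() touch pages expected to be mapped.
 */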
2652 
2653 __attribute__((noinline))
2654 static void
2655 kmem_alloc_basic_test(vm_map_t map)
2656 {
2657 	kmem_guard_t guard = {
2658 		.kmg_tag = VM_KERN_MEMORY_DIAG,
2659 	};
2660 	vm_offset_t addr;
2661 
2662 	/*
2663 	 * Test wired basics:
2664 	 * - KMA_KOBJECT
2665 	 * - KMA_GUARD_FIRST, KMA_GUARD_LAST
2666 	 * - allocation alignment
2667 	 */
2668 	addr = kmem_alloc_guard(map, ptoa(10), ptoa(2) - 1,
2669 	    KMA_KOBJECT | KMA_GUARD_FIRST | KMA_GUARD_LAST, guard).kmr_address;
2670 	assertf(addr != 0ull, "kma(%p, 10p, 0, KO | GF | GL)", map);
2671 	assert3u((addr + PAGE_SIZE) % ptoa(2), ==, 0);
2672 	kmem_test_assert_map(map, 10, 1);
2673 
2674 	kmem_test_for_entry(map, addr, ^(vm_map_entry_t e){
2675 		assertf(e, "unable to find address %p in map %p", (void *)addr, map);
2676 		assert(e->vme_kernel_object);
2677 		assert(!e->vme_atomic);
2678 		assert3u(e->vme_start, <=, addr);
2679 		assert3u(addr + ptoa(10), <=, e->vme_end);
2680 	});
2681 
2682 	assert_faults(addr, 0);
2683 	for (int i = 1; i < 9; i++) {
2684 		assert_writeable(addr, i);
2685 	}
2686 	assert_faults(addr, 9);
2687 
2688 	kmem_free(map, addr, ptoa(10));
2689 	kmem_test_assert_map(map, 0, 0);
2690 
2691 	/*
2692 	 * Test pageable basics.
2693 	 */
2694 	addr = kmem_alloc_guard(map, ptoa(10), 0,
2695 	    KMA_PAGEABLE, guard).kmr_address;
2696 	assertf(addr != 0ull, "kma(%p, 10p, 0, PG)", map);
2697 	kmem_test_assert_map(map, 10, 1);
2698 
2699 	for (int i = 0; i < 9; i++) {
2700 		assert_faults(addr, i);
2701 		poke(addr, i, 42);
2702 		assert_writeable(addr, i);
2703 	}
2704 
2705 	kmem_free(map, addr, ptoa(10));
2706 	kmem_test_assert_map(map, 0, 0);
2707 }
2708 
2709 __attribute__((noinline))
2710 static void
2711 kmem_realloc_basic_test(vm_map_t map, kmr_flags_t kind)
2712 {
2713 	kmem_guard_t guard = {
2714 		.kmg_atomic  = !(kind & KMR_DATA),
2715 		.kmg_tag     = VM_KERN_MEMORY_DIAG,
2716 		.kmg_context = 0xefface,
2717 	};
2718 	vm_offset_t addr, newaddr;
2719 	const int N = 10;
2720 
2721 	/*
2722 	 *	This isn't something kmem_realloc_guard() _needs_ to do,
2723 	 *	we could conceive an implementation where it grows in place
2724 	 *	if there's space after it.
2725 	 *
2726 	 *	However, this is what the implementation does today.
2727 	 */
2728 	bool realloc_growth_changes_address = true;
2729 	bool GL = (kind & KMR_GUARD_LAST);
2730 
2731 	/*
2732 	 *	Initial N page allocation
2733 	 */
2734 	addr = kmem_alloc_guard(map, ptoa(N), 0,
2735 	    (kind & (KMA_KOBJECT | KMA_GUARD_LAST)) | KMA_ZERO,
2736 	    guard).kmr_address;
2737 	assert3u(addr, !=, 0);
2738 	kmem_test_assert_map(map, N, 1);
2739 	for (int pg = 0; pg < N - GL; pg++) {
2740 		poke(addr, pg, 42 + pg);
2741 	}
2742 	for (int pg = N - GL; pg < N; pg++) {
2743 		assert_faults(addr, pg);
2744 	}
2745 
2746 
2747 	/*
2748 	 *	Grow to N + 3 pages
2749 	 */
2750 	newaddr = kmem_realloc_guard(map, addr, ptoa(N), ptoa(N + 3),
2751 	    kind | KMR_ZERO, guard).kmr_address;
2752 	assert3u(newaddr, !=, 0);
2753 	if (realloc_growth_changes_address) {
2754 		assert3u(addr, !=, newaddr);
2755 	}
2756 	if ((kind & KMR_FREEOLD) || (addr == newaddr)) {
2757 		kmem_test_assert_map(map, N + 3, 1);
2758 	} else {
2759 		kmem_test_assert_map(map, 2 * N + 3, 2);
2760 	}
2761 	for (int pg = 0; pg < N - GL; pg++) {
2762 		assert3u(peek(newaddr, pg), ==, 42 + pg);
2763 	}
2764 	if ((kind & KMR_FREEOLD) == 0) {
2765 		for (int pg = 0; pg < N - GL; pg++) {
2766 			assert3u(peek(addr, pg), ==, 42 + pg);
2767 		}
2768 		/* check for true sharing */
2769 		poke(addr + 16, 0, 1234);
2770 		assert3u(peek(newaddr + 16, 0), ==, 1234);
2771 		kmem_free_guard(map, addr, ptoa(N), KMF_NONE, guard);
2772 		kmem_test_assert_map(map, N + 3, 1);
2773 	}
2774 	if (addr != newaddr) {
2775 		for (int pg = 0; pg < N - GL; pg++) {
2776 			assert_faults(addr, pg);
2777 		}
2778 	}
2779 	for (int pg = N - GL; pg < N + 3 - GL; pg++) {
2780 		assert3u(peek(newaddr, pg), ==, 0);
2781 	}
2782 	for (int pg = N + 3 - GL; pg < N + 3; pg++) {
2783 		assert_faults(newaddr, pg);
2784 	}
2785 	addr = newaddr;
2786 
2787 
2788 	/*
2789 	 *	Shrink to N - 2 pages
2790 	 */
2791 	newaddr = kmem_realloc_guard(map, addr, ptoa(N + 3), ptoa(N - 2),
2792 	    kind | KMR_ZERO, guard).kmr_address;
2793 	assert3u(map->size, ==, ptoa(N - 2));
2794 	assert3u(newaddr, ==, addr);
2795 	kmem_test_assert_map(map, N - 2, 1);
2796 
2797 	for (int pg = 0; pg < N - 2 - GL; pg++) {
2798 		assert3u(peek(addr, pg), ==, 42 + pg);
2799 	}
2800 	for (int pg = N - 2 - GL; pg < N + 3; pg++) {
2801 		assert_faults(addr, pg);
2802 	}
2803 
2804 	kmem_free_guard(map, addr, ptoa(N - 2), KMF_NONE, guard);
2805 	kmem_test_assert_map(map, 0, 0);
2806 }
2807 
2808 static int
2809 kmem_basic_test(__unused int64_t in, int64_t *out)
2810 {
2811 	mach_vm_offset_t addr;
2812 	vm_map_t map;
2813 
2814 	printf("%s: test running\n", __func__);
2815 
2816 	map = kmem_suballoc(kernel_map, &addr, 64U << 20,
2817 	        VM_MAP_CREATE_DEFAULT, VM_FLAGS_ANYWHERE,
2818 	        KMS_NOFAIL | KMS_DATA, VM_KERN_MEMORY_DIAG).kmr_submap;
2819 
2820 	printf("%s: kmem_alloc ...\n", __func__);
2821 	kmem_alloc_basic_test(map);
2822 	printf("%s:     PASS\n", __func__);
2823 
2824 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD) ...\n", __func__);
2825 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD);
2826 	printf("%s:     PASS\n", __func__);
2827 
2828 	printf("%s: kmem_realloc (KMR_FREEOLD) ...\n", __func__);
2829 	kmem_realloc_basic_test(map, KMR_FREEOLD);
2830 	printf("%s:     PASS\n", __func__);
2831 
2832 	printf("%s: kmem_realloc (KMR_NONE) ...\n", __func__);
2833 	kmem_realloc_basic_test(map, KMR_NONE);
2834 	printf("%s:     PASS\n", __func__);
2835 
2836 	printf("%s: kmem_realloc (KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
2837 	kmem_realloc_basic_test(map, KMR_KOBJECT | KMR_FREEOLD | KMR_GUARD_LAST);
2838 	printf("%s:     PASS\n", __func__);
2839 
2840 	printf("%s: kmem_realloc (KMR_FREEOLD | KMR_GUARD_LAST) ...\n", __func__);
2841 	kmem_realloc_basic_test(map, KMR_FREEOLD | KMR_GUARD_LAST);
2842 	printf("%s:     PASS\n", __func__);
2843 
2844 	printf("%s: kmem_realloc (KMR_GUARD_LAST) ...\n", __func__);
2845 	kmem_realloc_basic_test(map, KMR_GUARD_LAST);
2846 	printf("%s:     PASS\n", __func__);
2847 
2848 	/* using KMR_DATA exercises the non-atomic realloc path */
2849 	printf("%s: kmem_realloc (KMR_DATA | KMR_FREEOLD) ...\n", __func__);
2850 	kmem_realloc_basic_test(map, KMR_DATA | KMR_FREEOLD);
2851 	printf("%s:     PASS\n", __func__);
2852 
2853 	printf("%s: kmem_realloc (KMR_DATA) ...\n", __func__);
2854 	kmem_realloc_basic_test(map, KMR_DATA);
2855 	printf("%s:     PASS\n", __func__);
2856 
2857 	kmem_free_guard(kernel_map, addr, 64U << 20, KMF_NONE, KMEM_GUARD_SUBMAP);
2858 	vm_map_deallocate(map);
2859 
2860 	printf("%s: test passed\n", __func__);
2861 	*out = 1;
2862 	return 0;
2863 }
2864 SYSCTL_TEST_REGISTER(kmem_basic, kmem_basic_test);
2865 #endif /* DEBUG || DEVELOPMENT */
2866