1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or [email protected]
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56 /*
57 */
58 /*
59 * File: kern/zalloc.c
60 * Author: Avadis Tevanian, Jr.
61 *
62 * Zone-based memory allocator. A zone is a collection of fixed size
63 * data blocks for which quick allocation/deallocation is possible.
64 */
65
66 #define ZALLOC_ALLOW_DEPRECATED 1
67 #if !ZALLOC_TEST
68 #include <mach/mach_types.h>
69 #include <mach/vm_param.h>
70 #include <mach/kern_return.h>
71 #include <mach/mach_host_server.h>
72 #include <mach/task_server.h>
73 #include <mach/machine/vm_types.h>
74 #include <machine/machine_routines.h>
75 #include <mach/vm_map.h>
76 #include <mach/sdt.h>
77 #if __x86_64__
78 #include <i386/cpuid.h>
79 #endif
80
81 #include <kern/bits.h>
82 #include <kern/btlog.h>
83 #include <kern/startup.h>
84 #include <kern/kern_types.h>
85 #include <kern/assert.h>
86 #include <kern/backtrace.h>
87 #include <kern/host.h>
88 #include <kern/macro_help.h>
89 #include <kern/sched.h>
90 #include <kern/locks.h>
91 #include <kern/sched_prim.h>
92 #include <kern/misc_protos.h>
93 #include <kern/thread_call.h>
94 #include <kern/zalloc_internal.h>
95 #include <kern/kalloc.h>
96 #include <kern/debug.h>
97
98 #include <prng/random.h>
99
100 #include <vm/pmap.h>
101 #include <vm/vm_map.h>
102 #include <vm/vm_kern.h>
103 #include <vm/vm_page.h>
104 #include <vm/vm_pageout.h>
105 #include <vm/vm_compressor.h> /* C_SLOT_PACKED_PTR* */
106
107 #include <pexpert/pexpert.h>
108
109 #include <machine/machparam.h>
110 #include <machine/machine_routines.h> /* ml_cpu_get_info */
111
112 #include <os/atomic.h>
113
114 #include <libkern/OSDebug.h>
115 #include <libkern/OSAtomic.h>
116 #include <libkern/section_keywords.h>
117 #include <sys/kdebug.h>
118
119 #include <san/kasan.h>
120 #include <libsa/stdlib.h>
121 #include <sys/errno.h>
122
123 #include <IOKit/IOBSD.h>
124
125 #if DEBUG
126 #define z_debug_assert(expr) assert(expr)
127 #else
128 #define z_debug_assert(expr) (void)(expr)
129 #endif
130
131 /* Returns pid of the task with the largest number of VM map entries. */
132 extern pid_t find_largest_process_vm_map_entries(void);
133
134 /*
135 * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
136 * For any other pid we try to kill that process synchronously.
137 */
138 extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
139
140 extern zone_t vm_map_entry_zone;
141 extern zone_t vm_object_zone;
142 extern zone_t ipc_service_port_label_zone;
143
144 ZONE_DEFINE_TYPE(percpu_u64_zone, "percpu.64", uint64_t,
145 ZC_PERCPU | ZC_ALIGNMENT_REQUIRED | ZC_KASAN_NOREDZONE);
146
147 #if CONFIG_KERNEL_TBI && KASAN_TBI
148 #define ZONE_MIN_ELEM_SIZE (sizeof(uint64_t) * 2)
149 #define ZONE_ALIGN_SIZE ZONE_MIN_ELEM_SIZE
150 #else /* CONFIG_KERNEL_TBI && KASAN_TBI */
151 #define ZONE_MIN_ELEM_SIZE sizeof(uint64_t)
152 #define ZONE_ALIGN_SIZE ZONE_MIN_ELEM_SIZE
153 #endif /* CONFIG_KERNEL_TBI && KASAN_TBI */
154
155 #define ZONE_MAX_ALLOC_SIZE (32 * 1024)
156 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
157 #define ZONE_CHUNK_ALLOC_SIZE (256 * 1024)
158 #define ZONE_GUARD_DENSE (32 * 1024)
159 #define ZONE_GUARD_SPARSE (64 * 1024)
160 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
161
/*
 * State/length tag stored in zone_page_metadata::zm_chunk_len (4 bits).
 *
 * For regular chunk pages, values 1 through ZM_CHUNK_LEN_MAX encode the
 * chunk length in pages.  Larger values are sentinels used either for
 * PGZ slots or for the secondary pages of multi-page chunks (see the
 * zm_chunk_len documentation in struct zone_page_metadata).
 */
__enum_closed_decl(zm_len_t, uint16_t, {
	ZM_CHUNK_FREE = 0x0,
	/* 1 through 8 are valid lengths */
	ZM_CHUNK_LEN_MAX = 0x8,

	/* PGZ magical values */
	ZM_PGZ_FREE = 0x0,               /* PGZ slot is free */
	ZM_PGZ_ALLOCATED = 0xa,          /* [a]llocated */
	ZM_PGZ_GUARD = 0xb,              /* oo[b] */
	ZM_PGZ_DOUBLE_FREE = 0xd,        /* [d]ouble_free */

	/* secondary page markers */
	ZM_SECONDARY_PAGE = 0xe,
	ZM_SECONDARY_PCPU_PAGE = 0xf,
});
177
178 static_assert(MAX_ZONES < (1u << 10), "MAX_ZONES must fit in zm_index");
179
/*
 * Per-page metadata tracked for every page in the zone map.
 * Kept to exactly 16 bytes (see the static_assert below).
 */
struct zone_page_metadata {
	union {
		struct {
			/* The index of the zone this metadata page belongs to */
			zone_id_t zm_index : 10;

			/*
			 * This chunk ends with a guard page.
			 */
			uint16_t zm_guarded : 1;

			/*
			 * Whether `zm_bitmap` is an inline bitmap
			 * or a packed bitmap reference
			 */
			uint16_t zm_inline_bitmap : 1;

			/*
			 * Zones allocate in "chunks" of zone_t::z_chunk_pages
			 * consecutive pages, or zpercpu_count() pages if the
			 * zone is percpu.
			 *
			 * The first page of it has its metadata set with:
			 * - 0 if none of the pages are currently wired
			 * - the number of wired pages in the chunk
			 *   (not scaled for percpu).
			 *
			 * Other pages in the chunk have their zm_chunk_len set
			 * to ZM_SECONDARY_PAGE or ZM_SECONDARY_PCPU_PAGE
			 * depending on whether the zone is percpu or not.
			 * For those, zm_page_index holds the index of that page
			 * in the run, and zm_subchunk_len the remaining length
			 * within the chunk.
			 *
			 * Metadata used for PGZ pages can have 3 values:
			 * - ZM_PGZ_FREE:        slot is free
			 * - ZM_PGZ_ALLOCATED:   slot holds an allocated element
			 *                       at offset (zm_pgz_orig_addr & PAGE_MASK)
			 * - ZM_PGZ_DOUBLE_FREE: slot detected a double free
			 *                       (will panic).
			 */
			zm_len_t zm_chunk_len : 4;
		};
		/* raw 16-bit view of the bitfields above */
		uint16_t zm_bits;
	};

	union {
#define ZM_ALLOC_SIZE_LOCK      1u      /* low bit of zm_alloc_size; used as a lock marker — see users */
		uint16_t zm_alloc_size; /* first page only */
		struct {
			uint8_t zm_page_index; /* secondary pages only */
			uint8_t zm_subchunk_len; /* secondary pages only */
		};
		uint16_t zm_oob_offs;   /* in guard pages */
	};
	union {
		uint32_t zm_bitmap;     /* most zones */
		uint32_t zm_bump;       /* permanent zones */
	};

	union {
		struct {
			/* linkage in the zone's page queues (packed VAs) */
			zone_pva_t zm_page_next;
			zone_pva_t zm_page_prev;
		};
		/* PGZ: original element address (offset kept in low PAGE_MASK bits) */
		vm_offset_t zm_pgz_orig_addr;
		/* PGZ: next slot in the free-slot list — TODO confirm against PGZ code */
		struct zone_page_metadata *zm_pgz_slot_next;
	};
};
static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing");
250
251 /*!
252 * @typedef zone_element_t
253 *
254 * @brief
255 * Type that represents a "resolved" zone element.
256 *
257 * @description
258 * This type encodes an element pointer as a pair of:
259 * { chunk base, element index }.
260 *
261 * The chunk base is extracted with @c trunc_page()
262 * as it is always page aligned, and occupies the bits above @c PAGE_SHIFT.
263 *
264 * The other bits encode the element index in the chunk rather than its address.
265 */
typedef struct zone_element {
	/* packed { page-aligned chunk base | element index } — see typedef doc above */
	vm_offset_t ze_value;
} zone_element_t;
269
270 /*!
271 * @typedef zone_magazine_t
272 *
273 * @brief
274 * Magazine of cached allocations.
275 *
276 * @field zm_cur how many elements this magazine holds (unused while loaded).
277 * @field zm_link linkage used by magazine depots.
278 * @field zm_elems an array of @c zc_mag_size() elements.
279 */
280 typedef struct zone_magazine {
281 uint16_t zm_cur;
282 STAILQ_ENTRY(zone_magazine) zm_link;
283 zone_element_t zm_elems[0];
284 } *zone_magazine_t;
285
286 /*!
287 * @typedef zone_cache_t
288 *
289 * @brief
290 * Magazine of cached allocations.
291 *
292 * @discussion
293 * Below is a diagram of the caching system. This design is inspired by the
294 * paper "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and
295 * Arbitrary Resources" by Jeff Bonwick and Jonathan Adams and the FreeBSD UMA
296 * zone allocator (itself derived from this seminal work).
297 *
298 * It is divided into 3 layers:
299 * - the per-cpu layer,
300 * - the recirculation depot layer,
301 * - the Zone Allocator.
302 *
303 * The per-cpu and recirculation depot layer use magazines (@c zone_magazine_t),
304 * which are stacks of up to @c zc_mag_size() elements.
305 *
306 * <h2>CPU layer</h2>
307 *
308 * The CPU layer (@c zone_cache_t) looks like this:
309 *
310 * ╭─ a ─ f ─┬───────── zm_depot ──────────╮
311 * │ ╭─╮ ╭─╮ │ ╭─╮ ╭─╮ ╭─╮ ╭─╮ ╭─╮ │
312 * │ │#│ │#│ │ │#│ │#│ │#│ │#│ │#│ │
313 * │ │#│ │ │ │ │#│ │#│ │#│ │#│ │#│ │
314 * │ │ │ │ │ │ │#│ │#│ │#│ │#│ │#│ │
315 * │ ╰─╯ ╰─╯ │ ╰─╯ ╰─╯ ╰─╯ ╰─╯ ╰─╯ │
316 * ╰─────────┴─────────────────────────────╯
317 *
318 * It has two pre-loaded magazines (a)lloc and (f)ree which we allocate from,
319 * or free to. Serialization is achieved through disabling preemption, and only
320 * the current CPU can acces those allocations. This is represented on the left
321 * hand side of the diagram above.
322 *
323 * The right hand side is the per-cpu depot. It consists of @c zm_depot_count
324 * full magazines, and is protected by the @c zm_depot_lock for access.
325 * The lock is expected to absolutely never be contended, as only the local CPU
326 * tends to access the local per-cpu depot in regular operation mode.
327 *
328 * However unlike UMA, our implementation allows for the zone GC to reclaim
329 * per-CPU magazines aggresively, which is serialized with the @c zm_depot_lock.
330 *
331 *
332 * <h2>Recirculation Depot</h2>
333 *
334 * The recirculation depot layer is a list similar to the per-cpu depot,
335 * however it is different in two fundamental ways:
336 *
337 * - it is protected by the regular zone lock,
338 * - elements referenced by the magazines in that layer appear free
339 * to the zone layer.
340 *
341 *
342 * <h2>Magazine circulation and sizing</h2>
343 *
344 * The caching system sizes itself dynamically. Operations that allocate/free
345 * a single element call @c zone_lock_nopreempt_check_contention() which records
346 * contention on the lock by doing a trylock and recording its success.
347 *
348 * This information is stored in the @c z_contention_cur field of the zone,
349 * and a windoed moving average is maintained in @c z_contention_wma.
350 * Each time a CPU registers any contention, it will also allow its own per-cpu
351 * cache to grow, incrementing @c zc_depot_max, which is how the per-cpu layer
352 * might grow into using its local depot.
353 *
354 * Note that @c zc_depot_max assume that the (a) and (f) pre-loaded magazines
355 * on average contain @c zc_mag_size() elements.
356 *
357 * When a per-cpu layer cannot hold more full magazines in its depot,
358 * then it will overflow about 1/3 of its depot into the recirculation depot
359 * (see @c zfree_cached_slow(). Conversely, when a depot is empty, then it will
360 * refill its per-cpu depot to about 1/3 of its size from the recirculation
361 * depot (see @c zalloc_cached_slow()).
362 *
363 * Lastly, the zone layer keeps track of the high and low watermark of how many
364 * elements have been free per period of time (including being part of the
365 * recirculation depot) in the @c z_elems_free_min and @c z_elems_free_max
366 * fields. A weighted moving average of the amplitude of this is maintained in
367 * the @c z_elems_free_wss which informs the zone GC on how to gently trim
368 * zones without hurting performance.
369 *
370 *
371 * <h2>Security considerations</h2>
372 *
373 * The zone caching layer has been designed to avoid returning elements in
374 * a strict LIFO behavior: @c zalloc() will allocate from the (a) magazine,
375 * and @c zfree() free to the (f) magazine, and only swap them when the
376 * requested operation cannot be fulfilled.
377 *
378 * The per-cpu overflow depot or the recirculation depots are similarly used
379 * in FIFO order.
380 *
381 * More importantly, when magazines flow through the recirculation depot,
382 * the elements they contain are marked as "free" in the zone layer bitmaps.
383 * Because allocations out of per-cpu caches verify the bitmaps at allocation
384 * time, this acts as a poor man's double-free quarantine. The magazines
385 * allow to avoid the cost of the bit-scanning involved in the zone-level
386 * @c zalloc_item() codepath.
387 *
388 *
389 * @field zc_alloc_cur denormalized number of elements in the (a) magazine
390 * @field zc_free_cur denormalized number of elements in the (f) magazine
391 * @field zc_alloc_elems a pointer to the array of elements in (a)
392 * @field zc_free_elems a pointer to the array of elements in (f)
393 *
394 * @field zc_depot_lock a lock to access @c zc_depot, @c zc_depot_cur.
395 * @field zc_depot a list of @c zc_depot_cur full magazines
396 * @field zc_depot_cur number of magazines in @c zc_depot
397 * @field zc_depot_max the maximum number of elements in @c zc_depot,
398 * protected by the zone lock.
399 */
typedef struct zone_cache {
	uint16_t zc_alloc_cur;          /* denormalized count of elements in (a) */
	uint16_t zc_free_cur;           /* denormalized count of elements in (f) */
	uint16_t zc_depot_cur;          /* number of magazines in zc_depot */
	uint16_t __zc_padding;          /* explicit pad to keep pointers aligned */
	zone_element_t *zc_alloc_elems; /* element array of the (a) magazine */
	zone_element_t *zc_free_elems;  /* element array of the (f) magazine */
	hw_lock_bit_t zc_depot_lock;    /* protects zc_depot, zc_depot_cur */
	uint32_t zc_depot_max;          /* depot size cap, protected by the zone lock */
	struct zone_depot zc_depot;     /* list of zc_depot_cur full magazines */
} *zone_cache_t;
411
/*
 * Global zone ranges and metadata base, set once during zone_init()
 * and read-only after lockdown (__security_const_late).
 */
#if !__x86_64__
static
#endif
__security_const_late struct {
	struct kmem_range zi_map_range; /* all zone submaps */
	struct kmem_range zi_ro_range; /* read-only range */
	struct kmem_range zi_meta_range; /* debugging only */
	struct kmem_range zi_bits_range; /* bits buddy allocator */
	struct kmem_range zi_pgz_range; /* PGZ double-mapped slots range */
	struct zone_page_metadata *zi_pgz_meta; /* metadata array for zi_pgz_range */

	/*
	 * The metadata lives within the zi_meta_range address range.
	 *
	 * The correct formula to find a metadata index is:
	 *     absolute_page_index - page_index(zi_map_range.min_address)
	 *
	 * And then this index is used to dereference zi_meta_range.min_address
	 * as a `struct zone_page_metadata` array.
	 *
	 * To avoid doing that substraction all the time in the various fast-paths,
	 * zi_meta_base are pre-offset with that minimum page index to avoid redoing
	 * that math all the time.
	 */
	struct zone_page_metadata *zi_meta_base;
} zone_info;
438
439 __startup_data
440 static struct kmem_range zone_map_range;
441 __startup_data
442 vm_map_size_t zone_map_size;
443 __startup_data
444 static vm_map_size_t zone_meta_size;
445 __startup_data
446 static vm_map_size_t zone_bits_size;
447
448 /*
449 * Initial array of metadata for stolen memory.
450 *
451 * The numbers here have to be kept in sync with vm_map_steal_memory()
452 * so that we have reserved enough metadata.
453 *
454 * After zone_init() has run (which happens while the kernel is still single
455 * threaded), the metadata is moved to its final dynamic location, and
456 * this array is unmapped with the rest of __startup_data at lockdown.
457 */
458 #define ZONE_EARLY_META_INLINE_COUNT 64
459 __startup_data
460 static struct zone_page_metadata
461 zone_early_meta_array_startup[ZONE_EARLY_META_INLINE_COUNT];
462
463 #if __x86_64__
464 /*
465 * On Intel we can't "free" pmap stolen pages,
466 * so instead we use a static array in __KLDDATA
467 * which gets reclaimed at lockdown time.
468 */
469 __startup_data __attribute__((aligned(PAGE_SIZE)))
470 static uint8_t zone_early_pages_to_cram[PAGE_SIZE * 16];
471 #endif
472
473 /*
474 * The zone_locks_grp allows for collecting lock statistics.
475 * All locks are associated to this group in zinit.
476 * Look at tools/lockstat for debugging lock contention.
477 */
478 LCK_GRP_DECLARE(zone_locks_grp, "zone_locks");
479 static LCK_MTX_EARLY_DECLARE(zone_metadata_region_lck, &zone_locks_grp);
480
481 /*
482 * The zone metadata lock protects:
483 * - metadata faulting,
484 * - VM submap VA allocations,
485 * - early gap page queue list
486 */
487 #define zone_meta_lock() lck_mtx_lock(&zone_metadata_region_lck);
488 #define zone_meta_unlock() lck_mtx_unlock(&zone_metadata_region_lck);
489
490 /*
491 * Exclude more than one concurrent garbage collection
492 */
493 static LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc");
494 static LCK_MTX_EARLY_DECLARE(zone_gc_lock, &zone_gc_lck_grp);
495 static LCK_SPIN_DECLARE(zone_exhausted_lock, &zone_gc_lck_grp);
496
497 /*
498 * Panic logging metadata
499 */
500 bool panic_include_zprint = false;
501 bool panic_include_kalloc_types = false;
502 zone_t kalloc_type_src_zone = ZONE_NULL;
503 zone_t kalloc_type_dst_zone = ZONE_NULL;
504 mach_memory_info_t *panic_kext_memory_info = NULL;
505 vm_size_t panic_kext_memory_size = 0;
506 vm_offset_t panic_fault_address = 0;
507
508 /*
509 * Protects zone_array, num_zones, num_zones_in_use, and
510 * zone_destroyed_bitmap
511 */
512 static SIMPLE_LOCK_DECLARE(all_zones_lock, 0);
513 static zone_id_t num_zones_in_use;
514 zone_id_t _Atomic num_zones;
515 SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count;
516
517 /*
518 * Initial globals for zone stats until we can allocate the real ones.
519 * Those get migrated inside the per-CPU ones during zone_init() and
520 * this array is unmapped with the rest of __startup_data at lockdown.
521 */
522
523 /* zone to allocate zone_magazine structs from */
524 static SECURITY_READ_ONLY_LATE(zone_t) zc_magazine_zone;
525 /*
526 * Until pid1 is made, zone caching is off,
527 * until compute_zone_working_set_size() runs for the firt time.
528 *
529 * -1 represents the "never enabled yet" value.
530 */
531 static int8_t zone_caching_disabled = -1;
532
533 __startup_data
534 static struct zone_cache zone_cache_startup[MAX_ZONES];
535 __startup_data
536 static struct zone_stats zone_stats_startup[MAX_ZONES];
537 struct zone zone_array[MAX_ZONES];
538 SECURITY_READ_ONLY_LATE(zone_security_flags_t) zone_security_array[MAX_ZONES] = {
539 [0 ... MAX_ZONES - 1] = {
540 .z_kheap_id = KHEAP_ID_NONE,
541 .z_noencrypt = false,
542 .z_submap_idx = Z_SUBMAP_IDX_GENERAL_0,
543 .z_kalloc_type = false,
544 .z_va_sequester = ZSECURITY_CONFIG(SEQUESTER),
545 },
546 };
547 SECURITY_READ_ONLY_LATE(uint16_t) zone_ro_elem_size[MAX_ZONES];
548
549 /* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */
550 static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count;
551
552 /* Used to keep track of destroyed slots in the zone_array */
553 static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)];
554
555 /* number of zone mapped pages used by all zones */
556 static size_t _Atomic zone_pages_jetsam_threshold = ~0;
557 size_t zone_pages_wired;
558 size_t zone_guard_pages;
559
560 /* Time in (ms) after which we panic for zone exhaustions */
561 TUNABLE(int, zone_exhausted_timeout, "zet", 5000);
562
563 #if VM_TAG_SIZECLASSES
564 /* enable tags for zones that ask for it */
565 static TUNABLE(bool, zone_tagging_on, "-zt", false);
566 #endif /* VM_TAG_SIZECLASSES */
567
568 #if DEBUG || DEVELOPMENT
569 static int zalloc_simulate_vm_pressure;
570 TUNABLE(bool, zalloc_disable_copyio_check, "-no-copyio-zalloc-check", false);
571 #endif /* DEBUG || DEVELOPMENT */
572
573 /*
574 * Zone caching tunables
575 *
576 * zc_mag_size():
577 * size of magazines, larger to reduce contention at the expense of memory
578 *
579 * zc_auto_enable_threshold
580 * number of contentions per second after which zone caching engages
581 * automatically.
582 *
583 * 0 to disable.
584 *
585 * zc_grow_threshold
586 * numer of contentions per second after which the per-cpu depot layer
587 * grows at each newly observed contention without restriction.
588 *
589 * 0 to disable.
590 *
591 * zc_recirc_batch
592 * how many magazines to transfer at most from/to the recirculation depot.
593 * Default 4.
594 *
595 * zc_defrag_ratio
596 * percentage of the working set to recirc size below which
597 * the zone is defragmented. Default is 66%.
598 *
599 * zc_defrag_threshold
600 * how much memory needs to be free before the auto-defrag is even considered.
601 * Default is 512k.
602 *
603 * zc_autogc_ratio
604 * percentage of the working set to min-free size below which
605 * the zone is auto-GCed to the working set size. Default is 20%.
606 *
607 * zc_autogc_threshold
608 * how much memory needs to be free before the auto-gc is even considered.
609 * Default is 4M.
610 *
611 * zc_free_batch_size
612 * The size of batches of frees/reclaim that can be done keeping
613 * the zone lock held (and preemption disabled).
614 */
615 static TUNABLE(uint16_t, zc_magazine_size, "zc_mag_size", 8);
616 static TUNABLE(uint32_t, zc_auto_threshold, "zc_auto_enable_threshold", 20);
617 static TUNABLE(uint32_t, zc_grow_threshold, "zc_grow_threshold", 8);
618 static TUNABLE(uint16_t, zc_recirc_batch, "zc_recirc_batch", 4);
619 static TUNABLE(uint32_t, zc_defrag_ratio, "zc_defrag_ratio", 66);
620 static TUNABLE(uint32_t, zc_defrag_threshold, "zc_defrag_threshold", 512u << 10);
621 static TUNABLE(uint32_t, zc_autogc_ratio, "zc_autogc_ratio", 20);
622 static TUNABLE(uint32_t, zc_autogc_threshold, "zc_autogc_threshold", 4u << 20);
623 static TUNABLE(uint32_t, zc_free_batch_size, "zc_free_batch_size", 256);
624
625 static SECURITY_READ_ONLY_LATE(size_t) zone_pages_wired_max;
626 static SECURITY_READ_ONLY_LATE(vm_map_t) zone_submaps[Z_SUBMAP_IDX_COUNT];
627 static SECURITY_READ_ONLY_LATE(vm_map_t) zone_meta_map;
628 static char const * const zone_submaps_names[Z_SUBMAP_IDX_COUNT] = {
629 [Z_SUBMAP_IDX_VM] = "VM",
630 [Z_SUBMAP_IDX_READ_ONLY] = "RO",
631 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
632 [Z_SUBMAP_IDX_GENERAL_0] = "GEN0",
633 [Z_SUBMAP_IDX_GENERAL_1] = "GEN1",
634 [Z_SUBMAP_IDX_GENERAL_2] = "GEN2",
635 [Z_SUBMAP_IDX_GENERAL_3] = "GEN3",
636 #else
637 [Z_SUBMAP_IDX_GENERAL_0] = "GEN",
638 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
639 [Z_SUBMAP_IDX_DATA] = "DATA",
640 };
641
642 #if __x86_64__
643 #define ZONE_ENTROPY_CNT 8
644 #else
645 #define ZONE_ENTROPY_CNT 2
646 #endif
647 static struct zone_bool_gen {
648 struct bool_gen zbg_bg;
649 uint32_t zbg_entropy[ZONE_ENTROPY_CNT];
650 } zone_bool_gen[MAX_CPUS];
651
652 #if CONFIG_PROB_GZALLOC
653 /*
654 * Probabilistic gzalloc
655 * =====================
656 *
657 *
658 * Probabilistic guard zalloc samples allocations and will protect them by
659 * double-mapping the page holding them and returning the secondary virtual
660 * address to its callers.
661 *
662 * Its data structures are lazily allocated if the `pgz` or `pgz1` boot-args
663 * are set.
664 *
665 *
666 * Unlike GZalloc, PGZ uses a fixed amount of memory, and is compatible with
667 * most zalloc/kalloc features:
668 * - zone_require is functional
669 * - zone caching or zone tagging is compatible
670 * - non-blocking allocation work (they will always return NULL with gzalloc).
671 *
672 * PGZ limitations:
673 * - VA sequestering isn't respected, as the slots (which are in limited
674 * quantity) will be reused for any type, however the PGZ quarantine
675 * somewhat mitigates the impact.
676 * - zones with elements larger than a page cannot be protected.
677 *
678 *
679 * Tunables:
680 * --------
681 *
682 * pgz=1:
683 * Turn on probabilistic guard malloc for all zones
684 *
685 * (default on for DEVELOPMENT, off for RELEASE, or if pgz1... are specified)
686 *
687 * pgz_sample_rate=0 to 2^31
688 * average sample rate between two guarded allocations.
689 * 0 means every allocation.
690 *
691 * The default is a random number between 1000 and 10,000
692 *
693 * pgz_slots
694 * how many allocations to protect.
695 *
696 * Each costs:
697 * - a PTE in the pmap (when allocated)
698 * - 2 zone page meta's (every other page is a "guard" one, 32B total)
699 * - 64 bytes per backtraces.
700 * On LP64 this is <16K per 100 slots.
701 *
702 * The default is ~200 slots per G of physical ram (32k / G)
703 *
704 * TODO:
705 * - try harder to allocate elements at the "end" to catch OOB more reliably.
706 *
707 * pgz_quarantine
708 * how many slots should be free at any given time.
709 *
710 * PGZ will round robin through free slots to be reused, but free slots are
711 * important to detect use-after-free by acting as a quarantine.
712 *
713 * By default, PGZ will keep 33% of the slots around at all time.
714 *
715 * pgz1=<name>, pgz2=<name>, ..., pgzn=<name>...
716 * Specific zones for which to enable probabilistic guard malloc.
717 * There must be no numbering gap (names after the gap will be ignored).
718 */
719 #if DEBUG || DEVELOPMENT
720 static TUNABLE(bool, pgz_all, "pgz", true);
721 #else
722 static TUNABLE(bool, pgz_all, "pgz", false);
723 #endif
724 static TUNABLE(uint32_t, pgz_sample_rate, "pgz_sample_rate", 0);
725 static TUNABLE(uint32_t, pgz_slots, "pgz_slots", UINT32_MAX);
726 static TUNABLE(uint32_t, pgz_quarantine, "pgz_quarantine", 0);
727 #endif /* CONFIG_PROB_GZALLOC */
728
729 static zone_t zone_find_largest(uint64_t *zone_size);
730
731 #endif /* !ZALLOC_TEST */
732 #pragma mark Zone metadata
733 #if !ZALLOC_TEST
734
735 static inline bool
zone_has_index(zone_t z,zone_id_t zid)736 zone_has_index(zone_t z, zone_id_t zid)
737 {
738 return zone_array + zid == z;
739 }
740
741 static zone_element_t
zone_element_encode(vm_offset_t base,vm_offset_t eidx)742 zone_element_encode(vm_offset_t base, vm_offset_t eidx)
743 {
744 return (zone_element_t){ .ze_value = base | eidx };
745 }
746
747 static vm_offset_t
zone_element_base(zone_element_t ze)748 zone_element_base(zone_element_t ze)
749 {
750 return trunc_page(ze.ze_value);
751 }
752
753 static vm_offset_t
zone_element_idx(zone_element_t ze)754 zone_element_idx(zone_element_t ze)
755 {
756 return ze.ze_value & PAGE_MASK;
757 }
758
759 static vm_offset_t
zone_element_addr(zone_t z,zone_element_t ze,vm_offset_t esize)760 zone_element_addr(zone_t z, zone_element_t ze, vm_offset_t esize)
761 {
762 vm_offset_t offs = zone_oob_offs(z);
763
764 return offs + zone_element_base(ze) + esize * zone_element_idx(ze);
765 }
766
/*
 * Panic path taken when a zone pointer does not point inside zone_array.
 */
__abortlike
void
zone_invalid_panic(zone_t zone)
{
	panic("zone %p isn't in the zone_array", zone);
}
773
/*
 * Panic path for corrupted zone page metadata; @c kind describes
 * which consistency check failed.
 */
__abortlike
static void
zone_metadata_corruption(zone_t zone, struct zone_page_metadata *meta,
    const char *kind)
{
	panic("zone metadata corruption: %s (meta %p, zone %s%s)",
	    kind, meta, zone_heap_name(zone), zone->z_name);
}
782
/*
 * Panic path taken when an element address fails pointer validation
 * for the given zone (wrong alignment, range, or index).
 */
__abortlike
static void
zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr)
{
	panic("zone element pointer validation failed (addr: %p, zone %s%s)",
	    (void *)addr, zone_heap_name(zone), zone->z_name);
}
790
/*
 * Panic path for zone confusion: element @c addr was expected to belong
 * to @c zone, but its page metadata records a different zone index.
 *
 * When either the expected (dst) or actual (src) zone is a kalloc_type
 * zone, the kalloc_type panic globals are populated so that the panic
 * log includes the relevant type information.
 */
__abortlike
static void
zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr,
    struct zone_page_metadata *meta)
{
	zone_security_flags_t zsflags = zone_security_config(zone), src_zsflags;
	zone_id_t zidx;
	zone_t src_zone;

	if (zsflags.z_kalloc_type) {
		panic_include_kalloc_types = true;
		kalloc_type_dst_zone = zone;
	}

	zidx = meta->zm_index;
	/* an out-of-range index means the metadata itself is corrupt */
	if (zidx >= os_atomic_load(&num_zones, relaxed)) {
		panic("%p expected in zone %s%s[%d], but metadata has invalid zidx: %d",
		    (void *)addr, zone_heap_name(zone), zone->z_name, zone_index(zone),
		    zidx);
	}

	src_zone = &zone_array[zidx];
	src_zsflags = zone_security_array[zidx];
	if (src_zsflags.z_kalloc_type) {
		panic_include_kalloc_types = true;
		kalloc_type_src_zone = src_zone;
	}

	panic("%p not in the expected zone %s%s[%d], but found in %s%s[%d]",
	    (void *)addr, zone_heap_name(zone), zone->z_name, zone_index(zone),
	    zone_heap_name(src_zone), src_zone->z_name, zidx);
}
823
/*
 * Panic path taken when a zone's metadata page queue linkage is broken.
 */
__abortlike
static void
zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta)
{
	panic("metadata list corruption through element %p detected in zone %s%s",
	    meta, zone_heap_name(zone), zone->z_name);
}
831
/*
 * Panic path for per-page accounting mismatches; @c kind names the
 * counter that wrapped or went inconsistent.
 */
__abortlike
static void
zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta,
    const char *kind)
{
	panic("accounting mismatch (%s) for zone %s%s, meta %p", kind,
	    zone_heap_name(zone), zone->z_name, meta);
}
840
/*
 * Panic path taken when a double free of element @c ze is detected;
 * @c caller names the function that caught it.
 */
__abortlike
static void
zone_meta_double_free_panic(zone_t zone, zone_element_t ze, const char *caller)
{
	panic("%s: double free of %p to zone %s%s", caller,
	    (void *)zone_element_addr(zone, ze, zone_elem_size(zone)),
	    zone_heap_name(zone), zone->z_name);
}
849
/*
 * Panic path for zone-level counter mismatches; @c kind names the
 * counter that wrapped or went inconsistent.
 */
__abortlike
static void
zone_accounting_panic(zone_t zone, const char *kind)
{
	panic("accounting mismatch (%s) for zone %s%s", kind,
	    zone_heap_name(zone), zone->z_name);
}
857
/*
 * Subtracts @c value from the zone counter @c (z)->stat, panicking on
 * underflow (which denotes an accounting bug), and evaluates to the
 * new counter value.
 */
#define zone_counter_sub(z, stat, value)  ({ \
	if (os_sub_overflow((z)->stat, value, &(z)->stat)) { \
	        zone_accounting_panic(z, #stat " wrap-around"); \
	} \
	(z)->stat; \
})
864
865 static inline void
zone_elems_free_add(zone_t z,uint32_t count)866 zone_elems_free_add(zone_t z, uint32_t count)
867 {
868 uint32_t n = (z->z_elems_free += count);
869 if (z->z_elems_free_max < n) {
870 z->z_elems_free_max = n;
871 }
872 }
873
874 static inline void
zone_elems_free_sub(zone_t z,uint32_t count)875 zone_elems_free_sub(zone_t z, uint32_t count)
876 {
877 uint32_t n = zone_counter_sub(z, z_elems_free, count);
878
879 if (z->z_elems_free_min > n) {
880 z->z_elems_free_min = n;
881 }
882 }
883
884 static inline uint16_t
zone_meta_alloc_size_add(zone_t z,struct zone_page_metadata * m,vm_offset_t esize)885 zone_meta_alloc_size_add(zone_t z, struct zone_page_metadata *m,
886 vm_offset_t esize)
887 {
888 if (os_add_overflow(m->zm_alloc_size, (uint16_t)esize, &m->zm_alloc_size)) {
889 zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around");
890 }
891 return m->zm_alloc_size;
892 }
893
/*
 * Shrink the per-chunk allocated-bytes counter by @c esize,
 * panicking if the counter would wrap below zero. Returns the new value.
 *
 * NOTE(review): unlike zone_meta_alloc_size_add, @c esize is not cast to
 * uint16_t here — os_sub_overflow on the full-width value also catches
 * esize > current size; presumably intentional, confirm before "fixing".
 */
static inline uint16_t
zone_meta_alloc_size_sub(zone_t z, struct zone_page_metadata *m,
    vm_offset_t esize)
{
    if (os_sub_overflow(m->zm_alloc_size, esize, &m->zm_alloc_size)) {
        zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around");
    }
    return m->zm_alloc_size;
}
903
__abortlike
static void
zone_nofail_panic(zone_t zone)
{
    /* Abort path: a Z_NOFAIL allocation could not be satisfied. */
    panic("zalloc(Z_NOFAIL) can't be satisfied for zone %s%s (potential leak)",
        zone_heap_name(zone), zone->z_name);
}
911
912 __header_always_inline bool
zone_spans_ro_va(vm_offset_t addr_start,vm_offset_t addr_end)913 zone_spans_ro_va(vm_offset_t addr_start, vm_offset_t addr_end)
914 {
915 vm_offset_t rmin, rmax;
916
917 #if CONFIG_KERNEL_TBI
918 addr_start = VM_KERNEL_STRIP_UPTR(addr_start);
919 addr_end = VM_KERNEL_STRIP_UPTR(addr_end);
920 #endif /* CONFIG_KERNEL_TBI */
921
922 kmem_range_load(&zone_info.zi_ro_range, rmin, rmax);
923
924 /*
925 * Either the start and the end are leftward of the read-only range, or they
926 * are both completely rightward. If neither, then they span over the range.
927 */
928
929 if ((addr_start < rmin) && (addr_end < rmin)) {
930 /* Leftward */
931 return false;
932 } else if ((addr_start > rmax) && (addr_end > rmax)) {
933 /* Rightward */
934 return false;
935 }
936
937 return true;
938 }
939
/*
 * Membership test of @c addr (and optionally @c size bytes) in a kmem range.
 * When @c size is the compile-time constant 1, the single-address overload
 * of kmem_range_contains is selected.
 */
#define from_range(r, addr, size) \
	__builtin_choose_expr(__builtin_constant_p(size) ? (size) == 1 : 0, \
	kmem_range_contains(r, (vm_offset_t)(addr)), \
	kmem_range_contains(r, (vm_offset_t)(addr), size))

/* Does [addr, addr + size) live in the zone read-only submap? */
#define from_ro_map(addr, size) \
	from_range(&zone_info.zi_ro_range, addr, size)

/* Does [addr, addr + size) live in the zone map? */
#define from_zone_map(addr, size) \
	from_range(&zone_info.zi_map_range, addr, size)

/* Total VA size of the zone map. */
#define zone_map_size() \
	kmem_range_size(&zone_info.zi_map_range)

/* Total VA size of the zone read-only range. */
#define zone_readonly_size() \
	kmem_range_size(&zone_info.zi_ro_range)
956
/* A zero packed address is the "null" page VA. */
__header_always_inline bool
zone_pva_is_null(zone_pva_t page)
{
    return page.packed_address == 0;
}

/*
 * Positive (non-zero) packed values are queue-head indices;
 * actual kernel pages have the top bit set (kernel VAs are "negative").
 */
__header_always_inline bool
zone_pva_is_queue(zone_pva_t page)
{
    // actual kernel pages have the top bit set
    return (int32_t)page.packed_address > 0;
}

__header_always_inline bool
zone_pva_is_equal(zone_pva_t pva1, zone_pva_t pva2)
{
    return pva1.packed_address == pva2.packed_address;
}

/*
 * Base pointer against which queue-head pva indices are interpreted:
 * page-queue heads live in __DATA and are encoded as small positive
 * offsets from this base.
 */
__header_always_inline zone_pva_t *
zone_pageq_base(void)
{
    extern zone_pva_t data_seg_start[] __SEGMENT_START_SYM("__DATA");

    /*
     * `-1` so that if the first __DATA variable is a page queue,
     * it gets a non 0 index
     */
    return data_seg_start - 1;
}
987
/*
 * Unlink @c meta from the front of the queue encoded by @c queue,
 * verifying that the head still points at the expected value @c oldv
 * (panics on list corruption otherwise).
 */
__header_always_inline void
zone_queue_set_head(zone_t z, zone_pva_t queue, zone_pva_t oldv,
    struct zone_page_metadata *meta)
{
    zone_pva_t *queue_head = &zone_pageq_base()[queue.packed_address];

    if (!zone_pva_is_equal(*queue_head, oldv)) {
        zone_page_metadata_list_corruption(z, meta);
    }
    *queue_head = meta->zm_page_next;
}
999
/* Encode a queue-head pointer as a (positive) pva index. */
__header_always_inline zone_pva_t
zone_queue_encode(zone_pva_t *headp)
{
    return (zone_pva_t){ (uint32_t)(headp - zone_pageq_base()) };
}

/* Pack a kernel address into a pva (page number, sign bit preserved). */
__header_always_inline zone_pva_t
zone_pva_from_addr(vm_address_t addr)
{
    // cannot use atop() because we want to maintain the sign bit
    return (zone_pva_t){ (uint32_t)((intptr_t)addr >> PAGE_SHIFT) };
}

/* Pack the page of the element encoded in @c ze. */
__header_always_inline zone_pva_t
zone_pva_from_element(zone_element_t ze)
{
    return zone_pva_from_addr(ze.ze_value);
}

/* Unpack a pva back into the page's kernel address. */
__header_always_inline vm_address_t
zone_pva_to_addr(zone_pva_t page)
{
    // cause sign extension so that we end up with the right address
    return (vm_offset_t)(int32_t)page.packed_address << PAGE_SHIFT;
}

/* Metadata slot for a given pva (direct index into the metadata array). */
__header_always_inline struct zone_page_metadata *
zone_pva_to_meta(zone_pva_t page)
{
    return &zone_info.zi_meta_base[page.packed_address];
}

/* Inverse of zone_pva_to_meta. */
__header_always_inline zone_pva_t
zone_pva_from_meta(struct zone_page_metadata *meta)
{
    return (zone_pva_t){ (uint32_t)(meta - zone_info.zi_meta_base) };
}

/* Metadata slot for the page containing @c addr. */
__header_always_inline struct zone_page_metadata *
zone_meta_from_addr(vm_offset_t addr)
{
    return zone_pva_to_meta(zone_pva_from_addr(addr));
}

/* Metadata slot for the page containing the element in @c ze. */
__header_always_inline struct zone_page_metadata *
zone_meta_from_element(zone_element_t ze)
{
    return zone_pva_to_meta(zone_pva_from_element(ze));
}

/* Zone index recorded in the metadata of the page that contains @c ptr. */
__header_always_inline zone_id_t
zone_index_from_ptr(const void *ptr)
{
    return zone_pva_to_meta(zone_pva_from_addr((vm_offset_t)ptr))->zm_index;
}

/* Kernel address of the page described by @c meta. */
__header_always_inline vm_offset_t
zone_meta_to_addr(struct zone_page_metadata *meta)
{
    return ptoa((int32_t)(meta - zone_info.zi_meta_base));
}
1061
/*
 * Check that @c meta belongs to zone @c z (by zone index), panicking with
 * @c addr as the offending address on mismatch.
 */
__attribute__((overloadable))
__header_always_inline void
zone_meta_validate(zone_t z, struct zone_page_metadata *meta, vm_address_t addr)
{
    if (!zone_has_index(z, meta->zm_index)) {
        zone_page_metadata_index_confusion_panic(z, addr, meta);
    }
}

/* Overload: derive the address from the encoded element @c ze. */
__attribute__((overloadable))
__header_always_inline void
zone_meta_validate(zone_t z, struct zone_page_metadata *meta, zone_element_t ze)
{
    zone_meta_validate(z, meta, zone_element_addr(z, ze, zone_elem_size(z)));
}

/* Overload: derive the address from the metadata itself (page address). */
__attribute__((overloadable))
__header_always_inline void
zone_meta_validate(zone_t z, struct zone_page_metadata *meta)
{
    zone_meta_validate(z, meta, zone_meta_to_addr(meta));
}
1084
/*
 * Push @c meta at the head of the page queue @c headp, validating the
 * doubly-linked-list invariant of the previous head (panics on corruption).
 */
__header_always_inline void
zone_meta_queue_push(zone_t z, zone_pva_t *headp,
    struct zone_page_metadata *meta)
{
    zone_pva_t head = *headp;
    zone_pva_t queue_pva = zone_queue_encode(headp);
    struct zone_page_metadata *tmp;

    meta->zm_page_next = head;
    if (!zone_pva_is_null(head)) {
        tmp = zone_pva_to_meta(head);
        /* the old head's back pointer must reference the queue head */
        if (!zone_pva_is_equal(tmp->zm_page_prev, queue_pva)) {
            zone_page_metadata_list_corruption(z, meta);
        }
        tmp->zm_page_prev = zone_pva_from_meta(meta);
    }
    meta->zm_page_prev = queue_pva;
    *headp = zone_pva_from_meta(meta);
}
1104
/*
 * Pop and return the metadata at the head of queue @c headp,
 * validating zone ownership and list integrity along the way.
 * The popped metadata's links are cleared before returning.
 */
__header_always_inline struct zone_page_metadata *
zone_meta_queue_pop(zone_t z, zone_pva_t *headp)
{
    zone_pva_t head = *headp;
    struct zone_page_metadata *meta = zone_pva_to_meta(head);
    struct zone_page_metadata *tmp;

    zone_meta_validate(z, meta);

    if (!zone_pva_is_null(meta->zm_page_next)) {
        tmp = zone_pva_to_meta(meta->zm_page_next);
        /* the successor's back pointer must reference the old head */
        if (!zone_pva_is_equal(tmp->zm_page_prev, head)) {
            zone_page_metadata_list_corruption(z, meta);
        }
        tmp->zm_page_prev = meta->zm_page_prev;
    }
    *headp = meta->zm_page_next;

    meta->zm_page_next = meta->zm_page_prev = (zone_pva_t){ 0 };

    return meta;
}
1127
/*
 * Unlink @c meta from whatever page queue it currently sits on,
 * handling both interior positions and the head position (where the
 * predecessor is a queue head rather than another metadata).
 * Links are cleared on exit; panics on any list inconsistency.
 */
__header_always_inline void
zone_meta_remqueue(zone_t z, struct zone_page_metadata *meta)
{
    zone_pva_t meta_pva = zone_pva_from_meta(meta);
    struct zone_page_metadata *tmp;

    if (!zone_pva_is_null(meta->zm_page_next)) {
        tmp = zone_pva_to_meta(meta->zm_page_next);
        if (!zone_pva_is_equal(tmp->zm_page_prev, meta_pva)) {
            zone_page_metadata_list_corruption(z, meta);
        }
        tmp->zm_page_prev = meta->zm_page_prev;
    }
    if (zone_pva_is_queue(meta->zm_page_prev)) {
        /* we are at the head: fix the queue head itself */
        zone_queue_set_head(z, meta->zm_page_prev, meta_pva, meta);
    } else {
        tmp = zone_pva_to_meta(meta->zm_page_prev);
        if (!zone_pva_is_equal(tmp->zm_page_next, meta_pva)) {
            zone_page_metadata_list_corruption(z, meta);
        }
        tmp->zm_page_next = meta->zm_page_next;
    }

    meta->zm_page_next = meta->zm_page_prev = (zone_pva_t){ 0 };
}
1153
/* Move @c meta from its current page queue to the head of @c headp. */
__header_always_inline void
zone_meta_requeue(zone_t z, zone_pva_t *headp,
    struct zone_page_metadata *meta)
{
    zone_meta_remqueue(z, meta);
    zone_meta_queue_push(z, headp, meta);
}
1161
/* prevents a given metadata from ever reaching the z_pageq_empty queue */
static inline void
zone_meta_lock_in_partial(zone_t z, struct zone_page_metadata *m, uint32_t len)
{
    /*
     * ZM_ALLOC_SIZE_LOCK is a sub-element-granularity bias on the
     * alloc size: while present, the chunk can never look "empty".
     */
    uint16_t new_size = zone_meta_alloc_size_add(z, m, ZM_ALLOC_SIZE_LOCK);

    assert(new_size % sizeof(vm_offset_t) == ZM_ALLOC_SIZE_LOCK);
    if (new_size == ZM_ALLOC_SIZE_LOCK) {
        /* chunk was empty: move it to the partial queue and re-account */
        zone_meta_requeue(z, &z->z_pageq_partial, m);
        zone_counter_sub(z, z_wired_empty, len);
    }
}
1174
/* allows a given metadata to reach the z_pageq_empty queue again */
static inline void
zone_meta_unlock_from_partial(zone_t z, struct zone_page_metadata *m, uint32_t len)
{
    uint16_t new_size = zone_meta_alloc_size_sub(z, m, ZM_ALLOC_SIZE_LOCK);

    assert(new_size % sizeof(vm_offset_t) == 0);
    if (new_size == 0) {
        /* chunk became truly empty: move it back and re-account */
        zone_meta_requeue(z, &z->z_pageq_empty, m);
        z->z_wired_empty += len;
    }
}
1187
/*
 * Routine to populate a page backing metadata in the zone_metadata_region.
 * Must be called without the zone lock held as it might potentially block.
 */
static void
zone_meta_populate(vm_offset_t base, vm_size_t size)
{
    struct zone_page_metadata *from = zone_meta_from_addr(base);
    struct zone_page_metadata *to = from + atop(size);
    vm_offset_t page_addr = trunc_page(from);

    /* walk each metadata page that covers [base, base + size) */
    for (; page_addr < (vm_offset_t)to; page_addr += PAGE_SIZE) {
#if !KASAN_ZALLOC
        /*
         * This can race with another thread doing a populate on the same metadata
         * page, where we see an updated pmap but unmapped KASan shadow, causing a
         * fault in the shadow when we first access the metadata page. Avoid this
         * by always synchronizing on the zone_metadata_region lock with KASan.
         */
        if (pmap_find_phys(kernel_pmap, page_addr)) {
            continue;
        }
#endif

        for (;;) {
            kern_return_t ret = KERN_SUCCESS;

            /*
             * All updates to the zone_metadata_region are done
             * under the zone_metadata_region_lck
             */
            zone_meta_lock();
            if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
                ret = kernel_memory_populate(page_addr,
                    PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
                    VM_KERN_MEMORY_OSFMK);
            }
            zone_meta_unlock();

            if (ret == KERN_SUCCESS) {
                break;
            }

            /*
             * We can't pass KMA_NOPAGEWAIT under a global lock as it leads
             * to bad system deadlocks, so if the allocation failed,
             * we need to do the VM_PAGE_WAIT() outside of the lock.
             */
            VM_PAGE_WAIT();
        }
    }
}
1240
1241 __abortlike
1242 static void
zone_invalid_element_panic(zone_t zone,vm_offset_t addr,bool cache)1243 zone_invalid_element_panic(zone_t zone, vm_offset_t addr, bool cache)
1244 {
1245 struct zone_page_metadata *meta;
1246 vm_offset_t page, esize = zone_elem_size(zone);
1247 const char *from_cache = "";
1248
1249 if (cache) {
1250 zone_element_t ze = { .ze_value = addr };
1251 addr = zone_element_addr(zone, ze, esize);
1252 from_cache = " (from cache)";
1253
1254 if (zone_element_idx(ze) >= zone->z_chunk_elems) {
1255 panic("eidx %d for addr %p being freed to zone %s%s, is larger "
1256 "than number fo element in chunk (%d)", (int)zone_element_idx(ze),
1257 (void *)addr, zone_heap_name(zone), zone->z_name,
1258 zone->z_chunk_elems);
1259 }
1260 }
1261
1262 if (!from_zone_map(addr, esize)) {
1263 panic("addr %p being freed to zone %s%s%s, isn't from zone map",
1264 (void *)addr, zone_heap_name(zone), zone->z_name, from_cache);
1265 }
1266 page = trunc_page(addr);
1267 meta = zone_meta_from_addr(addr);
1268
1269 if (meta->zm_chunk_len == ZM_SECONDARY_PCPU_PAGE) {
1270 panic("metadata %p corresponding to addr %p being freed to "
1271 "zone %s%s%s, is marked as secondary per cpu page",
1272 meta, (void *)addr, zone_heap_name(zone), zone->z_name,
1273 from_cache);
1274 }
1275 if (meta->zm_chunk_len > ZM_CHUNK_LEN_MAX) {
1276 panic("metadata %p corresponding to addr %p being freed to "
1277 "zone %s%s%s, has chunk len greater than max",
1278 meta, (void *)addr, zone_heap_name(zone), zone->z_name,
1279 from_cache);
1280 }
1281
1282 if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
1283 page -= ptoa(meta->zm_page_index);
1284 }
1285
1286 if ((addr - page - zone_oob_offs(zone)) % esize) {
1287 panic("addr %p being freed to zone %s%s%s, isn't aligned to "
1288 "zone element size", (void *)addr, zone_heap_name(zone),
1289 zone->z_name, from_cache);
1290 }
1291
1292 zone_invalid_element_addr_panic(zone, addr);
1293 }
1294
/*
 * Validate an encoded element @c ze against @c zone (map membership,
 * sane chunk length, element index in range, zone index match) and
 * return its page metadata. Panics on any failure.
 */
__header_always_inline
struct zone_page_metadata *
zone_element_validate(zone_t zone, zone_element_t ze)
{
    struct zone_page_metadata *meta;
    vm_offset_t page = zone_element_base(ze);

    if (!from_zone_map(page, 1)) {
        zone_invalid_element_panic(zone, ze.ze_value, true);
    }
    meta = zone_meta_from_addr(page);

    if (meta->zm_chunk_len > ZM_CHUNK_LEN_MAX) {
        zone_invalid_element_panic(zone, ze.ze_value, true);
    }
    if (zone_element_idx(ze) >= zone->z_chunk_elems) {
        zone_invalid_element_panic(zone, ze.ze_value, true);
    }

    zone_meta_validate(zone, meta, ze);

    return meta;
}
1318
/*
 * Resolve a raw freed address into its chunk metadata and encoded element.
 * Validates map membership, zone ownership, chunk kind and element
 * alignment; panics on any inconsistency.
 *
 * @param zone   zone the address is being freed to
 * @param addr   element address
 * @param esize  element size of the zone
 * @param ze     out: encoded (page base, element index) pair
 * @returns      metadata of the chunk's first page
 */
__attribute__((always_inline))
static struct zone_page_metadata *
zone_element_resolve(zone_t zone, vm_offset_t addr, vm_offset_t esize,
    zone_element_t *ze)
{
    struct zone_page_metadata *meta;
    vm_offset_t offs = zone_oob_offs(zone);
    vm_offset_t page, eidx;

    if (!from_zone_map(addr, esize)) {
        zone_invalid_element_panic(zone, addr, false);
    }
    page = trunc_page(addr);
    meta = zone_meta_from_addr(addr);
    zone_meta_validate(zone, meta, addr);

    if (meta->zm_chunk_len == ZM_SECONDARY_PCPU_PAGE) {
        zone_invalid_element_panic(zone, addr, false);
    }
    if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
        /* rebase both address and metadata to the chunk's first page */
        page -= ptoa(meta->zm_page_index);
        meta -= meta->zm_page_index;
    }

    eidx = (addr - page - offs) / esize;
    if ((addr - page - offs) % esize) {
        zone_invalid_element_panic(zone, addr, false);
    }

    *ze = zone_element_encode(page, eidx);
    return meta;
}
1351
#if ZSECURITY_CONFIG(PGZ_OOB_ADJUST)
/*
 * For a PGZ-guarded chunk, shift the returned pointer right inside its
 * element slot so that the requested size ends exactly at the guard page,
 * making out-of-bounds writes fault immediately. The applied offset is
 * recorded in the guard page's metadata (@c zm_oob_offs) so the free path
 * can recover the slot base.
 */
void *
zone_element_pgz_oob_adjust(struct kalloc_result kr, vm_size_t elem_size)
{
    vm_offset_t addr = (vm_offset_t)kr.addr;
    /*
     * 0-sized allocations in a KALLOC_MINSIZE bucket
     * would be offset to the next allocation which is incorrect.
     */
    vm_offset_t req_size = MAX(roundup(kr.size, KALLOC_MINALIGN), KALLOC_MINALIGN);
    vm_offset_t end = addr + elem_size;
    vm_offset_t offs;

    /*
     * Given how chunks work, for a zone with PGZ guards on,
     * there's a single element which ends precisely
     * at the page boundary: the last one.
     */
    if (req_size == elem_size ||
        (end & PAGE_MASK) ||
        !zone_meta_from_addr(addr)->zm_guarded) {
        return kr.addr;
    }

    offs = elem_size - req_size;
    /* remember the shift on the guard page so the free path can undo it */
    zone_meta_from_addr(end)->zm_oob_offs = (uint16_t)offs;

    return (char *)addr + offs;
}
#endif /* ZSECURITY_CONFIG(PGZ_OOB_ADJUST) */
1382
/*
 * Routine to get the size of a zone allocated address.
 * If the address doesn't belong to the zone maps, returns 0.
 *
 * @param elem       the allocated address
 * @param z          out (optional): owning zone
 * @param clear_oob  whether to reset the recorded PGZ out-of-bounds offset
 * @param oob_offs   out (optional): PGZ shift applied to this element
 * @returns          the element size, or 0 when not a zone address
 *                   (after trying gzalloc when enabled)
 */
vm_size_t
zone_element_size(void *elem, zone_t *z, bool clear_oob, vm_offset_t *oob_offs)
{
    vm_address_t addr = (vm_address_t)elem;
    struct zone_page_metadata *meta;
    vm_size_t esize, offs, end;
    zone_t zone;

    if (from_zone_map(addr, sizeof(void *))) {
        meta = zone_meta_from_addr(addr);
        zone = &zone_array[meta->zm_index];
        esize = zone_elem_size_safe(zone);
        end = addr + esize;
        offs = 0;

#if ZSECURITY_CONFIG(PGZ_OOB_ADJUST)
        /*
         * If the chunk uses guards, and that (addr + esize)
         * either crosses a page boundary or is at the boundary,
         * we need to look harder.
         */
        if (oob_offs && meta->zm_guarded && atop(addr ^ end)) {
            /*
             * Because in the vast majority of cases the element
             * size is sub-page, and that meta[1] must be faulted,
             * we can quickly peek at whether it's a guard.
             *
             * For elements larger than a page, finding the guard
             * page requires a little more effort.
             */
            if (meta[1].zm_chunk_len == ZM_PGZ_GUARD) {
                offs = meta[1].zm_oob_offs;
                if (clear_oob) {
                    meta[1].zm_oob_offs = 0;
                }
            } else if (esize > PAGE_SIZE) {
                struct zone_page_metadata *gmeta;

                /* locate the guard page at the end of the chunk */
                if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
                    gmeta = meta + meta->zm_subchunk_len;
                } else {
                    gmeta = meta + zone->z_chunk_pages;
                }
                assert(gmeta->zm_chunk_len == ZM_PGZ_GUARD);

                if (end >= zone_meta_to_addr(gmeta)) {
                    offs = gmeta->zm_oob_offs;
                    if (clear_oob) {
                        gmeta->zm_oob_offs = 0;
                    }
                }
            }
        }
#else
#pragma unused(end, clear_oob)
#endif /* ZSECURITY_CONFIG(PGZ_OOB_ADJUST) */

        if (oob_offs) {
            *oob_offs = offs;
        }
        if (z) {
            *z = zone;
        }
        return esize;
    }

    if (oob_offs) {
        *oob_offs = 0;
    }
#if CONFIG_GZALLOC
    if (__improbable(gzalloc_enabled())) {
        vm_size_t gzsize;
        if (gzalloc_element_size(elem, z, &gzsize)) {
            return gzsize;
        }
    }
#endif /* CONFIG_GZALLOC */

    return 0;
}
1467
/*
 * Returns the zone id owning @c addr, or ZONE_ID_INVALID when the address
 * is not inside the zone map.
 */
zone_id_t
zone_id_for_element(void *addr, vm_size_t esize)
{
    zone_id_t zid = ZONE_ID_INVALID;
    if (from_zone_map(addr, esize)) {
        zid = zone_index_from_ptr(addr);
        /* metadata never stores the invalid id for a mapped page */
        __builtin_assume(zid != ZONE_ID_INVALID);
    }
    return zid;
}
1478
1479 /* This function just formats the reason for the panics by redoing the checks */
1480 __abortlike
1481 static void
zone_require_panic(zone_t zone,void * addr)1482 zone_require_panic(zone_t zone, void *addr)
1483 {
1484 uint32_t zindex;
1485 zone_t other;
1486
1487 if (!from_zone_map(addr, zone_elem_size(zone))) {
1488 panic("zone_require failed: address not in a zone (addr: %p)", addr);
1489 }
1490
1491 zindex = zone_index_from_ptr(addr);
1492 other = &zone_array[zindex];
1493 if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) {
1494 panic("zone_require failed: invalid zone index %d "
1495 "(addr: %p, expected: %s%s)", zindex,
1496 addr, zone_heap_name(zone), zone->z_name);
1497 } else {
1498 panic("zone_require failed: address in unexpected zone id %d (%s%s) "
1499 "(addr: %p, expected: %s%s)",
1500 zindex, zone_heap_name(other), other->z_name,
1501 addr, zone_heap_name(zone), zone->z_name);
1502 }
1503 }
1504
/* zone-id flavored wrapper over zone_require_panic. */
__abortlike
static void
zone_id_require_panic(zone_id_t zid, void *addr)
{
    zone_require_panic(&zone_array[zid], addr);
}
1511
/*
 * Routines to panic if a pointer is not mapped to an expected zone.
 * This can be used as a means of pinning an object to the zone it is expected
 * to be a part of. Causes a panic if the address does not belong to any
 * specified zone, does not belong to any zone, has been freed and therefore
 * unmapped from the zone, or the pointer contains an uninitialized value that
 * does not belong to any zone.
 */
void
zone_require(zone_t zone, void *addr)
{
    vm_size_t esize = zone_elem_size(zone);

    if (__probable(from_zone_map(addr, esize))) {
        if (zone_has_index(zone, zone_index_from_ptr(addr))) {
            return;
        }
#if CONFIG_GZALLOC
    /* gzalloc-tracked zones allocate outside the zone map: accept those */
    } else if (__probable(zone->z_gzalloc_tracked)) {
        return;
#endif
    }
    zone_require_panic(zone, addr);
}
1536
/*
 * Same contract as zone_require(), keyed by zone id; @c esize is the
 * expected element size (the zone is not dereferenced on the fast path).
 */
void
zone_id_require(zone_id_t zid, vm_size_t esize, void *addr)
{
    if (__probable(from_zone_map(addr, esize))) {
        if (zid == zone_index_from_ptr(addr)) {
            return;
        }
#if CONFIG_GZALLOC
    } else if (__probable(zone_array[zid].z_gzalloc_tracked)) {
        return;
#endif
    }
    zone_id_require_panic(zid, addr);
}
1551
/*
 * Non-panicking membership test: does @c addr belong to @c zone?
 * (gzalloc-tracked zones answer true for out-of-map addresses.)
 */
bool
zone_owns(zone_t zone, void *addr)
{
    vm_size_t esize = zone_elem_size_safe(zone);

    if (__probable(from_zone_map(addr, esize))) {
        return zone_has_index(zone, zone_index_from_ptr(addr));
#if CONFIG_GZALLOC
    } else if (__probable(zone->z_gzalloc_tracked)) {
        return true;
#endif
    }
    return false;
}
1566
/*
 * Carve a permanent submap out of kernel_map for a zone region and return
 * its [addr, addr + size) range. KMS_NOFAIL means this cannot fail.
 *
 * @param addr     requested base address (may be relocated by kmem_suballoc)
 * @param size     size of the submap
 * @param flags    kmem_suballoc flags
 * @param tag      VM tag for accounting
 * @param new_map  out: the created submap
 */
static inline struct kmem_range
zone_kmem_suballoc(
	vm_offset_t             addr,
	vm_size_t               size,
	int                     flags,
	vm_tag_t                tag,
	vm_map_t               *new_map)
{
    struct kmem_range r;

    *new_map = kmem_suballoc(kernel_map, &addr, size,
        VM_MAP_CREATE_NEVER_FAULTS | VM_MAP_CREATE_DISABLE_HOLELIST,
        flags, KMS_PERMANENT | KMS_NOFAIL, tag).kmr_submap;

    r.min_address = addr;
    r.max_address = addr + size;
    return r;
}
1585
1586 #endif /* !ZALLOC_TEST */
1587 #pragma mark Zone bits allocator
1588
1589 /*!
1590 * @defgroup Zone Bitmap allocator
1591 * @{
1592 *
1593 * @brief
1594 * Functions implementing the zone bitmap allocator
1595 *
1596 * @discussion
1597 * The zone allocator maintains which elements are allocated or free in bitmaps.
1598 *
1599 * When the number of elements per page is smaller than 32, it is stored inline
1600 * on the @c zone_page_metadata structure (@c zm_inline_bitmap is set,
1601 * and @c zm_bitmap used for storage).
1602 *
1603 * When the number of elements is larger, then a bitmap is allocated from
 * a buddy allocator (implemented under the @c zba_* namespace). Pointers
1605 * to bitmaps are implemented as a packed 32 bit bitmap reference, stored in
1606 * @c zm_bitmap. The low 3 bits encode the scale (order) of the allocation in
1607 * @c ZBA_GRANULE units, and hence actual allocations encoded with that scheme
1608 * cannot be larger than 1024 bytes (8192 bits).
1609 *
 * This buddy allocator can actually accommodate allocations as large
1611 * as 8k on 16k systems and 2k on 4k systems.
1612 *
1613 * Note: @c zba_* functions are implementation details not meant to be used
1614 * outside of the allocation of the allocator itself. Interfaces to the rest of
1615 * the zone allocator are documented and not @c zba_* prefixed.
1616 */
1617
/* Size of one bitmap-allocator chunk (one max-sized page). */
#define ZBA_CHUNK_SIZE          PAGE_MAX_SIZE
/* Smallest allocatable unit: one 64-bit word of bitmap. */
#define ZBA_GRANULE             sizeof(uint64_t)
#define ZBA_GRANULE_BITS        (8 * sizeof(uint64_t))
/* Largest buddy order within a chunk. */
#define ZBA_MAX_ORDER           (PAGE_MAX_SHIFT - 4)
/* Largest order handed out to clients. */
#define ZBA_MAX_ALLOC_ORDER     7
/* Number of granules per chunk. */
#define ZBA_SLOTS               (ZBA_CHUNK_SIZE / ZBA_GRANULE)
static_assert(2ul * ZBA_GRANULE << ZBA_MAX_ORDER == ZBA_CHUNK_SIZE, "chunk sizes");
static_assert(ZBA_MAX_ALLOC_ORDER <= ZBA_MAX_ORDER, "ZBA_MAX_ORDER is enough");

/* Free-block link, stored inside the free granules themselves. */
struct zone_bits_chain {
    uint32_t zbc_next;
    uint32_t zbc_prev;
} __attribute__((aligned(ZBA_GRANULE)));

/* Free-list head for one buddy order. */
struct zone_bits_head {
    uint32_t zbh_next;
    uint32_t zbh_unused;
} __attribute__((aligned(ZBA_GRANULE)));

static_assert(sizeof(struct zone_bits_chain) == ZBA_GRANULE, "zbc size");
static_assert(sizeof(struct zone_bits_head) == ZBA_GRANULE, "zbh size");

/* Allocator-global metadata, stored after the first chunk's header. */
struct zone_bits_allocator_meta {
    uint32_t        zbam_chunks;             /* chunks initialized so far */
    uint32_t        __zbam_padding;
    struct zone_bits_head zbam_lists[ZBA_MAX_ORDER + 1]; /* per-order free lists */
};

/* Per-chunk header: buddy split bits, one bit per tree node. */
struct zone_bits_allocator_header {
    uint64_t zbah_bits[ZBA_SLOTS / (8 * sizeof(uint64_t))];
};
1649
1650 #if ZALLOC_TEST
/* Test harness hooks: base address and populate callback are injected. */
static struct zalloc_bits_allocator_test_setup {
    vm_offset_t zbats_base;
    void      (*zbats_populate)(vm_address_t addr, vm_size_t size);
} zba_test_info;

/* Base of the bitmap region under test. */
static struct zone_bits_allocator_header *
zba_base_header(void)
{
    return (struct zone_bits_allocator_header *)zba_test_info.zbats_base;
}

/* Fault in chunk @c n via the injected populate callback. */
static void
zba_populate(uint32_t n)
{
    vm_address_t base = zba_test_info.zbats_base;
    zba_test_info.zbats_populate(base + n * ZBA_CHUNK_SIZE, ZBA_CHUNK_SIZE);
}
1668 #else
/* Boot-time chunk used before the bits range is set up. */
__startup_data
static uint8_t zba_chunk_startup[ZBA_CHUNK_SIZE]
__attribute__((aligned(ZBA_CHUNK_SIZE)));
/* Serializes all bitmap allocator mutations and populates. */
static LCK_MTX_EARLY_DECLARE(zba_mtx, &zone_locks_grp);

/* Base of the bitmap region (start of the zi_bits_range submap). */
static struct zone_bits_allocator_header *
zba_base_header(void)
{
    return (struct zone_bits_allocator_header *)zone_info.zi_bits_range.min_address;
}

static void
zba_lock(void)
{
    lck_mtx_lock(&zba_mtx);
}

static void
zba_unlock(void)
{
    lck_mtx_unlock(&zba_mtx);
}
1691
/*
 * Fault in chunk @c n of the bitmap range, waiting for free pages as
 * needed. Called with the zba lock held; the lock is dropped around
 * VM_PAGE_WAIT(). Panics when the bitmap range itself is exhausted
 * (symptom of a zone memory leak).
 */
static void
zba_populate(uint32_t n)
{
    vm_size_t size = ZBA_CHUNK_SIZE;
    vm_address_t addr;

    addr = zone_info.zi_bits_range.min_address + n * size;
    if (addr >= zone_info.zi_bits_range.max_address) {
        /* out of VA: name the largest zone as the likely leaker */
        uint64_t zsize = 0;
        zone_t z = zone_find_largest(&zsize);
        panic("zba_populate: out of bitmap space, "
            "likely due to memory leak in zone [%s%s] "
            "(%luM, %d elements allocated)",
            zone_heap_name(z), zone_name(z),
            (unsigned long)zsize >> 20,
            zone_count_allocated(z));
    }

    for (;;) {
        kern_return_t kr = KERN_SUCCESS;

        if (0 == pmap_find_phys(kernel_pmap, addr)) {
            kr = kernel_memory_populate(addr, size,
                KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
                VM_KERN_MEMORY_OSFMK);
        }

        if (kr == KERN_SUCCESS) {
            return;
        }

        /* cannot page-wait under the lock: drop it, wait, retry */
        zba_unlock();
        VM_PAGE_WAIT();
        zba_lock();
    }
}
1728 #endif
1729
/* Allocator-global metadata lives right after the first chunk's header. */
__pure2
static struct zone_bits_allocator_meta *
zba_meta(void)
{
    return (struct zone_bits_allocator_meta *)&zba_base_header()[1];
}

/* The bitmap region viewed as an array of granules (64-bit slots). */
__pure2
static uint64_t *
zba_slot_base(void)
{
    return (uint64_t *)zba_base_header();
}

/* Address of chunk @c n. */
__pure2
static vm_address_t
zba_page_addr(uint32_t n)
{
    return (vm_address_t)zba_base_header() + n * ZBA_CHUNK_SIZE;
}

/* Free-list head for a given buddy order. */
__pure2
static struct zone_bits_head *
zba_head(uint32_t order)
{
    return &zba_meta()->zbam_lists[order];
}

/* Slot index of the free-list head for @c order (heads live in the meta). */
__pure2
static uint32_t
zba_head_index(uint32_t order)
{
    uint32_t hdr_size = sizeof(struct zone_bits_allocator_header) +
        offsetof(struct zone_bits_allocator_meta, zbam_lists);
    return (hdr_size / ZBA_GRANULE) + order;
}

/* Convert a slot index back into a chain pointer. */
__pure2
static struct zone_bits_chain *
zba_chain_for_index(uint32_t index)
{
    return (struct zone_bits_chain *)(zba_slot_base() + index);
}

/* Convert a chain pointer into its slot index. */
__pure2
static uint32_t
zba_chain_to_index(const struct zone_bits_chain *zbc)
{
    return (uint32_t)((const uint64_t *)zbc - zba_slot_base());
}
1780
/* Abort path: a free-list head's links no longer match its first block. */
__abortlike
static void
zba_head_corruption_panic(uint32_t order)
{
    panic("zone bits allocator head[%d:%p] is corrupt", order,
        zba_head(order));
}

/* Abort path: two adjacent free-list blocks disagree about their links. */
__abortlike
static void
zba_chain_corruption_panic(struct zone_bits_chain *a, struct zone_bits_chain *b)
{
    panic("zone bits allocator freelist is corrupt (%p <-> %p)", a, b);
}
1795
/*
 * Push free block @c zbc at the head of the order-@c order free list,
 * validating the old first block's back link (panics on corruption).
 */
static void
zba_push_block(struct zone_bits_chain *zbc, uint32_t order)
{
    struct zone_bits_head *hd = zba_head(order);
    uint32_t hd_index = zba_head_index(order);
    uint32_t index = zba_chain_to_index(zbc);
    struct zone_bits_chain *next;

    if (hd->zbh_next) {
        next = zba_chain_for_index(hd->zbh_next);
        /* old first block must point back at the list head */
        if (next->zbc_prev != hd_index) {
            zba_head_corruption_panic(order);
        }
        next->zbc_prev = index;
    }
    zbc->zbc_next = hd->zbh_next;
    zbc->zbc_prev = hd_index;
    hd->zbh_next = index;
}
1815
/*
 * Unlink @c zbc from its free list (works for any position, since list
 * heads share the chain layout); panics on link corruption.
 */
static void
zba_remove_block(struct zone_bits_chain *zbc)
{
    struct zone_bits_chain *prev = zba_chain_for_index(zbc->zbc_prev);
    uint32_t index = zba_chain_to_index(zbc);

    if (prev->zbc_next != index) {
        zba_chain_corruption_panic(prev, zbc);
    }
    /* splice out; assignment-in-condition checks for a successor */
    if ((prev->zbc_next = zbc->zbc_next)) {
        struct zone_bits_chain *next = zba_chain_for_index(zbc->zbc_next);
        if (next->zbc_prev != index) {
            zba_chain_corruption_panic(zbc, next);
        }
        next->zbc_prev = zbc->zbc_prev;
    }
}
1833
/*
 * Pop the first free block of order @c order, or return 0 when the
 * list is empty.
 */
static vm_address_t
zba_try_pop_block(uint32_t order)
{
    struct zone_bits_head *hd = zba_head(order);
    struct zone_bits_chain *zbc;

    if (hd->zbh_next == 0) {
        return 0;
    }

    zbc = zba_chain_for_index(hd->zbh_next);
    zba_remove_block(zbc);
    return (vm_address_t)zbc;
}
1848
1849 static struct zone_bits_allocator_header *
zba_header(vm_offset_t addr)1850 zba_header(vm_offset_t addr)
1851 {
1852 addr &= -(vm_offset_t)ZBA_CHUNK_SIZE;
1853 return (struct zone_bits_allocator_header *)addr;
1854 }
1855
/* Parent of a buddy-tree node (root is node 0). */
static size_t
zba_node_parent(size_t node)
{
    return (node - 1) >> 1;
}

/* Left child of a buddy-tree node. */
static size_t
zba_node_left_child(size_t node)
{
    return 2 * node + 1;
}

/* Sibling (buddy) of a buddy-tree node: flip the low bit of (node - 1). */
static size_t
zba_node_buddy(size_t node)
{
    return ((node - 1) ^ 1) + 1;
}
1873
/*
 * Buddy-tree node index for the block at @c addr of order @c order:
 * granule offset within the chunk, scaled by order, plus the index of
 * the first node of that level.
 */
static size_t
zba_node(vm_offset_t addr, uint32_t order)
{
    vm_offset_t offs = (addr % ZBA_CHUNK_SIZE) / ZBA_GRANULE;
    return (offs >> order) + (1 << (ZBA_MAX_ORDER - order + 1)) - 1;
}
1880
/* Inverse of zba_node(): address of the block for tree node @c node. */
static struct zone_bits_chain *
zba_chain_for_node(struct zone_bits_allocator_header *zbah, size_t node, uint32_t order)
{
    vm_offset_t offs = (node - (1 << (ZBA_MAX_ORDER - order + 1)) + 1) << order;
    return (struct zone_bits_chain *)((vm_offset_t)zbah + offs * ZBA_GRANULE);
}
1887
1888 static void
zba_node_flip_split(struct zone_bits_allocator_header * zbah,size_t node)1889 zba_node_flip_split(struct zone_bits_allocator_header *zbah, size_t node)
1890 {
1891 zbah->zbah_bits[node / 64] ^= 1ull << (node % 64);
1892 }
1893
1894 static bool
zba_node_is_split(struct zone_bits_allocator_header * zbah,size_t node)1895 zba_node_is_split(struct zone_bits_allocator_header *zbah, size_t node)
1896 {
1897 return zbah->zbah_bits[node / 64] & (1ull << (node % 64));
1898 }
1899
/*
 * Frees the block of size (ZBA_GRANULE << order) at @c addr back to
 * the buddy allocator, coalescing with its buddy while possible,
 * then pushes the resulting block on the freelist of its final order.
 */
static void
zba_free(vm_offset_t addr, uint32_t order)
{
	struct zone_bits_allocator_header *zbah = zba_header(addr);
	struct zone_bits_chain *zbc;
	size_t node = zba_node(addr, order);

	while (node) {
		size_t parent = zba_node_parent(node);

		/*
		 * The parent bit is flipped on each child alloc/free, so it
		 * effectively XORs the two children's states: after flipping
		 * here, a set bit means our buddy is still allocated.
		 */
		zba_node_flip_split(zbah, parent);
		if (zba_node_is_split(zbah, parent)) {
			/* buddy is busy: cannot coalesce further */
			break;
		}

		/* buddy is free: unlink it and merge one level up */
		zbc = zba_chain_for_node(zbah, zba_node_buddy(node), order);
		zba_remove_block(zbc);
		order++;
		node = parent;
	}

	zba_push_block(zba_chain_for_node(zbah, node, order), order);
}
1923
1924 static vm_size_t
zba_chunk_header_size(uint32_t n)1925 zba_chunk_header_size(uint32_t n)
1926 {
1927 vm_size_t hdr_size = sizeof(struct zone_bits_allocator_header);
1928 if (n == 0) {
1929 hdr_size += sizeof(struct zone_bits_allocator_meta);
1930 }
1931 return hdr_size;
1932 }
1933
/*
 * Carves chunk @c n into the largest possible buddy blocks, seeds the
 * freelists with them, and publishes the new chunk count. The header
 * area at the front of the chunk is kept out of circulation.
 */
static void
zba_init_chunk(uint32_t n)
{
	vm_size_t hdr_size = zba_chunk_header_size(n);
	vm_offset_t page = zba_page_addr(n);
	struct zone_bits_allocator_header *zbah = zba_header(page);
	vm_size_t size = ZBA_CHUNK_SIZE;
	size_t node;

	/*
	 * Hand out blocks from the end of the chunk, largest order first,
	 * stopping any block that would overlap the reserved header.
	 */
	for (uint32_t o = ZBA_MAX_ORDER + 1; o-- > 0;) {
		if (size < hdr_size + (ZBA_GRANULE << o)) {
			continue;
		}
		size -= ZBA_GRANULE << o;
		node = zba_node(page + size, o);
		/* flip the parent so the (unreleased) buddy reads as busy */
		zba_node_flip_split(zbah, zba_node_parent(node));
		zba_push_block(zba_chain_for_node(zbah, node, o), o);
	}

	zba_meta()->zbam_chunks = n + 1;
}
1955
1956 __attribute__((noinline))
1957 static void
zba_grow(void)1958 zba_grow(void)
1959 {
1960 uint32_t chunk = zba_meta()->zbam_chunks;
1961
1962 zba_populate(chunk);
1963 if (zba_meta()->zbam_chunks == chunk) {
1964 zba_init_chunk(chunk);
1965 }
1966 }
1967
/*
 * Allocates a block of size (ZBA_GRANULE << order) from the buddy
 * allocator, growing the backing store when no block of any order is
 * available.
 */
static vm_offset_t
zba_alloc(uint32_t order)
{
	struct zone_bits_allocator_header *zbah;
	uint32_t cur = order;
	vm_address_t addr;
	size_t node;

	/*
	 * Try increasingly large orders; once even the largest order is
	 * exhausted, grow by one chunk and restart at the wanted order.
	 */
	while ((addr = zba_try_pop_block(cur)) == 0) {
		if (cur++ >= ZBA_MAX_ORDER) {
			zba_grow();
			cur = order;
		}
	}

	zbah = zba_header(addr);
	node = zba_node(addr, cur);
	/* flip the parent: one of its children just became allocated */
	zba_node_flip_split(zbah, zba_node_parent(node));
	/*
	 * If the block found is larger than requested, split it down:
	 * keep descending into the left child and free the right
	 * sibling at each level.
	 */
	while (cur > order) {
		cur--;
		zba_node_flip_split(zbah, node);
		node = zba_node_left_child(node);
		zba_push_block(zba_chain_for_node(zbah, node + 1, cur), cur);
	}

	return addr;
}
1995
/*
 * Helpers to address a bitmap made of @c type-sized words:
 * - zba_map_index: index of the word holding bit @c n
 * - zba_map_bit: mask with bit @c n set within its word
 * - zba_map_mask_lt / _ge: mask of all bits below / at-and-above @c n
 */
#define zba_map_index(type, n) (n / (8 * sizeof(type)))
#define zba_map_bit(type, n) ((type)1 << (n % (8 * sizeof(type))))
#define zba_map_mask_lt(type, n) (zba_map_bit(type, n) - 1)
#define zba_map_mask_ge(type, n) ((type)-zba_map_bit(type, n))
2000
2001 #if !ZALLOC_TEST
/*
 * A bitmap reference (out-of-line zm_bitmap) packs the allocation
 * order in its low 3 bits; extract it.
 */
static uint32_t
zba_bits_ref_order(uint32_t bref)
{
	return bref % 8u;
}
2007
2008 static bitmap_t *
zba_bits_ref_ptr(uint32_t bref)2009 zba_bits_ref_ptr(uint32_t bref)
2010 {
2011 return zba_slot_base() + (bref >> 3);
2012 }
2013
/*
 * Scans the inline bitmaps of a chunk for a set bit, clears it, and
 * returns the corresponding element index.
 *
 * The scan starts at @c eidx (a round-robin hint) so consecutive
 * allocations don't always reuse the lowest free slot, then wraps
 * around every bitmap word of the chunk.
 *
 * Panics if all bitmaps are empty: the caller believed free elements
 * existed.
 */
static vm_offset_t
zba_scan_bitmap_inline(zone_t zone, struct zone_page_metadata *meta,
    zalloc_flags_t flags, vm_offset_t eidx)
{
	size_t i = eidx / 32;
	uint32_t map;

	if (eidx % 32) {
		/* finish the partially scanned word: only bits >= eidx */
		map = meta[i].zm_bitmap & zba_map_mask_ge(uint32_t, eidx);
		if (map) {
			eidx = __builtin_ctz(map);
			meta[i].zm_bitmap ^= 1u << eidx;
			return i * 32 + eidx;
		}
		i++;
	}

	/*
	 * One 32-bit zm_bitmap per page metadata; per-CPU chunks keep
	 * one word per CPU instead of zm_chunk_len.
	 */
	uint32_t chunk_len = meta->zm_chunk_len;
	if (flags & Z_PCPU) {
		chunk_len = zpercpu_count();
	}
	for (int j = 0; j < chunk_len; j++, i++) {
		if (i >= chunk_len) {
			i = 0; /* wrap around */
		}
		if (__probable(map = meta[i].zm_bitmap)) {
			meta[i].zm_bitmap &= map - 1; /* clear lowest set bit */
			return i * 32 + __builtin_ctz(map);
		}
	}

	zone_page_meta_accounting_panic(zone, meta, "zm_bitmap");
}
2047
/*
 * Scans the out-of-line bitmap of a chunk for a set bit, clears it,
 * and returns the corresponding element index.
 *
 * Like zba_scan_bitmap_inline(), the scan starts at the round-robin
 * hint @c eidx and wraps over all 64-bit words of the bitmap.
 *
 * Panics if no bit is set: the caller believed free elements existed.
 */
static vm_offset_t
zba_scan_bitmap_ref(zone_t zone, struct zone_page_metadata *meta,
    vm_offset_t eidx)
{
	uint32_t bits_size = 1 << zba_bits_ref_order(meta->zm_bitmap);
	bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
	size_t i = eidx / 64;
	uint64_t map;

	if (eidx % 64) {
		/* finish the partially scanned word: only bits >= eidx */
		map = bits[i] & zba_map_mask_ge(uint64_t, eidx);
		if (map) {
			eidx = __builtin_ctzll(map);
			bits[i] ^= 1ull << eidx;
			return i * 64 + eidx;
		}
		i++;
	}

	for (int j = 0; j < bits_size; i++, j++) {
		if (i >= bits_size) {
			i = 0; /* wrap around */
		}
		if (__probable(map = bits[i])) {
			bits[i] &= map - 1; /* clear lowest set bit */
			return i * 64 + __builtin_ctzll(map);
		}
	}

	zone_page_meta_accounting_panic(zone, meta, "zm_bitmap");
}
2079
2080 /*!
2081 * @function zone_meta_find_and_clear_bit
2082 *
2083 * @brief
2084 * The core of the bitmap allocator: find a bit set in the bitmaps.
2085 *
2086 * @discussion
2087 * This method will round robin through available allocations,
2088 * with a per-core memory of the last allocated element index allocated.
2089 *
2090 * This is done in order to avoid a fully LIFO behavior which makes exploiting
2091 * double-free bugs way too practical.
2092 *
2093 * @param zone The zone we're allocating from.
2094 * @param meta The main metadata for the chunk being allocated from.
2095 * @param flags the alloc flags (for @c Z_PCPU).
2096 */
2097 static vm_offset_t
zone_meta_find_and_clear_bit(zone_t zone,struct zone_page_metadata * meta,zalloc_flags_t flags)2098 zone_meta_find_and_clear_bit(zone_t zone, struct zone_page_metadata *meta,
2099 zalloc_flags_t flags)
2100 {
2101 zone_stats_t zs = zpercpu_get(zone->z_stats);
2102 vm_offset_t eidx = zs->zs_alloc_rr + 1;
2103
2104 if (meta->zm_inline_bitmap) {
2105 eidx = zba_scan_bitmap_inline(zone, meta, flags, eidx);
2106 } else {
2107 eidx = zba_scan_bitmap_ref(zone, meta, eidx);
2108 }
2109 zs->zs_alloc_rr = (uint16_t)eidx;
2110 return eidx;
2111 }
2112
2113 /*!
2114 * @function zone_meta_bits_init
2115 *
2116 * @brief
2117 * Initializes the zm_bitmap field(s) for a newly assigned chunk.
2118 *
2119 * @param meta The main metadata for the initialized chunk.
2120 * @param count The number of elements the chunk can hold
2121 * (which might be partial for partially populated chunks).
 * @param nbits The maximum number of bits that will be used.
2123 */
static void
zone_meta_bits_init(struct zone_page_metadata *meta,
    uint32_t count, uint32_t nbits)
{
	static_assert(ZONE_MAX_ALLOC_SIZE / ZONE_MIN_ELEM_SIZE <=
	    ZBA_GRANULE_BITS << ZBA_MAX_ORDER, "bitmaps will be large enough");

	if (meta->zm_inline_bitmap) {
		/*
		 * We're called with the metadata zm_bitmap fields already
		 * zeroed out.
		 */
		for (size_t i = 0; 32 * i < count; i++) {
			if (32 * i + 32 <= count) {
				/* fully covered word: every element starts free */
				meta[i].zm_bitmap = ~0u;
			} else {
				/* trailing partial word: only bits below count */
				meta[i].zm_bitmap = zba_map_mask_lt(uint32_t, count);
			}
		}
	} else {
		/* out of line: size the bitmap for nbits, fill it for count */
		uint32_t order = flsll((nbits - 1) / ZBA_GRANULE_BITS);
		uint64_t *bits;

		assert(order <= ZBA_MAX_ALLOC_ORDER);
		assert(count <= ZBA_GRANULE_BITS << order);

		zba_lock();
		bits = (uint64_t *)zba_alloc(order);
		zba_unlock();

		for (size_t i = 0; i < 1u << order; i++) {
			if (64 * i + 64 <= count) {
				bits[i] = ~0ull;
			} else if (64 * i < count) {
				bits[i] = zba_map_mask_lt(uint64_t, count);
			} else {
				bits[i] = 0ull;
			}
		}

		/* pack (offset from the slot base, order) into zm_bitmap */
		meta->zm_bitmap = (uint32_t)((vm_offset_t)bits -
		    (vm_offset_t)zba_slot_base()) + order;
	}
}
2168
2169 /*!
2170 * @function zone_meta_bits_merge
2171 *
2172 * @brief
2173 * Adds elements <code>[start, end)</code> to a chunk being extended.
2174 *
2175 * @param meta The main metadata for the extended chunk.
2176 * @param start The index of the first element to add to the chunk.
2177 * @param end The index of the last (exclusive) element to add.
2178 */
static void
zone_meta_bits_merge(struct zone_page_metadata *meta,
    uint32_t start, uint32_t end)
{
	if (meta->zm_inline_bitmap) {
		while (start < end) {
			size_t s_i = start / 32;
			size_t s_e = end / 32;

			if (s_i == s_e) {
				/* start and end fall inside the same word */
				meta[s_i].zm_bitmap |= zba_map_mask_lt(uint32_t, end) &
				    zba_map_mask_ge(uint32_t, start);
				break;
			}

			/* set the tail of this word, advance to the next one */
			meta[s_i].zm_bitmap |= zba_map_mask_ge(uint32_t, start);
			start += 32 - (start % 32);
		}
	} else {
		uint64_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);

		while (start < end) {
			size_t s_i = start / 64;
			size_t s_e = end / 64;

			if (s_i == s_e) {
				/* start and end fall inside the same word */
				bits[s_i] |= zba_map_mask_lt(uint64_t, end) &
				    zba_map_mask_ge(uint64_t, start);
				break;
			}
			/* set the tail of this word, advance to the next one */
			bits[s_i] |= zba_map_mask_ge(uint64_t, start);
			start += 64 - (start % 64);
		}
	}
}
2214
2215 /*!
2216 * @function zone_bits_free
2217 *
2218 * @brief
2219 * Frees a bitmap to the zone bitmap allocator.
2220 *
2221 * @param bref
2222 * A bitmap reference set by @c zone_meta_bits_init() in a @c zm_bitmap field.
2223 */
2224 static void
zone_bits_free(uint32_t bref)2225 zone_bits_free(uint32_t bref)
2226 {
2227 zba_lock();
2228 zba_free((vm_offset_t)zba_bits_ref_ptr(bref), zba_bits_ref_order(bref));
2229 zba_unlock();
2230 }
2231
2232 /*!
2233 * @function zone_meta_is_free
2234 *
2235 * @brief
2236 * Returns whether a given element appears free.
2237 */
2238 static bool
zone_meta_is_free(struct zone_page_metadata * meta,zone_element_t ze)2239 zone_meta_is_free(struct zone_page_metadata *meta, zone_element_t ze)
2240 {
2241 vm_offset_t eidx = zone_element_idx(ze);
2242 if (meta->zm_inline_bitmap) {
2243 uint32_t bit = zba_map_bit(uint32_t, eidx);
2244 return meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit;
2245 } else {
2246 bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
2247 uint64_t bit = zba_map_bit(uint64_t, eidx);
2248 return bits[zba_map_index(uint64_t, eidx)] & bit;
2249 }
2250 }
2251
2252 /*!
2253 * @function zone_meta_mark_free
2254 *
2255 * @brief
2256 * Marks an element as free and returns whether it was marked as used.
2257 */
2258 static bool
zone_meta_mark_free(struct zone_page_metadata * meta,zone_element_t ze)2259 zone_meta_mark_free(struct zone_page_metadata *meta, zone_element_t ze)
2260 {
2261 vm_offset_t eidx = zone_element_idx(ze);
2262
2263 if (meta->zm_inline_bitmap) {
2264 uint32_t bit = zba_map_bit(uint32_t, eidx);
2265 if (meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit) {
2266 return false;
2267 }
2268 meta[zba_map_index(uint32_t, eidx)].zm_bitmap ^= bit;
2269 } else {
2270 bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
2271 uint64_t bit = zba_map_bit(uint64_t, eidx);
2272 if (bits[zba_map_index(uint64_t, eidx)] & bit) {
2273 return false;
2274 }
2275 bits[zba_map_index(uint64_t, eidx)] ^= bit;
2276 }
2277 return true;
2278 }
2279
2280 /*!
2281 * @function zone_meta_mark_used
2282 *
2283 * @brief
2284 * Marks an element as used and returns whether it was marked as free
2285 */
2286 static bool
zone_meta_mark_used(struct zone_page_metadata * meta,zone_element_t ze)2287 zone_meta_mark_used(struct zone_page_metadata *meta, zone_element_t ze)
2288 {
2289 vm_offset_t eidx = zone_element_idx(ze);
2290
2291 if (meta->zm_inline_bitmap) {
2292 uint32_t bit = zba_map_bit(uint32_t, eidx);
2293 if (meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit) {
2294 meta[zba_map_index(uint32_t, eidx)].zm_bitmap ^= bit;
2295 return true;
2296 }
2297 } else {
2298 bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
2299 uint64_t bit = zba_map_bit(uint64_t, eidx);
2300 if (bits[zba_map_index(uint64_t, eidx)] & bit) {
2301 bits[zba_map_index(uint64_t, eidx)] ^= bit;
2302 return true;
2303 }
2304 }
2305 return false;
2306 }
2307
2308 #endif /* !ZALLOC_TEST */
2309 /*! @} */
2310 #pragma mark ZTAGS
2311 #if !ZALLOC_TEST
2312 #if VM_TAG_SIZECLASSES
2313 /*
2314 * Zone tagging allows for per "tag" accounting of allocations for the kalloc
2315 * zones only.
2316 *
2317 * There are 3 kinds of tags that can be used:
2318 * - pre-registered VM_KERN_MEMORY_*
2319 * - dynamic tags allocated per call sites in core-kernel (using vm_tag_alloc())
2320 * - per-kext tags computed by IOKit (using the magic Z_VM_TAG_BT_BIT marker).
2321 *
2322 * The VM tracks the statistics in lazily allocated structures.
2323 * See vm_tag_will_update_zone(), vm_tag_update_zone_size().
2324 *
2325 * If for some reason the requested tag cannot be accounted for,
2326 * the tag is forced to VM_KERN_MEMORY_KALLOC which is pre-allocated.
2327 *
2328 * Each allocated element also remembers the tag it was assigned,
2329 * in its ztSlot() which lets zalloc/zfree update statistics correctly.
2330 */
2331
2332 // for zones with tagging enabled:
2333
2334 // calculate a pointer to the tag base entry,
2335 // holding either a uint32_t the first tag offset for a page in the zone map,
2336 // or two uint16_t tags if the page can only hold one or two elements
2337
2338 #define ZTAGBASE(zone, element) \
2339 (&((uint32_t *)zone_tagbase_range.min_address)[atop((element) - \
2340 zone_info.zi_map_range.min_address)])
2341
2342 static struct kmem_range zone_tagbase_range;
2343 static vm_map_t zone_tagbase_map;
2344 static vm_map_size_t zone_tagbase_map_size;
2345
2346 static struct kmem_range zone_tags_range;
2347 static vm_map_t zone_tags_map;
2348 static vm_map_size_t zone_tags_map_size;
2349
2350 // simple heap allocator for allocating the tags for new memory
2351
2352 static LCK_MTX_EARLY_DECLARE(ztLock, &zone_locks_grp); /* heap lock */
2353
2354 /*
2355 * Array of all sizeclasses used by kalloc variants so that we can
2356 * have accounting per size class for each kalloc callsite
2357 */
2358 uint16_t zone_tags_sizeclasses[VM_TAG_SIZECLASSES];
2359
enum{
	ztFreeIndexCount = 8,                   /* number of size-ordered freelist heads */
	ztFreeIndexMax = (ztFreeIndexCount - 1),
	ztTagsPerBlock = 4                      /* tags packed into one heap block */
};

/*
 * Node of the tag heap: blocks form circular doubly linked freelists
 * threaded through the ztBlocks array itself.
 */
struct ztBlock {
#if __LITTLE_ENDIAN__
	uint64_t free:1,
	    next:21,
	    prev:21,
	    size:21;
#else
// ztBlock needs free bit least significant
#error !__LITTLE_ENDIAN__
#endif
};
typedef struct ztBlock ztBlock;

static ztBlock * ztBlocks;       /* heap backing store (zone_tags_map) */
static uint32_t ztBlocksCount;   /* total block capacity of the heap */
static uint32_t ztBlocksFree;    /* blocks currently sitting on freelists */
2382
__startup_func
void
__zone_site_register(vm_allocation_site_t *site)
{
	/* pre-register allocation sites only when zone tagging is enabled */
	if (zone_tagging_on) {
		vm_tag_alloc(site);
	}
}
2391
/*
 * Returns ceil(log2(size)); by convention ztLog2up(1) == 0.
 */
static uint32_t
ztLog2up(uint32_t size)
{
	if (size == 1) {
		return 0;
	}
	return 32 - __builtin_clz(size - 1);
}
2402
// pointer to the tag for an element
static vm_tag_t *
ztSlot(zone_t zone, vm_offset_t element)
{
	vm_tag_t *result;
	if (zone->z_tags_inline) {
		/*
		 * Inline: the page's ZTAGBASE uint32_t stores the tags
		 * directly (two uint16_t halves); elements past the first
		 * use the second half.
		 */
		result = (vm_tag_t *)ZTAGBASE(zone, element);
		if ((PAGE_MASK & element) >= zone_elem_size(zone)) {
			result++;
		}
	} else {
		/*
		 * Out of line: ZTAGBASE holds the page's first tag index in
		 * the tags heap; add the element's index within the page.
		 */
		result = &((vm_tag_t *)zone_tags_range.min_address)[ZTAGBASE(zone,
		    element)[0] + (element & PAGE_MASK) / zone_elem_size(zone)];
	}
	return result;
}
2419
/*
 * Returns floor(log2(size)); @c size must be non-zero
 * (__builtin_clz(0) is undefined).
 */
static uint32_t
ztLog2down(uint32_t size)
{
	return 31 - __builtin_clz(size);
}
2426
/*
 * Ensures every page overlapping [address, address + size) is
 * resident, populating any unmapped page from the kernel object.
 */
static void
ztFault(const void * address, size_t size, uint32_t flags)
{
	vm_map_offset_t addr = (vm_map_offset_t) address;
	vm_map_offset_t page, end;

	page = trunc_page(addr);
	end = round_page(addr + size);

	for (; page < end; page += page_size) {
		if (!pmap_find_phys(kernel_pmap, page)) {
			kernel_memory_populate(page, PAGE_SIZE,
			    KMA_NOFAIL | KMA_KOBJECT | flags,
			    VM_KERN_MEMORY_DIAG);
		}
	}
}
2444
2445 static boolean_t
ztPresent(const void * address,size_t size)2446 ztPresent(const void * address, size_t size)
2447 {
2448 vm_map_offset_t addr = (vm_map_offset_t) address;
2449 vm_map_offset_t page, end;
2450 boolean_t result;
2451
2452 page = trunc_page(addr);
2453 end = round_page(addr + size);
2454 for (result = TRUE; (page < end); page += page_size) {
2455 result = pmap_find_phys(kernel_pmap, page);
2456 if (!result) {
2457 break;
2458 }
2459 }
2460 return result;
2461 }
2462
2463
void __unused
ztDump(boolean_t sanity);
/*
 * Debug helper: dumps (or, with @c sanity, validates) the freelists
 * of the tag heap.
 *
 * NOTE(review): in sanity mode the `continue` skips the
 * `p = ztBlocks[p].next` advance, so each list's head is checked but
 * the walk stops after one iteration — confirm against upstream
 * whether that is intended.
 */
void __unused
ztDump(boolean_t sanity)
{
	uint32_t q, cq, p;

	for (q = 0; q <= ztFreeIndexMax; q++) {
		p = q;
		do{
			if (sanity) {
				/* the list's index must match the block size class */
				cq = ztLog2down(ztBlocks[p].size);
				if (cq > ztFreeIndexMax) {
					cq = ztFreeIndexMax;
				}
				if (!ztBlocks[p].free
				    || ((p != q) && (q != cq))
				    || (ztBlocks[ztBlocks[p].next].prev != p)
				    || (ztBlocks[ztBlocks[p].prev].next != p)) {
					kprintf("zterror at %d", p);
					ztDump(FALSE);
					kprintf("zterror at %d", p);
					assert(FALSE);
				}
				continue;
			}
			kprintf("zt[%03d]%c %d, %d, %d\n",
			    p, ztBlocks[p].free ? 'F' : 'A',
			    ztBlocks[p].next, ztBlocks[p].prev,
			    ztBlocks[p].size);
			p = ztBlocks[p].next;
			if (p == q) {
				break;
			}
		}while (p != q);
		if (!sanity) {
			printf("\n");
		}
	}
	if (!sanity) {
		printf("-----------------------\n");
	}
}
2507
2508
2509
/*
 * Dequeues block @c idx from its circular freelist by linking its
 * neighbors to each other. Wrapped in do/while(0) so the macro
 * behaves as a single statement at any call site (CERT PRE10-C).
 */
#define ZTBDEQ(idx)                                                       \
	do {                                                              \
	        ztBlocks[ztBlocks[(idx)].prev].next = ztBlocks[(idx)].next; \
	        ztBlocks[ztBlocks[(idx)].next].prev = ztBlocks[(idx)].prev; \
	} while (0)
2513
/*
 * Returns @c count heap blocks starting at @c index to the tag heap,
 * coalescing with free neighbors and queueing the result on the
 * size-ordered freelist. All visible callers hold ztLock.
 */
static void
ztFree(zone_t zone __unused, uint32_t index, uint32_t count)
{
	uint32_t q, w, p, size, merge;

	assert(count);
	ztBlocksFree += count;

	// merge with the block following the freed range
	merge = (index + count);
	if ((merge < ztBlocksCount)
	    && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
	    && ztBlocks[merge].free) {
		ZTBDEQ(merge);
		count += ztBlocks[merge].size;
	}

	// merge with the preceding block (its last entry is at index - 1)
	merge = (index - 1);
	if ((merge > ztFreeIndexMax)
	    && ztPresent(&ztBlocks[merge], sizeof(ztBlocks[merge]))
	    && ztBlocks[merge].free) {
		size = ztBlocks[merge].size;
		count += size;
		index -= size;
		ZTBDEQ(index);
	}

	q = ztLog2down(count);
	if (q > ztFreeIndexMax) {
		q = ztFreeIndexMax;
	}
	w = q;
	// queue in order of size
	while (TRUE) {
		p = ztBlocks[w].next;
		if (p == q) {
			break;
		}
		if (ztBlocks[p].size >= count) {
			break;
		}
		w = p;
	}
	/* link the block between w and p */
	ztBlocks[p].prev = index;
	ztBlocks[w].next = index;

	// fault in first
	ztFault(&ztBlocks[index], sizeof(ztBlocks[index]), 0);

	// mark first & last with free flag and size
	ztBlocks[index].free = TRUE;
	ztBlocks[index].size = count;
	ztBlocks[index].prev = w;
	ztBlocks[index].next = p;
	if (count > 1) {
		index += (count - 1);
		// fault in last
		ztFault(&ztBlocks[index], sizeof(ztBlocks[index]), 0);
		ztBlocks[index].free = TRUE;
		ztBlocks[index].size = count;
	}
}
2577
/*
 * Allocates @c count contiguous heap blocks from the tag heap.
 *
 * Walks the size-ordered freelists starting at the smallest list that
 * could hold a fit; any unused remainder of the chosen block is given
 * back via ztFree(). Returns the block index, or -1U when nothing
 * fits. All visible callers hold ztLock.
 */
static uint32_t
ztAlloc(zone_t zone, uint32_t count)
{
	uint32_t q, w, p, leftover;

	assert(count);

	q = ztLog2up(count);
	if (q > ztFreeIndexMax) {
		q = ztFreeIndexMax;
	}
	do{
		w = q;
		while (TRUE) {
			p = ztBlocks[w].next;
			if (p == q) {
				/* list exhausted: try the next size class */
				break;
			}
			if (ztBlocks[p].size >= count) {
				// dequeue, mark both ends allocated
				ztBlocks[w].next = ztBlocks[p].next;
				ztBlocks[ztBlocks[p].next].prev = w;
				ztBlocks[p].free = FALSE;
				ztBlocksFree -= ztBlocks[p].size;
				if (ztBlocks[p].size > 1) {
					ztBlocks[p + ztBlocks[p].size - 1].free = FALSE;
				}

				// fault all the allocation
				ztFault(&ztBlocks[p], count * sizeof(ztBlocks[p]), 0);
				// mark last as allocated
				if (count > 1) {
					ztBlocks[p + count - 1].free = FALSE;
				}
				// free remainder
				leftover = ztBlocks[p].size - count;
				if (leftover) {
					ztFree(zone, p + ztBlocks[p].size - leftover, leftover);
				}

				return p;
			}
			w = p;
		}
		q++;
	}while (q <= ztFreeIndexMax);

	return -1U;
}
2627
/*
 * Startup: carves out the tagbase and tags submaps, then seeds the
 * tag heap with its queue heads followed by one big free run.
 */
__startup_func
static void
zone_tagging_init(void)
{
	// allocate submaps VM_KERN_MEMORY_DIAG
	zone_tagbase_range = zone_kmem_suballoc(zone_tagbase_range.min_address,
	    zone_tagbase_map_size, VM_FLAGS_FIXED_RANGE_SUBALLOC,
	    VM_KERN_MEMORY_DIAG, &zone_tagbase_map);

	zone_tags_range = zone_kmem_suballoc(zone_tags_range.min_address,
	    zone_tags_map_size, VM_FLAGS_FIXED_RANGE_SUBALLOC, VM_KERN_MEMORY_DIAG,
	    &zone_tags_map);

	ztBlocks = (ztBlock *) zone_tags_range.min_address;
	ztBlocksCount = (uint32_t)(zone_tags_map_size / sizeof(ztBlock));

	// initialize the qheads
	lck_mtx_lock(&ztLock);

	ztFault(&ztBlocks[0], sizeof(ztBlocks[0]), 0);
	for (uint32_t idx = 0; idx < ztFreeIndexCount; idx++) {
		/* each queue head is a self-linked, zero-sized free block */
		ztBlocks[idx].free = TRUE;
		ztBlocks[idx].next = idx;
		ztBlocks[idx].prev = idx;
		ztBlocks[idx].size = 0;
	}
	// free remaining space
	ztFree(NULL, ztFreeIndexCount, ztBlocksCount - ztFreeIndexCount);

	lck_mtx_unlock(&ztLock);
}
2659
/*
 * Sets up tag storage for @c size bytes of zone memory at @c mem:
 * faults in the per-page tagbase entries and, for zones whose tags
 * live out of line, allocates heap blocks and records each page's
 * first tag index.
 */
static void
ztMemoryAdd(zone_t zone, vm_offset_t mem, vm_size_t size)
{
	uint32_t * tagbase;
	uint32_t count, block, blocks, idx;
	size_t pages;

	pages = atop(size);
	tagbase = ZTAGBASE(zone, mem);

	lck_mtx_lock(&ztLock);

	// fault tagbase
	ztFault(tagbase, pages * sizeof(uint32_t), 0);

	if (!zone->z_tags_inline) {
		// allocate tags
		count = (uint32_t)(size / zone_elem_size(zone));
		blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
		block = ztAlloc(zone, blocks);
		if (-1U == block) {
			/* dump the heap state before asserting */
			ztDump(false);
		}
		assert(-1U != block);
	}

	lck_mtx_unlock(&ztLock);

	if (!zone->z_tags_inline) {
		// set tag base for each page
		block *= ztTagsPerBlock;
		for (idx = 0; idx < pages; idx++) {
			vm_offset_t esize = zone_elem_size(zone);
			/* first whole element on this page, in tag units */
			tagbase[idx] = block + (uint32_t)((ptoa(idx) + esize - 1) / esize);
		}
	}
}
2697
/*
 * Tears down tag storage for zone memory being returned: poisons the
 * per-page tagbase entries and frees the out-of-line tag blocks.
 */
static void
ztMemoryRemove(zone_t zone, vm_offset_t mem, vm_size_t size)
{
	uint32_t * tagbase;
	uint32_t count, block, blocks, idx;
	size_t pages;

	// set tag base for each page
	pages = atop(size);
	tagbase = ZTAGBASE(zone, mem);
	/* remember the first page's tag index before poisoning */
	block = tagbase[0];
	for (idx = 0; idx < pages; idx++) {
		tagbase[idx] = 0xFFFFFFFF;
	}

	lck_mtx_lock(&ztLock);
	if (!zone->z_tags_inline) {
		count = (uint32_t)(size / zone_elem_size(zone));
		blocks = ((count + ztTagsPerBlock - 1) / ztTagsPerBlock);
		assert(block != 0xFFFFFFFF);
		block /= ztTagsPerBlock;
		ztFree(NULL /* zone is unlocked */, block, blocks);
	}

	lck_mtx_unlock(&ztLock);
}
2724
/*
 * Maps a kalloc sizeclass index to the zone index recorded in
 * zone_tags_sizeclasses.
 */
uint16_t
zone_index_from_tag_index(uint32_t sizeclass_idx)
{
	return zone_tags_sizeclasses[sizeclass_idx];
}
2730
2731 #endif /* VM_TAG_SIZECLASSES */
2732 #endif /* !ZALLOC_TEST */
2733 #pragma mark zalloc helpers
2734 #if !ZALLOC_TEST
2735
__pure2
static inline uint16_t
zc_mag_size(void)
{
	/* capacity, in elements, of a zone cache magazine */
	return zc_magazine_size;
}
2742
/*
 * Slow path taken when the zone lock try-lock failed: takes the lock
 * for real, records the contention, and may let the per-CPU depot
 * grow so future allocations avoid the lock.
 */
__attribute__((noinline, cold))
static void
zone_lock_was_contended(zone_t zone, zone_cache_t zc)
{
	lck_ticket_lock_nopreempt(&zone->z_lock, &zone_locks_grp);

	/*
	 * If zone caching has been disabled due to memory pressure,
	 * then recording contention is not useful, give the system
	 * time to recover.
	 */
	if (__improbable(zone_caching_disabled)) {
		return;
	}

	zone->z_contention_cur++;

	/* no cache to tune, or the depot already hit its hard cap */
	if (zc == NULL || zc->zc_depot_max >= INT16_MAX) {
		return;
	}

	/*
	 * Let the depot grow based on how bad the contention is,
	 * and how populated the zone is.
	 */
	if (zone->z_contention_wma < 2 * Z_CONTENTION_WMA_UNIT) {
		if (zc->zc_depot_max * zpercpu_count() * 20u >=
		    zone->z_elems_avail) {
			return;
		}
	}
	if (zone->z_contention_wma < 4 * Z_CONTENTION_WMA_UNIT) {
		if (zc->zc_depot_max * zpercpu_count() * 10u >=
		    zone->z_elems_avail) {
			return;
		}
	}
	if (!zc_grow_threshold || zone->z_contention_wma <
	    zc_grow_threshold * Z_CONTENTION_WMA_UNIT) {
		return;
	}

	zc->zc_depot_max++;
}
2787
/* Takes the zone lock (preemption already disabled), recording contention. */
static inline void
zone_lock_nopreempt_check_contention(zone_t zone, zone_cache_t zc)
{
	/* fast path: uncontended try-lock */
	if (lck_ticket_lock_try_nopreempt(&zone->z_lock, &zone_locks_grp)) {
		return;
	}

	/* slow path: lock for real and account for the contention */
	zone_lock_was_contended(zone, zc);
}

/* Disables preemption, then takes the zone lock with contention tracking. */
static inline void
zone_lock_check_contention(zone_t zone, zone_cache_t zc)
{
	disable_preemption();
	zone_lock_nopreempt_check_contention(zone, zc);
}
2804
/* Drops the zone lock without re-enabling preemption. */
static inline void
zone_unlock_nopreempt(zone_t zone)
{
	lck_ticket_unlock_nopreempt(&zone->z_lock);
}

/* Takes the per-CPU cache depot bit lock (preemption already off). */
static inline void
zone_depot_lock_nopreempt(zone_cache_t zc)
{
	hw_lock_bit_nopreempt(&zc->zc_depot_lock, 0, &zone_locks_grp);
}

/* Drops the depot bit lock (preemption stays off). */
static inline void
zone_depot_unlock_nopreempt(zone_cache_t zc)
{
	hw_unlock_bit_nopreempt(&zc->zc_depot_lock, 0);
}

/* Takes the per-CPU cache depot bit lock. */
static inline void
zone_depot_lock(zone_cache_t zc)
{
	hw_lock_bit(&zc->zc_depot_lock, 0, &zone_locks_grp);
}

/* Drops the per-CPU cache depot bit lock. */
static inline void
zone_depot_unlock(zone_cache_t zc)
{
	hw_unlock_bit(&zc->zc_depot_lock, 0);
}
2834
/* Returns the zone's name. */
const char *
zone_name(zone_t z)
{
	return z->z_name;
}

/* Returns the kalloc heap name for the zone, or "invalid". */
const char *
zone_heap_name(zone_t z)
{
	zone_security_flags_t zsflags = zone_security_config(z);
	if (__probable(zsflags.z_kheap_id < KHEAP_ID_COUNT)) {
		return kalloc_heap_names[zsflags.z_kheap_id];
	}
	return "invalid";
}
2850
/*
 * Returns the number of pages needed for the zone to hold at least
 * @c max_elems elements, rounded up to whole chunks (capped at
 * UINT32_MAX).
 */
static uint32_t
zone_alloc_pages_for_nelems(zone_t z, vm_size_t max_elems)
{
	vm_size_t elem_count, chunks;

	/* elements per chunk; per-CPU zones count a single page's worth */
	elem_count = ptoa(z->z_percpu ? 1 : z->z_chunk_pages) /
	    zone_elem_size_safe(z);
	chunks = (max_elems + elem_count - 1) / elem_count;

	return (uint32_t)MIN(UINT32_MAX, chunks * z->z_chunk_pages);
}
2862
2863 static inline vm_size_t
zone_submaps_approx_size(void)2864 zone_submaps_approx_size(void)
2865 {
2866 vm_size_t size = 0;
2867
2868 for (unsigned idx = 0; idx < Z_SUBMAP_IDX_COUNT; idx++) {
2869 if (zone_submaps[idx] != VM_MAP_NULL) {
2870 size += zone_submaps[idx]->size;
2871 }
2872 }
2873
2874 return size;
2875 }
2876
2877 static void
zone_cache_swap_magazines(zone_cache_t cache)2878 zone_cache_swap_magazines(zone_cache_t cache)
2879 {
2880 uint16_t count_a = cache->zc_alloc_cur;
2881 uint16_t count_f = cache->zc_free_cur;
2882 zone_element_t *elems_a = cache->zc_alloc_elems;
2883 zone_element_t *elems_f = cache->zc_free_elems;
2884
2885 z_debug_assert(count_a <= zc_mag_size());
2886 z_debug_assert(count_f <= zc_mag_size());
2887
2888 cache->zc_alloc_cur = count_f;
2889 cache->zc_free_cur = count_a;
2890 cache->zc_alloc_elems = elems_f;
2891 cache->zc_free_elems = elems_a;
2892 }
2893
2894 /*!
2895 * @function zone_magazine_load
2896 *
2897 * @brief
2898 * Cache the value of @c zm_cur on the cache to avoid a dependent load
2899 * on the allocation fastpath.
2900 */
static void
zone_magazine_load(uint16_t *count, zone_element_t **elems, zone_magazine_t mag)
{
	/* snapshot the magazine's cursor and element array into the cache */
	z_debug_assert(mag->zm_cur <= zc_mag_size());
	*count = mag->zm_cur;
	*elems = mag->zm_elems;
}
2908
2909 /*!
2910 * @function zone_magazine_replace
2911 *
2912 * @brief
 * Unload a magazine and load a new one instead.
2914 */
static zone_magazine_t
zone_magazine_replace(uint16_t *count, zone_element_t **elems,
    zone_magazine_t mag)
{
	zone_magazine_t old;

	/* recover the loaded magazine from its zm_elems array pointer */
	old = (zone_magazine_t)((uintptr_t)*elems -
	    offsetof(struct zone_magazine, zm_elems));
	/* write the cached cursor back before handing the magazine out */
	old->zm_cur = *count;
	z_debug_assert(old->zm_cur <= zc_mag_size());
	zone_magazine_load(count, elems, mag);

	return old;
}
2929
/* Allocates a zeroed magazine from the magazine zone. */
static zone_magazine_t
zone_magazine_alloc(zalloc_flags_t flags)
{
	return zalloc_flags(zc_magazine_zone, flags | Z_ZERO);
}

/* Returns a magazine to the magazine zone. */
static void
zone_magazine_free(zone_magazine_t mag)
{
	/* parenthesized call suppresses any function-like macro expansion */
	(zfree)(zc_magazine_zone, mag);
}
2941
/*
 * Frees every magazine on @c mags and resets the list to empty.
 */
static void
zone_magazine_free_list(struct zone_depot *mags)
{
	zone_magazine_t mag, tmp;

	STAILQ_FOREACH_SAFE(mag, mags, zm_link, tmp) {
		zone_magazine_free(mag);
	}

	STAILQ_INIT(mags);
}
2953
/*
 * Allocates and publishes the per-CPU caches for @c zone, loading a
 * fresh alloc and free magazine on each CPU.
 */
static void
zone_enable_caching(zone_t zone)
{
	zone_cache_t caches;

	caches = zalloc_percpu_permanent_type(struct zone_cache);
	zpercpu_foreach(zc, caches) {
		zone_magazine_load(&zc->zc_alloc_cur, &zc->zc_alloc_elems,
		    zone_magazine_alloc(Z_WAITOK | Z_NOFAIL));
		zone_magazine_load(&zc->zc_free_cur, &zc->zc_free_elems,
		    zone_magazine_alloc(Z_WAITOK | Z_NOFAIL));
		STAILQ_INIT(&zc->zc_depot);
	}

	/* release ordering publishes the initialized caches atomically */
	if (os_atomic_xchg(&zone->z_pcpu_cache, caches, release)) {
		panic("allocating caches for zone %s twice", zone->z_name);
	}
}
2972
/* Returns whether [addr, addr + size) lies within the zone maps. */
bool
zone_maps_owned(vm_address_t addr, vm_size_t size)
{
	return from_zone_map(addr, size);
}
2978
/*
 * Aggregates size / free / largest-free statistics over all zone
 * submaps.
 */
void
zone_map_sizes(
	vm_map_size_t *psize,
	vm_map_size_t *pfree,
	vm_map_size_t *plargest_free)
{
	vm_map_size_t size, free, largest;

	/* the first submap seeds the out-parameters directly */
	vm_map_sizes(zone_submaps[0], psize, pfree, plargest_free);

	for (uint32_t i = 1; i < Z_SUBMAP_IDX_COUNT; i++) {
		vm_map_sizes(zone_submaps[i], &size, &free, &largest);
		*psize += size;
		*pfree += free;
		*plargest_free = MAX(*plargest_free, largest);
	}
}
2996
2997 __attribute__((always_inline))
2998 vm_map_t
zone_submap(zone_security_flags_t zsflags)2999 zone_submap(zone_security_flags_t zsflags)
3000 {
3001 return zone_submaps[zsflags.z_submap_idx];
3002 }
3003
3004 unsigned
zpercpu_count(void)3005 zpercpu_count(void)
3006 {
3007 return zpercpu_early_count;
3008 }
3009
3010 #if ZSECURITY_CONFIG(SAD_FENG_SHUI) || CONFIG_PROB_GZALLOC
3011 /*
3012 * Returns a random number of a given bit-width.
3013 *
3014 * DO NOT COPY THIS CODE OUTSIDE OF ZALLOC
3015 *
3016 * This uses Intel's rdrand because random() uses FP registers
3017 * which causes FP faults and allocations which isn't something
3018 * we can do from zalloc itself due to reentrancy problems.
3019 *
3020 * For pre-rdrand machines (which we no longer support),
3021 * we use a bad biased random generator that doesn't use FP.
3022 * Such HW is no longer supported, but VM of newer OSes on older
3023 * bare metal is made to limp along (with reduced security) this way.
3024 */
3025 static uint64_t
zalloc_random_mask64(uint32_t bits)3026 zalloc_random_mask64(uint32_t bits)
3027 {
3028 uint64_t mask = ~0ull >> (64 - bits);
3029 uint64_t v;
3030
3031 #if __x86_64__
3032 if (__probable(cpuid_features() & CPUID_FEATURE_RDRAND)) {
3033 asm volatile ("1: rdrand %0; jnc 1b\n" : "=r" (v) :: "cc");
3034 v &= mask;
3035 } else {
3036 disable_preemption();
3037 int cpu = cpu_number();
3038 v = random_bool_gen_bits(&zone_bool_gen[cpu].zbg_bg,
3039 zone_bool_gen[cpu].zbg_entropy,
3040 ZONE_ENTROPY_CNT, bits);
3041 enable_preemption();
3042 }
3043 #else
3044 v = early_random() & mask;
3045 #endif
3046
3047 return v;
3048 }
3049
3050 /*
3051 * Returns a random number within [bound_min, bound_max)
3052 *
3053 * This isn't _exactly_ uniform, but the skew is small enough
3054 * not to matter for the consumers of this interface.
3055 *
3056 * Values within [bound_min, 2^64 % (bound_max - bound_min))
3057 * will be returned (bound_max - bound_min) / 2^64 more often
3058 * than values within [2^64 % (bound_max - bound_min), bound_max).
3059 */
3060 static uint32_t
zalloc_random_uniform32(uint32_t bound_min,uint32_t bound_max)3061 zalloc_random_uniform32(uint32_t bound_min, uint32_t bound_max)
3062 {
3063 uint64_t delta = bound_max - bound_min;
3064
3065 return bound_min + (uint32_t)(zalloc_random_mask64(64) % delta);
3066 }
3067
3068 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) || CONFIG_PROB_GZALLOC */
3069 #if ZONE_ENABLE_LOGGING || CONFIG_PROB_GZALLOC
3070 /*
3071 * Track all kalloc zones of specified size for zlog name
3072 * kalloc.type.<size> or kalloc.type.var.<size> or kalloc.<size>
3073 */
3074 static bool
track_kalloc_zones(zone_t z,const char * logname)3075 track_kalloc_zones(zone_t z, const char *logname)
3076 {
3077 const char *prefix;
3078 size_t len;
3079 zone_security_flags_t zsflags = zone_security_config(z);
3080
3081 prefix = "kalloc.type.var.";
3082 len = strlen(prefix);
3083 if (zsflags.z_kalloc_type && zsflags.z_kheap_id == KHEAP_ID_KT_VAR &&
3084 strncmp(logname, prefix, len) == 0) {
3085 vm_size_t sizeclass = strtoul(logname + len, NULL, 0);
3086
3087 return zone_elem_size(z) == sizeclass;
3088 }
3089
3090 prefix = "kalloc.type.";
3091 len = strlen(prefix);
3092 if (zsflags.z_kalloc_type && zsflags.z_kheap_id != KHEAP_ID_KT_VAR &&
3093 strncmp(logname, prefix, len) == 0) {
3094 vm_size_t sizeclass = strtoul(logname + len, NULL, 0);
3095
3096 return zone_elem_size(z) == sizeclass;
3097 }
3098
3099 prefix = "kalloc.";
3100 len = strlen(prefix);
3101 if ((zsflags.z_kheap_id || zsflags.z_kalloc_type) &&
3102 strncmp(logname, prefix, len) == 0) {
3103 vm_size_t sizeclass = strtoul(logname + len, NULL, 0);
3104
3105 return zone_elem_size(z) == sizeclass;
3106 }
3107
3108 return false;
3109 }
3110 #endif
3111
3112 int
track_this_zone(const char * zonename,const char * logname)3113 track_this_zone(const char *zonename, const char *logname)
3114 {
3115 unsigned int len;
3116 const char *zc = zonename;
3117 const char *lc = logname;
3118
3119 /*
3120 * Compare the strings. We bound the compare by MAX_ZONE_NAME.
3121 */
3122
3123 for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
3124 /*
3125 * If the current characters don't match, check for a space in
3126 * in the zone name and a corresponding period in the log name.
3127 * If that's not there, then the strings don't match.
3128 */
3129
3130 if (*zc != *lc && !(*zc == ' ' && *lc == '.')) {
3131 break;
3132 }
3133
3134 /*
3135 * The strings are equal so far. If we're at the end, then it's a match.
3136 */
3137
3138 if (*zc == '\0') {
3139 return TRUE;
3140 }
3141 }
3142
3143 return FALSE;
3144 }
3145
3146 #if DEBUG || DEVELOPMENT
3147
3148 vm_size_t
zone_element_info(void * addr,vm_tag_t * ptag)3149 zone_element_info(void *addr, vm_tag_t * ptag)
3150 {
3151 vm_size_t size = 0;
3152 vm_tag_t tag = VM_KERN_MEMORY_NONE;
3153 struct zone *src_zone;
3154
3155 if (from_zone_map(addr, sizeof(void *))) {
3156 src_zone = &zone_array[zone_index_from_ptr(addr)];
3157 #if VM_TAG_SIZECLASSES
3158 if (__improbable(src_zone->z_uses_tags)) {
3159 tag = *ztSlot(src_zone, (vm_offset_t)addr) >> 1;
3160 }
3161 #endif /* VM_TAG_SIZECLASSES */
3162 size = zone_elem_size_safe(src_zone);
3163 } else {
3164 #if CONFIG_GZALLOC
3165 gzalloc_element_size(addr, NULL, &size);
3166 #endif /* CONFIG_GZALLOC */
3167 }
3168 *ptag = tag;
3169 return size;
3170 }
3171
3172 #endif /* DEBUG || DEVELOPMENT */
3173 #endif /* !ZALLOC_TEST */
3174
3175 #pragma mark Zone zeroing and early random
3176 #if !ZALLOC_TEST
3177
3178 /*
3179 * Zone zeroing
3180 *
 * All allocations from zones are zeroed on free and are additionally
 * checked to still be zero on alloc. The check is
 * always on, on embedded devices. A perf regression was detected
 * on Intel as we can't use the vectorized implementation of
 * memcmp_zero_ptr_aligned due to cyclic dependencies between
 * initialization and allocation. Therefore we perform the check
 * on 20% of the allocations.
3188 */
3189 #if ZALLOC_ENABLE_ZERO_CHECK
3190 #if defined(__x86_64__) || defined(__arm__)
3191 /*
 * Perform zero validation on every 5th allocation
3193 */
3194 static TUNABLE(uint32_t, zzc_rate, "zzc_rate", 5);
3195 static uint32_t PERCPU_DATA(zzc_decrementer);
3196 #endif /* defined(__x86_64__) || defined(__arm__) */
3197
3198 /*
3199 * Determine if zero validation for allocation should be skipped
3200 */
3201 static bool
zalloc_skip_zero_check(void)3202 zalloc_skip_zero_check(void)
3203 {
3204 #if defined(__x86_64__) || defined(__arm__)
3205 uint32_t *counterp, cnt;
3206
3207 counterp = PERCPU_GET(zzc_decrementer);
3208 cnt = *counterp;
3209 if (__probable(cnt > 0)) {
3210 *counterp = cnt - 1;
3211 return true;
3212 }
3213 *counterp = zzc_rate - 1;
3214 #endif /* !(defined(__x86_64__) || defined(__arm__)) */
3215 return false;
3216 }
3217
3218 __abortlike
3219 static void
zalloc_uaf_panic(zone_t z,uintptr_t elem,size_t size)3220 zalloc_uaf_panic(zone_t z, uintptr_t elem, size_t size)
3221 {
3222 uint32_t esize = (uint32_t)zone_elem_size(z);
3223 uint32_t first_offs = ~0u;
3224 uintptr_t first_bits = 0, v;
3225 char buf[1024];
3226 int pos = 0;
3227
3228 #if __LP64__
3229 #define ZPF "0x%016lx"
3230 #else
3231 #define ZPF "0x%08lx"
3232 #endif
3233
3234 buf[0] = '\0';
3235
3236 for (uint32_t o = 0; o < size; o += sizeof(v)) {
3237 if ((v = *(uintptr_t *)(elem + o)) == 0) {
3238 continue;
3239 }
3240 pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n"
3241 "%5d: "ZPF, o, v);
3242 if (first_offs > o) {
3243 first_offs = o;
3244 first_bits = v;
3245 }
3246 }
3247
3248 (panic)("[%s%s]: element modified after free "
3249 "(off:%d, val:"ZPF", sz:%d, ptr:%p)%s",
3250 zone_heap_name(z), zone_name(z),
3251 first_offs, first_bits, esize, (void *)elem, buf);
3252
3253 #undef ZPF
3254 }
3255
3256 static void
zalloc_validate_element(zone_t zone,vm_offset_t elem,vm_size_t size,zalloc_flags_t flags)3257 zalloc_validate_element(zone_t zone, vm_offset_t elem, vm_size_t size,
3258 zalloc_flags_t flags)
3259 {
3260 #if CONFIG_GZALLOC
3261 if (zone->z_gzalloc_tracked) {
3262 return;
3263 }
3264 #endif /* CONFIG_GZALLOC */
3265
3266 if (flags & Z_NOZZC) {
3267 return;
3268 }
3269 if (memcmp_zero_ptr_aligned((void *)elem, size)) {
3270 zalloc_uaf_panic(zone, elem, size);
3271 }
3272 if (flags & Z_PCPU) {
3273 for (size_t i = zpercpu_count(); --i > 0;) {
3274 elem += PAGE_SIZE;
3275 if (memcmp_zero_ptr_aligned((void *)elem, size)) {
3276 zalloc_uaf_panic(zone, elem, size);
3277 }
3278 }
3279 }
3280 }
3281
3282 #endif /* ZALLOC_ENABLE_ZERO_CHECK */
3283
3284 static void
zone_early_scramble_rr(zone_t zone,zone_stats_t zstats)3285 zone_early_scramble_rr(zone_t zone, zone_stats_t zstats)
3286 {
3287 int cpu = cpu_number();
3288 zone_stats_t zs = zpercpu_get_cpu(zstats, cpu);
3289 uint32_t bits;
3290
3291 bits = random_bool_gen_bits(&zone_bool_gen[cpu].zbg_bg,
3292 zone_bool_gen[cpu].zbg_entropy, ZONE_ENTROPY_CNT, 8);
3293
3294 zs->zs_alloc_rr += bits;
3295 zs->zs_alloc_rr %= zone->z_chunk_elems;
3296 }
3297
3298 #endif /* !ZALLOC_TEST */
3299 #pragma mark Zone Leak Detection
3300 #if !ZALLOC_TEST
3301 #if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
3302
3303 /*
3304 * Zone leak debugging code
3305 *
3306 * When enabled, this code keeps a log to track allocations to a particular
3307 * zone that have not yet been freed.
3308 *
3309 * Examining this log will reveal the source of a zone leak.
3310 *
3311 * The log is allocated only when logging is enabled (it is off by default),
3312 * so there is no effect on the system when it's turned off.
3313 *
3314 * Zone logging is enabled with the `zlog<n>=<zone>` boot-arg for each
3315 * zone name to log, with n starting at 1.
3316 *
3317 * Leaks debugging utilizes 2 tunables:
3318 * - zlsize (in kB) which describes how much "size" the record covers
3319 * (zones with smaller elements get more records, default is 4M).
3320 *
3321 * - zlfreq (in kB) which describes a sample rate in cumulative allocation
3322 * size at which automatic leak detection will sample allocations.
3323 * (default is 16k)
3324 *
3325 *
3326 * Zone corruption logging
3327 *
3328 * Logging can also be used to help identify the source of a zone corruption.
3329 *
3330 * First, identify the zone that is being corrupted,
3331 * then add "-zc zlog<n>=<zone name>" to the boot-args.
3332 *
3333 * When -zc is used in conjunction with zlog,
3334 * it changes the logging style to track both allocations and frees to the zone.
3335 *
3336 * When the corruption is detected, examining the log will show you the stack
3337 * traces of the callers who last allocated and freed any particular element in
3338 * the zone.
3339 *
3340 * Corruption debugging logs will have zrecs records
3341 * (tuned by the zrecs= boot-arg, 16k elements per G of RAM by default).
3342 */
3343
3344 #define ZRECORDS_MAX (256u << 10)
3345 #define ZRECORDS_DEFAULT (16u << 10)
3346 static TUNABLE(uint32_t, zrecs, "zrecs", 0);
3347 static TUNABLE(uint32_t, zlsize, "zlsize", 4 * 1024);
3348 static TUNABLE(uint32_t, zlfreq, "zlfreq", 16);
3349
3350 __startup_func
3351 static void
zone_leaks_init_zrecs(void)3352 zone_leaks_init_zrecs(void)
3353 {
3354 /*
3355 * Don't allow more than ZRECORDS_MAX records,
3356 * even if the user asked for more.
3357 *
3358 * This prevents accidentally hogging too much kernel memory
3359 * and making the system unusable.
3360 */
3361 if (zrecs == 0) {
3362 zrecs = ZRECORDS_DEFAULT *
3363 (uint32_t)((max_mem + (1ul << 30)) >> 30);
3364 }
3365 if (zrecs > ZRECORDS_MAX) {
3366 zrecs = ZRECORDS_MAX;
3367 }
3368 }
3369 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_leaks_init_zrecs);
3370
3371 static uint32_t
zone_leaks_record_count(zone_t z)3372 zone_leaks_record_count(zone_t z)
3373 {
3374 uint32_t recs = (zlsize << 10) / zone_elem_size(z);
3375
3376 return MIN(MAX(recs, ZRECORDS_DEFAULT), ZRECORDS_MAX);
3377 }
3378
3379 static uint32_t
zone_leaks_sample_rate(zone_t z)3380 zone_leaks_sample_rate(zone_t z)
3381 {
3382 return (zlfreq << 10) / zone_elem_size(z);
3383 }
3384
3385 #if ZONE_ENABLE_LOGGING
3386 /* Log allocations and frees to help debug a zone element corruption */
3387 static TUNABLE(bool, corruption_debug_flag, "-zc", false);
3388
3389 /*
3390 * A maximum of 10 zlog<n> boot args can be provided (zlog1 -> zlog10)
3391 */
3392 #define MAX_ZONES_LOG_REQUESTS 10
3393 /*
 * As all kalloc type zones of a specified size are logged, by providing
3395 * a single zlog boot-arg, the maximum number of zones that can be logged
3396 * is higher than MAX_ZONES_LOG_REQUESTS
3397 */
3398 #define MAX_ZONES_LOGGED 20
3399
3400 static int num_zones_logged = 0;
3401
3402 /**
3403 * @function zone_setup_logging
3404 *
3405 * @abstract
3406 * Optionally sets up a zone for logging.
3407 *
3408 * @discussion
3409 * We recognized two boot-args:
3410 *
3411 * zlog=<zone_to_log>
3412 * zrecs=<num_records_in_log>
3413 * zlsize=<memory to cover for leaks>
3414 *
3415 * The zlog arg is used to specify the zone name that should be logged,
3416 * and zrecs/zlsize is used to control the size of the log.
3417 */
3418 static void
zone_setup_logging(zone_t z)3419 zone_setup_logging(zone_t z)
3420 {
3421 char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */
3422 char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
3423 char zlog_val[MAX_ZONE_NAME]; /* the zone name we're logging, if any */
3424 bool logging_on = false;
3425
3426 if (num_zones_logged >= MAX_ZONES_LOGGED) {
3427 return;
3428 }
3429
3430 /*
3431 * Append kalloc heap name to zone name (if zone is used by kalloc)
3432 */
3433 snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
3434
3435 /* zlog0 isn't allowed. */
3436 for (int i = 1; i <= MAX_ZONES_LOG_REQUESTS; i++) {
3437 snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
3438
3439 if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val))) {
3440 if (track_this_zone(zone_name, zlog_val) ||
3441 track_kalloc_zones(z, zlog_val)) {
3442 logging_on = true;
3443 break;
3444 }
3445 }
3446 }
3447
3448 /*
3449 * Backwards compat. with the old boot-arg used to specify single zone
3450 * logging i.e. zlog Needs to happen after the newer zlogn checks
3451 * because the prefix will match all the zlogn
3452 * boot-args.
3453 */
3454 if (!logging_on &&
3455 PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val))) {
3456 if (track_this_zone(zone_name, zlog_val) ||
3457 track_kalloc_zones(z, zlog_val)) {
3458 logging_on = true;
3459 }
3460 }
3461
3462 /*
3463 * If we want to log a zone, see if we need to allocate buffer space for
3464 * the log.
3465 *
3466 * Some vm related zones are zinit'ed before we can do a kmem_alloc, so
3467 * we have to defer allocation in that case.
3468 *
3469 * zone_init() will finish the job.
3470 *
3471 * If we want to log one of the VM related zones that's set up early on,
3472 * we will skip allocation of the log until zinit is called again later
3473 * on some other zone.
3474 */
3475 if (logging_on) {
3476 if (os_atomic_inc(&num_zones_logged, relaxed) >
3477 MAX_ZONES_LOGGED) {
3478 os_atomic_dec(&num_zones_logged, relaxed);
3479 return;
3480 }
3481
3482 if (corruption_debug_flag) {
3483 z->z_btlog = btlog_create(BTLOG_LOG, zrecs, 0);
3484 } else {
3485 z->z_btlog = btlog_create(BTLOG_HASH,
3486 zone_leaks_record_count(z), 0);
3487 }
3488 if (z->z_btlog) {
3489 z->z_log_on = true;
3490 printf("zone[%s%s]: logging enabled\n",
3491 zone_heap_name(z), z->z_name);
3492 } else {
3493 printf("zone[%s%s]: failed to enable logging\n",
3494 zone_heap_name(z), z->z_name);
3495 }
3496 }
3497 }
3498
3499 #endif /* ZONE_ENABLE_LOGGING */
3500 #if CONFIG_ZLEAKS
3501
3502 static thread_call_data_t zone_leaks_callout;
3503
3504 /*
3505 * The zone leak detector, abbreviated 'zleak', keeps track
3506 * of a subset of the currently outstanding allocations
3507 * made by the zone allocator.
3508 *
3509 * It will engage itself automatically if the zone map usage
3510 * goes above zleak_pages_global_wired_threshold pages.
3511 *
3512 * When that threshold is reached, zones who use more than
3513 * zleak_pages_per_zone_wired_threshold pages will get
3514 * a BTLOG_HASH btlog with sampling to minimize perf impact,
3515 * yet receive statistical data about the backtrace that is
3516 * the most likely to cause the leak.
3517 *
3518 * If the zone goes under the threshold enough, then the log
3519 * is disabled and backtraces freed. Data can be collected
3520 * from userspace with the zlog(1) command.
3521 */
3522
3523 /* whether the zleaks subsystem thinks the map is under pressure */
3524 uint32_t zleak_active;
3525 SECURITY_READ_ONLY_LATE(vm_size_t) zleak_max_zonemap_size;
3526
3527 /* Size of zone map at which to start collecting data */
3528 static size_t zleak_pages_global_wired_threshold = ~0;
3529 vm_size_t zleak_global_tracking_threshold = ~0;
3530
3531 /* Size a zone will have before we will collect data on it */
3532 static size_t zleak_pages_per_zone_wired_threshold = ~0;
3533 vm_size_t zleak_per_zone_tracking_threshold = ~0;
3534
3535 static inline bool
zleak_should_enable_for_zone(zone_t z)3536 zleak_should_enable_for_zone(zone_t z)
3537 {
3538 if (z->z_log_on) {
3539 return false;
3540 }
3541 if (z->z_btlog) {
3542 return false;
3543 }
3544 if (!zleak_active) {
3545 return false;
3546 }
3547 return z->z_wired_cur >= zleak_pages_per_zone_wired_threshold;
3548 }
3549
3550 static inline bool
zleak_should_disable_for_zone(zone_t z)3551 zleak_should_disable_for_zone(zone_t z)
3552 {
3553 if (z->z_log_on) {
3554 return false;
3555 }
3556 if (!z->z_btlog) {
3557 return false;
3558 }
3559 if (!zleak_active) {
3560 return true;
3561 }
3562 return z->z_wired_cur < zleak_pages_per_zone_wired_threshold / 2;
3563 }
3564
3565 static inline bool
zleak_should_activate(size_t pages)3566 zleak_should_activate(size_t pages)
3567 {
3568 return !zleak_active && pages >= zleak_pages_global_wired_threshold;
3569 }
3570
3571 static inline bool
zleak_should_deactivate(size_t pages)3572 zleak_should_deactivate(size_t pages)
3573 {
3574 return zleak_active && pages < zleak_pages_global_wired_threshold / 2;
3575 }
3576
3577 static void
zleaks_enable_async(__unused thread_call_param_t p0,__unused thread_call_param_t p1)3578 zleaks_enable_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
3579 {
3580 size_t pages = os_atomic_load(&zone_pages_wired, relaxed);
3581 btlog_t log;
3582
3583 if (zleak_should_activate(pages)) {
3584 zleak_active = 1;
3585 } else if (zleak_should_deactivate(pages)) {
3586 zleak_active = 0;
3587 }
3588
3589 zone_foreach(z) {
3590 if (zleak_should_disable_for_zone(z)) {
3591 log = z->z_btlog;
3592 z->z_btlog = NULL;
3593 assert(z->z_btlog_disabled == NULL);
3594 btlog_disable(log);
3595 z->z_btlog_disabled = log;
3596 }
3597
3598 if (zleak_should_enable_for_zone(z)) {
3599 log = z->z_btlog_disabled;
3600 if (log == NULL) {
3601 log = btlog_create(BTLOG_HASH,
3602 zone_leaks_record_count(z),
3603 zone_leaks_sample_rate(z));
3604 } else if (btlog_enable(log) == KERN_SUCCESS) {
3605 z->z_btlog_disabled = NULL;
3606 } else {
3607 log = NULL;
3608 }
3609 os_atomic_store(&z->z_btlog, log, release);
3610 }
3611 }
3612 }
3613
3614 __startup_func
3615 static void
zleak_init(void)3616 zleak_init(void)
3617 {
3618 zleak_max_zonemap_size = ptoa(zone_pages_wired_max);
3619
3620 zleak_update_threshold(&zleak_global_tracking_threshold,
3621 zleak_max_zonemap_size / 2);
3622 zleak_update_threshold(&zleak_per_zone_tracking_threshold,
3623 zleak_global_tracking_threshold / 8);
3624
3625 thread_call_setup_with_options(&zone_leaks_callout,
3626 zleaks_enable_async, NULL, THREAD_CALL_PRIORITY_USER,
3627 THREAD_CALL_OPTIONS_ONCE);
3628 }
3629 STARTUP(ZALLOC, STARTUP_RANK_SECOND, zleak_init);
3630
3631 kern_return_t
zleak_update_threshold(vm_size_t * arg,uint64_t value)3632 zleak_update_threshold(vm_size_t *arg, uint64_t value)
3633 {
3634 if (value >= zleak_max_zonemap_size) {
3635 return KERN_INVALID_VALUE;
3636 }
3637
3638 if (arg == &zleak_global_tracking_threshold) {
3639 zleak_global_tracking_threshold = (vm_size_t)value;
3640 zleak_pages_global_wired_threshold = atop(value);
3641 if (startup_phase >= STARTUP_SUB_THREAD_CALL) {
3642 thread_call_enter(&zone_leaks_callout);
3643 }
3644 return KERN_SUCCESS;
3645 }
3646
3647 if (arg == &zleak_per_zone_tracking_threshold) {
3648 zleak_per_zone_tracking_threshold = (vm_size_t)value;
3649 zleak_pages_per_zone_wired_threshold = atop(value);
3650 if (startup_phase >= STARTUP_SUB_THREAD_CALL) {
3651 thread_call_enter(&zone_leaks_callout);
3652 }
3653 return KERN_SUCCESS;
3654 }
3655
3656 return KERN_INVALID_ARGUMENT;
3657 }
3658
3659 static void
panic_display_zleaks(bool has_syms)3660 panic_display_zleaks(bool has_syms)
3661 {
3662 bool did_header = false;
3663 vm_address_t bt[BTLOG_MAX_DEPTH];
3664 uint32_t len, count;
3665
3666 zone_foreach(z) {
3667 btlog_t log = z->z_btlog;
3668
3669 if (log == NULL || btlog_get_type(log) != BTLOG_HASH) {
3670 continue;
3671 }
3672
3673 count = btlog_guess_top(log, bt, &len);
3674 if (count == 0) {
3675 continue;
3676 }
3677
3678 if (!did_header) {
3679 paniclog_append_noflush("Zone (suspected) leak report:\n");
3680 did_header = true;
3681 }
3682
3683 paniclog_append_noflush(" Zone: %s%s\n",
3684 zone_heap_name(z), zone_name(z));
3685 paniclog_append_noflush(" Count: %d (%ld bytes)\n", count,
3686 (long)count * zone_scale_for_percpu(z, zone_elem_size(z)));
3687 paniclog_append_noflush(" Size: %ld\n",
3688 (long)zone_size_wired(z));
3689 paniclog_append_noflush(" Top backtrace:\n");
3690 for (uint32_t i = 0; i < len; i++) {
3691 if (has_syms) {
3692 paniclog_append_noflush(" %p ", (void *)bt[i]);
3693 panic_print_symbol_name(bt[i]);
3694 paniclog_append_noflush("\n");
3695 } else {
3696 paniclog_append_noflush(" %p\n", (void *)bt[i]);
3697 }
3698 }
3699
3700 kmod_panic_dump(bt, len);
3701 paniclog_append_noflush("\n");
3702 }
3703 }
3704 #endif /* CONFIG_ZLEAKS */
3705
3706 static void
zalloc_log(btlog_t log,vm_offset_t addr,zalloc_flags_t flags,void * fp)3707 zalloc_log(btlog_t log, vm_offset_t addr, zalloc_flags_t flags, void *fp)
3708 {
3709 btref_t ref;
3710
3711 if (btlog_sample(log)) {
3712 ref = btref_get(fp, (flags & Z_NOWAIT) ? BTREF_GET_NOWAIT : 0);
3713 btlog_record(log, (void *)addr, ZOP_ALLOC, ref);
3714 }
3715 }
3716
3717 static void
zfree_log(btlog_t log,vm_offset_t addr,void * fp)3718 zfree_log(btlog_t log, vm_offset_t addr, void *fp)
3719 {
3720 /*
3721 * See if we're doing logging on this zone.
3722 *
3723 * There are two styles of logging used depending on
3724 * whether we're trying to catch a leak or corruption.
3725 */
3726 if (btlog_get_type(log) == BTLOG_LOG) {
3727 /*
3728 * We're logging to catch a corruption.
3729 *
3730 * Add a record of this zfree operation to log.
3731 */
3732 btlog_record(log, (void *)addr, ZOP_FREE,
3733 btref_get(fp, BTREF_GET_NOWAIT));
3734 } else {
3735 /*
3736 * We're logging to catch a leak.
3737 *
3738 * Remove any record we might have for this element
3739 * since it's being freed. Note that we may not find it
3740 * if the buffer overflowed and that's OK.
3741 *
3742 * Since the log is of a limited size, old records get
3743 * overwritten if there are more zallocs than zfrees.
3744 */
3745 btlog_erase(log, (void *)addr);
3746 }
3747 }
3748
3749 #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
3750 #endif /* !ZALLOC_TEST */
3751 #pragma mark zone (re)fill
3752 #if !ZALLOC_TEST
3753
3754 /*!
3755 * @defgroup Zone Refill
3756 * @{
3757 *
3758 * @brief
3759 * Functions handling The zone refill machinery.
3760 *
3761 * @discussion
3762 * Zones are refilled based on 2 mechanisms: direct expansion, async expansion.
3763 *
3764 * @c zalloc_ext() is the codepath that kicks the zone refill when the zone is
3765 * dropping below half of its @c z_elems_rsv (0 for most zones) and will:
3766 *
3767 * - call @c zone_expand_locked() directly if the caller is allowed to block,
3768 *
 * - wakeup the asynchronous expansion thread call if the caller is not allowed
3770 * to block, or if the reserve becomes depleted.
3771 *
3772 *
3773 * <h2>Synchronous expansion</h2>
3774 *
3775 * This mechanism is actually the only one that may refill a zone, and all the
3776 * other ones funnel through this one eventually.
3777 *
3778 * @c zone_expand_locked() implements the core of the expansion mechanism,
3779 * and will do so while a caller specified predicate is true.
3780 *
3781 * Zone expansion allows for up to 2 threads to concurrently refill the zone:
3782 * - one VM privileged thread,
3783 * - one regular thread.
3784 *
3785 * Regular threads that refill will put down their identity in @c z_expander,
3786 * so that priority inversion avoidance can be implemented.
3787 *
3788 * However, VM privileged threads are allowed to use VM page reserves,
3789 * which allows for the system to recover from extreme memory pressure
3790 * situations, allowing for the few allocations that @c zone_gc() or
3791 * killing processes require.
3792 *
3793 * When a VM privileged thread is also expanding, the @c z_expander_vm_priv bit
3794 * is set. @c z_expander is not necessarily the identity of this VM privileged
3795 * thread (it is if the VM privileged thread came in first, but wouldn't be, and
3796 * could even be @c THREAD_NULL otherwise).
3797 *
3798 * Note that the pageout-scan daemon might be BG and is VM privileged. To avoid
3799 * spending a whole pointer on priority inheritance for VM privileged threads
3800 * (and other issues related to having two owners), we use the rwlock boost as
3801 * a stop gap to avoid priority inversions.
3802 *
3803 *
3804 * <h2>Chunk wiring policies</h2>
3805 *
3806 * Zones allocate memory in chunks of @c zone_t::z_chunk_pages pages at a time
3807 * to try to minimize fragmentation relative to element sizes not aligning with
3808 * a chunk size well. However, this can grow large and be hard to fulfill on
3809 * a system under a lot of memory pressure (chunks can be as long as 8 pages on
3810 * 4k page systems).
3811 *
3812 * This is why, when under memory pressure the system allows chunks to be
3813 * partially populated. The metadata of the first page in the chunk maintains
3814 * the count of actually populated pages.
3815 *
3816 * The metadata for addresses assigned to a zone are found of 4 queues:
3817 * - @c z_pageq_empty has chunk heads with populated pages and no allocated
3818 * elements (those can be targeted by @c zone_gc()),
3819 * - @c z_pageq_partial has chunk heads with populated pages that are partially
3820 * used,
3821 * - @c z_pageq_full has chunk heads with populated pages with no free elements
3822 * left,
3823 * - @c z_pageq_va has either chunk heads for sequestered VA space assigned to
3824 * the zone forever (if @c z_va_sequester is enabled), or the first secondary
3825 * metadata for a chunk whose corresponding page is not populated in the
3826 * chunk.
3827 *
3828 * When new pages need to be wired/populated, chunks from the @c z_pageq_va
3829 * queues are preferred.
3830 *
3831 *
3832 * <h2>Asynchronous expansion</h2>
3833 *
3834 * This mechanism allows for refilling zones used mostly with non blocking
3835 * callers. It relies on a thread call (@c zone_expand_callout) which will
3836 * iterate all zones and refill the ones marked with @c z_async_refilling.
3837 *
3838 * NOTE: If the calling thread for zalloc_noblock is lower priority than
3839 * the thread_call, then zalloc_noblock to an empty zone may succeed.
3840 *
3841 *
3842 * <h2>Dealing with zone allocations from the mach VM code</h2>
3843 *
3844 * The implementation of the mach VM itself uses the zone allocator
3845 * for things like the vm_map_entry data structure. In order to prevent
3846 * a recursion problem when adding more pages to a zone, the VM zones
3847 * use the Z_SUBMAP_IDX_VM submap which doesn't use kmem_alloc()
3848 * or any VM map functions to allocate.
3849 *
3850 * Instead, a really simple coalescing first-fit allocator is used
3851 * for this submap, and no one else than zalloc can allocate from it.
3852 *
3853 * Memory is directly populated which doesn't require allocation of
3854 * VM map entries, and avoids recursion. The cost of this scheme however,
3855 * is that `vm_map_lookup_entry` will not function on those addresses
3856 * (nor any API relying on it).
3857 */
3858
3859 static thread_call_data_t zone_expand_callout;
3860
3861 static inline kma_flags_t
zone_kma_flags(zone_t z,zone_security_flags_t zsflags,zalloc_flags_t flags)3862 zone_kma_flags(zone_t z, zone_security_flags_t zsflags, zalloc_flags_t flags)
3863 {
3864 kma_flags_t kmaflags = KMA_KOBJECT | KMA_ZERO;
3865
3866 if (zsflags.z_noencrypt) {
3867 kmaflags |= KMA_NOENCRYPT;
3868 }
3869 if (flags & Z_NOPAGEWAIT) {
3870 kmaflags |= KMA_NOPAGEWAIT;
3871 }
3872 if (z->z_permanent || (!z->z_destructible && zsflags.z_va_sequester)) {
3873 kmaflags |= KMA_PERMANENT;
3874 }
3875 if (zsflags.z_submap_from_end) {
3876 kmaflags |= KMA_LAST_FREE;
3877 }
3878
3879 return kmaflags;
3880 }
3881
3882 static inline void
zone_add_wired_pages(uint32_t pages)3883 zone_add_wired_pages(uint32_t pages)
3884 {
3885 size_t count = os_atomic_add(&zone_pages_wired, pages, relaxed);
3886
3887 #if CONFIG_ZLEAKS
3888 if (__improbable(zleak_should_activate(count) &&
3889 startup_phase >= STARTUP_SUB_THREAD_CALL)) {
3890 thread_call_enter(&zone_leaks_callout);
3891 }
3892 #else
3893 (void)count;
3894 #endif
3895 }
3896
3897 static inline void
zone_remove_wired_pages(uint32_t pages)3898 zone_remove_wired_pages(uint32_t pages)
3899 {
3900 size_t count = os_atomic_sub(&zone_pages_wired, pages, relaxed);
3901
3902 #if CONFIG_ZLEAKS
3903 if (__improbable(zleak_should_deactivate(count) &&
3904 startup_phase >= STARTUP_SUB_THREAD_CALL)) {
3905 thread_call_enter(&zone_leaks_callout);
3906 }
3907 #else
3908 (void)count;
3909 #endif
3910 }
3911
3912 /*!
3913 * @function zcram_and_lock()
3914 *
3915 * @brief
3916 * Prepare some memory for being usable for allocation purposes.
3917 *
3918 * @discussion
3919 * Prepare memory in <code>[addr + ptoa(pg_start), addr + ptoa(pg_end))</code>
3920 * to be usable in the zone.
3921 *
3922 * This function assumes the metadata is already populated for the range.
3923 *
3924 * Calling this function with @c pg_start being 0 means that the memory
3925 * is either a partial chunk, or a full chunk, that isn't published anywhere
3926 * and the initialization can happen without locks held.
3927 *
3928 * Calling this function with a non zero @c pg_start means that we are extending
3929 * an existing chunk: the memory in <code>[addr, addr + ptoa(pg_start))</code>,
3930 * is already usable and published in the zone, so extending it requires holding
3931 * the zone lock.
3932 *
3933 * @param zone The zone to cram new populated pages into
3934 * @param addr The base address for the chunk(s)
3935 * @param pg_va_new The number of virtual pages newly assigned to the zone
3936 * @param pg_start The first newly populated page relative to @a addr.
3937 * @param pg_end The after-last newly populated page relative to @a addr.
3938 * @param lock 0 or ZM_ALLOC_SIZE_LOCK (used by early crams)
3939 */
static void
zcram_and_lock(zone_t zone, vm_offset_t addr, uint32_t pg_va_new,
    uint32_t pg_start, uint32_t pg_end, uint16_t lock)
{
	zone_id_t zindex = zone_index(zone);
	vm_offset_t elem_size = zone_elem_size_safe(zone);
	uint32_t free_start = 0, free_end = 0;
	uint32_t oob_offs = zone_oob_offs(zone);

	struct zone_page_metadata *meta = zone_meta_from_addr(addr);
	uint32_t chunk_pages = zone->z_chunk_pages;
	bool guarded = meta->zm_guarded;

	assert(pg_start < pg_end && pg_end <= chunk_pages);

	if (pg_start == 0) {
		/*
		 * Brand new (possibly partial) chunk: initialize all of its
		 * metadata from scratch.  Nothing is published yet, so this
		 * part runs without the zone lock held.
		 */
		uint16_t chunk_len = (uint16_t)pg_end;
		uint16_t secondary_len = ZM_SECONDARY_PAGE;
		bool inline_bitmap = false;

		if (zone->z_percpu) {
			/* per-cpu chunks account as a single (logical) page */
			chunk_len = 1;
			secondary_len = ZM_SECONDARY_PCPU_PAGE;
			assert(pg_end == zpercpu_count());
		}
		if (!zone->z_permanent) {
			/* small chunks keep their free bitmap inline in meta */
			inline_bitmap = zone->z_chunk_elems <= 32 * chunk_pages;
		}

		/* page 0: primary metadata describing the whole chunk */
		meta[0] = (struct zone_page_metadata){
			.zm_index = zindex,
			.zm_guarded = guarded,
			.zm_inline_bitmap = inline_bitmap,
			.zm_chunk_len = chunk_len,
			.zm_alloc_size = lock,
		};

		/* pages 1..n: secondary metadata pointing back to page 0 */
		for (uint16_t i = 1; i < chunk_pages; i++) {
			meta[i] = (struct zone_page_metadata){
				.zm_index = zindex,
				.zm_guarded = guarded,
				.zm_inline_bitmap = inline_bitmap,
				.zm_chunk_len = secondary_len,
				.zm_page_index = (uint8_t)i,
				.zm_subchunk_len = (uint8_t)(chunk_pages - i),
			};
		}

		free_end = (uint32_t)(ptoa(chunk_len) - oob_offs) / elem_size;
		if (!zone->z_permanent) {
			zone_meta_bits_init(meta, free_end, zone->z_chunk_elems);
		}
	} else {
		/*
		 * Extending an already published chunk with newly populated
		 * pages: only the element range [free_start, free_end) is new.
		 */
		assert(!zone->z_percpu && !zone->z_permanent);

		free_end = (uint32_t)(ptoa(pg_end) - oob_offs) / elem_size;
		free_start = (uint32_t)(ptoa(pg_start) - oob_offs) / elem_size;
	}

#if VM_TAG_SIZECLASSES
	if (__improbable(zone->z_uses_tags)) {
		assert(!zone->z_percpu);
		ztMemoryAdd(zone, addr + ptoa(pg_start),
		    ptoa(pg_end - pg_start));
	}
#endif /* VM_TAG_SIZECLASSES */

	/*
	 * Insert the initialized pages / metadatas into the right lists.
	 */

	zone_lock(zone);
	assert(zone->z_self == zone);

	if (pg_start != 0) {
		assert(meta->zm_chunk_len == pg_start);

		/* publish the new free elements, grow the chunk length */
		zone_meta_bits_merge(meta, free_start, free_end);
		meta->zm_chunk_len = (uint16_t)pg_end;

		/*
		 * consume the zone_meta_lock_in_partial()
		 * done in zone_expand_locked()
		 */
		zone_meta_alloc_size_sub(zone, meta, ZM_ALLOC_SIZE_LOCK);
		zone_meta_remqueue(zone, meta);
	}

	if (zone->z_permanent || meta->zm_alloc_size) {
		zone_meta_queue_push(zone, &zone->z_pageq_partial, meta);
	} else {
		zone_meta_queue_push(zone, &zone->z_pageq_empty, meta);
		zone->z_wired_empty += zone->z_percpu ? 1 : pg_end;
	}
	if (pg_end < chunk_pages) {
		/* push any non populated residual VA on z_pageq_va */
		zone_meta_queue_push(zone, &zone->z_pageq_va, meta + pg_end);
	}

	/* free-element, availability, and wired-page accounting */
	zone_elems_free_add(zone, free_end - free_start);
	zone->z_elems_avail += free_end - free_start;
	zone->z_wired_cur += zone->z_percpu ? 1 : pg_end - pg_start;
	if (pg_va_new) {
		zone->z_va_cur += zone->z_percpu ? 1 : pg_va_new;
	}
	if (zone->z_wired_hwm < zone->z_wired_cur) {
		zone->z_wired_hwm = zone->z_wired_cur;
	}

#if CONFIG_ZLEAKS
	/* crossing the zleaks watermark: ask the callout to enable tracking */
	if (__improbable(zleak_should_enable_for_zone(zone) &&
	    startup_phase >= STARTUP_SUB_THREAD_CALL)) {
		thread_call_enter(&zone_leaks_callout);
	}
#endif /* CONFIG_ZLEAKS */

	zone_add_wired_pages(pg_end - pg_start);
}
4058
4059 static void
zcram(zone_t zone,vm_offset_t addr,uint32_t pages,uint16_t lock)4060 zcram(zone_t zone, vm_offset_t addr, uint32_t pages, uint16_t lock)
4061 {
4062 uint32_t chunk_pages = zone->z_chunk_pages;
4063
4064 assert(pages % chunk_pages == 0);
4065 for (; pages > 0; pages -= chunk_pages, addr += ptoa(chunk_pages)) {
4066 zcram_and_lock(zone, addr, chunk_pages, 0, chunk_pages, lock);
4067 zone_unlock(zone);
4068 }
4069 }
4070
4071 __startup_func
4072 void
zone_cram_early(zone_t zone,vm_offset_t newmem,vm_size_t size)4073 zone_cram_early(zone_t zone, vm_offset_t newmem, vm_size_t size)
4074 {
4075 uint32_t pages = (uint32_t)atop(size);
4076
4077 assert(from_zone_map(newmem, size));
4078 assert3u(size % ptoa(zone->z_chunk_pages), ==, 0);
4079 assert3u(startup_phase, <, STARTUP_SUB_ZALLOC);
4080
4081 /*
4082 * The early pages we move at the pmap layer can't be "depopulated"
4083 * because there's no vm_page_t for them.
4084 *
4085 * "Lock" them so that they never hit z_pageq_empty.
4086 */
4087 bzero((void *)newmem, size);
4088 zcram(zone, newmem, pages, ZM_ALLOC_SIZE_LOCK);
4089 }
4090
4091 __attribute__((overloadable))
4092 static inline bool
zone_submap_is_sequestered(zone_submap_idx_t idx)4093 zone_submap_is_sequestered(zone_submap_idx_t idx)
4094 {
4095 switch (idx) {
4096 case Z_SUBMAP_IDX_READ_ONLY:
4097 case Z_SUBMAP_IDX_VM:
4098 return true;
4099 case Z_SUBMAP_IDX_DATA:
4100 return false;
4101 default:
4102 return ZSECURITY_CONFIG(SEQUESTER);
4103 }
4104 }
4105
4106 __attribute__((overloadable))
4107 static inline bool
zone_submap_is_sequestered(zone_security_flags_t zsflags)4108 zone_submap_is_sequestered(zone_security_flags_t zsflags)
4109 {
4110 return zone_submap_is_sequestered(zsflags.z_submap_idx);
4111 }
4112
4113 /*!
4114 * @function zone_submap_alloc_sequestered_va
4115 *
4116 * @brief
4117 * Allocates VA without using vm_find_space().
4118 *
4119 * @discussion
4120 * Allocate VA quickly without using the slower vm_find_space() for cases
4121 * when the submaps are fully sequestered.
4122 *
4123 * The VM submap is used to implement the VM itself so it is always sequestered,
4124 * as it can't kmem_alloc which needs to always allocate vm entries.
4125 * However, it can use vm_map_enter() which tries to coalesce entries, which
4126 * always works, so the VM map only ever needs 2 entries (one for each end).
4127 *
4128 * The RO submap is similarly always sequestered if it exists (as a non
4129 * sequestered RO submap makes very little sense).
4130 *
4131 * The allocator is a very simple bump-allocator
4132 * that allocates from either end.
4133 */
static kern_return_t
zone_submap_alloc_sequestered_va(zone_security_flags_t zsflags, uint32_t pages,
    vm_offset_t *addrp)
{
	vm_size_t size = ptoa(pages);
	vm_map_t map = zone_submap(zsflags);
	vm_map_entry_t first, last;
	vm_map_offset_t addr;

	vm_map_lock(map);

	/*
	 * Fully sequestered submaps keep one entry anchored at each end of
	 * the map (see function header); the unused VA is the gap between
	 * the two.
	 */
	first = vm_map_first_entry(map);
	last = vm_map_last_entry(map);

	/* out of VA: the two end entries would overlap after this grow */
	if (first->vme_end + size > last->vme_start) {
		vm_map_unlock(map);
		return KERN_NO_SPACE;
	}

	if (zsflags.z_submap_from_end) {
		/* bump-allocate downwards, growing the last entry */
		last->vme_start -= size;
		addr = last->vme_start;
		VME_OFFSET_SET(last, addr);
	} else {
		/* bump-allocate upwards, growing the first entry */
		addr = first->vme_end;
		first->vme_end += size;
	}
	map->size += size;

	vm_map_unlock(map);

	*addrp = addr;
	return KERN_SUCCESS;
}
4168
void
zone_fill_initially(zone_t zone, vm_size_t nelems)
{
	/*
	 * Grow @c zone once, with permanent (KMA_PERMANENT) memory, so it can
	 * hold at least @c nelems elements.  Only valid for zones that are
	 * neither permanent, collectable nor destructible, and that haven't
	 * been grown yet (asserted below).
	 */
	kma_flags_t kmaflags = KMA_NOFAIL | KMA_PERMANENT;
	kern_return_t kr;
	vm_offset_t addr;
	uint32_t pages;
	zone_security_flags_t zsflags = zone_security_config(zone);

	assert(!zone->z_permanent && !zone->collectable && !zone->z_destructible);
	assert(zone->z_elems_avail == 0);

	kmaflags |= zone_kma_flags(zone, zsflags, Z_WAITOK);
	pages = zone_alloc_pages_for_nelems(zone, nelems);
	if (zone_submap_is_sequestered(zsflags)) {
		/* sequestered submap: bump-allocate VA, then populate it */
		kr = zone_submap_alloc_sequestered_va(zsflags, pages, &addr);
		if (kr != KERN_SUCCESS) {
			panic("zone_submap_alloc_sequestered_va() "
			    "of %u pages failed", pages);
		}
		kernel_memory_populate(addr, ptoa(pages),
		    kmaflags, VM_KERN_MEMORY_ZONE);
	} else {
		/* regular submap: a plain kmem_alloc does both at once */
		assert(zsflags.z_submap_idx != Z_SUBMAP_IDX_READ_ONLY);
		kmem_alloc(zone_submap(zsflags), &addr, ptoa(pages),
		    kmaflags, VM_KERN_MEMORY_ZONE);
	}

	zone_meta_populate(addr, ptoa(pages));
	zcram(zone, addr, pages, 0);
}
4200
4201 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
4202 __attribute__((noinline))
4203 static void
zone_scramble_va_and_unlock(zone_t z,struct zone_page_metadata * meta,uint32_t runs,uint32_t pages,uint32_t chunk_pages,uint64_t guard_mask)4204 zone_scramble_va_and_unlock(
4205 zone_t z,
4206 struct zone_page_metadata *meta,
4207 uint32_t runs,
4208 uint32_t pages,
4209 uint32_t chunk_pages,
4210 uint64_t guard_mask)
4211 {
4212 struct zone_page_metadata *arr[ZONE_CHUNK_ALLOC_SIZE / 4096];
4213
4214 for (uint32_t run = 0, n = 0; run < runs; run++) {
4215 arr[run] = meta + n;
4216 n += chunk_pages + ((guard_mask >> run) & 1);
4217 }
4218
4219 /*
4220 * Fisher–Yates shuffle, for an array with indices [0, n)
4221 *
4222 * for i from n−1 downto 1 do
4223 * j ← random integer such that 0 ≤ j ≤ i
4224 * exchange a[j] and a[i]
4225 *
4226 * The point here is that early allocations aren't at a fixed
4227 * distance from each other.
4228 */
4229 for (uint32_t i = runs - 1; i > 0; i--) {
4230 uint32_t j = zalloc_random_uniform32(0, i + 1);
4231
4232 meta = arr[j];
4233 arr[j] = arr[i];
4234 arr[i] = meta;
4235 }
4236
4237 zone_lock(z);
4238
4239 for (uint32_t i = 0; i < runs; i++) {
4240 zone_meta_queue_push(z, &z->z_pageq_va, arr[i]);
4241 }
4242 z->z_va_cur += z->z_percpu ? runs : pages;
4243 }
4244
static inline uint32_t
dist_u32(uint32_t a, uint32_t b)
{
	/* Absolute difference of two unsigned 32-bit values. */
	return a > b ? a - b : b - a;
}
4250
/*
 * Clear @c n bits from @c mask, chosen uniformly at random among its
 * set bits.
 *
 * @param mask  the starting bit mask.
 * @param pop   the number of bits set in @c mask (its popcount).
 * @param n     how many set bits to clear (assumes n <= pop).
 */
static uint64_t
zalloc_random_clear_n_bits(uint64_t mask, uint32_t pop, uint32_t n)
{
	for (; n-- > 0; pop--) {
		/* index, among the set bits of mask, of the bit to clear */
		uint32_t bit = zalloc_random_uniform32(0, pop);
		uint64_t m = mask;

		/* strip the `bit` lowest set bits of m ... */
		for (; bit; bit--) {
			m &= m - 1;
		}

		/* ... so m's lowest set bit is the chosen one: clear it */
		mask ^= 1ull << __builtin_ctzll(m);
	}

	return mask;
}
4267
4268 /**
4269 * @function zalloc_random_bits
4270 *
4271 * @brief
 * Compute a random number with a specified number of bits set in a given width.
4273 *
4274 * @discussion
4275 * This function generates a "uniform" distribution of sets of bits set in
4276 * a given width, with typically less than width/4 calls to random.
4277 *
4278 * @param pop the target number of bits set.
4279 * @param width the number of bits in the random integer to generate.
4280 */
static uint64_t
zalloc_random_bits(uint32_t pop, uint32_t width)
{
	/*
	 * NOTE(review): assumes width < 64 — `1ull << width` would be UB at
	 * width == 64; callers pass at most `runs - 1` which the
	 * static_assert in zone_allocate_va_locked() caps at 63.
	 */
	uint64_t w_mask = (1ull << width) - 1;
	uint64_t mask;
	uint32_t cur;

	if (3 * width / 4 <= pop) {
		/* dense target: seed with all bits set, clear down to pop */
		mask = w_mask;
		cur = width;
	} else if (pop <= width / 4) {
		/* sparse target: seed with no bits set, "set" up to pop */
		mask = 0;
		cur = 0;
	} else {
		/*
		 * Choosing a random number this way will overwhelmingly
		 * contain about `width / 2` bits, +/- a few.
		 */
		mask = zalloc_random_mask64(width);
		cur = __builtin_popcountll(mask);

		if (dist_u32(cur, pop) > dist_u32(width - cur, pop)) {
			/*
			 * If the opposite mask has a closer popcount,
			 * then start with that one as the seed.
			 */
			cur = width - cur;
			mask ^= w_mask;
		}
	}

	if (cur < pop) {
		/*
		 * Setting `pop - cur` bits is really clearing that many from
		 * the opposite mask.
		 */
		mask ^= w_mask;
		mask = zalloc_random_clear_n_bits(mask, width - cur, pop - cur);
		mask ^= w_mask;
	} else if (pop < cur) {
		mask = zalloc_random_clear_n_bits(mask, cur, cur - pop);
	}

	return mask;
}
4326 #endif
4327
/*
 * Allocate fresh VA (no physical pages) for @c z and push it on z_pageq_va.
 * Called and returns with the zone lock held; the lock is dropped while
 * the submap allocation takes place.
 */
static void
zone_allocate_va_locked(zone_t z, zalloc_flags_t flags)
{
	zone_security_flags_t zsflags = zone_security_config(z);
	struct zone_page_metadata *meta;
	kma_flags_t kmaflags = zone_kma_flags(z, zsflags, flags) | KMA_VAONLY;
	uint32_t chunk_pages = z->z_chunk_pages;
	uint32_t runs, pages, guards, rnum;
	uint64_t guard_mask = 0;
	bool lead_guard = false;
	kern_return_t kr;
	vm_offset_t addr;

	zone_unlock(z);

	/*
	 * A lot of OOB exploitation techniques rely on precise placement
	 * and interleaving of zone pages. The layout that is sought
	 * by attackers will be C/P/T types, where:
	 * - (C)ompromised is the type for which attackers have a bug,
	 * - (P)adding is used to pad memory,
	 * - (T)arget is the type that the attacker will attempt to corrupt
	 *   by exploiting (C).
	 *
	 * Note that in some cases C==T and P isn't needed.
	 *
	 * In order to make those placement games much harder,
	 * we grow zones by random runs of memory, up to 256k.
	 * This makes predicting the precise layout of the heap
	 * quite more complicated.
	 *
	 * Note: this function makes a very heavy use of random,
	 * however, it is mostly limited to sequestered zones,
	 * and eventually the layout will be fixed,
	 * and the usage of random vastly reduced.
	 *
	 * For non sequestered zones, there's a single call
	 * to random in order to decide whether we want
	 * a guard page or not.
	 */
	pages = chunk_pages;
	guards = 0;
	runs = 1;
#if ZSECURITY_CONFIG(SAD_FENG_SHUI)
	if (!z->z_percpu && zone_submap_is_sequestered(zsflags)) {
		/* pick a random run count in [1, max runs per 256k] */
		pages = atop(ZONE_CHUNK_ALLOC_SIZE);
		runs = (pages + chunk_pages - 1) / chunk_pages;
		runs = zalloc_random_uniform32(1, runs + 1);
		pages = runs * chunk_pages;
	}
	static_assert(ZONE_CHUNK_ALLOC_SIZE / 4096 <= 64,
	    "make sure that `runs` will never be larger than 64");
#endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */

	/*
	 * Zones that are susceptible to OOB (kalloc, ZC_PGZ_USE_GUARDS),
	 * guards might be added after each chunk.
	 *
	 * Those guard pages are marked with the ZM_PGZ_GUARD
	 * magical chunk len, and their zm_oob_offs field
	 * is used to remember optional shift applied
	 * to returned elements, in order to right-align-them
	 * as much as possible.
	 *
	 * In an adversarial context, while guard pages
	 * are extremely effective against linear overflow,
	 * using a predictable density of guard pages feels like
	 * a missed opportunity. Which is why we chose to insert
	 * one guard page for about 32k of memory, and place it
	 * randomly.
	 */
#if ZSECURITY_CONFIG(SAD_FENG_SHUI)
	if (z->z_percpu) {
		/*
		 * For per-cpu runs, have a 75% chance to have a guard.
		 */
		rnum = zalloc_random_uniform32(0, 4 * 128);
		guards = rnum >= 128;
	} else if (!zsflags.z_pgz_use_guards && !z->z_pgz_use_guards) {
		vm_offset_t rest;

		/*
		 * For types that are less susceptible to have OOBs,
		 * have a density of 1 guard every 64k, with a uniform
		 * distribution.
		 */
		rnum = zalloc_random_uniform32(0, ZONE_GUARD_SPARSE);
		guards = (uint32_t)ptoa(pages) / ZONE_GUARD_SPARSE;
		rest = (uint32_t)ptoa(pages) % ZONE_GUARD_SPARSE;
		guards += rnum < rest;
	} else if (ptoa(chunk_pages) >= ZONE_GUARD_DENSE) {
		/*
		 * For chunks >= 32k, have a 75% chance of guard pages
		 * between chunks.
		 */
		rnum = zalloc_random_uniform32(65, 129);
		guards = runs * rnum / 128;
	} else {
		vm_offset_t rest;

		/*
		 * Otherwise, aim at 1 guard every 32k,
		 * with a uniform distribution.
		 */
		rnum = zalloc_random_uniform32(0, ZONE_GUARD_DENSE);
		guards = (uint32_t)ptoa(pages) / ZONE_GUARD_DENSE;
		rest = (uint32_t)ptoa(pages) % ZONE_GUARD_DENSE;
		guards += rnum < rest;
	}
	assert3u(guards, <=, runs);

	guard_mask = 0;

	if (!z->z_percpu && zone_submap_is_sequestered(zsflags)) {
		uint32_t g = 0;

		/*
		 * Several exploitation strategies rely on a C/T (compromised
		 * then target types) ordering of pages with a sub-page reach
		 * from C into T.
		 *
		 * We want to reliably thwart such exploitations
		 * and hence force a guard page between alternating
		 * memory types.
		 */
		guard_mask |= 1ull << (runs - 1);
		g++;

		/*
		 * While we randomize the chunks lengths, an attacker with
		 * precise timing control can guess when overflows happen,
		 * and "measure" the runs, which gives them an indication
		 * of where the next run start offset is.
		 *
		 * In order to make this knowledge unusable, add a guard page
		 * _before_ the new run with a 25% probability, regardless
		 * of whether we had enough guard pages.
		 */
		if ((rnum & 3) == 0) {
			lead_guard = true;
			g++;
		}
		if (guards > g) {
			/* scatter the remaining guards among the other runs */
			guard_mask |= zalloc_random_bits(guards - g, runs - 1);
		} else {
			guards = g;
		}
	} else {
		assert3u(runs, ==, 1);
		assert3u(guards, <=, 1);
		guard_mask = guards << (runs - 1);
	}
#else
	(void)rnum;
#endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */

	if (zone_submap_is_sequestered(zsflags)) {
		kr = zone_submap_alloc_sequestered_va(zsflags,
		    pages + guards, &addr);
	} else {
		assert(zsflags.z_submap_idx != Z_SUBMAP_IDX_READ_ONLY);
		kr = kmem_alloc(zone_submap(zsflags), &addr,
		    ptoa(pages + guards), kmaflags, VM_KERN_MEMORY_ZONE);
	}

	if (kr != KERN_SUCCESS) {
		uint64_t zone_size = 0;
		zone_t zone_largest = zone_find_largest(&zone_size);
		panic("zalloc[%d]: zone map exhausted while allocating from zone [%s%s], "
		    "likely due to memory leak in zone [%s%s] "
		    "(%luM, %d elements allocated)",
		    kr, zone_heap_name(z), zone_name(z),
		    zone_heap_name(zone_largest), zone_name(zone_largest),
		    (unsigned long)zone_size >> 20,
		    zone_count_allocated(zone_largest));
	}

	meta = zone_meta_from_addr(addr);
	zone_meta_populate(addr, ptoa(pages + guards));

	/*
	 * Handle the leading guard page if any
	 */
	if (lead_guard) {
		meta[0].zm_index = zone_index(z);
		meta[0].zm_chunk_len = ZM_PGZ_GUARD;
		meta[0].zm_guarded = true;
		meta++;
	}

	/* stamp the zone index on every page, marking trailing guards */
	for (uint32_t run = 0, n = 0; run < runs; run++) {
		bool guarded = (guard_mask >> run) & 1;

		for (uint32_t i = 0; i < chunk_pages; i++, n++) {
			meta[n].zm_index = zone_index(z);
			meta[n].zm_guarded = guarded;
		}
		if (guarded) {
			meta[n].zm_index = zone_index(z);
			meta[n].zm_chunk_len = ZM_PGZ_GUARD;
			n++;
		}
	}
	if (guards) {
		os_atomic_add(&zone_guard_pages, guards, relaxed);
	}

#if ZSECURITY_CONFIG(SAD_FENG_SHUI)
	if (__improbable(zone_caching_disabled < 0)) {
		/* zone_scramble_va_and_unlock() retakes the zone lock */
		return zone_scramble_va_and_unlock(z, meta, runs, pages,
		    chunk_pages, guard_mask);
	}
#endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */

	zone_lock(z);

	for (uint32_t run = 0, n = 0; run < runs; run++) {
		zone_meta_queue_push(z, &z->z_pageq_va, meta + n);
		n += chunk_pages + ((guard_mask >> run) & 1);
	}
	z->z_va_cur += z->z_percpu ? runs : pages;
}
4550
/* Predicate substituted by zone_expand_locked() to make it try exactly once. */
static bool
zone_expand_pred_nope(__unused zone_t z)
{
	return false;
}
4556
/*
 * Emit the DBG_FUNC_START tracepoint for a kernel-memory request of
 * @c size bytes (DEBUG/DEVELOPMENT kernels only; no-op otherwise).
 */
static inline void
ZONE_TRACE_VM_KERN_REQUEST_START(vm_size_t size)
{
#if DEBUG || DEVELOPMENT
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
	    size, 0, 0, 0);
#else
	(void)size;
#endif
}
4567
/*
 * Emit the DBG_FUNC_END tracepoint for a kernel-memory request, crediting
 * the grabbed pages to the current task's ledger when one exists
 * (DEBUG/DEVELOPMENT kernels only; no-op otherwise).
 */
static inline void
ZONE_TRACE_VM_KERN_REQUEST_END(uint32_t pages)
{
#if DEBUG || DEVELOPMENT
	task_t task = current_task_early();
	if (pages && task) {
		ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, pages);
	}
	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
	    pages, 0, 0, 0);
#else
	(void)pages;
#endif
}
4582
/*
 * Slow path taken when the zone map is close to exhaustion: kick the GC
 * thread and, unless the caller is allowed to overcommit, block until
 * @c pgs pages fit under the wired limit again.
 */
__attribute__((noinline))
static void
__ZONE_MAP_EXHAUSTED_AND_WAITING_FOR_GC__(zone_t z, uint32_t pgs)
{
	uint64_t wait_start = 0;
	long mapped;

	thread_wakeup(VM_PAGEOUT_GC_EVENT);

	if ((z >= &zone_array[ZONE_ID_VM_MAP] &&
	    z <= &zone_array[ZONE_ID_VM_MAP_HOLES]) ||
	    current_thread()->options & TH_OPT_VMPRIV) {
		/*
		 * VM_MAP, VM_MAP_ENTRY and VM_MAP_HOLES zones are allowed
		 * to overcommit because they're used to reclaim memory
		 * (VM support).  The same goes for VM privileged threads.
		 */
		return;
	}

	mapped = os_atomic_load(&zone_pages_wired, relaxed);

	/*
	 * If the zone map is really exhausted, wait on the GC thread,
	 * donating our priority (which is important because the GC
	 * thread is at a rather low priority).
	 */
	for (uint32_t n = 1; mapped >= zone_pages_wired_max - pgs; n++) {
		/* growing backoff: 1, 3, 6, 10, ... ms (n(n+1)/2) */
		uint32_t wait_ms = n * (n + 1) / 2;
		uint64_t interval;

		if (n == 1) {
			wait_start = mach_absolute_time();
		} else {
			thread_wakeup(VM_PAGEOUT_GC_EVENT);
		}
		if (zone_exhausted_timeout > 0 &&
		    wait_ms > zone_exhausted_timeout) {
			panic("zone map exhaustion: waited for %dms "
			    "(pages: %ld, max: %ld, wanted: %d)",
			    wait_ms, mapped, zone_pages_wired_max, pgs);
		}

		clock_interval_to_absolutetime_interval(wait_ms, NSEC_PER_MSEC,
		    &interval);

		lck_spin_lock(&zone_exhausted_lock);
		lck_spin_sleep_with_inheritor(&zone_exhausted_lock,
		    LCK_SLEEP_UNLOCK, &zone_pages_wired,
		    vm_pageout_gc_thread, THREAD_UNINT, wait_start + interval);

		mapped = os_atomic_load(&zone_pages_wired, relaxed);
	}
}
4637
4638 static bool
zone_expand_wait_for_pages(bool waited)4639 zone_expand_wait_for_pages(bool waited)
4640 {
4641 if (waited) {
4642 return false;
4643 }
4644 #if DEBUG || DEVELOPMENT
4645 if (zalloc_simulate_vm_pressure) {
4646 return false;
4647 }
4648 #endif /* DEBUG || DEVELOPMENT */
4649 return !vm_pool_low();
4650 }
4651
/*
 * Grow @c z with freshly populated memory.
 *
 * Called and returns with the zone lock held (dropped internally while
 * waiting for pages).  A NULL @c pred means a single expansion attempt;
 * otherwise expansion repeats while @c pred(z) returns true.
 */
static void
zone_expand_locked(zone_t z, zalloc_flags_t flags, bool (*pred)(zone_t))
{
	thread_t self = current_thread();
	bool vm_priv = (self->options & TH_OPT_VMPRIV);
	bool clear_vm_priv;
	/* only valid when clear_vm_priv is true */
	thread_pri_floor_t token;
	zone_security_flags_t zsflags = zone_security_config(z);

	for (;;) {
		if (!pred) {
			/* NULL pred means "try just once" */
			pred = zone_expand_pred_nope;
		} else if (!pred(z)) {
			return;
		}

		if (vm_priv && !z->z_expander_vm_priv) {
			/*
			 * Claim the vm priv overcommit slot
			 *
			 * We do not track exact ownership for VM privileged
			 * threads, so use the rwlock boost as a stop-gap
			 * just in case.
			 */
			token = thread_priority_floor_start();
			z->z_expander_vm_priv = true;
			clear_vm_priv = true;
		} else {
			clear_vm_priv = false;
		}

		if (z->z_expander == NULL) {
			z->z_expander = self;
			break;
		}
		if (clear_vm_priv) {
			break;
		}

		if (flags & Z_NOPAGEWAIT) {
			return;
		}

		/* another thread is expanding: wait for it to finish */
		z->z_expanding_wait = true;
		lck_ticket_sleep_with_inheritor(&z->z_lock, &zone_locks_grp,
		    LCK_SLEEP_DEFAULT, &z->z_expander, z->z_expander,
		    TH_UNINT, TIMEOUT_WAIT_FOREVER);
	}

	do {
		struct zone_page_metadata *meta = NULL;
		uint32_t new_va = 0, cur_pages = 0, min_pages = 0, pages = 0;
		vm_page_t page_list = NULL;
		vm_offset_t addr = 0;
		int waited = 0;

		/*
		 * While we hold the zone lock, look if there's VA we can:
		 * - complete from partial pages,
		 * - reuse from the sequester list.
		 *
		 * When the page is being populated we pretend we allocated
		 * an extra element so that zone_gc() can't attempt to free
		 * the chunk (as it could become empty while we wait for pages).
		 */
		if (zone_pva_is_null(z->z_pageq_va)) {
			zone_allocate_va_locked(z, flags);
		}

		meta = zone_meta_queue_pop(z, &z->z_pageq_va);
		addr = zone_meta_to_addr(meta);
		if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
			/* partially populated chunk: rewind to its head page */
			cur_pages = meta->zm_page_index;
			meta -= cur_pages;
			addr -= ptoa(cur_pages);
			zone_meta_lock_in_partial(z, meta, cur_pages);
		}
		zone_unlock(z);

		/*
		 * And now allocate pages to populate our VA.
		 */
		if (z->z_percpu) {
			min_pages = z->z_chunk_pages;
		} else {
			/* at minimum, enough pages to hold one element */
			min_pages = (uint32_t)atop(round_page(zone_oob_offs(z) +
			    zone_elem_size(z)));
		}

		/*
		 * Trigger jetsams via VM_PAGEOUT_GC_EVENT
		 * if we're running out of zone memory
		 */
		if (__improbable(zone_map_nearing_exhaustion())) {
			__ZONE_MAP_EXHAUSTED_AND_WAITING_FOR_GC__(z, min_pages);
		}

		ZONE_TRACE_VM_KERN_REQUEST_START(ptoa(z->z_chunk_pages - cur_pages));

		while (pages < z->z_chunk_pages - cur_pages) {
			vm_page_t m = vm_page_grab();

			if (m) {
				pages++;
				m->vmp_snext = page_list;
				page_list = m;
				vm_page_zero_fill(m);
				continue;
			}

			if (pages >= min_pages &&
			    !zone_expand_wait_for_pages(waited)) {
				/* enough for a partial chunk: settle for it */
				break;
			}

			if ((flags & Z_NOPAGEWAIT) == 0) {
				waited++;
				VM_PAGE_WAIT();
				continue;
			}

			/*
			 * Undo everything and bail out:
			 *
			 * - free pages
			 * - undo the fake allocation if any
			 * - put the VA back on the VA page queue.
			 */
			vm_page_free_list(page_list, FALSE);
			ZONE_TRACE_VM_KERN_REQUEST_END(pages);

			zone_lock(z);

			if (cur_pages) {
				zone_meta_unlock_from_partial(z, meta, cur_pages);
			}
			if (meta) {
				zone_meta_queue_push(z, &z->z_pageq_va,
				    meta + cur_pages);
			}
			goto page_shortage;
		}

		vm_object_lock(kernel_object);
		kernel_memory_populate_object_and_unlock(kernel_object,
		    addr + ptoa(cur_pages), addr + ptoa(cur_pages), ptoa(pages), page_list,
		    zone_kma_flags(z, zsflags, flags), VM_KERN_MEMORY_ZONE,
		    (zsflags.z_submap_idx == Z_SUBMAP_IDX_READ_ONLY)
		    ? VM_PROT_READ : VM_PROT_READ | VM_PROT_WRITE);

		ZONE_TRACE_VM_KERN_REQUEST_END(pages);

		/* publish the populated range; retakes the zone lock */
		zcram_and_lock(z, addr, new_va, cur_pages, cur_pages + pages, 0);
	} while (pred(z));

page_shortage:
	if (clear_vm_priv) {
		/* release the vm priv overcommit slot and priority floor */
		z->z_expander_vm_priv = false;
		thread_priority_floor_end(&token);
	}
	if (z->z_expander == self) {
		z->z_expander = THREAD_NULL;
	}
	if (z->z_expanding_wait) {
		z->z_expanding_wait = false;
		wakeup_all_with_inheritor(&z->z_expander, THREAD_AWAKENED);
	}
}
4821
4822 static bool
zalloc_needs_refill(zone_t zone)4823 zalloc_needs_refill(zone_t zone)
4824 {
4825 if (zone->z_elems_free > zone->z_elems_rsv) {
4826 return false;
4827 }
4828 if (zone->z_wired_cur < zone->z_wired_max) {
4829 return true;
4830 }
4831 if (zone->exhaustible) {
4832 return false;
4833 }
4834 if (zone->expandable) {
4835 /*
4836 * If we're expandable, just don't go through this again.
4837 */
4838 zone->z_wired_max = ~0u;
4839 return true;
4840 }
4841 zone_unlock(zone);
4842
4843 panic("zone '%s%s' exhausted", zone_heap_name(zone), zone_name(zone));
4844 }
4845
4846 static void
zone_expand_async(__unused thread_call_param_t p0,__unused thread_call_param_t p1)4847 zone_expand_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
4848 {
4849 zone_foreach(z) {
4850 if (z->no_callout) {
4851 /* z_async_refilling will never be set */
4852 continue;
4853 }
4854
4855 zone_lock(z);
4856 if (z->z_self && z->z_async_refilling) {
4857 z->z_async_refilling = false;
4858 zone_expand_locked(z, Z_WAITOK, zalloc_needs_refill);
4859 }
4860 zone_unlock(z);
4861 }
4862 }
4863
4864 static inline void
zone_expand_async_schedule_if_needed(zone_t zone)4865 zone_expand_async_schedule_if_needed(zone_t zone)
4866 {
4867 if (__improbable(startup_phase < STARTUP_SUB_THREAD_CALL)) {
4868 return;
4869 }
4870
4871 if (zone->z_elems_free > zone->z_elems_rsv || zone->z_async_refilling ||
4872 zone->no_callout) {
4873 return;
4874 }
4875
4876 if (!zone->expandable && zone->z_wired_cur >= zone->z_wired_max) {
4877 return;
4878 }
4879
4880 if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
4881 return;
4882 }
4883
4884 if (zone->z_elems_free == 0 || !vm_pool_low()) {
4885 zone->z_async_refilling = true;
4886 thread_call_enter(&zone_expand_callout);
4887 }
4888 }
4889
4890 #endif /* !ZALLOC_TEST */
4891 #pragma mark zone jetsam integration
4892 #if !ZALLOC_TEST
4893
4894 /*
4895 * We're being very conservative here and picking a value of 95%. We might need to lower this if
4896 * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
4897 */
4898 #define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
4899
4900 /*
4901 * Threshold above which largest zones should be included in the panic log
4902 */
4903 #define ZONE_MAP_EXHAUSTION_PRINT_PANIC 80
4904
4905 /*
4906 * Trigger zone-map-exhaustion jetsams if the zone map is X% full,
4907 * where X=zone_map_jetsam_limit.
4908 *
4909 * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
4910 */
4911 TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit",
4912 ZONE_MAP_JETSAM_LIMIT_DEFAULT);
4913
4914 kern_return_t
zone_map_jetsam_set_limit(uint32_t value)4915 zone_map_jetsam_set_limit(uint32_t value)
4916 {
4917 if (value <= 0 || value > 100) {
4918 return KERN_INVALID_VALUE;
4919 }
4920
4921 zone_map_jetsam_limit = value;
4922 os_atomic_store(&zone_pages_jetsam_threshold,
4923 zone_pages_wired_max * value / 100, relaxed);
4924 return KERN_SUCCESS;
4925 }
4926
4927 void
get_zone_map_size(uint64_t * current_size,uint64_t * capacity)4928 get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
4929 {
4930 vm_offset_t phys_pages = os_atomic_load(&zone_pages_wired, relaxed);
4931 *current_size = ptoa_64(phys_pages);
4932 *capacity = ptoa_64(zone_pages_wired_max);
4933 }
4934
4935 void
get_largest_zone_info(char * zone_name,size_t zone_name_len,uint64_t * zone_size)4936 get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
4937 {
4938 zone_t largest_zone = zone_find_largest(zone_size);
4939
4940 /*
4941 * Append kalloc heap name to zone name (if zone is used by kalloc)
4942 */
4943 snprintf(zone_name, zone_name_len, "%s%s",
4944 zone_heap_name(largest_zone), largest_zone->z_name);
4945 }
4946
4947 static bool
zone_map_nearing_threshold(unsigned int threshold)4948 zone_map_nearing_threshold(unsigned int threshold)
4949 {
4950 uint64_t phys_pages = os_atomic_load(&zone_pages_wired, relaxed);
4951 return phys_pages * 100 > zone_pages_wired_max * threshold;
4952 }
4953
4954 bool
zone_map_nearing_exhaustion(void)4955 zone_map_nearing_exhaustion(void)
4956 {
4957 vm_size_t pages = os_atomic_load(&zone_pages_wired, relaxed);
4958
4959 return pages >= os_atomic_load(&zone_pages_jetsam_threshold, relaxed);
4960 }
4961
4962
4963 #define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
4964
4965 /*
4966 * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
4967 * to walk through the jetsam priority bands and kill processes.
4968 */
static zone_t
kill_process_in_largest_zone(void)
{
	pid_t pid = -1;
	uint64_t zone_size = 0;
	zone_t largest_zone = zone_find_largest(&zone_size);

	/* log the map occupancy and the offender before taking action */
	printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, capacity %lld [jetsam limit %d%%]\n",
	    ptoa_64(os_atomic_load(&zone_pages_wired, relaxed)),
	    ptoa_64(zone_pages_wired_max),
	    (uint64_t)zone_submaps_approx_size(),
	    (uint64_t)zone_map_size(),
	    zone_map_jetsam_limit);
	printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone),
	    largest_zone->z_name, (uintptr_t)zone_size);

	/*
	 * We want to make sure we don't call this function from userspace.
	 * Or we could end up trying to synchronously kill the process
	 * whose context we're in, causing the system to hang.
	 */
	assert(current_task() == kernel_task);

	/*
	 * If vm_object_zone is the largest, check to see if the number of
	 * elements in vm_map_entry_zone is comparable.
	 *
	 * If so, consider vm_map_entry_zone as the largest. This lets us target
	 * a specific process to jetsam to quickly recover from the zone map
	 * bloat.
	 */
	if (largest_zone == vm_object_zone) {
		unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone);
		unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone);
		/* Is the VM map entries zone count >= 98% of the VM objects zone count? */
		if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
			largest_zone = vm_map_entry_zone;
			printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n",
			    (uintptr_t)zone_size_wired(largest_zone));
		}
	}

	/* TODO: Extend this to check for the largest process in other zones as well. */
	if (largest_zone == vm_map_entry_zone) {
		/* targeted kill: the process owning the most map entries */
		pid = find_largest_process_vm_map_entries();
	} else {
		/* no attribution possible: fall back to priority-band jetsam */
		printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. "
		    "Waking up memorystatus thread.\n", zone_heap_name(largest_zone),
		    largest_zone->z_name);
	}
	if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
		printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
	}

	return largest_zone;
}
5025
5026 #endif /* !ZALLOC_TEST */
5027 #pragma mark probabilistic gzalloc
5028 #if !ZALLOC_TEST
5029 #if CONFIG_PROB_GZALLOC
5030
extern uint32_t random(void);
/* Allocation/free backtrace saved for a PGZ slot (packed 32-bit offsets). */
struct pgz_backtrace {
	uint32_t pgz_depth;
	int32_t pgz_bt[MAX_ZTRACE_DEPTH];
};

/* Per-CPU countdown to the next sampled allocation (-1: refill deferred). */
static int32_t PERCPU_DATA(pgz_sample_counter);
/* Two backtraces (alloc at 2*slot, free at 2*slot+1), see pgz_bt(). */
static SECURITY_READ_ONLY_LATE(struct pgz_backtrace *) pgz_backtraces;
static uint32_t pgz_uses; /* number of zones using PGZ */
/* Budget of slots that may still be handed out by pgz_slot_alloc(). */
static int32_t pgz_slot_avail;
#if OS_ATOMIC_HAS_LLSC
/* Head of the free-slot list, popped with load/store-exclusive. */
struct zone_page_metadata *pgz_slot_head;
#else
/* Free-slot list head as a {generation, slot} pair for the CAS loop. */
static struct pgz_slot_head {
	uint32_t psh_count;
	uint32_t psh_slot;
} pgz_slot_head;
#endif
/* Tail of the free-slot list (enqueue side, see pgz_slot_free()). */
struct zone_page_metadata *pgz_slot_tail;
static SECURITY_READ_ONLY_LATE(vm_map_t) pgz_submap;
5051
5052 static struct zone_page_metadata *
pgz_meta(uint32_t index)5053 pgz_meta(uint32_t index)
5054 {
5055 return &zone_info.zi_pgz_meta[2 * index + 1];
5056 }
5057
5058 static struct pgz_backtrace *
pgz_bt(uint32_t slot,bool free)5059 pgz_bt(uint32_t slot, bool free)
5060 {
5061 return &pgz_backtraces[2 * slot + free];
5062 }
5063
/*
 * Captures the caller's backtrace (starting at frame pointer `fp`)
 * into `bt` in packed 32-bit offset form.
 */
static void
pgz_backtrace(struct pgz_backtrace *bt, void *fp)
{
	struct backtrace_control ctl = {
		.btc_frame_addr = (uintptr_t)fp,
	};

	/* backtrace_packed() returns a byte count; packed frames are 4 bytes */
	bt->pgz_depth = (uint32_t)backtrace_packed(BTP_KERN_OFFSET_32,
	    (uint8_t *)bt->pgz_bt, sizeof(bt->pgz_bt), &ctl, NULL) / 4;
}
5074
5075 static uint32_t
pgz_slot(vm_offset_t addr)5076 pgz_slot(vm_offset_t addr)
5077 {
5078 return (uint32_t)((addr - zone_info.zi_pgz_range.min_address) >> (PAGE_SHIFT + 1));
5079 }
5080
5081 static vm_offset_t
pgz_addr(uint32_t slot)5082 pgz_addr(uint32_t slot)
5083 {
5084 return zone_info.zi_pgz_range.min_address + ptoa(2 * slot + 1);
5085 }
5086
/*
 * Decides whether the current allocation should be PGZ-protected.
 *
 * Runs with preemption disabled (per-CPU counter access); note how the
 * potentially blocking RNG call below briefly re-enables preemption.
 */
static bool
pgz_sample(zalloc_flags_t flags)
{
	int32_t *counterp, cnt;

	counterp = PERCPU_GET(pgz_sample_counter);
	cnt = *counterp;
	/* fast path: countdown not expired yet, just decrement */
	if (__probable(cnt > 0)) {
		*counterp = cnt - 1;
		return false;
	}

	/* no free slot anyway: sampling would fail in pgz_slot_alloc() */
	if (pgz_slot_avail <= 0) {
		return false;
	}

	/*
	 * zalloc_random_uniform() might block, so when the sampled allocation
	 * requested Z_NOWAIT, set the counter to `-1` which will cause
	 * the next allocation that can block to generate a new random value.
	 * No allocation on this CPU will sample until then.
	 */
	if (flags & Z_NOWAIT) {
		*counterp = -1;
	} else {
		enable_preemption();
		*counterp = zalloc_random_uniform32(0, 2 * pgz_sample_rate);
		disable_preemption();
	}

	/* cnt == -1 means a previous Z_NOWAIT deferred the refill: no sample */
	return cnt == 0;
}
5119
/*
 * Pops a free PGZ slot off the global lock-free list.
 *
 * Returns true with the slot index in *slot on success, false when no
 * slot is available or the pop loop is too contended. Preemption is
 * disabled only around the atomic operations.
 */
static inline bool
pgz_slot_alloc(uint32_t *slot)
{
	struct zone_page_metadata *m;
	uint32_t tries = 100;

	disable_preemption();

#if OS_ATOMIC_USE_LLSC
	/* reserve one unit of budget; bail out when none is left */
	int32_t ov, nv;
	os_atomic_rmw_loop(&pgz_slot_avail, ov, nv, relaxed, {
		if (__improbable(ov <= 0)) {
			os_atomic_rmw_loop_give_up({
				enable_preemption();
				return false;
			});
		}
		nv = ov - 1;
	});
#else
	/* non-LLSC: decrement optimistically, undo if we went below zero */
	if (__improbable(os_atomic_dec_orig(&pgz_slot_avail, relaxed) <= 0)) {
		os_atomic_inc(&pgz_slot_avail, relaxed);
		enable_preemption();
		return false;
	}
#endif

again:
	if (__improbable(tries-- == 0)) {
		/*
		 * Too much contention,
		 * extremely unlikely but do not stay stuck.
		 */
		os_atomic_inc(&pgz_slot_avail, relaxed);
		enable_preemption();
		return false;
	}

#if OS_ATOMIC_HAS_LLSC
	/* pop the list head with load/store-exclusive */
	do {
		m = os_atomic_load_exclusive(&pgz_slot_head, dependency);
		if (__improbable(m->zm_pgz_slot_next == NULL)) {
			/*
			 * Either we are waiting for an enqueuer (unlikely)
			 * or we are competing with another core and
			 * are looking at a popped element.
			 */
			os_atomic_clear_exclusive();
			goto again;
		}
	} while (!os_atomic_store_exclusive(&pgz_slot_head,
	    m->zm_pgz_slot_next, relaxed));
#else
	/* pop via a {count, slot} pair; psh_count acts as a generation counter */
	struct zone_page_metadata *base = zone_info.zi_pgz_meta;
	struct pgz_slot_head ov, nv;
	os_atomic_rmw_loop(&pgz_slot_head, ov, nv, dependency, {
		m = &base[ov.psh_slot * 2];
		if (__improbable(m->zm_pgz_slot_next == NULL)) {
			/*
			 * Either we are waiting for an enqueuer (unlikely)
			 * or we are competing with another core and
			 * are looking at a popped element.
			 */
			os_atomic_rmw_loop_give_up(goto again);
		}
		nv.psh_count = ov.psh_count + 1;
		nv.psh_slot = (uint32_t)((m->zm_pgz_slot_next - base) / 2);
	});
#endif

	enable_preemption();

	/* detach the element and convert its metadata position to a slot index */
	m->zm_pgz_slot_next = NULL;
	*slot = (uint32_t)((m - zone_info.zi_pgz_meta) / 2);
	return true;
}
5196
/*
 * Returns slot `slot` to the free-slot list (tail enqueue) and credits
 * the availability budget consumed by pgz_slot_alloc().
 */
static inline bool
pgz_slot_free(uint32_t slot)
{
	struct zone_page_metadata *m = &zone_info.zi_pgz_meta[2 * slot];
	struct zone_page_metadata *t;

	disable_preemption();
	/* become the new tail, then link the old tail to us (release order) */
	t = os_atomic_xchg(&pgz_slot_tail, m, relaxed);
	os_atomic_store(&t->zm_pgz_slot_next, m, release);
	os_atomic_inc(&pgz_slot_avail, relaxed);
	enable_preemption();

	return true;
}
5211
5212 /*!
5213 * @function pgz_protect()
5214 *
5215 * @brief
5216 * Try to protect an allocation with PGZ.
5217 *
5218 * @param zone The zone the allocation was made against.
5219 * @param addr An allocated element address to protect.
5220 * @param flags The @c zalloc_flags_t passed to @c zalloc.
5221 * @param fp The caller frame pointer (for the backtrace).
5222 * @returns The new address for the element, or @c addr.
5223 */
__attribute__((noinline))
static vm_offset_t
pgz_protect(zone_t zone, vm_offset_t addr, zalloc_flags_t flags, void *fp)
{
	kern_return_t kr;
	uint32_t slot;

	/* no free slot: hand back the element unprotected */
	if (!pgz_slot_alloc(&slot)) {
		return addr;
	}

	/*
	 * Try to double-map the page (may fail if Z_NOWAIT).
	 * we will always find a PA because pgz_init() pre-expanded the pmap.
	 */
	vm_offset_t new_addr = pgz_addr(slot);
	pmap_paddr_t pa = kvtophys(trunc_page(addr));

	kr = pmap_enter_options_addr(kernel_pmap, new_addr, pa,
	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE,
	    (flags & Z_NOWAIT) ? PMAP_OPTIONS_NOWAIT : 0, NULL);

	if (__improbable(kr != KERN_SUCCESS)) {
		pgz_slot_free(slot);
		return addr;
	}

	/* record the owning zone and original address in the slot metadata */
	struct zone_page_metadata tmp = {
		.zm_chunk_len = ZM_PGZ_ALLOCATED,
		.zm_index = zone_index(zone),
	};
	struct zone_page_metadata *meta = pgz_meta(slot);

	os_atomic_store(&meta->zm_bits, tmp.zm_bits, relaxed);
	os_atomic_store(&meta->zm_pgz_orig_addr, addr, relaxed);
	pgz_backtrace(pgz_bt(slot, false), fp);

	/* preserve the element's offset within its page */
	return new_addr + (addr & PAGE_MASK);
}
5263
5264 /*!
5265 * @function pgz_unprotect()
5266 *
5267 * @brief
5268 * Release a PGZ slot and returns the original address of a freed element.
5269 *
5270 * @param addr A PGZ protected element address.
5271 * @param fp The caller frame pointer (for the backtrace).
5272 * @returns The non protected address for the element
5273 * that was passed to @c pgz_protect().
5274 */
__attribute__((noinline))
static vm_offset_t
pgz_unprotect(vm_offset_t addr, void *fp)
{
	struct zone_page_metadata *meta;
	struct zone_page_metadata tmp;
	uint32_t slot;

	slot = pgz_slot(addr);
	meta = zone_meta_from_addr(addr);
	tmp = *meta;
	if (tmp.zm_chunk_len != ZM_PGZ_ALLOCATED) {
		goto double_free;
	}

	/* tear down the double-mapping of the protected page */
	pmap_remove(kernel_pmap, trunc_page(addr), trunc_page(addr) + PAGE_SIZE);

	pgz_backtrace(pgz_bt(slot, true), fp);

	/*
	 * Flip the slot to FREE with an atomic exchange: the exchanged-out
	 * bits hold the previous state, so a racing free that beat us is
	 * detected below as a double free.
	 */
	tmp.zm_chunk_len = ZM_PGZ_FREE;
	tmp.zm_bits = os_atomic_xchg(&meta->zm_bits, tmp.zm_bits, relaxed);
	if (tmp.zm_chunk_len != ZM_PGZ_ALLOCATED) {
		goto double_free;
	}

	pgz_slot_free(slot);
	return tmp.zm_pgz_orig_addr;

double_free:
	panic_fault_address = addr;
	meta->zm_chunk_len = ZM_PGZ_DOUBLE_FREE;
	panic("probabilistic gzalloc double free: %p", (void *)addr);
}
5308
/*
 * Returns whether `addr` falls inside the PGZ address range
 * (i.e. was handed out by pgz_protect()).
 */
bool
pgz_owned(mach_vm_address_t addr)
{
	vm_offset_t rmin, rmax;

#if CONFIG_KERNEL_TBI
	/* canonicalize the tagged address before the range compare */
	addr = VM_KERNEL_TBI_FILL(addr);
#endif /* CONFIG_KERNEL_TBI */

	kmem_range_load(&zone_info.zi_pgz_range, rmin, rmax);

	/* bitwise '&' on purpose: branchless range check */
	return (addr >= rmin) & (addr < rmax);
}
5322
5323
/*
 * Translates a possibly PGZ-protected address back to the original
 * element address, panicking on use-after-free or page-crossing spans.
 */
__attribute__((always_inline))
vm_offset_t
__pgz_decode(mach_vm_address_t addr, mach_vm_size_t size)
{
	struct zone_page_metadata *meta;

	/* fast path: not a PGZ address, return it unchanged */
	if (__probable(!pgz_owned(addr))) {
		return (vm_offset_t)addr;
	}

	/* a protected element never spans a page boundary */
	if (zone_addr_size_crosses_page(addr, size)) {
		panic("invalid size for PGZ protected address %p:%p",
		    (void *)addr, (void *)(addr + size));
	}

	meta = zone_meta_from_addr((vm_offset_t)addr);
	if (meta->zm_chunk_len != ZM_PGZ_ALLOCATED) {
		panic_fault_address = (vm_offset_t)addr;
		panic("probabilistic gzalloc use-after-free: %p", (void *)addr);
	}

	/* rebuild the original address, keeping the in-page offset */
	return trunc_page(meta->zm_pgz_orig_addr) + (addr & PAGE_MASK);
}
5347
/*
 * Like __pgz_decode() but never panics: translates a PGZ address back
 * to the original element, returning 0 when `zid` is given and the
 * element does not belong to that zone.
 */
__attribute__((always_inline))
vm_offset_t
__pgz_decode_allow_invalid(vm_offset_t addr, zone_id_t zid)
{
	struct zone_page_metadata *meta;

	/* fast path: not a PGZ address, return it unchanged */
	if (__probable(!pgz_owned(addr))) {
		return addr;
	}

	meta = zone_meta_from_addr(addr);
	addr = trunc_page(meta->zm_pgz_orig_addr) + (addr & PAGE_MASK);

	/* optional zone-membership check on the translated address */
	if (zid != ZONE_ID_ANY && zone_index_from_ptr((void *)addr) != zid) {
		return 0;
	}

	return addr;
}
5367
/*
 * Decides whether zone `z` participates in PGZ, either because the
 * pgz_all toggle is set, or because a "pgzN=<name>" boot-arg matches.
 */
static void
pgz_zone_init(zone_t z)
{
	char zn[MAX_ZONE_NAME];
	char zv[MAX_ZONE_NAME];
	char key[30];

	/* PGZ double-maps single pages: larger elements can't be protected */
	if (zone_elem_size(z) > PAGE_SIZE) {
		return;
	}

	/* never track this special zone id */
	if (zone_index(z) == ZONE_ID_SELECT_SET) {
		return;
	}

	if (pgz_all) {
		os_atomic_inc(&pgz_uses, relaxed);
		z->z_pgz_tracked = true;
		return;
	}

	/* full zone name, including the kalloc heap prefix */
	snprintf(zn, sizeof(zn), "%s%s", zone_heap_name(z), zone_name(z));

	/* scan pgz1, pgz2, ... boot-args until one is missing */
	for (int i = 1;; i++) {
		snprintf(key, sizeof(key), "pgz%d", i);
		if (!PE_parse_boot_argn(key, zv, sizeof(zv))) {
			break;
		}
		if (track_this_zone(zn, zv) || track_kalloc_zones(z, zv)) {
			os_atomic_inc(&pgz_uses, relaxed);
			z->z_pgz_tracked = true;
			break;
		}
	}
}
5403
/*
 * Computes the size in bytes of the PGZ address range, deriving
 * pgz_slots, pgz_quarantine and the initial pgz_slot_avail budget.
 */
__startup_func
static vm_size_t
pgz_get_size(void)
{
	if (pgz_slots == UINT32_MAX) {
		/*
		 * Scale with RAM size: ~200 slots a G
		 */
		pgz_slots = (uint32_t)(sane_size >> 22);
	}

	/*
	 * Make sure that the slot allocation scheme works.
	 * see pgz_slot_alloc() / pgz_slot_free();
	 */
	if (pgz_slots < zpercpu_count() * 4) {
		pgz_slots = zpercpu_count() * 4;
	}
	if (pgz_slots >= UINT16_MAX) {
		pgz_slots = UINT16_MAX - 1;
	}

	/*
	 * Quarantine is 33% of slots by default, no more than 90%.
	 */
	if (pgz_quarantine == 0) {
		pgz_quarantine = pgz_slots / 3;
	}
	if (pgz_quarantine > pgz_slots * 9 / 10) {
		pgz_quarantine = pgz_slots * 9 / 10;
	}
	/* quarantined slots are withheld from the allocation budget */
	pgz_slot_avail = pgz_slots - pgz_quarantine;

	/* one guard page per slot, plus one trailing guard page */
	return ptoa(2 * pgz_slots + 1);
}
5439
/*
 * One-time PGZ initialization: picks a sample rate, lays out guard and
 * slot metadata, builds the free-slot list, pre-expands the pmap, and
 * finally arms the per-CPU sample counters (which enables sampling).
 */
__startup_func
static void
pgz_init(void)
{
	if (!pgz_uses) {
		return;
	}

	if (pgz_sample_rate == 0) {
		/*
		 * If no rate was provided, pick a random one that scales
		 * with the number of protected zones.
		 *
		 * Use a binomial distribution to avoid having too many
		 * really fast sample rates.
		 */
		uint32_t factor = MIN(pgz_uses, 10);
		uint32_t max_rate = 1000 * factor;
		uint32_t min_rate = 100 * factor;

		pgz_sample_rate = (zalloc_random_uniform32(min_rate, max_rate) +
		    zalloc_random_uniform32(min_rate, max_rate)) / 2;
	}

	struct kmem_range *r = &zone_info.zi_pgz_range;
	zone_info.zi_pgz_meta = zone_meta_from_addr(r->min_address);
	zone_meta_populate(r->min_address, kmem_range_size(r));

	/* even metadata entries describe guard pages */
	for (size_t i = 0; i < 2 * pgz_slots + 1; i += 2) {
		zone_info.zi_pgz_meta[i].zm_chunk_len = ZM_PGZ_GUARD;
	}

	/* chain every slot's metadata into the initial free list */
	for (size_t i = 1; i < pgz_slots; i++) {
		zone_info.zi_pgz_meta[2 * i - 1].zm_pgz_slot_next =
		    &zone_info.zi_pgz_meta[2 * i + 1];
	}
#if OS_ATOMIC_HAS_LLSC
	pgz_slot_head = &zone_info.zi_pgz_meta[1];
#endif
	pgz_slot_tail = &zone_info.zi_pgz_meta[2 * pgz_slots - 1];

	/* two backtrace records (alloc/free) per slot */
	pgz_backtraces = zalloc_permanent(sizeof(struct pgz_backtrace) *
	    2 * pgz_slots, ZALIGN_PTR);

	/*
	 * expand the pmap so that pmap_enter_options_addr()
	 * in pgz_protect() never need to call pmap_expand().
	 */
	for (uint32_t slot = 0; slot < pgz_slots; slot++) {
		(void)pmap_enter_options_addr(kernel_pmap, pgz_addr(slot), 0,
		    VM_PROT_NONE, VM_PROT_NONE, 0, FALSE,
		    PMAP_OPTIONS_NOENTER, NULL);
	}

	/* do this last as this will enable pgz */
	percpu_foreach(counter, pgz_sample_counter) {
		*counter = zalloc_random_uniform32(0, 2 * pgz_sample_rate);
	}
}
5499 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, pgz_init);
5500
/*
 * Appends the saved allocation or free backtrace for `slot`
 * to the panic log (symbolicated when `has_syms` is set).
 */
static void
panic_display_pgz_bt(bool has_syms, uint32_t slot, bool free)
{
	struct pgz_backtrace *bt = pgz_bt(slot, free);
	const char *what = free ? "Free" : "Allocation";
	uintptr_t buf[MAX_ZTRACE_DEPTH];

	/* the record may be unmapped at panic time: probe it first */
	if (!ml_validate_nofault((vm_offset_t)bt, sizeof(*bt))) {
		paniclog_append_noflush(" Can't decode %s Backtrace\n", what);
		return;
	}

	/* expand the packed 32-bit offsets into full addresses */
	backtrace_unpack(BTP_KERN_OFFSET_32, buf, MAX_ZTRACE_DEPTH,
	    (uint8_t *)bt->pgz_bt, 4 * bt->pgz_depth);

	paniclog_append_noflush(" %s Backtrace:\n", what);
	for (uint32_t i = 0; i < bt->pgz_depth && i < MAX_ZTRACE_DEPTH; i++) {
		if (has_syms) {
			paniclog_append_noflush(" %p ", (void *)buf[i]);
			panic_print_symbol_name(buf[i]);
			paniclog_append_noflush("\n");
		} else {
			paniclog_append_noflush(" %p\n", (void *)buf[i]);
		}
	}
	kmod_panic_dump((vm_offset_t *)buf, bt->pgz_depth);
}
5528
/*
 * Appends a PGZ report (zone, element bounds, inferred bug kind and
 * confidence, saved backtraces) to the panic log for a faulting address.
 */
static void
panic_display_pgz_uaf_info(bool has_syms, vm_offset_t addr)
{
	struct zone_page_metadata *meta;
	vm_offset_t elem, esize;
	const char *type;
	const char *prob;
	uint32_t slot;
	zone_t z;

	slot = pgz_slot(addr);
	meta = pgz_meta(slot);
	/* reconstruct the element's protected address from its page offset */
	elem = pgz_addr(slot) + (meta->zm_pgz_orig_addr & PAGE_MASK);

	paniclog_append_noflush("Probabilistic GZAlloc Report:\n");

	/* the metadata itself may be invalid at panic time: probe it */
	if (ml_validate_nofault((vm_offset_t)meta, sizeof(*meta)) &&
	    meta->zm_index &&
	    meta->zm_index < os_atomic_load(&num_zones, relaxed)) {
		z = &zone_array[meta->zm_index];
	} else {
		paniclog_append_noflush(" Zone : <unknown>\n");
		paniclog_append_noflush(" Address : %p\n", (void *)addr);
		paniclog_append_noflush("\n");
		return;
	}

	esize = zone_elem_size(z);
	paniclog_append_noflush(" Zone : %s%s\n",
	    zone_heap_name(z), zone_name(z));
	paniclog_append_noflush(" Address : %p\n", (void *)addr);
	paniclog_append_noflush(" Element : [%p, %p) of size %d\n",
	    (void *)elem, (void *)(elem + esize), (uint32_t)esize);

	/* classify the fault by where it landed and the slot's state */
	if (addr < elem) {
		type = "out-of-bounds(underflow) + use-after-free";
		prob = "low";
	} else if (meta->zm_chunk_len == ZM_PGZ_DOUBLE_FREE) {
		type = "double-free";
		prob = "high";
	} else if (addr < elem + esize) {
		type = "use-after-free";
		prob = "high";
	} else if (meta->zm_chunk_len != ZM_PGZ_ALLOCATED) {
		type = "out-of-bounds + use-after-free";
		prob = "low";
	} else {
		type = "out-of-bounds";
		prob = "high";
	}
	paniclog_append_noflush(" Kind : %s (%s confidence)\n",
	    type, prob);
	if (addr < elem) {
		paniclog_append_noflush(" Access : %d byte(s) before\n",
		    (uint32_t)(elem - addr) + 1);
	} else if (addr < elem + esize) {
		paniclog_append_noflush(" Access : %d byte(s) inside\n",
		    (uint32_t)(addr - elem) + 1);
	} else {
		paniclog_append_noflush(" Access : %d byte(s) past\n",
		    (uint32_t)(addr - (elem + esize)) + 1);
	}

	panic_display_pgz_bt(has_syms, slot, false);
	/* when the slot is no longer allocated, the free trace matters too */
	if (meta->zm_chunk_len != ZM_PGZ_ALLOCATED) {
		panic_display_pgz_bt(has_syms, slot, true);
	}

	paniclog_append_noflush("\n");
}
5599
5600 #endif /* CONFIG_PROB_GZALLOC */
5601 #endif /* !ZALLOC_TEST */
5602 #pragma mark zfree
5603 #if !ZALLOC_TEST
5604
5605 /*!
5606 * @defgroup zfree
5607 * @{
5608 *
5609 * @brief
5610 * The codepath for zone frees.
5611 *
5612 * @discussion
5613 * There are 4 major ways to allocate memory that end up in the zone allocator:
5614 * - @c zfree()
5615 * - @c zfree_percpu()
5616 * - @c kfree*()
5617 * - @c zfree_permanent()
5618 *
5619 * While permanent zones have their own allocation scheme, all other codepaths
5620 * will eventually go through the @c zfree_ext() choking point.
5621 *
5622 * Ignoring the @c gzalloc_free() codepath, the decision tree looks like this:
5623 * <code>
5624 * zfree_ext()
5625 * ├───> zfree_cached() ────────────────╮
5626 * │ │ │
5627 * │ │ │
5628 * │ ├───> zfree_cached_slow() ───┤
5629 * │ │ │ │
5630 * │ │ v │
5631 * ╰───────┴───> zfree_item() ──────────┴───>
5632 * </code>
5633 *
5634 * @c zfree_ext() takes care of all the generic work to perform on an element
5635 * before it is freed (zeroing, logging, tagging, ...) then will hand it off to:
5636 * - @c zfree_item() if zone caching is off
5637 * - @c zfree_cached() if zone caching is on.
5638 *
5639 * @c zfree_cached can take a number of decisions:
5640 * - a fast path if the (f) or (a) magazines have space (preemption disabled),
5641 * - using the cpu local or recirculation depot calling @c zfree_cached_slow(),
5642 * - falling back to @c zfree_item() when CPU caching has been disabled.
5643 */
5644
5645 #if KASAN_ZALLOC
5646 /*
5647 * Called from zfree() to add the element being freed to the KASan quarantine.
5648 *
5649 * Returns true if the newly-freed element made it into the quarantine without
5650 * displacing another, false otherwise. In the latter case, addrp points to the
5651 * address of the displaced element, which will be freed by the zone.
5652 */
5653 static bool
kasan_quarantine_freed_element(zone_t * zonep,void ** addrp)5654 kasan_quarantine_freed_element(
5655 zone_t *zonep, /* the zone the element is being freed to */
5656 void **addrp) /* address of the element being freed */
5657 {
5658 zone_t zone = *zonep;
5659 void *addr = *addrp;
5660
5661 /*
5662 * Resize back to the real allocation size and hand off to the KASan
5663 * quarantine. `addr` may then point to a different allocation, if the
5664 * current element replaced another in the quarantine. The zone then
5665 * takes ownership of the swapped out free element.
5666 */
5667 vm_size_t usersz = zone_elem_size(zone) - 2 * zone->z_kasan_redzone;
5668 vm_size_t sz = usersz;
5669
5670 if (addr && zone->z_kasan_redzone) {
5671 kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
5672 addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
5673 assert(sz == zone_elem_size(zone));
5674 }
5675 if (addr && !zone->kasan_noquarantine) {
5676 kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz);
5677 if (!addr) {
5678 return TRUE;
5679 }
5680 }
5681 if (addr && zone->kasan_noquarantine) {
5682 kasan_unpoison(addr, zone_elem_size(zone));
5683 }
5684 *addrp = addr;
5685 return FALSE;
5686 }
5687 #endif /* KASAN_ZALLOC */
5688
/*
 * Marks element `ze` free in its page metadata and updates the page's
 * allocation accounting, requeueing the chunk onto the empty or partial
 * page queues as appropriate. Called with the zone lock held.
 */
__header_always_inline void
zfree_drop(zone_t zone, struct zone_page_metadata *meta, zone_element_t ze,
    bool recirc)
{
	vm_offset_t esize = zone_elem_size(zone);

	/* the bitmap flip must disagree with `recirc`, else it's a double free */
	if (zone_meta_mark_free(meta, ze) == recirc) {
		zone_meta_double_free_panic(zone, ze, __func__);
	}

	vm_offset_t old_size = meta->zm_alloc_size;
	vm_offset_t max_size = ptoa(meta->zm_chunk_len) + ZM_ALLOC_SIZE_LOCK;
	vm_offset_t new_size = zone_meta_alloc_size_sub(zone, meta, esize);

	if (new_size == 0) {
		/* whether the page was on the intermediate or all_used queue, move it to free */
		zone_meta_requeue(zone, &zone->z_pageq_empty, meta);
		zone->z_wired_empty += meta->zm_chunk_len;
	} else if (old_size + esize > max_size) {
		/* first free element on page, move from all_used */
		zone_meta_requeue(zone, &zone->z_pageq_partial, meta);
	}
}
5712
/*
 * Frees a single element straight to the zone, bypassing the per-CPU
 * caches. Entered with preemption disabled; that count is transferred
 * to the zone lock.
 */
static void
zfree_item(zone_t zone, struct zone_page_metadata *meta, zone_element_t ze)
{
	/* transfer preemption count to lock */
	zone_lock_nopreempt_check_contention(zone, NULL);

	zfree_drop(zone, meta, ze, false);
	zone_elems_free_add(zone, 1);

	zone_unlock(zone);
}
5724
/*
 * Slow path of zfree_cached(): both per-CPU magazines are full.
 * Wraps the element (and the full free magazine) for the per-CPU depot,
 * spilling a batch of magazines to the zone's recirculation depot when
 * the local depot exceeds its limit.
 */
__attribute__((noinline))
static void
zfree_cached_slow(zone_t zone, struct zone_page_metadata *meta,
    zone_element_t ze, zone_cache_t cache)
{
	struct zone_depot mags;
	zone_magazine_t mag = NULL;
	uint32_t depot_max;
	uint16_t n_mags = 0;

	if (zone_meta_is_free(meta, ze)) {
		zone_meta_double_free_panic(zone, ze, __func__);
	}

	if (zone == zc_magazine_zone) {
		/* an element freed to the magazine zone IS a magazine: reuse it */
		mag = (zone_magazine_t)zone_element_addr(zone, ze,
		    zone_elem_size(zone));
#if KASAN_ZALLOC
		kasan_poison_range((vm_offset_t)mag, zone_elem_size(zone),
		    ASAN_VALID);
#endif
	} else {
		mag = zone_magazine_alloc(Z_NOWAIT);
		if (__improbable(mag == NULL)) {
			/* no magazine available: fall back to a direct free */
			return zfree_item(zone, meta, ze);
		}
		mag->zm_cur = 1;
		mag->zm_elems[0] = ze;
	}

	/* install the fresh magazine, take the full free magazine out */
	mag = zone_magazine_replace(&cache->zc_free_cur,
	    &cache->zc_free_elems, mag);

	z_debug_assert(cache->zc_free_cur <= 1);
	z_debug_assert(mag->zm_cur == zc_mag_size());

	/*
	 * Depot growth policy:
	 *
	 * The zc_alloc and zc_free are on average half empty/full,
	 * hence count for "1" unit of zc_mag_size().
	 *
	 * We use the local depot for each `zc_depot_max` extra `zc_mag_size()`
	 * worth of element we're allowed.
	 *
	 * If pushing the bucket puts us in excess of `zc_depot_max`,
	 * then we trim (zc_recirc_batch) buckets out, in order
	 * to amortize taking the zone lock.
	 *
	 * Note that `zc_depot_max` can be mutated by the GC concurrently,
	 * so take a copy that we use throughout.
	 */
	depot_max = os_atomic_load(&cache->zc_depot_max, relaxed);
	if (2 * zc_mag_size() <= depot_max) {
		zone_depot_lock_nopreempt(cache);

		STAILQ_INSERT_TAIL(&cache->zc_depot, mag, zm_link);
		cache->zc_depot_cur++;

		if (__probable((cache->zc_depot_cur + 1) * zc_mag_size() <=
		    depot_max)) {
			return zone_depot_unlock(cache);
		}

		/*
		 * Never free more than half of the magazines.
		 */
		n_mags = MIN(zc_recirc_batch, cache->zc_depot_cur / 2);
		assert(n_mags && n_mags < cache->zc_depot_cur);

		/* detach the first n_mags depot magazines into `mags` */
		STAILQ_FIRST(&mags) = mag = STAILQ_FIRST(&cache->zc_depot);
		for (uint16_t i = n_mags; i-- > 1;) {
			mag = STAILQ_NEXT(mag, zm_link);
		}

		cache->zc_depot_cur -= n_mags;
		STAILQ_FIRST(&cache->zc_depot) = STAILQ_NEXT(mag, zm_link);
		STAILQ_NEXT(mag, zm_link) = NULL;

		zone_depot_unlock(cache);

		mags.stqh_last = &STAILQ_NEXT(mag, zm_link);
	} else {
		enable_preemption();

		/* local depot disabled: recirculate just this one magazine */
		n_mags = 1;
		STAILQ_FIRST(&mags) = mag;
		mags.stqh_last = &STAILQ_NEXT(mag, zm_link);
		STAILQ_NEXT(mag, zm_link) = NULL;
	}

	/*
	 * Preflight validity of all the elements before we touch the zone
	 * metadata, and then insert them into the recirculation depot.
	 */
	STAILQ_FOREACH(mag, &mags, zm_link) {
		for (uint16_t i = 0; i < zc_mag_size(); i++) {
			zone_element_validate(zone, mag->zm_elems[i]);
		}
	}

	zone_lock_check_contention(zone, cache);

	STAILQ_FOREACH(mag, &mags, zm_link) {
		for (uint16_t i = 0; i < zc_mag_size(); i++) {
			zone_element_t e = mag->zm_elems[i];

			if (!zone_meta_mark_free(zone_meta_from_element(e), e)) {
				zone_meta_double_free_panic(zone, e, __func__);
			}
		}
	}
	STAILQ_CONCAT(&zone->z_recirc, &mags);
	zone->z_recirc_cur += n_mags;

	zone_elems_free_add(zone, n_mags * zc_mag_size());

	zone_unlock(zone);
}
5844
/*
 * Frees element `ze` through the per-CPU magazine cache.
 * Entered with preemption disabled; re-enables it on the fast path.
 */
static void
zfree_cached(zone_t zone, struct zone_page_metadata *meta, zone_element_t ze)
{
	zone_cache_t cache = zpercpu_get(zone->z_pcpu_cache);

	if (cache->zc_free_cur >= zc_mag_size()) {
		if (cache->zc_alloc_cur >= zc_mag_size()) {
			/* both magazines full: spill to the depot */
			return zfree_cached_slow(zone, meta, ze, cache);
		}
		/* free magazine full but alloc has room: swap them */
		zone_cache_swap_magazines(cache);
	}

	/* per-CPU caching disabled on this zone: free directly */
	if (__improbable(cache->zc_alloc_elems == NULL)) {
		return zfree_item(zone, meta, ze);
	}

	if (zone_meta_is_free(meta, ze)) {
		zone_meta_double_free_panic(zone, ze, __func__);
	}

	uint16_t idx = cache->zc_free_cur++;
	if (idx >= zc_mag_size()) {
		zone_accounting_panic(zone, "zc_free_cur overflow");
	}
	cache->zc_free_elems[idx] = ze;

	enable_preemption();
}
5873
5874 /*
5875 * The function is noinline when zlog can be used so that the backtracing can
5876 * reliably skip the zfree_ext() and zfree_log()
5877 * boring frames.
5878 */
#if ZONE_ENABLE_LOGGING
__attribute__((noinline))
#endif /* ZONE_ENABLE_LOGGING */
/*
 * Common choking point for all zone frees: undoes PGZ/TBI/tag wrapping,
 * runs KASan/gzalloc/logging hooks, then dispatches to the cached or
 * direct free path.
 */
void
zfree_ext(zone_t zone, zone_stats_t zstats, void *addr, vm_size_t elem_size)
{
	struct zone_page_metadata *page_meta;
	vm_offset_t elem = (vm_offset_t)addr;
	zone_element_t ze;

	DTRACE_VM2(zfree, zone_t, zone, void*, addr);

#if CONFIG_KERNEL_TBI && KASAN_TBI
	if (zone->z_tbi_tag) {
		elem = kasan_tbi_tag_zfree(elem, elem_size, zone->z_percpu);
		/* addr is still consumed in the function: gzalloc_free */
		addr = (void *)elem;
	}
#endif /* CONFIG_KERNEL_TBI && KASAN_TBI */
#if CONFIG_PROB_GZALLOC
	if (__improbable(pgz_owned(elem))) {
		/* translate the PGZ alias back to the real element address */
		elem = pgz_unprotect(elem, __builtin_frame_address(0));
		addr = (void *)elem;
	}
#endif /* CONFIG_PROB_GZALLOC */
#if VM_TAG_SIZECLASSES
	if (__improbable(zone->z_uses_tags)) {
		vm_tag_t tag = *ztSlot(zone, elem) >> 1;
		// set the tag with b0 clear so the block remains inuse
		*ztSlot(zone, elem) = 0xFFFE;
		vm_tag_update_zone_size(tag, zone->z_tags_sizeclass,
		    -(long)elem_size);
	}
#endif /* VM_TAG_SIZECLASSES */

#if KASAN_ZALLOC
	/*
	 * Call zone_element_resolve() and throw away the results in
	 * order to validate the element and its zone membership.
	 * Any validation panics need to happen now, while we're
	 * still close to the caller.
	 *
	 * Note that elem has not been adjusted, so we have to remove the
	 * redzone first.
	 */
	zone_element_t ze_discard;
	vm_offset_t elem_actual = elem - zone->z_kasan_redzone;
	(void)zone_element_resolve(zone, elem_actual, elem_size, &ze_discard);

	if (kasan_quarantine_freed_element(&zone, &addr)) {
		return;
	}
	/*
	 * kasan_quarantine_freed_element() might return a different
	 * {zone, addr} than the one being freed for kalloc heaps.
	 *
	 * Make sure we reload everything.
	 */
	elem = (vm_offset_t)addr;
	elem_size = zone_elem_size(zone);
#endif
#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
	if (__improbable(zone->z_btlog)) {
		zfree_log(zone->z_btlog, elem, __builtin_frame_address(0));
	}
#endif /* ZONE_ENABLE_LOGGING */
#if CONFIG_GZALLOC
	/* gzalloc-tracked zones are freed entirely through gzalloc */
	if (__improbable(zone->z_gzalloc_tracked)) {
		return gzalloc_free(zone, zstats, addr);
	}
#endif /* CONFIG_GZALLOC */

	page_meta = zone_element_resolve(zone, elem, elem_size, &ze);
#if KASAN_ZALLOC
	if (zone->z_percpu) {
		zpercpu_foreach_cpu(i) {
			kasan_poison_range(elem + ptoa(i), elem_size,
			    ASAN_HEAP_FREED);
		}
	} else {
		kasan_poison_range(elem, elem_size, ASAN_HEAP_FREED);
	}
#endif

	disable_preemption();
	zpercpu_get(zstats)->zs_mem_freed += elem_size;

	if (zone->z_pcpu_cache) {
		return zfree_cached(zone, page_meta, ze);
	}

	return zfree_item(zone, page_meta, ze);
}
5972
/*
 * Frees an element to a zone (or zone view), scrubbing it first.
 * (Parenthesized name suppresses any function-like macro of the same name.)
 */
void
(zfree)(union zone_or_view zov, void *addr)
{
	zone_t zone = zov.zov_view->zv_zone;
	zone_stats_t zstats = zov.zov_view->zv_stats;
	vm_offset_t esize = zone_elem_size(zone);

	/* read-only and per-cpu zones have dedicated free entry points */
	assert(zone > &zone_array[ZONE_ID__LAST_RO]);
	assert(!zone->z_percpu);
#if !KASAN_KALLOC
	/* zero the element before it can be reused */
	bzero(addr, esize);
#endif /* !KASAN_KALLOC */
	zfree_ext(zone, zstats, addr, esize);
}
5987
/*
 * Frees a per-CPU element to its zone, scrubbing every CPU's copy first.
 */
__attribute__((noinline))
void
zfree_percpu(union zone_or_view zov, void *addr)
{
	zone_t zone = zov.zov_view->zv_zone;
	zone_stats_t zstats = zov.zov_view->zv_stats;
	vm_offset_t esize = zone_elem_size(zone);

	assert(zone > &zone_array[ZONE_ID__LAST_RO]);
	assert(zone->z_percpu);
	/* recover the base allocation address from the per-cpu cookie */
	addr = (void *)__zpcpu_demangle(addr);
#if !KASAN_KALLOC
	/* zero each CPU's replica (one page apart) before reuse */
	zpercpu_foreach_cpu(i) {
		bzero((char *)addr + ptoa(i), esize);
	}
#endif /* !KASAN_KALLOC */
	zfree_ext(zone, zstats, addr, esize);
}
6006
6007 void
6008 (zfree_id)(zone_id_t zid, void *addr)
6009 {
6010 (zfree)(&zone_array[zid], addr);
6011 }
6012
/*
 * Free an element back to one of the read-only zones.
 *
 * When the read-only submap is enabled, the element cannot be written
 * directly and is zeroed through the pmap (pmap_ro_zone_bzero());
 * otherwise a plain bzero() is used, except under KASAN_KALLOC where
 * the scrubbing is handled elsewhere.
 */
void
(zfree_ro)(zone_id_t zid, void *addr)
{
	/* only zone IDs in the read-only range are valid here */
	assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO);
	zone_t zone = &zone_array[zid];
	zone_stats_t zstats = zone->z_stats;
	/* RO element sizes live in a side table, not in the zone itself */
	vm_offset_t esize = zone_ro_elem_size[zid];

#if ZSECURITY_CONFIG(READ_ONLY)
	assert(zone_security_array[zid].z_submap_idx == Z_SUBMAP_IDX_READ_ONLY);
	/* element lives in the RO submap: zero it via the physical aperture */
	pmap_ro_zone_bzero(zid, (vm_offset_t)addr, 0, esize);
#elif !KASAN_KALLOC
	(void)zid;
	bzero(addr, esize);
#endif /* !KASAN_KALLOC */
	zfree_ext(zone, zstats, addr, esize);
}
6030
6031 /*! @} */
6032 #endif /* !ZALLOC_TEST */
6033 #pragma mark zalloc
6034 #if !ZALLOC_TEST
6035
6036 /*!
6037 * @defgroup zalloc
6038 * @{
6039 *
6040 * @brief
6041 * The codepath for zone allocations.
6042 *
6043 * @discussion
6044 * There are 4 major ways to allocate memory that end up in the zone allocator:
6045 * - @c zalloc(), @c zalloc_flags(), ...
6046 * - @c zalloc_percpu()
6047 * - @c kalloc*()
6048 * - @c zalloc_permanent()
6049 *
6050 * While permanent zones have their own allocation scheme, all other codepaths
6051 * will eventually go through the @c zalloc_ext() choking point.
6052 *
6053 * Ignoring the @c zalloc_gz() codepath, the decision tree looks like this:
6054 * <code>
6055 * zalloc_ext()
6056 * │
6057 * ├───> zalloc_cached() ──────> zalloc_cached_fast() ───╮
6058 * │ │ ^ │
6059 * │ │ │ │
6060 * │ ╰───> zalloc_cached_slow() ───╯ │
6061 * │ │ │
6062 * │<─────────────────╮ ├─────────────╮ │
6063 * │ │ │ │ │
6064 * │ │ v │ │
6065 * │<───────╮ ╭──> zalloc_item_slow() ────┤ │
6066 * │ │ │ │ │
6067 * │ │ │ v │
6068 * ╰───> zalloc_item() ──────────> zalloc_item_fast() ───┤
6069 * │
6070 * v
6071 * zalloc_return()
6072 * </code>
6073 *
6074 *
6075 * The @c zalloc_item() track is used when zone caching is off:
6076 * - @c zalloc_item_fast() is used when there are enough elements available,
6077 * - @c zalloc_item_slow() is used when a refill is needed, which can cause
6078 * the zone to grow. This is the only codepath that refills.
6079 *
6080 * This track uses the zone lock for serialization:
6081 * - taken in @c zalloc_item(),
6082 * - maintained during @c zalloc_item_slow() (possibly dropped and re-taken),
6083 * - dropped in @c zalloc_item_fast().
6084 *
6085 *
6086 * The @c zalloc_cached() track is used when zone caching is on:
6087 * - @c zalloc_cached_fast() is taken when the cache has elements,
6088 * - @c zalloc_cached_slow() is taken if a cache refill is needed.
 * It can choose many strategies:
6090 * ~ @c zalloc_cached_from_depot() to try to reuse cpu stashed magazines,
6091 * ~ @c zalloc_cached_from_recirc() using the global recirculation depot
6092 * @c z_recirc,
6093 * ~ using zalloc_import() if the zone has enough elements,
6094 * ~ falling back to the @c zalloc_item() track if zone caching is disabled
6095 * due to VM pressure or the zone has no available elements.
6096 *
6097 * This track disables preemption for serialization:
6098 * - preemption is disabled in @c zalloc_ext(),
6099 * - kept disabled during @c zalloc_cached_slow(), converted into a zone lock
6100 * if switching to @c zalloc_item_slow(),
6101 * - preemption is reenabled in @c zalloc_cached_fast().
6102 *
 * @c zalloc_cached_from_depot() also takes depot locks (taken by the caller,
 * released by @c zalloc_cached_from_depot()).
6105 *
6106 * In general the @c zalloc_*_slow() codepaths deal with refilling and will
6107 * tail call into the @c zalloc_*_fast() code to perform the actual allocation.
6108 *
6109 * @c zalloc_return() is the final function everyone tail calls into,
6110 * which prepares the element for consumption by the caller and deals with
6111 * common treatment (zone logging, tags, kasan, validation, ...).
6112 */
6113
6114 /*!
6115 * @function zalloc_import
6116 *
6117 * @brief
6118 * Import @c n elements in the specified array, opposite of @c zfree_drop().
6119 *
6120 * @param zone The zone to import elements from
6121 * @param elems The array to import into
6122 * @param n The number of elements to import. Must be non zero,
6123 * and smaller than @c zone->z_elems_free.
6124 */
__header_always_inline void
zalloc_import(zone_t zone, zone_element_t *elems, zalloc_flags_t flags,
    vm_size_t esize, uint32_t n)
{
	uint32_t i = 0;

	/*
	 * Importing directly from the zone while the recirculation depot
	 * has magazines would bypass them; callers must have drained or
	 * checked the depot first.
	 */
	assertf(STAILQ_EMPTY(&zone->z_recirc),
	    "Trying to import from zone %p [%s%s] with non empty recirc",
	    zone, zone_heap_name(zone), zone_name(zone));

	do {
		vm_offset_t page, eidx, size = 0;
		struct zone_page_metadata *meta;

		/*
		 * Prefer partially used chunks over empty ones so that
		 * empty chunks remain reclaimable as long as possible.
		 */
		if (!zone_pva_is_null(zone->z_pageq_partial)) {
			meta = zone_pva_to_meta(zone->z_pageq_partial);
			page = zone_pva_to_addr(zone->z_pageq_partial);
		} else if (!zone_pva_is_null(zone->z_pageq_empty)) {
			meta = zone_pva_to_meta(zone->z_pageq_empty);
			page = zone_pva_to_addr(zone->z_pageq_empty);
			/* chunk is about to stop being empty: fix accounting */
			zone_counter_sub(zone, z_wired_empty, meta->zm_chunk_len);
		} else {
			/*
			 * The caller guaranteed n <= z_elems_free; running out
			 * of pages here means the free counter is corrupt.
			 */
			zone_accounting_panic(zone, "z_elems_free corruption");
		}

		zone_meta_validate(zone, meta, page);

		vm_offset_t old_size = meta->zm_alloc_size;
		vm_offset_t max_size = ptoa(meta->zm_chunk_len) + ZM_ALLOC_SIZE_LOCK;

		/*
		 * Pull elements out of this chunk's allocation bitmap until
		 * either the request is satisfied or the chunk is exhausted.
		 */
		do {
			eidx = zone_meta_find_and_clear_bit(zone, meta, flags);
			elems[i++] = zone_element_encode(page, eidx);
			size += esize;
		} while (i < n && old_size + size + esize <= max_size);

		vm_offset_t new_size = zone_meta_alloc_size_add(zone, meta, size);

		/* requeue the chunk according to its new occupancy */
		if (new_size + esize > max_size) {
			zone_meta_requeue(zone, &zone->z_pageq_full, meta);
		} else if (old_size == 0) {
			/* remove from free, move to intermediate */
			zone_meta_requeue(zone, &zone->z_pageq_partial, meta);
		}
	} while (i < n);
}
6171
/*!
 * @function zalloc_return
 *
 * @brief
 * Performs the tail-end of the work required on allocations before the caller
 * uses them.
 *
 * @discussion
 * This function is called without any zone lock held, and with preemption
 * restored to the state it had when @c zalloc_ext() was called.
 *
 * @param zone          The zone we're allocating from.
 * @param ze            The encoded element we just allocated.
 * @param flags         The flags passed to @c zalloc_ext() (for Z_ZERO).
 * @param elem_size     The element size for this zone.
 */
__attribute__((noinline))
static void *
zalloc_return(zone_t zone, zone_element_t ze, zalloc_flags_t flags __unused,
    vm_offset_t elem_size)
{
	vm_offset_t addr = zone_element_addr(zone, ze, elem_size);

#if CONFIG_KERNEL_TBI && KASAN_TBI
	addr = kasan_tbi_fix_address_tag(addr);
#endif /* CONFIG_KERNEL_TBI && KASAN_TBI */
#if ZALLOC_ENABLE_ZERO_CHECK
	/* verify the element was returned to us fully zeroed (or zero it) */
	zalloc_validate_element(zone, addr, elem_size, flags);
#endif /* ZALLOC_ENABLE_ZERO_CHECK */
#if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS
	if (__improbable(zone->z_btlog)) {
		/* record the allocation backtrace for leak/corruption triage */
		zalloc_log(zone->z_btlog, addr, flags,
		    __builtin_frame_address(0));
	}
#endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
#if VM_TAG_SIZECLASSES
	if (__improbable(zone->z_uses_tags)) {
		vm_tag_t tag = zalloc_flags_get_tag(flags);
		/* derive a default tag from the heap the zone belongs to */
		if (tag == VM_KERN_MEMORY_NONE) {
			zone_security_flags_t zsflags = zone_security_config(zone);
			if (zsflags.z_kheap_id == KHEAP_ID_DATA_BUFFERS) {
				tag = VM_KERN_MEMORY_KALLOC_DATA;
			} else if (zsflags.z_kheap_id == KHEAP_ID_KT_VAR ||
			    zsflags.z_kalloc_type) {
				tag = VM_KERN_MEMORY_KALLOC_TYPE;
			} else {
				tag = VM_KERN_MEMORY_KALLOC;
			}
		}
		// set the tag with b0 clear so the block remains inuse
		*ztSlot(zone, addr) = (vm_tag_t)(tag << 1);
		vm_tag_update_zone_size(tag, zone->z_tags_sizeclass,
		    (long)elem_size);
	}
#endif /* VM_TAG_SIZECLASSES */
#if CONFIG_PROB_GZALLOC
	/* probabilistic guard-malloc: move the element behind a guard page */
	if ((flags & Z_PGZ) && !zone_addr_size_crosses_page(addr, elem_size)) {
		addr = pgz_protect(zone, addr, flags,
		    __builtin_frame_address(0));
	}
#endif

	/*
	 * Kasan integration of kalloc heaps are handled by kalloc_ext()
	 */
	if ((flags & Z_SKIP_KASAN) == 0) {
#if KASAN_ZALLOC
		if (zone->z_kasan_redzone) {
			/* carve redzones out of the element and shrink it */
			addr = kasan_alloc(addr, elem_size,
			    elem_size - 2 * zone->z_kasan_redzone,
			    zone->z_kasan_redzone);
			elem_size -= 2 * zone->z_kasan_redzone;
			__nosan_bzero((char *)addr, elem_size);
		} else if (flags & Z_PCPU) {
			/* per-cpu slices are one page apart: unpoison each */
			zpercpu_foreach_cpu(i) {
				kasan_poison_range(addr + ptoa(i), elem_size, ASAN_VALID);
				__nosan_bzero((char *)addr + ptoa(i), elem_size);
			}
		} else {
			kasan_poison_range(addr, elem_size, ASAN_VALID);
			__nosan_bzero((char *)addr, elem_size);
		}
#endif /* KASAN_ZALLOC */
#if CONFIG_KERNEL_TBI && KASAN_TBI
		/* retag the pointer for hardware tag-based KASan */
		if (__probable(zone->z_tbi_tag)) {
			addr = kasan_tbi_tag_zalloc(addr, elem_size,
			    elem_size, (flags & Z_PCPU));
		} else {
			addr = kasan_tbi_tag_zalloc_default(addr,
			    elem_size, (flags & Z_PCPU));
		}
#endif /* CONFIG_KERNEL_TBI && KASAN_TBI */
	}

	DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
	return (void *)addr;
}
6269
6270 #if CONFIG_GZALLOC
6271 /*!
6272 * @function zalloc_gz
6273 *
6274 * @brief
6275 * Performs allocations for zones using gzalloc.
6276 *
6277 * @discussion
6278 * This function is noinline so that it doesn't affect the codegen
6279 * of the fastpath.
6280 */
6281 __attribute__((noinline))
6282 static void *
zalloc_gz(zone_t zone,zone_stats_t zstats,zalloc_flags_t flags,vm_size_t esize)6283 zalloc_gz(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags, vm_size_t esize)
6284 {
6285 vm_offset_t addr = gzalloc_alloc(zone, zstats, flags);
6286 return zalloc_return(zone, zone_element_encode(addr, 0),
6287 flags, esize);
6288 }
6289 #endif /* CONFIG_GZALLOC */
6290
6291 __attribute__((noinline))
6292 static void *
zalloc_item_fast(zone_t zone,zone_stats_t zstats,zalloc_flags_t flags,vm_size_t esize)6293 zalloc_item_fast(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags,
6294 vm_size_t esize)
6295 {
6296 zone_element_t ze;
6297
6298 zalloc_import(zone, &ze, flags, esize, 1);
6299 zone_elems_free_sub(zone, 1);
6300 zpercpu_get(zstats)->zs_mem_allocated += esize;
6301 zone_unlock(zone);
6302
6303 return zalloc_return(zone, ze, flags, esize);
6304 }
6305
6306 static inline bool
zalloc_item_slow_should_schedule_async(zone_t zone,zalloc_flags_t flags)6307 zalloc_item_slow_should_schedule_async(zone_t zone, zalloc_flags_t flags)
6308 {
6309 /*
6310 * If we can't wait, then async it is.
6311 */
6312 if (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) {
6313 return true;
6314 }
6315
6316 if (zone->z_elems_free == 0) {
6317 return false;
6318 }
6319
6320 /*
6321 * Early boot gets to tap in bootstrap reserves
6322 */
6323 if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
6324 return true;
6325 }
6326
6327 /*
6328 * Allow threads to tap up to 3/4 of the reserve only doing asyncs.
6329 * Note that reserve-less zones will always say "true" here.
6330 */
6331 if (zone->z_elems_free >= zone->z_elems_rsv / 4) {
6332 return true;
6333 }
6334
6335 /*
6336 * After this, only VM and GC threads get to tap in the reserve.
6337 */
6338 return current_thread()->options & (TH_OPT_ZONE_PRIV | TH_OPT_VMPRIV);
6339 }
6340
/*!
 * @function zalloc_item_slow
 *
 * @brief
 * Performs allocations when the zone is out of elements.
 *
 * @discussion
 * Called with the zone lock held. This function might drop the lock and
 * reenable preemption, which means the per-CPU caching layer or
 * recirculation depot might have received elements while we were away.
 */
__attribute__((noinline))
static void *
zalloc_item_slow(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags,
    vm_size_t esize)
{
	if (zalloc_item_slow_should_schedule_async(zone, flags)) {
		/* kick the async expansion thread and try to proceed */
		zone_expand_async_schedule_if_needed(zone);
	} else {
		/* grow the zone ourselves (may drop/retake the zone lock) */
		zone_expand_locked(zone, flags, zalloc_needs_refill);
	}
	if (__improbable(zone->z_elems_free == 0)) {
		/* expansion failed or hasn't landed yet: fail (or panic) */
		zone_unlock(zone);
		if (__improbable(flags & Z_NOFAIL)) {
			zone_nofail_panic(zone);
		}
		DTRACE_VM2(zalloc, zone_t, zone, void*, NULL);
		return NULL;
	}

	/*
	 * We might have changed core or got preempted/blocked while expanding
	 * the zone. Allocating from the zone when the recirculation depot
	 * is not empty is not allowed.
	 *
	 * It will be rare but possible for the depot to refill while we were
	 * waiting for pages. If that happens we need to start over.
	 */
	if (!STAILQ_EMPTY(&zone->z_recirc)) {
		zone_unlock(zone);
		return zalloc_ext(zone, zstats, flags, esize);
	}

	return zalloc_item_fast(zone, zstats, flags, esize);
}
6386
/*!
 * @function zalloc_item
 *
 * @brief
 * Performs allocations when zone caching is off.
 *
 * @discussion
 * Called with preemption disabled (by @c zalloc_ext()); takes the zone
 * lock without reenabling preemption. This function calls
 * @c zalloc_item_slow() when refilling the zone is needed, or
 * @c zalloc_item_fast() if the zone has enough free elements.
 */
static void *
zalloc_item(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags,
    vm_size_t esize)
{
	zone_lock_nopreempt_check_contention(zone, NULL);

	/*
	 * When we committed to the zalloc_item() path,
	 * zone caching might have been flipped/enabled.
	 *
	 * If we got preempted for long enough, the recirculation layer
	 * can have been populated, and allocating from the zone would be
	 * incorrect.
	 *
	 * So double check for this extremely rare race here.
	 */
	if (__improbable(!STAILQ_EMPTY(&zone->z_recirc))) {
		zone_unlock(zone);
		return zalloc_ext(zone, zstats, flags, esize);
	}

	/* dipping into the reserve: let the slow path decide how to refill */
	if (__improbable(zone->z_elems_free <= zone->z_elems_rsv)) {
		return zalloc_item_slow(zone, zstats, flags, esize);
	}

	return zalloc_item_fast(zone, zstats, flags, esize);
}
6424
/*
 * Fast path of the zalloc_cached() track: pop one element off the
 * per-cpu alloc magazine.
 *
 * Called with preemption disabled; reenables it once the element has
 * been taken. @c freemag, when non-NULL, is an emptied magazine the
 * caller wants recycled (done after preemption is back on).
 */
__attribute__((always_inline))
static void *
zalloc_cached_fast(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags,
    vm_size_t esize, zone_cache_t cache, zone_magazine_t freemag)
{
	zone_element_t ze;
	uint32_t index;

	/* pre-decrement: caller guaranteed the magazine is non-empty */
	index = --cache->zc_alloc_cur;
	if (index >= zc_mag_size()) {
		/* unsigned wrap means the counter was already 0: corruption */
		zone_accounting_panic(zone, "zc_alloc_cur wrap around");
	}
	ze = cache->zc_alloc_elems[index];
	/* scrub the slot so stale encodings can't be replayed */
	cache->zc_alloc_elems[index].ze_value = 0;

	zpercpu_get(zstats)->zs_mem_allocated += esize;
	enable_preemption();

	/* cross-check against the page metadata to catch double frees */
	if (zone_meta_is_free(zone_meta_from_element(ze), ze)) {
		zone_meta_double_free_panic(zone, ze, __func__);
	}

	if (freemag) {
		zone_magazine_free(freemag);
	}
	return zalloc_return(zone, ze, flags, esize);
}
6452
/*
 * Refill the per-cpu alloc magazine from the cpu-local depot.
 *
 * Called with preemption disabled and the depot lock held (taken by the
 * caller); the depot lock is released here once a magazine has been
 * unlinked.
 */
__attribute__((noinline))
static void *
zalloc_cached_from_depot(
	zone_t                  zone,
	zone_stats_t            zstats,
	zalloc_flags_t          flags,
	vm_size_t               esize,
	zone_cache_t            cache)
{
	zone_magazine_t mag = STAILQ_FIRST(&cache->zc_depot);

	/* detach the head magazine from the depot list */
	STAILQ_REMOVE_HEAD(&cache->zc_depot, zm_link);
	STAILQ_NEXT(mag, zm_link) = NULL;

	if (cache->zc_depot_cur-- == 0) {
		zone_accounting_panic(zone, "zc_depot_cur wrap-around");
	}
	zone_depot_unlock_nopreempt(cache);

	/* swap the full magazine in; get the depleted one back */
	mag = zone_magazine_replace(&cache->zc_alloc_cur,
	    &cache->zc_alloc_elems, mag);

	z_debug_assert(cache->zc_alloc_cur == zc_mag_size());
	z_debug_assert(mag->zm_cur == 0);

	if (zone == zc_magazine_zone) {
		/*
		 * Allocating a magazine for the magazine zone itself:
		 * the empty magazine we just pulled out IS the allocation,
		 * avoiding recursion into zalloc_cached_fast().
		 */
		enable_preemption();
		bzero(mag, esize);
		return mag;
	}

	return zalloc_cached_fast(zone, zstats, flags, esize, cache, mag);
}
6486
/*
 * Refill the per-cpu alloc magazine straight from the zone
 * (recirculation depot is empty).
 *
 * Called with preemption disabled and the zone lock held; hands off to
 * the item track (lock kept) or the cached fast path (lock dropped).
 */
__attribute__((noinline))
static void *
zalloc_cached_import(
	zone_t                  zone,
	zone_stats_t            zstats,
	zalloc_flags_t          flags,
	vm_size_t               esize,
	zone_cache_t            cache)
{
	uint16_t n_elems = zc_mag_size();

	/*
	 * Clamp the import so it never takes the zone below half its
	 * reserve; if even that underflows, no import is possible.
	 */
	if (zone->z_elems_free < n_elems + zone->z_elems_rsv / 2 &&
	    os_sub_overflow(zone->z_elems_free,
	    zone->z_elems_rsv / 2, &n_elems)) {
		n_elems = 0;
	}

	z_debug_assert(n_elems <= zc_mag_size());

	if (__improbable(n_elems == 0)) {
		/*
		 * If importing elements would deplete the zone,
		 * call zalloc_item_slow()
		 */
		return zalloc_item_slow(zone, zstats, flags, esize);
	}

	if (__improbable(zone_caching_disabled)) {
		if (__improbable(zone_caching_disabled < 0)) {
			/*
			 * In the first 10s after boot, mess with
			 * the scan position in order to make early
			 * allocations patterns less predictable.
			 */
			zone_early_scramble_rr(zone, zstats);
		}
		/* caching suspended (e.g. VM pressure): single-item path */
		return zalloc_item_fast(zone, zstats, flags, esize);
	}

	/* bulk-fill the per-cpu magazine under the zone lock */
	zalloc_import(zone, cache->zc_alloc_elems, flags, esize, n_elems);

	cache->zc_alloc_cur = n_elems;
	zone_elems_free_sub(zone, n_elems);

	zone_unlock_nopreempt(zone);

	return zalloc_cached_fast(zone, zstats, flags, esize, cache, NULL);
}
6535
/*
 * Refill the per-cpu layer from the global recirculation depot
 * (@c z_recirc).
 *
 * Called with preemption disabled and the zone lock held; the lock is
 * dropped once the magazines have been detached and accounted for.
 * Pulls up to @c zc_recirc_batch magazines (bounded by zc_depot_max):
 * the first one becomes the cpu's alloc magazine, the rest go to the
 * cpu-local depot.
 */
static void *
zalloc_cached_from_recirc(
	zone_t                  zone,
	zone_stats_t            zstats,
	zalloc_flags_t          flags,
	vm_size_t               esize,
	zone_cache_t            cache)
{
	struct zone_depot mags;
	zone_magazine_t mag;
	uint16_t n_mags = 1;

	/* start building a private list headed at the recirc head */
	STAILQ_FIRST(&mags) = mag = STAILQ_FIRST(&zone->z_recirc);

	for (;;) {
		/*
		 * Mark every element of this magazine "used" in the page
		 * metadata; a failure means it was already used: double free.
		 */
		for (uint16_t i = 0; i < zc_mag_size(); i++) {
			zone_element_t e = mag->zm_elems[i];

			if (!zone_meta_mark_used(zone_meta_from_element(e), e)) {
				zone_meta_double_free_panic(zone, e, __func__);
			}
		}

		if (n_mags >= zone->z_recirc_cur) {
			/* we consumed the entire recirculation depot */
			STAILQ_INIT(&zone->z_recirc);
			assert(STAILQ_NEXT(mag, zm_link) == NULL);
			break;
		}

		if (n_mags >= zc_recirc_batch || n_mags * zc_mag_size() >=
		    cache->zc_depot_max) {
			/* batch limit reached: split the list after `mag` */
			STAILQ_FIRST(&zone->z_recirc) = STAILQ_NEXT(mag, zm_link);
			STAILQ_NEXT(mag, zm_link) = NULL;
			break;
		}

		n_mags++;
		mag = STAILQ_NEXT(mag, zm_link);
	}

	zone_elems_free_sub(zone, n_mags * zc_mag_size());
	zone_counter_sub(zone, z_recirc_cur, n_mags);

	zone_unlock_nopreempt(zone);

	/* terminate our private list at the last magazine we took */
	mags.stqh_last = &STAILQ_NEXT(mag, zm_link);

	/*
	 * And then incorporate everything into our per-cpu layer.
	 */

	mag = STAILQ_FIRST(&mags);

	if (n_mags > 1) {
		/* first magazine is consumed below; depot gets the rest */
		STAILQ_FIRST(&mags) = STAILQ_NEXT(mag, zm_link);
		STAILQ_NEXT(mag, zm_link) = NULL;

		zone_depot_lock_nopreempt(cache);

		cache->zc_depot_cur += n_mags - 1;
		STAILQ_CONCAT(&cache->zc_depot, &mags);

		zone_depot_unlock_nopreempt(cache);
	}

	/* install the full magazine; recycle the depleted one via freemag */
	mag = zone_magazine_replace(&cache->zc_alloc_cur,
	    &cache->zc_alloc_elems, mag);
	z_debug_assert(cache->zc_alloc_cur == zc_mag_size());
	z_debug_assert(mag->zm_cur == 0);

	return zalloc_cached_fast(zone, zstats, flags, esize, cache, mag);
}
6608
/*
 * Slow path of the zalloc_cached() track: both per-cpu magazines are
 * empty; pick a refill strategy.
 *
 * Called with preemption disabled. Tries, in order: the cpu-local
 * depot, the global recirculation depot, and finally importing from
 * the zone itself.
 */
__attribute__((noinline))
static void *
zalloc_cached_slow(
	zone_t                  zone,
	zone_stats_t            zstats,
	zalloc_flags_t          flags,
	vm_size_t               esize,
	zone_cache_t            cache)
{
	/*
	 * Try to allocate from our local depot, if there's one.
	 *
	 * The emptiness check is done twice: once unlocked as a cheap
	 * filter, then again under the depot lock before committing.
	 */
	if (STAILQ_FIRST(&cache->zc_depot)) {
		zone_depot_lock_nopreempt(cache);

		if (STAILQ_FIRST(&cache->zc_depot)) {
			/* depot lock is released by the callee */
			return zalloc_cached_from_depot(zone, zstats, flags,
			    esize, cache);
		}

		zone_depot_unlock_nopreempt(cache);
	}

	zone_lock_nopreempt_check_contention(zone, cache);

	/*
	 * If the recirculation depot is empty, we'll need to import.
	 * The system is tuned for this to be extremely rare.
	 */
	if (__improbable(STAILQ_EMPTY(&zone->z_recirc))) {
		return zalloc_cached_import(zone, zstats, flags, esize, cache);
	}

	/*
	 * If the recirculation depot has elements, then try to fill from it.
	 */
	return zalloc_cached_from_recirc(zone, zstats, flags, esize, cache);
}
6647
/*!
 * @function zalloc_cached
 *
 * @brief
 * Performs allocations when zone caching is on.
 *
 * @discussion
 * Called with preemption disabled (by @c zalloc_ext()).
 *
 * This function calls @c zalloc_cached_fast() when the caches have elements
 * ready.
 *
 * Else it will call @c zalloc_cached_slow() so that the cache is refilled,
 * which might switch to the @c zalloc_item_slow() track when the backing zone
 * needs to be refilled.
 */
static void *
zalloc_cached(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags,
    vm_size_t esize)
{
	zone_cache_t cache;

	cache = zpercpu_get(zone->z_pcpu_cache);

	if (cache->zc_alloc_cur == 0) {
		if (__improbable(cache->zc_free_cur == 0)) {
			/* both magazines empty: go refill */
			return zalloc_cached_slow(zone, zstats, flags, esize, cache);
		}
		/* alloc magazine empty but free magazine has elements: swap */
		zone_cache_swap_magazines(cache);
	}

	return zalloc_cached_fast(zone, zstats, flags, esize, cache, NULL);
}
6679
/*!
 * @function zalloc_ext
 *
 * @brief
 * The core implementation of @c zalloc(), @c zalloc_flags(), @c zalloc_percpu().
 *
 * @discussion
 * Validates flags and context, resolves VM tags and gzalloc tracking,
 * then disables preemption and dispatches to the cached or item track.
 */
void *
zalloc_ext(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags, vm_size_t esize)
{
	/*
	 * KASan uses zalloc() for fakestack, which can be called anywhere.
	 * However, we make sure these calls can never block.
	 */
	assertf(startup_phase < STARTUP_SUB_EARLY_BOOT ||
#if KASAN_ZALLOC
	    zone->kasan_fakestacks ||
#endif /* KASAN_ZALLOC */
	    ml_get_interrupts_enabled() ||
	    ml_is_quiescing() ||
	    debug_mode_active(),
	    "Calling {k,z}alloc from interrupt disabled context isn't allowed");

	/*
	 * Make sure Z_NOFAIL was not obviously misused
	 */
	if (flags & Z_NOFAIL) {
		/* Z_NOFAIL may block forever: incompatible with no-wait flags */
		assert(!zone->exhaustible &&
		    (flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
	}
#if VM_TAG_SIZECLASSES
	if (__improbable(zone->z_uses_tags)) {
		/* resolve the VM tag now; zalloc_return() will record it */
		vm_tag_t tag = zalloc_flags_get_tag(flags);
		if (flags & Z_VM_TAG_BT_BIT) {
			/* prefer a tag derived from the caller's backtrace */
			tag = vm_tag_bt() ?: tag;
		}
		if (tag != VM_KERN_MEMORY_NONE) {
			tag = vm_tag_will_update_zone(tag, zone->z_tags_sizeclass,
			    flags & (Z_WAITOK | Z_NOWAIT | Z_NOPAGEWAIT));
		}
		flags = Z_VM_TAG(flags & ~Z_VM_TAG_MASK, tag);
	}
#endif /* VM_TAG_SIZECLASSES */

#if CONFIG_GZALLOC
	if (__improbable(zone->z_gzalloc_tracked)) {
		return zalloc_gz(zone, zstats, flags, esize);
	}
#endif /* CONFIG_GZALLOC */

	/* both tracks below run with preemption off until the fast path */
	disable_preemption();

#if ZALLOC_ENABLE_ZERO_CHECK
	if (zalloc_skip_zero_check()) {
		flags |= Z_NOZZC;
	}
#endif
#if CONFIG_PROB_GZALLOC
	/* probabilistically route this allocation through PGZ guards */
	if (zone->z_pgz_tracked && pgz_sample(flags)) {
		flags |= Z_PGZ;
	}
#endif /* CONFIG_PROB_GZALLOC */

	if (zone->z_pcpu_cache) {
		return zalloc_cached(zone, zstats, flags, esize);
	}

	return zalloc_item(zone, zstats, flags, esize);
}
6748
/*
 * Blocking allocation from a zone or zone view.
 * Equivalent to zalloc_flags() with Z_WAITOK.
 */
__attribute__((always_inline))
void *
zalloc(union zone_or_view zov)
{
	return zalloc_flags(zov, Z_WAITOK);
}
6755
/*
 * Non-blocking allocation from a zone or zone view; may return NULL.
 * Equivalent to zalloc_flags() with Z_NOWAIT.
 */
__attribute__((always_inline))
void *
zalloc_noblock(union zone_or_view zov)
{
	return zalloc_flags(zov, Z_NOWAIT);
}
6762
6763 void *
zalloc_flags(union zone_or_view zov,zalloc_flags_t flags)6764 zalloc_flags(union zone_or_view zov, zalloc_flags_t flags)
6765 {
6766 zone_t zone = zov.zov_view->zv_zone;
6767 zone_stats_t zstats = zov.zov_view->zv_stats;
6768 vm_size_t esize = zone_elem_size(zone);
6769
6770 assert(zone > &zone_array[ZONE_ID__LAST_RO]);
6771 assert(!zone->z_percpu);
6772 return zalloc_ext(zone, zstats, flags, esize);
6773 }
6774
/*
 * Allocate one element from the zone designated by its zone ID.
 * Thin wrapper over zalloc_flags().
 */
__attribute__((always_inline))
void *
(zalloc_id)(zone_id_t zid, zalloc_flags_t flags)
{
	return zalloc_flags(&zone_array[zid], flags);
}
6781
/*
 * Allocate one element from a read-only zone.
 *
 * When the read-only submap is enabled, the returned element is
 * additionally validated with zone_require_ro() before being handed
 * to the caller.
 */
void *
(zalloc_ro)(zone_id_t zid, zalloc_flags_t flags)
{
	/* only zone IDs in the read-only range are valid here */
	assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO);
	zone_t zone = &zone_array[zid];
	zone_stats_t zstats = zone->z_stats;
	/* RO element sizes live in a side table, not in the zone itself */
	vm_size_t esize = zone_ro_elem_size[zid];
	void *elem;

	assert(!zone->z_percpu);
	elem = zalloc_ext(zone, zstats, flags, esize);
#if ZSECURITY_CONFIG(READ_ONLY)
	assert(zone_security_array[zid].z_submap_idx == Z_SUBMAP_IDX_READ_ONLY);
	if (elem) {
		zone_require_ro(zid, esize, elem);
	}
#endif
	return elem;
}
6801
6802 #if ZSECURITY_CONFIG(READ_ONLY)
6803
6804 __attribute__((always_inline))
6805 static bool
from_current_stack(const vm_offset_t addr,vm_size_t size)6806 from_current_stack(const vm_offset_t addr, vm_size_t size)
6807 {
6808 vm_offset_t start = (vm_offset_t)__builtin_frame_address(0);
6809 vm_offset_t end = (start + kernel_stack_size - 1) & -kernel_stack_size;
6810 return (addr >= start) && (addr + size < end);
6811 }
6812
#if XNU_MONITOR
/*
 * Check if an address is from const memory i.e TEXT or DATA CONST segements
 * or the SECURITY_READ_ONLY_LATE section.
 */
__attribute__((always_inline))
static bool
from_const_memory(const vm_offset_t addr, vm_size_t size)
{
	/* segment/section bounds are provided by the linker */
	extern uint8_t text_start[] __SEGMENT_START_SYM("__TEXT");
	extern uint8_t text_end[] __SEGMENT_END_SYM("__TEXT");

	extern uint8_t data_const_start[] __SEGMENT_START_SYM("__DATA_CONST");
	extern uint8_t data_const_end[] __SEGMENT_END_SYM("__DATA_CONST");

	extern uint8_t security_start[] __SECTION_START_SYM(SECURITY_SEGMENT_NAME,
	    SECURITY_SECTION_NAME);
	extern uint8_t security_end[] __SECTION_END_SYM(SECURITY_SEGMENT_NAME,
	    SECURITY_SECTION_NAME);

	const uint8_t *_addr = (const uint8_t *) addr;

	/* the whole [addr, addr+size) range must fit inside one region */
	return (_addr >= text_start && _addr + size <= text_end) ||
	       (_addr >= data_const_start && _addr + size <= data_const_end) ||
	       (_addr >= security_start && _addr + size <= security_end);
}
#else
/*
 * Without XNU_MONITOR there is no const-memory restriction to enforce:
 * any source address is accepted.
 */
__attribute__((always_inline))
static bool
from_const_memory(const vm_offset_t addr, vm_size_t size)
{
	(void) addr;
	(void) size;
	return true;
}
#endif /* XNU_MONITOR */
6849
6850 __abortlike
6851 static void
zalloc_ro_mut_validation_panic(zone_id_t zid,void * elem,const vm_offset_t src,vm_size_t src_size)6852 zalloc_ro_mut_validation_panic(zone_id_t zid, void *elem,
6853 const vm_offset_t src, vm_size_t src_size)
6854 {
6855 if (from_ro_map(src, src_size)) {
6856 zone_t src_zone = &zone_array[zone_index_from_ptr((void *)src)];
6857 zone_t dst_zone = &zone_array[zid];
6858 panic("zalloc_ro_mut failed: source (%p) not from same zone as dst (%p)"
6859 " (expected: %s, actual: %s", (void *)src, elem, src_zone->z_name,
6860 dst_zone->z_name);
6861 }
6862 vm_offset_t start = (vm_offset_t)__builtin_frame_address(0);
6863 vm_offset_t end = (start + kernel_stack_size - 1) & -kernel_stack_size;
6864 panic("zalloc_ro_mut failed: source (%p) neither from RO zone map nor from"
6865 " current stack (%p - %p)\n", (void *)src, (void *)start, (void *)end);
6866 }
6867
/*
 * Validate the source buffer of a zalloc_ro_mut() copy.
 *
 * The source must be one of:
 * - on the current kernel stack,
 * - in the RO zone map AND belonging to the same zone as the
 *   destination,
 * - const memory (TEXT/DATA_CONST/security section, when enforced).
 * Anything else panics via zalloc_ro_mut_validation_panic().
 */
__attribute__((always_inline))
static void
zalloc_ro_mut_validate_src(zone_id_t zid, void *elem,
    const vm_offset_t src, vm_size_t src_size)
{
	if (from_current_stack(src, src_size) ||
	    (from_ro_map(src, src_size) &&
	    zid == zone_index_from_ptr((void *)src)) ||
	    from_const_memory(src, src_size)) {
		return;
	}
	zalloc_ro_mut_validation_panic(zid, elem, src, src_size);
}
6881
6882 #endif /* ZSECURITY_CONFIG(READ_ONLY) */
6883
/*
 * Overwrite @c new_data_size bytes at @c offset within a read-only
 * zone element.
 *
 * With the read-only submap enabled, the source is validated and the
 * copy is performed through the pmap (the element is not directly
 * writable); otherwise this degrades to a plain memcpy.
 */
__attribute__((noinline))
void
zalloc_ro_mut(zone_id_t zid, void *elem, vm_offset_t offset,
    const void *new_data, vm_size_t new_data_size)
{
	assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO);

#if ZSECURITY_CONFIG(READ_ONLY)
	/* reject sources an attacker with kernel write could control */
	zalloc_ro_mut_validate_src(zid, elem, (vm_offset_t)new_data,
	    new_data_size);
	pmap_ro_zone_memcpy(zid, (vm_offset_t) elem, offset,
	    (vm_offset_t) new_data, new_data_size);
#else
	(void)zid;
	memcpy((void *)((uintptr_t)elem + offset), new_data, new_data_size);
#endif
}
6901
/*
 * Perform an atomic read-modify-write (@c op, @c value) at @c offset
 * within a read-only zone element, returning the operation's result.
 *
 * With the read-only submap enabled the operation goes through the
 * pmap; otherwise it is applied directly via __zalloc_ro_mut_atomic().
 */
__attribute__((noinline))
uint64_t
zalloc_ro_mut_atomic(zone_id_t zid, void *elem, vm_offset_t offset,
    zro_atomic_op_t op, uint64_t value)
{
	assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO);

#if ZSECURITY_CONFIG(READ_ONLY)
	value = pmap_ro_zone_atomic_op(zid, (vm_offset_t)elem, offset, op, value);
#else
	(void)zid;
	value = __zalloc_ro_mut_atomic((vm_offset_t)elem + offset, op, value);
#endif
	return value;
}
6917
/*
 * Zero @c size bytes at @c offset within a read-only zone element.
 *
 * With the read-only submap enabled the zeroing goes through the pmap;
 * otherwise it is a plain bzero.
 */
void
zalloc_ro_clear(zone_id_t zid, void *elem, vm_offset_t offset, vm_size_t size)
{
	assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO);
#if ZSECURITY_CONFIG(READ_ONLY)
	pmap_ro_zone_bzero(zid, (vm_offset_t)elem, offset, size);
#else
	(void)zid;
	bzero((void *)((uintptr_t)elem + offset), size);
#endif
}
6929
6930 /*
6931 * This function will run in the PPL and needs to be robust
6932 * against an attacker with arbitrary kernel write.
6933 */
6934
6935 #if ZSECURITY_CONFIG(READ_ONLY)
6936
/*
 * Out-of-line panic path for zone_require_ro(): re-derive which of the
 * three checks failed (RO map membership, page crossing, zone index)
 * and panic with the matching diagnostic.
 */
__abortlike
static void
zone_id_require_ro_panic(zone_id_t zid, vm_size_t esize, void *addr)
{
	vm_offset_t va = (vm_offset_t)addr;
	uint32_t zindex;
	zone_t other;
	zone_t zone = &zone_array[zid];

	if (!from_ro_map(addr, 1)) {
		panic("zone_require_ro failed: address not in a ro zone (addr: %p)", addr);
	}

	if (zone_addr_size_crosses_page(va, esize)) {
		panic("zone_require_ro failed: address crosses a page (addr: %p)", addr);
	}

	/* the address is in the RO map, so the zone index must be wrong */
	zindex = zone_index_from_ptr(addr);
	other = &zone_array[zindex];
	if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) {
		panic("zone_require_ro failed: invalid zone index %d "
		    "(addr: %p, expected: %s%s)", zindex,
		    addr, zone_heap_name(zone), zone->z_name);
	} else {
		panic("zone_require_ro failed: address in unexpected zone id %d (%s%s) "
		    "(addr: %p, expected: %s%s)",
		    zindex, zone_heap_name(other), other->z_name,
		    addr, zone_heap_name(zone), zone->z_name);
	}
}
6967
6968 #endif /* ZSECURITY_CONFIG(READ_ONLY) */
6969
/*
 * Validates that @c addr is a legitimate element of the read-only zone
 * @c zid (of element size @c esize), panicking via the out-of-line
 * helper otherwise. Always inlined so the fast path stays cheap at
 * every call site; compiled out when RO zones are disabled.
 */
__attribute__((always_inline))
void
zone_require_ro(zone_id_t zid, vm_size_t esize, void *addr)
{
#if ZSECURITY_CONFIG(READ_ONLY)
	vm_offset_t va = (vm_offset_t)addr;
	struct zone_page_metadata *meta = zone_meta_from_addr(va);

	/*
	 * Check that:
	 * - the first byte of the element is in the map
	 * - the element doesn't cross a page (implies it is wholy in the map)
	 * - the zone ID matches
	 *
	 * The code is weirdly written to minimize instruction count.
	 */
	if (!from_ro_map(addr, 1) ||
	    zone_addr_size_crosses_page(va, esize) ||
	    zid != meta->zm_index) {
		/* Cold path: diagnoses which check failed and panics. */
		zone_id_require_ro_panic(zid, esize, addr);
	}
#else
#pragma unused(zid, esize, addr)
#endif
}
6995
6996 void
zone_require_ro_range_contains(zone_id_t zid,void * addr)6997 zone_require_ro_range_contains(zone_id_t zid, void *addr)
6998 {
6999 vm_size_t esize = zone_ro_elem_size[zid];
7000 vm_offset_t va = (vm_offset_t)addr;
7001
7002 /* this is called by the pmap and we know for those the RO submap is on */
7003 assert(zone_security_array[zid].z_submap_idx == Z_SUBMAP_IDX_READ_ONLY);
7004
7005 if (!from_ro_map(addr, esize)) {
7006 zone_t zone = &zone_array[zid];
7007 zone_invalid_element_addr_panic(zone, va);
7008 }
7009 }
7010
7011 void *
zalloc_percpu(union zone_or_view zov,zalloc_flags_t flags)7012 zalloc_percpu(union zone_or_view zov, zalloc_flags_t flags)
7013 {
7014 zone_t zone = zov.zov_view->zv_zone;
7015 zone_stats_t zstats = zov.zov_view->zv_stats;
7016 vm_size_t esize = zone_elem_size(zone);
7017
7018 assert(zone > &zone_array[ZONE_ID__LAST_RO]);
7019 assert(zone->z_percpu);
7020 flags |= Z_PCPU;
7021 return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags, esize));
7022 }
7023
/*
 * Bump-allocates @c size bytes (aligned per @c mask) from a permanent
 * zone page. Permanent allocations are never freed, so pages only keep
 * a bump offset (zm_bump) instead of per-element metadata.
 */
static void *
_zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask)
{
	struct zone_page_metadata *page_meta;
	vm_offset_t offs, addr;
	zone_pva_t pva;

	/* May block in zone_expand_locked(): require a blockable context. */
	assert(ml_get_interrupts_enabled() ||
	    ml_is_quiescing() ||
	    debug_mode_active() ||
	    startup_phase < STARTUP_SUB_EARLY_BOOT);

	/* Round the request up to its alignment; it must fit in one page. */
	size = (size + mask) & ~mask;
	assert(size <= PAGE_SIZE);

	zone_lock(zone);
	assert(zone->z_self == zone);

	for (;;) {
		/* First-fit scan over partially filled pages. */
		pva = zone->z_pageq_partial;
		while (!zone_pva_is_null(pva)) {
			page_meta = zone_pva_to_meta(pva);
			if (page_meta->zm_bump + size <= PAGE_SIZE) {
				goto found;
			}
			pva = page_meta->zm_page_next;
		}

		/* No page has room: grow the zone and rescan. */
		zone_expand_locked(zone, Z_WAITOK, NULL);
	}

found:
	/* Claim [offs, offs + size) after aligning the bump pointer. */
	offs = (uint16_t)((page_meta->zm_bump + mask) & ~mask);
	page_meta->zm_bump = (uint16_t)(offs + size);
	page_meta->zm_alloc_size += size;
	/* Permanent zones account free space in bytes, not elements. */
	zone->z_elems_free -= size;
	zpercpu_get(zone->z_stats)->zs_mem_allocated += size;

	/* Nearly-full pages move to the full queue to shorten future scans. */
	if (page_meta->zm_alloc_size >= PAGE_SIZE - sizeof(vm_offset_t)) {
		zone_meta_requeue(zone, &zone->z_pageq_full, page_meta);
	}

	zone_unlock(zone);

	addr = offs + zone_pva_to_addr(pva);

	DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
	return (void *)addr;
}
7073
7074 static void *
_zalloc_permanent_large(size_t size,vm_offset_t mask,vm_tag_t tag)7075 _zalloc_permanent_large(size_t size, vm_offset_t mask, vm_tag_t tag)
7076 {
7077 vm_offset_t addr;
7078
7079 kernel_memory_allocate(kernel_map, &addr, size, mask,
7080 KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO, tag);
7081
7082 return (void *)addr;
7083 }
7084
7085 void *
zalloc_permanent_tag(vm_size_t size,vm_offset_t mask,vm_tag_t tag)7086 zalloc_permanent_tag(vm_size_t size, vm_offset_t mask, vm_tag_t tag)
7087 {
7088 if (size <= PAGE_SIZE) {
7089 zone_t zone = &zone_array[ZONE_ID_PERMANENT];
7090 return _zalloc_permanent(zone, size, mask);
7091 }
7092 return _zalloc_permanent_large(size, mask, tag);
7093 }
7094
7095 void *
zalloc_percpu_permanent(vm_size_t size,vm_offset_t mask)7096 zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask)
7097 {
7098 zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT];
7099 return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask));
7100 }
7101
7102 /*! @} */
7103 #endif /* !ZALLOC_TEST */
7104 #pragma mark zone GC / trimming
7105 #if !ZALLOC_TEST
7106
7107 static thread_call_data_t zone_defrag_callout;
7108
/*
 * Returns one empty chunk of @c z to the VM (or keeps its VA sequestered),
 * after validating its metadata. Called and returns with the zone locked;
 * the lock is dropped around the expensive kmem operations.
 */
static void
zone_reclaim_chunk(zone_t z, struct zone_page_metadata *meta,
    uint32_t free_count, struct zone_depot *mags)
{
	vm_address_t page_addr;
	vm_size_t size_to_free;
	uint32_t bitmap_ref;
	uint32_t page_count;
	zone_security_flags_t zsflags = zone_security_config(z);
	bool sequester = zsflags.z_va_sequester && !z->z_destroyed;
	bool oob_guard = false;

	if (zone_submap_is_sequestered(zsflags)) {
		/*
		 * If the entire map is sequestered, we can't return the VA.
		 * It stays pinned to the zone forever.
		 */
		sequester = true;
	}

	/* Detach the chunk from the empty pageq before touching it. */
	zone_meta_queue_pop(z, &z->z_pageq_empty);

	page_addr = zone_meta_to_addr(meta);
	page_count = meta->zm_chunk_len;
	oob_guard = meta->zm_guarded;

	/* An empty chunk must have no outstanding allocations. */
	if (meta->zm_alloc_size) {
		zone_metadata_corruption(z, meta, "alloc_size");
	}
	if (z->z_percpu) {
		/* Per-cpu chunks always have a metadata length of 1. */
		if (page_count != 1) {
			zone_metadata_corruption(z, meta, "page_count");
		}
		size_to_free = ptoa(z->z_chunk_pages);
		zone_remove_wired_pages(z->z_chunk_pages);
	} else {
		if (page_count > z->z_chunk_pages) {
			zone_metadata_corruption(z, meta, "page_count");
		}
		if (page_count < z->z_chunk_pages) {
			/* Dequeue non populated VA from z_pageq_va */
			zone_meta_remqueue(z, meta + page_count);
		}
		size_to_free = ptoa(page_count);
		zone_remove_wired_pages(page_count);
	}

	/* Account for the elements and pages about to disappear. */
	zone_counter_sub(z, z_elems_free, free_count);
	zone_counter_sub(z, z_elems_avail, free_count);
	zone_counter_sub(z, z_wired_empty, page_count);
	zone_counter_sub(z, z_wired_cur, page_count);
	/* Clamp the working-set watermarks rather than underflowing them. */
	if (z->z_elems_free_min < free_count) {
		z->z_elems_free_min = 0;
	} else {
		z->z_elems_free_min -= free_count;
	}
	if (z->z_elems_free_max < free_count) {
		z->z_elems_free_max = 0;
	} else {
		z->z_elems_free_max -= free_count;
	}

	bitmap_ref = 0;
	if (sequester) {
		/* VA stays with the zone: only reset the allocation bitmaps. */
		if (meta->zm_inline_bitmap) {
			for (int i = 0; i < meta->zm_chunk_len; i++) {
				meta[i].zm_bitmap = 0;
			}
		} else {
			bitmap_ref = meta->zm_bitmap;
			meta->zm_bitmap = 0;
		}
		meta->zm_chunk_len = 0;
	} else {
		/* VA is returned too: scrub the whole metadata range. */
		if (!meta->zm_inline_bitmap) {
			bitmap_ref = meta->zm_bitmap;
		}
		zone_counter_sub(z, z_va_cur, z->z_percpu ? 1 : z->z_chunk_pages);
		bzero(meta, sizeof(*meta) * (z->z_chunk_pages + oob_guard));
	}

#if CONFIG_ZLEAKS
	/* The zone is shrinking: maybe stop leak detection for it. */
	if (__improbable(zleak_should_disable_for_zone(z) &&
	    startup_phase >= STARTUP_SUB_THREAD_CALL)) {
		thread_call_enter(&zone_leaks_callout);
	}
#endif /* CONFIG_ZLEAKS */

	/* Drop the zone lock for the expensive VM work below. */
	zone_unlock(z);

	if (bitmap_ref) {
		zone_bits_free(bitmap_ref);
	}

	/* Free the pages for metadata and account for them */
#if KASAN_ZALLOC
	kasan_poison_range(page_addr, size_to_free, ASAN_VALID);
#endif
#if VM_TAG_SIZECLASSES
	if (z->z_uses_tags) {
		ztMemoryRemove(z, page_addr, size_to_free);
	}
#endif /* VM_TAG_SIZECLASSES */

	if (sequester) {
		/* Give the pages back but keep the VA reserved for the zone. */
		kernel_memory_depopulate(page_addr, size_to_free,
		    KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
	} else {
		assert(zsflags.z_submap_idx != Z_SUBMAP_IDX_VM);
		kmem_free(zone_submap(zsflags), page_addr,
		    ptoa(z->z_chunk_pages + oob_guard));
		if (oob_guard) {
			os_atomic_dec(&zone_guard_pages, relaxed);
		}
	}

	/* While unlocked: recycle drained magazines and let others run. */
	zone_magazine_free_list(mags);
	thread_yield_to_preemption();

	zone_lock(z);

	if (sequester) {
		/* Sequestered VA becomes available for future repopulation. */
		zone_meta_queue_push(z, &z->z_pageq_va, meta);
	}
}
7234
7235 static uint16_t
zone_reclaim_elements(zone_t z,uint16_t * count,zone_element_t * elems)7236 zone_reclaim_elements(zone_t z, uint16_t *count, zone_element_t *elems)
7237 {
7238 uint16_t n = *count;
7239
7240 z_debug_assert(n <= zc_mag_size());
7241
7242 for (uint16_t i = 0; i < n; i++) {
7243 zone_element_t ze = elems[i];
7244 elems[i].ze_value = 0;
7245 zfree_drop(z, zone_element_validate(z, ze), ze, false);
7246 }
7247
7248 *count = 0;
7249 return n;
7250 }
7251
7252 static uint16_t
zone_reclaim_recirc_magazine(zone_t z,struct zone_depot * mags)7253 zone_reclaim_recirc_magazine(zone_t z, struct zone_depot *mags)
7254 {
7255 zone_magazine_t mag = STAILQ_FIRST(&z->z_recirc);
7256
7257 STAILQ_REMOVE_HEAD(&z->z_recirc, zm_link);
7258 STAILQ_INSERT_TAIL(mags, mag, zm_link);
7259 zone_counter_sub(z, z_recirc_cur, 1);
7260
7261 z_debug_assert(mag->zm_cur == zc_mag_size());
7262
7263 for (uint16_t i = 0; i < zc_mag_size(); i++) {
7264 zone_element_t ze = mag->zm_elems[i];
7265 mag->zm_elems[i].ze_value = 0;
7266 zfree_drop(z, zone_element_validate(z, ze), ze, true);
7267 }
7268
7269 mag->zm_cur = 0;
7270
7271 return zc_mag_size();
7272 }
7273
/*
 * Moves magazines from @c zc's cpu-local depot onto @c head until the
 * depot holds at most half of zc_depot_max worth of elements.
 * The unlocked pre-check keeps the common nothing-to-do path cheap;
 * the same condition is re-evaluated under the depot lock before any
 * magazine is actually moved.
 */
static void
zone_depot_trim(zone_cache_t zc, struct zone_depot *head)
{
	zone_magazine_t mag;

	if (zc->zc_depot_cur == 0 ||
	    2 * (zc->zc_depot_cur + 1) * zc_mag_size() <= zc->zc_depot_max) {
		return;
	}

	zone_depot_lock(zc);

	while (zc->zc_depot_cur &&
	    2 * (zc->zc_depot_cur + 1) * zc_mag_size() > zc->zc_depot_max) {
		mag = STAILQ_FIRST(&zc->zc_depot);
		STAILQ_REMOVE_HEAD(&zc->zc_depot, zm_link);
		STAILQ_INSERT_TAIL(head, mag, zm_link);
		zc->zc_depot_cur--;
	}

	zone_depot_unlock(zc);
}
7296
__enum_decl(zone_reclaim_mode_t, uint32_t, {
	ZONE_RECLAIM_TRIM,      /* only free elements in excess of the working set */
	ZONE_RECLAIM_DRAIN,     /* free as many elements as possible */
	ZONE_RECLAIM_DESTROY,   /* the zone is being destroyed: free everything */
});
7302
7303 /*!
7304 * @function zone_reclaim
7305 *
7306 * @brief
7307 * Drains or trim the zone.
7308 *
7309 * @discussion
7310 * Draining the zone will free it from all its elements.
7311 *
7312 * Trimming the zone tries to respect the working set size, and avoids draining
7313 * the depot when it's not necessary.
7314 *
7315 * @param z The zone to reclaim from
7316 * @param mode The purpose of this reclaim.
7317 */
7318 static void
zone_reclaim(zone_t z,zone_reclaim_mode_t mode)7319 zone_reclaim(zone_t z, zone_reclaim_mode_t mode)
7320 {
7321 struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags);
7322 zone_magazine_t mag;
7323
7324 zone_lock(z);
7325
7326 if (mode == ZONE_RECLAIM_DESTROY) {
7327 if (!z->z_destructible || z->z_elems_rsv) {
7328 panic("zdestroy: Zone %s%s isn't destructible",
7329 zone_heap_name(z), z->z_name);
7330 }
7331
7332 if (!z->z_self || z->z_expander || z->z_expander_vm_priv ||
7333 z->z_async_refilling || z->z_expanding_wait) {
7334 panic("zdestroy: Zone %s%s in an invalid state for destruction",
7335 zone_heap_name(z), z->z_name);
7336 }
7337
7338 #if !KASAN_ZALLOC
7339 /*
7340 * Unset the valid bit. We'll hit an assert failure on further
7341 * operations on this zone, until zinit() is called again.
7342 *
7343 * Leave the zone valid for KASan as we will see zfree's on
7344 * quarantined free elements even after the zone is destroyed.
7345 */
7346 z->z_self = NULL;
7347 #endif
7348 z->z_destroyed = true;
7349 } else if (z->z_destroyed) {
7350 return zone_unlock(z);
7351 } else if (z->z_elems_free <= z->z_elems_rsv) {
7352 /* If the zone is under its reserve level, leave it alone. */
7353 return zone_unlock(z);
7354 }
7355
7356 if (z->z_pcpu_cache) {
7357 if (mode != ZONE_RECLAIM_TRIM) {
7358 zpercpu_foreach(zc, z->z_pcpu_cache) {
7359 zc->zc_depot_max /= 2;
7360 }
7361 } else {
7362 zpercpu_foreach(zc, z->z_pcpu_cache) {
7363 if (zc->zc_depot_max > 0) {
7364 zc->zc_depot_max--;
7365 }
7366 }
7367 }
7368
7369 zone_unlock(z);
7370
7371 if (mode == ZONE_RECLAIM_TRIM) {
7372 zpercpu_foreach(zc, z->z_pcpu_cache) {
7373 zone_depot_trim(zc, &mags);
7374 }
7375 } else {
7376 zpercpu_foreach(zc, z->z_pcpu_cache) {
7377 zone_depot_lock(zc);
7378 STAILQ_CONCAT(&mags, &zc->zc_depot);
7379 zc->zc_depot_cur = 0;
7380 zone_depot_unlock(zc);
7381 }
7382 }
7383
7384 zone_lock(z);
7385
7386 uint32_t freed = 0;
7387
7388 STAILQ_FOREACH(mag, &mags, zm_link) {
7389 freed += zone_reclaim_elements(z,
7390 &mag->zm_cur, mag->zm_elems);
7391
7392 if (freed >= zc_free_batch_size) {
7393 z->z_elems_free_min += freed;
7394 z->z_elems_free_max += freed;
7395 z->z_elems_free += freed;
7396 zone_unlock(z);
7397 thread_yield_to_preemption();
7398 zone_lock(z);
7399 freed = 0;
7400 }
7401 }
7402
7403 if (mode == ZONE_RECLAIM_DESTROY) {
7404 zpercpu_foreach(zc, z->z_pcpu_cache) {
7405 freed += zone_reclaim_elements(z,
7406 &zc->zc_alloc_cur, zc->zc_alloc_elems);
7407 freed += zone_reclaim_elements(z,
7408 &zc->zc_free_cur, zc->zc_free_elems);
7409 }
7410
7411 z->z_elems_free_wss = 0;
7412 z->z_elems_free_min = 0;
7413 z->z_elems_free_max = 0;
7414 z->z_contention_cur = 0;
7415 z->z_contention_wma = 0;
7416 } else {
7417 z->z_elems_free_min += freed;
7418 z->z_elems_free_max += freed;
7419 }
7420 z->z_elems_free += freed;
7421 }
7422
7423 for (;;) {
7424 struct zone_page_metadata *meta;
7425 uint32_t count, goal, freed = 0;
7426
7427 goal = z->z_elems_rsv;
7428 if (mode == ZONE_RECLAIM_TRIM) {
7429 /*
7430 * When trimming, only free elements in excess
7431 * of the working set estimate.
7432 *
7433 * However if we are in a situation where the working
7434 * set estimate is clearly growing, ignore the estimate
7435 * as the next working set update will grow it and
7436 * we want to avoid churn.
7437 */
7438 goal = MAX(goal, MAX(z->z_elems_free_wss,
7439 z->z_elems_free - z->z_elems_free_min));
7440
7441 /*
7442 * Add some slop to account for "the last partial chunk in flight"
7443 * so that we do not deplete the recirculation depot too harshly.
7444 */
7445 goal += z->z_chunk_elems / 2;
7446 }
7447
7448 if (z->z_elems_free <= goal) {
7449 break;
7450 }
7451
7452 /*
7453 * If we're above target, but we have no free page, then drain
7454 * the recirculation depot until we get a free chunk or exhaust
7455 * the depot.
7456 *
7457 * This is rather abrupt but also somehow will reduce
7458 * fragmentation anyway, and the zone code will import
7459 * over time anyway.
7460 */
7461 while (z->z_recirc_cur && zone_pva_is_null(z->z_pageq_empty)) {
7462 if (freed >= zc_free_batch_size) {
7463 zone_unlock(z);
7464 zone_magazine_free_list(&mags);
7465 thread_yield_to_preemption();
7466 zone_lock(z);
7467 freed = 0;
7468 /* we dropped the lock, needs to reassess */
7469 continue;
7470 }
7471 freed += zone_reclaim_recirc_magazine(z, &mags);
7472 }
7473
7474 if (zone_pva_is_null(z->z_pageq_empty)) {
7475 break;
7476 }
7477
7478 meta = zone_pva_to_meta(z->z_pageq_empty);
7479 count = (uint32_t)ptoa(meta->zm_chunk_len) / zone_elem_size(z);
7480
7481 if (z->z_elems_free - count < goal) {
7482 break;
7483 }
7484
7485 zone_reclaim_chunk(z, meta, count, &mags);
7486 }
7487
7488 zone_unlock(z);
7489
7490 zone_magazine_free_list(&mags);
7491 }
7492
7493 static void
zone_reclaim_all(zone_reclaim_mode_t mode)7494 zone_reclaim_all(zone_reclaim_mode_t mode)
7495 {
7496 /*
7497 * Start with zones with VA sequester since depopulating
7498 * pages will not need to allocate vm map entries for holes,
7499 * which will give memory back to the system faster.
7500 */
7501 zone_index_foreach(zid) {
7502 zone_t z = &zone_array[zid];
7503 if (z == zc_magazine_zone) {
7504 continue;
7505 }
7506 if (zone_security_array[zid].z_va_sequester && z->collectable) {
7507 zone_reclaim(z, mode);
7508 }
7509 }
7510
7511 zone_index_foreach(zid) {
7512 zone_t z = &zone_array[zid];
7513 if (z == zc_magazine_zone) {
7514 continue;
7515 }
7516 if (!zone_security_array[zid].z_va_sequester && z->collectable) {
7517 zone_reclaim(z, mode);
7518 }
7519 }
7520
7521 zone_reclaim(zc_magazine_zone, mode);
7522 }
7523
7524 void
zone_userspace_reboot_checks(void)7525 zone_userspace_reboot_checks(void)
7526 {
7527 vm_size_t label_zone_size = zone_size_allocated(ipc_service_port_label_zone);
7528 if (label_zone_size != 0) {
7529 panic("Zone %s should be empty upon userspace reboot. Actual size: %lu.",
7530 ipc_service_port_label_zone->z_name, (unsigned long)label_zone_size);
7531 }
7532 }
7533
/*
 * Global zone garbage collection entry point.
 * ZONE_GC_TRIM / ZONE_GC_DRAIN map directly onto the reclaim modes;
 * ZONE_GC_JETSAM additionally kills the process with the largest
 * footprint in the largest zone first, then drains harder if the zone
 * map is still nearing exhaustion.
 */
void
zone_gc(zone_gc_level_t level)
{
	zone_reclaim_mode_t mode;
	zone_t largest_zone = NULL;

	switch (level) {
	case ZONE_GC_TRIM:
		mode = ZONE_RECLAIM_TRIM;
		break;
	case ZONE_GC_DRAIN:
		mode = ZONE_RECLAIM_DRAIN;
		break;
	case ZONE_GC_JETSAM:
		/* Kill first: the kill itself frees zone memory. */
		largest_zone = kill_process_in_largest_zone();
		mode = ZONE_RECLAIM_TRIM;
		break;
	}

	/* Mark the thread zone-privileged for the duration of the GC. */
	current_thread()->options |= TH_OPT_ZONE_PRIV;
	lck_mtx_lock(&zone_gc_lock);

	zone_reclaim_all(mode);

	if (level == ZONE_GC_JETSAM && zone_map_nearing_exhaustion()) {
		/*
		 * If we possibly killed a process, but we're still critical,
		 * we need to drain harder.
		 */
		zone_reclaim(largest_zone, ZONE_RECLAIM_DRAIN);
		zone_reclaim_all(ZONE_RECLAIM_DRAIN);
	}

	lck_mtx_unlock(&zone_gc_lock);
	current_thread()->options &= ~TH_OPT_ZONE_PRIV;
}
7570
/* Convenience wrapper: trim all zones down to their working sets. */
void
zone_gc_trim(void)
{
	zone_gc(ZONE_GC_TRIM);
}
7576
/* Convenience wrapper: drain all zones as much as possible. */
void
zone_gc_drain(void)
{
	zone_gc(ZONE_GC_DRAIN);
}
7582
7583 static bool
zone_defrag_needed(zone_t z)7584 zone_defrag_needed(zone_t z)
7585 {
7586 uint32_t recirc_size = z->z_recirc_cur * zc_mag_size();
7587
7588 if (recirc_size <= z->z_chunk_elems / 2) {
7589 return false;
7590 }
7591 if (recirc_size * z->z_elem_size <= zc_defrag_threshold) {
7592 return false;
7593 }
7594 return recirc_size * zc_defrag_ratio > z->z_elems_free_wss * 100;
7595 }
7596
7597 /*!
7598 * @function zone_defrag
7599 *
7600 * @brief
7601 * Resize the recirculation depot to match the working set size.
7602 *
7603 * @discussion
7604 * When zones grow very large due to a spike in usage, and then some of those
7605 * elements get freed, the elements in magazines in the recirculation depot
7606 * are in no particular order.
7607 *
7608 * In order to control fragmentation, we need to detect "empty" pages so that
7609 * they get onto the @c z_pageq_empty freelist, so that allocations re-pack
7610 * naturally.
7611 *
7612 * This is done very gently, never in excess of the working set and some slop.
7613 */
7614 static bool
zone_autogc_needed(zone_t z)7615 zone_autogc_needed(zone_t z)
7616 {
7617 uint32_t free_min = z->z_elems_free_min;
7618
7619 if (free_min * z->z_elem_size <= zc_autogc_threshold) {
7620 return false;
7621 }
7622
7623 return free_min * zc_autogc_ratio > z->z_elems_free_wss * 100;
7624 }
7625
/*
 * Shrinks @c z's recirculation depot down to the working set size plus
 * slop (half a chunk, rounded up by a magazine), returning elements to
 * the zone so empty pages can surface on z_pageq_empty (see the
 * zone_defrag doc comment above zone_autogc_needed()).
 */
static void
zone_defrag(zone_t z)
{
	struct zone_depot mags = STAILQ_HEAD_INITIALIZER(mags);
	zone_magazine_t mag, tmp;
	uint32_t freed = 0, goal = 0;

	zone_lock(z);

	goal = z->z_elems_free_wss + z->z_chunk_elems / 2 +
	    zc_mag_size() - 1;

	while (z->z_recirc_cur * zc_mag_size() > goal) {
		if (freed >= zc_free_batch_size) {
			/* Batch limit hit: breathe to bound lock hold time. */
			zone_unlock(z);
			thread_yield_to_preemption();
			zone_lock(z);
			freed = 0;
			/* we dropped the lock, needs to reassess */
			continue;
		}
		freed += zone_reclaim_recirc_magazine(z, &mags);
	}

	zone_unlock(z);

	/* Destroy the emptied magazines outside of the zone lock. */
	STAILQ_FOREACH_SAFE(mag, &mags, zm_link, tmp) {
		zone_magazine_free(mag);
	}
}
7656
7657 static void
zone_defrag_async(__unused thread_call_param_t p0,__unused thread_call_param_t p1)7658 zone_defrag_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
7659 {
7660 zone_foreach(z) {
7661 if (!z->collectable || z == zc_magazine_zone) {
7662 continue;
7663 }
7664
7665 if (zone_autogc_needed(z)) {
7666 current_thread()->options |= TH_OPT_ZONE_PRIV;
7667 lck_mtx_lock(&zone_gc_lock);
7668 zone_reclaim(z, ZONE_RECLAIM_TRIM);
7669 lck_mtx_unlock(&zone_gc_lock);
7670 current_thread()->options &= ~TH_OPT_ZONE_PRIV;
7671 } else if (zone_defrag_needed(z)) {
7672 zone_defrag(z);
7673 }
7674 }
7675
7676 if (zone_autogc_needed(zc_magazine_zone)) {
7677 current_thread()->options |= TH_OPT_ZONE_PRIV;
7678 lck_mtx_lock(&zone_gc_lock);
7679 zone_reclaim(zc_magazine_zone, ZONE_RECLAIM_TRIM);
7680 lck_mtx_unlock(&zone_gc_lock);
7681 current_thread()->options &= ~TH_OPT_ZONE_PRIV;
7682 } else if (zone_defrag_needed(zc_magazine_zone)) {
7683 zone_defrag(zc_magazine_zone);
7684 }
7685 }
7686
/*
 * Periodic maintenance pass (every ZONE_WSS_UPDATE_PERIOD) over all
 * zones: refreshes the working-set and contention moving averages,
 * decides whether the defrag callout should fire, shrinks quiet
 * cpu-local depots, and may enable caching on contended zones.
 */
void
compute_zone_working_set_size(__unused void *param)
{
	uint32_t zc_auto = zc_auto_threshold;
	bool kick_defrag = false;

	/*
	 * Keep zone caching disabled until the first proc is made.
	 */
	if (__improbable(zone_caching_disabled < 0)) {
		return;
	}

	/* Disable caching whenever the VM page pool is low. */
	zone_caching_disabled = vm_pool_low();

	/* Scale the auto-enable threshold into WMA fixed-point units. */
	if (os_mul_overflow(zc_auto, Z_CONTENTION_WMA_UNIT, &zc_auto)) {
		zc_auto = 0;
	}

	zone_foreach(z) {
		uint32_t wma;
		bool needs_caching = false;

		if (z->z_self != z) {
			continue;
		}

		zone_lock(z);

		/*
		 * Working set: moving average (3/4 old, 1/4 new) of the
		 * free-count swing observed during the last period.
		 */
		wma = z->z_elems_free_max - z->z_elems_free_min;
		wma = (3 * wma + z->z_elems_free_wss) / 4;
		z->z_elems_free_max = z->z_elems_free_min = z->z_elems_free;
		z->z_elems_free_wss = wma;

		if (!kick_defrag &&
		    (zone_defrag_needed(z) || zone_autogc_needed(z))) {
			kick_defrag = true;
		}

		/* fixed point decimal of contentions per second */
		wma = z->z_contention_cur * Z_CONTENTION_WMA_UNIT /
		    ZONE_WSS_UPDATE_PERIOD;
		z->z_contention_cur = 0;
		z->z_contention_wma = (3 * wma + z->z_contention_wma) / 4;

		/*
		 * If the zone seems to be very quiet,
		 * gently lower its cpu-local depot size.
		 */
		if (z->z_pcpu_cache && wma < Z_CONTENTION_WMA_UNIT / 2 &&
		    z->z_contention_wma < Z_CONTENTION_WMA_UNIT / 2) {
			zpercpu_foreach(zc, z->z_pcpu_cache) {
				if (zc->zc_depot_max > zc_mag_size()) {
					zc->zc_depot_max--;
				}
			}
		}

		/*
		 * If the zone has been contending like crazy for two periods,
		 * and is eligible, maybe it's time to enable caching.
		 */
		if (!z->z_nocaching && !z->z_pcpu_cache && !z->exhaustible &&
		    zc_auto && z->z_contention_wma >= zc_auto && wma >= zc_auto) {
			needs_caching = true;
		}

		zone_unlock(z);

		/* Enable caching outside of the zone lock. */
		if (needs_caching) {
			zone_enable_caching(z);
		}
	}

	if (kick_defrag) {
		thread_call_enter(&zone_defrag_callout);
	}
}
7765
7766 #endif /* !ZALLOC_TEST */
7767 #pragma mark vm integration, MIG routines
7768 #if !ZALLOC_TEST
7769
7770 extern unsigned int stack_total;
7771 #if defined (__x86_64__)
7772 extern unsigned int inuse_ptepages_count;
7773 #endif
7774
7775 static void
panic_print_types_in_zone(zone_t z,const char * debug_str)7776 panic_print_types_in_zone(zone_t z, const char* debug_str)
7777 {
7778 kalloc_type_view_t kt_cur = NULL;
7779 const char *prev_type = "";
7780 size_t skip_over_site = sizeof("site.") - 1;
7781 paniclog_append_noflush("kalloc types in zone, %s (%s):\n",
7782 debug_str, z->z_name);
7783 kt_cur = (kalloc_type_view_t) z->z_views;
7784 while (kt_cur) {
7785 struct zone_view kt_zv = kt_cur->kt_zv;
7786 const char *typename = kt_zv.zv_name + skip_over_site;
7787 if (strcmp(typename, prev_type) != 0) {
7788 paniclog_append_noflush("\t%-50s\n", typename);
7789 prev_type = typename;
7790 }
7791 kt_cur = (kalloc_type_view_t) kt_zv.zv_next;
7792 }
7793 paniclog_append_noflush("\n");
7794 }
7795
7796 static void
panic_display_kalloc_types(void)7797 panic_display_kalloc_types(void)
7798 {
7799 if (kalloc_type_src_zone) {
7800 panic_print_types_in_zone(kalloc_type_src_zone, "addr belongs to");
7801 }
7802 if (kalloc_type_dst_zone) {
7803 panic_print_types_in_zone(kalloc_type_dst_zone,
7804 "addr is being freed to");
7805 }
7806 }
7807
7808 static void
zone_find_n_largest(const uint32_t n,zone_t * largest_zones,uint64_t * zone_size)7809 zone_find_n_largest(const uint32_t n, zone_t *largest_zones,
7810 uint64_t *zone_size)
7811 {
7812 zone_index_foreach(zid) {
7813 zone_t z = &zone_array[zid];
7814 vm_offset_t size = zone_size_wired(z);
7815
7816 if (zid == ZONE_ID_VM_PAGES) {
7817 continue;
7818 }
7819 for (uint32_t i = 0; i < n; i++) {
7820 if (size > zone_size[i]) {
7821 largest_zones[i] = z;
7822 zone_size[i] = size;
7823 break;
7824 }
7825 }
7826 }
7827 }
7828
7829 #define NUM_LARGEST_ZONES 5
7830 static void
panic_display_largest_zones(void)7831 panic_display_largest_zones(void)
7832 {
7833 zone_t largest_zones[NUM_LARGEST_ZONES] = { NULL };
7834 uint64_t largest_size[NUM_LARGEST_ZONES] = { 0 };
7835
7836 zone_find_n_largest(NUM_LARGEST_ZONES, (zone_t *) &largest_zones,
7837 (uint64_t *) &largest_size);
7838
7839 paniclog_append_noflush("Largest zones:\n%-28s %10s %10s\n",
7840 "Zone Name", "Cur Size", "Free Size");
7841 for (uint32_t i = 0; i < NUM_LARGEST_ZONES; i++) {
7842 zone_t z = largest_zones[i];
7843 paniclog_append_noflush("%-8s%-20s %9luM %9luK\n",
7844 zone_heap_name(z), z->z_name,
7845 (uintptr_t)largest_size[i] >> 20,
7846 (uintptr_t)zone_size_free(z) >> 10);
7847 }
7848 }
7849
7850 static void
panic_display_zprint(void)7851 panic_display_zprint(void)
7852 {
7853 panic_display_largest_zones();
7854 paniclog_append_noflush("%-20s %10lu\n", "Kernel Stacks",
7855 (uintptr_t)(kernel_stack_size * stack_total));
7856 #if defined (__x86_64__)
7857 paniclog_append_noflush("%-20s %10lu\n", "PageTables",
7858 (uintptr_t)ptoa(inuse_ptepages_count));
7859 #endif
7860 paniclog_append_noflush("%-20s %10lu\n", "Kalloc.Large",
7861 (uintptr_t)kalloc_large_total);
7862
7863 if (panic_kext_memory_info) {
7864 mach_memory_info_t *mem_info = panic_kext_memory_info;
7865
7866 paniclog_append_noflush("\n%-5s %10s\n", "Kmod", "Size");
7867 for (uint32_t i = 0; i < panic_kext_memory_size / sizeof(mem_info[0]); i++) {
7868 if ((mem_info[i].flags & VM_KERN_SITE_TYPE) != VM_KERN_SITE_KMOD) {
7869 continue;
7870 }
7871 if (mem_info[i].size > (1024 * 1024)) {
7872 paniclog_append_noflush("%-5lld %10lld\n",
7873 mem_info[i].site, mem_info[i].size);
7874 }
7875 }
7876 }
7877 }
7878
7879 static void
panic_display_zone_info(void)7880 panic_display_zone_info(void)
7881 {
7882 paniclog_append_noflush("Zone info:\n");
7883 paniclog_append_noflush(" Zone map: %p - %p\n",
7884 (void *) zone_info.zi_map_range.min_address,
7885 (void *) zone_info.zi_map_range.max_address);
7886 #if CONFIG_PROB_GZALLOC
7887 if (pgz_submap) {
7888 paniclog_append_noflush(" . PGZ : %p - %p\n",
7889 (void *) pgz_submap->min_offset,
7890 (void *) pgz_submap->max_offset);
7891 }
7892 #endif /* CONFIG_PROB_GZALLOC */
7893 for (int i = 0; i < Z_SUBMAP_IDX_COUNT; i++) {
7894 vm_map_t map = zone_submaps[i];
7895
7896 if (map == VM_MAP_NULL) {
7897 continue;
7898 }
7899 paniclog_append_noflush(" . %-6s: %p - %p\n",
7900 zone_submaps_names[i],
7901 (void *) map->min_offset,
7902 (void *) map->max_offset);
7903 }
7904 paniclog_append_noflush(" Metadata: %p - %p\n"
7905 " Bitmaps : %p - %p\n"
7906 "\n",
7907 (void *) zone_info.zi_meta_range.min_address,
7908 (void *) zone_info.zi_meta_range.max_address,
7909 (void *) zone_info.zi_bits_range.min_address,
7910 (void *) zone_info.zi_bits_range.max_address);
7911 }
7912
7913 static void
panic_display_zone_fault(vm_offset_t addr)7914 panic_display_zone_fault(vm_offset_t addr)
7915 {
7916 struct zone_page_metadata meta = { };
7917 vm_map_t map = VM_MAP_NULL;
7918 vm_offset_t oob_offs = 0, size = 0;
7919 int map_idx = -1;
7920 zone_t z = NULL;
7921 const char *kind = "whild deref";
7922 bool oob = false;
7923
7924 /*
7925 * First: look if we bumped into guard pages between submaps
7926 */
7927 for (int i = 0; i < Z_SUBMAP_IDX_COUNT; i++) {
7928 map = zone_submaps[i];
7929 if (map == VM_MAP_NULL) {
7930 continue;
7931 }
7932
7933 if (addr >= map->min_offset && addr < map->max_offset) {
7934 map_idx = i;
7935 break;
7936 }
7937 }
7938
7939 if (map_idx == -1) {
7940 /* this really shouldn't happen, submaps are back to back */
7941 return;
7942 }
7943
7944 paniclog_append_noflush("Probabilistic GZAlloc Report:\n");
7945
7946 /*
7947 * Second: look if there's just no metadata at all
7948 */
7949 if (ml_nofault_copy((vm_offset_t)zone_meta_from_addr(addr),
7950 (vm_offset_t)&meta, sizeof(meta)) != sizeof(meta) ||
7951 meta.zm_index == 0 || meta.zm_index >= MAX_ZONES ||
7952 zone_array[meta.zm_index].z_self == NULL) {
7953 paniclog_append_noflush(" Zone : <unknown>\n");
7954 kind = "wild deref, missing or invalid metadata";
7955 } else {
7956 z = &zone_array[meta.zm_index];
7957 paniclog_append_noflush(" Zone : %s%s\n",
7958 zone_heap_name(z), zone_name(z));
7959 if (meta.zm_chunk_len == ZM_PGZ_GUARD) {
7960 kind = "out-of-bounds (high confidence)";
7961 oob = true;
7962 size = zone_element_size((void *)addr,
7963 &z, false, &oob_offs);
7964 } else {
7965 kind = "use-after-free (medium confidence)";
7966 }
7967 }
7968
7969 paniclog_append_noflush(" Address : %p\n", (void *)addr);
7970 if (oob) {
7971 paniclog_append_noflush(" Element : [%p, %p) of size %d\n",
7972 (void *)(trunc_page(addr) - (size - oob_offs)),
7973 (void *)trunc_page(addr), (uint32_t)(size - oob_offs));
7974 }
7975 paniclog_append_noflush(" Submap : %s [%p; %p)\n",
7976 zone_submaps_names[map_idx],
7977 (void *)map->min_offset, (void *)map->max_offset);
7978 paniclog_append_noflush(" Kind : %s\n", kind);
7979 if (oob) {
7980 paniclog_append_noflush(" Access : %d byte(s) past\n",
7981 (uint32_t)(addr & PAGE_MASK) + 1);
7982 }
7983 paniclog_append_noflush(" Metadata: zid:%d inl:%d cl:0x%x "
7984 "0x%04x 0x%08x 0x%08x 0x%08x\n",
7985 meta.zm_index, meta.zm_inline_bitmap, meta.zm_chunk_len,
7986 meta.zm_alloc_size, meta.zm_bitmap,
7987 meta.zm_page_next.packed_address,
7988 meta.zm_page_prev.packed_address);
7989 paniclog_append_noflush("\n");
7990 }
7991
/*
 * Panic-path entry point appending all zalloc diagnostics to the panic
 * log: zone map layout, fault analysis for a faulting address inside
 * the zone maps, zone size tables, leak and kalloc-type reports.
 */
void
panic_display_zalloc(void)
{
	bool keepsyms = false;

	PE_parse_boot_argn("keepsyms", &keepsyms, sizeof(keepsyms));

	panic_display_zone_info();

	if (panic_fault_address) {
#if CONFIG_PROB_GZALLOC
		if (pgz_owned(panic_fault_address)) {
			panic_display_pgz_uaf_info(keepsyms, panic_fault_address);
		} else
#endif /* CONFIG_PROB_GZALLOC */
		/* note: the if below chains with the else across #endif */
		if (zone_maps_owned(panic_fault_address, 1)) {
			panic_display_zone_fault(panic_fault_address);
		}
	}

	if (panic_include_zprint) {
		panic_display_zprint();
	} else if (zone_map_nearing_threshold(ZONE_MAP_EXHAUSTION_PRINT_PANIC)) {
		/* Near exhaustion: at least name the likely culprits. */
		panic_display_largest_zones();
	}
#if CONFIG_ZLEAKS
	if (zleak_active) {
		panic_display_zleaks(keepsyms);
	}
#endif
	if (panic_include_kalloc_types) {
		panic_display_kalloc_types();
	}
}
8026
8027 /*
8028 * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
8029 * requesting zone information.
8030 * Frees unused pages towards the end of the region, and zero'es out unused
8031 * space on the last page.
8032 */
static vm_map_copy_t
create_vm_map_copy(
	vm_offset_t start_addr,
	vm_size_t total_size,
	vm_size_t used_size)
{
	kern_return_t kr;
	vm_offset_t end_addr;
	vm_size_t free_size;
	vm_map_copy_t copy;

	if (used_size != total_size) {
		end_addr = start_addr + used_size;
		/* whole pages beyond the used portion can be given back */
		free_size = total_size - (round_page(end_addr) - start_addr);

		if (free_size >= PAGE_SIZE) {
			kmem_free(ipc_kernel_map,
			    round_page(end_addr), free_size);
		}
		/*
		 * Scrub the tail of the last used page so stale kernel
		 * data can't leak to the receiver of the copy.
		 */
		bzero((char *) end_addr, round_page(end_addr) - end_addr);
	}

	/* src_destroy == TRUE: the source range is consumed by the copy */
	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
	    (vm_map_size_t)used_size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	return copy;
}
8061
/*
 * Snapshot one zone's name and/or statistics.
 *
 * @param z   the zone to inspect (must not be ZONE_NULL)
 * @param zn  if non-NULL, receives the zone's heap-prefixed name
 * @param zi  if non-NULL, receives the zone's counters
 *
 * @returns FALSE when the zone slot is not live (z_self cleared),
 *          TRUE otherwise.
 */
static boolean_t
get_zone_info(
	zone_t z,
	mach_zone_name_t *zn,
	mach_zone_info_t *zi)
{
	struct zone zcopy;
	vm_size_t cached = 0;

	assert(z != ZONE_NULL);
	zone_lock(z);
	if (!z->z_self) {
		/* slot exists but the zone isn't published (or was destroyed) */
		zone_unlock(z);
		return FALSE;
	}
	/* copy under the lock; derive everything from the snapshot after */
	zcopy = *z;
	if (z->z_pcpu_cache) {
		/* elements held in per-CPU caches don't count as allocated */
		zpercpu_foreach(zc, z->z_pcpu_cache) {
			cached += zc->zc_alloc_cur + zc->zc_free_cur;
			cached += zc->zc_depot_cur * zc_mag_size();
		}
	}
	zone_unlock(z);

	if (zn != NULL) {
		/*
		 * Append kalloc heap name to zone name (if zone is used by kalloc)
		 */
		char temp_zone_name[MAX_ZONE_NAME] = "";
		snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
		    zone_heap_name(z), z->z_name);

		/* assuming here the name data is static */
		(void) __nosan_strlcpy(zn->mzn_name, temp_zone_name,
		    strlen(temp_zone_name) + 1);
	}

	if (zi != NULL) {
		*zi = (mach_zone_info_t) {
			.mzi_count = zone_count_allocated(&zcopy) - cached,
			.mzi_cur_size = ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_cur)),
			// max_size for zprint is now high-watermark of pages used
			.mzi_max_size = ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_hwm)),
			.mzi_elem_size = zone_scale_for_percpu(&zcopy, zcopy.z_elem_size),
			.mzi_alloc_size = ptoa_64(zcopy.z_chunk_pages),
			.mzi_exhaustible = (uint64_t)zcopy.exhaustible,
		};
		/* lifetime bytes allocated, summed over all CPUs */
		zpercpu_foreach(zs, zcopy.z_stats) {
			zi->mzi_sum_size += zs->zs_mem_allocated;
		}
		if (zcopy.collectable) {
			SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable,
			    ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_empty)));
			SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
		}
	}

	return TRUE;
}
8121
/*
 * Obsolete MIG entry point: per-task zone accounting is not supported,
 * so this unconditionally fails. Use mach_zone_info() for global stats.
 */
kern_return_t
task_zone_info(
	__unused task_t task,
	__unused mach_zone_name_array_t *namesp,
	__unused mach_msg_type_number_t *namesCntp,
	__unused task_zone_info_array_t *infop,
	__unused mach_msg_type_number_t *infoCntp)
{
	return KERN_FAILURE;
}
8132
8133
8134 /* mach_memory_info entitlement */
8135 #define MEMORYINFO_ENTITLEMENT "com.apple.private.memoryinfo"
8136
8137 /* macro needed to rate-limit mach_memory_info */
8138 #define NSEC_DAY (NSEC_PER_SEC * 60 * 60 * 24)
8139
8140 /* declarations necessary to call kauth_cred_issuser() */
8141 struct ucred;
8142 extern int kauth_cred_issuser(struct ucred *);
8143 extern struct ucred *kauth_cred_get(void);
8144
/*
 * Gatekeeper for mach_memory_info() and its wrappers.
 *
 * Always requires the caller to be root.  When
 * CONFIG_DEBUGGER_FOR_ZONE_INFO is set, additionally requires the
 * memoryinfo entitlement and rate-limits successful calls to two per
 * NSEC_DAY window per boot.
 *
 * @returns KERN_SUCCESS, KERN_NO_ACCESS (not root), or KERN_DENIED
 *          (missing entitlement or rate-limited).
 */
static kern_return_t
mach_memory_info_security_check(void)
{
	/* If not root or does not have the memoryinfo entitlement, fail */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return KERN_NO_ACCESS;
	}

#if CONFIG_DEBUGGER_FOR_ZONE_INFO
	if (!IOTaskHasEntitlement(current_task(), MEMORYINFO_ENTITLEMENT)) {
		return KERN_DENIED;
	}

	/*
	 * On release non-mac arm devices, allow mach_memory_info
	 * to be called twice per day per boot. memorymaintenanced
	 * calls it once per day, which leaves room for a sysdiagnose.
	 */
	static uint64_t first_call, second_call = 0;
	uint64_t now = 0;
	absolutetime_to_nanoseconds(ml_get_timebase(), &now);

	if (!first_call) {
		/* first use of this window's budget */
		first_call = now;
	} else if (!second_call) {
		/* second (last) use of this window's budget */
		second_call = now;
	} else if (first_call + NSEC_DAY > now) {
		/* budget exhausted and the window hasn't elapsed */
		return KERN_DENIED;
	} else if (first_call + NSEC_DAY < now) {
		/* window elapsed: restart it with this call */
		first_call = now;
		second_call = 0;
	}
#endif

	return KERN_SUCCESS;
}
8181
8182 kern_return_t
mach_zone_info(host_priv_t host,mach_zone_name_array_t * namesp,mach_msg_type_number_t * namesCntp,mach_zone_info_array_t * infop,mach_msg_type_number_t * infoCntp)8183 mach_zone_info(
8184 host_priv_t host,
8185 mach_zone_name_array_t *namesp,
8186 mach_msg_type_number_t *namesCntp,
8187 mach_zone_info_array_t *infop,
8188 mach_msg_type_number_t *infoCntp)
8189 {
8190 return mach_memory_info(host, namesp, namesCntp, infop, infoCntp, NULL, NULL);
8191 }
8192
8193
/*
 * MIG entry point: return all zone names, per-zone statistics, and
 * (optionally) vm_page_diagnose() memory info to a privileged caller.
 *
 * Output arrays are allocated in ipc_kernel_map and handed to the
 * caller as vm_map_copy_t objects (ownership transfers via copyin with
 * src_destroy).
 *
 * @returns KERN_SUCCESS, or the failure from the security check or an
 *          allocation.
 */
kern_return_t
mach_memory_info(
	host_priv_t host,
	mach_zone_name_array_t *namesp,
	mach_msg_type_number_t *namesCntp,
	mach_zone_info_array_t *infop,
	mach_msg_type_number_t *infoCntp,
	mach_memory_info_array_t *memoryInfop,
	mach_msg_type_number_t *memoryInfoCntp)
{
	mach_zone_name_t *names;
	vm_offset_t names_addr;
	vm_size_t names_size;

	mach_zone_info_t *info;
	vm_offset_t info_addr;
	vm_size_t info_size;

	mach_memory_info_t *memory_info;
	vm_offset_t memory_info_addr;
	vm_size_t memory_info_size;
	vm_size_t memory_info_vmsize;
	unsigned int num_info;

	unsigned int max_zones, used_zones, i;
	mach_zone_name_t *zn;
	mach_zone_info_t *zi;
	kern_return_t kr;

	uint64_t zones_collectable_bytes = 0;

	if (host == HOST_NULL) {
		return KERN_INVALID_HOST;
	}

	/* root + (possibly) entitlement + rate limit */
	kr = mach_memory_info_security_check();
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	/*
	 * We assume that zones aren't freed once allocated.
	 * We won't pick up any zones that are allocated later.
	 */

	max_zones = os_atomic_load(&num_zones, relaxed);

	names_size = round_page(max_zones * sizeof *names);
	kr = kmem_alloc(ipc_kernel_map, &names_addr, names_size,
	    KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	names = (mach_zone_name_t *) names_addr;

	info_size = round_page(max_zones * sizeof *info);
	kr = kmem_alloc(ipc_kernel_map, &info_addr, info_size,
	    KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC);
	if (kr != KERN_SUCCESS) {
		/* undo the names allocation before bailing */
		kmem_free(ipc_kernel_map,
		    names_addr, names_size);
		return kr;
	}
	info = (mach_zone_info_t *) info_addr;

	zn = &names[0];
	zi = &info[0];

	/* fill both arrays in lock-step, skipping dead zone slots */
	used_zones = max_zones;
	for (i = 0; i < max_zones; i++) {
		if (!get_zone_info(&(zone_array[i]), zn, zi)) {
			used_zones--;
			continue;
		}
		zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
		zn++;
		zi++;
	}

	/* hand the arrays to the caller; unused tail pages are trimmed */
	*namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
	*namesCntp = used_zones;

	*infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
	*infoCntp = used_zones;

	num_info = 0;
	memory_info_addr = 0;

	if (memoryInfop && memoryInfoCntp) {
		vm_map_copy_t copy;
		num_info = vm_page_diagnose_estimate();
		memory_info_size = num_info * sizeof(*memory_info);
		memory_info_vmsize = round_page(memory_info_size);
		kr = kmem_alloc(ipc_kernel_map, &memory_info_addr, memory_info_vmsize,
		    KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC);
		if (kr != KERN_SUCCESS) {
			return kr;
		}

		/* wire the buffer while vm_page_diagnose() writes into it */
		kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
		    VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
		assert(kr == KERN_SUCCESS);

		memory_info = (mach_memory_info_t *) memory_info_addr;
		vm_page_diagnose(memory_info, num_info, zones_collectable_bytes);

		kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
		assert(kr == KERN_SUCCESS);

		/* src_destroy == TRUE: copy consumes the allocation */
		kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
		    (vm_map_size_t)memory_info_size, TRUE, &copy);
		assert(kr == KERN_SUCCESS);

		*memoryInfop = (mach_memory_info_t *) copy;
		*memoryInfoCntp = num_info;
	}

	return KERN_SUCCESS;
}
8313
/*
 * MIG entry point: look up a single zone by (heap-prefixed) name and
 * return its statistics.
 *
 * @returns KERN_SUCCESS on match, KERN_INVALID_ARGUMENT when no zone
 *          matches (or infop is NULL), KERN_INVALID_HOST without
 *          privilege, KERN_FAILURE if the matched zone isn't live.
 */
kern_return_t
mach_zone_info_for_zone(
	host_priv_t host,
	mach_zone_name_t name,
	mach_zone_info_t *infop)
{
	zone_t zone_ptr;

	if (host == HOST_NULL) {
		return KERN_INVALID_HOST;
	}

#if CONFIG_DEBUGGER_FOR_ZONE_INFO
	/* on these configs, only debugger-enabled devices may introspect */
	if (!PE_i_can_has_debugger(NULL)) {
		return KERN_INVALID_HOST;
	}
#endif

	if (infop == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	zone_ptr = ZONE_NULL;
	zone_foreach(z) {
		/*
		 * Append kalloc heap name to zone name (if zone is used by kalloc)
		 */
		char temp_zone_name[MAX_ZONE_NAME] = "";
		snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
		    zone_heap_name(z), z->z_name);

		/* Find the requested zone by name */
		if (track_this_zone(temp_zone_name, name.mzn_name)) {
			zone_ptr = z;
			break;
		}
	}

	/* No zones found with the requested zone name */
	if (zone_ptr == ZONE_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (get_zone_info(zone_ptr, NULL, infop)) {
		return KERN_SUCCESS;
	}
	return KERN_FAILURE;
}
8362
/*
 * MIG entry point: return the name and statistics of the largest zone.
 *
 * @returns KERN_SUCCESS, KERN_INVALID_HOST without privilege,
 *          KERN_INVALID_ARGUMENT for NULL out-params, KERN_FAILURE if
 *          the largest zone couldn't be snapshotted.
 */
kern_return_t
mach_zone_info_for_largest_zone(
	host_priv_t host,
	mach_zone_name_t *namep,
	mach_zone_info_t *infop)
{
	if (host == HOST_NULL) {
		return KERN_INVALID_HOST;
	}

#if CONFIG_DEBUGGER_FOR_ZONE_INFO
	/* on these configs, only debugger-enabled devices may introspect */
	if (!PE_i_can_has_debugger(NULL)) {
		return KERN_INVALID_HOST;
	}
#endif

	if (namep == NULL || infop == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	if (get_zone_info(zone_find_largest(NULL), namep, infop)) {
		return KERN_SUCCESS;
	}
	return KERN_FAILURE;
}
8388
8389 uint64_t
get_zones_collectable_bytes(void)8390 get_zones_collectable_bytes(void)
8391 {
8392 uint64_t zones_collectable_bytes = 0;
8393 mach_zone_info_t zi;
8394
8395 zone_foreach(z) {
8396 if (get_zone_info(z, NULL, &zi)) {
8397 zones_collectable_bytes +=
8398 GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
8399 }
8400 }
8401
8402 return zones_collectable_bytes;
8403 }
8404
8405 kern_return_t
mach_zone_get_zlog_zones(host_priv_t host,mach_zone_name_array_t * namesp,mach_msg_type_number_t * namesCntp)8406 mach_zone_get_zlog_zones(
8407 host_priv_t host,
8408 mach_zone_name_array_t *namesp,
8409 mach_msg_type_number_t *namesCntp)
8410 {
8411 #if ZONE_ENABLE_LOGGING
8412 unsigned int max_zones, logged_zones, i;
8413 kern_return_t kr;
8414 zone_t zone_ptr;
8415 mach_zone_name_t *names;
8416 vm_offset_t names_addr;
8417 vm_size_t names_size;
8418
8419 if (host == HOST_NULL) {
8420 return KERN_INVALID_HOST;
8421 }
8422
8423 if (namesp == NULL || namesCntp == NULL) {
8424 return KERN_INVALID_ARGUMENT;
8425 }
8426
8427 max_zones = os_atomic_load(&num_zones, relaxed);
8428
8429 names_size = round_page(max_zones * sizeof *names);
8430 kr = kmem_alloc(ipc_kernel_map, &names_addr, names_size,
8431 KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC);
8432 if (kr != KERN_SUCCESS) {
8433 return kr;
8434 }
8435 names = (mach_zone_name_t *) names_addr;
8436
8437 zone_ptr = ZONE_NULL;
8438 logged_zones = 0;
8439 for (i = 0; i < max_zones; i++) {
8440 zone_t z = &(zone_array[i]);
8441 assert(z != ZONE_NULL);
8442
8443 /* Copy out the zone name if zone logging is enabled */
8444 if (z->z_btlog) {
8445 get_zone_info(z, &names[logged_zones], NULL);
8446 logged_zones++;
8447 }
8448 }
8449
8450 *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
8451 *namesCntp = logged_zones;
8452
8453 return KERN_SUCCESS;
8454
8455 #else /* ZONE_ENABLE_LOGGING */
8456 #pragma unused(host, namesp, namesCntp)
8457 return KERN_FAILURE;
8458 #endif /* ZONE_ENABLE_LOGGING */
8459 }
8460
/*
 * MIG entry point: return the backtrace-log records of the named zone.
 *
 * Finds the zone by (heap-prefixed) name, requires logging to be active
 * for it, and hands the record buffer produced by btlog_get_records()
 * to the caller as a vm_map_copy_t.
 *
 * @returns KERN_SUCCESS, KERN_INVALID_HOST/KERN_INVALID_ARGUMENT on bad
 *          parameters or unknown zone, KERN_FAILURE when logging is off
 *          for the zone (or compiled out), or the btlog_get_records()
 *          failure code.
 */
kern_return_t
mach_zone_get_btlog_records(
	host_priv_t host,
	mach_zone_name_t name,
	zone_btrecord_array_t *recsp,
	mach_msg_type_number_t *numrecs)
{
#if ZONE_ENABLE_LOGGING
	zone_btrecord_t *recs;
	kern_return_t kr;
	vm_address_t addr;
	vm_size_t size;
	zone_t zone_ptr;
	vm_map_copy_t copy;

	if (host == HOST_NULL) {
		return KERN_INVALID_HOST;
	}

	if (recsp == NULL || numrecs == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	zone_ptr = ZONE_NULL;
	zone_foreach(z) {
		/*
		 * Append kalloc heap name to zone name (if zone is used by kalloc)
		 */
		char temp_zone_name[MAX_ZONE_NAME] = "";
		snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
		    zone_heap_name(z), z->z_name);

		/* Find the requested zone by name */
		if (track_this_zone(temp_zone_name, name.mzn_name)) {
			zone_ptr = z;
			break;
		}
	}

	/* No zones found with the requested zone name */
	if (zone_ptr == ZONE_NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	/* Logging not turned on for the requested zone */
	if (!zone_ptr->z_btlog) {
		return KERN_FAILURE;
	}

	kr = btlog_get_records(zone_ptr->z_btlog, &recs, numrecs);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	addr = (vm_address_t)recs;
	size = sizeof(zone_btrecord_t) * *numrecs;

	/* src_destroy == TRUE: the record buffer is consumed by the copy */
	kr = vm_map_copyin(ipc_kernel_map, addr, size, TRUE, &copy);
	assert(kr == KERN_SUCCESS);

	*recsp = (zone_btrecord_t *)copy;
	return KERN_SUCCESS;

#else /* !ZONE_ENABLE_LOGGING */
#pragma unused(host, name, recsp, numrecs)
	return KERN_FAILURE;
#endif /* !ZONE_ENABLE_LOGGING */
}
8529
8530
/*
 * MIG entry point: force a zone garbage collection.
 *
 * Only acts on DEBUG/DEVELOPMENT kernels; on RELEASE it validates the
 * host and returns success without doing anything.
 */
kern_return_t
mach_zone_force_gc(
	host_t host)
{
	if (host == HOST_NULL) {
		return KERN_INVALID_HOST;
	}

#if DEBUG || DEVELOPMENT
	extern boolean_t(*volatile consider_buffer_cache_collect)(int);
	/* Callout to buffer cache GC to drop elements in the apfs zones */
	if (consider_buffer_cache_collect != NULL) {
		(void)(*consider_buffer_cache_collect)(0);
	}
	zone_gc(ZONE_GC_DRAIN);
#endif /* DEBUG || DEVELOPMENT */
	return KERN_SUCCESS;
}
8549
8550 zone_t
zone_find_largest(uint64_t * zone_size)8551 zone_find_largest(uint64_t *zone_size)
8552 {
8553 zone_t largest_zone = 0;
8554 uint64_t largest_zone_size = 0;
8555 zone_find_n_largest(1, &largest_zone, &largest_zone_size);
8556 if (zone_size) {
8557 *zone_size = largest_zone_size;
8558 }
8559 return largest_zone;
8560 }
8561
8562 #endif /* !ZALLOC_TEST */
8563 #pragma mark zone creation, configuration, destruction
8564 #if !ZALLOC_TEST
8565
/*
 * Initialize the zone slot @c zid with its default configuration:
 * unlimited wired pages, collectable, expandable, an initialized lock
 * and an empty recirculation list.
 *
 * @returns the zone for that slot.
 */
static zone_t
zone_init_defaults(zone_id_t zid)
{
	zone_t z = &zone_array[zid];

	z->z_wired_max = ~0u;
	z->collectable = true;
	z->expandable = true;

	lck_ticket_init(&z->z_lock, &zone_locks_grp);
	STAILQ_INIT(&z->z_recirc);
	return z;
}
8579
8580 static bool
zone_is_initializing(zone_t z)8581 zone_is_initializing(zone_t z)
8582 {
8583 return !z->z_self && !z->z_destroyed;
8584 }
8585
/*
 * Cap the zone at the number of pages needed for @c nelems elements and
 * forbid further expansion.  Must be called before zone_create()
 * publishes the zone; panics otherwise.
 */
void
zone_set_noexpand(zone_t zone, vm_size_t nelems)
{
	if (!zone_is_initializing(zone)) {
		panic("%s: called after zone_create()", __func__);
	}
	zone->expandable = false;
	zone->z_wired_max = zone_alloc_pages_for_nelems(zone, nelems);
}
8595
/*
 * Mark the zone exhaustible with a hard cap of @c nelems elements worth
 * of pages (allocations fail rather than grow past the cap).  Must be
 * called before zone_create() publishes the zone; panics otherwise.
 */
void
zone_set_exhaustible(zone_t zone, vm_size_t nelems)
{
	if (!zone_is_initializing(zone)) {
		panic("%s: called after zone_create()", __func__);
	}
	zone->expandable = false;
	zone->exhaustible = true;
	zone->z_wired_max = zone_alloc_pages_for_nelems(zone, nelems);
}
8606
8607 /**
8608 * @function zone_create_find
8609 *
8610 * @abstract
8611 * Finds an unused zone for the given name and element size.
8612 *
8613 * @param name the zone name
8614 * @param size the element size (including redzones, ...)
8615 * @param flags the flags passed to @c zone_create*
8616 * @param zid_inout the desired zone ID or ZONE_ID_ANY
8617 *
8618 * @returns a zone to initialize further.
8619 */
static zone_t
zone_create_find(
	const char *name,
	vm_size_t size,
	zone_create_flags_t flags,
	zone_id_t *zid_inout)
{
	zone_id_t nzones, zid = *zid_inout;
	zone_t z;

	simple_lock(&all_zones_lock, &zone_locks_grp);

	nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed);
	assert(num_zones_in_use <= nzones && nzones < MAX_ZONES);

	if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) {
		/*
		 * The first time around, make sure the reserved zone IDs
		 * have an initialized lock as zone_index_foreach() will
		 * enumerate them.
		 */
		while (nzones < ZONE_ID__FIRST_DYNAMIC) {
			zone_init_defaults(nzones++);
		}

		os_atomic_store(&num_zones, nzones, release);
	}

	if (zid != ZONE_ID_ANY) {
		/* caller asked for a fixed, reserved zone ID: validate it */
		if (zid >= ZONE_ID__FIRST_DYNAMIC) {
			panic("zone_create: invalid desired zone ID %d for %s",
			    zid, name);
		}
		if (flags & ZC_DESTRUCTIBLE) {
			panic("zone_create: ID %d (%s) must be permanent", zid, name);
		}
		if (zone_array[zid].z_self) {
			panic("zone_create: creating zone ID %d (%s) twice", zid, name);
		}
		z = &zone_array[zid];
	} else {
		if (flags & ZC_DESTRUCTIBLE) {
			/*
			 * If possible, find a previously zdestroy'ed zone in the
			 * zone_array that we can reuse.
			 */
			for (int i = bitmap_first(zone_destroyed_bitmap, MAX_ZONES);
			    i >= 0; i = bitmap_next(zone_destroyed_bitmap, i)) {
				z = &zone_array[i];

				/*
				 * If the zone name and the element size are the
				 * same, we can just reuse the old zone struct.
				 */
				if (strcmp(z->z_name, name) || zone_elem_size(z) != size) {
					continue;
				}
				bitmap_clear(zone_destroyed_bitmap, i);
				z->z_destroyed = false;
				z->z_self = z;
				zid = (zone_id_t)i;
				goto out;
			}
		}

		/* no reusable slot: take a fresh one at the end */
		zid = nzones++;
		z = zone_init_defaults(zid);

		/*
		 * The release barrier pairs with the acquire in
		 * zone_index_foreach() and makes sure that enumeration loops
		 * always see an initialized zone lock.
		 */
		os_atomic_store(&num_zones, nzones, release);
	}

out:
	num_zones_in_use++;
	simple_unlock(&all_zones_lock);

	*zid_inout = zid;
	return z;
}
8703
/*
 * Panic helper for incompatible zone_create() flag combinations;
 * used by the zone_create_assert_not_both() macro below.
 */
__abortlike
static void
zone_create_panic(const char *name, const char *f1, const char *f2)
{
	panic("zone_create: creating zone %s: flag %s and %s are incompatible",
	    name, f1, f2);
}
/* Panic if `flags` contains `forbidden_flag` alongside `current_flag`. */
#define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \
	if ((flags) & forbidden_flag) { \
	        zone_create_panic(name, #current_flag, #forbidden_flag); \
	}
8715
8716 /*
8717 * Adjusts the size of the element based on minimum size, alignment
8718 * and kasan redzones
8719 */
static vm_size_t
zone_elem_adjust_size(
	const char *name __unused,
	vm_size_t elem_size,
	zone_create_flags_t flags __unused,
	uint32_t *redzone __unused)
{
	vm_size_t size;
	/*
	 * Adjust element size for minimum size and pointer alignment
	 */
	size = (elem_size + ZONE_ALIGN_SIZE - 1) & -ZONE_ALIGN_SIZE;
	if (size < ZONE_MIN_ELEM_SIZE) {
		size = ZONE_MIN_ELEM_SIZE;
	}

#if KASAN_ZALLOC
	/*
	 * Expand the zone allocation size to include the redzones.
	 *
	 * For page-multiple zones add a full guard page because they
	 * likely require alignment.
	 */
	uint32_t redzone_tmp;
	if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU)) {
		/* caller opted out, or per-CPU layout excludes redzones */
		redzone_tmp = 0;
	} else if ((size & PAGE_MASK) == 0) {
		if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) {
			panic("zone_create: zone %s can't provide more than PAGE_SIZE"
			    "alignment", name);
		}
		redzone_tmp = PAGE_SIZE;
	} else if (flags & ZC_ALIGNMENT_REQUIRED) {
		/* a redzone would break the requested alignment */
		redzone_tmp = 0;
	} else {
		redzone_tmp = KASAN_GUARD_SIZE;
	}
	/* one redzone on each side of the element */
	size += redzone_tmp * 2;
	if (redzone) {
		*redzone = redzone_tmp;
	}
#endif
	return size;
}
8764
8765 /*
8766 * Returns the allocation chunk size that has least framentation
8767 */
static vm_size_t
zone_get_min_alloc_granule(
	vm_size_t elem_size,
	zone_create_flags_t flags)
{
	vm_size_t alloc_granule = PAGE_SIZE;
	if (flags & ZC_PERCPU) {
		/* one page per CPU per chunk */
		alloc_granule = PAGE_SIZE * zpercpu_count();
		if (PAGE_SIZE % elem_size > 256) {
			panic("zone_create: per-cpu zone has too much fragmentation");
		}
	} else if (flags & ZC_READONLY) {
		alloc_granule = PAGE_SIZE;
	} else if ((elem_size & PAGE_MASK) == 0) {
		/* zero fragmentation by definition */
		alloc_granule = elem_size;
	} else if (alloc_granule % elem_size == 0) {
		/* zero fragmentation by definition */
	} else {
		/* search chunk sizes for the one wasting the fewest bytes */
		vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule;
		vm_size_t alloc_tmp = PAGE_SIZE;
		vm_size_t max_chunk_size = ZONE_MAX_ALLOC_SIZE;

#if __arm64__
		/*
		 * Increase chunk size to 48K for sizes larger than 4K on 16k
		 * machines, so as to reduce internal fragmentation for kalloc
		 * zones with sizes 12K and 24K.
		 */
		if (elem_size > 4 * 1024 && PAGE_SIZE == 16 * 1024) {
			max_chunk_size = 48 * 1024;
		}
#endif
		while ((alloc_tmp += PAGE_SIZE) <= max_chunk_size) {
			vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp;
			if (frag_tmp < frag) {
				frag = frag_tmp;
				alloc_granule = alloc_tmp;
			}
		}
	}
	return alloc_granule;
}
8811
8812 vm_size_t
zone_get_early_alloc_size(const char * name __unused,vm_size_t elem_size,zone_create_flags_t flags,vm_size_t min_elems)8813 zone_get_early_alloc_size(
8814 const char *name __unused,
8815 vm_size_t elem_size,
8816 zone_create_flags_t flags,
8817 vm_size_t min_elems)
8818 {
8819 vm_size_t adjusted_size, alloc_granule, chunk_elems;
8820
8821 adjusted_size = zone_elem_adjust_size(name, elem_size, flags, NULL);
8822 alloc_granule = zone_get_min_alloc_granule(adjusted_size, flags);
8823 chunk_elems = alloc_granule / adjusted_size;
8824
8825 return ((min_elems + chunk_elems - 1) / chunk_elems) * alloc_granule;
8826 }
8827
/*
 * Create (or revive) a zone and apply all creation-time configuration:
 * element-size adjustment, slot allocation, name setup, chunk geometry,
 * KPI flag handling, debug features (gzalloc/PGZ/logging/tagging), and
 * security-policy fixups.  Publishes the zone (z_self) last.
 *
 * @param name         the zone name
 * @param size         the requested element size
 * @param flags        ZC_* creation flags
 * @param zid          a fixed reserved zone ID, or ZONE_ID_ANY
 * @param extra_setup  optional block run before debug configuration
 *
 * @returns the initialized (or reused) zone.
 */
zone_t
zone_create_ext(
	const char *name,
	vm_size_t size,
	zone_create_flags_t flags,
	zone_id_t zid,
	void (^extra_setup)(zone_t))
{
	vm_size_t alloc;
	uint32_t redzone;
	zone_t z;
	zone_security_flags_t *zsflags;

	if (size > ZONE_MAX_ALLOC_SIZE) {
		panic("zone_create: element size too large: %zd", (size_t)size);
	}

	if (size < 2 * sizeof(vm_size_t)) {
		/* Elements are too small for kasan. */
		flags |= ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE;
	}

	size = zone_elem_adjust_size(name, size, flags, &redzone);
	/*
	 * Allocate the zone slot, return early if we found an older match.
	 */
	z = zone_create_find(name, size, flags, &zid);
	if (__improbable(z->z_self)) {
		/* We found a zone to reuse */
		return z;
	}

	/*
	 * Initialize the zone properly.
	 */

	/*
	 * If the kernel is post lockdown, copy the zone name passed in.
	 * Else simply maintain a pointer to the name string as it can only
	 * be a core XNU zone (no unloadable kext exists before lockdown).
	 */
	if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
		size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN);
		char *buf = zalloc_permanent(nsz, ZALIGN_NONE);
		strlcpy(buf, name, nsz);
		z->z_name = buf;
	} else {
		z->z_name = name;
	}
	if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) {
		z->z_stats = zalloc_percpu_permanent_type(struct zone_stats);
	} else {
		/*
		 * zone_init() hasn't run yet, use the storage provided by
		 * zone_stats_startup(), and zone_init() will replace it
		 * with the final value once the PERCPU zone exists.
		 */
		z->z_stats = __zpcpu_mangle_for_boot(&zone_stats_startup[zone_index(z)]);
	}

	alloc = zone_get_min_alloc_granule(size, flags);

	/* chunk geometry: pages per chunk and elements per chunk */
	z->z_elem_size = (uint16_t)size;
	z->z_chunk_pages = (uint16_t)atop(alloc);
	if (flags & ZC_PERCPU) {
		z->z_chunk_elems = (uint16_t)(PAGE_SIZE / z->z_elem_size);
	} else {
		z->z_chunk_elems = (uint16_t)(alloc / z->z_elem_size);
	}
	/* sanity check: the encoding must round-trip the largest index */
	if (zone_element_idx(zone_element_encode(0,
	    z->z_chunk_elems - 1)) != z->z_chunk_elems - 1) {
		panic("zone_element_encode doesn't work for zone [%s]", name);
	}

#if KASAN_ZALLOC
	z->z_kasan_redzone = redzone;
	if (strncmp(name, "fakestack.", sizeof("fakestack.") - 1) == 0) {
		z->kasan_fakestacks = true;
	}
#endif

	/*
	 * Handle KPI flags
	 */
	zsflags = &zone_security_array[zid];
	/*
	 * Some zones like ipc ports and procs rely on sequestering for
	 * correctness, so explicitly turn on sequestering despite the
	 * configuration.
	 */
	if (flags & ZC_SEQUESTER) {
		zsflags->z_va_sequester = true;
	}

	/* ZC_CACHING applied after all configuration is done */
	if (flags & ZC_NOCACHING) {
		z->z_nocaching = true;
	}

	if (flags & ZC_READONLY) {
		zone_create_assert_not_both(name, flags, ZC_READONLY, ZC_VM);
#if ZSECURITY_CONFIG(READ_ONLY)
		zsflags->z_submap_idx = Z_SUBMAP_IDX_READ_ONLY;
		zsflags->z_va_sequester = true;
#endif
		zone_ro_elem_size[zid] = (uint16_t)size;
		assert(size <= PAGE_SIZE);
		if ((PAGE_SIZE % size) * 10 >= PAGE_SIZE) {
			panic("Fragmentation greater than 10%% with elem size %d zone %s%s",
			    (uint32_t)size, zone_heap_name(z), z->z_name);
		}
	}

	if (flags & ZC_PERCPU) {
		zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_READONLY);
		zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_PGZ_USE_GUARDS);
		z->z_percpu = true;
		z->z_nogzalloc = true;
	}
	if (flags & ZC_NOGC) {
		z->collectable = false;
	}
	/*
	 * Handle ZC_NOENCRYPT from xnu only
	 */
	if (startup_phase < STARTUP_SUB_LOCKDOWN && flags & ZC_NOENCRYPT) {
		zsflags->z_noencrypt = true;
	}
	if (flags & ZC_ALIGNMENT_REQUIRED) {
		z->alignment_required = true;
	}
	if (flags & (ZC_NOGZALLOC | ZC_READONLY)) {
		z->z_nogzalloc = true;
	}
	if (flags & ZC_NOCALLOUT) {
		z->no_callout = true;
	}
	if (flags & ZC_DESTRUCTIBLE) {
		zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_READONLY);
		z->z_destructible = true;
	}
	/*
	 * Handle Internal flags
	 */
#if ZSECURITY_CONFIG(SAD_FENG_SHUI)
	if (flags & ZC_PGZ_USE_GUARDS) {
		/*
		 * Try to turn on guard pages only for zones
		 * with a chance of OOB.
		 */
		if (startup_phase < STARTUP_SUB_LOCKDOWN) {
			zsflags->z_pgz_use_guards = true;
		}
		z->z_pgz_use_guards = true;
		z->z_pgz_oob_offs = (uint16_t)(alloc -
		    z->z_chunk_elems * z->z_elem_size);
	}
#endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
	if (!(flags & ZC_NOTBITAG)) {
		z->z_tbi_tag = true;
	}
	if (flags & ZC_KALLOC_TYPE) {
		zsflags->z_kalloc_type = true;
	}
	if (flags & ZC_VM) {
		zsflags->z_submap_idx = Z_SUBMAP_IDX_VM;
		zsflags->z_va_sequester = true;
	}
	if (flags & ZC_KASAN_NOQUARANTINE) {
		z->kasan_noquarantine = true;
	}
	/* ZC_KASAN_NOREDZONE already handled */

	/*
	 * Then if there's extra tuning, do it
	 */
	if (extra_setup) {
		extra_setup(z);
	}

	/*
	 * Configure debugging features
	 */
#if CONFIG_GZALLOC
	if (!z->z_nogzalloc && (flags & ZC_VM) == 0) {
		gzalloc_zone_init(z); /* might set z->z_gzalloc_tracked */
		if (z->z_gzalloc_tracked) {
			/* gzalloc supersedes PGZ guards and caching */
			if (startup_phase < STARTUP_SUB_LOCKDOWN) {
				zsflags->z_pgz_use_guards = false;
			}
			z->z_pgz_use_guards = false;
			z->z_pgz_oob_offs = 0;
			z->z_nocaching = true;
		}
	}
#endif
#if CONFIG_PROB_GZALLOC
	if (!z->z_gzalloc_tracked && (flags & (ZC_READONLY | ZC_PERCPU)) == 0) {
		pgz_zone_init(z);
	}
#endif
#if ZONE_ENABLE_LOGGING
	if (!z->z_gzalloc_tracked && startup_phase >= STARTUP_SUB_ZALLOC) {
		/*
		 * Check for and set up zone leak detection
		 * if requested via boot-args.
		 */
		zone_setup_logging(z);
	}
#endif /* ZONE_ENABLE_LOGGING */

#if VM_TAG_SIZECLASSES
	if (!z->z_gzalloc_tracked && (zsflags->z_kheap_id || zsflags->z_kalloc_type)
	    && zone_tagging_on) {
		assert(startup_phase < STARTUP_SUB_LOCKDOWN);
		static uint16_t sizeclass_idx;
		z->z_uses_tags = true;
		z->z_tags_inline = (((page_size + size - 1) / size) <=
		    (sizeof(uint32_t) / sizeof(uint16_t)));
		if (zsflags->z_kheap_id == KHEAP_ID_DEFAULT) {
			zone_tags_sizeclasses[sizeclass_idx] = (uint16_t)size;
			z->z_tags_sizeclass = sizeclass_idx++;
		} else {
			/* reuse an existing size class when one matches */
			uint16_t i = 0;
			for (; i < sizeclass_idx; i++) {
				if (size == zone_tags_sizeclasses[i]) {
					z->z_tags_sizeclass = i;
					break;
				}
			}
			/*
			 * Size class wasn't found, add it to zone_tags_sizeclasses
			 */
			if (i == sizeclass_idx) {
				assert(i < VM_TAG_SIZECLASSES);
				zone_tags_sizeclasses[i] = (uint16_t)size;
				z->z_tags_sizeclass = sizeclass_idx++;
			}
		}
		assert(z->z_tags_sizeclass < VM_TAG_SIZECLASSES);
	}
#endif

	/*
	 * Finally, fixup properties based on security policies, boot-args, ...
	 */
#if ZSECURITY_CONFIG(SUBMAP_USER_DATA)
	if (zsflags->z_kheap_id == KHEAP_ID_DATA_BUFFERS) {
		zsflags->z_submap_idx = Z_SUBMAP_IDX_DATA;
		zsflags->z_va_sequester = false;
	}
#endif

	if ((flags & ZC_CACHING) && !z->z_nocaching) {
		/*
		 * If zcache hasn't been initialized yet, remember our decision,
		 *
		 * zone_enable_caching() will be called again by
		 * zcache_bootstrap(), while the system is still single
		 * threaded, to build the missing caches.
		 */
		if (__probable(zc_magazine_zone)) {
			zone_enable_caching(z);
		} else {
			z->z_pcpu_cache =
			    __zpcpu_mangle_for_boot(&zone_cache_startup[zid]);
		}
	}

	/* publish the zone: setting z_self makes it visible/usable */
	zone_lock(z);
	z->z_self = z;
	zone_unlock(z);

	return z;
}
9103
9104 __startup_func
9105 void
zone_create_startup(struct zone_create_startup_spec * spec)9106 zone_create_startup(struct zone_create_startup_spec *spec)
9107 {
9108 zone_t z;
9109
9110 z = zone_create_ext(spec->z_name, spec->z_size,
9111 spec->z_flags, spec->z_zid, spec->z_setup);
9112 if (spec->z_var) {
9113 *spec->z_var = z;
9114 }
9115 }
9116
9117 /*
9118 * The 4 first field of a zone_view and a zone alias, so that the zone_or_view_t
9119 * union works. trust but verify.
9120 */
9121 #define zalloc_check_zov_alias(f1, f2) \
9122 static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2))
9123 zalloc_check_zov_alias(z_self, zv_zone);
9124 zalloc_check_zov_alias(z_stats, zv_stats);
9125 zalloc_check_zov_alias(z_name, zv_name);
9126 zalloc_check_zov_alias(z_views, zv_next);
9127 #undef zalloc_check_zov_alias
9128
9129 __startup_func
9130 void
zone_view_startup_init(struct zone_view_startup_spec * spec)9131 zone_view_startup_init(struct zone_view_startup_spec *spec)
9132 {
9133 struct kalloc_heap *heap = NULL;
9134 zone_view_t zv = spec->zv_view;
9135 zone_t z;
9136 zone_security_flags_t zsflags;
9137
9138 switch (spec->zv_heapid) {
9139 case KHEAP_ID_DEFAULT:
9140 panic("%s: Use KALLOC_TYPE_DEFINE for zone view %s instead"
9141 "of ZONE_VIEW_DEFINE as it is from default kalloc heap",
9142 __func__, zv->zv_name);
9143 __builtin_unreachable();
9144 case KHEAP_ID_DATA_BUFFERS:
9145 heap = KHEAP_DATA_BUFFERS;
9146 break;
9147 default:
9148 heap = NULL;
9149 }
9150
9151 if (heap) {
9152 z = kalloc_heap_zone_for_size(heap, spec->zv_size);
9153 } else {
9154 z = *spec->zv_zone;
9155 assert(spec->zv_size <= zone_elem_size(z));
9156 }
9157
9158 assert(z);
9159
9160 zv->zv_zone = z;
9161 zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats);
9162 zv->zv_next = z->z_views;
9163 zsflags = zone_security_config(z);
9164 if (z->z_views == NULL && zsflags.z_kheap_id == KHEAP_ID_NONE) {
9165 /*
9166 * count the raw view for zones not in a heap,
9167 * kalloc_heap_init() already counts it for its members.
9168 */
9169 zone_view_count += 2;
9170 } else {
9171 zone_view_count += 1;
9172 }
9173 z->z_views = zv;
9174 }
9175
/*
 * zone_create: the common zone creation entry point.
 * Thin wrapper over zone_create_ext() with no fixed zone ID and
 * no extended-setup callback.
 */
zone_t
zone_create(
	const char             *name,
	vm_size_t               size,
	zone_create_flags_t     flags)
{
	return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL);
}
9184
/* each ZC_RO_ID__* value maps 1:1 onto a reserved external read-only zone ID */
static_assert(ZONE_ID__LAST_RO_EXT - ZONE_ID__FIRST_RO_EXT == ZC_RO_ID__LAST);

/*
 * zone_create_ro: create a read-only zone at the fixed zone ID reserved
 * for zc_ro_id, and return that zone ID (not the zone pointer).
 */
zone_id_t
zone_create_ro(
	const char             *name,
	vm_size_t               size,
	zone_create_flags_t     flags,
	zone_create_ro_id_t     zc_ro_id)
{
	assert(zc_ro_id <= ZC_RO_ID__LAST);
	zone_id_t reserved_zid = ZONE_ID__FIRST_RO_EXT + zc_ro_id;
	(void)zone_create_ext(name, size, ZC_READONLY | flags, reserved_zid, NULL);
	return reserved_zid;
}
9199
/*
 * zinit: legacy zone creation interface.
 *
 * Creates a destructible zone and caps its wired footprint so it can
 * hold roughly max/size elements; the alloc argument is ignored.
 */
zone_t
zinit(
	vm_size_t       size,           /* the size of an element */
	vm_size_t       max,            /* maximum memory to use */
	vm_size_t       alloc __unused, /* allocation size */
	const char      *name)          /* a name for the zone */
{
	zone_t z = zone_create(name, size, ZC_DESTRUCTIBLE);
	z->z_wired_max = zone_alloc_pages_for_nelems(z, max / size);
	return z;
}
9211
/*
 * zdestroy: tear down a destructible zone.
 *
 * Reclaims every chunk back to the VM, then marks the zone slot as
 * destroyed so its index may be reused. The zone must hold no live
 * allocations at this point (panics otherwise on non-KASAN builds).
 */
void
zdestroy(zone_t z)
{
	unsigned int zindex = zone_index(z);
	zone_security_flags_t zsflags = zone_security_array[zindex];

	/* TH_OPT_ZONE_PRIV grants this thread zone-privileged reclaim rights */
	current_thread()->options |= TH_OPT_ZONE_PRIV;
	lck_mtx_lock(&zone_gc_lock);

	zone_reclaim(z, ZONE_RECLAIM_DESTROY);

	lck_mtx_unlock(&zone_gc_lock);
	current_thread()->options &= ~TH_OPT_ZONE_PRIV;

#if CONFIG_GZALLOC
	if (__improbable(z->z_gzalloc_tracked)) {
		/* If the zone is gzalloc managed dump all the elements in the free cache */
		gzalloc_empty_free_cache(z);
	}
#endif

	zone_lock(z);

	if (!zone_submap_is_sequestered(zsflags)) {
		/*
		 * Non-sequestered submaps return their VA: pop each chunk off
		 * the VA queue, scrub its metadata, and free the mapping.
		 * The zone lock is dropped around kmem_free() which may block.
		 */
		while (!zone_pva_is_null(z->z_pageq_va)) {
			struct zone_page_metadata *meta;

			zone_counter_sub(z, z_va_cur, z->z_percpu ? 1 : z->z_chunk_pages);
			meta = zone_meta_queue_pop(z, &z->z_pageq_va);
			assert(meta->zm_chunk_len <= ZM_CHUNK_LEN_MAX);
			bzero(meta, sizeof(*meta) * z->z_chunk_pages);
			zone_unlock(z);
			kmem_free(zone_submap(zsflags), zone_meta_to_addr(meta),
			    ptoa(z->z_chunk_pages));
			zone_lock(z);
		}
	}

#if !KASAN_ZALLOC
	/* Assert that all counts are zero */
	if (z->z_elems_avail || z->z_elems_free || zone_size_wired(z) ||
	    (z->z_va_cur && !zone_submap_is_sequestered(zsflags))) {
		panic("zdestroy: Zone %s%s isn't empty at zdestroy() time",
		    zone_heap_name(z), z->z_name);
	}

	/* consistency check: make sure everything is indeed empty */
	assert(zone_pva_is_null(z->z_pageq_empty));
	assert(zone_pva_is_null(z->z_pageq_partial));
	assert(zone_pva_is_null(z->z_pageq_full));
	if (!zone_submap_is_sequestered(zsflags)) {
		assert(zone_pva_is_null(z->z_pageq_va));
	}
#endif

	zone_unlock(z);

	simple_lock(&all_zones_lock, &zone_locks_grp);

	assert(!bitmap_test(zone_destroyed_bitmap, zindex));
	/* Mark the zone as empty in the bitmap */
	bitmap_set(zone_destroyed_bitmap, zindex);
	num_zones_in_use--;
	/* built-in zones are never destroyed, so some zones must remain in use */
	assert(num_zones_in_use > 0);

	simple_unlock(&all_zones_lock);
}
9279
9280 #endif /* !ZALLOC_TEST */
9281 #pragma mark zalloc module init
9282 #if !ZALLOC_TEST
9283
9284 /*
9285 * Initialize the "zone of zones" which uses fixed memory allocated
9286 * earlier in memory initialization. zone_bootstrap is called
9287 * before zone_init.
9288 */
9289 __startup_func
9290 void
zone_bootstrap(void)9291 zone_bootstrap(void)
9292 {
9293 #if DEBUG || DEVELOPMENT
9294 #if __x86_64__
9295 if (PE_parse_boot_argn("kernPOST", NULL, 0)) {
9296 /*
9297 * rdar://79781535 Disable early gaps while running kernPOST on Intel
9298 * the fp faulting code gets triggered and deadlocks.
9299 */
9300 zone_caching_disabled = 1;
9301 }
9302 #endif /* __x86_64__ */
9303 #endif /* DEBUG || DEVELOPMENT */
9304
9305 /* Validate struct zone_packed_virtual_address expectations */
9306 static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1");
9307 if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) {
9308 panic("zone_pva_t can't pack a kernel page address in 31 bits");
9309 }
9310
9311 zpercpu_early_count = ml_early_cpu_max_number() + 1;
9312
9313 /*
9314 * Initialize random used to scramble early allocations
9315 */
9316 zpercpu_foreach_cpu(cpu) {
9317 random_bool_init(&zone_bool_gen[cpu].zbg_bg);
9318 }
9319
9320 #if CONFIG_PROB_GZALLOC
9321 /*
9322 * Set pgz_sample_counter on the boot CPU so that we do not sample
9323 * any allocation until PGZ has been properly setup (in pgz_init()).
9324 */
9325 *PERCPU_GET_MASTER(pgz_sample_counter) = INT32_MAX;
9326 #endif /* CONFIG_PROB_GZALLOC */
9327
9328 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
9329 /*
9330 * Randomly assign zones to one of the 4 general submaps,
9331 * and pick whether they allocate from the begining
9332 * or the end of it.
9333 *
9334 * A lot of OOB exploitation relies on precise interleaving
9335 * of specific types in the heap.
9336 *
9337 * Woops, you can't guarantee that anymore.
9338 */
9339 for (zone_id_t i = 1; i < MAX_ZONES; i++) {
9340 uint32_t r = zalloc_random_uniform32(0,
9341 ZSECURITY_CONFIG_GENERAL_SUBMAPS * 2);
9342
9343 zone_security_array[i].z_submap_from_end = (r & 1);
9344 zone_security_array[i].z_submap_idx += (r >> 1);
9345 }
9346 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
9347
9348 thread_call_setup_with_options(&zone_expand_callout,
9349 zone_expand_async, NULL, THREAD_CALL_PRIORITY_HIGH,
9350 THREAD_CALL_OPTIONS_ONCE);
9351
9352 thread_call_setup_with_options(&zone_defrag_callout,
9353 zone_defrag_async, NULL, THREAD_CALL_PRIORITY_USER,
9354 THREAD_CALL_OPTIONS_ONCE);
9355 }
9356
#define ZONE_GUARD_SIZE (64UL << 10)    /* 64KiB of guard VA, split between both ends of a submap */
9358
9359 #if __LP64__
9360 static inline vm_offset_t
zone_restricted_va_max(void)9361 zone_restricted_va_max(void)
9362 {
9363 vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR);
9364 vm_offset_t vm_page_max = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR);
9365
9366 return trunc_page(MIN(compressor_max, vm_page_max));
9367 }
9368 #else
9369 static inline vm_offset_t
zone_restricted_va_max(void)9370 zone_restricted_va_max(void)
9371 {
9372 return 0;
9373 }
9374 #endif
9375
9376 __startup_func
9377 static void
zone_tunables_fixup(void)9378 zone_tunables_fixup(void)
9379 {
9380 int wdt = 0;
9381
9382 #if CONFIG_PROB_GZALLOC && (DEVELOPMENT || DEBUG)
9383 if (!PE_parse_boot_argn("pgz", NULL, 0) &&
9384 PE_parse_boot_argn("pgz1", NULL, 0)) {
9385 /*
9386 * if pgz1= was used, but pgz= was not,
9387 * then the more specific pgz1 takes precedence.
9388 */
9389 pgz_all = false;
9390 }
9391 #endif
9392
9393 if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) {
9394 zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
9395 }
9396 if (zc_magazine_size > PAGE_SIZE / ZONE_MIN_ELEM_SIZE) {
9397 zc_magazine_size = (uint16_t)(PAGE_SIZE / ZONE_MIN_ELEM_SIZE);
9398 }
9399 if (PE_parse_boot_argn("wdt", &wdt, sizeof(wdt)) && wdt == -1 &&
9400 !PE_parse_boot_argn("zet", NULL, 0)) {
9401 zone_exhausted_timeout = -1;
9402 }
9403 }
9404 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup);
9405
9406 __startup_func
9407 static vm_size_t
zone_phys_size_max(void)9408 zone_phys_size_max(void)
9409 {
9410 vm_size_t zsize;
9411 vm_size_t zsizearg;
9412
9413 if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) {
9414 zsize = zsizearg * (1024ULL * 1024);
9415 } else {
9416 /* Set target zone size as 1/4 of physical memory */
9417 zsize = (vm_size_t)(sane_size >> 2);
9418 #if defined(__LP64__)
9419 zsize += zsize >> 1;
9420 #endif /* __LP64__ */
9421 }
9422
9423 if (zsize < CONFIG_ZONE_MAP_MIN) {
9424 zsize = CONFIG_ZONE_MAP_MIN; /* Clamp to min */
9425 }
9426 if (zsize > sane_size >> 1) {
9427 zsize = (vm_size_t)(sane_size >> 1); /* Clamp to half of RAM max */
9428 }
9429 if (zsizearg == 0 && zsize > ZONE_MAP_MAX) {
9430 /* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */
9431 printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n",
9432 (uintptr_t)zsize, (uintptr_t)ZONE_MAP_MAX);
9433 zsize = ZONE_MAP_MAX;
9434 }
9435
9436 return (vm_size_t)trunc_page(zsize);
9437 }
9438
/*
 * zone_init_allocate_va: reserve `size` bytes of kernel VA with no
 * backing memory and VM_PROT_NONE, for later linear carving by the
 * zone submaps. Panics on failure -- startup use only.
 */
__startup_func
static struct kmem_range
zone_init_allocate_va(vm_map_address_t addr, vm_size_t size, bool random)
{
	int vm_alloc_flags = VM_FLAGS_ANYWHERE;
	struct kmem_range r;
	kern_return_t kr;
	vm_map_entry_t entry;

	if (random) {
		vm_alloc_flags |= VM_FLAGS_RANDOM_ADDR;
	}

	/* vm_map_enter() consumes this kernel_object reference */
	vm_object_reference(kernel_object);

	kr = vm_map_enter(kernel_map, &addr, size, 0,
	    vm_alloc_flags, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_ZONE,
	    kernel_object, addr, FALSE, VM_PROT_NONE, VM_PROT_NONE,
	    VM_INHERIT_NONE);

	if (KERN_SUCCESS != kr) {
		panic("vm_map_enter(0x%zx) failed: %d", (size_t)size, kr);
	}

	/* keep the entry's object offset in sync with its address */
	vm_map_lookup_entry(kernel_map, addr, &entry);
	VME_OFFSET_SET(entry, addr);

	r.min_address = (vm_offset_t)addr;
	r.max_address = (vm_offset_t)addr + size;
	return r;
}
9470
/*
 * zone_submap_init: create the zone submap at index `idx`, sized as
 * zone_sub_map_numer / *remaining_denom of the remaining VA budget,
 * with a permanent guard mapping at each end.
 *
 * On return, *submap_min, *remaining_size and *remaining_denom are
 * advanced so the next submap carves from what is left.
 */
__startup_func
static void
zone_submap_init(
	vm_offset_t *submap_min,
	zone_submap_idx_t idx,
	uint64_t zone_sub_map_numer,
	uint64_t *remaining_denom,
	vm_offset_t *remaining_size)
{
	vm_map_create_options_t vmco;
	vm_map_address_t addr;
	vm_offset_t submap_start, submap_end;
	vm_size_t submap_size;
	vm_map_t submap;
	vm_prot_t prot = VM_PROT_DEFAULT;
	vm_prot_t prot_max = VM_PROT_ALL;
	kern_return_t kr;

	submap_size = trunc_page(zone_sub_map_numer * *remaining_size /
	    *remaining_denom);
	submap_start = *submap_min;
	submap_end = submap_start + submap_size;

#if defined(__LP64__)
	if (idx == Z_SUBMAP_IDX_VM) {
		/* pointers into the VM submap must remain packable */
		vm_packing_verify_range("vm_compressor",
		    submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR));
		vm_packing_verify_range("vm_page",
		    submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR));
	}
#endif /* defined(__LP64__) */

	vmco = VM_MAP_CREATE_NEVER_FAULTS;
	if (!zone_submap_is_sequestered(idx)) {
		vmco |= VM_MAP_CREATE_DISABLE_HOLELIST;
	}

	vm_map_will_allocate_early_map(&zone_submaps[idx]);
	submap = kmem_suballoc(kernel_map, submap_min, submap_size, vmco,
	    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_PERMANENT | KMS_NOFAIL,
	    VM_KERN_MEMORY_ZONE).kmr_submap;

	if (idx == Z_SUBMAP_IDX_READ_ONLY) {
		/* the read-only submap is mapped with no access by default */
		zone_info.zi_ro_range.min_address = submap_start;
		zone_info.zi_ro_range.max_address = submap_end;
		prot_max = prot = VM_PROT_NONE;
	}

	/* permanent guard entry at the bottom of the submap... */
	addr = submap_start;
	kr = vm_map_enter(submap, &addr, ZONE_GUARD_SIZE / 2, 0,
	    VM_FLAGS_FIXED | VM_FLAGS_PERMANENT,
	    VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_ZONE,
	    kernel_object, addr, FALSE, prot, prot_max, VM_INHERIT_NONE);
	if (kr != KERN_SUCCESS) {
		panic("ksubmap[%s]: failed to make first entry (%d)",
		    zone_submaps_names[idx], kr);
	}

	/* ... and at the top */
	addr = submap_end - ZONE_GUARD_SIZE / 2;
	kr = vm_map_enter(submap, &addr, ZONE_GUARD_SIZE / 2, 0,
	    VM_FLAGS_FIXED | VM_FLAGS_PERMANENT,
	    VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_ZONE,
	    kernel_object, addr, FALSE, prot, prot_max, VM_INHERIT_NONE);
	if (kr != KERN_SUCCESS) {
		panic("ksubmap[%s]: failed to make last entry (%d)",
		    zone_submaps_names[idx], kr);
	}

#if DEBUG || DEVELOPMENT
	printf("zone_init: map %-5s %p:%p (%zuM)\n",
	    zone_submaps_names[idx], (void *)submap_start, (void *)submap_end,
	    (size_t)submap_size >> 20);
#endif /* DEBUG || DEVELOPMENT */

	zone_submaps[idx] = submap;
	*submap_min = submap_end;
	*remaining_size -= submap_size;
	*remaining_denom -= zone_sub_map_numer;
}
9550
9551 static inline void
zone_pva_relocate(zone_pva_t * pva,uint32_t delta)9552 zone_pva_relocate(zone_pva_t *pva, uint32_t delta)
9553 {
9554 if (!zone_pva_is_null(*pva) && !zone_pva_is_queue(*pva)) {
9555 pva->packed_address += delta;
9556 }
9557 }
9558
9559 /*
9560 * Allocate metadata array and migrate bootstrap initial metadata and memory.
9561 */
9562 __startup_func
9563 static void
zone_metadata_init(void)9564 zone_metadata_init(void)
9565 {
9566 vm_map_t vm_map = zone_submaps[Z_SUBMAP_IDX_VM];
9567 vm_map_entry_t first;
9568
9569 struct kmem_range meta_r, bits_r, early_r;
9570 vm_size_t early_sz;
9571 vm_offset_t reloc_base;
9572
9573 /*
9574 * Step 1: Allocate the metadata + bitmaps range
9575 *
9576 * Allocations can't be smaller than 8 bytes, which is 128b / 16B per 1k
9577 * of physical memory (16M per 1G).
9578 *
9579 * Let's preallocate for the worst to avoid weird panics.
9580 */
9581 vm_map_will_allocate_early_map(&zone_meta_map);
9582 meta_r = zone_kmem_suballoc(zone_info.zi_meta_range.min_address,
9583 zone_meta_size + zone_bits_size, VM_FLAGS_FIXED_RANGE_SUBALLOC,
9584 VM_KERN_MEMORY_ZONE, &zone_meta_map);
9585 meta_r.min_address += ZONE_GUARD_SIZE;
9586 meta_r.max_address -= ZONE_GUARD_SIZE;
9587 bits_r.max_address = meta_r.max_address;
9588 meta_r.max_address -= zone_bits_size;
9589 bits_r.min_address = meta_r.max_address;
9590
9591 #if DEBUG || DEVELOPMENT
9592 printf("zone_init: metadata %p:%p (%zuK)\n",
9593 (void *)meta_r.min_address, (void *)meta_r.max_address,
9594 (size_t)kmem_range_size(&meta_r) >> 10);
9595 printf("zone_init: metabits %p:%p (%zuK)\n",
9596 (void *)bits_r.min_address, (void *)bits_r.max_address,
9597 (size_t)kmem_range_size(&bits_r) >> 10);
9598 #endif /* DEBUG || DEVELOPMENT */
9599
9600 bits_r.min_address = (bits_r.min_address + ZBA_CHUNK_SIZE - 1) & -ZBA_CHUNK_SIZE;
9601 bits_r.max_address = bits_r.max_address & -ZBA_CHUNK_SIZE;
9602
9603 /*
9604 * Step 2: Install new ranges.
9605 * Relocate metadata and bits.
9606 */
9607 early_r = zone_info.zi_map_range;
9608 early_sz = kmem_range_size(&early_r);
9609
9610 zone_info.zi_map_range = zone_map_range;
9611 zone_info.zi_meta_range = meta_r;
9612 zone_info.zi_bits_range = bits_r;
9613 zone_info.zi_meta_base = (struct zone_page_metadata *)meta_r.min_address -
9614 zone_pva_from_addr(zone_map_range.min_address).packed_address;
9615
9616 vm_map_lock(vm_map);
9617 first = vm_map_first_entry(vm_map);
9618 reloc_base = first->vme_end;
9619 first->vme_end += early_sz;
9620 vm_map->size += early_sz;
9621 vm_map_unlock(vm_map);
9622
9623 struct zone_page_metadata *early_meta = zone_early_meta_array_startup;
9624 struct zone_page_metadata *new_meta = zone_meta_from_addr(reloc_base);
9625 vm_offset_t reloc_delta = reloc_base - early_r.min_address;
9626 /* this needs to sign extend */
9627 uint32_t pva_delta = (uint32_t)((intptr_t)reloc_delta >> PAGE_SHIFT);
9628
9629 zone_meta_populate(reloc_base, early_sz);
9630 memcpy(new_meta, early_meta,
9631 atop(early_sz) * sizeof(struct zone_page_metadata));
9632 for (uint32_t i = 0; i < atop(early_sz); i++) {
9633 zone_pva_relocate(&new_meta[i].zm_page_next, pva_delta);
9634 zone_pva_relocate(&new_meta[i].zm_page_prev, pva_delta);
9635 }
9636
9637 static_assert(ZONE_ID_VM_MAP_ENTRY == ZONE_ID_VM_MAP + 1);
9638 static_assert(ZONE_ID_VM_MAP_HOLES == ZONE_ID_VM_MAP + 2);
9639
9640 for (zone_id_t zid = ZONE_ID_VM_MAP; zid <= ZONE_ID_VM_MAP_HOLES; zid++) {
9641 zone_pva_relocate(&zone_array[zid].z_pageq_partial, pva_delta);
9642 zone_pva_relocate(&zone_array[zid].z_pageq_full, pva_delta);
9643 }
9644
9645 zba_populate(0);
9646 memcpy(zba_base_header(), zba_chunk_startup, sizeof(zba_chunk_startup));
9647
9648 /*
9649 * Step 3: Relocate the boostrap VM structs
9650 * (including rewriting their content).
9651 */
9652
9653 #if __x86_64__
9654 kernel_memory_populate(reloc_base, early_sz,
9655 KMA_KOBJECT | KMA_NOENCRYPT | KMA_NOFAIL,
9656 VM_KERN_MEMORY_OSFMK);
9657 __nosan_memcpy((void *)reloc_base, (void *)early_r.min_address, early_sz);
9658 #else
9659 for (vm_address_t addr = early_r.min_address;
9660 addr < early_r.max_address; addr += PAGE_SIZE) {
9661 pmap_paddr_t pa = kvtophys(trunc_page(addr));
9662 __assert_only kern_return_t kr;
9663
9664 kr = pmap_enter_options_addr(kernel_pmap, addr + reloc_delta,
9665 pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE,
9666 0, NULL);
9667 assert(kr == KERN_SUCCESS);
9668 }
9669 #endif
9670
9671 #if KASAN
9672 kasan_notify_address(reloc_base, early_sz);
9673 #if CONFIG_KERNEL_TBI && KASAN_TBI
9674 kasan_tbi_copy_tags(reloc_base, early_r.min_address, early_sz);
9675 #endif /* CONFIG_KERNEL_TBI && KASAN_TBI */
9676 #endif /* KASAN */
9677
9678 vm_map_relocate_early_maps(reloc_delta);
9679
9680 for (uint32_t i = 0; i < atop(early_sz); i++) {
9681 zone_id_t zid = new_meta[i].zm_index;
9682 zone_t z = &zone_array[zid];
9683 vm_size_t esize = zone_elem_size(z);
9684 vm_address_t base = reloc_base + ptoa(i);
9685 vm_address_t addr;
9686 zone_element_t ze;
9687
9688 if (new_meta[i].zm_chunk_len >= ZM_SECONDARY_PAGE) {
9689 continue;
9690 }
9691
9692 for (uint32_t eidx = 0; eidx < z->z_chunk_elems; eidx++) {
9693 ze = zone_element_encode(base, eidx);
9694 if (zone_meta_is_free(&new_meta[i], ze)) {
9695 continue;
9696 }
9697
9698 addr = zone_element_addr(z, ze, esize);
9699 #if KASAN_ZALLOC
9700 if (z->z_kasan_redzone) {
9701 addr = kasan_alloc(addr, esize,
9702 esize - 2 * z->z_kasan_redzone,
9703 z->z_kasan_redzone);
9704 } else {
9705 kasan_poison_range(addr, esize, ASAN_VALID);
9706 }
9707 #endif
9708 vm_map_relocate_early_elem(zid, addr, reloc_delta);
9709 }
9710 }
9711
9712 #if !__x86_64__
9713 pmap_remove(kernel_pmap, early_r.min_address, early_r.max_address);
9714 #endif
9715 }
9716
/*
 * Share (in percent) of the zone VA budget assigned to each submap;
 * the entries must sum to 100 (asserted in zone_set_map_sizes()).
 */
uint16_t submap_ratios[Z_SUBMAP_IDX_COUNT] = {
#if ZSECURITY_CONFIG(READ_ONLY)
	[Z_SUBMAP_IDX_VM]               = 15,
	[Z_SUBMAP_IDX_READ_ONLY]        = 5,
#else
	[Z_SUBMAP_IDX_VM]               = 20,
#endif /* !ZSECURITY_CONFIG(READ_ONLY) */
#if ZSECURITY_CONFIG(SUBMAP_USER_DATA) && ZSECURITY_CONFIG(SAD_FENG_SHUI)
	[Z_SUBMAP_IDX_GENERAL_0]        = 15,
	[Z_SUBMAP_IDX_GENERAL_1]        = 15,
	[Z_SUBMAP_IDX_GENERAL_2]        = 15,
	[Z_SUBMAP_IDX_GENERAL_3]        = 15,
	[Z_SUBMAP_IDX_DATA]             = 20,
#elif ZSECURITY_CONFIG(SUBMAP_USER_DATA)
	[Z_SUBMAP_IDX_GENERAL_0]        = 40,
	[Z_SUBMAP_IDX_DATA]             = 40,
#elif ZSECURITY_CONFIG(SAD_FENG_SHUI)
#error invalid configuration: SAD_FENG_SHUI requires SUBMAP_USER_DATA
#else
	[Z_SUBMAP_IDX_GENERAL_0]        = 80,
#endif /* ZSECURITY_CONFIG(SUBMAP_USER_DATA) && ZSECURITY_CONFIG(SAD_FENG_SHUI) */
};
9739
/*
 * zone_set_map_sizes: derive the global zone VA, metadata and bitmap
 * sizes from the physical budget and submap_ratios[], before the
 * KMEM ranges below are registered.
 */
__startup_func
static void
zone_set_map_sizes(void)
{
	uint64_t denom = 0;
	zone_pages_wired_max = (uint32_t)atop(zone_phys_size_max());
	for (unsigned idx = 0; idx < Z_SUBMAP_IDX_COUNT; idx++) {
		denom += submap_ratios[idx];
	}
	/* submap_ratios[] is expressed in percent */
	assert(denom == 100);

#if __LP64__
	zone_map_size = ZONE_MAP_VA_SIZE_LP64;
#else
	/* 32-bit: the VM submap gets its ratio of VA on top of the wired budget */
	zone_map_size = ptoa(zone_pages_wired_max *
	    (denom + submap_ratios[Z_SUBMAP_IDX_VM]) / denom);
#endif

	/*
	 * Declare restrictions on zone max
	 */
	vm_offset_t restricted_va_max = zone_restricted_va_max();
	vm_offset_t vm_submap_size = round_page(
		(submap_ratios[Z_SUBMAP_IDX_VM] * zone_map_size) / denom);

#if CONFIG_PROB_GZALLOC
	vm_submap_size += pgz_get_size();
#endif /* CONFIG_PROB_GZALLOC */
	/* keep the VM submap below the pointer-packing VA ceiling */
	if (os_sub_overflow(restricted_va_max, vm_submap_size,
	    &zone_map_range.min_address)) {
		zone_map_range.min_address = 0;
	}

	zone_meta_size = round_page(atop(zone_map_size) *
	    sizeof(struct zone_page_metadata)) + ZONE_GUARD_SIZE * 2;
	zone_bits_size = round_page(16 * (ptoa(zone_pages_wired_max) >> 10));

#if VM_TAG_SIZECLASSES
	if (zone_tagging_on) {
		zone_tagbase_map_size = round_page(atop(zone_map_size) * sizeof(uint32_t));
		zone_tags_map_size = 2048 * 1024 * sizeof(vm_tag_t);
	}
#endif /* VM_TAG_SIZECLASSES */
}
STARTUP(KMEM, STARTUP_RANK_FIRST, zone_set_map_sizes);
9785
9786 /*
9787 * Can't use zone_info.zi_map_range at this point as it is being used to
9788 * store the range of early pmap memory that was stolen to bootstrap the
9789 * necessary VM zones.
9790 */
9791 KMEM_RANGE_REGISTER_DYNAMIC(zones, &zone_map_range, ^() {
9792 return zone_map_size;
9793 });
9794 KMEM_RANGE_REGISTER_DYNAMIC(zone_meta, &zone_info.zi_meta_range, ^() {
9795 return zone_meta_size + zone_bits_size;
9796 });
9797
9798 #if VM_TAG_SIZECLASSES
9799 KMEM_RANGE_REGISTER_DYNAMIC(zone_tagbase, &zone_tagbase_range, ^() {
9800 return zone_tagbase_map_size;
9801 });
9802 KMEM_RANGE_REGISTER_DYNAMIC(zone_tags, &zone_tags_range, ^() {
9803 return zone_tags_map_size;
9804 });
9805 #endif /* VM_TAG_SIZECLASSES */
9806
9807
9808 /*
9809 * Global initialization of Zone Allocator.
9810 * Runs after zone_bootstrap.
9811 */
9812 __startup_func
9813 static void
zone_init(void)9814 zone_init(void)
9815 {
9816 vm_size_t remaining_size;
9817 vm_offset_t submap_min = 0;
9818 uint64_t denom = 100;
9819 /*
9820 * And now allocate the various pieces of VA and submaps.
9821 */
9822
9823 #if !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT)
9824 /*
9825 * Make a first allocation of contiguous VA, that we'll deallocate,
9826 * and we'll carve-out memory in that range again linearly.
9827 * The kernel is stil single threaded at this stage. This doesn't need
9828 * to be done on platforms that declare and process kmem_claims as that
9829 * process will create a temporary mapping for the required range.
9830 */
9831 zone_map_range = zone_init_allocate_va(0, zone_map_size, false);
9832 #endif /* !ZSECURITY_CONFIG(KERNEL_DATA_SPLIT) */
9833
9834 submap_min = zone_map_range.min_address;
9835 remaining_size = zone_map_size;
9836
9837 #if CONFIG_PROB_GZALLOC
9838 vm_size_t pgz_size = pgz_get_size();
9839
9840 vm_map_will_allocate_early_map(&pgz_submap);
9841 zone_info.zi_pgz_range = zone_kmem_suballoc(submap_min, pgz_size,
9842 VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
9843 VM_KERN_MEMORY_ZONE, &pgz_submap);
9844
9845 submap_min += pgz_size;
9846 remaining_size -= pgz_size;
9847 #if DEBUG || DEVELOPMENT
9848 printf("zone_init: pgzalloc %p:%p (%zuM) [%d slots]\n",
9849 (void *)zone_info.zi_pgz_range.min_address,
9850 (void *)zone_info.zi_pgz_range.max_address,
9851 (size_t)pgz_size >> 20, pgz_slots);
9852 #endif /* DEBUG || DEVELOPMENT */
9853 #endif /* CONFIG_PROB_GZALLOC */
9854
9855 /*
9856 * Allocate the submaps
9857 */
9858 for (zone_submap_idx_t idx = 0; idx < Z_SUBMAP_IDX_COUNT; idx++) {
9859 if (submap_ratios[idx] == 0) {
9860 zone_submaps[idx] = VM_MAP_NULL;
9861 } else {
9862 zone_submap_init(&submap_min, idx, submap_ratios[idx],
9863 &denom, &remaining_size);
9864 }
9865 }
9866
9867 zone_metadata_init();
9868
9869 #if VM_TAG_SIZECLASSES
9870 if (zone_tagging_on) {
9871 zone_tagging_init();
9872 }
9873 #endif
9874 #if CONFIG_GZALLOC
9875 gzalloc_init();
9876 #endif
9877
9878 zone_create_flags_t kma_flags = ZC_NOCACHING |
9879 ZC_NOGC | ZC_NOGZALLOC | ZC_NOCALLOUT |
9880 ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE | ZC_VM_LP64;
9881
9882 (void)zone_create_ext("vm.permanent", 1, kma_flags,
9883 ZONE_ID_PERMANENT, ^(zone_t z) {
9884 z->z_permanent = true;
9885 z->z_elem_size = 1;
9886 });
9887 (void)zone_create_ext("vm.permanent.percpu", 1,
9888 kma_flags | ZC_PERCPU, ZONE_ID_PERCPU_PERMANENT, ^(zone_t z) {
9889 z->z_permanent = true;
9890 z->z_elem_size = 1;
9891 });
9892
9893 /*
9894 * Now migrate the startup statistics into their final storage.
9895 */
9896 int cpu = cpu_number();
9897 zone_index_foreach(idx) {
9898 zone_t tz = &zone_array[idx];
9899
9900 if (tz->z_stats == __zpcpu_mangle_for_boot(&zone_stats_startup[idx])) {
9901 zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats);
9902
9903 *zpercpu_get_cpu(zs, cpu) = *zpercpu_get_cpu(tz->z_stats, cpu);
9904 tz->z_stats = zs;
9905 }
9906 }
9907
9908 #if VM_TAG_SIZECLASSES
9909 if (zone_tagging_on) {
9910 vm_allocation_zones_init();
9911 }
9912 #endif
9913 }
9914 STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init);
9915
/*
 * zone_cache_bootstrap: create the magazine zone, then retroactively
 * enable per-CPU caching (and logging) for zones created before the
 * cache machinery existed.
 */
__startup_func
static void
zone_cache_bootstrap(void)
{
	zone_t magzone;

	/* the magazine zone itself can't use magazines (ZC_NOCACHING-free but created first) */
	magzone = zone_create("zcc_magazine_zone", sizeof(struct zone_magazine) +
	    zc_mag_size() * sizeof(zone_element_t),
	    ZC_VM_LP64 | ZC_NOGZALLOC | ZC_KASAN_NOREDZONE | ZC_KASAN_NOQUARANTINE |
	    ZC_SEQUESTER | ZC_CACHING | ZC_ZFREE_CLEARMEM | ZC_PGZ_USE_GUARDS);
	magzone->z_elems_rsv = (uint16_t)(2 * zpercpu_count());

	/* publishing the zone pointer is the "caching is available" signal */
	os_atomic_store(&zc_magazine_zone, magzone, compiler_acq_rel);

	/*
	 * Now that we are initialized, we can enable zone caching for zones that
	 * were made before zcache_bootstrap() was called.
	 *
	 * The system is still single threaded so we don't need to take the lock.
	 */
	zone_index_foreach(i) {
		zone_t z = &zone_array[i];
		if (z->z_pcpu_cache) {
			/* clear the boot-time placeholder before re-enabling for real */
			z->z_pcpu_cache = NULL;
			zone_enable_caching(z);
		}
#if ZONE_ENABLE_LOGGING
		if (!z->z_gzalloc_tracked && z->z_self == z) {
			zone_setup_logging(z);
		}
#endif /* ZONE_ENABLE_LOGGING */
	}
}
STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, zone_cache_bootstrap);
9950
/*
 * Called once the first user process exists: early boot is over,
 * so zone caching may now be enabled globally.
 */
void
zalloc_first_proc_made(void)
{
	zone_caching_disabled = 0;
}
9956
/*
 * zone_early_mem_init: steal `size` bytes of physical memory for the
 * zones that bootstrap the VM itself, and point the zone metadata and
 * bitmap machinery at static startup storage. Returns the stolen base.
 */
__startup_func
vm_offset_t
zone_early_mem_init(vm_size_t size)
{
	vm_offset_t mem;

	/* the static startup metadata array must cover every stolen page */
	assert3u(atop(size), <=, ZONE_EARLY_META_INLINE_COUNT);

	/*
	 * The zone that is used early to bring up the VM is stolen here.
	 *
	 * When the zone subsystem is actually initialized,
	 * zone_metadata_init() will be called, and those pages
	 * and the elements they contain, will be relocated into
	 * the VM submap (even for architectures when those zones
	 * do not live there).
	 */
#if __x86_64__
	assert3u(size, <=, sizeof(zone_early_pages_to_cram));
	mem = (vm_offset_t)zone_early_pages_to_cram;
#else
	mem = (vm_offset_t)pmap_steal_memory(size);
#endif

	/* bias the metadata base so pva indexing resolves into the startup array */
	zone_info.zi_meta_base = zone_early_meta_array_startup -
	    zone_pva_from_addr(mem).packed_address;
	zone_info.zi_map_range.min_address = mem;
	zone_info.zi_map_range.max_address = mem + size;

	/* the bitmap allocator runs out of a single static chunk for now */
	zone_info.zi_bits_range = (struct kmem_range){
		.min_address = (vm_offset_t)zba_chunk_startup,
		.max_address = (vm_offset_t)zba_chunk_startup +
	    sizeof(zba_chunk_startup),
	};
	zba_init_chunk(0);

	return mem;
}
9995
9996 #endif /* !ZALLOC_TEST */
9997 #pragma mark - tests
9998 #if DEBUG || DEVELOPMENT
9999
10000 /*
10001 * Used for sysctl zone tests that aren't thread-safe. Ensure only one
10002 * thread goes through at a time.
10003 *
10004 * Or we can end up with multiple test zones (if a second zinit() comes through
10005 * before zdestroy()), which could lead us to run out of zones.
10006 */
10007 static bool any_zone_test_running = FALSE;
10008
10009 static uintptr_t *
zone_copy_allocations(zone_t z,uintptr_t * elems,zone_pva_t page_index)10010 zone_copy_allocations(zone_t z, uintptr_t *elems, zone_pva_t page_index)
10011 {
10012 vm_offset_t elem_size = zone_elem_size(z);
10013 vm_offset_t base;
10014 struct zone_page_metadata *meta;
10015
10016 while (!zone_pva_is_null(page_index)) {
10017 base = zone_pva_to_addr(page_index) + zone_oob_offs(z);
10018 meta = zone_pva_to_meta(page_index);
10019
10020 if (meta->zm_inline_bitmap) {
10021 for (size_t i = 0; i < meta->zm_chunk_len; i++) {
10022 uint32_t map = meta[i].zm_bitmap;
10023
10024 for (; map; map &= map - 1) {
10025 *elems++ = INSTANCE_PUT(base +
10026 elem_size * __builtin_clz(map));
10027 }
10028 base += elem_size * 32;
10029 }
10030 } else {
10031 uint32_t order = zba_bits_ref_order(meta->zm_bitmap);
10032 bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
10033 for (size_t i = 0; i < (1u << order); i++) {
10034 uint64_t map = bits[i];
10035
10036 for (; map; map &= map - 1) {
10037 *elems++ = INSTANCE_PUT(base +
10038 elem_size * __builtin_clzll(map));
10039 }
10040 base += elem_size * 64;
10041 }
10042 }
10043
10044 page_index = meta->zm_page_next;
10045 }
10046 return elems;
10047 }
10048
/*
 * zone_leaks: for the first zone whose name matches zoneName, snapshot
 * all recorded elements, scan memory for references to them, and report
 * per-backtrace leak counts through `proc`.
 */
kern_return_t
zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc)
{
	zone_t zone = NULL;
	uintptr_t * array;
	uintptr_t * next;
	uintptr_t element;
	uint32_t idx, count, found;
	uint32_t nobtcount;
	uint32_t elemSize;
	size_t maxElems;

	zone_foreach(z) {
		if (!strncmp(zoneName, z->z_name, nameLen)) {
			zone = z;
			break;
		}
	}
	if (zone == NULL) {
		return KERN_INVALID_NAME;
	}

	elemSize = (uint32_t)zone_elem_size(zone);
	/* round up to an even element count */
	maxElems = (zone->z_elems_avail + 1) & ~1ul;

	array = kalloc_type_tag(uintptr_t, maxElems, VM_KERN_MEMORY_DIAG);
	if (array == NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}

	/* snapshot the element addresses under the zone lock */
	zone_lock(zone);

	next = array;
	next = zone_copy_allocations(zone, next, zone->z_pageq_partial);
	next = zone_copy_allocations(zone, next, zone->z_pageq_full);
	count = (uint32_t)(next - array);

	zone_unlock(zone);

	/* mark elements that something still points at (sets kInstanceFlagReferenced) */
	zone_leaks_scan(array, count, (uint32_t)zone_elem_size(zone), &found);
	assert(found <= count);

	/*
	 * NOTE(review): this loop computes `element` but stores nothing and has
	 * no side effect -- it looks like a leftover; confirm whether it can be
	 * removed or was meant to normalize array[idx] in place.
	 */
	for (idx = 0; idx < count; idx++) {
		element = array[idx];
		if (kInstanceFlagReferenced & element) {
			continue;
		}
		element = INSTANCE_PUT(element) & ~kInstanceFlags;
	}

#if ZONE_ENABLE_LOGGING
	if (zone->z_btlog && !corruption_debug_flag) {
		// btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found
		static_assert(sizeof(vm_address_t) == sizeof(uintptr_t));
		btlog_copy_backtraces_for_elements(zone->z_btlog,
		    (vm_address_t *)array, &count, elemSize, proc);
	}
#endif /* ZONE_ENABLE_LOGGING */

	/* whatever is left unreferenced and without a backtrace is reported in bulk */
	for (nobtcount = idx = 0; idx < count; idx++) {
		element = array[idx];
		if (!element) {
			continue;
		}
		if (kInstanceFlagReferenced & element) {
			continue;
		}
		nobtcount++;
	}
	if (nobtcount) {
		proc(nobtcount, elemSize, BTREF_NULL);
	}

	kfree_type(uintptr_t, maxElems, array);
	return KERN_SUCCESS;
}
10125
/*
 * sysctl test: basic lifecycle coverage for the read-only zone allocator
 * (zone ZONE_ID__FIRST_RO): allocate an element, verify write protection,
 * mutate it through the dedicated zalloc_ro_mut / zalloc_ro_update_elem
 * accessors, and free it.  Three iterations, then a full drain GC.
 *
 * On 32-bit configurations (ZSECURITY_CONFIG(READ_ONLY) off) the same API
 * sequence is exercised without the write-protection checks.
 *
 * Returns 0 on success (and sets *out to 1), EALREADY if another zone
 * test is already running.  Check failures fire asserts.
 */
static int
zone_ro_basic_test_run(__unused int64_t in, int64_t *out)
{
	zone_security_flags_t zsflags;
	uint32_t x = 4;
	uint32_t *test_ptr;

	/* Serialize against all other zone tests. */
	if (os_atomic_xchg(&any_zone_test_running, true, relaxed)) {
		printf("zone_ro_basic_test: Test already running.\n");
		return EALREADY;
	}

	zsflags = zone_security_array[ZONE_ID__FIRST_RO];

	for (int i = 0; i < 3; i++) {
#if ZSECURITY_CONFIG(READ_ONLY)
		/* Basic Test: Create int zone, zalloc int, modify value, free int */
		printf("zone_ro_basic_test: Basic Test iteration %d\n", i);
		printf("zone_ro_basic_test: create a sub-page size zone\n");

		printf("zone_ro_basic_test: verify flags were set\n");
		assert(zsflags.z_submap_idx == Z_SUBMAP_IDX_READ_ONLY);

		printf("zone_ro_basic_test: zalloc an element\n");
		/* Parenthesized call: invoke the function itself, bypassing any macro wrapper. */
		test_ptr = (zalloc_ro)(ZONE_ID__FIRST_RO, Z_WAITOK);
		assert(test_ptr);

		printf("zone_ro_basic_test: verify elem in the right submap\n");
		zone_require_ro_range_contains(ZONE_ID__FIRST_RO, test_ptr);

		/* Direct stores must fault: the element lives in the RO submap. */
		printf("zone_ro_basic_test: verify we can't write to it\n");
		assert(verify_write(&x, test_ptr, sizeof(x)) == EFAULT);

		x = 4;
		printf("zone_ro_basic_test: test zalloc_ro_mut to assign value\n");
		zalloc_ro_mut(ZONE_ID__FIRST_RO, test_ptr, 0, &x, sizeof(uint32_t));
		assert(test_ptr);
		assert(*(uint32_t*)test_ptr == x);

		x = 5;
		printf("zone_ro_basic_test: test zalloc_ro_update_elem to assign value\n");
		zalloc_ro_update_elem(ZONE_ID__FIRST_RO, test_ptr, &x);
		assert(test_ptr);
		assert(*(uint32_t*)test_ptr == x);

		/* Mutating through the accessors must not leave the page writable. */
		printf("zone_ro_basic_test: verify we can't write to it after assigning value\n");
		assert(verify_write(&x, test_ptr, sizeof(x)) == EFAULT);

		printf("zone_ro_basic_test: free elem\n");
		/* zfree_ro() also nils out the caller's pointer, asserted below. */
		zfree_ro(ZONE_ID__FIRST_RO, test_ptr);
		assert(!test_ptr);
#else
		printf("zone_ro_basic_test: Read-only allocator n/a on 32bit platforms, test functionality of API\n");

		printf("zone_ro_basic_test: verify flags were set\n");
		assert(zsflags.z_submap_idx != Z_SUBMAP_IDX_READ_ONLY);

		printf("zone_ro_basic_test: zalloc an element\n");
		/* Parenthesized call: invoke the function itself, bypassing any macro wrapper. */
		test_ptr = (zalloc_ro)(ZONE_ID__FIRST_RO, Z_WAITOK);
		assert(test_ptr);

		x = 4;
		printf("zone_ro_basic_test: test zalloc_ro_mut to assign value\n");
		zalloc_ro_mut(ZONE_ID__FIRST_RO, test_ptr, 0, &x, sizeof(uint32_t));
		assert(test_ptr);
		assert(*(uint32_t*)test_ptr == x);

		x = 5;
		printf("zone_ro_basic_test: test zalloc_ro_update_elem to assign value\n");
		zalloc_ro_update_elem(ZONE_ID__FIRST_RO, test_ptr, &x);
		assert(test_ptr);
		assert(*(uint32_t*)test_ptr == x);

		printf("zone_ro_basic_test: free elem\n");
		zfree_ro(ZONE_ID__FIRST_RO, test_ptr);
		assert(!test_ptr);
#endif /* !ZSECURITY_CONFIG(READ_ONLY) */
	}

	printf("zone_ro_basic_test: garbage collection\n");
	zone_gc(ZONE_GC_DRAIN);

	printf("zone_ro_basic_test: Test passed\n");

	*out = 1;
	/* Release the single-test gate taken at entry. */
	os_atomic_store(&any_zone_test_running, false, relaxed);
	return 0;
}
SYSCTL_TEST_REGISTER(zone_ro_basic_test, zone_ro_basic_test_run);
10215
/*
 * sysctl test: exercise zinit(), zalloc(), zfree() and zdestroy(), then
 * (when ZSECURITY_CONFIG(SEQUESTER) is enabled) verify that destroyed /
 * drained zones keep their VA sequestered: after a drain GC the test
 * zones must hold only unmapped VA on z_pageq_va and no physical pages.
 *
 * Returns 0 on success (sets *out to 1), EALREADY if another zone test
 * is already running, EIO on a check failure.
 */
static int
zone_basic_test_run(__unused int64_t in, int64_t *out)
{
	/*
	 * Persists across invocations: re-creating a zone with the same
	 * name is expected to hand back the same zone_t (checked below).
	 */
	static zone_t test_zone_ptr = NULL;

	unsigned int i = 0, max_iter = 5;
	void * test_ptr;
	zone_t test_zone;
	int rc = 0;

	/* Serialize against all other zone tests. */
	if (os_atomic_xchg(&any_zone_test_running, true, relaxed)) {
		printf("zone_basic_test: Test already running.\n");
		return EALREADY;
	}

	printf("zone_basic_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");

	/* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */
	do {
		test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
		assert(test_zone);

		/*
		 * A fresh zone should have no free elements.  Under KASAN
		 * this is only checked on first creation — presumably
		 * because quarantined elements can linger across
		 * re-creation; NOTE(review): confirm.
		 */
#if KASAN_ZALLOC
		if (test_zone_ptr == NULL && test_zone->z_elems_free != 0)
#else
		if (test_zone->z_elems_free != 0)
#endif
		{
			printf("zone_basic_test: free count is not zero\n");
			rc = EIO;
			goto out;
		}

		if (test_zone_ptr == NULL) {
			/* Stash the zone pointer returned on the first zinit */
			printf("zone_basic_test: zone created for the first time\n");
			test_zone_ptr = test_zone;
		} else if (test_zone != test_zone_ptr) {
			printf("zone_basic_test: old zone pointer and new zone pointer don't match\n");
			rc = EIO;
			goto out;
		}

		/* One allocate/free round-trip, then tear the zone down. */
		test_ptr = zalloc_flags(test_zone, Z_WAITOK | Z_NOFAIL);
		zfree(test_zone, test_ptr);

		zdestroy(test_zone);
		i++;

		printf("zone_basic_test: Iteration %d successful\n", i);
	} while (i < max_iter);

	/* test Z_VA_SEQUESTER */
#if ZSECURITY_CONFIG(SEQUESTER)
	{
		zone_t test_pcpu_zone;
		kern_return_t kr;
		int idx, num_allocs = 8;
		vm_size_t elem_size = 2 * PAGE_SIZE / num_allocs;
		void *allocs[num_allocs];
		void **allocs_pcpu;
		vm_offset_t phys_pages = os_atomic_load(&zone_pages_wired, relaxed);

		/* One regular destructible zone and one per-cpu zone, both sequestered. */
		test_zone = zone_create("test_zone_sysctl", elem_size,
		    ZC_DESTRUCTIBLE);
		assert(test_zone);
		assert(zone_security_config(test_zone).z_va_sequester);

		test_pcpu_zone = zone_create("test_zone_sysctl.pcpu", sizeof(uint64_t),
		    ZC_DESTRUCTIBLE | ZC_PERCPU);
		assert(test_pcpu_zone);
		assert(zone_security_config(test_pcpu_zone).z_va_sequester);

		/* Fill then empty the regular zone so its pages land on z_pageq_empty. */
		for (idx = 0; idx < num_allocs; idx++) {
			allocs[idx] = zalloc(test_zone);
			assert(NULL != allocs[idx]);
			printf("alloc[%d] %p\n", idx, allocs[idx]);
		}
		for (idx = 0; idx < num_allocs; idx++) {
			zfree(test_zone, allocs[idx]);
		}
		assert(!zone_pva_is_null(test_zone->z_pageq_empty));

		/* Scratch page to hold the per-cpu allocation pointers. */
		kr = kmem_alloc(kernel_map, (vm_address_t *)&allocs_pcpu, PAGE_SIZE,
		    KMA_ZERO | KMA_KOBJECT, VM_KERN_MEMORY_DIAG);
		assert(kr == KERN_SUCCESS);

		for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
			allocs_pcpu[idx] = zalloc_percpu(test_pcpu_zone,
			    Z_WAITOK | Z_ZERO);
			assert(NULL != allocs_pcpu[idx]);
		}
		for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
			zfree_percpu(test_pcpu_zone, allocs_pcpu[idx]);
		}
		assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_empty));

		printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %ld%%\n",
		    vm_page_wire_count, vm_page_free_count,
		    100L * phys_pages / zone_pages_wired_max);
		/* Drain GC: should release the physical pages but keep the VA. */
		zone_gc(ZONE_GC_DRAIN);
		printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %ld%%\n",
		    vm_page_wire_count, vm_page_free_count,
		    100L * phys_pages / zone_pages_wired_max);

		unsigned int allva = 0;

		/* Walk every zone and report sequestered vs resident page counts. */
		zone_foreach(z) {
			zone_lock(z);
			allva += z->z_wired_cur;
			if (zone_pva_is_null(z->z_pageq_va)) {
				zone_unlock(z);
				continue;
			}
			unsigned count = 0;
			uint64_t size;
			zone_pva_t pg = z->z_pageq_va;
			struct zone_page_metadata *page_meta;
			while (pg.packed_address) {
				page_meta = zone_pva_to_meta(pg);
				count += z->z_percpu ? 1 : z->z_chunk_pages;
				if (page_meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
					/* NOTE(review): secondary pages subtract their in-chunk index — confirm intent */
					count -= page_meta->zm_page_index;
				}
				pg = page_meta->zm_page_next;
			}
			size = zone_size_wired(z);
			if (!size) {
				/* avoid dividing by zero in the printf below */
				size = 1;
			}
			printf("%s%s: seq %d, res %d, %qd %%\n",
			    zone_heap_name(z), z->z_name, z->z_va_cur - z->z_wired_cur,
			    z->z_wired_cur, zone_size_allocated(z) * 100ULL / size);
			zone_unlock(z);
		}

		printf("total va: %d\n", allva);

		/* After the GC, both test zones must only hold sequestered VA. */
		assert(zone_pva_is_null(test_zone->z_pageq_empty));
		assert(zone_pva_is_null(test_zone->z_pageq_partial));
		assert(!zone_pva_is_null(test_zone->z_pageq_va));
		assert(zone_pva_is_null(test_pcpu_zone->z_pageq_empty));
		assert(zone_pva_is_null(test_pcpu_zone->z_pageq_partial));
		assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_va));

		/* The freed elements' pages must no longer have a physical mapping. */
		for (idx = 0; idx < num_allocs; idx++) {
			assert(0 == pmap_find_phys(kernel_pmap, (addr64_t)(uintptr_t) allocs[idx]));
		}

		/* make sure the zone is still usable after a GC */

		for (idx = 0; idx < num_allocs; idx++) {
			allocs[idx] = zalloc(test_zone);
			assert(allocs[idx]);
			printf("alloc[%d] %p\n", idx, allocs[idx]);
		}
		for (idx = 0; idx < num_allocs; idx++) {
			zfree(test_zone, allocs[idx]);
		}

		for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
			allocs_pcpu[idx] = zalloc_percpu(test_pcpu_zone,
			    Z_WAITOK | Z_ZERO);
			assert(NULL != allocs_pcpu[idx]);
		}
		for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
			zfree_percpu(test_pcpu_zone, allocs_pcpu[idx]);
		}

		assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_empty));

		kmem_free(kernel_map, (vm_address_t)allocs_pcpu, PAGE_SIZE);

		zdestroy(test_zone);
		zdestroy(test_pcpu_zone);
	}
#else
	printf("zone_basic_test: skipping sequester test (not enabled)\n");
#endif /* ZSECURITY_CONFIG(SEQUESTER) */

	printf("zone_basic_test: Test passed\n");


	*out = 1;
out:
	/* Release the single-test gate taken at entry. */
	os_atomic_store(&any_zone_test_running, false, relaxed);
	return rc;
}
SYSCTL_TEST_REGISTER(zone_basic_test, zone_basic_test_run);
10405
/* An element of the stress test zone: carries only its list linkage. */
struct zone_stress_obj {
	TAILQ_ENTRY(zone_stress_obj) zso_link;
};

/* State shared by all threads of one zone stress test run. */
struct zone_stress_ctx {
	thread_t zsc_leader;    /* thread that initiated the test */
	lck_mtx_t zsc_lock;     /* protects zsc_workers / final rendezvous */
	zone_t zsc_zone;        /* zone under stress */
	uint64_t zsc_end;       /* absolute-time deadline for the run */
	uint32_t zsc_workers;   /* number of threads still working */
};
10417
10418 static void
zone_stress_worker(void * arg,wait_result_t __unused wr)10419 zone_stress_worker(void *arg, wait_result_t __unused wr)
10420 {
10421 struct zone_stress_ctx *ctx = arg;
10422 bool leader = ctx->zsc_leader == current_thread();
10423 TAILQ_HEAD(zone_stress_head, zone_stress_obj) head = TAILQ_HEAD_INITIALIZER(head);
10424 struct zone_bool_gen bg = { };
10425 struct zone_stress_obj *obj;
10426 uint32_t allocs = 0;
10427
10428 random_bool_init(&bg.zbg_bg);
10429
10430 do {
10431 for (int i = 0; i < 2000; i++) {
10432 uint32_t what = random_bool_gen_bits(&bg.zbg_bg,
10433 bg.zbg_entropy, ZONE_ENTROPY_CNT, 1);
10434 switch (what) {
10435 case 0:
10436 case 1:
10437 if (allocs < 10000) {
10438 obj = zalloc(ctx->zsc_zone);
10439 TAILQ_INSERT_HEAD(&head, obj, zso_link);
10440 allocs++;
10441 }
10442 break;
10443 case 2:
10444 case 3:
10445 if (allocs < 10000) {
10446 obj = zalloc(ctx->zsc_zone);
10447 TAILQ_INSERT_TAIL(&head, obj, zso_link);
10448 allocs++;
10449 }
10450 break;
10451 case 4:
10452 if (leader) {
10453 zone_gc(ZONE_GC_DRAIN);
10454 }
10455 break;
10456 case 5:
10457 case 6:
10458 if (!TAILQ_EMPTY(&head)) {
10459 obj = TAILQ_FIRST(&head);
10460 TAILQ_REMOVE(&head, obj, zso_link);
10461 zfree(ctx->zsc_zone, obj);
10462 allocs--;
10463 }
10464 break;
10465 case 7:
10466 if (!TAILQ_EMPTY(&head)) {
10467 obj = TAILQ_LAST(&head, zone_stress_head);
10468 TAILQ_REMOVE(&head, obj, zso_link);
10469 zfree(ctx->zsc_zone, obj);
10470 allocs--;
10471 }
10472 break;
10473 }
10474 }
10475 } while (mach_absolute_time() < ctx->zsc_end);
10476
10477 while (!TAILQ_EMPTY(&head)) {
10478 obj = TAILQ_FIRST(&head);
10479 TAILQ_REMOVE(&head, obj, zso_link);
10480 zfree(ctx->zsc_zone, obj);
10481 }
10482
10483 lck_mtx_lock(&ctx->zsc_lock);
10484 if (--ctx->zsc_workers == 0) {
10485 thread_wakeup(ctx);
10486 } else if (leader) {
10487 while (ctx->zsc_workers) {
10488 lck_mtx_sleep(&ctx->zsc_lock, LCK_SLEEP_DEFAULT, ctx,
10489 THREAD_UNINT);
10490 }
10491 }
10492 lck_mtx_unlock(&ctx->zsc_lock);
10493
10494 if (!leader) {
10495 thread_terminate_self();
10496 __builtin_unreachable();
10497 }
10498 }
10499
/*
 * sysctl test: run zone_stress_worker() on several threads at once
 * against a dedicated multi-page-chunk zone, under simulated VM
 * pressure, for about 5 seconds.
 *
 * Returns 0 on success (sets *out to 1), EALREADY if another zone test
 * is already running.
 */
static int
zone_stress_test_run(__unused int64_t in, int64_t *out)
{
	struct zone_stress_ctx ctx = {
		.zsc_leader = current_thread(),
		.zsc_workers = 3,
	};
	kern_return_t kr;
	thread_t th;

	/* Serialize against all other zone tests. */
	if (os_atomic_xchg(&any_zone_test_running, true, relaxed)) {
		printf("zone_stress_test: Test already running.\n");
		return EALREADY;
	}

	lck_mtx_init(&ctx.zsc_lock, &zone_locks_grp, LCK_ATTR_NULL);
	/*
	 * 344-byte elements with caching disabled; the size is chosen so
	 * chunks span more than one page (asserted just below).
	 */
	ctx.zsc_zone = zone_create("test_zone_344", 344,
	    ZC_DESTRUCTIBLE | ZC_NOCACHING);
	assert(ctx.zsc_zone->z_chunk_pages > 1);

	/* Workers run until this deadline: 5 seconds from now. */
	clock_interval_to_deadline(5, NSEC_PER_SEC, &ctx.zsc_end);

	printf("zone_stress_test: Starting (leader %p)\n", current_thread());

	os_atomic_inc(&zalloc_simulate_vm_pressure, relaxed);

	/* Spawn the non-leader workers; shrink the count if a spawn fails. */
	for (uint32_t i = 1; i < ctx.zsc_workers; i++) {
		kr = kernel_thread_start_priority(zone_stress_worker, &ctx,
		    BASEPRI_DEFAULT, &th);
		if (kr == KERN_SUCCESS) {
			printf("zone_stress_test: thread %d: %p\n", i, th);
			thread_deallocate(th);
		} else {
			ctx.zsc_workers--;
		}
	}

	/* The leader participates too and waits for all workers inside. */
	zone_stress_worker(&ctx, 0);

	lck_mtx_destroy(&ctx.zsc_lock, &zone_locks_grp);

	zdestroy(ctx.zsc_zone);

	printf("zone_stress_test: Done\n");

	*out = 1;
	os_atomic_dec(&zalloc_simulate_vm_pressure, relaxed);
	/* Release the single-test gate taken at entry. */
	os_atomic_store(&any_zone_test_running, false, relaxed);
	return 0;
}
SYSCTL_TEST_REGISTER(zone_stress_test, zone_stress_test_run);
10551
10552 /*
10553 * Routines to test that zone garbage collection and zone replenish threads
10554 * running at the same time don't cause problems.
10555 */
10556
/*
 * sysctl test: force a full drain garbage collection so it can race
 * against any concurrently running zone replenishment (see the block
 * comment above).  Always succeeds and sets *out to 1.
 */
static int
zone_gc_replenish_test(__unused int64_t in, int64_t *out)
{
	zone_gc(ZONE_GC_DRAIN);
	*out = 1;
	return 0;
}
SYSCTL_TEST_REGISTER(zone_gc_replenish_test, zone_gc_replenish_test);
10565
10566 static int
zone_alloc_replenish_test(__unused int64_t in,int64_t * out)10567 zone_alloc_replenish_test(__unused int64_t in, int64_t *out)
10568 {
10569 zone_t z = vm_map_entry_zone;
10570 struct data { struct data *next; } *node, *list = NULL;
10571
10572 if (z == NULL) {
10573 printf("Couldn't find a replenish zone\n");
10574 return EIO;
10575 }
10576
10577 /* big enough to go past replenishment */
10578 for (uint32_t i = 0; i < 10 * z->z_elems_rsv; ++i) {
10579 node = zalloc(z);
10580 node->next = list;
10581 list = node;
10582 }
10583
10584 /*
10585 * release the memory we allocated
10586 */
10587 while (list != NULL) {
10588 node = list;
10589 list = list->next;
10590 zfree(z, node);
10591 }
10592
10593 *out = 1;
10594 return 0;
10595 }
10596 SYSCTL_TEST_REGISTER(zone_alloc_replenish_test, zone_alloc_replenish_test);
10597
10598 #endif /* DEBUG || DEVELOPMENT */
10599