1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 /*
57  */
58 /*
59  *	File:	kern/zalloc.c
60  *	Author:	Avadis Tevanian, Jr.
61  *
62  *	Zone-based memory allocator.  A zone is a collection of fixed size
63  *	data blocks for which quick allocation/deallocation is possible.
64  */
65 
66 #define ZALLOC_ALLOW_DEPRECATED 1
67 #if !ZALLOC_TEST
68 #include <mach/mach_types.h>
69 #include <mach/vm_param.h>
70 #include <mach/kern_return.h>
71 #include <mach/mach_host_server.h>
72 #include <mach/task_server.h>
73 #include <mach/machine/vm_types.h>
74 #include <machine/machine_routines.h>
75 #include <mach/vm_map.h>
76 #include <mach/sdt.h>
77 #if __x86_64__
78 #include <i386/cpuid.h>
79 #endif
80 
81 #include <kern/bits.h>
82 #include <kern/btlog.h>
83 #include <kern/startup.h>
84 #include <kern/kern_types.h>
85 #include <kern/assert.h>
86 #include <kern/backtrace.h>
87 #include <kern/host.h>
88 #include <kern/macro_help.h>
89 #include <kern/sched.h>
90 #include <kern/locks.h>
91 #include <kern/sched_prim.h>
92 #include <kern/misc_protos.h>
93 #include <kern/thread_call.h>
94 #include <kern/zalloc_internal.h>
95 #include <kern/kalloc.h>
96 #include <kern/debug.h>
97 
98 #include <prng/random.h>
99 
100 #include <vm/pmap.h>
101 #include <vm/vm_map.h>
102 #include <vm/vm_memtag.h>
103 #include <vm/vm_kern.h>
104 #include <vm/vm_page.h>
105 #include <vm/vm_pageout.h>
106 #include <vm/vm_compressor.h> /* C_SLOT_PACKED_PTR* */
107 
108 #include <pexpert/pexpert.h>
109 
110 #include <machine/machparam.h>
111 #include <machine/machine_routines.h>  /* ml_cpu_get_info */
112 
113 #include <os/atomic.h>
114 
115 #include <libkern/OSDebug.h>
116 #include <libkern/OSAtomic.h>
117 #include <libkern/section_keywords.h>
118 #include <sys/kdebug.h>
119 #include <sys/code_signing.h>
120 
121 #include <san/kasan.h>
122 #include <libsa/stdlib.h>
123 #include <sys/errno.h>
124 
125 #include <IOKit/IOBSD.h>
126 #include <arm64/amcc_rorgn.h>
127 
128 #if DEBUG
129 #define z_debug_assert(expr)  assert(expr)
130 #else
131 #define z_debug_assert(expr)  (void)(expr)
132 #endif
133 
134 #if CONFIG_PROB_GZALLOC && CONFIG_SPTM
135 #error This is not a supported configuration
136 #endif
137 
138 /* Returns pid of the task with the largest number of VM map entries.  */
139 extern pid_t find_largest_process_vm_map_entries(void);
140 
141 /*
142  * Callout to jetsam. If pid is -1, we wake up the memorystatus thread to do asynchronous kills.
143  * For any other pid we try to kill that process synchronously.
144  */
145 extern boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
146 
147 extern zone_t vm_object_zone;
148 extern zone_t ipc_service_port_label_zone;
149 
150 ZONE_DEFINE_TYPE(percpu_u64_zone, "percpu.64", uint64_t,
151     ZC_PERCPU | ZC_ALIGNMENT_REQUIRED | ZC_KASAN_NOREDZONE);
152 
153 #if CONFIG_KERNEL_TAGGING
154 #define ZONE_MIN_ELEM_SIZE      (sizeof(uint64_t) * 2)
155 #define ZONE_ALIGN_SIZE         ZONE_MIN_ELEM_SIZE
156 #else /* CONFIG_KERNEL_TAGGING */
157 #define ZONE_MIN_ELEM_SIZE      sizeof(uint64_t)
158 #define ZONE_ALIGN_SIZE         ZONE_MIN_ELEM_SIZE
159 #endif /* CONFIG_KERNEL_TAGGING */
160 
161 #define ZONE_MAX_ALLOC_SIZE     (32 * 1024)
162 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
163 #define ZONE_CHUNK_ALLOC_SIZE   (256 * 1024)
164 #define ZONE_GUARD_DENSE        (32  * 1024)
165 #define ZONE_GUARD_SPARSE       (64  * 1024)
166 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
167 
168 #if XNU_PLATFORM_MacOSX
169 #define ZONE_MAP_MAX            (32ULL << 30)
170 #define ZONE_MAP_VA_SIZE        (128ULL << 30)
171 #else /* XNU_PLATFORM_MacOSX */
172 #define ZONE_MAP_MAX            (8ULL << 30)
173 #define ZONE_MAP_VA_SIZE        (24ULL << 30)
174 #endif /* !XNU_PLATFORM_MacOSX */
175 
176 __enum_closed_decl(zm_len_t, uint16_t, {
177 	ZM_CHUNK_FREE           = 0x0,
178 	/* 1 through 8 are valid lengths */
179 	ZM_CHUNK_LEN_MAX        = 0x8,
180 
181 	/* PGZ magical values */
182 	ZM_PGZ_FREE             = 0x0,
183 	ZM_PGZ_ALLOCATED        = 0xa, /* [a]llocated   */
184 	ZM_PGZ_GUARD            = 0xb, /* oo[b]         */
185 	ZM_PGZ_DOUBLE_FREE      = 0xd, /* [d]ouble_free */
186 
187 	/* secondary page markers */
188 	ZM_SECONDARY_PAGE       = 0xe,
189 	ZM_SECONDARY_PCPU_PAGE  = 0xf,
190 });
191 
192 static_assert(MAX_ZONES < (1u << 10), "MAX_ZONES must fit in zm_index");
193 
194 struct zone_page_metadata {
195 	union {
196 		struct {
197 			/* The index of the zone this metadata page belongs to */
198 			zone_id_t       zm_index : 10;
199 
200 			/*
201 			 * This chunk ends with a guard page.
202 			 */
203 			uint16_t        zm_guarded : 1;
204 
205 			/*
206 			 * Whether `zm_bitmap` is an inline bitmap
207 			 * or a packed bitmap reference
208 			 */
209 			uint16_t        zm_inline_bitmap : 1;
210 
211 			/*
212 			 * Zones allocate in "chunks" of zone_t::z_chunk_pages
213 			 * consecutive pages, or zpercpu_count() pages if the
214 			 * zone is percpu.
215 			 *
216 			 * The first page of it has its metadata set with:
217 			 * - 0 if none of the pages are currently wired
218 			 * - the number of wired pages in the chunk
219 			 *   (not scaled for percpu).
220 			 *
221 			 * Other pages in the chunk have their zm_chunk_len set
222 			 * to ZM_SECONDARY_PAGE or ZM_SECONDARY_PCPU_PAGE
223 			 * depending on whether the zone is percpu or not.
224 			 * For those, zm_page_index holds the index of that page
225 			 * in the run, and zm_subchunk_len the remaining length
226 			 * within the chunk.
227 			 *
228 			 * Metadata used for PGZ pages can have 3 values:
229 			 * - ZM_PGZ_FREE:         slot is free
230 			 * - ZM_PGZ_ALLOCATED:    slot holds an allocated element
231 			 *                        at offset (zm_pgz_orig_addr & PAGE_MASK)
232 			 * - ZM_PGZ_DOUBLE_FREE:  slot detected a double free
233 			 *                        (will panic).
234 			 */
235 			zm_len_t        zm_chunk_len : 4;
236 		};
237 		uint16_t zm_bits;
238 	};
239 
240 	union {
241 #define ZM_ALLOC_SIZE_LOCK      1u
242 		uint16_t zm_alloc_size; /* first page only */
243 		struct {
244 			uint8_t zm_page_index;   /* secondary pages only */
245 			uint8_t zm_subchunk_len; /* secondary pages only */
246 		};
247 		uint16_t zm_oob_offs;   /* in guard pages  */
248 	};
249 	union {
250 		uint32_t zm_bitmap;     /* most zones      */
251 		uint32_t zm_bump;       /* permanent zones */
252 	};
253 
254 	union {
255 		struct {
256 			zone_pva_t      zm_page_next;
257 			zone_pva_t      zm_page_prev;
258 		};
259 		vm_offset_t zm_pgz_orig_addr;
260 		struct zone_page_metadata *zm_pgz_slot_next;
261 	};
262 };
263 static_assert(sizeof(struct zone_page_metadata) == 16, "validate packing");
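/*
 * Illustrative sketch (hypothetical values): for a zone whose chunks span
 * z_chunk_pages == 4 pages, a fully wired chunk would be described by four
 * consecutive metadata entries roughly as follows:
 *
 *   meta[0]: zm_chunk_len = 4, zm_alloc_size/zm_bitmap cover the whole chunk
 *   meta[1]: zm_chunk_len = ZM_SECONDARY_PAGE, zm_page_index = 1, zm_subchunk_len = 3
 *   meta[2]: zm_chunk_len = ZM_SECONDARY_PAGE, zm_page_index = 2, zm_subchunk_len = 2
 *   meta[3]: zm_chunk_len = ZM_SECONDARY_PAGE, zm_page_index = 3, zm_subchunk_len = 1
 *
 * (a per-cpu zone would use ZM_SECONDARY_PCPU_PAGE for the secondary pages)
 */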
264 
265 /*!
266  * @typedef zone_magazine_t
267  *
268  * @brief
269  * Magazine of cached allocations.
270  *
271  * @field zm_next       linkage used by magazine depots.
272  * @field zm_elems      an array of @c zc_mag_size() elements.
273  */
274 struct zone_magazine {
275 	zone_magazine_t         zm_next;
276 	smr_seq_t               zm_seq;
277 	vm_offset_t             zm_elems[0];
278 };
279 
280 /*!
281  * @typedef zone_cache_t
282  *
283  * @brief
284  * Magazine of cached allocations.
285  *
286  * @discussion
287  * Below is a diagram of the caching system. This design is inspired by the
288  * paper "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and
289  * Arbitrary Resources" by Jeff Bonwick and Jonathan Adams and the FreeBSD UMA
290  * zone allocator (itself derived from this seminal work).
291  *
292  * It is divided into 3 layers:
293  * - the per-cpu layer,
294  * - the recirculation depot layer,
295  * - the Zone Allocator.
296  *
297  * The per-cpu and recirculation depot layer use magazines (@c zone_magazine_t),
298  * which are stacks of up to @c zc_mag_size() elements.
299  *
300  * <h2>CPU layer</h2>
301  *
302  * The CPU layer (@c zone_cache_t) looks like this:
303  *
304  *      ╭─ a ─ f ─┬───────── zm_depot ──────────╮
305  *      │ ╭─╮ ╭─╮ │ ╭─╮ ╭─╮ ╭─╮ ╭─╮ ╭─╮         │
306  *      │ │#│ │#│ │ │#│ │#│ │#│ │#│ │#│         │
307  *      │ │#│ │ │ │ │#│ │#│ │#│ │#│ │#│         │
308  *      │ │ │ │ │ │ │#│ │#│ │#│ │#│ │#│         │
309  *      │ ╰─╯ ╰─╯ │ ╰─╯ ╰─╯ ╰─╯ ╰─╯ ╰─╯         │
310  *      ╰─────────┴─────────────────────────────╯
311  *
312  * It has two pre-loaded magazines (a)lloc and (f)ree which we allocate from,
313  * or free to. Serialization is achieved through disabling preemption, and only
314  * the current CPU can access those allocations. This is represented on the left
315  * hand side of the diagram above.
316  *
317  * The right hand side is the per-cpu depot. It consists of @c zm_depot_count
318  * full magazines, and is protected by the @c zm_depot_lock for access.
319  * The lock is expected to absolutely never be contended, as only the local CPU
320  * tends to access the local per-cpu depot in regular operation mode.
321  *
322  * However unlike UMA, our implementation allows for the zone GC to reclaim
323  * per-CPU magazines aggressively, which is serialized with the @c zm_depot_lock.
324  *
325  *
326  * <h2>Recirculation Depot</h2>
327  *
328  * The recirculation depot layer is a list similar to the per-cpu depot,
329  * however it is different in two fundamental ways:
330  *
331  * - it is protected by the regular zone lock,
332  * - elements referenced by the magazines in that layer appear free
333  *   to the zone layer.
334  *
335  *
336  * <h2>Magazine circulation and sizing</h2>
337  *
338  * The caching system sizes itself dynamically. Operations that allocate/free
339  * a single element call @c zone_lock_nopreempt_check_contention() which records
340  * contention on the lock by doing a trylock and recording its success.
341  *
342  * This information is stored in the @c z_recirc_cont_cur field of the zone,
343  * and a windowed moving average is maintained in @c z_contention_wma.
344  * The periodically run function @c compute_zone_working_set_size() will then
345  * take this into account to decide to grow the number of buckets allowed
346  * in the depot or shrink it based on the @c zc_grow_level and @c zc_shrink_level
347  * thresholds.
348  *
349  * The per-cpu layer will attempt to work with its depot, finding both full and
350  * empty magazines cached there. If it can't get what it needs, then it will
351  * mediate with the zone recirculation layer. Such recirculation is done in
352  * batches in order to amortize lock holds.
353  * (See @c {zalloc,zfree}_cached_depot_recirculate()).
354  *
355  * The recirculation layer keeps track of the minimum number of magazines
356  * it held over time for each of the full and empty queues. This allows for
357  * @c compute_zone_working_set_size() to return memory to the system when a zone
358  * stops being used as much.
359  *
360  * <h2>Security considerations</h2>
361  *
362  * The zone caching layer has been designed to avoid returning elements in
363  * a strict LIFO behavior: @c zalloc() will allocate from the (a) magazine,
364  * and @c zfree() free to the (f) magazine, and only swap them when the
365  * requested operation cannot be fulfilled.
366  *
367  * The per-cpu overflow depot or the recirculation depots are similarly used
368  * in FIFO order.
369  *
370  * @field zc_depot_lock     a lock to access @c zc_depot, @c zc_depot_cur.
371  * @field zc_alloc_cur      denormalized number of elements in the (a) magazine
372  * @field zc_free_cur       denormalized number of elements in the (f) magazine
373  * @field zc_alloc_elems    a pointer to the array of elements in (a)
374  * @field zc_free_elems     a pointer to the array of elements in (f)
375  *
376  * @field zc_depot          a list of @c zc_depot_cur full magazines
377  */
378 typedef struct zone_cache {
379 	hw_lck_ticket_t            zc_depot_lock;
380 	uint16_t                   zc_alloc_cur;
381 	uint16_t                   zc_free_cur;
382 	vm_offset_t               *zc_alloc_elems;
383 	vm_offset_t               *zc_free_elems;
384 	struct zone_depot          zc_depot;
385 	smr_t                      zc_smr;
386 	zone_smr_free_cb_t XNU_PTRAUTH_SIGNED_FUNCTION_PTR("zc_free") zc_free;
387 } __attribute__((aligned(64))) * zone_cache_t;
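/*
 * Illustrative (simplified) sketch of the fast path described above; this is
 * pseudo-code, not the actual implementation, see zalloc()/zfree() for the
 * real logic:
 *
 *	disable_preemption();
 *	if (cache->zc_alloc_cur) {
 *		elem = cache->zc_alloc_elems[--cache->zc_alloc_cur];
 *	} else if (cache->zc_free_cur) {
 *		swap the (a) and (f) magazines;     // avoids strict LIFO reuse
 *		elem = cache->zc_alloc_elems[--cache->zc_alloc_cur];
 *	} else {
 *		refill from zc_depot, then from the recirculation depot,
 *		then from the zone itself;
 *	}
 *	enable_preemption();
 */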
388 
389 #if !__x86_64__
390 static
391 #endif
392 __security_const_late struct {
393 	struct mach_vm_range       zi_map_range;  /* all zone submaps     */
394 	struct mach_vm_range       zi_ro_range;   /* read-only range      */
395 	struct mach_vm_range       zi_meta_range; /* debugging only       */
396 	struct mach_vm_range       zi_bits_range; /* bits buddy allocator */
397 	struct mach_vm_range       zi_xtra_range; /* vm tracking metadata */
398 	struct mach_vm_range       zi_pgz_range;
399 	struct zone_page_metadata *zi_pgz_meta;
400 
401 	/*
402 	 * The metadata lives within the zi_meta_range address range.
403 	 *
404 	 * The correct formula to find a metadata index is:
405 	 *     absolute_page_index - page_index(zi_map_range.min_address)
406 	 *
407 	 * And then this index is used to dereference zi_meta_range.min_address
408 	 * as a `struct zone_page_metadata` array.
409 	 *
410 	 * To avoid doing that subtraction all the time in the various fast-paths,
411 	 * zi_meta_base is pre-offset with that minimum page index to avoid redoing
412 	 * that math all the time.
413 	 */
414 	struct zone_page_metadata *zi_meta_base;
415 } zone_info;
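/*
 * For illustration: because zi_meta_base is pre-biased as described above,
 * a metadata lookup conceptually reduces to
 *
 *	meta = zone_info.zi_meta_base + page_index(addr);
 *
 * rather than computing (page_index(addr) - page_index(min_address)) and
 * indexing zi_meta_range.min_address on every fast path.
 */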
416 
417 __startup_data static struct mach_vm_range  zone_map_range;
418 __startup_data static vm_map_size_t         zone_meta_size;
419 __startup_data static vm_map_size_t         zone_bits_size;
420 __startup_data static vm_map_size_t         zone_xtra_size;
421 
422 /*
423  * Initial array of metadata for stolen memory.
424  *
425  * The numbers here have to be kept in sync with vm_map_steal_memory()
426  * so that we have reserved enough metadata.
427  *
428  * After zone_init() has run (which happens while the kernel is still single
429  * threaded), the metadata is moved to its final dynamic location, and
430  * this array is unmapped with the rest of __startup_data at lockdown.
431  */
432 #define ZONE_EARLY_META_INLINE_COUNT    64
433 __startup_data
434 static struct zone_page_metadata
435     zone_early_meta_array_startup[ZONE_EARLY_META_INLINE_COUNT];
436 
437 
438 __startup_data __attribute__((aligned(PAGE_MAX_SIZE)))
439 static uint8_t zone_early_pages_to_cram[PAGE_MAX_SIZE * 16];
440 
441 /*
442  *	The zone_locks_grp allows for collecting lock statistics.
443  *	All locks are associated with this group in zinit.
444  *	Look at tools/lockstat for debugging lock contention.
445  */
446 LCK_GRP_DECLARE(zone_locks_grp, "zone_locks");
447 static LCK_MTX_DECLARE(zone_metadata_region_lck, &zone_locks_grp);
448 
449 /*
450  *	The zone metadata lock protects:
451  *	- metadata faulting,
452  *	- VM submap VA allocations,
453  *	- early gap page queue list
454  */
455 #define zone_meta_lock()   lck_mtx_lock(&zone_metadata_region_lck);
456 #define zone_meta_unlock() lck_mtx_unlock(&zone_metadata_region_lck);
457 
458 /*
459  *	Exclude more than one concurrent garbage collection
460  */
461 static LCK_GRP_DECLARE(zone_gc_lck_grp, "zone_gc");
462 static LCK_MTX_DECLARE(zone_gc_lock, &zone_gc_lck_grp);
463 static LCK_SPIN_DECLARE(zone_exhausted_lock, &zone_gc_lck_grp);
464 
465 /*
466  * Panic logging metadata
467  */
468 bool panic_include_zprint = false;
469 bool panic_include_kalloc_types = false;
470 zone_t kalloc_type_src_zone = ZONE_NULL;
471 zone_t kalloc_type_dst_zone = ZONE_NULL;
472 mach_memory_info_t *panic_kext_memory_info = NULL;
473 vm_size_t panic_kext_memory_size = 0;
474 vm_offset_t panic_fault_address = 0;
475 
476 /*
477  *      Protects zone_array, num_zones, num_zones_in_use, and
478  *      zone_destroyed_bitmap
479  */
480 static SIMPLE_LOCK_DECLARE(all_zones_lock, 0);
481 static zone_id_t        num_zones_in_use;
482 zone_id_t _Atomic       num_zones;
483 SECURITY_READ_ONLY_LATE(unsigned int) zone_view_count;
484 
485 /*
486  * Initial globals for zone stats until we can allocate the real ones.
487  * Those get migrated inside the per-CPU ones during zone_init() and
488  * this array is unmapped with the rest of __startup_data at lockdown.
489  */
490 
491 /* zone to allocate zone_magazine structs from */
492 static SECURITY_READ_ONLY_LATE(zone_t) zc_magazine_zone;
493 /*
494  * Zone caching stays off until pid 1 has been made, and until
495  * compute_zone_working_set_size() has run for the first time.
496  *
497  * -1 represents the "never enabled yet" value.
498  */
499 static int8_t zone_caching_disabled = -1;
500 
501 __startup_data
502 static struct zone_stats zone_stats_startup[MAX_ZONES];
503 struct zone              zone_array[MAX_ZONES];
504 SECURITY_READ_ONLY_LATE(zone_security_flags_t) zone_security_array[MAX_ZONES] = {
505 	[0 ... MAX_ZONES - 1] = {
506 		.z_kheap_id       = KHEAP_ID_NONE,
507 		.z_noencrypt      = false,
508 		.z_submap_idx     = Z_SUBMAP_IDX_GENERAL_0,
509 		.z_kalloc_type    = false,
510 		.z_sig_eq         = 0
511 	},
512 };
513 SECURITY_READ_ONLY_LATE(struct zone_size_params) zone_ro_size_params[ZONE_ID__LAST_RO + 1];
514 SECURITY_READ_ONLY_LATE(zone_cache_ops_t) zcache_ops[ZONE_ID__FIRST_DYNAMIC];
515 
516 /* Initialized in zone_bootstrap(), how many "copies" the per-cpu system does */
517 static SECURITY_READ_ONLY_LATE(unsigned) zpercpu_early_count;
518 
519 /* Used to keep track of destroyed slots in the zone_array */
520 static bitmap_t zone_destroyed_bitmap[BITMAP_LEN(MAX_ZONES)];
521 
522 /* number of zone mapped pages used by all zones */
523 static size_t _Atomic zone_pages_jetsam_threshold = ~0;
524 size_t zone_pages_wired;
525 size_t zone_guard_pages;
526 
527 /* Time (in ms) after which we panic for zone exhaustions */
528 TUNABLE(int, zone_exhausted_timeout, "zet", 5000);
529 static bool zone_share_always = true;
530 static TUNABLE_WRITEABLE(uint32_t, zone_early_thres_mul, "zone_early_thres_mul", 5);
531 
532 #if VM_TAG_SIZECLASSES
533 /*
534  * Zone tagging allows for per "tag" accounting of allocations for the kalloc
535  * zones only.
536  *
537  * There are 3 kinds of tags that can be used:
538  * - pre-registered VM_KERN_MEMORY_*
539  * - dynamic tags allocated per call sites in core-kernel (using vm_tag_alloc())
540  * - per-kext tags computed by IOKit (using the magic Z_VM_TAG_BT_BIT marker).
541  *
542  * The VM tracks the statistics in lazily allocated structures.
543  * See vm_tag_will_update_zone(), vm_tag_update_zone_size().
544  *
545  * If for some reason the requested tag cannot be accounted for,
546  * the tag is forced to VM_KERN_MEMORY_KALLOC which is pre-allocated.
547  *
548  * Each allocated element also remembers the tag it was assigned,
549  * which lets zalloc/zfree update statistics correctly.
550  */
551 
552 /* enable tags for zones that ask for it */
553 static TUNABLE(bool, zone_tagging_on, "-zt", false);
554 
555 /*
556  * Array of all sizeclasses used by kalloc variants so that we can
557  * have accounting per size class for each kalloc callsite
558  */
559 static uint16_t zone_tags_sizeclasses[VM_TAG_SIZECLASSES];
560 #endif /* VM_TAG_SIZECLASSES */
561 
562 #if DEBUG || DEVELOPMENT
563 static int zalloc_simulate_vm_pressure;
564 #endif /* DEBUG || DEVELOPMENT */
565 
566 #define Z_TUNABLE(t, n, d) \
567 	TUNABLE(t, _##n, #n, d); \
568 	__pure2 static inline t n(void) { return _##n; }
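/*
 * For example, Z_TUNABLE(uint16_t, zc_mag_size, 8) expands to:
 *
 *	TUNABLE(uint16_t, _zc_mag_size, "zc_mag_size", 8);
 *	__pure2 static inline uint16_t zc_mag_size(void) { return _zc_mag_size; }
 *
 * i.e. a boot-arg backed variable plus a constant-foldable accessor.
 */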
569 
570 /*
571  * Zone caching tunables
572  *
573  * zc_mag_size():
574  *   size of magazines, larger to reduce contention at the expense of memory
575  *
576  * zc_enable_level
577  *   number of contentions per second after which zone caching engages
578  *   automatically.
579  *
580  *   0 to disable.
581  *
582  * zc_grow_level
583  *   number of contentions per second x cpu after which the number of magazines
584  *   allowed in the depot can grow. (in "Z_WMA_UNIT" units).
585  *
586  * zc_shrink_level
587  *   number of contentions per second x cpu below which the number of magazines
588  *   allowed in the depot will shrink. (in "Z_WMA_UNIT" units).
589  *
590  * zc_pcpu_max
591  *   maximum memory size in bytes that can hang from a CPU,
592  *   which will affect how many magazines are allowed in the depot.
593  *
594  *   The alloc/free magazines are assumed to be on average half-empty
595  *   and to count for "1" unit of magazines.
596  *
597  * zc_autotrim_size
598  *   Size allowed to hang extra from the recirculation depot before
599  *   auto-trim kicks in.
600  *
601  * zc_autotrim_buckets
602  *
603  *   How many buckets in excess of the working-set are allowed
604  *   before auto-trim kicks in for empty buckets.
605  *
606  * zc_free_batch_size
607  *   The size of batches of frees/reclaim that can be done keeping
608  *   the zone lock held (and preemption disabled).
609  */
610 Z_TUNABLE(uint16_t, zc_mag_size, 8);
611 static Z_TUNABLE(uint32_t, zc_enable_level, 10);
612 static Z_TUNABLE(uint32_t, zc_grow_level, 5 * Z_WMA_UNIT);
613 static Z_TUNABLE(uint32_t, zc_shrink_level, Z_WMA_UNIT / 2);
614 static Z_TUNABLE(uint32_t, zc_pcpu_max, 128 << 10);
615 static Z_TUNABLE(uint32_t, zc_autotrim_size, 16 << 10);
616 static Z_TUNABLE(uint32_t, zc_autotrim_buckets, 8);
617 static Z_TUNABLE(uint32_t, zc_free_batch_size, 256);
618 
619 static SECURITY_READ_ONLY_LATE(size_t)    zone_pages_wired_max;
620 static SECURITY_READ_ONLY_LATE(vm_map_t)  zone_submaps[Z_SUBMAP_IDX_COUNT];
621 static SECURITY_READ_ONLY_LATE(vm_map_t)  zone_meta_map;
622 static char const * const zone_submaps_names[Z_SUBMAP_IDX_COUNT] = {
623 	[Z_SUBMAP_IDX_VM]               = "VM",
624 	[Z_SUBMAP_IDX_READ_ONLY]        = "RO",
625 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
626 	[Z_SUBMAP_IDX_GENERAL_0]        = "GEN0",
627 	[Z_SUBMAP_IDX_GENERAL_1]        = "GEN1",
628 	[Z_SUBMAP_IDX_GENERAL_2]        = "GEN2",
629 	[Z_SUBMAP_IDX_GENERAL_3]        = "GEN3",
630 #else
631 	[Z_SUBMAP_IDX_GENERAL_0]        = "GEN",
632 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
633 	[Z_SUBMAP_IDX_DATA]             = "DATA",
634 };
635 
636 #if __x86_64__
637 #define ZONE_ENTROPY_CNT 8
638 #else
639 #define ZONE_ENTROPY_CNT 2
640 #endif
641 static struct zone_bool_gen {
642 	struct bool_gen zbg_bg;
643 	uint32_t zbg_entropy[ZONE_ENTROPY_CNT];
644 } zone_bool_gen[MAX_CPUS];
645 
646 #if CONFIG_PROB_GZALLOC
647 /*
648  * Probabilistic gzalloc
649  * =====================
650  *
651  *
652  * Probabilistic guard zalloc samples allocations and will protect them by
653  * double-mapping the page holding them and returning the secondary virtual
654  * address to its callers.
655  *
656  * Its data structures are lazily allocated if the `pgz` or `pgz1` boot-args
657  * are set.
658  *
659  *
660  * Unlike GZalloc, PGZ uses a fixed amount of memory, and is compatible with
661  * most zalloc/kalloc features:
662  * - zone_require is functional
663  * - zone caching or zone tagging is compatible
664  * - non-blocking allocations work (they will always return NULL with gzalloc).
665  *
666  * PGZ limitations:
667  * - VA sequestering isn't respected, as the slots (which are in limited
668  *   quantity) will be reused for any type, however the PGZ quarantine
669  *   somewhat mitigates the impact.
670  * - zones with elements larger than a page cannot be protected.
671  *
672  *
673  * Tunables:
674  * --------
675  *
676  * pgz=1:
677  *   Turn on probabilistic guard malloc for all zones
678  *
679  *   (default on for DEVELOPMENT, off for RELEASE, or if pgz1... are specified)
680  *
681  * pgz_sample_rate=0 to 2^31
682  *   average sample rate between two guarded allocations.
683  *   0 means every allocation.
684  *
685  *   The default is a random number between 1000 and 10,000
686  *
687  * pgz_slots
688  *   how many allocations to protect.
689  *
690  *   Each costs:
691  *   - a PTE in the pmap (when allocated)
692  *   - 2 zone page meta's (every other page is a "guard" one, 32B total)
693  *   - 64 bytes per backtrace.
694  *   On LP64 this is <16K per 100 slots.
695  *
696  *   The default is ~200 slots per G of physical ram (32k / G)
697  *
698  *   TODO:
699  *   - try harder to allocate elements at the "end" to catch OOB more reliably.
700  *
701  * pgz_quarantine
702  *   how many slots should be free at any given time.
703  *
704  *   PGZ will round robin through free slots to be reused, but free slots are
705  *   important to detect use-after-free by acting as a quarantine.
706  *
707  *   By default, PGZ will keep 33% of the slots around at all times.
708  *
709  * pgz1=<name>, pgz2=<name>, ..., pgzn=<name>...
710  *   Specific zones for which to enable probabilistic guard malloc.
711  *   There must be no numbering gap (names after the gap will be ignored).
712  */
713 #if DEBUG || DEVELOPMENT
714 static TUNABLE(bool, pgz_all, "pgz", true);
715 #else
716 static TUNABLE(bool, pgz_all, "pgz", false);
717 #endif
718 static TUNABLE(uint32_t, pgz_sample_rate, "pgz_sample_rate", 0);
719 static TUNABLE(uint32_t, pgz_slots, "pgz_slots", UINT32_MAX);
720 static TUNABLE(uint32_t, pgz_quarantine, "pgz_quarantine", 0);
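/*
 * Example boot-args (values chosen purely for illustration):
 *
 *	pgz=1 pgz_sample_rate=5000 pgz_slots=800 pgz_quarantine=300
 *
 * or, to protect only specific zones, pgz1=<zone-name> pgz2=<zone-name> ...
 * with no gaps in the numbering, as described above.
 */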
721 #endif /* CONFIG_PROB_GZALLOC */
722 
723 static zone_t zone_find_largest(uint64_t *zone_size);
724 
725 #endif /* !ZALLOC_TEST */
726 #pragma mark Zone metadata
727 #if !ZALLOC_TEST
728 
729 static inline bool
730 zone_has_index(zone_t z, zone_id_t zid)
731 {
732 	return zone_array + zid == z;
733 }
734 
735 __abortlike
736 void
737 zone_invalid_panic(zone_t zone)
738 {
739 	panic("zone %p isn't in the zone_array", zone);
740 }
741 
742 __abortlike
743 static void
744 zone_metadata_corruption(zone_t zone, struct zone_page_metadata *meta,
745     const char *kind)
746 {
747 	panic("zone metadata corruption: %s (meta %p, zone %s%s)",
748 	    kind, meta, zone_heap_name(zone), zone->z_name);
749 }
750 
751 __abortlike
752 static void
753 zone_invalid_element_addr_panic(zone_t zone, vm_offset_t addr)
754 {
755 	panic("zone element pointer validation failed (addr: %p, zone %s%s)",
756 	    (void *)addr, zone_heap_name(zone), zone->z_name);
757 }
758 
759 __abortlike
760 static void
761 zone_page_metadata_index_confusion_panic(zone_t zone, vm_offset_t addr,
762     struct zone_page_metadata *meta)
763 {
764 	zone_security_flags_t zsflags = zone_security_config(zone), src_zsflags;
765 	zone_id_t zidx;
766 	zone_t src_zone;
767 
768 	if (zsflags.z_kalloc_type) {
769 		panic_include_kalloc_types = true;
770 		kalloc_type_dst_zone = zone;
771 	}
772 
773 	zidx = meta->zm_index;
774 	if (zidx >= os_atomic_load(&num_zones, relaxed)) {
775 		panic("%p expected in zone %s%s[%d], but metadata has invalid zidx: %d",
776 		    (void *)addr, zone_heap_name(zone), zone->z_name, zone_index(zone),
777 		    zidx);
778 	}
779 
780 	src_zone = &zone_array[zidx];
781 	src_zsflags = zone_security_array[zidx];
782 	if (src_zsflags.z_kalloc_type) {
783 		panic_include_kalloc_types = true;
784 		kalloc_type_src_zone = src_zone;
785 	}
786 
787 	panic("%p not in the expected zone %s%s[%d], but found in %s%s[%d]",
788 	    (void *)addr, zone_heap_name(zone), zone->z_name, zone_index(zone),
789 	    zone_heap_name(src_zone), src_zone->z_name, zidx);
790 }
791 
792 __abortlike
793 static void
794 zone_page_metadata_list_corruption(zone_t zone, struct zone_page_metadata *meta)
795 {
796 	panic("metadata list corruption through element %p detected in zone %s%s",
797 	    meta, zone_heap_name(zone), zone->z_name);
798 }
799 
800 __abortlike
801 static void
802 zone_page_meta_accounting_panic(zone_t zone, struct zone_page_metadata *meta,
803     const char *kind)
804 {
805 	panic("accounting mismatch (%s) for zone %s%s, meta %p", kind,
806 	    zone_heap_name(zone), zone->z_name, meta);
807 }
808 
809 __abortlike
810 static void
811 zone_meta_double_free_panic(zone_t zone, vm_offset_t addr, const char *caller)
812 {
813 	panic("%s: double free of %p to zone %s%s", caller,
814 	    (void *)addr, zone_heap_name(zone), zone->z_name);
815 }
816 
817 __abortlike
818 static void
819 zone_accounting_panic(zone_t zone, const char *kind)
820 {
821 	panic("accounting mismatch (%s) for zone %s%s", kind,
822 	    zone_heap_name(zone), zone->z_name);
823 }
824 
825 #define zone_counter_sub(z, stat, value)  ({ \
826 	if (os_sub_overflow((z)->stat, value, &(z)->stat)) { \
827 	    zone_accounting_panic(z, #stat " wrap-around"); \
828 	} \
829 	(z)->stat; \
830 })
831 
832 static inline uint16_t
833 zone_meta_alloc_size_add(zone_t z, struct zone_page_metadata *m,
834     vm_offset_t esize)
835 {
836 	if (os_add_overflow(m->zm_alloc_size, (uint16_t)esize, &m->zm_alloc_size)) {
837 		zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around");
838 	}
839 	return m->zm_alloc_size;
840 }
841 
842 static inline uint16_t
843 zone_meta_alloc_size_sub(zone_t z, struct zone_page_metadata *m,
844     vm_offset_t esize)
845 {
846 	if (os_sub_overflow(m->zm_alloc_size, esize, &m->zm_alloc_size)) {
847 		zone_page_meta_accounting_panic(z, m, "alloc_size wrap-around");
848 	}
849 	return m->zm_alloc_size;
850 }
851 
852 __abortlike
853 static void
854 zone_nofail_panic(zone_t zone)
855 {
856 	panic("zalloc(Z_NOFAIL) can't be satisfied for zone %s%s (potential leak)",
857 	    zone_heap_name(zone), zone->z_name);
858 }
859 
860 __header_always_inline bool
861 zone_spans_ro_va(vm_offset_t addr_start, vm_offset_t addr_end)
862 {
863 	const struct mach_vm_range *ro_r = &zone_info.zi_ro_range;
864 	struct mach_vm_range r = { addr_start, addr_end };
865 
866 	return mach_vm_range_intersects(ro_r, &r);
867 }
868 
869 #define from_range(r, addr, size) \
870 	__builtin_choose_expr(__builtin_constant_p(size) ? (size) == 1 : 0, \
871 	mach_vm_range_contains(r, (mach_vm_offset_t)(addr)), \
872 	mach_vm_range_contains(r, (mach_vm_offset_t)(addr), size))
873 
874 #define from_ro_map(addr, size) \
875 	from_range(&zone_info.zi_ro_range, addr, size)
876 
877 #define from_zone_map(addr, size) \
878 	from_range(&zone_info.zi_map_range, addr, size)
879 
880 __header_always_inline bool
881 zone_pva_is_null(zone_pva_t page)
882 {
883 	return page.packed_address == 0;
884 }
885 
886 __header_always_inline bool
887 zone_pva_is_queue(zone_pva_t page)
888 {
889 	// actual kernel pages have the top bit set
890 	return (int32_t)page.packed_address > 0;
891 }
892 
893 __header_always_inline bool
894 zone_pva_is_equal(zone_pva_t pva1, zone_pva_t pva2)
895 {
896 	return pva1.packed_address == pva2.packed_address;
897 }
898 
899 __header_always_inline zone_pva_t *
900 zone_pageq_base(void)
901 {
902 	extern zone_pva_t data_seg_start[] __SEGMENT_START_SYM("__DATA");
903 
904 	/*
905 	 * `-1` so that if the first __DATA variable is a page queue,
906 	 * it gets a non-zero index
907 	 */
908 	return data_seg_start - 1;
909 }
910 
911 __header_always_inline void
912 zone_queue_set_head(zone_t z, zone_pva_t queue, zone_pva_t oldv,
913     struct zone_page_metadata *meta)
914 {
915 	zone_pva_t *queue_head = &zone_pageq_base()[queue.packed_address];
916 
917 	if (!zone_pva_is_equal(*queue_head, oldv)) {
918 		zone_page_metadata_list_corruption(z, meta);
919 	}
920 	*queue_head = meta->zm_page_next;
921 }
922 
923 __header_always_inline zone_pva_t
924 zone_queue_encode(zone_pva_t *headp)
925 {
926 	return (zone_pva_t){ (uint32_t)(headp - zone_pageq_base()) };
927 }
928 
929 __header_always_inline zone_pva_t
930 zone_pva_from_addr(vm_address_t addr)
931 {
932 	// cannot use atop() because we want to maintain the sign bit
933 	return (zone_pva_t){ (uint32_t)((intptr_t)addr >> PAGE_SHIFT) };
934 }
935 
936 __header_always_inline vm_address_t
937 zone_pva_to_addr(zone_pva_t page)
938 {
939 	// cause sign extension so that we end up with the right address
940 	return (vm_offset_t)(int32_t)page.packed_address << PAGE_SHIFT;
941 }
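/*
 * Illustrative round-trip (assuming PAGE_SHIFT == 14, i.e. 16K pages):
 * a kernel address such as 0xffffff8012344000 packs to
 * (uint32_t)(0xffffff8012344000 >> 14) == 0xfe0048d1, which is negative as
 * an int32_t; sign-extending it and shifting left by 14 recovers the
 * original address. Queue heads, by contrast, are encoded as small positive
 * offsets from zone_pageq_base(), which is how zone_pva_is_queue() tells
 * the two apart.
 */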
942 
943 __header_always_inline struct zone_page_metadata *
944 zone_pva_to_meta(zone_pva_t page)
945 {
946 	return &zone_info.zi_meta_base[page.packed_address];
947 }
948 
949 __header_always_inline zone_pva_t
950 zone_pva_from_meta(struct zone_page_metadata *meta)
951 {
952 	return (zone_pva_t){ (uint32_t)(meta - zone_info.zi_meta_base) };
953 }
954 
955 __header_always_inline struct zone_page_metadata *
956 zone_meta_from_addr(vm_offset_t addr)
957 {
958 	return zone_pva_to_meta(zone_pva_from_addr(addr));
959 }
960 
961 __header_always_inline zone_id_t
962 zone_index_from_ptr(const void *ptr)
963 {
964 	return zone_pva_to_meta(zone_pva_from_addr((vm_offset_t)ptr))->zm_index;
965 }
966 
967 __header_always_inline vm_offset_t
968 zone_meta_to_addr(struct zone_page_metadata *meta)
969 {
970 	return ptoa((int32_t)(meta - zone_info.zi_meta_base));
971 }
972 
973 __attribute__((overloadable))
974 __header_always_inline void
975 zone_meta_validate(zone_t z, struct zone_page_metadata *meta, vm_address_t addr)
976 {
977 	if (!zone_has_index(z, meta->zm_index)) {
978 		zone_page_metadata_index_confusion_panic(z, addr, meta);
979 	}
980 }
981 
982 __attribute__((overloadable))
983 __header_always_inline void
984 zone_meta_validate(zone_t z, struct zone_page_metadata *meta)
985 {
986 	zone_meta_validate(z, meta, zone_meta_to_addr(meta));
987 }
988 
989 __header_always_inline void
990 zone_meta_queue_push(zone_t z, zone_pva_t *headp,
991     struct zone_page_metadata *meta)
992 {
993 	zone_pva_t head = *headp;
994 	zone_pva_t queue_pva = zone_queue_encode(headp);
995 	struct zone_page_metadata *tmp;
996 
997 	meta->zm_page_next = head;
998 	if (!zone_pva_is_null(head)) {
999 		tmp = zone_pva_to_meta(head);
1000 		if (!zone_pva_is_equal(tmp->zm_page_prev, queue_pva)) {
1001 			zone_page_metadata_list_corruption(z, meta);
1002 		}
1003 		tmp->zm_page_prev = zone_pva_from_meta(meta);
1004 	}
1005 	meta->zm_page_prev = queue_pva;
1006 	*headp = zone_pva_from_meta(meta);
1007 }
1008 
1009 __header_always_inline struct zone_page_metadata *
1010 zone_meta_queue_pop(zone_t z, zone_pva_t *headp)
1011 {
1012 	zone_pva_t head = *headp;
1013 	struct zone_page_metadata *meta = zone_pva_to_meta(head);
1014 	struct zone_page_metadata *tmp;
1015 
1016 	zone_meta_validate(z, meta);
1017 
1018 	if (!zone_pva_is_null(meta->zm_page_next)) {
1019 		tmp = zone_pva_to_meta(meta->zm_page_next);
1020 		if (!zone_pva_is_equal(tmp->zm_page_prev, head)) {
1021 			zone_page_metadata_list_corruption(z, meta);
1022 		}
1023 		tmp->zm_page_prev = meta->zm_page_prev;
1024 	}
1025 	*headp = meta->zm_page_next;
1026 
1027 	meta->zm_page_next = meta->zm_page_prev = (zone_pva_t){ 0 };
1028 
1029 	return meta;
1030 }
1031 
1032 __header_always_inline void
1033 zone_meta_remqueue(zone_t z, struct zone_page_metadata *meta)
1034 {
1035 	zone_pva_t meta_pva = zone_pva_from_meta(meta);
1036 	struct zone_page_metadata *tmp;
1037 
1038 	if (!zone_pva_is_null(meta->zm_page_next)) {
1039 		tmp = zone_pva_to_meta(meta->zm_page_next);
1040 		if (!zone_pva_is_equal(tmp->zm_page_prev, meta_pva)) {
1041 			zone_page_metadata_list_corruption(z, meta);
1042 		}
1043 		tmp->zm_page_prev = meta->zm_page_prev;
1044 	}
1045 	if (zone_pva_is_queue(meta->zm_page_prev)) {
1046 		zone_queue_set_head(z, meta->zm_page_prev, meta_pva, meta);
1047 	} else {
1048 		tmp = zone_pva_to_meta(meta->zm_page_prev);
1049 		if (!zone_pva_is_equal(tmp->zm_page_next, meta_pva)) {
1050 			zone_page_metadata_list_corruption(z, meta);
1051 		}
1052 		tmp->zm_page_next = meta->zm_page_next;
1053 	}
1054 
1055 	meta->zm_page_next = meta->zm_page_prev = (zone_pva_t){ 0 };
1056 }
1057 
1058 __header_always_inline void
1059 zone_meta_requeue(zone_t z, zone_pva_t *headp,
1060     struct zone_page_metadata *meta)
1061 {
1062 	zone_meta_remqueue(z, meta);
1063 	zone_meta_queue_push(z, headp, meta);
1064 }
1065 
1066 /* prevents a given metadata from ever reaching the z_pageq_empty queue */
1067 static inline void
1068 zone_meta_lock_in_partial(zone_t z, struct zone_page_metadata *m, uint32_t len)
1069 {
1070 	uint16_t new_size = zone_meta_alloc_size_add(z, m, ZM_ALLOC_SIZE_LOCK);
1071 
1072 	assert(new_size % sizeof(vm_offset_t) == ZM_ALLOC_SIZE_LOCK);
1073 	if (new_size == ZM_ALLOC_SIZE_LOCK) {
1074 		zone_meta_requeue(z, &z->z_pageq_partial, m);
1075 		zone_counter_sub(z, z_wired_empty, len);
1076 	}
1077 }
1078 
1079 /* allows a given metadata to reach the z_pageq_empty queue again */
1080 static inline void
1081 zone_meta_unlock_from_partial(zone_t z, struct zone_page_metadata *m, uint32_t len)
1082 {
1083 	uint16_t new_size = zone_meta_alloc_size_sub(z, m, ZM_ALLOC_SIZE_LOCK);
1084 
1085 	assert(new_size % sizeof(vm_offset_t) == 0);
1086 	if (new_size == 0) {
1087 		zone_meta_requeue(z, &z->z_pageq_empty, m);
1088 		z->z_wired_empty += len;
1089 	}
1090 }
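/*
 * Note on the trick above: the asserts rely on real allocation sizes being
 * multiples of sizeof(vm_offset_t), so zm_alloc_size normally stays
 * pointer-aligned. Adding ZM_ALLOC_SIZE_LOCK (1) makes it odd, a value no
 * combination of real allocations can produce; while that bit is set the
 * chunk is pinned on z_pageq_partial and can never migrate to z_pageq_empty.
 */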
1091 
1092 /*
1093  * Routine to populate a page backing metadata in the zone_metadata_region.
1094  * Must be called without the zone lock held as it might potentially block.
1095  */
1096 static void
1097 zone_meta_populate(vm_offset_t base, vm_size_t size)
1098 {
1099 	struct zone_page_metadata *from = zone_meta_from_addr(base);
1100 	struct zone_page_metadata *to   = from + atop(size);
1101 	vm_offset_t page_addr = trunc_page(from);
1102 
1103 	for (; page_addr < (vm_offset_t)to; page_addr += PAGE_SIZE) {
1104 #if !KASAN
1105 		/*
1106 		 * This can race with another thread doing a populate on the same metadata
1107 		 * page, where we see an updated pmap but unmapped KASan shadow, causing a
1108 		 * fault in the shadow when we first access the metadata page. Avoid this
1109 		 * by always synchronizing on the zone_metadata_region lock with KASan.
1110 		 */
1111 		if (pmap_find_phys(kernel_pmap, page_addr)) {
1112 			continue;
1113 		}
1114 #endif
1115 
1116 		for (;;) {
1117 			kern_return_t ret = KERN_SUCCESS;
1118 
1119 			/*
1120 			 * All updates to the zone_metadata_region are done
1121 			 * under the zone_metadata_region_lck
1122 			 */
1123 			zone_meta_lock();
1124 			if (0 == pmap_find_phys(kernel_pmap, page_addr)) {
1125 				ret = kernel_memory_populate(page_addr,
1126 				    PAGE_SIZE, KMA_NOPAGEWAIT | KMA_KOBJECT | KMA_ZERO,
1127 				    VM_KERN_MEMORY_OSFMK);
1128 			}
1129 			zone_meta_unlock();
1130 
1131 			if (ret == KERN_SUCCESS) {
1132 				break;
1133 			}
1134 
1135 			/*
1136 			 * We can't pass KMA_NOPAGEWAIT under a global lock as it leads
1137 			 * to bad system deadlocks, so if the allocation failed,
1138 			 * we need to do the VM_PAGE_WAIT() outside of the lock.
1139 			 */
1140 			VM_PAGE_WAIT();
1141 		}
1142 	}
1143 }
1144 
1145 __abortlike
1146 static void
1147 zone_invalid_element_panic(zone_t zone, vm_offset_t addr)
1148 {
1149 	struct zone_page_metadata *meta;
1150 	const char *from_cache = "";
1151 	vm_offset_t page;
1152 
1153 	if (!from_zone_map(addr, zone_elem_inner_size(zone))) {
1154 		panic("addr %p being freed to zone %s%s%s, isn't from zone map",
1155 		    (void *)addr, zone_heap_name(zone), zone->z_name, from_cache);
1156 	}
1157 	page = trunc_page(addr);
1158 	meta = zone_meta_from_addr(addr);
1159 
1160 	if (!zone_has_index(zone, meta->zm_index)) {
1161 		zone_page_metadata_index_confusion_panic(zone, addr, meta);
1162 	}
1163 
1164 	if (meta->zm_chunk_len == ZM_SECONDARY_PCPU_PAGE) {
1165 		panic("metadata %p corresponding to addr %p being freed to "
1166 		    "zone %s%s%s, is marked as secondary per cpu page",
1167 		    meta, (void *)addr, zone_heap_name(zone), zone->z_name,
1168 		    from_cache);
1169 	}
1170 	if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
1171 		page -= ptoa(meta->zm_page_index);
1172 		meta -= meta->zm_page_index;
1173 	}
1174 
1175 	if (meta->zm_chunk_len > ZM_CHUNK_LEN_MAX) {
1176 		panic("metadata %p corresponding to addr %p being freed to "
1177 		    "zone %s%s%s, has chunk len greater than max",
1178 		    meta, (void *)addr, zone_heap_name(zone), zone->z_name,
1179 		    from_cache);
1180 	}
1181 
1182 	if ((addr - zone_elem_inner_offs(zone) - page) % zone_elem_outer_size(zone)) {
1183 		panic("addr %p being freed to zone %s%s%s, isn't aligned to "
1184 		    "zone element size", (void *)addr, zone_heap_name(zone),
1185 		    zone->z_name, from_cache);
1186 	}
1187 
1188 	zone_invalid_element_addr_panic(zone, addr);
1189 }
1190 
1191 __attribute__((always_inline))
1192 static struct zone_page_metadata *
1193 zone_element_resolve(
1194 	zone_t                  zone,
1195 	vm_offset_t             addr,
1196 	vm_offset_t            *idx)
1197 {
1198 	struct zone_page_metadata *meta;
1199 	vm_offset_t offs, eidx;
1200 
1201 	meta = zone_meta_from_addr(addr);
1202 	if (!from_zone_map(addr, 1) || !zone_has_index(zone, meta->zm_index)) {
1203 		zone_invalid_element_panic(zone, addr);
1204 	}
1205 
1206 	offs = (addr & PAGE_MASK) - zone_elem_inner_offs(zone);
1207 	if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
1208 		offs += ptoa(meta->zm_page_index);
1209 		meta -= meta->zm_page_index;
1210 	}
1211 
1212 	eidx = Z_FAST_QUO(offs, zone->z_quo_magic);
1213 	if (eidx * zone_elem_outer_size(zone) != offs) {
1214 		zone_invalid_element_panic(zone, addr);
1215 	}
1216 
1217 	*idx = eidx;
1218 	return meta;
1219 }
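/*
 * Illustrative example (hypothetical sizes): for a zone whose outer element
 * size is 48 bytes, an element starting 96 bytes past the first element
 * offset resolves to eidx == 2 and the multiply-back check (2 * 48 == 96)
 * passes; a misaligned pointer 100 bytes in would also yield eidx == 2, but
 * 2 * 48 != 100, so zone_invalid_element_panic() fires. Z_FAST_QUO divides
 * by the outer element size using the precomputed z_quo_magic reciprocal.
 */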
1220 
1221 #if ZSECURITY_CONFIG(PGZ_OOB_ADJUST)
1222 void *
1223 zone_element_pgz_oob_adjust(void *ptr, vm_size_t req_size, vm_size_t elem_size)
1224 {
1225 	vm_offset_t addr = (vm_offset_t)ptr;
1226 	vm_offset_t end = addr + elem_size;
1227 	vm_offset_t offs;
1228 
1229 	/*
1230 	 * 0-sized allocations in a KALLOC_MINSIZE bucket
1231 	 * would be offset to the next allocation which is incorrect.
1232 	 */
1233 	req_size = MAX(roundup(req_size, KALLOC_MINALIGN), KALLOC_MINALIGN);
1234 
1235 	/*
1236 	 * Given how chunks work, for a zone with PGZ guards on,
1237 	 * there's a single element which ends precisely
1238 	 * at the page boundary: the last one.
1239 	 */
1240 	if (req_size == elem_size ||
1241 	    (end & PAGE_MASK) ||
1242 	    !zone_meta_from_addr(addr)->zm_guarded) {
1243 		return ptr;
1244 	}
1245 
1246 	offs = elem_size - req_size;
1247 	zone_meta_from_addr(end)->zm_oob_offs = (uint16_t)offs;
1248 
1249 	return (char *)addr + offs;
1250 }
1251 #endif /* ZSECURITY_CONFIG(PGZ_OOB_ADJUST) */
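/*
 * Worked example (illustrative numbers, assuming KALLOC_MINALIGN == 16):
 * a 40-byte request served from a guarded 64-byte element that ends at the
 * page boundary is first rounded up to 48 bytes, then moved forward by
 * offs = 64 - 48 = 16 bytes so that it ends right against the guard page;
 * the offset is stashed in the guard page's zm_oob_offs so that
 * zone_element_size() can undo the adjustment later.
 */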
1252 
1253 __abortlike
1254 static void
1255 zone_element_bounds_check_panic(vm_address_t addr, vm_size_t len)
1256 {
1257 	struct zone_page_metadata *meta;
1258 	vm_offset_t offs, size, page;
1259 	zone_t      zone;
1260 
1261 	page = trunc_page(addr);
1262 	meta = zone_meta_from_addr(addr);
1263 	zone = &zone_array[meta->zm_index];
1264 
1265 	if (zone->z_percpu) {
1266 		panic("zone bound checks: address %p is a per-cpu allocation",
1267 		    (void *)addr);
1268 	}
1269 
1270 	if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
1271 		page -= ptoa(meta->zm_page_index);
1272 		meta -= meta->zm_page_index;
1273 	}
1274 
1275 	size = zone_elem_outer_size(zone);
1276 	offs = Z_FAST_MOD(addr - zone_elem_inner_offs(zone) - page + size,
1277 	    zone->z_quo_magic, size);
1278 	panic("zone bound checks: buffer %p of length %zd overflows "
1279 	    "object %p of size %zd in zone %p[%s%s]",
1280 	    (void *)addr, len, (void *)(addr - offs - zone_elem_redzone(zone)),
1281 	    zone_elem_inner_size(zone), zone, zone_heap_name(zone), zone_name(zone));
1282 }
1283 
1284 void
1285 zone_element_bounds_check(vm_address_t addr, vm_size_t len)
1286 {
1287 	struct zone_page_metadata *meta;
1288 	vm_offset_t offs, size;
1289 	zone_t      zone;
1290 
1291 	if (!from_zone_map(addr, 1)) {
1292 		return;
1293 	}
1294 
1295 #if CONFIG_PROB_GZALLOC
1296 	if (__improbable(pgz_owned(addr))) {
1297 		meta = zone_meta_from_addr(addr);
1298 		addr = trunc_page(meta->zm_pgz_orig_addr) + (addr & PAGE_MASK);
1299 	}
1300 #endif /* CONFIG_PROB_GZALLOC */
1301 	meta = zone_meta_from_addr(addr);
1302 	zone = zone_by_id(meta->zm_index);
1303 
1304 	if (zone->z_percpu) {
1305 		zone_element_bounds_check_panic(addr, len);
1306 	}
1307 
1308 	if (zone->z_permanent) {
1309 		/* We don't know bounds for those */
1310 		return;
1311 	}
1312 
1313 	offs = (addr & PAGE_MASK) - zone_elem_inner_offs(zone);
1314 	if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
1315 		offs += ptoa(meta->zm_page_index);
1316 	}
1317 	size = zone_elem_outer_size(zone);
1318 	offs = Z_FAST_MOD(offs + size, zone->z_quo_magic, size);
1319 	if (len + zone_elem_redzone(zone) > size - offs) {
1320 		zone_element_bounds_check_panic(addr, len);
1321 	}
1322 }
1323 
1324 /*
1325  * Routine to get the size of a zone allocated address.
1326  * If the address doesn't belong to the zone maps, returns 0.
1327  */
1328 vm_size_t
1329 zone_element_size(void *elem, zone_t *z, bool clear_oob, vm_offset_t *oob_offs)
1330 {
1331 	vm_address_t addr = (vm_address_t)elem;
1332 	struct zone_page_metadata *meta;
1333 	vm_size_t esize, offs, end;
1334 	zone_t zone;
1335 
1336 	if (from_zone_map(addr, sizeof(void *))) {
1337 		meta  = zone_meta_from_addr(addr);
1338 		zone  = zone_by_id(meta->zm_index);
1339 		esize = zone_elem_inner_size(zone);
1340 		end   = vm_memtag_canonicalize_address(addr + esize);
1341 		offs  = 0;
1342 
1343 #if ZSECURITY_CONFIG(PGZ_OOB_ADJUST)
1344 		/*
1345 		 * If the chunk uses guards, and (addr + esize)
1346 		 * either crosses a page boundary or is at the boundary,
1347 		 * we need to look harder.
1348 		 */
1349 		if (oob_offs && meta->zm_guarded && atop(addr ^ end)) {
1350 			/*
1351 			 * Because in the vast majority of cases the element
1352 			 * size is sub-page, and that meta[1] must be faulted,
1353 			 * we can quickly peek at whether it's a guard.
1354 			 *
1355 			 * For elements larger than a page, finding the guard
1356 			 * page requires a little more effort.
1357 			 */
1358 			if (meta[1].zm_chunk_len == ZM_PGZ_GUARD) {
1359 				offs = meta[1].zm_oob_offs;
1360 				if (clear_oob) {
1361 					meta[1].zm_oob_offs = 0;
1362 				}
1363 			} else if (esize > PAGE_SIZE) {
1364 				struct zone_page_metadata *gmeta;
1365 
1366 				if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
1367 					gmeta = meta + meta->zm_subchunk_len;
1368 				} else {
1369 					gmeta = meta + zone->z_chunk_pages;
1370 				}
1371 				assert(gmeta->zm_chunk_len == ZM_PGZ_GUARD);
1372 
1373 				if (end >= zone_meta_to_addr(gmeta)) {
1374 					offs = gmeta->zm_oob_offs;
1375 					if (clear_oob) {
1376 						gmeta->zm_oob_offs = 0;
1377 					}
1378 				}
1379 			}
1380 		}
1381 #else
1382 #pragma unused(end, clear_oob)
1383 #endif /* ZSECURITY_CONFIG(PGZ_OOB_ADJUST) */
1384 
1385 		if (oob_offs) {
1386 			*oob_offs = offs;
1387 		}
1388 		if (z) {
1389 			*z = zone;
1390 		}
1391 		return esize;
1392 	}
1393 
1394 	if (oob_offs) {
1395 		*oob_offs = 0;
1396 	}
1397 
1398 	return 0;
1399 }
1400 
1401 zone_id_t
1402 zone_id_for_element(void *addr, vm_size_t esize)
1403 {
1404 	zone_id_t zid = ZONE_ID_INVALID;
1405 	if (from_zone_map(addr, esize)) {
1406 		zid = zone_index_from_ptr(addr);
1407 		__builtin_assume(zid != ZONE_ID_INVALID);
1408 	}
1409 	return zid;
1410 }
1411 
1412 /* This function just formats the reason for the panics by redoing the checks */
1413 __abortlike
1414 static void
1415 zone_require_panic(zone_t zone, void *addr)
1416 {
1417 	uint32_t zindex;
1418 	zone_t other;
1419 
1420 	if (!from_zone_map(addr, zone_elem_inner_size(zone))) {
1421 		panic("zone_require failed: address not in a zone (addr: %p)", addr);
1422 	}
1423 
1424 	zindex = zone_index_from_ptr(addr);
1425 	other = &zone_array[zindex];
1426 	if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) {
1427 		panic("zone_require failed: invalid zone index %d "
1428 		    "(addr: %p, expected: %s%s)", zindex,
1429 		    addr, zone_heap_name(zone), zone->z_name);
1430 	} else {
1431 		panic("zone_require failed: address in unexpected zone id %d (%s%s) "
1432 		    "(addr: %p, expected: %s%s)",
1433 		    zindex, zone_heap_name(other), other->z_name,
1434 		    addr, zone_heap_name(zone), zone->z_name);
1435 	}
1436 }
1437 
1438 __abortlike
1439 static void
1440 zone_id_require_panic(zone_id_t zid, void *addr)
1441 {
1442 	zone_require_panic(&zone_array[zid], addr);
1443 }
1444 
1445 /*
1446  * Routines to panic if a pointer is not mapped to an expected zone.
1447  * This can be used as a means of pinning an object to the zone it is expected
1448  * to be a part of.  Causes a panic if the address does not belong to any
1449  * specified zone, does not belong to any zone, has been freed and therefore
1450  * unmapped from the zone, or the pointer contains an uninitialized value that
1451  * does not belong to any zone.
1452  */
1453 void
1454 zone_require(zone_t zone, void *addr)
1455 {
1456 	vm_size_t esize = zone_elem_inner_size(zone);
1457 
1458 	if (from_zone_map(addr, esize) &&
1459 	    zone_has_index(zone, zone_index_from_ptr(addr))) {
1460 		return;
1461 	}
1462 	zone_require_panic(zone, addr);
1463 }
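/*
 * For example (illustrative use only), a caller handed a vm_object_t it does
 * not fully trust could pin it to its expected zone with:
 *
 *	zone_require(vm_object_zone, object);
 *
 * which panics unless `object` points into vm_object_zone.
 */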
1464 
1465 void
1466 zone_id_require(zone_id_t zid, vm_size_t esize, void *addr)
1467 {
1468 	if (from_zone_map(addr, esize) && zid == zone_index_from_ptr(addr)) {
1469 		return;
1470 	}
1471 	zone_id_require_panic(zid, addr);
1472 }
1473 
1474 void
1475 zone_id_require_aligned(zone_id_t zid, void *addr)
1476 {
1477 	zone_t zone = zone_by_id(zid);
1478 	vm_offset_t elem, offs;
1479 
1480 	elem = (vm_offset_t)addr;
1481 	offs = (elem & PAGE_MASK) - zone_elem_inner_offs(zone);
1482 
1483 	if (from_zone_map(addr, 1)) {
1484 		struct zone_page_metadata *meta;
1485 
1486 		meta = zone_meta_from_addr(elem);
1487 		if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
1488 			offs += ptoa(meta->zm_page_index);
1489 		}
1490 
1491 		if (zid == meta->zm_index &&
1492 		    Z_FAST_ALIGNED(offs, zone->z_align_magic)) {
1493 			return;
1494 		}
1495 	}
1496 
1497 	zone_invalid_element_panic(zone, elem);
1498 }
1499 
1500 bool
1501 zone_owns(zone_t zone, void *addr)
1502 {
1503 	vm_size_t esize = zone_elem_inner_size(zone);
1504 
1505 	if (from_zone_map(addr, esize)) {
1506 		return zone_has_index(zone, zone_index_from_ptr(addr));
1507 	}
1508 	return false;
1509 }
1510 
1511 static inline struct mach_vm_range
1512 zone_kmem_suballoc(
1513 	mach_vm_offset_t        addr,
1514 	vm_size_t               size,
1515 	int                     flags,
1516 	vm_tag_t                tag,
1517 	vm_map_t                *new_map)
1518 {
1519 	struct mach_vm_range r;
1520 
1521 	*new_map = kmem_suballoc(kernel_map, &addr, size,
1522 	    VM_MAP_CREATE_NEVER_FAULTS | VM_MAP_CREATE_DISABLE_HOLELIST,
1523 	    flags, KMS_PERMANENT | KMS_NOFAIL, tag).kmr_submap;
1524 
1525 	r.min_address = addr;
1526 	r.max_address = addr + size;
1527 	return r;
1528 }
1529 
1530 #endif /* !ZALLOC_TEST */
1531 #pragma mark Zone bits allocator
1532 
1533 /*!
1534  * @defgroup Zone Bitmap allocator
1535  * @{
1536  *
1537  * @brief
1538  * Functions implementing the zone bitmap allocator
1539  *
1540  * @discussion
1541  * The zone allocator maintains which elements are allocated or free in bitmaps.
1542  *
1543  * When the number of elements per page is smaller than 32, it is stored inline
1544  * on the @c zone_page_metadata structure (@c zm_inline_bitmap is set,
1545  * and @c zm_bitmap used for storage).
1546  *
1547  * When the number of elements is larger, then a bitmap is allocated from
1548  * a buddy allocator (implemented under the @c zba_* namespace). Pointers
1549  * to bitmaps are implemented as a packed 32 bit bitmap reference, stored in
1550  * @c zm_bitmap. The low 3 bits encode the scale (order) of the allocation in
1551  * @c ZBA_GRANULE units, and hence actual allocations encoded with that scheme
1552  * cannot be larger than 1024 bytes (8192 bits).
1553  *
1554  * This buddy allocator can actually accommodate allocations as large
1555  * as 8k on 16k systems and 2k on 4k systems.
1556  *
1557  * Note: @c zba_* functions are implementation details not meant to be used
1558  * outside of the implementation of the allocator itself. Interfaces to the rest of
1559  * the zone allocator are documented and not @c zba_* prefixed.
1560  */
1561 
1562 #define ZBA_CHUNK_SIZE          PAGE_MAX_SIZE
1563 #define ZBA_GRANULE             sizeof(uint64_t)
1564 #define ZBA_GRANULE_BITS        (8 * sizeof(uint64_t))
1565 #define ZBA_MAX_ORDER           (PAGE_MAX_SHIFT - 4)
1566 #define ZBA_MAX_ALLOC_ORDER     7
1567 #define ZBA_SLOTS               (ZBA_CHUNK_SIZE / ZBA_GRANULE)
1568 #define ZBA_HEADS_COUNT         (ZBA_MAX_ORDER + 1)
1569 #define ZBA_PTR_MASK            0x0fffffff
1570 #define ZBA_ORDER_SHIFT         29
1571 #define ZBA_HAS_EXTRA_BIT       0x10000000
1572 
1573 static_assert(2ul * ZBA_GRANULE << ZBA_MAX_ORDER == ZBA_CHUNK_SIZE, "chunk sizes");
1574 static_assert(ZBA_MAX_ALLOC_ORDER <= ZBA_MAX_ORDER, "ZBA_MAX_ORDER is enough");
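
/*
 * Illustrative sketch (not part of the allocator): how a packed 32 bit
 * bitmap reference laid out with the constants above decodes. The names
 * mirror the zba_bits_ref_*() helpers defined further down in this file.
 *
 *	uint32_t order = bref >> ZBA_ORDER_SHIFT;                  // top 3 bits: buddy order
 *	bool     xtra  = (bref & ZBA_HAS_EXTRA_BIT) != 0;          // bit 28: has VM-tag storage
 *	uint64_t *bits = zba_slot_base() + (bref & ZBA_PTR_MASK);  // low 28 bits: slot index
 *
 * The bitmap itself then spans (ZBA_GRANULE << order) bytes.
 */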
1575 
1576 struct zone_bits_chain {
1577 	uint32_t zbc_next;
1578 	uint32_t zbc_prev;
1579 } __attribute__((aligned(ZBA_GRANULE)));
1580 
1581 struct zone_bits_head {
1582 	uint32_t zbh_next;
1583 	uint32_t zbh_unused;
1584 } __attribute__((aligned(ZBA_GRANULE)));
1585 
1586 static_assert(sizeof(struct zone_bits_chain) == ZBA_GRANULE, "zbc size");
1587 static_assert(sizeof(struct zone_bits_head) == ZBA_GRANULE, "zbh size");
1588 
1589 struct zone_bits_allocator_meta {
1590 	uint32_t  zbam_left;
1591 	uint32_t  zbam_right;
1592 	struct zone_bits_head zbam_lists[ZBA_HEADS_COUNT];
1593 	struct zone_bits_head zbam_lists_with_extra[ZBA_HEADS_COUNT];
1594 };
1595 
1596 struct zone_bits_allocator_header {
1597 	uint64_t zbah_bits[ZBA_SLOTS / (8 * sizeof(uint64_t))];
1598 };
1599 
1600 #if ZALLOC_TEST
1601 static struct zalloc_bits_allocator_test_setup {
1602 	vm_offset_t zbats_base;
1603 	void      (*zbats_populate)(vm_address_t addr, vm_size_t size);
1604 } zba_test_info;
1605 
1606 static struct zone_bits_allocator_header *
1607 zba_base_header(void)
1608 {
1609 	return (struct zone_bits_allocator_header *)zba_test_info.zbats_base;
1610 }
1611 
1612 static kern_return_t
1613 zba_populate(uint32_t n, bool with_extra __unused)
1614 {
1615 	vm_address_t base = zba_test_info.zbats_base;
1616 	zba_test_info.zbats_populate(base + n * ZBA_CHUNK_SIZE, ZBA_CHUNK_SIZE);
1617 
1618 	return KERN_SUCCESS;
1619 }
1620 #else
1621 __startup_data __attribute__((aligned(ZBA_CHUNK_SIZE)))
1622 static uint8_t zba_chunk_startup[ZBA_CHUNK_SIZE];
1623 
1624 static SECURITY_READ_ONLY_LATE(uint8_t) zba_xtra_shift;
1625 static LCK_MTX_DECLARE(zba_mtx, &zone_locks_grp);
1626 
1627 static struct zone_bits_allocator_header *
1628 zba_base_header(void)
1629 {
1630 	return (struct zone_bits_allocator_header *)zone_info.zi_bits_range.min_address;
1631 }
1632 
1633 static void
1634 zba_lock(void)
1635 {
1636 	lck_mtx_lock(&zba_mtx);
1637 }
1638 
1639 static void
1640 zba_unlock(void)
1641 {
1642 	lck_mtx_unlock(&zba_mtx);
1643 }
1644 
1645 __abortlike
1646 static void
1647 zba_memory_exhausted(void)
1648 {
1649 	uint64_t zsize = 0;
1650 	zone_t z = zone_find_largest(&zsize);
1651 	panic("zba_populate: out of bitmap space, "
1652 	    "likely due to memory leak in zone [%s%s] "
1653 	    "(%u%c, %d elements allocated)",
1654 	    zone_heap_name(z), zone_name(z),
1655 	    mach_vm_size_pretty(zsize), mach_vm_size_unit(zsize),
1656 	    zone_count_allocated(z));
1657 }
1658 
1659 
1660 static kern_return_t
1661 zba_populate(uint32_t n, bool with_extra)
1662 {
1663 	vm_size_t bits_size = ZBA_CHUNK_SIZE;
1664 	vm_size_t xtra_size = bits_size * CHAR_BIT << zba_xtra_shift;
1665 	vm_address_t bits_addr;
1666 	vm_address_t xtra_addr;
1667 	kern_return_t kr;
1668 
1669 	bits_addr = zone_info.zi_bits_range.min_address + n * bits_size;
1670 	xtra_addr = zone_info.zi_xtra_range.min_address + n * xtra_size;
1671 
1672 	kr = kernel_memory_populate(bits_addr, bits_size,
1673 	    KMA_ZERO | KMA_KOBJECT | KMA_NOPAGEWAIT,
1674 	    VM_KERN_MEMORY_OSFMK);
1675 	if (kr != KERN_SUCCESS) {
1676 		return kr;
1677 	}
1678 
1679 
1680 	if (with_extra) {
1681 		kr = kernel_memory_populate(xtra_addr, xtra_size,
1682 		    KMA_ZERO | KMA_KOBJECT | KMA_NOPAGEWAIT,
1683 		    VM_KERN_MEMORY_OSFMK);
1684 		if (kr != KERN_SUCCESS) {
1685 			kernel_memory_depopulate(bits_addr, bits_size,
1686 			    KMA_ZERO | KMA_KOBJECT | KMA_NOPAGEWAIT,
1687 			    VM_KERN_MEMORY_OSFMK);
1688 		}
1689 	}
1690 
1691 	return kr;
1692 }
1693 #endif
1694 
1695 __pure2
1696 static struct zone_bits_allocator_meta *
1697 zba_meta(void)
1698 {
1699 	return (struct zone_bits_allocator_meta *)&zba_base_header()[1];
1700 }
1701 
1702 __pure2
1703 static uint64_t *
1704 zba_slot_base(void)
1705 {
1706 	return (uint64_t *)zba_base_header();
1707 }
1708 
1709 __pure2
1710 static struct zone_bits_head *
1711 zba_head(uint32_t order, bool with_extra)
1712 {
1713 	if (with_extra) {
1714 		return &zba_meta()->zbam_lists_with_extra[order];
1715 	} else {
1716 		return &zba_meta()->zbam_lists[order];
1717 	}
1718 }
1719 
1720 __pure2
1721 static uint32_t
1722 zba_head_index(struct zone_bits_head *hd)
1723 {
1724 	return (uint32_t)((uint64_t *)hd - zba_slot_base());
1725 }
1726 
1727 __pure2
1728 static struct zone_bits_chain *
1729 zba_chain_for_index(uint32_t index)
1730 {
1731 	return (struct zone_bits_chain *)(zba_slot_base() + index);
1732 }
1733 
1734 __pure2
1735 static uint32_t
1736 zba_chain_to_index(const struct zone_bits_chain *zbc)
1737 {
1738 	return (uint32_t)((const uint64_t *)zbc - zba_slot_base());
1739 }
1740 
1741 __abortlike
1742 static void
1743 zba_head_corruption_panic(uint32_t order, bool with_extra)
1744 {
1745 	panic("zone bits allocator head[%d:%d:%p] is corrupt",
1746 	    order, with_extra, zba_head(order, with_extra));
1747 }
1748 
1749 __abortlike
1750 static void
1751 zba_chain_corruption_panic(struct zone_bits_chain *a, struct zone_bits_chain *b)
1752 {
1753 	panic("zone bits allocator freelist is corrupt (%p <-> %p)", a, b);
1754 }
1755 
1756 static void
1757 zba_push_block(struct zone_bits_chain *zbc, uint32_t order, bool with_extra)
1758 {
1759 	struct zone_bits_head *hd = zba_head(order, with_extra);
1760 	uint32_t hd_index = zba_head_index(hd);
1761 	uint32_t index = zba_chain_to_index(zbc);
1762 	struct zone_bits_chain *next;
1763 
1764 	if (hd->zbh_next) {
1765 		next = zba_chain_for_index(hd->zbh_next);
1766 		if (next->zbc_prev != hd_index) {
1767 			zba_head_corruption_panic(order, with_extra);
1768 		}
1769 		next->zbc_prev = index;
1770 	}
1771 	zbc->zbc_next = hd->zbh_next;
1772 	zbc->zbc_prev = hd_index;
1773 	hd->zbh_next = index;
1774 }
1775 
1776 static void
1777 zba_remove_block(struct zone_bits_chain *zbc)
1778 {
1779 	struct zone_bits_chain *prev = zba_chain_for_index(zbc->zbc_prev);
1780 	uint32_t index = zba_chain_to_index(zbc);
1781 
1782 	if (prev->zbc_next != index) {
1783 		zba_chain_corruption_panic(prev, zbc);
1784 	}
1785 	if ((prev->zbc_next = zbc->zbc_next)) {
1786 		struct zone_bits_chain *next = zba_chain_for_index(zbc->zbc_next);
1787 		if (next->zbc_prev != index) {
1788 			zba_chain_corruption_panic(zbc, next);
1789 		}
1790 		next->zbc_prev = zbc->zbc_prev;
1791 	}
1792 }
1793 
1794 static vm_address_t
1795 zba_try_pop_block(uint32_t order, bool with_extra)
1796 {
1797 	struct zone_bits_head *hd = zba_head(order, with_extra);
1798 	struct zone_bits_chain *zbc;
1799 
1800 	if (hd->zbh_next == 0) {
1801 		return 0;
1802 	}
1803 
1804 	zbc = zba_chain_for_index(hd->zbh_next);
1805 	zba_remove_block(zbc);
1806 	return (vm_address_t)zbc;
1807 }
1808 
1809 static struct zone_bits_allocator_header *
1810 zba_header(vm_offset_t addr)
1811 {
1812 	addr &= -(vm_offset_t)ZBA_CHUNK_SIZE;
1813 	return (struct zone_bits_allocator_header *)addr;
1814 }
1815 
1816 static size_t
1817 zba_node_parent(size_t node)
1818 {
1819 	return (node - 1) / 2;
1820 }
1821 
1822 static size_t
1823 zba_node_left_child(size_t node)
1824 {
1825 	return node * 2 + 1;
1826 }
1827 
1828 static size_t
1829 zba_node_buddy(size_t node)
1830 {
1831 	return ((node - 1) ^ 1) + 1;
1832 }
1833 
1834 static size_t
1835 zba_node(vm_offset_t addr, uint32_t order)
1836 {
1837 	vm_offset_t offs = (addr % ZBA_CHUNK_SIZE) / ZBA_GRANULE;
1838 	return (offs >> order) + (1 << (ZBA_MAX_ORDER - order + 1)) - 1;
1839 }
1840 
1841 static struct zone_bits_chain *
1842 zba_chain_for_node(struct zone_bits_allocator_header *zbah, size_t node, uint32_t order)
1843 {
1844 	vm_offset_t offs = (node - (1 << (ZBA_MAX_ORDER - order + 1)) + 1) << order;
1845 	return (struct zone_bits_chain *)((vm_offset_t)zbah + offs * ZBA_GRANULE);
1846 }
1847 
1848 static void
1849 zba_node_flip_split(struct zone_bits_allocator_header *zbah, size_t node)
1850 {
1851 	zbah->zbah_bits[node / 64] ^= 1ull << (node % 64);
1852 }
1853 
1854 static bool
1855 zba_node_is_split(struct zone_bits_allocator_header *zbah, size_t node)
1856 {
1857 	return zbah->zbah_bits[node / 64] & (1ull << (node % 64));
1858 }
1859 
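/*
 * Descriptive note (added for clarity): zba_free() below walks back up the
 * buddy tree.  Each parent's "split" bit tracks the XOR of its two children's
 * allocation state: flipping it on free tells us whether the buddy is still
 * allocated (bit now set, stop) or free as well (bit now clear, unlink the
 * buddy from its freelist and coalesce one order up).
 */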
1860 static void
1861 zba_free(vm_offset_t addr, uint32_t order, bool with_extra)
1862 {
1863 	struct zone_bits_allocator_header *zbah = zba_header(addr);
1864 	struct zone_bits_chain *zbc;
1865 	size_t node = zba_node(addr, order);
1866 
1867 	while (node) {
1868 		size_t parent = zba_node_parent(node);
1869 
1870 		zba_node_flip_split(zbah, parent);
1871 		if (zba_node_is_split(zbah, parent)) {
1872 			break;
1873 		}
1874 
1875 		zbc = zba_chain_for_node(zbah, zba_node_buddy(node), order);
1876 		zba_remove_block(zbc);
1877 		order++;
1878 		node = parent;
1879 	}
1880 
1881 	zba_push_block(zba_chain_for_node(zbah, node, order), order, with_extra);
1882 }
1883 
1884 static vm_size_t
1885 zba_chunk_header_size(uint32_t n)
1886 {
1887 	vm_size_t hdr_size = sizeof(struct zone_bits_allocator_header);
1888 	if (n == 0) {
1889 		hdr_size += sizeof(struct zone_bits_allocator_meta);
1890 	}
1891 	return hdr_size;
1892 }
1893 
1894 static void
1895 zba_init_chunk(uint32_t n, bool with_extra)
1896 {
1897 	vm_size_t hdr_size = zba_chunk_header_size(n);
1898 	vm_offset_t page = (vm_offset_t)zba_base_header() + n * ZBA_CHUNK_SIZE;
1899 	struct zone_bits_allocator_header *zbah = zba_header(page);
1900 	vm_size_t size = ZBA_CHUNK_SIZE;
1901 	size_t node;
1902 
1903 	for (uint32_t o = ZBA_MAX_ORDER + 1; o-- > 0;) {
1904 		if (size < hdr_size + (ZBA_GRANULE << o)) {
1905 			continue;
1906 		}
1907 		size -= ZBA_GRANULE << o;
1908 		node = zba_node(page + size, o);
1909 		zba_node_flip_split(zbah, zba_node_parent(node));
1910 		zba_push_block(zba_chain_for_node(zbah, node, o), o, with_extra);
1911 	}
1912 }
1913 
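/*
 * Descriptive note (added for clarity): chunks without VM-tag tracking grow
 * from the left end of the bits range (zbam_left counts up), while chunks
 * with extra tracking grow from the right end (zbam_right counts down).
 * The allocator is exhausted when the two cursors meet.
 */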
1914 __attribute__((noinline))
1915 static void
1916 zba_grow(bool with_extra)
1917 {
1918 	struct zone_bits_allocator_meta *meta = zba_meta();
1919 	kern_return_t kr = KERN_SUCCESS;
1920 	uint32_t chunk;
1921 
1922 #if !ZALLOC_TEST
1923 	if (meta->zbam_left >= meta->zbam_right) {
1924 		zba_memory_exhausted();
1925 	}
1926 #endif
1927 
1928 	if (with_extra) {
1929 		chunk = meta->zbam_right - 1;
1930 	} else {
1931 		chunk = meta->zbam_left;
1932 	}
1933 
1934 	kr = zba_populate(chunk, with_extra);
1935 	if (kr == KERN_SUCCESS) {
1936 		if (with_extra) {
1937 			meta->zbam_right -= 1;
1938 		} else {
1939 			meta->zbam_left += 1;
1940 		}
1941 
1942 		zba_init_chunk(chunk, with_extra);
1943 #if !ZALLOC_TEST
1944 	} else {
1945 		/*
1946 		 * zba_populate() has to be allowed to fail populating:
1947 		 * since we are under a global lock, we need to do the
1948 		 * VM_PAGE_WAIT() outside of the lock.
1949 		 */
1950 		assert(kr == KERN_RESOURCE_SHORTAGE);
1951 		zba_unlock();
1952 		VM_PAGE_WAIT();
1953 		zba_lock();
1954 #endif
1955 	}
1956 }
1957 
1958 static vm_offset_t
1959 zba_alloc(uint32_t order, bool with_extra)
1960 {
1961 	struct zone_bits_allocator_header *zbah;
1962 	uint32_t cur = order;
1963 	vm_address_t addr;
1964 	size_t node;
1965 
1966 	while ((addr = zba_try_pop_block(cur, with_extra)) == 0) {
1967 		if (__improbable(cur++ >= ZBA_MAX_ORDER)) {
1968 			zba_grow(with_extra);
1969 			cur = order;
1970 		}
1971 	}
1972 
1973 	zbah = zba_header(addr);
1974 	node = zba_node(addr, cur);
1975 	zba_node_flip_split(zbah, zba_node_parent(node));
1976 	while (cur > order) {
1977 		cur--;
1978 		zba_node_flip_split(zbah, node);
1979 		node = zba_node_left_child(node);
1980 		zba_push_block(zba_chain_for_node(zbah, node + 1, cur),
1981 		    cur, with_extra);
1982 	}
1983 
1984 	return addr;
1985 }
1986 
1987 #define zba_map_index(type, n)    (n / (8 * sizeof(type)))
1988 #define zba_map_bit(type, n)      ((type)1 << (n % (8 * sizeof(type))))
1989 #define zba_map_mask_lt(type, n)  (zba_map_bit(type, n) - 1)
1990 #define zba_map_mask_ge(type, n)  ((type)-zba_map_bit(type, n))
1991 
1992 #if !ZALLOC_TEST
1993 #if VM_TAG_SIZECLASSES
1994 
1995 static void *
1996 zba_extra_ref_ptr(uint32_t bref, vm_offset_t idx)
1997 {
1998 	vm_offset_t base = zone_info.zi_xtra_range.min_address;
1999 	vm_offset_t offs = (bref & ZBA_PTR_MASK) * ZBA_GRANULE * CHAR_BIT;
2000 
2001 	return (void *)(base + ((offs + idx) << zba_xtra_shift));
2002 }
2003 
2004 #endif /* VM_TAG_SIZECLASSES */
2005 
2006 static uint32_t
2007 zba_bits_ref_order(uint32_t bref)
2008 {
2009 	return bref >> ZBA_ORDER_SHIFT;
2010 }
2011 
2012 static bitmap_t *
2013 zba_bits_ref_ptr(uint32_t bref)
2014 {
2015 	return zba_slot_base() + (bref & ZBA_PTR_MASK);
2016 }
2017 
2018 static vm_offset_t
2019 zba_scan_bitmap_inline(zone_t zone, struct zone_page_metadata *meta,
2020     zalloc_flags_t flags, vm_offset_t eidx)
2021 {
2022 	size_t i = eidx / 32;
2023 	uint32_t map;
2024 
2025 	if (eidx % 32) {
2026 		map = meta[i].zm_bitmap & zba_map_mask_ge(uint32_t, eidx);
2027 		if (map) {
2028 			eidx = __builtin_ctz(map);
2029 			meta[i].zm_bitmap ^= 1u << eidx;
2030 			return i * 32 + eidx;
2031 		}
2032 		i++;
2033 	}
2034 
2035 	uint32_t chunk_len = meta->zm_chunk_len;
2036 	if (flags & Z_PCPU) {
2037 		chunk_len = zpercpu_count();
2038 	}
2039 	for (int j = 0; j < chunk_len; j++, i++) {
2040 		if (i >= chunk_len) {
2041 			i = 0;
2042 		}
2043 		if (__probable(map = meta[i].zm_bitmap)) {
2044 			meta[i].zm_bitmap &= map - 1;
2045 			return i * 32 + __builtin_ctz(map);
2046 		}
2047 	}
2048 
2049 	zone_page_meta_accounting_panic(zone, meta, "zm_bitmap");
2050 }
2051 
2052 static vm_offset_t
2053 zba_scan_bitmap_ref(zone_t zone, struct zone_page_metadata *meta,
2054     vm_offset_t eidx)
2055 {
2056 	uint32_t bits_size = 1 << zba_bits_ref_order(meta->zm_bitmap);
2057 	bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
2058 	size_t i = eidx / 64;
2059 	uint64_t map;
2060 
2061 	if (eidx % 64) {
2062 		map = bits[i] & zba_map_mask_ge(uint64_t, eidx);
2063 		if (map) {
2064 			eidx = __builtin_ctzll(map);
2065 			bits[i] ^= 1ull << eidx;
2066 			return i * 64 + eidx;
2067 		}
2068 		i++;
2069 	}
2070 
2071 	for (int j = 0; j < bits_size; i++, j++) {
2072 		if (i >= bits_size) {
2073 			i = 0;
2074 		}
2075 		if (__probable(map = bits[i])) {
2076 			bits[i] &= map - 1;
2077 			return i * 64 + __builtin_ctzll(map);
2078 		}
2079 	}
2080 
2081 	zone_page_meta_accounting_panic(zone, meta, "zm_bitmap");
2082 }
2083 
2084 /*!
2085  * @function zone_meta_find_and_clear_bit
2086  *
2087  * @brief
2088  * The core of the bitmap allocator: find a bit set in the bitmaps.
2089  *
2090  * @discussion
2091  * This method will round-robin through the available slots,
2092  * with a per-core memory of the last allocated element index.
2093  *
2094  * This is done in order to avoid a fully LIFO behavior which makes exploiting
2095  * double-free bugs way too practical.
2096  *
2097  * @param zone          The zone we're allocating from.
2098  * @param meta          The main metadata for the chunk being allocated from.
2099  * @param flags         the alloc flags (for @c Z_PCPU).
2100  */
2101 static vm_offset_t
2102 zone_meta_find_and_clear_bit(
2103 	zone_t                  zone,
2104 	zone_stats_t            zs,
2105 	struct zone_page_metadata *meta,
2106 	zalloc_flags_t          flags)
2107 {
2108 	vm_offset_t eidx = zs->zs_alloc_rr + 1;
2109 
2110 	if (meta->zm_inline_bitmap) {
2111 		eidx = zba_scan_bitmap_inline(zone, meta, flags, eidx);
2112 	} else {
2113 		eidx = zba_scan_bitmap_ref(zone, meta, eidx);
2114 	}
2115 	zs->zs_alloc_rr = (uint16_t)eidx;
2116 	return eidx;
2117 }
2118 
2119 /*!
2120  * @function zone_meta_bits_init_inline
2121  *
2122  * @brief
2123  * Initializes the inline zm_bitmap field(s) for a newly assigned chunk.
2124  *
2125  * @param meta          The main metadata for the initialized chunk.
2126  * @param count         The number of elements the chunk can hold
2127  *                      (which might be partial for partially populated chunks).
2128  */
2129 static void
2130 zone_meta_bits_init_inline(struct zone_page_metadata *meta, uint32_t count)
2131 {
2132 	/*
2133 	 * We're called with the metadata zm_bitmap fields already zeroed out.
2134 	 */
2135 	for (size_t i = 0; i < count / 32; i++) {
2136 		meta[i].zm_bitmap = ~0u;
2137 	}
2138 	if (count % 32) {
2139 		meta[count / 32].zm_bitmap = zba_map_mask_lt(uint32_t, count);
2140 	}
2141 }
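/*
 * Illustrative example (assumed values): for a chunk holding count = 40
 * elements, the loop above sets meta[0].zm_bitmap = ~0u (elements 0..31
 * free) and meta[1].zm_bitmap = zba_map_mask_lt(uint32_t, 40) = 0xff
 * (elements 32..39 free, the remaining bits left clear).
 */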
2142 
2143 /*!
2144  * @function zone_meta_bits_alloc_init
2145  *
2146  * @brief
2147  * Allocates a zm_bitmap field for a newly assigned chunk.
2148  *
2149  * @param count         The number of elements the chunk can hold
2150  *                      (which might be partial for partially populated chunks).
2151  * @param nbits         The maximum number of bits that will be used.
2152  * @param with_extra    Whether "VM Tracking" metadata needs to be allocated.
2153  */
2154 static uint32_t
2155 zone_meta_bits_alloc_init(uint32_t count, uint32_t nbits, bool with_extra)
2156 {
2157 	static_assert(ZONE_MAX_ALLOC_SIZE / ZONE_MIN_ELEM_SIZE <=
2158 	    ZBA_GRANULE_BITS << ZBA_MAX_ORDER, "bitmaps will be large enough");
2159 
2160 	uint32_t order = flsll((nbits - 1) / ZBA_GRANULE_BITS);
2161 	uint64_t *bits;
2162 	size_t   i = 0;
2163 
2164 	assert(order <= ZBA_MAX_ALLOC_ORDER);
2165 	assert(count <= ZBA_GRANULE_BITS << order);
2166 
2167 	zba_lock();
2168 	bits = (uint64_t *)zba_alloc(order, with_extra);
2169 	zba_unlock();
2170 
2171 	while (i < count / 64) {
2172 		bits[i++] = ~0ull;
2173 	}
2174 	if (count % 64) {
2175 		bits[i++] = zba_map_mask_lt(uint64_t, count);
2176 	}
2177 	while (i < 1u << order) {
2178 		bits[i++] = 0;
2179 	}
2180 
2181 	return (uint32_t)(bits - zba_slot_base()) +
2182 	       (order << ZBA_ORDER_SHIFT) +
2183 	       (with_extra ? ZBA_HAS_EXTRA_BIT : 0);
2184 }
2185 
2186 /*!
2187  * @function zone_meta_bits_merge
2188  *
2189  * @brief
2190  * Adds elements <code>[start, end)</code> to a chunk being extended.
2191  *
2192  * @param meta          The main metadata for the extended chunk.
2193  * @param start         The index of the first element to add to the chunk.
2194  * @param end           The index one past the last element to add (exclusive).
2195  */
2196 static void
2197 zone_meta_bits_merge(struct zone_page_metadata *meta,
2198     uint32_t start, uint32_t end)
2199 {
2200 	if (meta->zm_inline_bitmap) {
2201 		while (start < end) {
2202 			size_t s_i = start / 32;
2203 			size_t s_e = end / 32;
2204 
2205 			if (s_i == s_e) {
2206 				meta[s_i].zm_bitmap |= zba_map_mask_lt(uint32_t, end) &
2207 				    zba_map_mask_ge(uint32_t, start);
2208 				break;
2209 			}
2210 
2211 			meta[s_i].zm_bitmap |= zba_map_mask_ge(uint32_t, start);
2212 			start += 32 - (start % 32);
2213 		}
2214 	} else {
2215 		uint64_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
2216 
2217 		while (start < end) {
2218 			size_t s_i = start / 64;
2219 			size_t s_e = end / 64;
2220 
2221 			if (s_i == s_e) {
2222 				bits[s_i] |= zba_map_mask_lt(uint64_t, end) &
2223 				    zba_map_mask_ge(uint64_t, start);
2224 				break;
2225 			}
2226 			bits[s_i] |= zba_map_mask_ge(uint64_t, start);
2227 			start += 64 - (start % 64);
2228 		}
2229 	}
2230 }
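/*
 * Illustrative example (assumed values): merging [start, end) = [5, 40)
 * into an inline bitmap takes two passes of the loop above:
 *
 *	pass 1: s_i = 0, s_e = 1  ->  meta[0].zm_bitmap |= 0xffffffe0, start = 32
 *	pass 2: s_i = s_e = 1     ->  meta[1].zm_bitmap |= 0x000000ff, done
 */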
2231 
2232 /*!
2233  * @function zone_bits_free
2234  *
2235  * @brief
2236  * Frees a bitmap to the zone bitmap allocator.
2237  *
2238  * @param bref
2239  * A bitmap reference set by @c zone_meta_bits_alloc_init() in a @c zm_bitmap field.
2240  */
2241 static void
2242 zone_bits_free(uint32_t bref)
2243 {
2244 	zba_lock();
2245 	zba_free((vm_offset_t)zba_bits_ref_ptr(bref),
2246 	    zba_bits_ref_order(bref), (bref & ZBA_HAS_EXTRA_BIT));
2247 	zba_unlock();
2248 }
2249 
2250 /*!
2251  * @function zone_meta_is_free
2252  *
2253  * @brief
2254  * Returns whether a given element appears free.
2255  */
2256 static bool
2257 zone_meta_is_free(struct zone_page_metadata *meta, vm_offset_t eidx)
2258 {
2259 	if (meta->zm_inline_bitmap) {
2260 		uint32_t bit = zba_map_bit(uint32_t, eidx);
2261 		return meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit;
2262 	} else {
2263 		bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
2264 		uint64_t bit = zba_map_bit(uint64_t, eidx);
2265 		return bits[zba_map_index(uint64_t, eidx)] & bit;
2266 	}
2267 }
2268 
2269 /*!
2270  * @function zone_meta_mark_free
2271  *
2272  * @brief
2273  * Marks an element as free and returns whether it was marked as used.
2274  */
2275 static bool
2276 zone_meta_mark_free(struct zone_page_metadata *meta, vm_offset_t eidx)
2277 {
2278 	if (meta->zm_inline_bitmap) {
2279 		uint32_t bit = zba_map_bit(uint32_t, eidx);
2280 		if (meta[zba_map_index(uint32_t, eidx)].zm_bitmap & bit) {
2281 			return false;
2282 		}
2283 		meta[zba_map_index(uint32_t, eidx)].zm_bitmap ^= bit;
2284 	} else {
2285 		bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
2286 		uint64_t bit = zba_map_bit(uint64_t, eidx);
2287 		if (bits[zba_map_index(uint64_t, eidx)] & bit) {
2288 			return false;
2289 		}
2290 		bits[zba_map_index(uint64_t, eidx)] ^= bit;
2291 	}
2292 	return true;
2293 }
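/*
 * Usage note (added for clarity): a false return from zone_meta_mark_free()
 * means the bit was already set, i.e. the element already looked free, which
 * callers treat as a double free.  A minimal sketch of that pattern, with a
 * hypothetical caller reaction:
 *
 *	if (!zone_meta_mark_free(meta, eidx)) {
 *		panic("double free of element %p", (void *)elem);
 *	}
 */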
2294 
2295 #if VM_TAG_SIZECLASSES
2296 
2297 __startup_func
2298 void
2299 __zone_site_register(vm_allocation_site_t *site)
2300 {
2301 	if (zone_tagging_on) {
2302 		vm_tag_alloc(site);
2303 	}
2304 }
2305 
2306 uint16_t
2307 zone_index_from_tag_index(uint32_t sizeclass_idx)
2308 {
2309 	return zone_tags_sizeclasses[sizeclass_idx];
2310 }
2311 
2312 #endif /* VM_TAG_SIZECLASSES */
2313 #endif /* !ZALLOC_TEST */
2314 /*! @} */
2315 #pragma mark zalloc helpers
2316 #if !ZALLOC_TEST
2317 
2318 static inline void *
2319 zstack_tbi_fix(vm_offset_t elem)
2320 {
2321 #if CONFIG_KERNEL_TAGGING
2322 	elem = vm_memtag_fixup_ptr(elem);
2323 #endif /* CONFIG_KERNEL_TAGGING */
2324 	return (void *)elem;
2325 }
2326 
2327 static inline vm_offset_t
2328 zstack_tbi_fill(void *addr)
2329 {
2330 	vm_offset_t elem = (vm_offset_t)addr;
2331 
2332 	return vm_memtag_canonicalize_address(elem);
2333 }
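
/*
 * Descriptive note (added for clarity): zstack chains elements by storing
 * deltas rather than raw pointers.  zstack_push_no_delta() writes
 * (old_head - elem) into the element's first word and makes elem the new
 * head; zstack_pop_no_delta() adds that delta back to recover the next head
 * and clears the word.  Popping the last element brings z_head back to 0
 * for a stack that started empty.
 */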
2334 
2335 __attribute__((always_inline))
2336 static inline void
2337 zstack_push_no_delta(zstack_t *stack, void *addr)
2338 {
2339 	vm_offset_t elem = zstack_tbi_fill(addr);
2340 
2341 	*(vm_offset_t *)addr = stack->z_head - elem;
2342 	stack->z_head = elem;
2343 }
2344 
2345 __attribute__((always_inline))
2346 void
2347 zstack_push(zstack_t *stack, void *addr)
2348 {
2349 	zstack_push_no_delta(stack, addr);
2350 	stack->z_count++;
2351 }
2352 
2353 __attribute__((always_inline))
2354 static inline void *
2355 zstack_pop_no_delta(zstack_t *stack)
2356 {
2357 	void *addr = zstack_tbi_fix(stack->z_head);
2358 
2359 	stack->z_head += *(vm_offset_t *)addr;
2360 	*(vm_offset_t *)addr = 0;
2361 
2362 	return addr;
2363 }
2364 
2365 __attribute__((always_inline))
2366 void *
2367 zstack_pop(zstack_t *stack)
2368 {
2369 	stack->z_count--;
2370 	return zstack_pop_no_delta(stack);
2371 }
2372 
2373 static inline void
2374 zone_recirc_lock_nopreempt_check_contention(zone_t zone)
2375 {
2376 	uint32_t ticket;
2377 
2378 	if (__probable(hw_lck_ticket_reserve_nopreempt(&zone->z_recirc_lock,
2379 	    &ticket, &zone_locks_grp))) {
2380 		return;
2381 	}
2382 
2383 	hw_lck_ticket_wait(&zone->z_recirc_lock, ticket, NULL, &zone_locks_grp);
2384 
2385 	/*
2386 	 * If zone caching has been disabled due to memory pressure,
2387 	 * then recording contention is not useful; give the system
2388 	 * time to recover.
2389 	 */
2390 	if (__probable(!zone_caching_disabled && !zone_exhausted(zone))) {
2391 		zone->z_recirc_cont_cur++;
2392 	}
2393 }
2394 
2395 static inline void
2396 zone_recirc_lock_nopreempt(zone_t zone)
2397 {
2398 	hw_lck_ticket_lock_nopreempt(&zone->z_recirc_lock, &zone_locks_grp);
2399 }
2400 
2401 static inline void
2402 zone_recirc_unlock_nopreempt(zone_t zone)
2403 {
2404 	hw_lck_ticket_unlock_nopreempt(&zone->z_recirc_lock);
2405 }
2406 
2407 static inline void
2408 zone_lock_nopreempt_check_contention(zone_t zone)
2409 {
2410 	uint32_t ticket;
2411 #if KASAN_FAKESTACK
2412 	spl_t s = 0;
2413 	if (zone->z_kasan_fakestacks) {
2414 		s = splsched();
2415 	}
2416 #endif /* KASAN_FAKESTACK */
2417 
2418 	if (__probable(hw_lck_ticket_reserve_nopreempt(&zone->z_lock, &ticket,
2419 	    &zone_locks_grp))) {
2420 #if KASAN_FAKESTACK
2421 		zone->z_kasan_spl = s;
2422 #endif /* KASAN_FAKESTACK */
2423 		return;
2424 	}
2425 
2426 	hw_lck_ticket_wait(&zone->z_lock, ticket, NULL, &zone_locks_grp);
2427 #if KASAN_FAKESTACK
2428 	zone->z_kasan_spl = s;
2429 #endif /* KASAN_FAKESTACK */
2430 
2431 	/*
2432 	 * If zone caching has been disabled due to memory pressure,
2433 	 * then recording contention is not useful; give the system
2434 	 * time to recover.
2435 	 */
2436 	if (__probable(!zone_caching_disabled &&
2437 	    !zone->z_pcpu_cache && !zone_exhausted(zone))) {
2438 		zone->z_recirc_cont_cur++;
2439 	}
2440 }
2441 
2442 static inline void
2443 zone_lock_nopreempt(zone_t zone)
2444 {
2445 #if KASAN_FAKESTACK
2446 	spl_t s = 0;
2447 	if (zone->z_kasan_fakestacks) {
2448 		s = splsched();
2449 	}
2450 #endif /* KASAN_FAKESTACK */
2451 	hw_lck_ticket_lock_nopreempt(&zone->z_lock, &zone_locks_grp);
2452 #if KASAN_FAKESTACK
2453 	zone->z_kasan_spl = s;
2454 #endif /* KASAN_FAKESTACK */
2455 }
2456 
2457 static inline void
2458 zone_unlock_nopreempt(zone_t zone)
2459 {
2460 #if KASAN_FAKESTACK
2461 	spl_t s = zone->z_kasan_spl;
2462 	zone->z_kasan_spl = 0;
2463 #endif /* KASAN_FAKESTACK */
2464 	hw_lck_ticket_unlock_nopreempt(&zone->z_lock);
2465 #if KASAN_FAKESTACK
2466 	if (zone->z_kasan_fakestacks) {
2467 		splx(s);
2468 	}
2469 #endif /* KASAN_FAKESTACK */
2470 }
2471 
2472 static inline void
2473 zone_depot_lock_nopreempt(zone_cache_t zc)
2474 {
2475 	hw_lck_ticket_lock_nopreempt(&zc->zc_depot_lock, &zone_locks_grp);
2476 }
2477 
2478 static inline void
2479 zone_depot_unlock_nopreempt(zone_cache_t zc)
2480 {
2481 	hw_lck_ticket_unlock_nopreempt(&zc->zc_depot_lock);
2482 }
2483 
2484 static inline void
2485 zone_depot_lock(zone_cache_t zc)
2486 {
2487 	hw_lck_ticket_lock(&zc->zc_depot_lock, &zone_locks_grp);
2488 }
2489 
2490 static inline void
2491 zone_depot_unlock(zone_cache_t zc)
2492 {
2493 	hw_lck_ticket_unlock(&zc->zc_depot_lock);
2494 }
2495 
2496 zone_t
2497 zone_by_id(size_t zid)
2498 {
2499 	return (zone_t)((uintptr_t)zone_array + zid * sizeof(struct zone));
2500 }
2501 
2502 static inline bool
2503 zone_supports_vm(zone_t z)
2504 {
2505 	/*
2506 	 * VM_MAP_ENTRY and VM_MAP_HOLES zones are allowed
2507 	 * to overcommit because they're used to reclaim memory
2508 	 * (VM support).
2509 	 */
2510 	return z >= &zone_array[ZONE_ID_VM_MAP_ENTRY] &&
2511 	       z <= &zone_array[ZONE_ID_VM_MAP_HOLES];
2512 }
2513 
2514 const char *
2515 zone_name(zone_t z)
2516 {
2517 	return z->z_name;
2518 }
2519 
2520 const char *
2521 zone_heap_name(zone_t z)
2522 {
2523 	zone_security_flags_t zsflags = zone_security_config(z);
2524 	if (__probable(zsflags.z_kheap_id < KHEAP_ID_COUNT)) {
2525 		return kalloc_heap_names[zsflags.z_kheap_id];
2526 	}
2527 	return "invalid";
2528 }
2529 
2530 static uint32_t
2531 zone_alloc_pages_for_nelems(zone_t z, vm_size_t max_elems)
2532 {
2533 	vm_size_t elem_count, chunks;
2534 
2535 	elem_count = ptoa(z->z_percpu ? 1 : z->z_chunk_pages) /
2536 	    zone_elem_outer_size(z);
2537 	chunks = (max_elems + elem_count - 1) / elem_count;
2538 
2539 	return (uint32_t)MIN(UINT32_MAX, chunks * z->z_chunk_pages);
2540 }
2541 
2542 static inline vm_size_t
2543 zone_submaps_approx_size(void)
2544 {
2545 	vm_size_t size = 0;
2546 
2547 	for (unsigned idx = 0; idx < Z_SUBMAP_IDX_COUNT; idx++) {
2548 		if (zone_submaps[idx] != VM_MAP_NULL) {
2549 			size += zone_submaps[idx]->size;
2550 		}
2551 	}
2552 
2553 	return size;
2554 }
2555 
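/*
 * Descriptive note (added for clarity): a zone_depot is a single singly
 * linked list of magazines with two segments.  Full magazines live at the
 * front, starting at zd_head; empty magazines follow.  zd_tail points at
 * the zm_next field separating the two segments (i.e. where the first
 * empty magazine hangs), and zd_full / zd_empty count each segment.
 */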
2556 static inline void
2557 zone_depot_init(struct zone_depot *zd)
2558 {
2559 	*zd = (struct zone_depot){
2560 		.zd_tail = &zd->zd_head,
2561 	};
2562 }
2563 
2564 static inline void
2565 zone_depot_insert_head_full(struct zone_depot *zd, zone_magazine_t mag)
2566 {
2567 	if (zd->zd_full++ == 0) {
2568 		zd->zd_tail = &mag->zm_next;
2569 	}
2570 	mag->zm_next = zd->zd_head;
2571 	zd->zd_head = mag;
2572 }
2573 
2574 static inline void
2575 zone_depot_insert_tail_full(struct zone_depot *zd, zone_magazine_t mag)
2576 {
2577 	zd->zd_full++;
2578 	mag->zm_next = *zd->zd_tail;
2579 	*zd->zd_tail = mag;
2580 	zd->zd_tail = &mag->zm_next;
2581 }
2582 
2583 static inline void
2584 zone_depot_insert_head_empty(struct zone_depot *zd, zone_magazine_t mag)
2585 {
2586 	zd->zd_empty++;
2587 	mag->zm_next = *zd->zd_tail;
2588 	*zd->zd_tail = mag;
2589 }
2590 
2591 static inline zone_magazine_t
2592 zone_depot_pop_head_full(struct zone_depot *zd, zone_t z)
2593 {
2594 	zone_magazine_t mag = zd->zd_head;
2595 
2596 	assert(zd->zd_full);
2597 
2598 	zd->zd_full--;
2599 	if (z && z->z_recirc_full_min > zd->zd_full) {
2600 		z->z_recirc_full_min = zd->zd_full;
2601 	}
2602 	zd->zd_head = mag->zm_next;
2603 	if (zd->zd_full == 0) {
2604 		zd->zd_tail = &zd->zd_head;
2605 	}
2606 
2607 	mag->zm_next = NULL;
2608 	return mag;
2609 }
2610 
2611 static inline zone_magazine_t
2612 zone_depot_pop_head_empty(struct zone_depot *zd, zone_t z)
2613 {
2614 	zone_magazine_t mag = *zd->zd_tail;
2615 
2616 	assert(zd->zd_empty);
2617 
2618 	zd->zd_empty--;
2619 	if (z && z->z_recirc_empty_min > zd->zd_empty) {
2620 		z->z_recirc_empty_min = zd->zd_empty;
2621 	}
2622 	*zd->zd_tail = mag->zm_next;
2623 
2624 	mag->zm_next = NULL;
2625 	return mag;
2626 }
2627 
2628 static inline smr_seq_t
2629 zone_depot_move_full(
2630 	struct zone_depot      *dst,
2631 	struct zone_depot      *src,
2632 	uint32_t                n,
2633 	zone_t                  z)
2634 {
2635 	zone_magazine_t head, last;
2636 
2637 	assert(n);
2638 	assert(src->zd_full >= n);
2639 
2640 	src->zd_full -= n;
2641 	if (z && z->z_recirc_full_min > src->zd_full) {
2642 		z->z_recirc_full_min = src->zd_full;
2643 	}
2644 	head = last = src->zd_head;
2645 	for (uint32_t i = n; i-- > 1;) {
2646 		last = last->zm_next;
2647 	}
2648 
2649 	src->zd_head = last->zm_next;
2650 	if (src->zd_full == 0) {
2651 		src->zd_tail = &src->zd_head;
2652 	}
2653 
2654 	if (z && zone_security_array[zone_index(z)].z_lifo) {
2655 		if (dst->zd_full == 0) {
2656 			dst->zd_tail = &last->zm_next;
2657 		}
2658 		last->zm_next = dst->zd_head;
2659 		dst->zd_head = head;
2660 	} else {
2661 		last->zm_next = *dst->zd_tail;
2662 		*dst->zd_tail = head;
2663 		dst->zd_tail = &last->zm_next;
2664 	}
2665 	dst->zd_full += n;
2666 
2667 	return last->zm_seq;
2668 }
2669 
2670 static inline void
2671 zone_depot_move_empty(
2672 	struct zone_depot      *dst,
2673 	struct zone_depot      *src,
2674 	uint32_t                n,
2675 	zone_t                  z)
2676 {
2677 	zone_magazine_t head, last;
2678 
2679 	assert(n);
2680 	assert(src->zd_empty >= n);
2681 
2682 	src->zd_empty -= n;
2683 	if (z && z->z_recirc_empty_min > src->zd_empty) {
2684 		z->z_recirc_empty_min = src->zd_empty;
2685 	}
2686 	head = last = *src->zd_tail;
2687 	for (uint32_t i = n; i-- > 1;) {
2688 		last = last->zm_next;
2689 	}
2690 
2691 	*src->zd_tail = last->zm_next;
2692 
2693 	dst->zd_empty += n;
2694 	last->zm_next = *dst->zd_tail;
2695 	*dst->zd_tail = head;
2696 }
2697 
2698 static inline bool
2699 zone_depot_poll(struct zone_depot *depot, smr_t smr)
2700 {
2701 	if (depot->zd_full == 0) {
2702 		return false;
2703 	}
2704 
2705 	return smr == NULL || smr_poll(smr, depot->zd_head->zm_seq);
2706 }
2707 
2708 static void
2709 zone_cache_swap_magazines(zone_cache_t cache)
2710 {
2711 	uint16_t count_a = cache->zc_alloc_cur;
2712 	uint16_t count_f = cache->zc_free_cur;
2713 	vm_offset_t *elems_a = cache->zc_alloc_elems;
2714 	vm_offset_t *elems_f = cache->zc_free_elems;
2715 
2716 	z_debug_assert(count_a <= zc_mag_size());
2717 	z_debug_assert(count_f <= zc_mag_size());
2718 
2719 	cache->zc_alloc_cur = count_f;
2720 	cache->zc_free_cur = count_a;
2721 	cache->zc_alloc_elems = elems_f;
2722 	cache->zc_free_elems = elems_a;
2723 }
2724 
2725 __pure2
2726 static smr_t
2727 zone_cache_smr(zone_cache_t cache)
2728 {
2729 	return cache->zc_smr;
2730 }
2731 
2732 /*!
2733  * @function zone_magazine_replace
2734  *
2735  * @brief
2736  * Unload a magazine and load a new one in its place.
2737  */
2738 static zone_magazine_t
2739 zone_magazine_replace(zone_cache_t zc, zone_magazine_t mag, bool empty)
2740 {
2741 	zone_magazine_t old;
2742 	vm_offset_t **elems;
2743 
2744 	mag->zm_seq = SMR_SEQ_INVALID;
2745 
2746 	if (empty) {
2747 		elems = &zc->zc_free_elems;
2748 		zc->zc_free_cur = 0;
2749 	} else {
2750 		elems = &zc->zc_alloc_elems;
2751 		zc->zc_alloc_cur = zc_mag_size();
2752 	}
2753 	old = (zone_magazine_t)((uintptr_t)*elems -
2754 	    offsetof(struct zone_magazine, zm_elems));
2755 	*elems = mag->zm_elems;
2756 
2757 	return old;
2758 }
2759 
2760 static zone_magazine_t
2761 zone_magazine_alloc(zalloc_flags_t flags)
2762 {
2763 	return zalloc_flags(zc_magazine_zone, flags | Z_ZERO);
2764 }
2765 
2766 static void
2767 zone_magazine_free(zone_magazine_t mag)
2768 {
2769 	(zfree)(zc_magazine_zone, mag);
2770 }
2771 
2772 static void
2773 zone_magazine_free_list(struct zone_depot *zd)
2774 {
2775 	zone_magazine_t tmp, mag = *zd->zd_tail;
2776 
2777 	while (mag) {
2778 		tmp = mag->zm_next;
2779 		zone_magazine_free(mag);
2780 		mag = tmp;
2781 	}
2782 
2783 	*zd->zd_tail = NULL;
2784 	zd->zd_empty = 0;
2785 }
2786 
2787 void
2788 zone_enable_caching(zone_t zone)
2789 {
2790 	size_t size_per_mag = zone_elem_inner_size(zone) * zc_mag_size();
2791 	zone_cache_t caches;
2792 	size_t depot_limit;
2793 
2794 	depot_limit = zc_pcpu_max() / size_per_mag;
2795 	zone->z_depot_limit = (uint16_t)MIN(depot_limit, INT16_MAX);
2796 
2797 	caches = zalloc_percpu_permanent_type(struct zone_cache);
2798 	zpercpu_foreach(zc, caches) {
2799 		zc->zc_alloc_elems = zone_magazine_alloc(Z_WAITOK | Z_NOFAIL)->zm_elems;
2800 		zc->zc_free_elems = zone_magazine_alloc(Z_WAITOK | Z_NOFAIL)->zm_elems;
2801 		zone_depot_init(&zc->zc_depot);
2802 		hw_lck_ticket_init(&zc->zc_depot_lock, &zone_locks_grp);
2803 	}
2804 
2805 	zone_lock(zone);
2806 	assert(zone->z_pcpu_cache == NULL);
2807 	zone->z_pcpu_cache = caches;
2808 	zone->z_recirc_cont_cur = 0;
2809 	zone->z_recirc_cont_wma = 0;
2810 	zone->z_elems_free_min = 0; /* becomes z_recirc_empty_min */
2811 	zone->z_elems_free_wma = 0; /* becomes z_recirc_empty_wma */
2812 	zone_unlock(zone);
2813 }
2814 
2815 bool
2816 zone_maps_owned(vm_address_t addr, vm_size_t size)
2817 {
2818 	return from_zone_map(addr, size);
2819 }
2820 
2821 #if KASAN_LIGHT
2822 bool
2823 kasan_zone_maps_owned(vm_address_t addr, vm_size_t size)
2824 {
2825 	return from_zone_map(addr, size) ||
2826 	       mach_vm_range_size(&zone_info.zi_map_range) == 0;
2827 }
2828 #endif /* KASAN_LIGHT */
2829 
2830 void
2831 zone_map_sizes(
2832 	vm_map_size_t    *psize,
2833 	vm_map_size_t    *pfree,
2834 	vm_map_size_t    *plargest_free)
2835 {
2836 	vm_map_size_t size, free, largest;
2837 
2838 	vm_map_sizes(zone_submaps[0], psize, pfree, plargest_free);
2839 
2840 	for (uint32_t i = 1; i < Z_SUBMAP_IDX_COUNT; i++) {
2841 		vm_map_sizes(zone_submaps[i], &size, &free, &largest);
2842 		*psize += size;
2843 		*pfree += free;
2844 		*plargest_free = MAX(*plargest_free, largest);
2845 	}
2846 }
2847 
2848 __attribute__((always_inline))
2849 vm_map_t
2850 zone_submap(zone_security_flags_t zsflags)
2851 {
2852 	return zone_submaps[zsflags.z_submap_idx];
2853 }
2854 
2855 unsigned
2856 zpercpu_count(void)
2857 {
2858 	return zpercpu_early_count;
2859 }
2860 
2861 #if ZSECURITY_CONFIG(SAD_FENG_SHUI) || CONFIG_PROB_GZALLOC
2862 /*
2863  * Returns a random number of a given bit-width.
2864  *
2865  * DO NOT COPY THIS CODE OUTSIDE OF ZALLOC
2866  *
2867  * This uses Intel's rdrand because random() uses FP registers,
2868  * which causes FP faults and allocations, neither of which is
2869  * something we can do from zalloc itself due to reentrancy problems.
2870  *
2871  * For pre-rdrand machines, we fall back to a biased random generator
2872  * that doesn't use FP. Such HW is no longer supported, but VMs of
2873  * newer OSes on older bare metal are made to limp along
2874  * (with reduced security) this way.
2875  */
2876 static uint64_t
2877 zalloc_random_mask64(uint32_t bits)
2878 {
2879 	uint64_t mask = ~0ull >> (64 - bits);
2880 	uint64_t v;
2881 
2882 #if __x86_64__
2883 	if (__probable(cpuid_features() & CPUID_FEATURE_RDRAND)) {
2884 		asm volatile ("1: rdrand %0; jnc 1b\n" : "=r" (v) :: "cc");
2885 		v &= mask;
2886 	} else {
2887 		disable_preemption();
2888 		int cpu = cpu_number();
2889 		v = random_bool_gen_bits(&zone_bool_gen[cpu].zbg_bg,
2890 		    zone_bool_gen[cpu].zbg_entropy,
2891 		    ZONE_ENTROPY_CNT, bits);
2892 		enable_preemption();
2893 	}
2894 #else
2895 	v = early_random() & mask;
2896 #endif
2897 
2898 	return v;
2899 }
2900 
2901 /*
2902  * Returns a random number within [bound_min, bound_max)
2903  *
2904  * This isn't _exactly_ uniform, but the skew is small enough
2905  * not to matter for the consumers of this interface.
2906  *
2907  * Values within [bound_min, bound_min + 2^64 % (bound_max - bound_min))
2908  * will be returned (bound_max - bound_min) / 2^64 more often
2909  * than values within [bound_min + 2^64 % (bound_max - bound_min), bound_max).
2910  */
2911 static uint32_t
2912 zalloc_random_uniform32(uint32_t bound_min, uint32_t bound_max)
2913 {
2914 	uint64_t delta = bound_max - bound_min;
2915 
2916 	return bound_min + (uint32_t)(zalloc_random_mask64(64) % delta);
2917 }
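/*
 * Illustrative example (assumed values): with bound_min = 0 and
 * bound_max = 48, delta = 48.  Since 2^64 is not a multiple of 48, the
 * values 0..15 (2^64 mod 48 == 16) are hit one extra time out of 2^64
 * draws compared to 16..47 -- a skew far too small to matter for the
 * element or page index randomization this is used for.
 */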
2918 
2919 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) || CONFIG_PROB_GZALLOC */
2920 #if ZALLOC_ENABLE_LOGGING || CONFIG_PROB_GZALLOC
2921 /*
2922  * Track all kalloc zones of specified size for zlog name
2923  * kalloc.type.<size> or kalloc.type.var.<size> or kalloc.<size>
2924  *
2925  * Additionally track all shared kalloc zones with shared.kalloc
2926  */
2927 static bool
2928 track_kalloc_zones(zone_t z, const char *logname)
2929 {
2930 	const char *prefix;
2931 	size_t len;
2932 	zone_security_flags_t zsflags = zone_security_config(z);
2933 
2934 	prefix = "kalloc.type.var.";
2935 	len    = strlen(prefix);
2936 	if (zsflags.z_kalloc_type && zsflags.z_kheap_id == KHEAP_ID_KT_VAR &&
2937 	    strncmp(logname, prefix, len) == 0) {
2938 		vm_size_t sizeclass = strtoul(logname + len, NULL, 0);
2939 
2940 		return zone_elem_inner_size(z) == sizeclass;
2941 	}
2942 
2943 	prefix = "kalloc.type.";
2944 	len    = strlen(prefix);
2945 	if (zsflags.z_kalloc_type && zsflags.z_kheap_id != KHEAP_ID_KT_VAR &&
2946 	    strncmp(logname, prefix, len) == 0) {
2947 		vm_size_t sizeclass = strtoul(logname + len, NULL, 0);
2948 
2949 		return zone_elem_inner_size(z) == sizeclass;
2950 	}
2951 
2952 	prefix = "kalloc.";
2953 	len    = strlen(prefix);
2954 	if ((zsflags.z_kheap_id || zsflags.z_kalloc_type) &&
2955 	    strncmp(logname, prefix, len) == 0) {
2956 		vm_size_t sizeclass = strtoul(logname + len, NULL, 0);
2957 
2958 		return zone_elem_inner_size(z) == sizeclass;
2959 	}
2960 
2961 	prefix = "shared.kalloc";
2962 	if ((zsflags.z_kheap_id == KHEAP_ID_SHARED) &&
2963 	    (strcmp(logname, prefix) == 0)) {
2964 		return true;
2965 	}
2966 
2967 	return false;
2968 }
2969 #endif
2970 
2971 int
2972 track_this_zone(const char *zonename, const char *logname)
2973 {
2974 	unsigned int len;
2975 	const char *zc = zonename;
2976 	const char *lc = logname;
2977 
2978 	/*
2979 	 * Compare the strings.  We bound the compare by MAX_ZONE_NAME.
2980 	 */
2981 
2982 	for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) {
2983 		/*
2984 		 * If the current characters don't match, check for a space
2985 		 * in the zone name and a corresponding period in the log name.
2986 		 * If that's not there, then the strings don't match.
2987 		 */
2988 
2989 		if (*zc != *lc && !(*zc == ' ' && *lc == '.')) {
2990 			break;
2991 		}
2992 
2993 		/*
2994 		 * The strings are equal so far.  If we're at the end, then it's a match.
2995 		 */
2996 
2997 		if (*zc == '\0') {
2998 			return TRUE;
2999 		}
3000 	}
3001 
3002 	return FALSE;
3003 }
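/*
 * Illustrative example (hypothetical zone name): track_this_zone() lets a
 * period in the boot-arg stand in for a space in the zone name, so a zone
 * named "vm objects" can be selected with zlog1=vm.objects even though
 * boot-args cannot contain spaces.
 */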
3004 
3005 #if DEBUG || DEVELOPMENT
3006 
3007 vm_size_t
3008 zone_element_info(void *addr, vm_tag_t * ptag)
3009 {
3010 	vm_size_t     size = 0;
3011 	vm_tag_t      tag = VM_KERN_MEMORY_NONE;
3012 	struct zone *src_zone;
3013 
3014 	if (from_zone_map(addr, sizeof(void *))) {
3015 		src_zone = zone_by_id(zone_index_from_ptr(addr));
3016 		size     = zone_elem_inner_size(src_zone);
3017 #if VM_TAG_SIZECLASSES
3018 		if (__improbable(src_zone->z_uses_tags)) {
3019 			struct zone_page_metadata *meta;
3020 			vm_offset_t eidx;
3021 			vm_tag_t *slot;
3022 
3023 			meta = zone_element_resolve(src_zone,
3024 			    (vm_offset_t)addr, &eidx);
3025 			slot = zba_extra_ref_ptr(meta->zm_bitmap, eidx);
3026 			tag  = *slot;
3027 		}
3028 #endif /* VM_TAG_SIZECLASSES */
3029 	}
3030 
3031 	*ptag = tag;
3032 	return size;
3033 }
3034 
3035 #endif /* DEBUG || DEVELOPMENT */
3036 #if KASAN_CLASSIC
3037 
3038 vm_size_t
3039 kasan_quarantine_resolve(vm_address_t addr, zone_t *zonep)
3040 {
3041 	zone_t zone = zone_by_id(zone_index_from_ptr((void *)addr));
3042 
3043 	*zonep = zone;
3044 	return zone_elem_inner_size(zone);
3045 }
3046 
3047 #endif /* KASAN_CLASSIC */
3048 #endif /* !ZALLOC_TEST */
3049 #pragma mark Zone zeroing and early random
3050 #if !ZALLOC_TEST
3051 
3052 /*
3053  * Zone zeroing
3054  *
3055  * All allocations from zones are zeroed on free and are additionally
3056  * checked to still be zero on alloc. The check is always on for
3057  * embedded devices. A perf regression was detected on Intel because
3058  * we can't use the vectorized implementation of
3059  * memcmp_zero_ptr_aligned due to cyclic dependencies between
3060  * initialization and allocation. Therefore we perform the check
3061  * on 20% of the allocations there.
3062  */
3063 #if ZALLOC_ENABLE_ZERO_CHECK
3064 #if defined(__x86_64__)
3065 /*
3066  * Perform zero validation on every 5th allocation
3067  */
3068 static TUNABLE(uint32_t, zzc_rate, "zzc_rate", 5);
3069 static uint32_t PERCPU_DATA(zzc_decrementer);
3070 #endif /* defined(__x86_64__) */
3071 
3072 /*
3073  * Determine if zero validation for allocation should be skipped
3074  */
3075 static bool
3076 zalloc_skip_zero_check(void)
3077 {
3078 #if defined(__x86_64__)
3079 	uint32_t *counterp, cnt;
3080 
3081 	counterp = PERCPU_GET(zzc_decrementer);
3082 	cnt = *counterp;
3083 	if (__probable(cnt > 0)) {
3084 		*counterp  = cnt - 1;
3085 		return true;
3086 	}
3087 	*counterp = zzc_rate - 1;
3088 #endif /* defined(__x86_64__) */
3089 	return false;
3090 }
3091 
3092 __abortlike
3093 static void
3094 zalloc_uaf_panic(zone_t z, uintptr_t elem, size_t size)
3095 {
3096 	uint32_t esize = (uint32_t)zone_elem_inner_size(z);
3097 	uint32_t first_offs = ~0u;
3098 	uintptr_t first_bits = 0, v;
3099 	char buf[1024];
3100 	int pos = 0;
3101 
3102 	buf[0] = '\0';
3103 
3104 	for (uint32_t o = 0; o < size; o += sizeof(v)) {
3105 		if ((v = *(uintptr_t *)(elem + o)) == 0) {
3106 			continue;
3107 		}
3108 		pos += scnprintf(buf + pos, sizeof(buf) - pos, "\n"
3109 		    "%5d: 0x%016lx", o, v);
3110 		if (first_offs > o) {
3111 			first_offs = o;
3112 			first_bits = v;
3113 		}
3114 	}
3115 
3116 	(panic)("[%s%s]: element modified after free "
3117 	"(off:%d, val:0x%016lx, sz:%d, ptr:%p)%s",
3118 	zone_heap_name(z), zone_name(z),
3119 	first_offs, first_bits, esize, (void *)elem, buf);
3120 }
3121 
3122 static void
3123 zalloc_validate_element(
3124 	zone_t                  zone,
3125 	vm_offset_t             elem,
3126 	vm_size_t               size,
3127 	zalloc_flags_t          flags)
3128 {
3129 	if (flags & Z_NOZZC) {
3130 		return;
3131 	}
3132 	if (memcmp_zero_ptr_aligned((void *)elem, size)) {
3133 		zalloc_uaf_panic(zone, elem, size);
3134 	}
3135 	if (flags & Z_PCPU) {
3136 		for (size_t i = zpercpu_count(); --i > 0;) {
3137 			elem += PAGE_SIZE;
3138 			if (memcmp_zero_ptr_aligned((void *)elem, size)) {
3139 				zalloc_uaf_panic(zone, elem, size);
3140 			}
3141 		}
3142 	}
3143 }
3144 
3145 #endif /* ZALLOC_ENABLE_ZERO_CHECK */
3146 
3147 __attribute__((noinline))
3148 static void
3149 zone_early_scramble_rr(zone_t zone, int cpu, zone_stats_t zs)
3150 {
3151 #if KASAN_FAKESTACK
3152 	/*
3153 	 * This can cause re-entrancy with kasan fakestacks
3154 	 */
3155 #pragma unused(zone, cpu, zs)
3156 #else
3157 	uint32_t bits;
3158 
3159 	bits = random_bool_gen_bits(&zone_bool_gen[cpu].zbg_bg,
3160 	    zone_bool_gen[cpu].zbg_entropy, ZONE_ENTROPY_CNT, 8);
3161 
3162 	zs->zs_alloc_rr += bits;
3163 	zs->zs_alloc_rr %= zone->z_chunk_elems;
3164 #endif
3165 }
3166 
3167 #endif /* !ZALLOC_TEST */
3168 #pragma mark Zone Leak Detection
3169 #if !ZALLOC_TEST
3170 #if ZALLOC_ENABLE_LOGGING || CONFIG_ZLEAKS
3171 
3172 /*
3173  * Zone leak debugging code
3174  *
3175  * When enabled, this code keeps a log to track allocations to a particular
3176  * zone that have not yet been freed.
3177  *
3178  * Examining this log will reveal the source of a zone leak.
3179  *
3180  * The log is allocated only when logging is enabled (it is off by default),
3181  * so there is no effect on the system when it's turned off.
3182  *
3183  * Zone logging is enabled with the `zlog<n>=<zone>` boot-arg for each
3184  * zone name to log, with n starting at 1.
3185  *
3186  * Leaks debugging utilizes 2 tunables:
3187  * - zlsize (in kB) which describes how much "size" the record covers
3188  *   (zones with smaller elements get more records, default is 4M).
3189  *
3190  * - zlfreq (in bytes) which describes a sample rate in cumulative allocation
3191  *   size at which automatic leak detection will sample allocations.
3192  *   (default is 8k)
3193  *
3194  *
3195  * Zone corruption logging
3196  *
3197  * Logging can also be used to help identify the source of a zone corruption.
3198  *
3199  * First, identify the zone that is being corrupted,
3200  * then add "-zc zlog<n>=<zone name>" to the boot-args.
3201  *
3202  * When -zc is used in conjunction with zlog,
3203  * it changes the logging style to track both allocations and frees to the zone.
3204  *
3205  * When the corruption is detected, examining the log will show you the stack
3206  * traces of the callers who last allocated and freed any particular element in
3207  * the zone.
3208  *
3209  * Corruption debugging logs will have zrecs records
3210  * (tuned by the zrecs= boot-arg, 16k elements per G of RAM by default).
3211  */
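/*
 * Illustrative example (hypothetical zone name and values): to watch the
 * kalloc.type.128 sizeclass for leaks with an 8MB coverage target and a
 * 16KB sampling period, one could boot with:
 *
 *	zlog1=kalloc.type.128 zlsize=8192 zlfreq=16384
 *
 * Adding -zc to those boot-args switches the log to corruption style,
 * recording both allocations and frees.
 */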
3212 
3213 #define ZRECORDS_MAX            (256u << 10)
3214 #define ZRECORDS_DEFAULT        (16u  << 10)
3215 static TUNABLE(uint32_t, zrecs, "zrecs", 0);
3216 static TUNABLE(uint32_t, zlsize, "zlsize", 4 * 1024);
3217 static TUNABLE(uint32_t, zlfreq, "zlfreq", 8 * 1024);
3218 
3219 __startup_func
3220 static void
3221 zone_leaks_init_zrecs(void)
3222 {
3223 	/*
3224 	 * Don't allow more than ZRECORDS_MAX records,
3225 	 * even if the user asked for more.
3226 	 *
3227 	 * This prevents accidentally hogging too much kernel memory
3228 	 * and making the system unusable.
3229 	 */
3230 	if (zrecs == 0) {
3231 		zrecs = ZRECORDS_DEFAULT *
3232 		    (uint32_t)((max_mem + (1ul << 30)) >> 30);
3233 	}
3234 	if (zrecs > ZRECORDS_MAX) {
3235 		zrecs = ZRECORDS_MAX;
3236 	}
3237 }
3238 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_leaks_init_zrecs);
3239 
3240 static uint32_t
3241 zone_leaks_record_count(zone_t z)
3242 {
3243 	uint32_t recs = (zlsize << 10) / zone_elem_inner_size(z);
3244 
3245 	return MIN(MAX(recs, ZRECORDS_DEFAULT), ZRECORDS_MAX);
3246 }
3247 
3248 static uint32_t
3249 zone_leaks_sample_rate(zone_t z)
3250 {
3251 	return zlfreq / zone_elem_inner_size(z);
3252 }
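/*
 * Illustrative arithmetic (assumed element size): with the default zlfreq
 * of 8192 bytes, a zone whose elements are 128 bytes gets a sample rate of
 * 8192 / 128 = 64, i.e. roughly one in every 64 allocations is sampled by
 * the BTLOG_HASH log.
 */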
3253 
3254 #if ZALLOC_ENABLE_LOGGING
3255 /* Log allocations and frees to help debug a zone element corruption */
3256 static TUNABLE(bool, corruption_debug_flag, "-zc", false);
3257 
3258 /*
3259  * A maximum of 10 zlog<n> boot args can be provided (zlog1 -> zlog10)
3260  */
3261 #define MAX_ZONES_LOG_REQUESTS  10
3262 
3263 /**
3264  * @function zone_setup_logging
3265  *
3266  * @abstract
3267  * Optionally sets up a zone for logging.
3268  *
3269  * @discussion
3270  * We recognize the following boot-args:
3271  *
3272  *	zlog=<zone_to_log>
3273  *	zrecs=<num_records_in_log>
3274  *	zlsize=<memory to cover for leaks>
3275  *
3276  * The zlog arg is used to specify the zone name that should be logged,
3277  * and zrecs/zlsize is used to control the size of the log.
3278  */
3279 static void
3280 zone_setup_logging(zone_t z)
3281 {
3282 	char zone_name[MAX_ZONE_NAME]; /* Temp. buffer for the zone name */
3283 	char zlog_name[MAX_ZONE_NAME]; /* Temp. buffer to create the strings zlog1, zlog2 etc... */
3284 	char zlog_val[MAX_ZONE_NAME];  /* the zone name we're logging, if any */
3285 	bool logging_on = false;
3286 
3287 	/*
3288 	 * Append kalloc heap name to zone name (if zone is used by kalloc)
3289 	 */
3290 	snprintf(zone_name, MAX_ZONE_NAME, "%s%s", zone_heap_name(z), z->z_name);
3291 
3292 	/* zlog0 isn't allowed. */
3293 	for (int i = 1; i <= MAX_ZONES_LOG_REQUESTS; i++) {
3294 		snprintf(zlog_name, MAX_ZONE_NAME, "zlog%d", i);
3295 
3296 		if (PE_parse_boot_argn(zlog_name, zlog_val, sizeof(zlog_val))) {
3297 			if (track_this_zone(zone_name, zlog_val) ||
3298 			    track_kalloc_zones(z, zlog_val)) {
3299 				logging_on = true;
3300 				break;
3301 			}
3302 		}
3303 	}
3304 
3305 	/*
3306 	 * Backwards compat. with the old boot-arg used to specify single zone
3307 	 * logging, i.e. "zlog". This needs to happen after the newer zlog<n>
3308 	 * checks because the "zlog" prefix will match all the zlog<n>
3309 	 * boot-args.
3310 	 */
3311 	if (!logging_on &&
3312 	    PE_parse_boot_argn("zlog", zlog_val, sizeof(zlog_val))) {
3313 		if (track_this_zone(zone_name, zlog_val) ||
3314 		    track_kalloc_zones(z, zlog_val)) {
3315 			logging_on = true;
3316 		}
3317 	}
3318 
3319 	/*
3320 	 * If we want to log a zone, see if we need to allocate buffer space for
3321 	 * the log.
3322 	 *
3323 	 * Some vm related zones are zinit'ed before we can do a kmem_alloc, so
3324 	 * we have to defer allocation in that case.
3325 	 *
3326 	 * zone_init() will finish the job.
3327 	 *
3328 	 * If we want to log one of the VM related zones that's set up early on,
3329 	 * we will skip allocation of the log until zinit is called again later
3330 	 * on some other zone.
3331 	 */
3332 	if (logging_on) {
3333 		if (corruption_debug_flag) {
3334 			z->z_btlog = btlog_create(BTLOG_LOG, zrecs, 0);
3335 		} else {
3336 			z->z_btlog = btlog_create(BTLOG_HASH,
3337 			    zone_leaks_record_count(z), 0);
3338 		}
3339 		if (z->z_btlog) {
3340 			z->z_log_on = true;
3341 			printf("zone[%s%s]: logging enabled\n",
3342 			    zone_heap_name(z), z->z_name);
3343 		} else {
3344 			printf("zone[%s%s]: failed to enable logging\n",
3345 			    zone_heap_name(z), z->z_name);
3346 		}
3347 	}
3348 }
3349 
3350 #endif /* ZALLOC_ENABLE_LOGGING */
3351 #if KASAN_TBI
3352 static TUNABLE(uint32_t, kasan_zrecs, "kasan_zrecs", 0);
3353 
3354 __startup_func
3355 static void
3356 kasan_tbi_init_zrecs(void)
3357 {
3358 	/*
3359 	 * Don't allow more than ZRECORDS_MAX records,
3360 	 * even if the user asked for more.
3361 	 *
3362 	 * This prevents accidentally hogging too much kernel memory
3363 	 * and making the system unusable.
3364 	 */
3365 	if (kasan_zrecs == 0) {
3366 		kasan_zrecs = ZRECORDS_DEFAULT *
3367 		    (uint32_t)((max_mem + (1ul << 30)) >> 30);
3368 	}
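	/*
	 * The expression above scales the default with physical memory:
	 * roughly one ZRECORDS_DEFAULT worth of records per GiB of max_mem
	 * (e.g. 9 * ZRECORDS_DEFAULT on an 8 GiB device), before the clamp
	 * to ZRECORDS_MAX below.
	 */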
3369 	if (kasan_zrecs > ZRECORDS_MAX) {
3370 		kasan_zrecs = ZRECORDS_MAX;
3371 	}
3372 }
3373 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, kasan_tbi_init_zrecs);
3374 
3375 static void
3376 zone_setup_kasan_logging(zone_t z)
3377 {
3378 	if (!z->z_tbi_tag) {
3379 		printf("zone[%s%s]: kasan logging disabled for this zone\n",
3380 		    zone_heap_name(z), z->z_name);
3381 		return;
3382 	}
3383 
3384 	z->z_log_on = true;
3385 	z->z_btlog = btlog_create(BTLOG_LOG, kasan_zrecs, 0);
3386 	if (!z->z_btlog) {
3387 		printf("zone[%s%s]: failed to enable kasan logging\n",
3388 		    zone_heap_name(z), z->z_name);
3389 	}
3390 }
3391 
3392 #endif /* KASAN_TBI */
3393 #if CONFIG_ZLEAKS
3394 
3395 static thread_call_data_t zone_leaks_callout;
3396 
3397 /*
3398  * The zone leak detector, abbreviated 'zleak', keeps track
3399  * of a subset of the currently outstanding allocations
3400  * made by the zone allocator.
3401  *
3402  * Zones that use more than zleak_pages_per_zone_wired_threshold
3403  * pages will get a BTLOG_HASH btlog with sampling to minimize
3404  * perf impact, yet receive statistical data about the backtrace
3405  * that is the most likely to cause the leak.
3406  *
3407  * If the zone drops far enough back under the threshold, the log
3408  * is disabled and its backtraces freed. Data can be collected
3409  * from userspace with the zlog(1) command.
3410  */
3411 
3412 uint32_t                zleak_active;
3413 SECURITY_READ_ONLY_LATE(vm_size_t) zleak_max_zonemap_size;
3414 
3415 /* Size a zone must reach before we collect data on it */
3416 static size_t           zleak_pages_per_zone_wired_threshold = ~0;
3417 vm_size_t               zleak_per_zone_tracking_threshold = ~0;
3418 
3419 static inline bool
3420 zleak_should_enable_for_zone(zone_t z)
3421 {
3422 	if (z->z_log_on) {
3423 		return false;
3424 	}
3425 	if (z->z_btlog) {
3426 		return false;
3427 	}
3428 	if (z->z_exhausts) {
3429 		return false;
3430 	}
3431 	if (zone_exhaustible(z)) {
3432 		return z->z_wired_cur * 8 >= z->z_wired_max * 7;
3433 	}
3434 	return z->z_wired_cur >= zleak_pages_per_zone_wired_threshold;
3435 }
3436 
3437 static inline bool
3438 zleak_should_disable_for_zone(zone_t z)
3439 {
3440 	if (z->z_log_on) {
3441 		return false;
3442 	}
3443 	if (!z->z_btlog) {
3444 		return false;
3445 	}
3446 	if (zone_exhaustible(z)) {
3447 		return z->z_wired_cur * 8 < z->z_wired_max * 7;
3448 	}
3449 	return z->z_wired_cur < zleak_pages_per_zone_wired_threshold / 2;
3450 }
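
/*
 * Note: for regular zones the two predicates above form a hysteresis:
 * logging turns on once a zone reaches zleak_pages_per_zone_wired_threshold
 * wired pages and only turns back off below half of that threshold, which
 * avoids flip-flopping around the limit.  Exhaustible zones use 7/8 of
 * their wired limit as the cutoff in both directions.
 */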
3451 
3452 static void
3453 zleaks_enable_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
3454 {
3455 	btlog_t log;
3456 
3457 	zone_foreach(z) {
3458 		if (zleak_should_disable_for_zone(z)) {
3459 			log = z->z_btlog;
3460 			z->z_btlog = NULL;
3461 			assert(z->z_btlog_disabled == NULL);
3462 			btlog_disable(log);
3463 			z->z_btlog_disabled = log;
3464 			os_atomic_dec(&zleak_active, relaxed);
3465 		}
3466 
3467 		if (zleak_should_enable_for_zone(z)) {
3468 			log = z->z_btlog_disabled;
3469 			if (log == NULL) {
3470 				log = btlog_create(BTLOG_HASH,
3471 				    zone_leaks_record_count(z),
3472 				    zone_leaks_sample_rate(z));
3473 			} else if (btlog_enable(log) == KERN_SUCCESS) {
3474 				z->z_btlog_disabled = NULL;
3475 			} else {
3476 				log = NULL;
3477 			}
3478 			os_atomic_store(&z->z_btlog, log, release);
3479 			os_atomic_inc(&zleak_active, relaxed);
3480 		}
3481 	}
3482 }
3483 
3484 __startup_func
3485 static void
3486 zleak_init(void)
3487 {
3488 	zleak_max_zonemap_size = ptoa(zone_pages_wired_max);
3489 
3490 	zleak_update_threshold(&zleak_per_zone_tracking_threshold,
3491 	    zleak_max_zonemap_size / 8);
3492 
3493 	thread_call_setup_with_options(&zone_leaks_callout,
3494 	    zleaks_enable_async, NULL, THREAD_CALL_PRIORITY_USER,
3495 	    THREAD_CALL_OPTIONS_ONCE);
3496 }
3497 STARTUP(ZALLOC, STARTUP_RANK_SECOND, zleak_init);
3498 
3499 kern_return_t
3500 zleak_update_threshold(vm_size_t *arg, uint64_t value)
3501 {
3502 	if (value >= zleak_max_zonemap_size) {
3503 		return KERN_INVALID_VALUE;
3504 	}
3505 
3506 	if (arg == &zleak_per_zone_tracking_threshold) {
3507 		zleak_per_zone_tracking_threshold = (vm_size_t)value;
3508 		zleak_pages_per_zone_wired_threshold = atop(value);
3509 		if (startup_phase >= STARTUP_SUB_THREAD_CALL) {
3510 			thread_call_enter(&zone_leaks_callout);
3511 		}
3512 		return KERN_SUCCESS;
3513 	}
3514 
3515 	return KERN_INVALID_ARGUMENT;
3516 }
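
/*
 * Illustrative numbers (hypothetical zone map size): with a 2 GiB
 * zleak_max_zonemap_size, zleak_init() above seeds the tracking threshold
 * at 256 MiB, i.e. zleak_pages_per_zone_wired_threshold is 65536 pages
 * with 4 KiB pages (16384 pages with 16 KiB pages).
 */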
3517 
3518 static void
3519 panic_display_zleaks(bool has_syms)
3520 {
3521 	bool did_header = false;
3522 	vm_address_t bt[BTLOG_MAX_DEPTH];
3523 	uint32_t len, count;
3524 
3525 	zone_foreach(z) {
3526 		btlog_t log = z->z_btlog;
3527 
3528 		if (log == NULL || btlog_get_type(log) != BTLOG_HASH) {
3529 			continue;
3530 		}
3531 
3532 		count = btlog_guess_top(log, bt, &len);
3533 		if (count == 0) {
3534 			continue;
3535 		}
3536 
3537 		if (!did_header) {
3538 			paniclog_append_noflush("Zone (suspected) leak report:\n");
3539 			did_header = true;
3540 		}
3541 
3542 		paniclog_append_noflush("  Zone:    %s%s\n",
3543 		    zone_heap_name(z), zone_name(z));
3544 		paniclog_append_noflush("  Count:   %d (%ld bytes)\n", count,
3545 		    (long)count * zone_scale_for_percpu(z, zone_elem_inner_size(z)));
3546 		paniclog_append_noflush("  Size:    %ld\n",
3547 		    (long)zone_size_wired(z));
3548 		paniclog_append_noflush("  Top backtrace:\n");
3549 		for (uint32_t i = 0; i < len; i++) {
3550 			if (has_syms) {
3551 				paniclog_append_noflush("    %p ", (void *)bt[i]);
3552 				panic_print_symbol_name(bt[i]);
3553 				paniclog_append_noflush("\n");
3554 			} else {
3555 				paniclog_append_noflush("    %p\n", (void *)bt[i]);
3556 			}
3557 		}
3558 
3559 		kmod_panic_dump(bt, len);
3560 		paniclog_append_noflush("\n");
3561 	}
3562 }
3563 #endif /* CONFIG_ZLEAKS */
3564 
3565 #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS */
3566 #if ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS || KASAN_TBI
3567 
3568 #if !KASAN_TBI
3569 __cold
3570 #endif
3571 static void
3572 zalloc_log(zone_t zone, vm_offset_t addr, uint32_t count, void *fp)
3573 {
3574 	btlog_t log = zone->z_btlog;
3575 	btref_get_flags_t flags = 0;
3576 	btref_t ref;
3577 
3578 #if !KASAN_TBI
3579 	if (!log || !btlog_sample(log)) {
3580 		return;
3581 	}
3582 #endif
3583 	if (get_preemption_level() || zone_supports_vm(zone)) {
3584 		/*
3585 		 * VM zones can be used by btlog, avoid reentrancy issues.
3586 		 */
3587 		flags = BTREF_GET_NOWAIT;
3588 	}
3589 
3590 	ref = btref_get(fp, flags);
3591 	while (count-- > 0) {
3592 		if (count) {
3593 			btref_retain(ref);
3594 		}
3595 		btlog_record(log, (void *)addr, ZOP_ALLOC, ref);
3596 		addr += *(vm_offset_t *)addr;
3597 	}
3598 }
3599 
3600 #define ZALLOC_LOG(zone, addr, count)  ({ \
3601 	if ((zone)->z_btlog) {                                                 \
3602 	        zalloc_log(zone, addr, count, __builtin_frame_address(0));     \
3603 	}                                                                      \
3604 })
3605 
3606 #if !KASAN_TBI
3607 __cold
3608 #endif
3609 static void
3610 zfree_log(zone_t zone, vm_offset_t addr, uint32_t count, void *fp)
3611 {
3612 	btlog_t log = zone->z_btlog;
3613 	btref_get_flags_t flags = 0;
3614 	btref_t ref;
3615 
3616 #if !KASAN_TBI
3617 	if (!log) {
3618 		return;
3619 	}
3620 #endif
3621 
3622 	/*
3623 	 * See if we're doing logging on this zone.
3624 	 *
3625 	 * There are two styles of logging used depending on
3626 	 * whether we're trying to catch a leak or corruption.
3627 	 */
3628 #if !KASAN_TBI
3629 	if (btlog_get_type(log) == BTLOG_HASH) {
3630 		/*
3631 		 * We're logging to catch a leak.
3632 		 *
3633 		 * Remove any record we might have for this element
3634 		 * since it's being freed.  Note that we may not find it
3635 		 * if the buffer overflowed and that's OK.
3636 		 *
3637 		 * Since the log is of a limited size, old records get
3638 		 * overwritten if there are more zallocs than zfrees.
3639 		 */
3640 		while (count-- > 0) {
3641 			btlog_erase(log, (void *)addr);
3642 			addr += *(vm_offset_t *)addr;
3643 		}
3644 		return;
3645 	}
3646 #endif /* !KASAN_TBI */
3647 
3648 	if (get_preemption_level() || zone_supports_vm(zone)) {
3649 		/*
3650 		 * VM zones can be used by btlog, avoid reentrancy issues.
3651 		 */
3652 		flags = BTREF_GET_NOWAIT;
3653 	}
3654 
3655 	ref = btref_get(fp, flags);
3656 	while (count-- > 0) {
3657 		if (count) {
3658 			btref_retain(ref);
3659 		}
3660 		btlog_record(log, (void *)addr, ZOP_FREE, ref);
3661 		addr += *(vm_offset_t *)addr;
3662 	}
3663 }
3664 
3665 #define ZFREE_LOG(zone, addr, count)  ({ \
3666 	if ((zone)->z_btlog) {                                                 \
3667 	        zfree_log(zone, addr, count, __builtin_frame_address(0));      \
3668 	}                                                                      \
3669 })
3670 
3671 #else
3672 #define ZALLOC_LOG(...)         ((void)0)
3673 #define ZFREE_LOG(...)          ((void)0)
3674 #endif /* ZONE_ENABLE_LOGGING || CONFIG_ZLEAKS || KASAN_TBI */
3675 #endif /* !ZALLOC_TEST */
3676 #pragma mark zone (re)fill
3677 #if !ZALLOC_TEST
3678 
3679 /*!
3680  * @defgroup Zone Refill
3681  * @{
3682  *
3683  * @brief
3684  * Functions handling the zone refill machinery.
3685  *
3686  * @discussion
3687  * Zones are refilled via two mechanisms: direct expansion and async expansion.
3688  *
3689  * @c zalloc_ext() is the codepath that kicks the zone refill when the zone is
3690  * dropping below half of its @c z_elems_rsv (0 for most zones) and will:
3691  *
3692  * - call @c zone_expand_locked() directly if the caller is allowed to block,
3693  *
3694  * - wake up the asynchronous expansion thread call if the caller is not allowed
3695  *   to block, or if the reserve becomes depleted.
3696  *
3697  *
3698  * <h2>Synchronous expansion</h2>
3699  *
3700  * This mechanism is actually the only one that may refill a zone, and all the
3701  * other ones funnel through this one eventually.
3702  *
3703  * @c zone_expand_locked() implements the core of the expansion mechanism,
3704  * and will do so while a caller specified predicate is true.
3705  *
3706  * Zone expansion allows for up to 2 threads to concurrently refill the zone:
3707  * - one VM privileged thread,
3708  * - one regular thread.
3709  *
3710  * Regular threads that refill will put down their identity in @c z_expander,
3711  * so that priority inversion avoidance can be implemented.
3712  *
3713  * However, VM privileged threads are allowed to use VM page reserves,
3714  * which allows for the system to recover from extreme memory pressure
3715  * situations, allowing for the few allocations that @c zone_gc() or
3716  * killing processes require.
3717  *
3718  * When a VM privileged thread is also expanding, the @c z_expander_vm_priv bit
3719  * is set. @c z_expander is not necessarily the identity of this VM privileged
3720  * thread (it is if the VM privileged thread came in first, but otherwise it
3721  * wouldn't be, and could even be @c THREAD_NULL).
3722  *
3723  * Note that the pageout-scan daemon might be BG and is VM privileged. To avoid
3724  * spending a whole pointer on priority inheritance for VM privileged threads
3725  * (and other issues related to having two owners), we use the rwlock boost as
3726  * a stop gap to avoid priority inversions.
3727  *
3728  *
3729  * <h2>Chunk wiring policies</h2>
3730  *
3731  * Zones allocate memory in chunks of @c zone_t::z_chunk_pages pages at a time
3732  * to try to minimize fragmentation for element sizes that do not align well
3733  * with a chunk size.  However, such chunks can be large and hard to fulfill on
3734  * a system under a lot of memory pressure (chunks can be as long as 8 pages on
3735  * 4k page systems).
3736  *
3737  * This is why, when under memory pressure the system allows chunks to be
3738  * partially populated. The metadata of the first page in the chunk maintains
3739  * the count of actually populated pages.
3740  *
3741  * The metadata for addresses assigned to a zone are found on 4 queues:
3742  * - @c z_pageq_empty has chunk heads with populated pages and no allocated
3743  *   elements (those can be targeted by @c zone_gc()),
3744  * - @c z_pageq_partial has chunk heads with populated pages that are partially
3745  *   used,
3746  * - @c z_pageq_full has chunk heads with populated pages with no free elements
3747  *   left,
3748  * - @c z_pageq_va has either chunk heads for sequestered VA space assigned to
3749  *   the zone forever, or the first secondary metadata for a chunk whose
3750  *   corresponding page is not populated in the chunk.
3751  *
3752  * When new pages need to be wired/populated, chunks from the @c z_pageq_va
3753  * queue are preferred.
3754  *
3755  *
3756  * <h2>Asynchronous expansion</h2>
3757  *
3758  * This mechanism allows for refilling zones used mostly with non blocking
3759  * callers. It relies on a thread call (@c zone_expand_callout) which will
3760  * iterate all zones and refill the ones marked with @c z_async_refilling.
3761  *
3762  * NOTE: If the calling thread for zalloc_noblock is lower priority than
3763  *       the thread_call, then zalloc_noblock to an empty zone may succeed.
3764  *
3765  *
3766  * <h2>Dealing with zone allocations from the mach VM code</h2>
3767  *
3768  * The implementation of the mach VM itself uses the zone allocator
3769  * for things like the vm_map_entry data structure. In order to prevent
3770  * a recursion problem when adding more pages to a zone, the VM zones
3771  * use the Z_SUBMAP_IDX_VM submap which doesn't use kmem_alloc()
3772  * or any VM map functions to allocate.
3773  *
3774  * Instead, a really simple coalescing first-fit allocator is used
3775  * for this submap, and no one else than zalloc can allocate from it.
3776  *
3777  * Memory is directly populated which doesn't require allocation of
3778  * VM map entries, and avoids recursion. The cost of this scheme however,
3779  * is that `vm_map_lookup_entry` will not function on those addresses
3780  * (nor any API relying on it).
3781  */
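
/*
 * Rough sketch of the refill decision described above (an illustrative
 * paraphrase, not the actual zalloc_ext() code):
 *
 *     if (z->z_elems_free <= z->z_elems_rsv / 2) {
 *             if (caller may block)
 *                     zone_expand_locked(z, flags);
 *             else
 *                     zone_expand_async_schedule_if_allowed(z);
 *     }
 */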
3782 
3783 static void zone_reclaim_elements(zone_t z, uint16_t n, vm_offset_t *elems);
3784 static void zone_depot_trim(zone_t z, uint32_t target, struct zone_depot *zd);
3785 static thread_call_data_t zone_expand_callout;
3786 
3787 __attribute__((overloadable))
3788 static inline bool
3789 zone_submap_is_sequestered(zone_submap_idx_t idx)
3790 {
3791 	return idx != Z_SUBMAP_IDX_DATA;
3792 }
3793 
3794 __attribute__((overloadable))
3795 static inline bool
3796 zone_submap_is_sequestered(zone_security_flags_t zsflags)
3797 {
3798 	return zone_submap_is_sequestered(zsflags.z_submap_idx);
3799 }
3800 
3801 static inline kma_flags_t
3802 zone_kma_flags(zone_t z, zone_security_flags_t zsflags, zalloc_flags_t flags)
3803 {
3804 	kma_flags_t kmaflags = KMA_KOBJECT | KMA_ZERO;
3805 
3806 	if (zsflags.z_noencrypt) {
3807 		kmaflags |= KMA_NOENCRYPT;
3808 	}
3809 	if (zsflags.z_submap_idx == Z_SUBMAP_IDX_DATA) {
3810 		kmaflags |= KMA_DATA;
3811 	}
3812 	if (flags & Z_NOPAGEWAIT) {
3813 		kmaflags |= KMA_NOPAGEWAIT;
3814 	}
3815 	if (z->z_permanent || (!z->z_destructible &&
3816 	    zone_submap_is_sequestered(zsflags))) {
3817 		kmaflags |= KMA_PERMANENT;
3818 	}
3819 	if (zsflags.z_submap_from_end) {
3820 		kmaflags |= KMA_LAST_FREE;
3821 	}
3822 
3823 	if (z->z_tbi_tag) {
3824 		kmaflags |= KMA_TAG;
3825 	}
3826 
3827 	return kmaflags;
3828 }
3829 
3830 static inline void
3831 zone_add_wired_pages(zone_t z, uint32_t pages)
3832 {
3833 	os_atomic_add(&zone_pages_wired, pages, relaxed);
3834 
3835 #if CONFIG_ZLEAKS
3836 	if (__improbable(zleak_should_enable_for_zone(z) &&
3837 	    startup_phase >= STARTUP_SUB_THREAD_CALL)) {
3838 		thread_call_enter(&zone_leaks_callout);
3839 	}
3840 #else
3841 	(void)z;
3842 #endif
3843 }
3844 
3845 static inline void
3846 zone_remove_wired_pages(zone_t z, uint32_t pages)
3847 {
3848 	os_atomic_sub(&zone_pages_wired, pages, relaxed);
3849 
3850 #if CONFIG_ZLEAKS
3851 	if (__improbable(zleak_should_disable_for_zone(z) &&
3852 	    startup_phase >= STARTUP_SUB_THREAD_CALL)) {
3853 		thread_call_enter(&zone_leaks_callout);
3854 	}
3855 #else
3856 	(void)z;
3857 #endif
3858 }
3859 
3860 #if CONFIG_KERNEL_TAGGING
3861 static inline vm_address_t
3862 zone_tag_element(zone_t zone, vm_offset_t addr, vm_size_t elem_size)
3863 {
3864 	vm_offset_t tagged_address;
3865 
3866 	tagged_address = vm_memtag_assign_tag(addr, elem_size);
3867 
3868 	vm_memtag_set_tag(tagged_address, elem_size);
3869 
3870 	if (zone->z_percpu) {
3871 		zpercpu_foreach_cpu(index) {
3872 			vm_memtag_set_tag(tagged_address + ptoa(index), elem_size);
3873 		}
3874 	}
3875 
3876 	return tagged_address;
3877 }
3878 
3879 static inline void
3880 zcram_memtag_init(zone_t zone, vm_offset_t base, uint32_t start, uint32_t end)
3881 {
3882 	vm_offset_t elem_size = zone_elem_outer_size(zone);
3883 	vm_offset_t oob_offs = zone_elem_outer_offs(zone);
3884 
3885 	for (uint32_t i = start; i < end; i++) {
3886 		vm_offset_t elem_addr = base + oob_offs + i * elem_size;
3887 
3888 		(void)zone_tag_element(zone, elem_addr, elem_size);
3889 	}
3890 }
3891 #endif /* CONFIG_KERNEL_TAGGING */
3892 
3893 /*!
3894  * @function zcram_and_lock()
3895  *
3896  * @brief
3897  * Prepare some memory for being usable for allocation purposes.
3898  *
3899  * @discussion
3900  * Prepare memory in <code>[addr + ptoa(pg_start), addr + ptoa(pg_end))</code>
3901  * to be usable in the zone.
3902  *
3903  * This function assumes the metadata is already populated for the range.
3904  *
3905  * Calling this function with @c pg_start being 0 means that the memory
3906  * is either a partial chunk, or a full chunk, that isn't published anywhere
3907  * and the initialization can happen without locks held.
3908  *
3909  * Calling this function with a non zero @c pg_start means that we are extending
3910  * an existing chunk: the memory in <code>[addr, addr + ptoa(pg_start))</code>,
3911  * is already usable and published in the zone, so extending it requires holding
3912  * the zone lock.
3913  *
3914  * @param zone          The zone to cram new populated pages into
3915  * @param addr          The base address for the chunk(s)
3916  * @param pg_va_new     The number of virtual pages newly assigned to the zone
3917  * @param pg_start      The first newly populated page relative to @a addr.
3918  * @param pg_end        The after-last newly populated page relative to @a addr.
3919  * @param lock          0 or ZM_ALLOC_SIZE_LOCK (used by early crams)
3920  */
3921 static void
3922 zcram_and_lock(zone_t zone, vm_offset_t addr,
3923     uint32_t pg_start, uint32_t pg_end, uint16_t lock)
3924 {
3925 	zone_id_t zindex = zone_index(zone);
3926 	vm_offset_t elem_size = zone_elem_outer_size(zone);
3927 	uint32_t free_start = 0, free_end = 0;
3928 	uint32_t oob_offs = zone_elem_outer_offs(zone);
3929 
3930 	struct zone_page_metadata *meta = zone_meta_from_addr(addr);
3931 	uint32_t chunk_pages = zone->z_chunk_pages;
3932 	bool guarded = meta->zm_guarded;
3933 
3934 	assert(pg_start < pg_end && pg_end <= chunk_pages);
3935 
3936 	if (pg_start == 0) {
3937 		uint16_t chunk_len = (uint16_t)pg_end;
3938 		uint16_t secondary_len = ZM_SECONDARY_PAGE;
3939 		bool inline_bitmap = false;
3940 
3941 		if (zone->z_percpu) {
3942 			chunk_len = 1;
3943 			secondary_len = ZM_SECONDARY_PCPU_PAGE;
3944 			assert(pg_end == zpercpu_count());
3945 		}
3946 		if (!zone->z_permanent && !zone->z_uses_tags) {
3947 			inline_bitmap = zone->z_chunk_elems <= 32 * chunk_pages;
3948 		}
3949 
3950 		free_end = (uint32_t)(ptoa(chunk_len) - oob_offs) / elem_size;
3951 
3952 		meta[0] = (struct zone_page_metadata){
3953 			.zm_index         = zindex,
3954 			.zm_guarded       = guarded,
3955 			.zm_inline_bitmap = inline_bitmap,
3956 			.zm_chunk_len     = chunk_len,
3957 			.zm_alloc_size    = lock,
3958 		};
3959 
3960 		if (!zone->z_permanent && !inline_bitmap) {
3961 			meta[0].zm_bitmap = zone_meta_bits_alloc_init(free_end,
3962 			    zone->z_chunk_elems, zone->z_uses_tags);
3963 		}
3964 
3965 		for (uint16_t i = 1; i < chunk_pages; i++) {
3966 			meta[i] = (struct zone_page_metadata){
3967 				.zm_index          = zindex,
3968 				.zm_guarded        = guarded,
3969 				.zm_inline_bitmap  = inline_bitmap,
3970 				.zm_chunk_len      = secondary_len,
3971 				.zm_page_index     = (uint8_t)i,
3972 				.zm_bitmap         = meta[0].zm_bitmap,
3973 				.zm_subchunk_len   = (uint8_t)(chunk_pages - i),
3974 			};
3975 		}
3976 
3977 		if (inline_bitmap) {
3978 			zone_meta_bits_init_inline(meta, free_end);
3979 		}
3980 	} else {
3981 		assert(!zone->z_percpu && !zone->z_permanent);
3982 
3983 		free_end = (uint32_t)(ptoa(pg_end) - oob_offs) / elem_size;
3984 		free_start = (uint32_t)(ptoa(pg_start) - oob_offs) / elem_size;
3985 	}
3986 
3987 #if CONFIG_KERNEL_TAGGING
3988 	if (__probable(zone->z_tbi_tag)) {
3989 		zcram_memtag_init(zone, addr, free_start, free_end);
3990 	}
3991 #endif /* CONFIG_KERNEL_TAGGING */
3992 
3993 #if KASAN_CLASSIC
3994 	assert(pg_start == 0); /* KASAN_CLASSIC never does partial chunks */
3995 	if (zone->z_permanent) {
3996 		kasan_poison_range(addr, ptoa(pg_end), ASAN_VALID);
3997 	} else if (zone->z_percpu) {
3998 		for (uint32_t i = 0; i < pg_end; i++) {
3999 			kasan_zmem_add(addr + ptoa(i), PAGE_SIZE,
4000 			    zone_elem_outer_size(zone),
4001 			    zone_elem_outer_offs(zone),
4002 			    zone_elem_redzone(zone));
4003 		}
4004 	} else {
4005 		kasan_zmem_add(addr, ptoa(pg_end),
4006 		    zone_elem_outer_size(zone),
4007 		    zone_elem_outer_offs(zone),
4008 		    zone_elem_redzone(zone));
4009 	}
4010 #endif /* KASAN_CLASSIC */
4011 
4012 	/*
4013 	 * Insert the initialized pages / metadatas into the right lists.
4014 	 */
4015 
4016 	zone_lock(zone);
4017 	assert(zone->z_self == zone);
4018 
4019 	if (pg_start != 0) {
4020 		assert(meta->zm_chunk_len == pg_start);
4021 
4022 		zone_meta_bits_merge(meta, free_start, free_end);
4023 		meta->zm_chunk_len = (uint16_t)pg_end;
4024 
4025 		/*
4026 		 * consume the zone_meta_lock_in_partial()
4027 		 * done in zone_expand_locked()
4028 		 */
4029 		zone_meta_alloc_size_sub(zone, meta, ZM_ALLOC_SIZE_LOCK);
4030 		zone_meta_remqueue(zone, meta);
4031 	}
4032 
4033 	if (zone->z_permanent || meta->zm_alloc_size) {
4034 		zone_meta_queue_push(zone, &zone->z_pageq_partial, meta);
4035 	} else {
4036 		zone_meta_queue_push(zone, &zone->z_pageq_empty, meta);
4037 		zone->z_wired_empty += zone->z_percpu ? 1 : pg_end;
4038 	}
4039 	if (pg_end < chunk_pages) {
4040 		/* push any non populated residual VA on z_pageq_va */
4041 		zone_meta_queue_push(zone, &zone->z_pageq_va, meta + pg_end);
4042 	}
4043 
4044 	zone->z_elems_free  += free_end - free_start;
4045 	zone->z_elems_avail += free_end - free_start;
4046 	zone->z_wired_cur   += zone->z_percpu ? 1 : pg_end - pg_start;
4047 	if (pg_va_new) {
4048 		zone->z_va_cur += zone->z_percpu ? 1 : pg_va_new;
4049 	}
4050 	if (zone->z_wired_hwm < zone->z_wired_cur) {
4051 		zone->z_wired_hwm = zone->z_wired_cur;
4052 	}
4053 
4054 #if CONFIG_ZLEAKS
4055 	if (__improbable(zleak_should_enable_for_zone(zone) &&
4056 	    startup_phase >= STARTUP_SUB_THREAD_CALL)) {
4057 		thread_call_enter(&zone_leaks_callout);
4058 	}
4059 #endif /* CONFIG_ZLEAKS */
4060 
4061 	zone_add_wired_pages(zone, pg_end - pg_start);
4062 }
4063 
4064 static void
4065 zcram(zone_t zone, vm_offset_t addr, uint32_t pages, uint16_t lock)
4066 {
4067 	uint32_t chunk_pages = zone->z_chunk_pages;
4068 
4069 	assert(pages % chunk_pages == 0);
4070 	for (; pages > 0; pages -= chunk_pages, addr += ptoa(chunk_pages)) {
4071 		zcram_and_lock(zone, addr, chunk_pages, 0, chunk_pages, lock);
4072 		zone_unlock(zone);
4073 	}
4074 }
4075 
4076 __startup_func
4077 void
4078 zone_cram_early(zone_t zone, vm_offset_t newmem, vm_size_t size)
4079 {
4080 	uint32_t pages = (uint32_t)atop(size);
4081 
4082 
4083 	assert(from_zone_map(newmem, size));
4084 	assert3u(size % ptoa(zone->z_chunk_pages), ==, 0);
4085 	assert3u(startup_phase, <, STARTUP_SUB_ZALLOC);
4086 
4087 	/*
4088 	 * The early pages we move at the pmap layer can't be "depopulated"
4089 	 * because there's no vm_page_t for them.
4090 	 *
4091 	 * "Lock" them so that they never hit z_pageq_empty.
4092 	 */
4093 	vm_memtag_bzero((void *)newmem, size);
4094 	zcram(zone, newmem, pages, ZM_ALLOC_SIZE_LOCK);
4095 }
4096 
4097 /*!
4098  * @function zone_submap_alloc_sequestered_va
4099  *
4100  * @brief
4101  * Allocates VA without using vm_find_space().
4102  *
4103  * @discussion
4104  * Allocate VA quickly without using the slower vm_find_space() for cases
4105  * when the submaps are fully sequestered.
4106  *
4107  * The VM submap is used to implement the VM itself so it is always sequestered,
4108  * as it can't use kmem_alloc(), which always needs to allocate VM map entries.
4109  * However, it can use vm_map_enter() which tries to coalesce entries, which
4110  * always works, so the VM map only ever needs 2 entries (one for each end).
4111  *
4112  * The RO submap is similarly always sequestered if it exists (as a non
4113  * sequestered RO submap makes very little sense).
4114  *
4115  * The allocator is a very simple bump-allocator
4116  * that allocates from either end.
4117  */
4118 static kern_return_t
4119 zone_submap_alloc_sequestered_va(zone_security_flags_t zsflags, uint32_t pages,
4120     vm_offset_t *addrp)
4121 {
4122 	vm_size_t size = ptoa(pages);
4123 	vm_map_t map = zone_submap(zsflags);
4124 	vm_map_entry_t first, last;
4125 	vm_map_offset_t addr;
4126 
4127 	vm_map_lock(map);
4128 
4129 	first = vm_map_first_entry(map);
4130 	last = vm_map_last_entry(map);
4131 
4132 	if (first->vme_end + size > last->vme_start) {
4133 		vm_map_unlock(map);
4134 		return KERN_NO_SPACE;
4135 	}
4136 
4137 	if (zsflags.z_submap_from_end) {
4138 		last->vme_start -= size;
4139 		addr = last->vme_start;
4140 		VME_OFFSET_SET(last, addr);
4141 	} else {
4142 		addr = first->vme_end;
4143 		first->vme_end += size;
4144 	}
4145 	map->size += size;
4146 
4147 	vm_map_unlock(map);
4148 
4149 	*addrp = addr;
4150 	return KERN_SUCCESS;
4151 }
4152 
4153 void
4154 zone_fill_initially(zone_t zone, vm_size_t nelems)
4155 {
4156 	kma_flags_t kmaflags = KMA_NOFAIL | KMA_PERMANENT;
4157 	kern_return_t kr;
4158 	vm_offset_t addr;
4159 	uint32_t pages;
4160 	zone_security_flags_t zsflags = zone_security_config(zone);
4161 
4162 	assert(!zone->z_permanent && !zone->collectable && !zone->z_destructible);
4163 	assert(zone->z_elems_avail == 0);
4164 
4165 	kmaflags |= zone_kma_flags(zone, zsflags, Z_WAITOK);
4166 	pages = zone_alloc_pages_for_nelems(zone, nelems);
4167 	if (zone_submap_is_sequestered(zsflags)) {
4168 		kr = zone_submap_alloc_sequestered_va(zsflags, pages, &addr);
4169 		if (kr != KERN_SUCCESS) {
4170 			panic("zone_submap_alloc_sequestered_va() "
4171 			    "of %u pages failed", pages);
4172 		}
4173 		kernel_memory_populate(addr, ptoa(pages),
4174 		    kmaflags, VM_KERN_MEMORY_ZONE);
4175 	} else {
4176 		assert(zsflags.z_submap_idx != Z_SUBMAP_IDX_READ_ONLY);
4177 		kmem_alloc(zone_submap(zsflags), &addr, ptoa(pages),
4178 		    kmaflags, VM_KERN_MEMORY_ZONE);
4179 	}
4180 
4181 	zone_meta_populate(addr, ptoa(pages));
4182 	zcram(zone, addr, pages, 0);
4183 }
4184 
4185 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
4186 __attribute__((noinline))
4187 static void
4188 zone_scramble_va_and_unlock(
4189 	zone_t                      z,
4190 	struct zone_page_metadata  *meta,
4191 	uint32_t                    runs,
4192 	uint32_t                    pages,
4193 	uint32_t                    chunk_pages,
4194 	uint64_t                    guard_mask)
4195 {
4196 	struct zone_page_metadata *arr[ZONE_CHUNK_ALLOC_SIZE / 4096];
4197 
4198 	for (uint32_t run = 0, n = 0; run < runs; run++) {
4199 		arr[run] = meta + n;
4200 		n += chunk_pages + ((guard_mask >> run) & 1);
4201 	}
4202 
4203 	/*
4204 	 * Fisher–Yates shuffle, for an array with indices [0, n)
4205 	 *
4206 	 * for i from n−1 downto 1 do
4207 	 *     j ← random integer such that 0 ≤ j ≤ i
4208 	 *     exchange a[j] and a[i]
4209 	 *
4210 	 * The point here is that early allocations aren't at a fixed
4211 	 * distance from each other.
4212 	 */
4213 	for (uint32_t i = runs - 1; i > 0; i--) {
4214 		uint32_t j = zalloc_random_uniform32(0, i + 1);
4215 
4216 		meta   = arr[j];
4217 		arr[j] = arr[i];
4218 		arr[i] = meta;
4219 	}
4220 
4221 	zone_lock(z);
4222 
4223 	for (uint32_t i = 0; i < runs; i++) {
4224 		zone_meta_queue_push(z, &z->z_pageq_va, arr[i]);
4225 	}
4226 	z->z_va_cur += z->z_percpu ? runs : pages;
4227 }
4228 
4229 static inline uint32_t
4230 dist_u32(uint32_t a, uint32_t b)
4231 {
4232 	return a < b ? b - a : a - b;
4233 }
4234 
4235 static uint64_t
4236 zalloc_random_clear_n_bits(uint64_t mask, uint32_t pop, uint32_t n)
4237 {
4238 	for (; n-- > 0; pop--) {
4239 		uint32_t bit = zalloc_random_uniform32(0, pop);
4240 		uint64_t m = mask;
4241 
4242 		for (; bit; bit--) {
4243 			m &= m - 1;
4244 		}
4245 
4246 		mask ^= 1ull << __builtin_ctzll(m);
4247 	}
4248 
4249 	return mask;
4250 }
4251 
4252 /**
4253  * @function zalloc_random_bits
4254  *
4255  * @brief
4256  * Compute a random number with a specified number of bit set in a given width.
4257  *
4258  * @discussion
4259  * This function generates a "uniform" distribution of sets of bits set in
4260  * a given width, with typically less than width/4 calls to random.
4261  *
4262  * @param pop           the target number of bits set.
4263  * @param width         the number of bits in the random integer to generate.
4264  */
4265 static uint64_t
4266 zalloc_random_bits(uint32_t pop, uint32_t width)
4267 {
4268 	uint64_t w_mask = (1ull << width) - 1;
4269 	uint64_t mask;
4270 	uint32_t cur;
4271 
4272 	if (3 * width / 4 <= pop) {
4273 		mask = w_mask;
4274 		cur  = width;
4275 	} else if (pop <= width / 4) {
4276 		mask = 0;
4277 		cur  = 0;
4278 	} else {
4279 		/*
4280 		 * Choosing a random number this way will overwhelmingly
4281 		 * yield a popcount of about `width / 2`, +/- a few bits.
4282 		 */
4283 		mask = zalloc_random_mask64(width);
4284 		cur  = __builtin_popcountll(mask);
4285 
4286 		if (dist_u32(cur, pop) > dist_u32(width - cur, pop)) {
4287 			/*
4288 			 * If the opposite mask has a closer popcount,
4289 			 * then start with that one as the seed.
4290 			 */
4291 			cur = width - cur;
4292 			mask ^= w_mask;
4293 		}
4294 	}
4295 
4296 	if (cur < pop) {
4297 		/*
4298 		 * Setting `pop - cur` bits is really clearing that many from
4299 		 * the opposite mask.
4300 		 */
4301 		mask ^= w_mask;
4302 		mask = zalloc_random_clear_n_bits(mask, width - cur, pop - cur);
4303 		mask ^= w_mask;
4304 	} else if (pop < cur) {
4305 		mask = zalloc_random_clear_n_bits(mask, cur, cur - pop);
4306 	}
4307 
4308 	return mask;
4309 }
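
/*
 * For instance, zalloc_random_bits(3, 8) always returns a value with
 * exactly 3 bits set within bits [0, 8), e.g. 0x29 (0b00101001): the
 * randomly seeded mask is adjusted one bit at a time until its popcount
 * matches `pop`.
 */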
4310 #endif
4311 
4312 static void
4313 zone_allocate_va_locked(zone_t z, zalloc_flags_t flags)
4314 {
4315 	zone_security_flags_t zsflags = zone_security_config(z);
4316 	struct zone_page_metadata *meta;
4317 	kma_flags_t kmaflags = zone_kma_flags(z, zsflags, flags) | KMA_VAONLY;
4318 	uint32_t chunk_pages = z->z_chunk_pages;
4319 	uint32_t runs, pages, guards, rnum;
4320 	uint64_t guard_mask = 0;
4321 	bool     lead_guard = false;
4322 	kern_return_t kr;
4323 	vm_offset_t addr;
4324 
4325 	zone_unlock(z);
4326 
4327 	/*
4328 	 * A lot of OOB exploitation techniques rely on precise placement
4329 	 * and interleaving of zone pages. The layout that is sought
4330 	 * by attackers will be C/P/T types, where:
4331 	 * - (C)ompromised is the type for which attackers have a bug,
4332 	 * - (P)adding is used to pad memory,
4333 	 * - (T)arget is the type that the attacker will attempt to corrupt
4334 	 *   by exploiting (C).
4335 	 *
4336 	 * Note that in some cases C==T and P isn't needed.
4337 	 *
4338 	 * In order to make those placement games much harder,
4339 	 * we grow zones by random runs of memory, up to 256k.
4340 	 * This makes predicting the precise layout of the heap
4341 	 * significantly more complicated.
4342 	 *
4343 	 * Note: this function makes a very heavy use of random,
4344 	 *       however, it is mostly limited to sequestered zones,
4345 	 *       and eventually the layout will be fixed,
4346 	 *       and the usage of random vastly reduced.
4347 	 *
4348 	 *       For non sequestered zones, there's a single call
4349 	 *       to random in order to decide whether we want
4350 	 *       a guard page or not.
4351 	 */
4352 	pages  = chunk_pages;
4353 	guards = 0;
4354 	runs   = 1;
4355 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
4356 	if (!z->z_percpu && zone_submap_is_sequestered(zsflags)) {
4357 		pages = atop(ZONE_CHUNK_ALLOC_SIZE);
4358 		runs  = (pages + chunk_pages - 1) / chunk_pages;
4359 		runs  = zalloc_random_uniform32(1, runs + 1);
4360 		pages = runs * chunk_pages;
4361 	}
4362 	static_assert(ZONE_CHUNK_ALLOC_SIZE / 4096 <= 64,
4363 	    "make sure that `runs` will never be larger than 64");
4364 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
4365 
4366 	/*
4367 	 * For zones that are susceptible to OOB (kalloc, ZC_PGZ_USE_GUARDS),
4368 	 * guards might be added after each chunk.
4369 	 *
4370 	 * Those guard pages are marked with the ZM_PGZ_GUARD
4371 	 * magical chunk len, and their zm_oob_offs field
4372 	 * is used to remember the optional shift applied
4373 	 * to returned elements, in order to right-align them
4374 	 * as much as possible.
4375 	 *
4376 	 * In an adversarial context, while guard pages
4377 	 * are extremely effective against linear overflow,
4378 	 * using a predictable density of guard pages feels like
4379 	 * a missed opportunity. This is why we chose to insert
4380 	 * one guard page for about every 32k of memory, and place it
4381 	 * randomly.
4382 	 */
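	/*
	 * Worked example of the density math used below (illustrative,
	 * assuming 4k pages): with a target of one guard per 32k, a 48k
	 * run yields guards = 48k / 32k = 1, plus a 50% chance of one more
	 * from the 16k remainder (the `rnum < rest` term), i.e. 1.5 guard
	 * pages on average.
	 */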
4383 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
4384 	if (z->z_percpu) {
4385 		/*
4386 		 * For per-cpu runs, have a 75% chance to have a guard.
4387 		 */
4388 		rnum = zalloc_random_uniform32(0, 4 * 128);
4389 		guards = rnum >= 128;
4390 	} else if (!zsflags.z_pgz_use_guards && !z->z_pgz_use_guards) {
4391 		vm_offset_t rest;
4392 
4393 		/*
4394 		 * For types that are less susceptible to OOBs,
4395 		 * have a density of 1 guard every 64k, with a uniform
4396 		 * distribution.
4397 		 */
4398 		rnum   = zalloc_random_uniform32(0, ZONE_GUARD_SPARSE);
4399 		guards = (uint32_t)ptoa(pages) / ZONE_GUARD_SPARSE;
4400 		rest   = (uint32_t)ptoa(pages) % ZONE_GUARD_SPARSE;
4401 		guards += rnum < rest;
4402 	} else if (ptoa(chunk_pages) >= ZONE_GUARD_DENSE) {
4403 		/*
4404 		 * For chunks >= 32k, have a 75% chance of guard pages
4405 		 * between chunks.
4406 		 */
4407 		rnum = zalloc_random_uniform32(65, 129);
4408 		guards = runs * rnum / 128;
4409 	} else {
4410 		vm_offset_t rest;
4411 
4412 		/*
4413 		 * Otherwise, aim at 1 guard every 32k,
4414 		 * with a uniform distribution.
4415 		 */
4416 		rnum   = zalloc_random_uniform32(0, ZONE_GUARD_DENSE);
4417 		guards = (uint32_t)ptoa(pages) / ZONE_GUARD_DENSE;
4418 		rest   = (uint32_t)ptoa(pages) % ZONE_GUARD_DENSE;
4419 		guards += rnum < rest;
4420 	}
4421 	assert3u(guards, <=, runs);
4422 
4423 	guard_mask = 0;
4424 
4425 	if (!z->z_percpu && zone_submap_is_sequestered(zsflags)) {
4426 		uint32_t g = 0;
4427 
4428 		/*
4429 		 * Several exploitation strategies rely on a C/T (compromised
4430 		 * then target types) ordering of pages with a sub-page reach
4431 		 * from C into T.
4432 		 *
4433 		 * We want to reliably thwart such exploitations
4434 		 * and hence force a guard page between alternating
4435 		 * memory types.
4436 		 */
4437 		guard_mask |= 1ull << (runs - 1);
4438 		g++;
4439 
4440 		/*
4441 		 * While we randomize the chunk lengths, an attacker with
4442 		 * precise timing control can guess when overflows happen,
4443 		 * and "measure" the runs, which gives them an indication
4444 		 * of where the next run starts.
4445 		 *
4446 		 * In order to make this knowledge unusable, add a guard page
4447 		 * _before_ the new run with a 25% probability, regardless
4448 		 * of whether we had enough guard pages.
4449 		 */
4450 		if ((rnum & 3) == 0) {
4451 			lead_guard = true;
4452 			g++;
4453 		}
4454 		if (guards > g) {
4455 			guard_mask |= zalloc_random_bits(guards - g, runs - 1);
4456 		} else {
4457 			guards = g;
4458 		}
4459 	} else {
4460 		assert3u(runs, ==, 1);
4461 		assert3u(guards, <=, 1);
4462 		guard_mask = guards << (runs - 1);
4463 	}
4464 #else
4465 	(void)rnum;
4466 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
4467 
4468 	if (zone_submap_is_sequestered(zsflags)) {
4469 		kr = zone_submap_alloc_sequestered_va(zsflags,
4470 		    pages + guards, &addr);
4471 	} else {
4472 		assert(zsflags.z_submap_idx != Z_SUBMAP_IDX_READ_ONLY);
4473 		kr = kmem_alloc(zone_submap(zsflags), &addr,
4474 		    ptoa(pages + guards), kmaflags, VM_KERN_MEMORY_ZONE);
4475 	}
4476 
4477 	if (kr != KERN_SUCCESS) {
4478 		uint64_t zone_size = 0;
4479 		zone_t zone_largest = zone_find_largest(&zone_size);
4480 		panic("zalloc[%d]: zone map exhausted while allocating from zone [%s%s], "
4481 		    "likely due to memory leak in zone [%s%s] "
4482 		    "(%u%c, %d elements allocated)",
4483 		    kr, zone_heap_name(z), zone_name(z),
4484 		    zone_heap_name(zone_largest), zone_name(zone_largest),
4485 		    mach_vm_size_pretty(zone_size),
4486 		    mach_vm_size_unit(zone_size),
4487 		    zone_count_allocated(zone_largest));
4488 	}
4489 
4490 	meta = zone_meta_from_addr(addr);
4491 	zone_meta_populate(addr, ptoa(pages + guards));
4492 
4493 	/*
4494 	 * Handle the leading guard page if any
4495 	 */
4496 	if (lead_guard) {
4497 		meta[0].zm_index = zone_index(z);
4498 		meta[0].zm_chunk_len = ZM_PGZ_GUARD;
4499 		meta[0].zm_guarded = true;
4500 		meta++;
4501 	}
4502 
4503 	for (uint32_t run = 0, n = 0; run < runs; run++) {
4504 		bool guarded = (guard_mask >> run) & 1;
4505 
4506 		for (uint32_t i = 0; i < chunk_pages; i++, n++) {
4507 			meta[n].zm_index = zone_index(z);
4508 			meta[n].zm_guarded = guarded;
4509 		}
4510 		if (guarded) {
4511 			meta[n].zm_index = zone_index(z);
4512 			meta[n].zm_chunk_len = ZM_PGZ_GUARD;
4513 			n++;
4514 		}
4515 	}
4516 	if (guards) {
4517 		os_atomic_add(&zone_guard_pages, guards, relaxed);
4518 	}
4519 
4520 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
4521 	if (__improbable(zone_caching_disabled < 0)) {
4522 		return zone_scramble_va_and_unlock(z, meta, runs, pages,
4523 		           chunk_pages, guard_mask);
4524 	}
4525 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
4526 
4527 	zone_lock(z);
4528 
4529 	for (uint32_t run = 0, n = 0; run < runs; run++) {
4530 		zone_meta_queue_push(z, &z->z_pageq_va, meta + n);
4531 		n += chunk_pages + ((guard_mask >> run) & 1);
4532 	}
4533 	z->z_va_cur += z->z_percpu ? runs : pages;
4534 }
4535 
4536 static inline void
4537 ZONE_TRACE_VM_KERN_REQUEST_START(vm_size_t size)
4538 {
4539 #if DEBUG || DEVELOPMENT
4540 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START,
4541 	    size, 0, 0, 0);
4542 #else
4543 	(void)size;
4544 #endif
4545 }
4546 
4547 static inline void
4548 ZONE_TRACE_VM_KERN_REQUEST_END(uint32_t pages)
4549 {
4550 #if DEBUG || DEVELOPMENT
4551 	task_t task = current_task_early();
4552 	if (pages && task) {
4553 		ledger_credit(task->ledger, task_ledgers.pages_grabbed_kern, pages);
4554 	}
4555 	VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END,
4556 	    pages, 0, 0, 0);
4557 #else
4558 	(void)pages;
4559 #endif
4560 }
4561 
4562 __attribute__((noinline))
4563 static void
4564 __ZONE_MAP_EXHAUSTED_AND_WAITING_FOR_GC__(zone_t z, uint32_t pgs)
4565 {
4566 	uint64_t wait_start = 0;
4567 	long mapped;
4568 
4569 	thread_wakeup(VM_PAGEOUT_GC_EVENT);
4570 
4571 	if (zone_supports_vm(z) || (current_thread()->options & TH_OPT_VMPRIV)) {
4572 		return;
4573 	}
4574 
4575 	mapped = os_atomic_load(&zone_pages_wired, relaxed);
4576 
4577 	/*
4578 	 * If the zone map is really exhausted, wait on the GC thread,
4579 	 * donating our priority (which is important because the GC
4580 	 * thread is at a rather low priority).
4581 	 */
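	/*
	 * The back-off below is triangular: wait_ms is 1, 3, 6, 10, ... ms
	 * on successive iterations (n * (n + 1) / 2), so e.g. the 10th
	 * retry sleeps for 55ms.
	 */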
4582 	for (uint32_t n = 1; mapped >= zone_pages_wired_max - pgs; n++) {
4583 		uint32_t wait_ms = n * (n + 1) / 2;
4584 		uint64_t interval;
4585 
4586 		if (n == 1) {
4587 			wait_start = mach_absolute_time();
4588 		} else {
4589 			thread_wakeup(VM_PAGEOUT_GC_EVENT);
4590 		}
4591 		if (zone_exhausted_timeout > 0 &&
4592 		    wait_ms > zone_exhausted_timeout) {
4593 			panic("zone map exhaustion: waited for %dms "
4594 			    "(pages: %ld, max: %ld, wanted: %d)",
4595 			    wait_ms, mapped, zone_pages_wired_max, pgs);
4596 		}
4597 
4598 		clock_interval_to_absolutetime_interval(wait_ms, NSEC_PER_MSEC,
4599 		    &interval);
4600 
4601 		lck_spin_lock(&zone_exhausted_lock);
4602 		lck_spin_sleep_with_inheritor(&zone_exhausted_lock,
4603 		    LCK_SLEEP_UNLOCK, &zone_pages_wired,
4604 		    vm_pageout_gc_thread, THREAD_UNINT, wait_start + interval);
4605 
4606 		mapped = os_atomic_load(&zone_pages_wired, relaxed);
4607 	}
4608 }
4609 
4610 static bool
4611 zone_expand_wait_for_pages(bool waited)
4612 {
4613 	if (waited) {
4614 		return false;
4615 	}
4616 #if DEBUG || DEVELOPMENT
4617 	if (zalloc_simulate_vm_pressure) {
4618 		return false;
4619 	}
4620 #endif /* DEBUG || DEVELOPMENT */
4621 	return !vm_pool_low();
4622 }
4623 
4624 static inline void
4625 zone_expand_async_schedule_if_allowed(zone_t zone)
4626 {
4627 	if (zone->z_async_refilling || zone->no_callout) {
4628 		return;
4629 	}
4630 
4631 	if (zone_exhausted(zone)) {
4632 		return;
4633 	}
4634 
4635 	if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
4636 		return;
4637 	}
4638 
4639 	if (!vm_pool_low() || zone_supports_vm(zone)) {
4640 		zone->z_async_refilling = true;
4641 		thread_call_enter(&zone_expand_callout);
4642 	}
4643 }
4644 
4645 __attribute__((noinline))
4646 static bool
4647 zalloc_expand_drain_exhausted_caches_locked(zone_t z)
4648 {
4649 	struct zone_depot zd;
4650 	zone_magazine_t mag = NULL;
4651 
4652 	if (z->z_depot_size) {
4653 		z->z_depot_size = 0;
4654 		z->z_depot_cleanup = true;
4655 
4656 		zone_depot_init(&zd);
4657 		zone_depot_trim(z, 0, &zd);
4658 
4659 		zone_recirc_lock_nopreempt(z);
4660 		if (zd.zd_full) {
4661 			zone_depot_move_full(&z->z_recirc,
4662 			    &zd, zd.zd_full, NULL);
4663 		}
4664 		if (zd.zd_empty) {
4665 			zone_depot_move_empty(&z->z_recirc,
4666 			    &zd, zd.zd_empty, NULL);
4667 		}
4668 		zone_recirc_unlock_nopreempt(z);
4669 	}
4670 
4671 	zone_recirc_lock_nopreempt(z);
4672 	if (z->z_recirc.zd_full) {
4673 		mag = zone_depot_pop_head_full(&z->z_recirc, z);
4674 	}
4675 	zone_recirc_unlock_nopreempt(z);
4676 
4677 	if (mag) {
4678 		zone_reclaim_elements(z, zc_mag_size(), mag->zm_elems);
4679 		zone_magazine_free(mag);
4680 	}
4681 
4682 	return mag != NULL;
4683 }
4684 
4685 static bool
4686 zalloc_needs_refill(zone_t zone, zalloc_flags_t flags)
4687 {
4688 	if (zone->z_elems_free > zone->z_elems_rsv) {
4689 		return false;
4690 	}
4691 	if (!zone_exhausted(zone)) {
4692 		return true;
4693 	}
4694 	if (zone->z_pcpu_cache && zone->z_depot_size) {
4695 		if (zalloc_expand_drain_exhausted_caches_locked(zone)) {
4696 			return false;
4697 		}
4698 	}
4699 	return (flags & Z_NOFAIL) != 0;
4700 }
4701 
4702 static void
4703 zone_wakeup_exhausted_waiters(zone_t z)
4704 {
4705 	z->z_exhausted_wait = false;
4706 	EVENT_INVOKE(ZONE_EXHAUSTED, zone_index(z), z, false);
4707 	thread_wakeup(&z->z_expander);
4708 }
4709 
4710 __attribute__((noinline))
4711 static void
4712 __ZONE_EXHAUSTED_AND_WAITING_HARD__(zone_t z)
4713 {
4714 	if (z->z_pcpu_cache && z->z_depot_size &&
4715 	    zalloc_expand_drain_exhausted_caches_locked(z)) {
4716 		return;
4717 	}
4718 
4719 	if (!z->z_exhausted_wait) {
4720 		zone_recirc_lock_nopreempt(z);
4721 		z->z_exhausted_wait = true;
4722 		zone_recirc_unlock_nopreempt(z);
4723 		EVENT_INVOKE(ZONE_EXHAUSTED, zone_index(z), z, true);
4724 	}
4725 
4726 	assert_wait(&z->z_expander, TH_UNINT);
4727 	zone_unlock(z);
4728 	thread_block(THREAD_CONTINUE_NULL);
4729 	zone_lock(z);
4730 }
4731 
4732 static pmap_mapping_type_t
4733 zone_mapping_type(zone_t z)
4734 {
4735 	zone_security_flags_t zsflags = zone_security_config(z);
4736 
4737 	/*
4738 	 * If the zone's z_submap_idx is neither Z_SUBMAP_IDX_DATA nor
4739 	 * Z_SUBMAP_IDX_READ_ONLY, mark the corresponding mapping
4740 	 * type as PMAP_MAPPING_TYPE_RESTRICTED.
4741 	 */
4742 	switch (zsflags.z_submap_idx) {
4743 	case Z_SUBMAP_IDX_DATA:
4744 		return PMAP_MAPPING_TYPE_DEFAULT;
4745 	case Z_SUBMAP_IDX_READ_ONLY:
4746 		return PMAP_MAPPING_TYPE_ROZONE;
4747 	default:
4748 		return PMAP_MAPPING_TYPE_RESTRICTED;
4749 	}
4750 }
4751 
4752 static vm_prot_t
4753 zone_page_prot(zone_security_flags_t zsflags)
4754 {
4755 	switch (zsflags.z_submap_idx) {
4756 	case Z_SUBMAP_IDX_READ_ONLY:
4757 		return VM_PROT_READ;
4758 	default:
4759 		return VM_PROT_READ | VM_PROT_WRITE;
4760 	}
4761 }
4762 
4763 static void
4764 zone_expand_locked(zone_t z, zalloc_flags_t flags)
4765 {
4766 	zone_security_flags_t zsflags = zone_security_config(z);
4767 	struct zone_expand ze = {
4768 		.ze_thread  = current_thread(),
4769 	};
4770 
4771 	if (!(ze.ze_thread->options & TH_OPT_VMPRIV) && zone_supports_vm(z)) {
4772 		ze.ze_thread->options |= TH_OPT_VMPRIV;
4773 		ze.ze_clear_priv = true;
4774 	}
4775 
4776 	if (ze.ze_thread->options & TH_OPT_VMPRIV) {
4777 		/*
4778 		 * When the thread is VM privileged,
4779 		 * vm_page_grab() will call VM_PAGE_WAIT()
4780 		 * without our knowledge, so we unfortunately
4781 		 * must assume it will be called.
4782 		 *
4783 		 * In practice it's not a big deal because
4784 		 * Z_NOPAGEWAIT is not really used on zones
4785 		 * that VM privileged threads are going to expand.
4786 		 */
4787 		ze.ze_pg_wait = true;
4788 		ze.ze_vm_priv = true;
4789 	}
4790 
4791 	for (;;) {
4792 		if (!z->z_permanent && !zalloc_needs_refill(z, flags)) {
4793 			goto out;
4794 		}
4795 
4796 		if (z->z_expander == NULL) {
4797 			z->z_expander = &ze;
4798 			break;
4799 		}
4800 
4801 		if (ze.ze_vm_priv && !z->z_expander->ze_vm_priv) {
4802 			change_sleep_inheritor(&z->z_expander, ze.ze_thread);
4803 			ze.ze_next = z->z_expander;
4804 			z->z_expander = &ze;
4805 			break;
4806 		}
4807 
4808 		if ((flags & Z_NOPAGEWAIT) && z->z_expander->ze_pg_wait) {
4809 			goto out;
4810 		}
4811 
4812 		z->z_expanding_wait = true;
4813 		hw_lck_ticket_sleep_with_inheritor(&z->z_lock, &zone_locks_grp,
4814 		    LCK_SLEEP_DEFAULT, &z->z_expander, z->z_expander->ze_thread,
4815 		    TH_UNINT, TIMEOUT_WAIT_FOREVER);
4816 	}
4817 
4818 	do {
4819 		struct zone_page_metadata *meta = NULL;
4820 		uint32_t new_va = 0, cur_pages = 0, min_pages = 0, pages = 0;
4821 		vm_page_t page_list = NULL;
4822 		vm_offset_t addr = 0;
4823 		int waited = 0;
4824 
4825 		if ((flags & Z_NOFAIL) && zone_exhausted(z)) {
4826 			__ZONE_EXHAUSTED_AND_WAITING_HARD__(z);
4827 			continue; /* reevaluate if we really need it */
4828 		}
4829 
4830 		/*
4831 		 * While we hold the zone lock, look if there's VA we can:
4832 		 * - complete from partial pages,
4833 		 * - reuse from the sequester list.
4834 		 *
4835 		 * When the page is being populated we pretend we allocated
4836 		 * an extra element so that zone_gc() can't attempt to free
4837 		 * the chunk (as it could become empty while we wait for pages).
4838 		 */
4839 		if (zone_pva_is_null(z->z_pageq_va)) {
4840 			zone_allocate_va_locked(z, flags);
4841 		}
4842 
4843 		meta = zone_meta_queue_pop(z, &z->z_pageq_va);
4844 		addr = zone_meta_to_addr(meta);
4845 		if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
4846 			cur_pages = meta->zm_page_index;
4847 			meta -= cur_pages;
4848 			addr -= ptoa(cur_pages);
4849 			zone_meta_lock_in_partial(z, meta, cur_pages);
4850 		}
4851 		zone_unlock(z);
4852 
4853 		/*
4854 		 * And now allocate pages to populate our VA.
4855 		 */
4856 		min_pages = z->z_chunk_pages;
4857 #if !KASAN_CLASSIC
4858 		if (!z->z_percpu) {
4859 			min_pages = (uint32_t)atop(round_page(zone_elem_outer_offs(z) +
4860 			    zone_elem_outer_size(z)));
4861 		}
4862 #endif /* !KASAN_CLASSIC */
4863 
4864 		/*
4865 		 * Trigger jetsams via VM_PAGEOUT_GC_EVENT
4866 		 * if we're running out of zone memory
4867 		 */
4868 		if (__improbable(zone_map_nearing_exhaustion())) {
4869 			__ZONE_MAP_EXHAUSTED_AND_WAITING_FOR_GC__(z, min_pages);
4870 		}
4871 
4872 		ZONE_TRACE_VM_KERN_REQUEST_START(ptoa(z->z_chunk_pages - cur_pages));
4873 
4874 		while (pages < z->z_chunk_pages - cur_pages) {
4875 			vm_page_t m = vm_page_grab();
4876 
4877 			if (m) {
4878 				pages++;
4879 				m->vmp_snext = page_list;
4880 				page_list = m;
4881 				vm_page_zero_fill(m);
4882 				continue;
4883 			}
4884 
4885 			if (pages >= min_pages &&
4886 			    !zone_expand_wait_for_pages(waited)) {
4887 				break;
4888 			}
4889 
4890 			if ((flags & Z_NOPAGEWAIT) == 0) {
4891 				/*
4892 				 * The first time we're about to wait for pages,
4893 				 * mention that to waiters and wake them all.
4894 				 *
4895 				 * Set `ze_pg_wait` in our zone_expand context
4896 				 * so that waiters who care do not wait again.
4897 				 */
4898 				if (!ze.ze_pg_wait) {
4899 					zone_lock(z);
4900 					if (z->z_expanding_wait) {
4901 						z->z_expanding_wait = false;
4902 						wakeup_all_with_inheritor(&z->z_expander,
4903 						    THREAD_AWAKENED);
4904 					}
4905 					ze.ze_pg_wait = true;
4906 					zone_unlock(z);
4907 				}
4908 
4909 				waited++;
4910 				VM_PAGE_WAIT();
4911 				continue;
4912 			}
4913 
4914 			/*
4915 			 * Undo everything and bail out:
4916 			 *
4917 			 * - free pages
4918 			 * - undo the fake allocation if any
4919 			 * - put the VA back on the VA page queue.
4920 			 */
4921 			vm_page_free_list(page_list, FALSE);
4922 			ZONE_TRACE_VM_KERN_REQUEST_END(pages);
4923 
4924 			zone_lock(z);
4925 
4926 			zone_expand_async_schedule_if_allowed(z);
4927 
4928 			if (cur_pages) {
4929 				zone_meta_unlock_from_partial(z, meta, cur_pages);
4930 			}
4931 			if (meta) {
4932 				zone_meta_queue_push(z, &z->z_pageq_va,
4933 				    meta + cur_pages);
4934 			}
4935 			goto page_shortage;
4936 		}
4937 
4938 		vm_object_t object;
4939 		object = kernel_object_default;
4940 		vm_object_lock(object);
4941 
4942 		kernel_memory_populate_object_and_unlock(object,
4943 		    addr + ptoa(cur_pages), addr + ptoa(cur_pages), ptoa(pages), page_list,
4944 		    zone_kma_flags(z, zsflags, flags), VM_KERN_MEMORY_ZONE,
4945 		    zone_page_prot(zsflags), zone_mapping_type(z));
4946 
4947 		ZONE_TRACE_VM_KERN_REQUEST_END(pages);
4948 
4949 		zcram_and_lock(z, addr, new_va, cur_pages, cur_pages + pages, 0);
4950 
4951 		/*
4952 		 * permanent zones only try once,
4953 		 * the retry loop is in the caller
4954 		 */
4955 	} while (!z->z_permanent && zalloc_needs_refill(z, flags));
4956 
4957 page_shortage:
4958 	if (z->z_expander == &ze) {
4959 		z->z_expander = ze.ze_next;
4960 	} else {
4961 		assert(z->z_expander->ze_next == &ze);
4962 		z->z_expander->ze_next = NULL;
4963 	}
4964 	if (z->z_expanding_wait) {
4965 		z->z_expanding_wait = false;
4966 		wakeup_all_with_inheritor(&z->z_expander, THREAD_AWAKENED);
4967 	}
4968 out:
4969 	if (ze.ze_clear_priv) {
4970 		ze.ze_thread->options &= ~TH_OPT_VMPRIV;
4971 	}
4972 }
4973 
4974 static void
4975 zone_expand_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
4976 {
4977 	zone_foreach(z) {
4978 		if (z->no_callout) {
4979 			/* z_async_refilling will never be set */
4980 			continue;
4981 		}
4982 
4983 		if (!z->z_async_refilling) {
4984 			/*
4985 			 * avoid locking all zones, because the one(s)
4986 			 * we're looking for have been set _before_
4987 			 * thread_call_enter() was called, if we fail
4988 			 * to observe the bit, it means the thread-call
4989 			 * has been "dinged" again and we'll notice it then.
4990 			 */
4991 			continue;
4992 		}
4993 
4994 		zone_lock(z);
4995 		if (z->z_self && z->z_async_refilling) {
4996 			zone_expand_locked(z, Z_WAITOK);
4997 			/*
4998 			 * clearing _after_ we grow is important,
4999 			 * so that we avoid waking up the thread call
5000 			 * while we grow and cause to run a second time.
5001 			 */
5002 			z->z_async_refilling = false;
5003 		}
5004 		zone_unlock(z);
5005 	}
5006 }
5007 
5008 #endif /* !ZALLOC_TEST */
5009 #pragma mark zone jetsam integration
5010 #if !ZALLOC_TEST
5011 
5012 /*
5013  * We're being very conservative here and picking a value of 95%. We might need to lower this if
5014  * we find that we're not catching the problem and are still hitting zone map exhaustion panics.
5015  */
5016 #define ZONE_MAP_JETSAM_LIMIT_DEFAULT 95
5017 
5018 /*
5019  * Threshold above which largest zones should be included in the panic log
5020  */
5021 #define ZONE_MAP_EXHAUSTION_PRINT_PANIC 80
5022 
5023 /*
5024  * Trigger zone-map-exhaustion jetsams if the zone map is X% full,
5025  * where X=zone_map_jetsam_limit.
5026  *
5027  * Can be set via boot-arg "zone_map_jetsam_limit". Set to 95% by default.
5028  */
5029 TUNABLE_WRITEABLE(unsigned int, zone_map_jetsam_limit, "zone_map_jetsam_limit",
5030     ZONE_MAP_JETSAM_LIMIT_DEFAULT);
5031 
5032 kern_return_t
5033 zone_map_jetsam_set_limit(uint32_t value)
5034 {
5035 	if (value <= 0 || value > 100) {
5036 		return KERN_INVALID_VALUE;
5037 	}
5038 
5039 	zone_map_jetsam_limit = value;
5040 	os_atomic_store(&zone_pages_jetsam_threshold,
5041 	    zone_pages_wired_max * value / 100, relaxed);
5042 	return KERN_SUCCESS;
5043 }
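
/*
 * For example (hypothetical numbers): with zone_pages_wired_max == 100000
 * pages, zone_map_jetsam_set_limit(95) sets zone_pages_jetsam_threshold to
 * 95000 pages, so zone_map_nearing_exhaustion() starts returning true once
 * 95% of the zone map is wired.
 */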
5044 
5045 void
5046 get_zone_map_size(uint64_t *current_size, uint64_t *capacity)
5047 {
5048 	vm_offset_t phys_pages = os_atomic_load(&zone_pages_wired, relaxed);
5049 	*current_size = ptoa_64(phys_pages);
5050 	*capacity = ptoa_64(zone_pages_wired_max);
5051 }
5052 
5053 void
5054 get_largest_zone_info(char *zone_name, size_t zone_name_len, uint64_t *zone_size)
5055 {
5056 	zone_t largest_zone = zone_find_largest(zone_size);
5057 
5058 	/*
5059 	 * Append kalloc heap name to zone name (if zone is used by kalloc)
5060 	 */
5061 	snprintf(zone_name, zone_name_len, "%s%s",
5062 	    zone_heap_name(largest_zone), largest_zone->z_name);
5063 }
5064 
5065 static bool
5066 zone_map_nearing_threshold(unsigned int threshold)
5067 {
5068 	uint64_t phys_pages = os_atomic_load(&zone_pages_wired, relaxed);
5069 	return phys_pages * 100 > zone_pages_wired_max * threshold;
5070 }
5071 
5072 bool
5073 zone_map_nearing_exhaustion(void)
5074 {
5075 	vm_size_t pages = os_atomic_load(&zone_pages_wired, relaxed);
5076 
5077 	return pages >= os_atomic_load(&zone_pages_jetsam_threshold, relaxed);
5078 }
5079 
5080 
5081 #define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO 98
5082 
5083 /*
5084  * Tries to kill a single process if it can attribute one to the largest zone. If not, wakes up the memorystatus thread
5085  * to walk through the jetsam priority bands and kill processes.
5086  */
5087 static zone_t
5088 kill_process_in_largest_zone(void)
5089 {
5090 	pid_t pid = -1;
5091 	uint64_t zone_size = 0;
5092 	zone_t largest_zone = zone_find_largest(&zone_size);
5093 
5094 	printf("zone_map_exhaustion: Zone mapped %lld of %lld, used %lld, capacity %lld [jetsam limit %d%%]\n",
5095 	    ptoa_64(os_atomic_load(&zone_pages_wired, relaxed)),
5096 	    ptoa_64(zone_pages_wired_max),
5097 	    (uint64_t)zone_submaps_approx_size(),
5098 	    (uint64_t)mach_vm_range_size(&zone_info.zi_map_range),
5099 	    zone_map_jetsam_limit);
5100 	printf("zone_map_exhaustion: Largest zone %s%s, size %lu\n", zone_heap_name(largest_zone),
5101 	    largest_zone->z_name, (uintptr_t)zone_size);
5102 
5103 	/*
5104 	 * Make sure we don't call this function from a userspace context,
5105 	 * or we could end up trying to synchronously kill the process
5106 	 * whose context we're in, causing the system to hang.
5107 	 */
5108 	assert(current_task() == kernel_task);
5109 
5110 	/*
5111 	 * If vm_object_zone is the largest, check to see if the number of
5112 	 * elements in vm_map_entry_zone is comparable.
5113 	 *
5114 	 * If so, consider vm_map_entry_zone as the largest. This lets us target
5115 	 * a specific process to jetsam to quickly recover from the zone map
5116 	 * bloat.
5117 	 */
5118 	if (largest_zone == vm_object_zone) {
5119 		unsigned int vm_object_zone_count = zone_count_allocated(vm_object_zone);
5120 		unsigned int vm_map_entry_zone_count = zone_count_allocated(vm_map_entry_zone);
5121 		/* Is the VM map entries zone count >= 98% of the VM objects zone count? */
5122 		if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
5123 			largest_zone = vm_map_entry_zone;
5124 			printf("zone_map_exhaustion: Picking VM map entries as the zone to target, size %lu\n",
5125 			    (uintptr_t)zone_size_wired(largest_zone));
5126 		}
5127 	}
5128 
5129 	/* TODO: Extend this to check for the largest process in other zones as well. */
5130 	if (largest_zone == vm_map_entry_zone) {
5131 		pid = find_largest_process_vm_map_entries();
5132 	} else {
5133 		printf("zone_map_exhaustion: Nothing to do for the largest zone [%s%s]. "
5134 		    "Waking up memorystatus thread.\n", zone_heap_name(largest_zone),
5135 		    largest_zone->z_name);
5136 	}
5137 	if (!memorystatus_kill_on_zone_map_exhaustion(pid)) {
5138 		printf("zone_map_exhaustion: Call to memorystatus failed, victim pid: %d\n", pid);
5139 	}
5140 
5141 	return largest_zone;
5142 }
5143 
5144 #endif /* !ZALLOC_TEST */
5145 #pragma mark probabilistic gzalloc
5146 #if !ZALLOC_TEST
5147 #if CONFIG_PROB_GZALLOC
5148 
5149 extern uint32_t random(void);
5150 struct pgz_backtrace {
5151 	uint32_t  pgz_depth;
5152 	int32_t   pgz_bt[MAX_ZTRACE_DEPTH];
5153 };
5154 
5155 static int32_t  PERCPU_DATA(pgz_sample_counter);
5156 static SECURITY_READ_ONLY_LATE(struct pgz_backtrace *) pgz_backtraces;
5157 static uint32_t pgz_uses;       /* number of zones using PGZ */
5158 static int32_t  pgz_slot_avail;
5159 #if OS_ATOMIC_HAS_LLSC
5160 struct zone_page_metadata *pgz_slot_head;
5161 #else
5162 static struct pgz_slot_head {
5163 	uint32_t psh_count;
5164 	uint32_t psh_slot;
5165 } pgz_slot_head;
5166 #endif
5167 struct zone_page_metadata *pgz_slot_tail;
5168 static SECURITY_READ_ONLY_LATE(vm_map_t) pgz_submap;
5169 
5170 static struct zone_page_metadata *
5171 pgz_meta(uint32_t index)
5172 {
5173 	return &zone_info.zi_pgz_meta[2 * index + 1];
5174 }
5175 
5176 static struct pgz_backtrace *
5177 pgz_bt(uint32_t slot, bool free)
5178 {
5179 	return &pgz_backtraces[2 * slot + free];
5180 }
5181 
5182 static void
5183 pgz_backtrace(struct pgz_backtrace *bt, void *fp)
5184 {
5185 	struct backtrace_control ctl = {
5186 		.btc_frame_addr = (uintptr_t)fp,
5187 	};
5188 
5189 	bt->pgz_depth = (uint32_t)backtrace_packed(BTP_KERN_OFFSET_32,
5190 	    (uint8_t *)bt->pgz_bt, sizeof(bt->pgz_bt), &ctl, NULL) / 4;
5191 }
5192 
5193 static uint32_t
5194 pgz_slot(vm_offset_t addr)
5195 {
5196 	return (uint32_t)((addr - zone_info.zi_pgz_range.min_address) >> (PAGE_SHIFT + 1));
5197 }
5198 
5199 static vm_offset_t
5200 pgz_addr(uint32_t slot)
5201 {
5202 	return zone_info.zi_pgz_range.min_address + ptoa(2 * slot + 1);
5203 }
5204 
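/*
 * Layout note (derived from pgz_addr()/pgz_meta() above and pgz_init()
 * below): the PGZ range holds 2 * pgz_slots + 1 pages that alternate
 * guard and slot pages.  Slot `i` is backed by page (2 * i + 1) and by
 * zi_pgz_meta[2 * i + 1]; every even-indexed metadata entry is marked
 * ZM_PGZ_GUARD and its page is never entered in the pmap, so each
 * protected element sits between two unmapped guard pages.
 */
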
5205 static bool
5206 pgz_sample(vm_offset_t addr, vm_size_t esize)
5207 {
5208 	int32_t *counterp, cnt;
5209 
5210 	if (zone_addr_size_crosses_page(addr, esize)) {
5211 		return false;
5212 	}
5213 
5214 	/*
5215 	 * Note: accessing pgz_sample_counter is racy, but that is
5216 	 *       acceptable given that this is not
5217 	 *       a security load-bearing feature.
5218 	 */
5219 
5220 	counterp = PERCPU_GET(pgz_sample_counter);
5221 	cnt = *counterp;
5222 	if (__probable(cnt > 0)) {
5223 		*counterp = cnt - 1;
5224 		return false;
5225 	}
5226 
5227 	if (pgz_slot_avail <= 0) {
5228 		return false;
5229 	}
5230 
5231 	/*
5232 	 * zalloc_random_uniform() might block, so when preemption is disabled,
5233 	 * set the counter to `-1` which will cause the next allocation
5234 	 * that can block to generate a new random value.
5235 	 *
5236 	 * No allocation on this CPU will sample until then.
5237 	 */
5238 	if (get_preemption_level()) {
5239 		*counterp = -1;
5240 	} else {
5241 		*counterp = zalloc_random_uniform32(0, 2 * pgz_sample_rate);
5242 	}
5243 
5244 	return cnt == 0;
5245 }
5246 
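/*
 * Slot recycling note: free PGZ slots form a singly linked queue
 * threaded through zm_pgz_slot_next.  pgz_slot_alloc() pops from
 * pgz_slot_head and pgz_slot_free() appends at pgz_slot_tail, while
 * pgz_slot_avail is kept `pgz_quarantine` short of the total number
 * of slots, so a freed slot lingers (unmapped) in the queue for a
 * while before it can be handed out again, which is what lets PGZ
 * catch use-after-free accesses.
 */
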
5247 static inline bool
5248 pgz_slot_alloc(uint32_t *slot)
5249 {
5250 	struct zone_page_metadata *m;
5251 	uint32_t tries = 100;
5252 
5253 	disable_preemption();
5254 
5255 #if OS_ATOMIC_USE_LLSC
5256 	int32_t ov, nv;
5257 	os_atomic_rmw_loop(&pgz_slot_avail, ov, nv, relaxed, {
5258 		if (__improbable(ov <= 0)) {
5259 		        os_atomic_rmw_loop_give_up({
5260 				enable_preemption();
5261 				return false;
5262 			});
5263 		}
5264 		nv = ov - 1;
5265 	});
5266 #else
5267 	if (__improbable(os_atomic_dec_orig(&pgz_slot_avail, relaxed) <= 0)) {
5268 		os_atomic_inc(&pgz_slot_avail, relaxed);
5269 		enable_preemption();
5270 		return false;
5271 	}
5272 #endif
5273 
5274 again:
5275 	if (__improbable(tries-- == 0)) {
5276 		/*
5277 		 * Too much contention,
5278 		 * extremely unlikely but do not stay stuck.
5279 		 */
5280 		os_atomic_inc(&pgz_slot_avail, relaxed);
5281 		enable_preemption();
5282 		return false;
5283 	}
5284 
5285 #if OS_ATOMIC_HAS_LLSC
5286 	uint32_t castries = 20;
5287 	do {
5288 		if (__improbable(castries-- == 0)) {
5289 			/*
5290 			 * rdar://115922110 On devices with very many cores,
5291 			 * this can fail for a very long time.
5292 			 */
5293 			goto again;
5294 		}
5295 
5296 		m = os_atomic_load_exclusive(&pgz_slot_head, dependency);
5297 		if (__improbable(m->zm_pgz_slot_next == NULL)) {
5298 			/*
5299 			 * Either we are waiting for an enqueuer (unlikely)
5300 			 * or we are competing with another core and
5301 			 * are looking at a popped element.
5302 			 */
5303 			os_atomic_clear_exclusive();
5304 			goto again;
5305 		}
5306 	} while (!os_atomic_store_exclusive(&pgz_slot_head,
5307 	    m->zm_pgz_slot_next, relaxed));
5308 #else
5309 	struct zone_page_metadata *base = zone_info.zi_pgz_meta;
5310 	struct pgz_slot_head ov, nv;
5311 	os_atomic_rmw_loop(&pgz_slot_head, ov, nv, dependency, {
5312 		m = &base[ov.psh_slot * 2];
5313 		if (__improbable(m->zm_pgz_slot_next == NULL)) {
5314 		        /*
5315 		         * Either we are waiting for an enqueuer (unlikely)
5316 		         * or we are competing with another core and
5317 		         * are looking at a popped element.
5318 		         */
5319 		        os_atomic_rmw_loop_give_up(goto again);
5320 		}
5321 		nv.psh_count = ov.psh_count + 1;
5322 		nv.psh_slot  = (uint32_t)((m->zm_pgz_slot_next - base) / 2);
5323 	});
5324 #endif
5325 
5326 	enable_preemption();
5327 
5328 	m->zm_pgz_slot_next = NULL;
5329 	*slot = (uint32_t)((m - zone_info.zi_pgz_meta) / 2);
5330 	return true;
5331 }
5332 
5333 static inline bool
5334 pgz_slot_free(uint32_t slot)
5335 {
5336 	struct zone_page_metadata *m = &zone_info.zi_pgz_meta[2 * slot];
5337 	struct zone_page_metadata *t;
5338 
5339 	disable_preemption();
5340 	t = os_atomic_xchg(&pgz_slot_tail, m, relaxed);
5341 	os_atomic_store(&t->zm_pgz_slot_next, m, release);
5342 	os_atomic_inc(&pgz_slot_avail, relaxed);
5343 	enable_preemption();
5344 
5345 	return true;
5346 }
5347 
5348 /*!
5349  * @function pgz_protect()
5350  *
5351  * @brief
5352  * Try to protect an allocation with PGZ.
5353  *
5354  * @param zone          The zone the allocation was made against.
5355  * @param addr          An allocated element address to protect.
5356  * @param fp            The caller frame pointer (for the backtrace).
5357  * @returns             The new address for the element, or @c addr.
5358  */
5359 __attribute__((noinline))
5360 static vm_offset_t
5361 pgz_protect(zone_t zone, vm_offset_t addr, void *fp)
5362 {
5363 	kern_return_t kr;
5364 	uint32_t slot;
5365 
5366 	if (!pgz_slot_alloc(&slot)) {
5367 		return addr;
5368 	}
5369 
5370 	/*
5371 	 * Try to double-map the page (may fail if Z_NOWAIT).
5372 	 * We will always find a PA because pgz_init() pre-expanded the pmap.
5373 	 */
5374 	pmap_paddr_t pa = kvtophys(trunc_page(addr));
5375 	vm_offset_t  new_addr = pgz_addr(slot);
5376 	kr = pmap_enter_options_addr(kernel_pmap, new_addr, pa,
5377 	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE,
5378 	    get_preemption_level() ? (PMAP_OPTIONS_NOWAIT | PMAP_OPTIONS_NOPREEMPT) : 0,
5379 	    NULL, PMAP_MAPPING_TYPE_INFER);
5380 
5381 	if (__improbable(kr != KERN_SUCCESS)) {
5382 		pgz_slot_free(slot);
5383 		return addr;
5384 	}
5385 
5386 	struct zone_page_metadata tmp = {
5387 		.zm_chunk_len = ZM_PGZ_ALLOCATED,
5388 		.zm_index     = zone_index(zone),
5389 	};
5390 	struct zone_page_metadata *meta = pgz_meta(slot);
5391 
5392 	os_atomic_store(&meta->zm_bits, tmp.zm_bits, relaxed);
5393 	os_atomic_store(&meta->zm_pgz_orig_addr, addr, relaxed);
5394 	pgz_backtrace(pgz_bt(slot, false), fp);
5395 
5396 	return new_addr + (addr & PAGE_MASK);
5397 }
5398 
5399 /*!
5400  * @function pgz_unprotect()
5401  *
5402  * @brief
5403  * Releases a PGZ slot and returns the original address of a freed element.
5404  *
5405  * @param addr          A PGZ protected element address.
5406  * @param fp            The caller frame pointer (for the backtrace).
5407  * @returns             The non protected address for the element
5408  *                      that was passed to @c pgz_protect().
5409  */
5410 __attribute__((noinline))
5411 static vm_offset_t
5412 pgz_unprotect(vm_offset_t addr, void *fp)
5413 {
5414 	struct zone_page_metadata *meta;
5415 	struct zone_page_metadata tmp;
5416 	uint32_t slot;
5417 
5418 	slot = pgz_slot(addr);
5419 	meta = zone_meta_from_addr(addr);
5420 	tmp  = *meta;
5421 	if (tmp.zm_chunk_len != ZM_PGZ_ALLOCATED) {
5422 		goto double_free;
5423 	}
5424 
5425 	pmap_remove_options(kernel_pmap, vm_memtag_canonicalize_address(trunc_page(addr)),
5426 	    vm_memtag_canonicalize_address(trunc_page(addr) + PAGE_SIZE),
5427 	    PMAP_OPTIONS_REMOVE | PMAP_OPTIONS_NOPREEMPT);
5428 
5429 	pgz_backtrace(pgz_bt(slot, true), fp);
5430 
5431 	tmp.zm_chunk_len = ZM_PGZ_FREE;
5432 	tmp.zm_bits = os_atomic_xchg(&meta->zm_bits, tmp.zm_bits, relaxed);
5433 	if (tmp.zm_chunk_len != ZM_PGZ_ALLOCATED) {
5434 		goto double_free;
5435 	}
5436 
5437 	pgz_slot_free(slot);
5438 	return tmp.zm_pgz_orig_addr;
5439 
5440 double_free:
5441 	panic_fault_address = addr;
5442 	meta->zm_chunk_len = ZM_PGZ_DOUBLE_FREE;
5443 	panic("probabilistic gzalloc double free: %p", (void *)addr);
5444 }
5445 
5446 bool
5447 pgz_owned(mach_vm_address_t addr)
5448 {
5449 	return mach_vm_range_contains(&zone_info.zi_pgz_range, vm_memtag_canonicalize_address(addr));
5450 }
5451 
5452 
5453 __attribute__((always_inline))
5454 vm_offset_t
5455 __pgz_decode(mach_vm_address_t addr, mach_vm_size_t size)
5456 {
5457 	struct zone_page_metadata *meta;
5458 
5459 	if (__probable(!pgz_owned(addr))) {
5460 		return (vm_offset_t)addr;
5461 	}
5462 
5463 	if (zone_addr_size_crosses_page(addr, size)) {
5464 		panic("invalid size for PGZ protected address %p:%p",
5465 		    (void *)addr, (void *)(addr + size));
5466 	}
5467 
5468 	meta = zone_meta_from_addr((vm_offset_t)addr);
5469 	if (meta->zm_chunk_len != ZM_PGZ_ALLOCATED) {
5470 		panic_fault_address = (vm_offset_t)addr;
5471 		panic("probabilistic gzalloc use-after-free: %p", (void *)addr);
5472 	}
5473 
5474 	return trunc_page(meta->zm_pgz_orig_addr) + (addr & PAGE_MASK);
5475 }
5476 
5477 __attribute__((always_inline))
5478 vm_offset_t
5479 __pgz_decode_allow_invalid(vm_offset_t addr, zone_id_t zid)
5480 {
5481 	struct zone_page_metadata *meta;
5482 	struct zone_page_metadata tmp;
5483 
5484 	if (__probable(!pgz_owned(addr))) {
5485 		return addr;
5486 	}
5487 
5488 	meta = zone_meta_from_addr(addr);
5489 	tmp.zm_bits = os_atomic_load(&meta->zm_bits, relaxed);
5490 
5491 	addr = trunc_page(meta->zm_pgz_orig_addr) + (addr & PAGE_MASK);
5492 
5493 	if (tmp.zm_chunk_len != ZM_PGZ_ALLOCATED) {
5494 		return 0;
5495 	}
5496 
5497 	if (zid != ZONE_ID_ANY && tmp.zm_index != zid) {
5498 		return 0;
5499 	}
5500 
5501 	return addr;
5502 }
5503 
5504 static void
5505 pgz_zone_init(zone_t z)
5506 {
5507 	char zn[MAX_ZONE_NAME];
5508 	char zv[MAX_ZONE_NAME];
5509 	char key[30];
5510 
5511 	if (zone_elem_inner_size(z) > PAGE_SIZE) {
5512 		return;
5513 	}
5514 
5515 	if (pgz_all) {
5516 		os_atomic_inc(&pgz_uses, relaxed);
5517 		z->z_pgz_tracked = true;
5518 		return;
5519 	}
5520 
5521 	snprintf(zn, sizeof(zn), "%s%s", zone_heap_name(z), zone_name(z));
5522 
5523 	for (int i = 1;; i++) {
5524 		snprintf(key, sizeof(key), "pgz%d", i);
5525 		if (!PE_parse_boot_argn(key, zv, sizeof(zv))) {
5526 			break;
5527 		}
5528 		if (track_this_zone(zn, zv) || track_kalloc_zones(z, zv)) {
5529 			os_atomic_inc(&pgz_uses, relaxed);
5530 			z->z_pgz_tracked = true;
5531 			break;
5532 		}
5533 	}
5534 }
5535 
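/*
 * Illustrative example: booting with `pgz1=<zone-name> pgz2=<other-zone>`
 * (zone names elided on purpose) opts the matching zones into PGZ
 * tracking; pgz_zone_init() above probes the numbered "pgzN" boot-args
 * in order and matches them with track_this_zone() / track_kalloc_zones().
 */
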
5536 __startup_func
5537 static vm_size_t
5538 pgz_get_size(void)
5539 {
5540 	if (pgz_slots == UINT32_MAX) {
5541 		/*
5542 		 * Scale with RAM size: one slot per 4M, i.e. ~256 slots per GB.
5543 		 */
5544 		pgz_slots = (uint32_t)(sane_size >> 22);
5545 	}
5546 
5547 	/*
5548 	 * Make sure that the slot allocation scheme works;
5549 	 * see pgz_slot_alloc() / pgz_slot_free().
5550 	 */
5551 	if (pgz_slots < zpercpu_count() * 4) {
5552 		pgz_slots = zpercpu_count() * 4;
5553 	}
5554 	if (pgz_slots >= UINT16_MAX) {
5555 		pgz_slots = UINT16_MAX - 1;
5556 	}
5557 
5558 	/*
5559 	 * Quarantine is 33% of slots by default, no more than 90%.
5560 	 */
5561 	if (pgz_quarantine == 0) {
5562 		pgz_quarantine = pgz_slots / 3;
5563 	}
5564 	if (pgz_quarantine > pgz_slots * 9 / 10) {
5565 		pgz_quarantine = pgz_slots * 9 / 10;
5566 	}
5567 	pgz_slot_avail = pgz_slots - pgz_quarantine;
5568 
5569 	return ptoa(2 * pgz_slots + 1);
5570 }
5571 
5572 __startup_func
5573 static void
5574 pgz_init(void)
5575 {
5576 	if (!pgz_uses) {
5577 		return;
5578 	}
5579 
5580 	if (pgz_sample_rate == 0) {
5581 		/*
5582 		 * If no rate was provided, pick a random one that scales
5583 		 * with the number of protected zones.
5584 		 *
5585 		 * Average two uniform draws (a triangular distribution)
5586 		 * to avoid having too many really fast sample rates.
5587 		 */
5588 		uint32_t factor = MIN(pgz_uses, 10);
5589 		uint32_t max_rate = 1000 * factor;
5590 		uint32_t min_rate =  100 * factor;
5591 
5592 		pgz_sample_rate = (zalloc_random_uniform32(min_rate, max_rate) +
5593 		    zalloc_random_uniform32(min_rate, max_rate)) / 2;
5594 	}
5595 
5596 	struct mach_vm_range *r = &zone_info.zi_pgz_range;
5597 	zone_info.zi_pgz_meta = zone_meta_from_addr(r->min_address);
5598 	zone_meta_populate(r->min_address, mach_vm_range_size(r));
5599 
5600 	for (size_t i = 0; i < 2 * pgz_slots + 1; i += 2) {
5601 		zone_info.zi_pgz_meta[i].zm_chunk_len = ZM_PGZ_GUARD;
5602 	}
5603 
5604 	for (size_t i = 1; i < pgz_slots; i++) {
5605 		zone_info.zi_pgz_meta[2 * i - 1].zm_pgz_slot_next =
5606 		    &zone_info.zi_pgz_meta[2 * i + 1];
5607 	}
5608 #if OS_ATOMIC_HAS_LLSC
5609 	pgz_slot_head = &zone_info.zi_pgz_meta[1];
5610 #endif
5611 	pgz_slot_tail = &zone_info.zi_pgz_meta[2 * pgz_slots - 1];
5612 
5613 	pgz_backtraces = zalloc_permanent(sizeof(struct pgz_backtrace) *
5614 	    2 * pgz_slots, ZALIGN_PTR);
5615 
5616 	/*
5617 	 * Expand the pmap so that pmap_enter_options_addr()
5618 	 * in pgz_protect() never needs to call pmap_expand().
5619 	 */
5620 	for (uint32_t slot = 0; slot < pgz_slots; slot++) {
5621 		(void)pmap_enter_options_addr(kernel_pmap, pgz_addr(slot), 0,
5622 		    VM_PROT_NONE, VM_PROT_NONE, 0, FALSE,
5623 		    PMAP_OPTIONS_NOENTER, NULL, PMAP_MAPPING_TYPE_INFER);
5624 	}
5625 
5626 	/* do this last as this will enable pgz */
5627 	percpu_foreach(counter, pgz_sample_counter) {
5628 		*counter = zalloc_random_uniform32(0, 2 * pgz_sample_rate);
5629 	}
5630 }
5631 STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, pgz_init);
5632 
5633 static void
5634 panic_display_pgz_bt(bool has_syms, uint32_t slot, bool free)
5635 {
5636 	struct pgz_backtrace *bt = pgz_bt(slot, free);
5637 	const char *what = free ? "Free" : "Allocation";
5638 	uintptr_t buf[MAX_ZTRACE_DEPTH];
5639 
5640 	if (!ml_validate_nofault((vm_offset_t)bt, sizeof(*bt))) {
5641 		paniclog_append_noflush("  Can't decode %s Backtrace\n", what);
5642 		return;
5643 	}
5644 
5645 	backtrace_unpack(BTP_KERN_OFFSET_32, buf, MAX_ZTRACE_DEPTH,
5646 	    (uint8_t *)bt->pgz_bt, 4 * bt->pgz_depth);
5647 
5648 	paniclog_append_noflush("  %s Backtrace:\n", what);
5649 	for (uint32_t i = 0; i < bt->pgz_depth && i < MAX_ZTRACE_DEPTH; i++) {
5650 		if (has_syms) {
5651 			paniclog_append_noflush("    %p ", (void *)buf[i]);
5652 			panic_print_symbol_name(buf[i]);
5653 			paniclog_append_noflush("\n");
5654 		} else {
5655 			paniclog_append_noflush("    %p\n", (void *)buf[i]);
5656 		}
5657 	}
5658 	kmod_panic_dump((vm_offset_t *)buf, bt->pgz_depth);
5659 }
5660 
5661 static void
5662 panic_display_pgz_uaf_info(bool has_syms, vm_offset_t addr)
5663 {
5664 	struct zone_page_metadata *meta;
5665 	vm_offset_t elem, esize;
5666 	const char *type;
5667 	const char *prob;
5668 	uint32_t slot;
5669 	zone_t z;
5670 
5671 	slot = pgz_slot(addr);
5672 	meta = pgz_meta(slot);
5673 	elem = pgz_addr(slot) + (meta->zm_pgz_orig_addr & PAGE_MASK);
5674 
5675 	paniclog_append_noflush("Probabilistic GZAlloc Report:\n");
5676 
5677 	if (ml_validate_nofault((vm_offset_t)meta, sizeof(*meta)) &&
5678 	    meta->zm_index &&
5679 	    meta->zm_index < os_atomic_load(&num_zones, relaxed)) {
5680 		z = &zone_array[meta->zm_index];
5681 	} else {
5682 		paniclog_append_noflush("  Zone    : <unknown>\n");
5683 		paniclog_append_noflush("  Address : %p\n", (void *)addr);
5684 		paniclog_append_noflush("\n");
5685 		return;
5686 	}
5687 
5688 	esize = zone_elem_inner_size(z);
5689 	paniclog_append_noflush("  Zone    : %s%s\n",
5690 	    zone_heap_name(z), zone_name(z));
5691 	paniclog_append_noflush("  Address : %p\n", (void *)addr);
5692 	paniclog_append_noflush("  Element : [%p, %p) of size %d\n",
5693 	    (void *)elem, (void *)(elem + esize), (uint32_t)esize);
5694 
5695 	if (addr < elem) {
5696 		type = "out-of-bounds(underflow) + use-after-free";
5697 		prob = "low";
5698 	} else if (meta->zm_chunk_len == ZM_PGZ_DOUBLE_FREE) {
5699 		type = "double-free";
5700 		prob = "high";
5701 	} else if (addr < elem + esize) {
5702 		type = "use-after-free";
5703 		prob = "high";
5704 	} else if (meta->zm_chunk_len != ZM_PGZ_ALLOCATED) {
5705 		type = "out-of-bounds + use-after-free";
5706 		prob = "low";
5707 	} else {
5708 		type = "out-of-bounds";
5709 		prob = "high";
5710 	}
5711 	paniclog_append_noflush("  Kind    : %s (%s confidence)\n",
5712 	    type, prob);
5713 	if (addr < elem) {
5714 		paniclog_append_noflush("  Access  : %d byte(s) before\n",
5715 		    (uint32_t)(elem - addr) + 1);
5716 	} else if (addr < elem + esize) {
5717 		paniclog_append_noflush("  Access  : %d byte(s) inside\n",
5718 		    (uint32_t)(addr - elem) + 1);
5719 	} else {
5720 		paniclog_append_noflush("  Access  : %d byte(s) past\n",
5721 		    (uint32_t)(addr - (elem + esize)) + 1);
5722 	}
5723 
5724 	panic_display_pgz_bt(has_syms, slot, false);
5725 	if (meta->zm_chunk_len != ZM_PGZ_ALLOCATED) {
5726 		panic_display_pgz_bt(has_syms, slot, true);
5727 	}
5728 
5729 	paniclog_append_noflush("\n");
5730 }
5731 
5732 #endif /* CONFIG_PROB_GZALLOC */
5733 #endif /* !ZALLOC_TEST */
5734 #pragma mark zfree
5735 #if !ZALLOC_TEST
5736 
5737 /*!
5738  * @defgroup zfree
5739  * @{
5740  *
5741  * @brief
5742  * The codepath for zone frees.
5743  *
5744  * @discussion
5745  * There are 4 major ways to free memory that ends up back in the zone allocator:
5746  * - @c zfree()
5747  * - @c zfree_percpu()
5748  * - @c kfree*()
5749  * - @c zfree_permanent()
5750  *
5751  * While permanent zones have their own allocation scheme, all other codepaths
5752  * will eventually go through the @c zfree_ext() choking point.
5753  */
5754 
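/*
 * Illustrative sketch (hypothetical zone and type, not part of this
 * file): a typical allocation/free pairing whose free side funnels
 * through zfree_ext() below.
 *
 *     ZONE_DEFINE_TYPE(widget_zone, "example.widgets", struct widget, ZC_NONE);
 *
 *     struct widget *w = zalloc_flags(widget_zone, Z_WAITOK | Z_ZERO);
 *     if (w != NULL) {
 *             ...
 *             zfree(widget_zone, w);
 *     }
 */
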
5755 __header_always_inline void
5756 zfree_drop(zone_t zone, vm_offset_t addr)
5757 {
5758 	vm_offset_t esize = zone_elem_outer_size(zone);
5759 	struct zone_page_metadata *meta;
5760 	vm_offset_t eidx;
5761 
5762 	meta = zone_element_resolve(zone, addr, &eidx);
5763 
5764 	if (!zone_meta_mark_free(meta, eidx)) {
5765 		zone_meta_double_free_panic(zone, addr, __func__);
5766 	}
5767 
5768 	vm_offset_t old_size = meta->zm_alloc_size;
5769 	vm_offset_t max_size = ptoa(meta->zm_chunk_len) + ZM_ALLOC_SIZE_LOCK;
5770 	vm_offset_t new_size = zone_meta_alloc_size_sub(zone, meta, esize);
5771 
5772 	if (new_size == 0) {
5773 		/* whether the page was on the intermediate or all_used queue, move it to free */
5774 		zone_meta_requeue(zone, &zone->z_pageq_empty, meta);
5775 		zone->z_wired_empty += meta->zm_chunk_len;
5776 	} else if (old_size + esize > max_size) {
5777 		/* first free element on page, move from all_used */
5778 		zone_meta_requeue(zone, &zone->z_pageq_partial, meta);
5779 	}
5780 
5781 	if (__improbable(zone->z_exhausted_wait)) {
5782 		zone_wakeup_exhausted_waiters(zone);
5783 	}
5784 }
5785 
5786 __attribute__((noinline))
5787 static void
5788 zfree_item(zone_t zone, vm_offset_t addr)
5789 {
5790 	/* transfer preemption count to lock */
5791 	zone_lock_nopreempt_check_contention(zone);
5792 
5793 	zfree_drop(zone, addr);
5794 	zone->z_elems_free += 1;
5795 
5796 	zone_unlock(zone);
5797 }
5798 
5799 static void
5800 zfree_cached_depot_recirculate(
5801 	zone_t                  zone,
5802 	uint32_t                depot_max,
5803 	zone_cache_t            cache)
5804 {
5805 	smr_t smr = zone_cache_smr(cache);
5806 	smr_seq_t seq;
5807 	uint32_t n;
5808 
5809 	zone_recirc_lock_nopreempt_check_contention(zone);
5810 
5811 	n = cache->zc_depot.zd_full;
5812 	if (n >= depot_max) {
5813 		/*
5814 		 * If SMR is in use, rotate the entire chunk of magazines.
5815 		 *
5816 		 * If the head of the recirculation layer is ready to be
5817 		 * reused, pull some magazines back to refill a little.
5818 		 */
5819 		seq = zone_depot_move_full(&zone->z_recirc,
5820 		    &cache->zc_depot, smr ? n : n - depot_max / 2, NULL);
5821 
5822 		if (smr) {
5823 			smr_deferred_advance_commit(smr, seq);
5824 			if (depot_max > 1 && zone_depot_poll(&zone->z_recirc, smr)) {
5825 				zone_depot_move_full(&cache->zc_depot,
5826 				    &zone->z_recirc, depot_max / 2, NULL);
5827 			}
5828 		}
5829 	}
5830 
5831 	n = depot_max - cache->zc_depot.zd_full;
5832 	if (n > zone->z_recirc.zd_empty) {
5833 		n = zone->z_recirc.zd_empty;
5834 	}
5835 	if (n) {
5836 		zone_depot_move_empty(&cache->zc_depot, &zone->z_recirc,
5837 		    n, zone);
5838 	}
5839 
5840 	zone_recirc_unlock_nopreempt(zone);
5841 }
5842 
5843 static zone_cache_t
5844 zfree_cached_recirculate(zone_t zone, zone_cache_t cache)
5845 {
5846 	zone_magazine_t mag = NULL, tmp = NULL;
5847 	smr_t smr = zone_cache_smr(cache);
5848 	bool wakeup_exhausted = false;
5849 
5850 	if (zone->z_recirc.zd_empty == 0) {
5851 		mag = zone_magazine_alloc(Z_NOWAIT);
5852 	}
5853 
5854 	zone_recirc_lock_nopreempt_check_contention(zone);
5855 
5856 	if (mag == NULL && zone->z_recirc.zd_empty) {
5857 		mag = zone_depot_pop_head_empty(&zone->z_recirc, zone);
5858 		__builtin_assume(mag);
5859 	}
5860 	if (mag) {
5861 		tmp = zone_magazine_replace(cache, mag, true);
5862 		if (smr) {
5863 			smr_deferred_advance_commit(smr, tmp->zm_seq);
5864 		}
5865 		if (zone_security_array[zone_index(zone)].z_lifo) {
5866 			zone_depot_insert_head_full(&zone->z_recirc, tmp);
5867 		} else {
5868 			zone_depot_insert_tail_full(&zone->z_recirc, tmp);
5869 		}
5870 
5871 		wakeup_exhausted = zone->z_exhausted_wait;
5872 	}
5873 
5874 	zone_recirc_unlock_nopreempt(zone);
5875 
5876 	if (__improbable(wakeup_exhausted)) {
5877 		zone_lock_nopreempt(zone);
5878 		if (zone->z_exhausted_wait) {
5879 			zone_wakeup_exhausted_waiters(zone);
5880 		}
5881 		zone_unlock_nopreempt(zone);
5882 	}
5883 
5884 	return mag ? cache : NULL;
5885 }
5886 
5887 __attribute__((noinline))
5888 static zone_cache_t
5889 zfree_cached_trim(zone_t zone, zone_cache_t cache)
5890 {
5891 	zone_magazine_t mag = NULL, tmp = NULL;
5892 	uint32_t depot_max;
5893 
5894 	depot_max = os_atomic_load(&zone->z_depot_size, relaxed);
5895 	if (depot_max) {
5896 		zone_depot_lock_nopreempt(cache);
5897 
5898 		if (cache->zc_depot.zd_empty == 0) {
5899 			zfree_cached_depot_recirculate(zone, depot_max, cache);
5900 		}
5901 
5902 		if (__probable(cache->zc_depot.zd_empty)) {
5903 			mag = zone_depot_pop_head_empty(&cache->zc_depot, NULL);
5904 			__builtin_assume(mag);
5905 		} else {
5906 			mag = zone_magazine_alloc(Z_NOWAIT);
5907 		}
5908 		if (mag) {
5909 			tmp = zone_magazine_replace(cache, mag, true);
5910 			zone_depot_insert_tail_full(&cache->zc_depot, tmp);
5911 		}
5912 
5913 		zone_depot_unlock_nopreempt(cache);
5914 
5915 		return mag ? cache : NULL;
5916 	}
5917 
5918 	return zfree_cached_recirculate(zone, cache);
5919 }
5920 
5921 __attribute__((always_inline))
5922 static inline zone_cache_t
5923 zfree_cached_get_pcpu_cache(zone_t zone, int cpu)
5924 {
5925 	zone_cache_t cache = zpercpu_get_cpu(zone->z_pcpu_cache, cpu);
5926 
5927 	if (__probable(cache->zc_free_cur < zc_mag_size())) {
5928 		return cache;
5929 	}
5930 
5931 	if (__probable(cache->zc_alloc_cur < zc_mag_size())) {
5932 		zone_cache_swap_magazines(cache);
5933 		return cache;
5934 	}
5935 
5936 	return zfree_cached_trim(zone, cache);
5937 }
5938 
5939 __attribute__((always_inline))
5940 static inline zone_cache_t
5941 zfree_cached_get_pcpu_cache_smr(zone_t zone, int cpu)
5942 {
5943 	zone_cache_t cache = zpercpu_get_cpu(zone->z_pcpu_cache, cpu);
5944 	size_t idx = cache->zc_free_cur;
5945 
5946 	if (__probable(idx + 1 < zc_mag_size())) {
5947 		return cache;
5948 	}
5949 
5950 	/*
5951 	 * When SMR is in use, the bucket is tagged early with
5952 	 * @c smr_deferred_advance(), which costs a full barrier,
5953 	 * but performs no store.
5954 	 *
5955 	 * When buckets hit the recirculation layer, the advance is committed
5956 	 * under the recirculation lock (see zfree_cached_recirculate()).
5957 	 *
5958 	 * When done this way, the zone contention detection mechanism
5959 	 * will adjust the size of the per-cpu depots gracefully, which
5960 	 * mechanically reduces the pace of these commits as usage increases.
5961 	 */
5962 
5963 	if (__probable(idx + 1 == zc_mag_size())) {
5964 		zone_magazine_t mag;
5965 
5966 		mag = (zone_magazine_t)((uintptr_t)cache->zc_free_elems -
5967 		    offsetof(struct zone_magazine, zm_elems));
5968 		mag->zm_seq = smr_deferred_advance(zone_cache_smr(cache));
5969 		return cache;
5970 	}
5971 
5972 	return zfree_cached_trim(zone, cache);
5973 }
5974 
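/*
 * __zcache_mark_invalid() is the common "element leaves the caller's
 * hands" step for the free paths below: it strips a PGZ double mapping
 * if one is in place, validates that the address really belongs to the
 * zone (zone map membership, zone index and alignment checks), updates
 * VM tag accounting, KASan and kernel memory tagging state, and returns
 * the canonical address to stash in magazines or free bitmaps.
 */
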
5975 __attribute__((always_inline))
5976 static inline vm_offset_t
5977 __zcache_mark_invalid(zone_t zone, vm_offset_t elem, uint64_t combined_size)
5978 {
5979 	struct zone_page_metadata *meta;
5980 	vm_offset_t offs;
5981 
5982 #pragma unused(combined_size)
5983 #if CONFIG_PROB_GZALLOC
5984 	if (__improbable(pgz_owned(elem))) {
5985 		elem = pgz_unprotect(elem, __builtin_frame_address(0));
5986 	}
5987 #endif /* CONFIG_PROB_GZALLOC */
5988 
5989 	meta = zone_meta_from_addr(elem);
5990 	if (!from_zone_map(elem, 1) || !zone_has_index(zone, meta->zm_index)) {
5991 		zone_invalid_element_panic(zone, elem);
5992 	}
5993 
5994 	offs = (elem & PAGE_MASK) - zone_elem_inner_offs(zone);
5995 	if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
5996 		offs += ptoa(meta->zm_page_index);
5997 	}
5998 
5999 	if (!Z_FAST_ALIGNED(offs, zone->z_align_magic)) {
6000 		zone_invalid_element_panic(zone, elem);
6001 	}
6002 
6003 #if VM_TAG_SIZECLASSES
6004 	if (__improbable(zone->z_uses_tags)) {
6005 		vm_tag_t *slot;
6006 
6007 		slot = zba_extra_ref_ptr(meta->zm_bitmap,
6008 		    Z_FAST_QUO(offs, zone->z_quo_magic));
6009 		vm_tag_update_zone_size(*slot, zone->z_tags_sizeclass,
6010 		    -(long)ZFREE_ELEM_SIZE(combined_size));
6011 		*slot = VM_KERN_MEMORY_NONE;
6012 	}
6013 #endif /* VM_TAG_SIZECLASSES */
6014 
6015 #if KASAN_CLASSIC
6016 	kasan_free(elem, ZFREE_ELEM_SIZE(combined_size),
6017 	    ZFREE_USER_SIZE(combined_size), zone_elem_redzone(zone),
6018 	    zone->z_percpu, __builtin_frame_address(0));
6019 #endif
6020 #if CONFIG_KERNEL_TAGGING
6021 	if (__probable(zone->z_tbi_tag)) {
6022 		elem = zone_tag_element(zone, elem, ZFREE_ELEM_SIZE(combined_size));
6023 	}
6024 #endif /* CONFIG_KERNEL_TAGGING */
6025 
6026 	return elem;
6027 }
6028 
6029 __attribute__((always_inline))
6030 void *
6031 zcache_mark_invalid(zone_t zone, void *elem)
6032 {
6033 	vm_size_t esize = zone_elem_inner_size(zone);
6034 
6035 	ZFREE_LOG(zone, (vm_offset_t)elem, 1);
6036 	return (void *)__zcache_mark_invalid(zone, (vm_offset_t)elem, ZFREE_PACK_SIZE(esize, esize));
6037 }
6038 
6039 /*
6040  * The function is noinline when zlog can be used so that backtracing can
6041  * reliably skip the boring zfree_ext() and zfree_log()
6042  * frames.
6043  */
6044 #if ZALLOC_ENABLE_LOGGING
6045 __attribute__((noinline))
6046 #endif /* ZALLOC_ENABLE_LOGGING */
6047 void
6048 zfree_ext(zone_t zone, zone_stats_t zstats, void *addr, uint64_t combined_size)
6049 {
6050 	vm_offset_t esize = ZFREE_ELEM_SIZE(combined_size);
6051 	vm_offset_t elem = (vm_offset_t)addr;
6052 	int cpu;
6053 
6054 	DTRACE_VM2(zfree, zone_t, zone, void*, elem);
6055 
6056 	ZFREE_LOG(zone, elem, 1);
6057 	elem = __zcache_mark_invalid(zone, elem, combined_size);
6058 
6059 	disable_preemption();
6060 	cpu = cpu_number();
6061 	zpercpu_get_cpu(zstats, cpu)->zs_mem_freed += esize;
6062 
6063 #if KASAN_CLASSIC
6064 	if (zone->z_kasan_quarantine && startup_phase >= STARTUP_SUB_ZALLOC) {
6065 		struct kasan_quarantine_result kqr;
6066 
6067 		kqr  = kasan_quarantine(elem, esize);
6068 		elem = kqr.addr;
6069 		zone = kqr.zone;
6070 		if (elem == 0) {
6071 			return enable_preemption();
6072 		}
6073 	}
6074 #endif
6075 
6076 	if (zone->z_pcpu_cache) {
6077 		zone_cache_t cache = zfree_cached_get_pcpu_cache(zone, cpu);
6078 
6079 		if (__probable(cache)) {
6080 			cache->zc_free_elems[cache->zc_free_cur++] = elem;
6081 			return enable_preemption();
6082 		}
6083 	}
6084 
6085 	return zfree_item(zone, elem);
6086 }
6087 
6088 __attribute__((always_inline))
6089 static inline zstack_t
6090 zcache_free_stack_to_cpu(
6091 	zone_id_t               zid,
6092 	zone_cache_t            cache,
6093 	zstack_t                stack,
6094 	vm_size_t               esize,
6095 	zone_cache_ops_t        ops,
6096 	bool                    zero)
6097 {
6098 	size_t       n = MIN(zc_mag_size() - cache->zc_free_cur, stack.z_count);
6099 	vm_offset_t *p;
6100 
6101 	stack.z_count -= n;
6102 	cache->zc_free_cur += n;
6103 	p = cache->zc_free_elems + cache->zc_free_cur;
6104 
6105 	do {
6106 		void *o = zstack_pop_no_delta(&stack);
6107 
6108 		if (ops) {
6109 			o = ops->zc_op_mark_invalid(zid, o);
6110 		} else {
6111 			if (zero) {
6112 				bzero(o, esize);
6113 			}
6114 			o = (void *)__zcache_mark_invalid(zone_by_id(zid),
6115 			    (vm_offset_t)o, ZFREE_PACK_SIZE(esize, esize));
6116 		}
6117 		*--p  = (vm_offset_t)o;
6118 	} while (--n > 0);
6119 
6120 	return stack;
6121 }
6122 
6123 __attribute__((always_inline))
6124 static inline void
6125 zcache_free_1_ext(zone_id_t zid, void *addr, zone_cache_ops_t ops)
6126 {
6127 	vm_offset_t elem = (vm_offset_t)addr;
6128 	zone_cache_t cache;
6129 	vm_size_t esize;
6130 	zone_t zone = zone_by_id(zid);
6131 	int cpu;
6132 
6133 	ZFREE_LOG(zone, elem, 1);
6134 
6135 	disable_preemption();
6136 	cpu = cpu_number();
6137 	esize = zone_elem_inner_size(zone);
6138 	zpercpu_get_cpu(zone->z_stats, cpu)->zs_mem_freed += esize;
6139 	if (!ops) {
6140 		addr = (void *)__zcache_mark_invalid(zone, elem,
6141 		    ZFREE_PACK_SIZE(esize, esize));
6142 	}
6143 	cache = zfree_cached_get_pcpu_cache(zone, cpu);
6144 	if (__probable(cache)) {
6145 		if (ops) {
6146 			addr = ops->zc_op_mark_invalid(zid, addr);
6147 		}
6148 		cache->zc_free_elems[cache->zc_free_cur++] = elem;
6149 		enable_preemption();
6150 	} else if (ops) {
6151 		enable_preemption();
6152 		os_atomic_dec(&zone_by_id(zid)->z_elems_avail, relaxed);
6153 		ops->zc_op_free(zid, addr);
6154 	} else {
6155 		zfree_item(zone, elem);
6156 	}
6157 }
6158 
6159 __attribute__((always_inline))
6160 static inline void
6161 zcache_free_n_ext(zone_id_t zid, zstack_t stack, zone_cache_ops_t ops, bool zero)
6162 {
6163 	zone_t zone = zone_by_id(zid);
6164 	zone_cache_t cache;
6165 	vm_size_t esize;
6166 	int cpu;
6167 
6168 	ZFREE_LOG(zone, stack.z_head, stack.z_count);
6169 
6170 	disable_preemption();
6171 	cpu = cpu_number();
6172 	esize = zone_elem_inner_size(zone);
6173 	zpercpu_get_cpu(zone->z_stats, cpu)->zs_mem_freed +=
6174 	    stack.z_count * esize;
6175 
6176 	for (;;) {
6177 		cache = zfree_cached_get_pcpu_cache(zone, cpu);
6178 		if (__probable(cache)) {
6179 			stack = zcache_free_stack_to_cpu(zid, cache,
6180 			    stack, esize, ops, zero);
6181 			enable_preemption();
6182 		} else if (ops) {
6183 			enable_preemption();
6184 			os_atomic_dec(&zone->z_elems_avail, relaxed);
6185 			ops->zc_op_free(zid, zstack_pop(&stack));
6186 		} else {
6187 			vm_offset_t addr = (vm_offset_t)zstack_pop(&stack);
6188 
6189 			if (zero) {
6190 				bzero((void *)addr, esize);
6191 			}
6192 			addr = __zcache_mark_invalid(zone, addr,
6193 			    ZFREE_PACK_SIZE(esize, esize));
6194 			zfree_item(zone, addr);
6195 		}
6196 
6197 		if (stack.z_count == 0) {
6198 			break;
6199 		}
6200 
6201 		disable_preemption();
6202 		cpu = cpu_number();
6203 	}
6204 }
6205 
6206 void
6207 (zcache_free)(zone_id_t zid, void *addr, zone_cache_ops_t ops)
6208 {
6209 	__builtin_assume(ops != NULL);
6210 	zcache_free_1_ext(zid, addr, ops);
6211 }
6212 
6213 void
6214 (zcache_free_n)(zone_id_t zid, zstack_t stack, zone_cache_ops_t ops)
6215 {
6216 	__builtin_assume(ops != NULL);
6217 	zcache_free_n_ext(zid, stack, ops, false);
6218 }
6219 
6220 void
6221 (zfree_n)(zone_id_t zid, zstack_t stack)
6222 {
6223 	zcache_free_n_ext(zid, stack, NULL, true);
6224 }
6225 
6226 void
6227 (zfree_nozero)(zone_id_t zid, void *addr)
6228 {
6229 	zcache_free_1_ext(zid, addr, NULL);
6230 }
6231 
6232 void
6233 (zfree_nozero_n)(zone_id_t zid, zstack_t stack)
6234 {
6235 	zcache_free_n_ext(zid, stack, NULL, false);
6236 }
6237 
6238 void
6239 (zfree)(zone_t zov, void *addr)
6240 {
6241 	zone_t zone = zov->z_self;
6242 	zone_stats_t zstats = zov->z_stats;
6243 	vm_offset_t esize = zone_elem_inner_size(zone);
6244 
6245 	assert(zone > &zone_array[ZONE_ID__LAST_RO]);
6246 	assert(!zone->z_percpu && !zone->z_permanent && !zone->z_smr);
6247 
6248 	vm_memtag_bzero(addr, esize);
6249 
6250 	zfree_ext(zone, zstats, addr, ZFREE_PACK_SIZE(esize, esize));
6251 }
6252 
6253 __attribute__((noinline))
6254 void
6255 zfree_percpu(union zone_or_view zov, void *addr)
6256 {
6257 	zone_t zone = zov.zov_view->zv_zone;
6258 	zone_stats_t zstats = zov.zov_view->zv_stats;
6259 	vm_offset_t esize = zone_elem_inner_size(zone);
6260 
6261 	assert(zone > &zone_array[ZONE_ID__LAST_RO]);
6262 	assert(zone->z_percpu);
6263 	addr = (void *)__zpcpu_demangle(addr);
6264 	zpercpu_foreach_cpu(i) {
6265 		vm_memtag_bzero((char *)addr + ptoa(i), esize);
6266 	}
6267 	zfree_ext(zone, zstats, addr, ZFREE_PACK_SIZE(esize, esize));
6268 }
6269 
6270 void
6271 (zfree_id)(zone_id_t zid, void *addr)
6272 {
6273 	(zfree)(&zone_array[zid], addr);
6274 }
6275 
6276 void
6277 (zfree_ro)(zone_id_t zid, void *addr)
6278 {
6279 	assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO);
6280 	zone_t zone = zone_by_id(zid);
6281 	zone_stats_t zstats = zone->z_stats;
6282 	vm_offset_t esize = zone_ro_size_params[zid].z_elem_size;
6283 
6284 #if ZSECURITY_CONFIG(READ_ONLY)
6285 	assert(zone_security_array[zid].z_submap_idx == Z_SUBMAP_IDX_READ_ONLY);
6286 	pmap_ro_zone_bzero(zid, (vm_offset_t)addr, 0, esize);
6287 #else
6288 	(void)zid;
6289 	bzero(addr, esize);
6290 #endif /* !KASAN_CLASSIC */
6291 	zfree_ext(zone, zstats, addr, ZFREE_PACK_SIZE(esize, esize));
6292 }
6293 
6294 __attribute__((noinline))
6295 static void
6296 zfree_item_smr(zone_t zone, vm_offset_t addr)
6297 {
6298 	zone_cache_t cache = zpercpu_get_cpu(zone->z_pcpu_cache, 0);
6299 	vm_size_t esize = zone_elem_inner_size(zone);
6300 
6301 	/*
6302 	 * This path should be taken extremely rarely:
6303 	 * it only happens if we failed to allocate an empty bucket.
6304 	 */
6305 	smr_synchronize(zone_cache_smr(cache));
6306 
6307 	cache->zc_free((void *)addr, esize);
6308 	addr = __zcache_mark_invalid(zone, addr, ZFREE_PACK_SIZE(esize, esize));
6309 
6310 	zfree_item(zone, addr);
6311 }
6312 
6313 void
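/*
 * SMR note: zfree_smr() does not make the element immediately
 * reusable.  The per-CPU magazine is tagged with a deferred SMR
 * advance as it fills (see zfree_cached_get_pcpu_cache_smr()), the
 * advance is committed when the magazine reaches the recirculation
 * depot, and elements are only scrubbed and reused once the zone's
 * SMR clock has passed that sequence (see zalloc_cached_reuse_smr()).
 */
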
6314 (zfree_smr)(zone_t zone, void *addr)
6315 {
6316 	vm_offset_t elem = (vm_offset_t)addr;
6317 	vm_offset_t esize;
6318 	zone_cache_t cache;
6319 	int cpu;
6320 
6321 	ZFREE_LOG(zone, elem, 1);
6322 
6323 	disable_preemption();
6324 	cpu   = cpu_number();
6325 #if MACH_ASSERT
6326 	cache = zpercpu_get_cpu(zone->z_pcpu_cache, cpu);
6327 	assert(!smr_entered_cpu_noblock(cache->zc_smr, cpu));
6328 #endif
6329 	esize = zone_elem_inner_size(zone);
6330 	zpercpu_get_cpu(zone->z_stats, cpu)->zs_mem_freed += esize;
6331 	cache = zfree_cached_get_pcpu_cache_smr(zone, cpu);
6332 	if (__probable(cache)) {
6333 		cache->zc_free_elems[cache->zc_free_cur++] = elem;
6334 		enable_preemption();
6335 	} else {
6336 		zfree_item_smr(zone, elem);
6337 	}
6338 }
6339 
6340 void
6341 (zfree_id_smr)(zone_id_t zid, void *addr)
6342 {
6343 	(zfree_smr)(&zone_array[zid], addr);
6344 }
6345 
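/*
 * kfree_type_impl_internal() picks the zone to free to for kalloc_type
 * views: data-buffer heaps and exact zone-index matches free to the
 * view's own zone, elements whose backing zone shares the view's
 * signature equivalence class free to the zone recorded in the page
 * metadata, and anything else falls back to the view's shared zone.
 */
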
6346 void
6347 kfree_type_impl_internal(
6348 	kalloc_type_view_t  kt_view,
6349 	void               *ptr __unsafe_indexable)
6350 {
6351 	zone_t zsig = kt_view->kt_zsig;
6352 	zone_t z = kt_view->kt_zv.zv_zone;
6353 	struct zone_page_metadata *meta;
6354 	zone_id_t zidx_meta;
6355 	zone_security_flags_t zsflags_meta;
6356 	zone_security_flags_t zsflags_z = zone_security_config(z);
6357 	zone_security_flags_t zsflags_zsig;
6358 
6359 	if (NULL == ptr) {
6360 		return;
6361 	}
6362 
6363 	meta = zone_meta_from_addr((vm_offset_t) ptr);
6364 	zidx_meta = meta->zm_index;
6365 	zsflags_meta = zone_security_array[zidx_meta];
6366 
6367 	if ((zsflags_z.z_kheap_id == KHEAP_ID_DATA_BUFFERS) ||
6368 	    zone_has_index(z, zidx_meta)) {
6369 		return (zfree)(&kt_view->kt_zv, ptr);
6370 	}
6371 	zsflags_zsig = zone_security_config(zsig);
6372 	if (zsflags_meta.z_sig_eq == zsflags_zsig.z_sig_eq) {
6373 		z = zone_array + zidx_meta;
6374 		return (zfree)(z, ptr);
6375 	}
6376 
6377 	return (zfree)(kt_view->kt_zshared, ptr);
6378 }
6379 
6380 /*! @} */
6381 #endif /* !ZALLOC_TEST */
6382 #pragma mark zalloc
6383 #if !ZALLOC_TEST
6384 
6385 /*!
6386  * @defgroup zalloc
6387  * @{
6388  *
6389  * @brief
6390  * The codepath for zone allocations.
6391  *
6392  * @discussion
6393  * There are 4 major ways to allocate memory that end up in the zone allocator:
6394  * - @c zalloc(), @c zalloc_flags(), ...
6395  * - @c zalloc_percpu()
6396  * - @c kalloc*()
6397  * - @c zalloc_permanent()
6398  *
6399  * While permanent zones have their own allocation scheme, all other codepaths
6400  * will eventually go through the @c zalloc_ext() choking point.
6401  *
6402  * @c zalloc_return() is the final function everyone tail calls into,
6403  * which prepares the element for consumption by the caller and deals with
6404  * common treatment (zone logging, tags, kasan, validation, ...).
6405  */
6406 
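/*
 * Illustrative sketch (hypothetical caller; `some_zone` is not a real
 * zone): the flag combinations zalloc_ext() below distinguishes.
 *
 *     elem = zalloc_flags(some_zone, Z_WAITOK | Z_NOFAIL);
 *         may block while the zone expands and never returns NULL
 *         (Z_NOFAIL must not be combined with Z_NOWAIT/Z_NOPAGEWAIT).
 *
 *     elem = zalloc_flags(some_zone, Z_NOWAIT | Z_ZERO);
 *         never blocks; the caller must handle a NULL return.
 */
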
6407 /*!
6408  * @function zalloc_import
6409  *
6410  * @brief
6411  * Import @c n elements into the specified array; the opposite of @c zfree_drop().
6412  *
6413  * @param zone          The zone to import elements from
6414  * @param elems         The array to import into
6415  * @param n             The number of elements to import. Must be non-zero,
6416  *                      and smaller than @c zone->z_elems_free.
6417  */
6418 __header_always_inline vm_size_t
6419 zalloc_import(
6420 	zone_t                  zone,
6421 	vm_offset_t            *elems,
6422 	zalloc_flags_t          flags,
6423 	uint32_t                n)
6424 {
6425 	vm_offset_t esize = zone_elem_outer_size(zone);
6426 	vm_offset_t offs  = zone_elem_inner_offs(zone);
6427 	zone_stats_t zs;
6428 	int cpu = cpu_number();
6429 	uint32_t i = 0;
6430 
6431 	zs = zpercpu_get_cpu(zone->z_stats, cpu);
6432 
6433 	if (__improbable(zone_caching_disabled < 0)) {
6434 		/*
6435 		 * In the first 10s after boot, mess with
6436 		 * the scan position in order to make early
6437 		 * allocation patterns less predictable.
6438 		 */
6439 		zone_early_scramble_rr(zone, cpu, zs);
6440 	}
6441 
6442 	do {
6443 		vm_offset_t page, eidx, size = 0;
6444 		struct zone_page_metadata *meta;
6445 
6446 		if (!zone_pva_is_null(zone->z_pageq_partial)) {
6447 			meta = zone_pva_to_meta(zone->z_pageq_partial);
6448 			page = zone_pva_to_addr(zone->z_pageq_partial);
6449 		} else if (!zone_pva_is_null(zone->z_pageq_empty)) {
6450 			meta = zone_pva_to_meta(zone->z_pageq_empty);
6451 			page = zone_pva_to_addr(zone->z_pageq_empty);
6452 			zone_counter_sub(zone, z_wired_empty, meta->zm_chunk_len);
6453 		} else {
6454 			zone_accounting_panic(zone, "z_elems_free corruption");
6455 		}
6456 
6457 		zone_meta_validate(zone, meta, page);
6458 
6459 		vm_offset_t old_size = meta->zm_alloc_size;
6460 		vm_offset_t max_size = ptoa(meta->zm_chunk_len) + ZM_ALLOC_SIZE_LOCK;
6461 
6462 		do {
6463 			eidx = zone_meta_find_and_clear_bit(zone, zs, meta, flags);
6464 			elems[i++] = page + offs + eidx * esize;
6465 			size += esize;
6466 		} while (i < n && old_size + size + esize <= max_size);
6467 
6468 		vm_offset_t new_size = zone_meta_alloc_size_add(zone, meta, size);
6469 
6470 		if (new_size + esize > max_size) {
6471 			zone_meta_requeue(zone, &zone->z_pageq_full, meta);
6472 		} else if (old_size == 0) {
6473 			/* remove from free, move to intermediate */
6474 			zone_meta_requeue(zone, &zone->z_pageq_partial, meta);
6475 		}
6476 	} while (i < n);
6477 
6478 	n = zone_counter_sub(zone, z_elems_free, n);
6479 	if (zone->z_pcpu_cache == NULL && zone->z_elems_free_min > n) {
6480 		zone->z_elems_free_min = n;
6481 	}
6482 
6483 	return zone_elem_inner_size(zone);
6484 }
6485 
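/*
 * __zcache_mark_valid() is the mirror image of __zcache_mark_invalid():
 * it re-derives the tagged pointer for kernel memory tagging, records
 * the VM allocation tag for tagged sizeclasses, optionally re-routes the
 * element through a PGZ guard mapping when sampling decides to, and sets
 * up the KASan redzones before the element is handed to the caller.
 */
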
6486 __attribute__((always_inline))
6487 static inline vm_offset_t
6488 __zcache_mark_valid(zone_t zone, vm_offset_t addr, zalloc_flags_t flags)
6489 {
6490 #pragma unused(zone, flags)
6491 #if KASAN_CLASSIC || CONFIG_PROB_GZALLOC || VM_TAG_SIZECLASSES
6492 	vm_offset_t esize = zone_elem_inner_size(zone);
6493 #endif
6494 
6495 #if CONFIG_KERNEL_TAGGING
6496 	if (__probable(zone->z_tbi_tag)) {
6497 		/*
6498 		 * Retrieve the memory tag assigned on free and update the pointer
6499 		 * metadata.
6500 		 */
6501 		addr = vm_memtag_fixup_ptr(addr);
6502 	}
6503 #endif /* CONFIG_KERNEL_TAGGING */
6504 
6505 #if VM_TAG_SIZECLASSES
6506 	if (__improbable(zone->z_uses_tags)) {
6507 		struct zone_page_metadata *meta;
6508 		vm_offset_t offs;
6509 		vm_tag_t *slot;
6510 		vm_tag_t tag;
6511 
6512 		tag  = zalloc_flags_get_tag(flags);
6513 		meta = zone_meta_from_addr(addr);
6514 		offs = (addr & PAGE_MASK) - zone_elem_inner_offs(zone);
6515 		if (meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
6516 			offs += ptoa(meta->zm_page_index);
6517 		}
6518 
6519 		slot = zba_extra_ref_ptr(meta->zm_bitmap,
6520 		    Z_FAST_QUO(offs, zone->z_quo_magic));
6521 		*slot = tag;
6522 
6523 		vm_tag_update_zone_size(tag, zone->z_tags_sizeclass,
6524 		    (long)esize);
6525 	}
6526 #endif /* VM_TAG_SIZECLASSES */
6527 
6528 #if CONFIG_PROB_GZALLOC
6529 	if (zone->z_pgz_tracked && pgz_sample(addr, esize)) {
6530 		addr = pgz_protect(zone, addr, __builtin_frame_address(0));
6531 	}
6532 #endif
6533 
6534 #if KASAN_CLASSIC
6535 	/*
6536 	 * KASAN_CLASSIC integration of kalloc heaps is handled by kalloc_ext()
6537 	 */
6538 	if ((flags & Z_SKIP_KASAN) == 0) {
6539 		kasan_alloc(addr, esize, esize, zone_elem_redzone(zone),
6540 		    (flags & Z_PCPU), __builtin_frame_address(0));
6541 	}
6542 #endif /* KASAN_CLASSIC */
6543 
6544 	return addr;
6545 }
6546 
6547 __attribute__((always_inline))
6548 void *
6549 zcache_mark_valid(zone_t zone, void *addr)
6550 {
6551 	addr = (void *)__zcache_mark_valid(zone, (vm_offset_t)addr, 0);
6552 	ZALLOC_LOG(zone, (vm_offset_t)addr, 1);
6553 	return addr;
6554 }
6555 
6556 /*!
6557  * @function zalloc_return
6558  *
6559  * @brief
6560  * Performs the tail-end of the work required on allocations before the caller
6561  * uses them.
6562  *
6563  * @discussion
6564  * This function is called without any zone lock held,
6565  * and with preemption restored to the state it had when @c zalloc_ext() was called.
6566  *
6567  * @param zone          The zone we're allocating from.
6568  * @param addr          The element we just allocated.
6569  * @param flags         The flags passed to @c zalloc_ext() (for Z_ZERO).
6570  * @param elem_size     The element size for this zone.
6571  */
6572 __attribute__((always_inline))
6573 static struct kalloc_result
6574 zalloc_return(
6575 	zone_t                  zone,
6576 	vm_offset_t             addr,
6577 	zalloc_flags_t          flags,
6578 	vm_offset_t             elem_size)
6579 {
6580 	addr = __zcache_mark_valid(zone, addr, flags);
6581 #if ZALLOC_ENABLE_ZERO_CHECK
6582 	zalloc_validate_element(zone, addr, elem_size, flags);
6583 #endif /* ZALLOC_ENABLE_ZERO_CHECK */
6584 	ZALLOC_LOG(zone, addr, 1);
6585 
6586 	DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
6587 	return (struct kalloc_result){ (void *)addr, elem_size };
6588 }
6589 
6590 static vm_size_t
6591 zalloc_get_shared_threshold(zone_t zone, vm_size_t esize)
6592 {
6593 	if (esize <= 512) {
6594 		return zone_early_thres_mul * page_size / 4;
6595 	} else if (esize < 2048) {
6596 		return zone_early_thres_mul * esize * 8;
6597 	}
6598 	return zone_early_thres_mul * zone->z_chunk_elems * esize;
6599 }
6600 
6601 __attribute__((noinline))
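/*
 * Worked example (assuming 16K pages and zone_early_thres_mul == 1,
 * both of which depend on the platform and boot phase): a 256-byte
 * element zone gets a threshold of page_size / 4 = 4096 bytes, a
 * 1024-byte element zone gets 8 * 1024 = 8192 bytes, and larger
 * elements switch to a full chunk's worth (z_chunk_elems * esize).
 */
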
6602 static struct kalloc_result
6603 zalloc_item(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)
6604 {
6605 	vm_offset_t esize, addr;
6606 	zone_stats_t zs;
6607 
6608 	zone_lock_nopreempt_check_contention(zone);
6609 
6610 	zs = zpercpu_get(zstats);
6611 	if (__improbable(zone->z_elems_free <= zone->z_elems_rsv / 2)) {
6612 		if ((flags & Z_NOWAIT) || zone->z_elems_free) {
6613 			zone_expand_async_schedule_if_allowed(zone);
6614 		} else {
6615 			zone_expand_locked(zone, flags);
6616 		}
6617 		if (__improbable(zone->z_elems_free == 0)) {
6618 			zs->zs_alloc_fail++;
6619 			zone_unlock(zone);
6620 			if (__improbable(flags & Z_NOFAIL)) {
6621 				zone_nofail_panic(zone);
6622 			}
6623 			DTRACE_VM2(zalloc, zone_t, zone, void*, NULL);
6624 			return (struct kalloc_result){ };
6625 		}
6626 	}
6627 
6628 	esize = zalloc_import(zone, &addr, flags, 1);
6629 	zs->zs_mem_allocated += esize;
6630 
6631 	if (__improbable(!zone_share_always &&
6632 	    !os_atomic_load(&zs->zs_alloc_not_shared, relaxed))) {
6633 		if (flags & Z_SET_NOTSHARED) {
6634 			vm_size_t shared_threshold = zalloc_get_shared_threshold(zone, esize);
6635 
6636 			if (zs->zs_mem_allocated >= shared_threshold) {
6637 				zpercpu_foreach(zs_cpu, zstats) {
6638 					os_atomic_store(&zs_cpu->zs_alloc_not_shared, 1, relaxed);
6639 				}
6640 			}
6641 		}
6642 	}
6643 	zone_unlock(zone);
6644 
6645 	return zalloc_return(zone, addr, flags, esize);
6646 }
6647 
6648 static void
6649 zalloc_cached_import(
6650 	zone_t                  zone,
6651 	zalloc_flags_t          flags,
6652 	zone_cache_t            cache)
6653 {
6654 	uint16_t n_elems = zc_mag_size();
6655 
6656 	zone_lock_nopreempt(zone);
6657 
6658 	if (__probable(!zone_caching_disabled &&
6659 	    zone->z_elems_free > zone->z_elems_rsv / 2)) {
6660 		if (__improbable(zone->z_elems_free <= zone->z_elems_rsv)) {
6661 			zone_expand_async_schedule_if_allowed(zone);
6662 		}
6663 		if (zone->z_elems_free < n_elems) {
6664 			n_elems = (uint16_t)zone->z_elems_free;
6665 		}
6666 		zalloc_import(zone, cache->zc_alloc_elems, flags, n_elems);
6667 		cache->zc_alloc_cur = n_elems;
6668 	}
6669 
6670 	zone_unlock_nopreempt(zone);
6671 }
6672 
6673 static void
6674 zalloc_cached_depot_recirculate(
6675 	zone_t                  zone,
6676 	uint32_t                depot_max,
6677 	zone_cache_t            cache,
6678 	smr_t                   smr)
6679 {
6680 	smr_seq_t seq;
6681 	uint32_t n;
6682 
6683 	zone_recirc_lock_nopreempt_check_contention(zone);
6684 
6685 	n = cache->zc_depot.zd_empty;
6686 	if (n >= depot_max) {
6687 		zone_depot_move_empty(&zone->z_recirc, &cache->zc_depot,
6688 		    n - depot_max / 2, NULL);
6689 	}
6690 
6691 	n = cache->zc_depot.zd_full;
6692 	if (smr && n) {
6693 		/*
6694 		 * If SMR is in use, it means smr_poll() failed,
6695 		 * so rotate the entire chunk of magazines in order
6696 		 * to let the sequence numbers age.
6697 		 */
6698 		seq = zone_depot_move_full(&zone->z_recirc, &cache->zc_depot,
6699 		    n, NULL);
6700 		smr_deferred_advance_commit(smr, seq);
6701 	}
6702 
6703 	n = depot_max - cache->zc_depot.zd_empty;
6704 	if (n > zone->z_recirc.zd_full) {
6705 		n = zone->z_recirc.zd_full;
6706 	}
6707 
6708 	if (n && zone_depot_poll(&zone->z_recirc, smr)) {
6709 		zone_depot_move_full(&cache->zc_depot, &zone->z_recirc,
6710 		    n, zone);
6711 	}
6712 
6713 	zone_recirc_unlock_nopreempt(zone);
6714 }
6715 
6716 static void
6717 zalloc_cached_reuse_smr(zone_t z, zone_cache_t cache, zone_magazine_t mag)
6718 {
6719 	zone_smr_free_cb_t zc_free = cache->zc_free;
6720 	vm_size_t esize = zone_elem_inner_size(z);
6721 
6722 	for (uint16_t i = 0; i < zc_mag_size(); i++) {
6723 		vm_offset_t elem = mag->zm_elems[i];
6724 
6725 		zc_free((void *)elem, zone_elem_inner_size(z));
6726 		elem = __zcache_mark_invalid(z, elem,
6727 		    ZFREE_PACK_SIZE(esize, esize));
6728 		mag->zm_elems[i] = elem;
6729 	}
6730 }
6731 
6732 static void
6733 zalloc_cached_recirculate(
6734 	zone_t                  zone,
6735 	zone_cache_t            cache)
6736 {
6737 	zone_magazine_t mag = NULL;
6738 
6739 	zone_recirc_lock_nopreempt_check_contention(zone);
6740 
6741 	if (zone_depot_poll(&zone->z_recirc, zone_cache_smr(cache))) {
6742 		mag = zone_depot_pop_head_full(&zone->z_recirc, zone);
6743 		if (zone_cache_smr(cache)) {
6744 			zalloc_cached_reuse_smr(zone, cache, mag);
6745 		}
6746 		mag = zone_magazine_replace(cache, mag, false);
6747 		zone_depot_insert_head_empty(&zone->z_recirc, mag);
6748 	}
6749 
6750 	zone_recirc_unlock_nopreempt(zone);
6751 }
6752 
6753 __attribute__((noinline))
6754 static zone_cache_t
6755 zalloc_cached_prime(
6756 	zone_t                  zone,
6757 	zone_cache_ops_t        ops,
6758 	zalloc_flags_t          flags,
6759 	zone_cache_t            cache)
6760 {
6761 	zone_magazine_t mag = NULL;
6762 	uint32_t depot_max;
6763 	smr_t smr;
6764 
6765 	depot_max = os_atomic_load(&zone->z_depot_size, relaxed);
6766 	if (depot_max) {
6767 		smr = zone_cache_smr(cache);
6768 
6769 		zone_depot_lock_nopreempt(cache);
6770 
6771 		if (!zone_depot_poll(&cache->zc_depot, smr)) {
6772 			zalloc_cached_depot_recirculate(zone, depot_max, cache,
6773 			    smr);
6774 		}
6775 
6776 		if (__probable(cache->zc_depot.zd_full)) {
6777 			mag = zone_depot_pop_head_full(&cache->zc_depot, NULL);
6778 			if (zone_cache_smr(cache)) {
6779 				zalloc_cached_reuse_smr(zone, cache, mag);
6780 			}
6781 			mag = zone_magazine_replace(cache, mag, false);
6782 			zone_depot_insert_head_empty(&cache->zc_depot, mag);
6783 		}
6784 
6785 		zone_depot_unlock_nopreempt(cache);
6786 	} else if (zone->z_recirc.zd_full) {
6787 		zalloc_cached_recirculate(zone, cache);
6788 	}
6789 
6790 	if (__probable(cache->zc_alloc_cur)) {
6791 		return cache;
6792 	}
6793 
6794 	if (ops == NULL) {
6795 		zalloc_cached_import(zone, flags, cache);
6796 		if (__probable(cache->zc_alloc_cur)) {
6797 			return cache;
6798 		}
6799 	}
6800 
6801 	return NULL;
6802 }
6803 
6804 __attribute__((always_inline))
6805 static inline zone_cache_t
6806 zalloc_cached_get_pcpu_cache(
6807 	zone_t                  zone,
6808 	zone_cache_ops_t        ops,
6809 	int                     cpu,
6810 	zalloc_flags_t          flags)
6811 {
6812 	zone_cache_t cache = zpercpu_get_cpu(zone->z_pcpu_cache, cpu);
6813 
6814 	if (__probable(cache->zc_alloc_cur != 0)) {
6815 		return cache;
6816 	}
6817 
6818 	if (__probable(cache->zc_free_cur != 0 && !cache->zc_smr)) {
6819 		zone_cache_swap_magazines(cache);
6820 		return cache;
6821 	}
6822 
6823 	return zalloc_cached_prime(zone, ops, flags, cache);
6824 }
6825 
6826 
6827 /*!
6828  * @function zalloc_ext
6829  *
6830  * @brief
6831  * The core implementation of @c zalloc(), @c zalloc_flags(), @c zalloc_percpu().
6832  */
6833 struct kalloc_result
6834 zalloc_ext(zone_t zone, zone_stats_t zstats, zalloc_flags_t flags)
6835 {
6836 	/*
6837 	 * KASan uses zalloc() for fakestack, which can be called anywhere.
6838 	 * However, we make sure these calls can never block.
6839 	 */
6840 	assertf(startup_phase < STARTUP_SUB_EARLY_BOOT ||
6841 #if KASAN_FAKESTACK
6842 	    zone->z_kasan_fakestacks ||
6843 #endif /* KASAN_FAKESTACK */
6844 	    ml_get_interrupts_enabled() ||
6845 	    ml_is_quiescing() ||
6846 	    debug_mode_active(),
6847 	    "Calling {k,z}alloc from interrupt disabled context isn't allowed");
6848 
6849 	/*
6850 	 * Make sure Z_NOFAIL was not obviously misused
6851 	 */
6852 	if (flags & Z_NOFAIL) {
6853 		assert((flags & (Z_NOWAIT | Z_NOPAGEWAIT)) == 0);
6854 	}
6855 
6856 #if VM_TAG_SIZECLASSES
6857 	if (__improbable(zone->z_uses_tags)) {
6858 		vm_tag_t tag = zalloc_flags_get_tag(flags);
6859 
6860 		if (flags & Z_VM_TAG_BT_BIT) {
6861 			tag = vm_tag_bt() ?: tag;
6862 		}
6863 		if (tag != VM_KERN_MEMORY_NONE) {
6864 			tag = vm_tag_will_update_zone(tag, zone->z_tags_sizeclass,
6865 			    flags & (Z_WAITOK | Z_NOWAIT | Z_NOPAGEWAIT));
6866 		}
6867 		if (tag == VM_KERN_MEMORY_NONE) {
6868 			zone_security_flags_t zsflags = zone_security_config(zone);
6869 
6870 			if (zsflags.z_kheap_id == KHEAP_ID_DATA_BUFFERS) {
6871 				tag = VM_KERN_MEMORY_KALLOC_DATA;
6872 			} else if (zsflags.z_kheap_id == KHEAP_ID_KT_VAR ||
6873 			    zsflags.z_kalloc_type) {
6874 				tag = VM_KERN_MEMORY_KALLOC_TYPE;
6875 			} else {
6876 				tag = VM_KERN_MEMORY_KALLOC;
6877 			}
6878 		}
6879 		flags = Z_VM_TAG(flags & ~Z_VM_TAG_MASK, tag);
6880 	}
6881 #endif /* VM_TAG_SIZECLASSES */
6882 
6883 	disable_preemption();
6884 
6885 #if ZALLOC_ENABLE_ZERO_CHECK
6886 	if (zalloc_skip_zero_check()) {
6887 		flags |= Z_NOZZC;
6888 	}
6889 #endif
6890 
6891 	if (zone->z_pcpu_cache) {
6892 		zone_cache_t cache;
6893 		vm_offset_t index, addr, esize;
6894 		int cpu = cpu_number();
6895 
6896 		cache = zalloc_cached_get_pcpu_cache(zone, NULL, cpu, flags);
6897 		if (__probable(cache)) {
6898 			esize = zone_elem_inner_size(zone);
6899 			zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated += esize;
6900 			index = --cache->zc_alloc_cur;
6901 			addr  = cache->zc_alloc_elems[index];
6902 			cache->zc_alloc_elems[index] = 0;
6903 			enable_preemption();
6904 			return zalloc_return(zone, addr, flags, esize);
6905 		}
6906 	}
6907 
6908 	__attribute__((musttail))
6909 	return zalloc_item(zone, zstats, flags);
6910 }
6911 
6912 __attribute__((always_inline))
6913 static inline zstack_t
6914 zcache_alloc_stack_from_cpu(
6915 	zone_id_t               zid,
6916 	zone_cache_t            cache,
6917 	zstack_t                stack,
6918 	uint32_t                n,
6919 	zone_cache_ops_t        ops)
6920 {
6921 	vm_offset_t *p;
6922 
6923 	n = MIN(n, cache->zc_alloc_cur);
6924 	p = cache->zc_alloc_elems + cache->zc_alloc_cur;
6925 	cache->zc_alloc_cur -= n;
6926 	stack.z_count += n;
6927 
6928 	do {
6929 		vm_offset_t e = *--p;
6930 
6931 		*p = 0;
6932 		if (ops) {
6933 			e = (vm_offset_t)ops->zc_op_mark_valid(zid, (void *)e);
6934 		} else {
6935 			e = __zcache_mark_valid(zone_by_id(zid), e, 0);
6936 		}
6937 		zstack_push_no_delta(&stack, (void *)e);
6938 	} while (--n > 0);
6939 
6940 	return stack;
6941 }
6942 
6943 __attribute__((noinline))
6944 static zstack_t
6945 zcache_alloc_fail(zone_id_t zid, zstack_t stack, uint32_t count)
6946 {
6947 	zone_t zone = zone_by_id(zid);
6948 	zone_stats_t zstats = zone->z_stats;
6949 	int cpu;
6950 
6951 	count -= stack.z_count;
6952 
6953 	disable_preemption();
6954 	cpu = cpu_number();
6955 	zpercpu_get_cpu(zstats, cpu)->zs_mem_allocated -=
6956 	    count * zone_elem_inner_size(zone);
6957 	zpercpu_get_cpu(zstats, cpu)->zs_alloc_fail += 1;
6958 	enable_preemption();
6959 
6960 	return stack;
6961 }
6962 
6963 #define ZCACHE_ALLOC_RETRY  ((void *)-1)
6964 
6965 __attribute__((noinline))
6966 static void *
6967 zcache_alloc_one(
6968 	zone_id_t               zid,
6969 	zalloc_flags_t          flags,
6970 	zone_cache_ops_t        ops)
6971 {
6972 	zone_t zone = zone_by_id(zid);
6973 	void *o;
6974 
6975 	/*
6976 	 * First try to allocate in rudimentary zones without ever going into
6977 	 * __ZONE_EXHAUSTED_AND_WAITING_HARD__() by clearing Z_NOFAIL.
6978 	 */
6979 	enable_preemption();
6980 	o = ops->zc_op_alloc(zid, flags & ~Z_NOFAIL);
6981 	if (__probable(o)) {
6982 		os_atomic_inc(&zone->z_elems_avail, relaxed);
6983 	} else if (__probable(flags & Z_NOFAIL)) {
6984 		zone_cache_t cache;
6985 		vm_offset_t index;
6986 		int cpu;
6987 
6988 		zone_lock(zone);
6989 
6990 		cpu   = cpu_number();
6991 		cache = zalloc_cached_get_pcpu_cache(zone, ops, cpu, flags);
6992 		o     = ZCACHE_ALLOC_RETRY;
6993 		if (__probable(cache)) {
6994 			index = --cache->zc_alloc_cur;
6995 			o     = (void *)cache->zc_alloc_elems[index];
6996 			cache->zc_alloc_elems[index] = 0;
6997 			o = ops->zc_op_mark_valid(zid, o);
6998 		} else if (zone->z_elems_free == 0) {
6999 			__ZONE_EXHAUSTED_AND_WAITING_HARD__(zone);
7000 		}
7001 
7002 		zone_unlock(zone);
7003 	}
7004 
7005 	return o;
7006 }
7007 
7008 __attribute__((always_inline))
7009 static zstack_t
7010 zcache_alloc_n_ext(
7011 	zone_id_t               zid,
7012 	uint32_t                count,
7013 	zalloc_flags_t          flags,
7014 	zone_cache_ops_t        ops)
7015 {
7016 	zstack_t stack = { };
7017 	zone_cache_t cache;
7018 	zone_t zone;
7019 	int cpu;
7020 
7021 	disable_preemption();
7022 	cpu  = cpu_number();
7023 	zone = zone_by_id(zid);
7024 	zpercpu_get_cpu(zone->z_stats, cpu)->zs_mem_allocated +=
7025 	    count * zone_elem_inner_size(zone);
7026 
7027 	for (;;) {
7028 		cache = zalloc_cached_get_pcpu_cache(zone, ops, cpu, flags);
7029 		if (__probable(cache)) {
7030 			stack = zcache_alloc_stack_from_cpu(zid, cache, stack,
7031 			    count - stack.z_count, ops);
7032 			enable_preemption();
7033 		} else {
7034 			void *o;
7035 
7036 			if (ops) {
7037 				o = zcache_alloc_one(zid, flags, ops);
7038 			} else {
7039 				o = zalloc_item(zone, zone->z_stats, flags).addr;
7040 			}
7041 			if (__improbable(o == NULL)) {
7042 				return zcache_alloc_fail(zid, stack, count);
7043 			}
7044 			if (ops == NULL || o != ZCACHE_ALLOC_RETRY) {
7045 				zstack_push(&stack, o);
7046 			}
7047 		}
7048 
7049 		if (stack.z_count == count) {
7050 			break;
7051 		}
7052 
7053 		disable_preemption();
7054 		cpu = cpu_number();
7055 	}
7056 
7057 	ZALLOC_LOG(zone, stack.z_head, stack.z_count);
7058 
7059 	return stack;
7060 }
7061 
7062 zstack_t
7063 zalloc_n(zone_id_t zid, uint32_t count, zalloc_flags_t flags)
7064 {
7065 	return zcache_alloc_n_ext(zid, count, flags, NULL);
7066 }
7067 
7068 zstack_t
7069 (zcache_alloc_n)(
7070 	zone_id_t               zid,
7071 	uint32_t                count,
7072 	zalloc_flags_t          flags,
7073 	zone_cache_ops_t        ops)
7074 {
7075 	__builtin_assume(ops != NULL);
7076 	return zcache_alloc_n_ext(zid, count, flags, ops);
7077 }
7078 
7079 __attribute__((always_inline))
7080 void *
7081 zalloc(zone_t zov)
7082 {
7083 	return zalloc_flags(zov, Z_WAITOK);
7084 }
7085 
7086 __attribute__((always_inline))
7087 void *
7088 zalloc_noblock(zone_t zov)
7089 {
7090 	return zalloc_flags(zov, Z_NOWAIT);
7091 }
7092 
7093 void *
7094 (zalloc_flags)(zone_t zov, zalloc_flags_t flags)
7095 {
7096 	zone_t zone = zov->z_self;
7097 	zone_stats_t zstats = zov->z_stats;
7098 
7099 	assert(zone > &zone_array[ZONE_ID__LAST_RO]);
7100 	assert(!zone->z_percpu && !zone->z_permanent);
7101 	return zalloc_ext(zone, zstats, flags).addr;
7102 }
7103 
7104 __attribute__((always_inline))
7105 void *
7106 (zalloc_id)(zone_id_t zid, zalloc_flags_t flags)
7107 {
7108 	return (zalloc_flags)(zone_by_id(zid), flags);
7109 }
7110 
7111 void *
7112 (zalloc_ro)(zone_id_t zid, zalloc_flags_t flags)
7113 {
7114 	assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO);
7115 	zone_t zone = zone_by_id(zid);
7116 	zone_stats_t zstats = zone->z_stats;
7117 	struct kalloc_result kr;
7118 
7119 	kr = zalloc_ext(zone, zstats, flags);
7120 #if ZSECURITY_CONFIG(READ_ONLY)
7121 	assert(zone_security_array[zid].z_submap_idx == Z_SUBMAP_IDX_READ_ONLY);
7122 	if (kr.addr) {
7123 		zone_require_ro(zid, kr.size, kr.addr);
7124 	}
7125 #endif
7126 	return kr.addr;
7127 }
7128 
7129 #if ZSECURITY_CONFIG(READ_ONLY)
7130 
7131 __attribute__((always_inline))
7132 static bool
7133 from_current_stack(vm_offset_t addr, vm_size_t size)
7134 {
7135 	vm_offset_t start = (vm_offset_t)__builtin_frame_address(0);
7136 	vm_offset_t end = (start + kernel_stack_size - 1) & -kernel_stack_size;
7137 
7138 	addr = vm_memtag_canonicalize_address(addr);
7139 
7140 	return (addr >= start) && (addr + size < end);
7141 }
7142 
7143 /*
7144  * Check if an address is from const memory, i.e. the TEXT or DATA CONST segments,
7145  * or the SECURITY_READ_ONLY_LATE section.
7146  */
7147 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
7148 __attribute__((always_inline))
7149 static bool
7150 from_const_memory(const vm_offset_t addr, vm_size_t size)
7151 {
7152 	return rorgn_contains(addr, size, true);
7153 }
7154 #else /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
7155 __attribute__((always_inline))
7156 static bool
7157 from_const_memory(const vm_offset_t addr, vm_size_t size)
7158 {
7159 #pragma unused(addr, size)
7160 	return true;
7161 }
7162 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
7163 
7164 __abortlike
7165 static void
7166 zalloc_ro_mut_validation_panic(zone_id_t zid, void *elem,
7167     const vm_offset_t src, vm_size_t src_size)
7168 {
7169 	vm_offset_t stack_start = (vm_offset_t)__builtin_frame_address(0);
7170 	vm_offset_t stack_end = (stack_start + kernel_stack_size - 1) & -kernel_stack_size;
7171 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
7172 	extern vm_offset_t rorgn_begin;
7173 	extern vm_offset_t rorgn_end;
7174 #else
7175 	vm_offset_t const rorgn_begin = 0;
7176 	vm_offset_t const rorgn_end = 0;
7177 #endif
7178 
7179 	if (from_ro_map(src, src_size)) {
7180 		zone_t src_zone = &zone_array[zone_index_from_ptr((void *)src)];
7181 		zone_t dst_zone = &zone_array[zid];
7182 		panic("zalloc_ro_mut failed: source (%p) not from same zone as dst (%p)"
7183 		    " (expected: %s, actual: %s", (void *)src, elem, src_zone->z_name,
7184 		    dst_zone->z_name);
7185 	}
7186 
7187 	panic("zalloc_ro_mut failed: source (%p, phys %p) not from RO zone map (%p - %p), "
7188 	    "current stack (%p - %p) or const memory (phys %p - %p)",
7189 	    (void *)src, (void*)kvtophys(src),
7190 	    (void *)zone_info.zi_ro_range.min_address,
7191 	    (void *)zone_info.zi_ro_range.max_address,
7192 	    (void *)stack_start, (void *)stack_end,
7193 	    (void *)rorgn_begin, (void *)rorgn_end);
7194 }
7195 
7196 __attribute__((always_inline))
7197 static void
7198 zalloc_ro_mut_validate_src(zone_id_t zid, void *elem,
7199     const vm_offset_t src, vm_size_t src_size)
7200 {
7201 	if (from_current_stack(src, src_size) ||
7202 	    (from_ro_map(src, src_size) &&
7203 	    zid == zone_index_from_ptr((void *)src)) ||
7204 	    from_const_memory(src, src_size)) {
7205 		return;
7206 	}
7207 	zalloc_ro_mut_validation_panic(zid, elem, src, src_size);
7208 }
7209 
7210 #endif /* ZSECURITY_CONFIG(READ_ONLY) */
7211 
7212 __attribute__((noinline))
7213 void
7214 zalloc_ro_mut(zone_id_t zid, void *elem, vm_offset_t offset,
7215     const void *new_data, vm_size_t new_data_size)
7216 {
7217 	assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO);
7218 
7219 #if ZSECURITY_CONFIG(READ_ONLY)
7220 	bool skip_src_check = false;
7221 
7222 	/*
7223 	 * The OSEntitlements RO-zone is treated a little differently. For more
7224 	 * information: rdar://100518485.
7225 	 */
7226 	if (zid == ZONE_ID_AMFI_OSENTITLEMENTS) {
7227 		code_signing_config_t cs_config = 0;
7228 
7229 		code_signing_configuration(NULL, &cs_config);
7230 		if (cs_config & CS_CONFIG_CSM_ENABLED) {
7231 			skip_src_check = true;
7232 		}
7233 	}
7234 
7235 	if (skip_src_check == false) {
7236 		zalloc_ro_mut_validate_src(zid, elem, (vm_offset_t)new_data,
7237 		    new_data_size);
7238 	}
7239 	pmap_ro_zone_memcpy(zid, (vm_offset_t) elem, offset,
7240 	    (vm_offset_t) new_data, new_data_size);
7241 #else
7242 	(void)zid;
7243 	memcpy((void *)((uintptr_t)elem + offset), new_data, new_data_size);
7244 #endif
7245 }
7246 
7247 __attribute__((noinline))
7248 uint64_t
7249 zalloc_ro_mut_atomic(zone_id_t zid, void *elem, vm_offset_t offset,
7250     zro_atomic_op_t op, uint64_t value)
7251 {
7252 	assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO);
7253 
7254 #if ZSECURITY_CONFIG(READ_ONLY)
7255 	value = pmap_ro_zone_atomic_op(zid, (vm_offset_t)elem, offset, op, value);
7256 #else
7257 	(void)zid;
7258 	value = __zalloc_ro_mut_atomic((vm_offset_t)elem + offset, op, value);
7259 #endif
7260 	return value;
7261 }
7262 
7263 void
7264 zalloc_ro_clear(zone_id_t zid, void *elem, vm_offset_t offset, vm_size_t size)
7265 {
7266 	assert(zid >= ZONE_ID__FIRST_RO && zid <= ZONE_ID__LAST_RO);
7267 #if ZSECURITY_CONFIG(READ_ONLY)
7268 	pmap_ro_zone_bzero(zid, (vm_offset_t)elem, offset, size);
7269 #else
7270 	(void)zid;
7271 	bzero((void *)((uintptr_t)elem + offset), size);
7272 #endif
7273 }
7274 
7275 /*
7276  * This function will run in the PPL and needs to be robust
7277  * against an attacker with arbitrary kernel write.
7278  */
7279 
7280 #if ZSECURITY_CONFIG(READ_ONLY)
7281 
7282 __abortlike
7283 static void
7284 zone_id_require_ro_panic(zone_id_t zid, void *addr)
7285 {
7286 	struct zone_size_params p = zone_ro_size_params[zid];
7287 	vm_offset_t elem = (vm_offset_t)addr;
7288 	uint32_t zindex;
7289 	zone_t other;
7290 	zone_t zone = &zone_array[zid];
7291 
7292 	if (!from_ro_map(addr, 1)) {
7293 		panic("zone_require_ro failed: address not in a ro zone (addr: %p)", addr);
7294 	}
7295 
7296 	if (!Z_FAST_ALIGNED(PAGE_SIZE - (elem & PAGE_MASK), p.z_align_magic)) {
7297 		panic("zone_require_ro failed: element improperly aligned (addr: %p)", addr);
7298 	}
7299 
7300 	zindex = zone_index_from_ptr(addr);
7301 	other = &zone_array[zindex];
7302 	if (zindex >= os_atomic_load(&num_zones, relaxed) || !other->z_self) {
7303 		panic("zone_require_ro failed: invalid zone index %d "
7304 		    "(addr: %p, expected: %s%s)", zindex,
7305 		    addr, zone_heap_name(zone), zone->z_name);
7306 	} else {
7307 		panic("zone_require_ro failed: address in unexpected zone id %d (%s%s) "
7308 		    "(addr: %p, expected: %s%s)",
7309 		    zindex, zone_heap_name(other), other->z_name,
7310 		    addr, zone_heap_name(zone), zone->z_name);
7311 	}
7312 }
7313 
7314 #endif /* ZSECURITY_CONFIG(READ_ONLY) */
7315 
7316 __attribute__((always_inline))
7317 void
7318 zone_require_ro(zone_id_t zid, vm_size_t elem_size __unused, void *addr)
7319 {
7320 #if ZSECURITY_CONFIG(READ_ONLY)
7321 	struct zone_size_params p = zone_ro_size_params[zid];
7322 	vm_offset_t elem = (vm_offset_t)addr;
7323 
7324 	if (!from_ro_map(addr, 1) ||
7325 	    !Z_FAST_ALIGNED(PAGE_SIZE - (elem & PAGE_MASK), p.z_align_magic) ||
7326 	    zid != zone_meta_from_addr(elem)->zm_index) {
7327 		zone_id_require_ro_panic(zid, addr);
7328 	}
7329 #else
7330 #pragma unused(zid, addr)
7331 #endif
7332 }
7333 
7334 void *
7335 (zalloc_percpu)(union zone_or_view zov, zalloc_flags_t flags)
7336 {
7337 	zone_t zone = zov.zov_view->zv_zone;
7338 	zone_stats_t zstats = zov.zov_view->zv_stats;
7339 
7340 	assert(zone > &zone_array[ZONE_ID__LAST_RO]);
7341 	assert(zone->z_percpu);
7342 	flags |= Z_PCPU;
7343 	return (void *)__zpcpu_mangle(zalloc_ext(zone, zstats, flags).addr);
7344 }
7345 
7346 static void *
7347 _zalloc_permanent(zone_t zone, vm_size_t size, vm_offset_t mask)
7348 {
7349 	struct zone_page_metadata *page_meta;
7350 	vm_offset_t offs, addr;
7351 	zone_pva_t pva;
7352 
7353 	assert(ml_get_interrupts_enabled() ||
7354 	    ml_is_quiescing() ||
7355 	    debug_mode_active() ||
7356 	    startup_phase < STARTUP_SUB_EARLY_BOOT);
7357 
7358 	size = (size + mask) & ~mask;
7359 	assert(size <= PAGE_SIZE);
7360 
7361 	zone_lock(zone);
7362 	assert(zone->z_self == zone);
7363 
7364 	for (;;) {
7365 		pva = zone->z_pageq_partial;
7366 		while (!zone_pva_is_null(pva)) {
7367 			page_meta = zone_pva_to_meta(pva);
7368 			if (page_meta->zm_bump + size <= PAGE_SIZE) {
7369 				goto found;
7370 			}
7371 			pva = page_meta->zm_page_next;
7372 		}
7373 
7374 		zone_expand_locked(zone, Z_WAITOK);
7375 	}
7376 
7377 found:
7378 	offs = (uint16_t)((page_meta->zm_bump + mask) & ~mask);
7379 	page_meta->zm_bump = (uint16_t)(offs + size);
7380 	page_meta->zm_alloc_size += size;
7381 	zone->z_elems_free -= size;
7382 	zpercpu_get(zone->z_stats)->zs_mem_allocated += size;
7383 
7384 	if (page_meta->zm_alloc_size >= PAGE_SIZE - sizeof(vm_offset_t)) {
7385 		zone_meta_requeue(zone, &zone->z_pageq_full, page_meta);
7386 	}
7387 
7388 	zone_unlock(zone);
7389 
7390 	if (zone->z_tbi_tag) {
7391 		addr = vm_memtag_fixup_ptr(offs + zone_pva_to_addr(pva));
7392 	} else {
7393 		addr = offs + zone_pva_to_addr(pva);
7394 	}
7395 
7396 	DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
7397 	return (void *)addr;
7398 }
7399 
7400 static void *
7401 _zalloc_permanent_large(size_t size, vm_offset_t mask, vm_tag_t tag)
7402 {
7403 	vm_offset_t addr;
7404 
7405 	kernel_memory_allocate(kernel_map, &addr, size, mask,
7406 	    KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_ZERO, tag);
7407 
7408 	return (void *)addr;
7409 }
7410 
7411 void *
7412 zalloc_permanent_tag(vm_size_t size, vm_offset_t mask, vm_tag_t tag)
7413 {
7414 	if (size <= PAGE_SIZE) {
7415 		zone_t zone = &zone_array[ZONE_ID_PERMANENT];
7416 		return _zalloc_permanent(zone, size, mask);
7417 	}
7418 	return _zalloc_permanent_large(size, mask, tag);
7419 }
7420 
7421 void *
7422 zalloc_percpu_permanent(vm_size_t size, vm_offset_t mask)
7423 {
7424 	zone_t zone = &zone_array[ZONE_ID_PERCPU_PERMANENT];
7425 	return (void *)__zpcpu_mangle(_zalloc_permanent(zone, size, mask));
7426 }
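
/*
 * A minimal sketch of the permanent allocators above (illustrative only; the
 * sizes and the VM tag are arbitrary examples):
 *
 *	// Never freed; small requests are bump-allocated out of ZONE_ID_PERMANENT.
 *	void *table = zalloc_permanent_tag(128, sizeof(uint64_t) - 1,
 *	    VM_KERN_MEMORY_KALLOC);
 *
 *	// Per-CPU variant; the returned pointer is a mangled per-CPU handle.
 *	void *counters = zalloc_percpu_permanent(sizeof(uint32_t),
 *	    sizeof(uint32_t) - 1);
 *
 * The mask argument is an alignment mask, applied as (size + mask) & ~mask,
 * so sizeof(uint64_t) - 1 requests 8-byte alignment.
 */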
7427 
7428 /*! @} */
7429 #endif /* !ZALLOC_TEST */
7430 #pragma mark zone GC / trimming
7431 #if !ZALLOC_TEST
7432 
7433 static thread_call_data_t zone_trim_callout;
7434 EVENT_DEFINE(ZONE_EXHAUSTED);
7435 
7436 static void
7437 zone_reclaim_chunk(
7438 	zone_t                  z,
7439 	struct zone_page_metadata *meta,
7440 	uint32_t                free_count)
7441 {
7442 	vm_address_t page_addr;
7443 	vm_size_t    size_to_free;
7444 	uint32_t     bitmap_ref;
7445 	uint32_t     page_count;
7446 	zone_security_flags_t zsflags = zone_security_config(z);
7447 	bool         sequester = !z->z_destroyed;
7448 	bool         oob_guard = false;
7449 
7450 	if (zone_submap_is_sequestered(zsflags)) {
7451 		/*
7452 		 * If the entire map is sequestered, we can't return the VA.
7453 		 * It stays pinned to the zone forever.
7454 		 */
7455 		sequester = true;
7456 	}
7457 
7458 	zone_meta_queue_pop(z, &z->z_pageq_empty);
7459 
7460 	page_addr  = zone_meta_to_addr(meta);
7461 	page_count = meta->zm_chunk_len;
7462 	oob_guard  = meta->zm_guarded;
7463 
7464 	if (meta->zm_alloc_size) {
7465 		zone_metadata_corruption(z, meta, "alloc_size");
7466 	}
7467 	if (z->z_percpu) {
7468 		if (page_count != 1) {
7469 			zone_metadata_corruption(z, meta, "page_count");
7470 		}
7471 		size_to_free = ptoa(z->z_chunk_pages);
7472 		zone_remove_wired_pages(z, z->z_chunk_pages);
7473 	} else {
7474 		if (page_count > z->z_chunk_pages) {
7475 			zone_metadata_corruption(z, meta, "page_count");
7476 		}
7477 		if (page_count < z->z_chunk_pages) {
7478 			/* Dequeue non-populated VA from z_pageq_va */
7479 			zone_meta_remqueue(z, meta + page_count);
7480 		}
7481 		size_to_free = ptoa(page_count);
7482 		zone_remove_wired_pages(z, page_count);
7483 	}
7484 
7485 	zone_counter_sub(z, z_elems_free, free_count);
7486 	zone_counter_sub(z, z_elems_avail, free_count);
7487 	zone_counter_sub(z, z_wired_empty, page_count);
7488 	zone_counter_sub(z, z_wired_cur, page_count);
7489 
7490 	if (z->z_pcpu_cache == NULL) {
7491 		if (z->z_elems_free_min < free_count) {
7492 			z->z_elems_free_min = 0;
7493 		} else {
7494 			z->z_elems_free_min -= free_count;
7495 		}
7496 	}
7497 	if (z->z_elems_free_wma < free_count) {
7498 		z->z_elems_free_wma = 0;
7499 	} else {
7500 		z->z_elems_free_wma -= free_count;
7501 	}
7502 
7503 	bitmap_ref = 0;
7504 	if (sequester) {
7505 		if (meta->zm_inline_bitmap) {
7506 			for (int i = 0; i < meta->zm_chunk_len; i++) {
7507 				meta[i].zm_bitmap = 0;
7508 			}
7509 		} else {
7510 			bitmap_ref = meta->zm_bitmap;
7511 			meta->zm_bitmap = 0;
7512 		}
7513 		meta->zm_chunk_len = 0;
7514 	} else {
7515 		if (!meta->zm_inline_bitmap) {
7516 			bitmap_ref = meta->zm_bitmap;
7517 		}
7518 		zone_counter_sub(z, z_va_cur, z->z_percpu ? 1 : z->z_chunk_pages);
7519 		bzero(meta, sizeof(*meta) * (z->z_chunk_pages + oob_guard));
7520 	}
7521 
7522 #if CONFIG_ZLEAKS
7523 	if (__improbable(zleak_should_disable_for_zone(z) &&
7524 	    startup_phase >= STARTUP_SUB_THREAD_CALL)) {
7525 		thread_call_enter(&zone_leaks_callout);
7526 	}
7527 #endif /* CONFIG_ZLEAKS */
7528 
7529 	zone_unlock(z);
7530 
7531 	if (bitmap_ref) {
7532 		zone_bits_free(bitmap_ref);
7533 	}
7534 
7535 	/* Free the pages for metadata and account for them */
7536 #if KASAN_CLASSIC
7537 	if (z->z_percpu) {
7538 		for (uint32_t i = 0; i < z->z_chunk_pages; i++) {
7539 			kasan_zmem_remove(page_addr + ptoa(i), PAGE_SIZE,
7540 			    zone_elem_outer_size(z),
7541 			    zone_elem_outer_offs(z),
7542 			    zone_elem_redzone(z));
7543 		}
7544 	} else {
7545 		kasan_zmem_remove(page_addr, size_to_free,
7546 		    zone_elem_outer_size(z),
7547 		    zone_elem_outer_offs(z),
7548 		    zone_elem_redzone(z));
7549 	}
7550 #endif /* KASAN_CLASSIC */
7551 
7552 	if (sequester) {
7553 		kernel_memory_depopulate(page_addr, size_to_free,
7554 		    KMA_KOBJECT, VM_KERN_MEMORY_ZONE);
7555 	} else {
7556 		assert(zsflags.z_submap_idx != Z_SUBMAP_IDX_VM);
7557 		kmem_free(zone_submap(zsflags), page_addr,
7558 		    ptoa(z->z_chunk_pages + oob_guard));
7559 		if (oob_guard) {
7560 			os_atomic_dec(&zone_guard_pages, relaxed);
7561 		}
7562 	}
7563 
7564 	thread_yield_to_preemption();
7565 
7566 	zone_lock(z);
7567 
7568 	if (sequester) {
7569 		zone_meta_queue_push(z, &z->z_pageq_va, meta);
7570 	}
7571 }
7572 
7573 static void
7574 zone_reclaim_elements(zone_t z, uint16_t n, vm_offset_t *elems)
7575 {
7576 	z_debug_assert(n <= zc_mag_size());
7577 
7578 	for (uint16_t i = 0; i < n; i++) {
7579 		vm_offset_t addr = elems[i];
7580 		elems[i] = 0;
7581 		zfree_drop(z, addr);
7582 	}
7583 
7584 	z->z_elems_free += n;
7585 }
7586 
7587 static void
7588 zcache_reclaim_elements(zone_id_t zid, uint16_t n, vm_offset_t *elems)
7589 {
7590 	z_debug_assert(n <= zc_mag_size());
7591 	zone_cache_ops_t ops = zcache_ops[zid];
7592 
7593 	for (uint16_t i = 0; i < n; i++) {
7594 		vm_offset_t addr = elems[i];
7595 		elems[i] = 0;
7596 		addr = (vm_offset_t)ops->zc_op_mark_valid(zid, (void *)addr);
7597 		ops->zc_op_free(zid, (void *)addr);
7598 	}
7599 
7600 	os_atomic_sub(&zone_by_id(zid)->z_elems_avail, n, relaxed);
7601 }
7602 
7603 static void
7604 zone_depot_trim(zone_t z, uint32_t target, struct zone_depot *zd)
7605 {
7606 	zpercpu_foreach(zc, z->z_pcpu_cache) {
7607 		zone_depot_lock(zc);
7608 
7609 		if (zc->zc_depot.zd_full > (target + 1) / 2) {
7610 			uint32_t n = zc->zc_depot.zd_full - (target + 1) / 2;
7611 			zone_depot_move_full(zd, &zc->zc_depot, n, NULL);
7612 		}
7613 
7614 		if (zc->zc_depot.zd_empty > target / 2) {
7615 			uint32_t n = zc->zc_depot.zd_empty - target / 2;
7616 			zone_depot_move_empty(zd, &zc->zc_depot, n, NULL);
7617 		}
7618 
7619 		zone_depot_unlock(zc);
7620 	}
7621 }
7622 
7623 __enum_decl(zone_reclaim_mode_t, uint32_t, {
7624 	ZONE_RECLAIM_TRIM,
7625 	ZONE_RECLAIM_DRAIN,
7626 	ZONE_RECLAIM_DESTROY,
7627 });
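
/*
 * Summary of how these modes are reached (see the callers below):
 * ZONE_RECLAIM_TRIM is used by zone_gc_trim() and the automatic trim
 * callout, ZONE_RECLAIM_DRAIN by zone_drain()/zone_gc_drain(), and
 * ZONE_RECLAIM_DESTROY when a destructible zone is torn down.
 */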
7628 
7629 static void
7630 zone_reclaim_pcpu(zone_t z, zone_reclaim_mode_t mode, struct zone_depot *zd)
7631 {
7632 	uint32_t depot_max = 0;
7633 	bool cleanup = mode != ZONE_RECLAIM_TRIM;
7634 
7635 	if (z->z_depot_cleanup) {
7636 		z->z_depot_cleanup = false;
7637 		depot_max = z->z_depot_size;
7638 		cleanup = true;
7639 	}
7640 
7641 	if (cleanup) {
7642 		zone_depot_trim(z, depot_max, zd);
7643 	}
7644 
7645 	if (mode == ZONE_RECLAIM_DESTROY) {
7646 		zpercpu_foreach(zc, z->z_pcpu_cache) {
7647 			zone_reclaim_elements(z, zc->zc_alloc_cur,
7648 			    zc->zc_alloc_elems);
7649 			zone_reclaim_elements(z, zc->zc_free_cur,
7650 			    zc->zc_free_elems);
7651 			zc->zc_alloc_cur = zc->zc_free_cur = 0;
7652 		}
7653 
7654 		z->z_recirc_empty_min = 0;
7655 		z->z_recirc_empty_wma = 0;
7656 		z->z_recirc_full_min = 0;
7657 		z->z_recirc_full_wma = 0;
7658 		z->z_recirc_cont_cur = 0;
7659 		z->z_recirc_cont_wma = 0;
7660 	}
7661 }
7662 
7663 static void
7664 zone_reclaim_recirc_drain(zone_t z, struct zone_depot *zd)
7665 {
7666 	assert(zd->zd_empty == 0);
7667 	assert(zd->zd_full == 0);
7668 
7669 	zone_recirc_lock_nopreempt(z);
7670 
7671 	*zd = z->z_recirc;
7672 	if (zd->zd_full == 0) {
7673 		zd->zd_tail = &zd->zd_head;
7674 	}
7675 	zone_depot_init(&z->z_recirc);
7676 	z->z_recirc_empty_min = 0;
7677 	z->z_recirc_empty_wma = 0;
7678 	z->z_recirc_full_min = 0;
7679 	z->z_recirc_full_wma = 0;
7680 
7681 	zone_recirc_unlock_nopreempt(z);
7682 }
7683 
7684 static void
7685 zone_reclaim_recirc_trim(zone_t z, struct zone_depot *zd)
7686 {
7687 	for (;;) {
7688 		uint32_t budget = zc_free_batch_size();
7689 		uint32_t count;
7690 		bool done = true;
7691 
7692 		zone_recirc_lock_nopreempt(z);
7693 		count = MIN(z->z_recirc_empty_wma / Z_WMA_UNIT,
7694 		    z->z_recirc_empty_min);
7695 		assert(count <= z->z_recirc.zd_empty);
7696 
7697 		if (count > budget) {
7698 			count = budget;
7699 			done  = false;
7700 		}
7701 		if (count) {
7702 			budget -= count;
7703 			zone_depot_move_empty(zd, &z->z_recirc, count, NULL);
7704 			z->z_recirc_empty_min -= count;
7705 			z->z_recirc_empty_wma -= count * Z_WMA_UNIT;
7706 		}
7707 
7708 		count = MIN(z->z_recirc_full_wma / Z_WMA_UNIT,
7709 		    z->z_recirc_full_min);
7710 		assert(count <= z->z_recirc.zd_full);
7711 
7712 		if (count > budget) {
7713 			count = budget;
7714 			done  = false;
7715 		}
7716 		if (count) {
7717 			zone_depot_move_full(zd, &z->z_recirc, count, NULL);
7718 			z->z_recirc_full_min -= count;
7719 			z->z_recirc_full_wma -= count * Z_WMA_UNIT;
7720 		}
7721 
7722 		zone_recirc_unlock_nopreempt(z);
7723 
7724 		if (done) {
7725 			return;
7726 		}
7727 
7728 		/*
7729 		 * If the number of magazines to reclaim is too large,
7730 		 * we might be keeping preemption disabled for too long.
7731 		 *
7732 		 * Drop and retake the lock to allow for preemption to occur.
7733 		 */
7734 		zone_unlock(z);
7735 		zone_lock(z);
7736 	}
7737 }
7738 
7739 /*!
7740  * @function zone_reclaim
7741  *
7742  * @brief
7743  * Drains or trims the zone.
7744  *
7745  * @discussion
7746  * Draining the zone frees all of its elements.
7747  *
7748  * Trimming the zone tries to respect the working set size, and avoids draining
7749  * the depot when it's not necessary.
7750  *
7751  * @param z             The zone to reclaim from
7752  * @param mode          The purpose of this reclaim.
7753  */
7754 static void
7755 zone_reclaim(zone_t z, zone_reclaim_mode_t mode)
7756 {
7757 	struct zone_depot zd;
7758 
7759 	zone_depot_init(&zd);
7760 
7761 	zone_lock(z);
7762 
7763 	if (mode == ZONE_RECLAIM_DESTROY) {
7764 		if (!z->z_destructible || z->z_elems_rsv) {
7765 			panic("zdestroy: Zone %s%s isn't destructible",
7766 			    zone_heap_name(z), z->z_name);
7767 		}
7768 
7769 		if (!z->z_self || z->z_expander ||
7770 		    z->z_async_refilling || z->z_expanding_wait) {
7771 			panic("zdestroy: Zone %s%s in an invalid state for destruction",
7772 			    zone_heap_name(z), z->z_name);
7773 		}
7774 
7775 #if !KASAN_CLASSIC
7776 		/*
7777 		 * Unset the valid bit. We'll hit an assert failure on further
7778 		 * operations on this zone, until zinit() is called again.
7779 		 *
7780 		 * Leave the zone valid for KASan as we will see zfree's on
7781 		 * quarantined free elements even after the zone is destroyed.
7782 		 */
7783 		z->z_self = NULL;
7784 #endif
7785 		z->z_destroyed = true;
7786 	} else if (z->z_destroyed) {
7787 		return zone_unlock(z);
7788 	} else if (zone_count_free(z) <= z->z_elems_rsv) {
7789 		/* If the zone is under its reserve level, leave it alone. */
7790 		return zone_unlock(z);
7791 	}
7792 
7793 	if (z->z_pcpu_cache) {
7794 		zone_magazine_t mag;
7795 		uint32_t freed = 0;
7796 
7797 		/*
7798 		 * This is all done with the zone lock held on purpose.
7799 		 * The work here is O(ncpu), which should still be short.
7800 		 *
7801 		 * We need to keep the lock held until we have reclaimed
7802 		 * at least a few magazines, otherwise if the zone has no
7803 		 * free elements outside of the depot, a thread performing
7804 		 * a concurrent allocation could try to grow the zone
7805 		 * while we're trying to drain it.
7806 		 */
7807 		if (mode == ZONE_RECLAIM_TRIM) {
7808 			zone_reclaim_recirc_trim(z, &zd);
7809 		} else {
7810 			zone_reclaim_recirc_drain(z, &zd);
7811 		}
7812 		zone_reclaim_pcpu(z, mode, &zd);
7813 
7814 		if (z->z_chunk_elems) {
7815 			zone_cache_t cache = zpercpu_get_cpu(z->z_pcpu_cache, 0);
7816 			smr_t smr = zone_cache_smr(cache);
7817 
7818 			while (zd.zd_full) {
7819 				mag = zone_depot_pop_head_full(&zd, NULL);
7820 				if (smr) {
7821 					smr_wait(smr, mag->zm_seq);
7822 					zalloc_cached_reuse_smr(z, cache, mag);
7823 					freed += zc_mag_size();
7824 				}
7825 				zone_reclaim_elements(z, zc_mag_size(),
7826 				    mag->zm_elems);
7827 				zone_depot_insert_head_empty(&zd, mag);
7828 
7829 				freed += zc_mag_size();
7830 				if (freed >= zc_free_batch_size()) {
7831 					zone_unlock(z);
7832 					zone_magazine_free_list(&zd);
7833 					thread_yield_to_preemption();
7834 					zone_lock(z);
7835 					freed = 0;
7836 				}
7837 			}
7838 		} else {
7839 			zone_id_t zid = zone_index(z);
7840 
7841 			zone_unlock(z);
7842 
7843 			assert(zid <= ZONE_ID__FIRST_DYNAMIC && zcache_ops[zid]);
7844 
7845 			while (zd.zd_full) {
7846 				mag = zone_depot_pop_head_full(&zd, NULL);
7847 				zcache_reclaim_elements(zid, zc_mag_size(),
7848 				    mag->zm_elems);
7849 				zone_magazine_free(mag);
7850 			}
7851 
7852 			goto cleanup;
7853 		}
7854 	}
7855 
7856 	while (!zone_pva_is_null(z->z_pageq_empty)) {
7857 		struct zone_page_metadata *meta;
7858 		uint32_t count, limit = z->z_elems_rsv * 5 / 4;
7859 
7860 		if (mode == ZONE_RECLAIM_TRIM && z->z_pcpu_cache == NULL) {
7861 			limit = MAX(limit, z->z_elems_free -
7862 			    MIN(z->z_elems_free_min, z->z_elems_free_wma));
7863 		}
7864 
7865 		meta  = zone_pva_to_meta(z->z_pageq_empty);
7866 		count = (uint32_t)ptoa(meta->zm_chunk_len) / zone_elem_outer_size(z);
7867 
7868 		if (zone_count_free(z) - count < limit) {
7869 			break;
7870 		}
7871 
7872 		zone_reclaim_chunk(z, meta, count);
7873 	}
7874 
7875 	zone_unlock(z);
7876 
7877 cleanup:
7878 	zone_magazine_free_list(&zd);
7879 }
7880 
7881 void
7882 zone_drain(zone_t zone)
7883 {
7884 	current_thread()->options |= TH_OPT_ZONE_PRIV;
7885 	lck_mtx_lock(&zone_gc_lock);
7886 	zone_reclaim(zone, ZONE_RECLAIM_DRAIN);
7887 	lck_mtx_unlock(&zone_gc_lock);
7888 	current_thread()->options &= ~TH_OPT_ZONE_PRIV;
7889 }
7890 
7891 void
7892 zcache_drain(zone_id_t zid)
7893 {
7894 	zone_drain(zone_by_id(zid));
7895 }
7896 
7897 static void
7898 zone_reclaim_all(zone_reclaim_mode_t mode)
7899 {
7900 	/*
7901 	 * Start with zcaches, so that they flow into the regular zones.
7902 	 *
7903 	 * Then reclaim the zones whose VA is sequestered, since depopulating
7904 	 * their pages does not need to allocate vm map entries for holes,
7905 	 * which gives memory back to the system faster.
7906 	 */
7907 	for (zone_id_t zid = ZONE_ID__LAST_RO + 1; zid < ZONE_ID__FIRST_DYNAMIC; zid++) {
7908 		zone_t z = zone_by_id(zid);
7909 
7910 		if (z->z_self && z->z_chunk_elems == 0) {
7911 			zone_reclaim(z, mode);
7912 		}
7913 	}
7914 	zone_index_foreach(zid) {
7915 		zone_t z = zone_by_id(zid);
7916 
7917 		if (z == zc_magazine_zone || z->z_chunk_elems == 0) {
7918 			continue;
7919 		}
7920 		if (zone_submap_is_sequestered(zone_security_array[zid]) &&
7921 		    z->collectable) {
7922 			zone_reclaim(z, mode);
7923 		}
7924 	}
7925 
7926 	zone_index_foreach(zid) {
7927 		zone_t z = zone_by_id(zid);
7928 
7929 		if (z == zc_magazine_zone || z->z_chunk_elems == 0) {
7930 			continue;
7931 		}
7932 		if (!zone_submap_is_sequestered(zone_security_array[zid]) &&
7933 		    z->collectable) {
7934 			zone_reclaim(z, mode);
7935 		}
7936 	}
7937 
7938 	zone_reclaim(zc_magazine_zone, mode);
7939 }
7940 
7941 void
7942 zone_userspace_reboot_checks(void)
7943 {
7944 	vm_size_t label_zone_size = zone_size_allocated(ipc_service_port_label_zone);
7945 	if (label_zone_size != 0) {
7946 		panic("Zone %s should be empty upon userspace reboot. Actual size: %lu.",
7947 		    ipc_service_port_label_zone->z_name, (unsigned long)label_zone_size);
7948 	}
7949 }
7950 
7951 void
7952 zone_gc(zone_gc_level_t level)
7953 {
7954 	zone_reclaim_mode_t mode;
7955 	zone_t largest_zone = NULL;
7956 
7957 	switch (level) {
7958 	case ZONE_GC_TRIM:
7959 		mode = ZONE_RECLAIM_TRIM;
7960 		break;
7961 	case ZONE_GC_DRAIN:
7962 		mode = ZONE_RECLAIM_DRAIN;
7963 		break;
7964 	case ZONE_GC_JETSAM:
7965 		largest_zone = kill_process_in_largest_zone();
7966 		mode = ZONE_RECLAIM_TRIM;
7967 		break;
7968 	}
7969 
7970 	current_thread()->options |= TH_OPT_ZONE_PRIV;
7971 	lck_mtx_lock(&zone_gc_lock);
7972 
7973 	zone_reclaim_all(mode);
7974 
7975 	if (level == ZONE_GC_JETSAM && zone_map_nearing_exhaustion()) {
7976 		/*
7977 		 * If we possibly killed a process, but we're still critical,
7978 		 * we need to drain harder.
7979 		 */
7980 		zone_reclaim(largest_zone, ZONE_RECLAIM_DRAIN);
7981 		zone_reclaim_all(ZONE_RECLAIM_DRAIN);
7982 	}
7983 
7984 	lck_mtx_unlock(&zone_gc_lock);
7985 	current_thread()->options &= ~TH_OPT_ZONE_PRIV;
7986 }
7987 
7988 void
7989 zone_gc_trim(void)
7990 {
7991 	zone_gc(ZONE_GC_TRIM);
7992 }
7993 
7994 void
7995 zone_gc_drain(void)
7996 {
7997 	zone_gc(ZONE_GC_DRAIN);
7998 }
7999 
8000 static bool
8001 zone_trim_needed(zone_t z)
8002 {
8003 	if (z->z_depot_cleanup) {
8004 		return true;
8005 	}
8006 
8007 	if (z->z_async_refilling) {
8008 		/* Don't fight with refill */
8009 		return false;
8010 	}
8011 
8012 	if (z->z_pcpu_cache) {
8013 		uint32_t e_n, f_n;
8014 
8015 		e_n = MIN(z->z_recirc_empty_wma, z->z_recirc_empty_min * Z_WMA_UNIT);
8016 		f_n = MIN(z->z_recirc_full_wma, z->z_recirc_full_min * Z_WMA_UNIT);
8017 
8018 		if (e_n > zc_autotrim_buckets() * Z_WMA_UNIT) {
8019 			return true;
8020 		}
8021 
8022 		if (f_n * zc_mag_size() > z->z_elems_rsv * Z_WMA_UNIT &&
8023 		    f_n * zc_mag_size() * zone_elem_inner_size(z) >
8024 		    zc_autotrim_size() * Z_WMA_UNIT) {
8025 			return true;
8026 		}
8027 
8028 		return false;
8029 	}
8030 
8031 	if (!zone_pva_is_null(z->z_pageq_empty)) {
8032 		uint32_t n;
8033 
8034 		n = MIN(z->z_elems_free_wma, z->z_elems_free_min);
8035 
8036 		return n >= z->z_elems_rsv + z->z_chunk_elems;
8037 	}
8038 
8039 	return false;
8040 }
8041 
8042 static void
8043 zone_trim_async(__unused thread_call_param_t p0, __unused thread_call_param_t p1)
8044 {
8045 	current_thread()->options |= TH_OPT_ZONE_PRIV;
8046 
8047 	zone_foreach(z) {
8048 		if (!z->collectable || z == zc_magazine_zone) {
8049 			continue;
8050 		}
8051 
8052 		if (zone_trim_needed(z)) {
8053 			lck_mtx_lock(&zone_gc_lock);
8054 			zone_reclaim(z, ZONE_RECLAIM_TRIM);
8055 			lck_mtx_unlock(&zone_gc_lock);
8056 		}
8057 	}
8058 
8059 	if (zone_trim_needed(zc_magazine_zone)) {
8060 		lck_mtx_lock(&zone_gc_lock);
8061 		zone_reclaim(zc_magazine_zone, ZONE_RECLAIM_TRIM);
8062 		lck_mtx_unlock(&zone_gc_lock);
8063 	}
8064 
8065 	current_thread()->options &= ~TH_OPT_ZONE_PRIV;
8066 }
8067 
8068 void
8069 compute_zone_working_set_size(__unused void *param)
8070 {
8071 	uint32_t zc_auto = zc_enable_level();
8072 	bool needs_trim = false;
8073 
8074 	/*
8075 	 * Keep zone caching disabled until the first proc is made.
8076 	 */
8077 	if (__improbable(zone_caching_disabled < 0)) {
8078 		return;
8079 	}
8080 
8081 	zone_caching_disabled = vm_pool_low();
8082 
8083 	if (os_mul_overflow(zc_auto, Z_WMA_UNIT, &zc_auto)) {
8084 		zc_auto = 0;
8085 	}
8086 
8087 	zone_foreach(z) {
8088 		uint32_t old, wma, cur;
8089 		bool needs_caching = false;
8090 
8091 		if (z->z_self != z) {
8092 			continue;
8093 		}
8094 
8095 		zone_lock(z);
8096 
8097 		zone_recirc_lock_nopreempt(z);
8098 
8099 		if (z->z_pcpu_cache) {
8100 			wma = Z_WMA_MIX(z->z_recirc_empty_wma, z->z_recirc_empty_min);
8101 			z->z_recirc_empty_min = z->z_recirc.zd_empty;
8102 			z->z_recirc_empty_wma = wma;
8103 		} else {
8104 			wma = Z_WMA_MIX(z->z_elems_free_wma, z->z_elems_free_min);
8105 			z->z_elems_free_min = z->z_elems_free;
8106 			z->z_elems_free_wma = wma;
8107 		}
8108 
8109 		wma = Z_WMA_MIX(z->z_recirc_full_wma, z->z_recirc_full_min);
8110 		z->z_recirc_full_min = z->z_recirc.zd_full;
8111 		z->z_recirc_full_wma = wma;
8112 
8113 		/* fixed point decimal of contentions per second */
8114 		old = z->z_recirc_cont_wma;
8115 		cur = z->z_recirc_cont_cur * Z_WMA_UNIT /
8116 		    (zpercpu_count() * ZONE_WSS_UPDATE_PERIOD);
8117 		cur = (3 * old + cur) / 4;
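		/*
		 * (3 * old + cur) / 4 below is an exponential moving average
		 * with alpha = 1/4: e.g. a previous average of old == 8 and an
		 * instantaneous rate of cur == 16 (both in Z_WMA_UNIT fixed
		 * point) yield 10, so spikes decay over a few
		 * ZONE_WSS_UPDATE_PERIOD intervals.
		 */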
8118 		zone_recirc_unlock_nopreempt(z);
8119 
8120 		if (z->z_pcpu_cache) {
8121 			uint16_t size = z->z_depot_size;
8122 
8123 			if (zone_exhausted(z)) {
8124 				if (z->z_depot_size) {
8125 					z->z_depot_size = 0;
8126 					z->z_depot_cleanup = true;
8127 				}
8128 			} else if (size < z->z_depot_limit && cur > zc_grow_level()) {
8129 				/*
8130 				 * lose history on purpose now
8131 				 * that we just grew, to give
8132 				 * the system time to adjust.
8133 				 */
8134 				cur  = (zc_grow_level() + zc_shrink_level()) / 2;
8135 				size = size ? (3 * size + 2) / 2 : 2;
8136 				z->z_depot_size = MIN(z->z_depot_limit, size);
8137 			} else if (size > 0 && cur <= zc_shrink_level()) {
8138 				/*
8139 				 * lose history on purpose now
8140 				 * that we just shrunk, to give
8141 				 * the system time to adjust.
8142 				 */
8143 				cur = (zc_grow_level() + zc_shrink_level()) / 2;
8144 				z->z_depot_size = size - 1;
8145 				z->z_depot_cleanup = true;
8146 			}
8147 		} else if (!z->z_nocaching && !zone_exhaustible(z) && zc_auto &&
8148 		    old >= zc_auto && cur >= zc_auto) {
8149 			needs_caching = true;
8150 		}
8151 
8152 		z->z_recirc_cont_wma = cur;
8153 		z->z_recirc_cont_cur = 0;
8154 
8155 		if (!needs_trim && zone_trim_needed(z)) {
8156 			needs_trim = true;
8157 		}
8158 
8159 		zone_unlock(z);
8160 
8161 		if (needs_caching) {
8162 			zone_enable_caching(z);
8163 		}
8164 	}
8165 
8166 	if (needs_trim) {
8167 		thread_call_enter(&zone_trim_callout);
8168 	}
8169 }
8170 
8171 #endif /* !ZALLOC_TEST */
8172 #pragma mark vm integration, MIG routines
8173 #if !ZALLOC_TEST
8174 
8175 extern unsigned int stack_total;
8176 #if defined (__x86_64__)
8177 extern unsigned int inuse_ptepages_count;
8178 #endif
8179 
8180 static const char *
8181 panic_print_get_typename(kalloc_type_views_t cur, kalloc_type_views_t *next,
8182     bool is_kt_var)
8183 {
8184 	if (is_kt_var) {
8185 		next->ktv_var = (kalloc_type_var_view_t) cur.ktv_var->kt_next;
8186 		return cur.ktv_var->kt_name;
8187 	} else {
8188 		next->ktv_fixed = (kalloc_type_view_t) cur.ktv_fixed->kt_zv.zv_next;
8189 		return cur.ktv_fixed->kt_zv.zv_name;
8190 	}
8191 }
8192 
8193 static void
8194 panic_print_types_in_zone(zone_t z, const char* debug_str)
8195 {
8196 	kalloc_type_views_t kt_cur = {};
8197 	const char *prev_type = "";
8198 	size_t skip_over_site = sizeof("site.") - 1;
8199 	zone_security_flags_t zsflags = zone_security_config(z);
8200 	bool is_kt_var = false;
8201 
8202 	if (zsflags.z_kheap_id == KHEAP_ID_KT_VAR) {
8203 		uint32_t heap_id = KT_VAR_PTR_HEAP0 + ((zone_index(z) -
8204 		    kalloc_type_heap_array[KT_VAR_PTR_HEAP0].kh_zstart) / KHEAP_NUM_ZONES);
8205 		kt_cur.ktv_var = kalloc_type_heap_array[heap_id].kt_views;
8206 		is_kt_var = true;
8207 	} else {
8208 		kt_cur.ktv_fixed = (kalloc_type_view_t) z->z_views;
8209 	}
8210 
8211 	paniclog_append_noflush("kalloc %s in zone, %s (%s):\n",
8212 	    is_kt_var? "type arrays" : "types", debug_str, z->z_name);
8213 
8214 	while (kt_cur.ktv_fixed) {
8215 		kalloc_type_views_t kt_next = {};
8216 		const char *typename = panic_print_get_typename(kt_cur, &kt_next,
8217 		    is_kt_var) + skip_over_site;
8218 		if (strcmp(typename, prev_type) != 0) {
8219 			paniclog_append_noflush("\t%-50s\n", typename);
8220 			prev_type = typename;
8221 		}
8222 		kt_cur = kt_next;
8223 	}
8224 	paniclog_append_noflush("\n");
8225 }
8226 
8227 static void
8228 panic_display_kalloc_types(void)
8229 {
8230 	if (kalloc_type_src_zone) {
8231 		panic_print_types_in_zone(kalloc_type_src_zone, "addr belongs to");
8232 	}
8233 	if (kalloc_type_dst_zone) {
8234 		panic_print_types_in_zone(kalloc_type_dst_zone,
8235 		    "addr is being freed to");
8236 	}
8237 }
8238 
8239 static void
8240 zone_find_n_largest(const uint32_t n, zone_t *largest_zones,
8241     uint64_t *zone_size)
8242 {
8243 	zone_index_foreach(zid) {
8244 		zone_t z = &zone_array[zid];
8245 		vm_offset_t size = zone_size_wired(z);
8246 
8247 		if (zid == ZONE_ID_VM_PAGES) {
8248 			continue;
8249 		}
8250 		for (uint32_t i = 0; i < n; i++) {
8251 			if (size > zone_size[i]) {
8252 				largest_zones[i] = z;
8253 				zone_size[i] = size;
8254 				break;
8255 			}
8256 		}
8257 	}
8258 }
8259 
8260 #define NUM_LARGEST_ZONES 5
8261 static void
8262 panic_display_largest_zones(void)
8263 {
8264 	zone_t largest_zones[NUM_LARGEST_ZONES]  = { NULL };
8265 	uint64_t largest_size[NUM_LARGEST_ZONES] = { 0 };
8266 
8267 	zone_find_n_largest(NUM_LARGEST_ZONES, (zone_t *) &largest_zones,
8268 	    (uint64_t *) &largest_size);
8269 
8270 	paniclog_append_noflush("Largest zones:\n%-28s %10s %10s\n",
8271 	    "Zone Name", "Cur Size", "Free Size");
8272 	for (uint32_t i = 0; i < NUM_LARGEST_ZONES; i++) {
8273 		zone_t z = largest_zones[i];
8274 		paniclog_append_noflush("%-8s%-20s %9u%c %9u%c\n",
8275 		    zone_heap_name(z), z->z_name,
8276 		    mach_vm_size_pretty(largest_size[i]),
8277 		    mach_vm_size_unit(largest_size[i]),
8278 		    mach_vm_size_pretty(zone_size_free(z)),
8279 		    mach_vm_size_unit(zone_size_free(z)));
8280 	}
8281 }
8282 
8283 static void
8284 panic_display_zprint(void)
8285 {
8286 	panic_display_largest_zones();
8287 	paniclog_append_noflush("%-20s %10lu\n", "Kernel Stacks",
8288 	    (uintptr_t)(kernel_stack_size * stack_total));
8289 #if defined (__x86_64__)
8290 	paniclog_append_noflush("%-20s %10lu\n", "PageTables",
8291 	    (uintptr_t)ptoa(inuse_ptepages_count));
8292 #endif
8293 	paniclog_append_noflush("%-20s %10llu\n", "Kalloc.Large",
8294 	    counter_load(&kalloc_large_total));
8295 
8296 	if (panic_kext_memory_info) {
8297 		mach_memory_info_t *mem_info = panic_kext_memory_info;
8298 
8299 		paniclog_append_noflush("\n%-5s %10s\n", "Kmod", "Size");
8300 		for (uint32_t i = 0; i < panic_kext_memory_size / sizeof(mem_info[0]); i++) {
8301 			if ((mem_info[i].flags & VM_KERN_SITE_TYPE) != VM_KERN_SITE_KMOD) {
8302 				continue;
8303 			}
8304 			if (mem_info[i].size > (1024 * 1024)) {
8305 				paniclog_append_noflush("%-5lld %10lld\n",
8306 				    mem_info[i].site, mem_info[i].size);
8307 			}
8308 		}
8309 	}
8310 }
8311 
8312 static void
8313 panic_display_zone_info(void)
8314 {
8315 	paniclog_append_noflush("Zone info:\n");
8316 	paniclog_append_noflush("  Zone map: %p - %p\n",
8317 	    (void *)zone_info.zi_map_range.min_address,
8318 	    (void *)zone_info.zi_map_range.max_address);
8319 #if CONFIG_PROB_GZALLOC
8320 	if (pgz_submap) {
8321 		paniclog_append_noflush("  . PGZ   : %p - %p\n",
8322 		    (void *)pgz_submap->min_offset,
8323 		    (void *)pgz_submap->max_offset);
8324 	}
8325 #endif /* CONFIG_PROB_GZALLOC */
8326 	for (int i = 0; i < Z_SUBMAP_IDX_COUNT; i++) {
8327 		vm_map_t map = zone_submaps[i];
8328 
8329 		if (map == VM_MAP_NULL) {
8330 			continue;
8331 		}
8332 		paniclog_append_noflush("  . %-6s: %p - %p\n",
8333 		    zone_submaps_names[i],
8334 		    (void *)map->min_offset,
8335 		    (void *)map->max_offset);
8336 	}
8337 	paniclog_append_noflush("  Metadata: %p - %p\n"
8338 	    "  Bitmaps : %p - %p\n"
8339 	    "  Extra   : %p - %p\n"
8340 	    "\n",
8341 	    (void *)zone_info.zi_meta_range.min_address,
8342 	    (void *)zone_info.zi_meta_range.max_address,
8343 	    (void *)zone_info.zi_bits_range.min_address,
8344 	    (void *)zone_info.zi_bits_range.max_address,
8345 	    (void *)zone_info.zi_xtra_range.min_address,
8346 	    (void *)zone_info.zi_xtra_range.max_address);
8347 }
8348 
8349 static void
8350 panic_display_zone_fault(vm_offset_t addr)
8351 {
8352 	struct zone_page_metadata meta = { };
8353 	vm_map_t map = VM_MAP_NULL;
8354 	vm_offset_t oob_offs = 0, size = 0;
8355 	int map_idx = -1;
8356 	zone_t z = NULL;
8357 	const char *kind = "wild deref";
8358 	bool oob = false;
8359 
8360 	/*
8361 	 * First: look if we bumped into guard pages between submaps
8362 	 */
8363 	for (int i = 0; i < Z_SUBMAP_IDX_COUNT; i++) {
8364 		map = zone_submaps[i];
8365 		if (map == VM_MAP_NULL) {
8366 			continue;
8367 		}
8368 
8369 		if (addr >= map->min_offset && addr < map->max_offset) {
8370 			map_idx = i;
8371 			break;
8372 		}
8373 	}
8374 
8375 	if (map_idx == -1) {
8376 		/* this really shouldn't happen, submaps are back to back */
8377 		return;
8378 	}
8379 
8380 	paniclog_append_noflush("Probabilistic GZAlloc Report:\n");
8381 
8382 	/*
8383 	 * Second: look if there's just no metadata at all
8384 	 */
8385 	if (ml_nofault_copy((vm_offset_t)zone_meta_from_addr(addr),
8386 	    (vm_offset_t)&meta, sizeof(meta)) != sizeof(meta) ||
8387 	    meta.zm_index == 0 || meta.zm_index >= MAX_ZONES ||
8388 	    zone_array[meta.zm_index].z_self == NULL) {
8389 		paniclog_append_noflush("  Zone    : <unknown>\n");
8390 		kind = "wild deref, missing or invalid metadata";
8391 	} else {
8392 		z = &zone_array[meta.zm_index];
8393 		paniclog_append_noflush("  Zone    : %s%s\n",
8394 		    zone_heap_name(z), zone_name(z));
8395 		if (meta.zm_chunk_len == ZM_PGZ_GUARD) {
8396 			kind = "out-of-bounds (high confidence)";
8397 			oob = true;
8398 			size = zone_element_size((void *)addr,
8399 			    &z, false, &oob_offs);
8400 		} else {
8401 			kind = "use-after-free (medium confidence)";
8402 		}
8403 	}
8404 
8405 	paniclog_append_noflush("  Address : %p\n", (void *)addr);
8406 	if (oob) {
8407 		paniclog_append_noflush("  Element : [%p, %p) of size %d\n",
8408 		    (void *)(trunc_page(addr) - (size - oob_offs)),
8409 		    (void *)trunc_page(addr), (uint32_t)(size - oob_offs));
8410 	}
8411 	paniclog_append_noflush("  Submap  : %s [%p; %p)\n",
8412 	    zone_submaps_names[map_idx],
8413 	    (void *)map->min_offset, (void *)map->max_offset);
8414 	paniclog_append_noflush("  Kind    : %s\n", kind);
8415 	if (oob) {
8416 		paniclog_append_noflush("  Access  : %d byte(s) past\n",
8417 		    (uint32_t)(addr & PAGE_MASK) + 1);
8418 	}
8419 	paniclog_append_noflush("  Metadata: zid:%d inl:%d cl:0x%x "
8420 	    "0x%04x 0x%08x 0x%08x 0x%08x\n",
8421 	    meta.zm_index, meta.zm_inline_bitmap, meta.zm_chunk_len,
8422 	    meta.zm_alloc_size, meta.zm_bitmap,
8423 	    meta.zm_page_next.packed_address,
8424 	    meta.zm_page_prev.packed_address);
8425 	paniclog_append_noflush("\n");
8426 }
8427 
8428 void
8429 panic_display_zalloc(void)
8430 {
8431 	bool keepsyms = false;
8432 
8433 	PE_parse_boot_argn("keepsyms", &keepsyms, sizeof(keepsyms));
8434 
8435 	panic_display_zone_info();
8436 
8437 	if (panic_fault_address) {
8438 #if CONFIG_PROB_GZALLOC
8439 		if (pgz_owned(panic_fault_address)) {
8440 			panic_display_pgz_uaf_info(keepsyms, panic_fault_address);
8441 		} else
8442 #endif /* CONFIG_PROB_GZALLOC */
8443 		if (zone_maps_owned(panic_fault_address, 1)) {
8444 			panic_display_zone_fault(panic_fault_address);
8445 		}
8446 	}
8447 
8448 	if (panic_include_zprint) {
8449 		panic_display_zprint();
8450 	} else if (zone_map_nearing_threshold(ZONE_MAP_EXHAUSTION_PRINT_PANIC)) {
8451 		panic_display_largest_zones();
8452 	}
8453 #if CONFIG_ZLEAKS
8454 	if (zleak_active) {
8455 		panic_display_zleaks(keepsyms);
8456 	}
8457 #endif
8458 	if (panic_include_kalloc_types) {
8459 		panic_display_kalloc_types();
8460 	}
8461 }
8462 
8463 /*
8464  * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
8465  * requesting zone information.
8466  * Frees unused pages towards the end of the region, and zeroes out unused
8467  * space on the last page.
8468  */
8469 static vm_map_copy_t
8470 create_vm_map_copy(
8471 	vm_offset_t             start_addr,
8472 	vm_size_t               total_size,
8473 	vm_size_t               used_size)
8474 {
8475 	kern_return_t   kr;
8476 	vm_offset_t             end_addr;
8477 	vm_size_t               free_size;
8478 	vm_map_copy_t   copy;
8479 
8480 	if (used_size != total_size) {
8481 		end_addr = start_addr + used_size;
8482 		free_size = total_size - (round_page(end_addr) - start_addr);
8483 
8484 		if (free_size >= PAGE_SIZE) {
8485 			kmem_free(ipc_kernel_map,
8486 			    round_page(end_addr), free_size);
8487 		}
8488 		bzero((char *) end_addr, round_page(end_addr) - end_addr);
8489 	}
8490 
8491 	kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
8492 	    (vm_map_size_t)used_size, TRUE, &copy);
8493 	assert(kr == KERN_SUCCESS);
8494 
8495 	return copy;
8496 }
8497 
8498 static boolean_t
8499 get_zone_info(
8500 	zone_t                   z,
8501 	mach_zone_name_t        *zn,
8502 	mach_zone_info_t        *zi)
8503 {
8504 	struct zone zcopy;
8505 	vm_size_t cached = 0;
8506 
8507 	assert(z != ZONE_NULL);
8508 	zone_lock(z);
8509 	if (!z->z_self) {
8510 		zone_unlock(z);
8511 		return FALSE;
8512 	}
8513 	zcopy = *z;
8514 	if (z->z_pcpu_cache) {
8515 		zpercpu_foreach(zc, z->z_pcpu_cache) {
8516 			cached += zc->zc_alloc_cur + zc->zc_free_cur;
8517 			cached += zc->zc_depot.zd_full * zc_mag_size();
8518 		}
8519 	}
8520 	zone_unlock(z);
8521 
8522 	if (zn != NULL) {
8523 		/*
8524 		 * Append kalloc heap name to zone name (if zone is used by kalloc)
8525 		 */
8526 		char temp_zone_name[MAX_ZONE_NAME] = "";
8527 		snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
8528 		    zone_heap_name(z), z->z_name);
8529 
8530 		/* assuming here the name data is static */
8531 		(void) __nosan_strlcpy(zn->mzn_name, temp_zone_name,
8532 		    strlen(temp_zone_name) + 1);
8533 	}
8534 
8535 	if (zi != NULL) {
8536 		*zi = (mach_zone_info_t) {
8537 			.mzi_count = zone_count_allocated(&zcopy) - cached,
8538 			.mzi_cur_size = ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_cur)),
8539 			// max_size for zprint is now high-watermark of pages used
8540 			.mzi_max_size = ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_hwm)),
8541 			.mzi_elem_size = zone_scale_for_percpu(&zcopy, zcopy.z_elem_size),
8542 			.mzi_alloc_size = ptoa_64(zcopy.z_chunk_pages),
8543 			.mzi_exhaustible = (uint64_t)zone_exhaustible(&zcopy),
8544 		};
8545 		if (zcopy.z_chunk_pages == 0) {
8546 			/* this is a zcache */
8547 			zi->mzi_cur_size = zcopy.z_elems_avail * zcopy.z_elem_size;
8548 		}
8549 		zpercpu_foreach(zs, zcopy.z_stats) {
8550 			zi->mzi_sum_size += zs->zs_mem_allocated;
8551 		}
8552 		if (zcopy.collectable) {
8553 			SET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable,
8554 			    ptoa_64(zone_scale_for_percpu(&zcopy, zcopy.z_wired_empty)));
8555 			SET_MZI_COLLECTABLE_FLAG(zi->mzi_collectable, TRUE);
8556 		}
8557 	}
8558 
8559 	return TRUE;
8560 }
8561 
8562 /* mach_memory_info entitlement */
8563 #define MEMORYINFO_ENTITLEMENT "com.apple.private.memoryinfo"
8564 
8565 /* macro needed to rate-limit mach_memory_info */
8566 #define NSEC_DAY (NSEC_PER_SEC * 60 * 60 * 24)
8567 
8568 /* declarations necessary to call kauth_cred_issuser() */
8569 struct ucred;
8570 extern int kauth_cred_issuser(struct ucred *);
8571 extern struct ucred *kauth_cred_get(void);
8572 
8573 static kern_return_t
8574 mach_memory_info_internal(
8575 	host_t                  host,
8576 	mach_zone_name_array_t  *namesp,
8577 	mach_msg_type_number_t  *namesCntp,
8578 	mach_zone_info_array_t  *infop,
8579 	mach_msg_type_number_t  *infoCntp,
8580 	mach_memory_info_array_t *memoryInfop,
8581 	mach_msg_type_number_t   *memoryInfoCntp,
8582 	bool                     redact_info);
8583 
8584 static kern_return_t
8585 mach_memory_info_security_check(bool redact_info)
8586 {
8587 	/* If not root, only allow redacted calls. */
8588 	if (!kauth_cred_issuser(kauth_cred_get()) && !redact_info) {
8589 		return KERN_NO_ACCESS;
8590 	}
8591 
8592 	if (PE_srd_fused) {
8593 		return KERN_SUCCESS;
8594 	}
8595 
8596 	/* If does not have the memory entitlement, fail. */
8597 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
8598 	if (!IOTaskHasEntitlement(current_task(), MEMORYINFO_ENTITLEMENT)) {
8599 		return KERN_DENIED;
8600 	}
8601 
8602 	/*
8603 	 * On release non-mac arm devices, allow mach_memory_info
8604 	 * to be called twice per day per boot. memorymaintenanced
8605 	 * calls it once per day, which leaves room for a sysdiagnose.
8606 	 * Allow redacted version to be called without rate limit.
8607 	 */
8608 
8609 	if (!redact_info) {
8610 		static uint64_t first_call = 0, second_call = 0;
8611 		uint64_t now = 0;
8612 		absolutetime_to_nanoseconds(ml_get_timebase(), &now);
8613 
8614 		if (!first_call) {
8615 			first_call = now;
8616 		} else if (!second_call) {
8617 			second_call = now;
8618 		} else if (first_call + NSEC_DAY > now) {
8619 			return KERN_DENIED;
8620 		} else if (first_call + NSEC_DAY < now) {
8621 			first_call = now;
8622 			second_call = 0;
8623 		}
8624 	}
8625 #endif
8626 
8627 	return KERN_SUCCESS;
8628 }
8629 
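/*
 * Rough user-space usage sketch (assuming the MIG-generated prototype from
 * <mach/mach_host.h>; error handling and vm_deallocate() of the out-of-line
 * reply arrays are omitted). names[i].mzn_name describes info[i], and
 * name_cnt == info_cnt:
 *
 *	mach_zone_name_array_t  names;
 *	mach_zone_info_array_t  info;
 *	mach_msg_type_number_t  name_cnt, info_cnt;
 *
 *	kern_return_t kr = mach_zone_info(mach_host_self(),
 *	    &names, &name_cnt, &info, &info_cnt);
 */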
8630 kern_return_t
8631 mach_zone_info(
8632 	mach_port_t             host_port,
8633 	mach_zone_name_array_t  *namesp,
8634 	mach_msg_type_number_t  *namesCntp,
8635 	mach_zone_info_array_t  *infop,
8636 	mach_msg_type_number_t  *infoCntp)
8637 {
8638 	return mach_memory_info(host_port, namesp, namesCntp, infop, infoCntp, NULL, NULL);
8639 }
8640 
8641 kern_return_t
8642 mach_memory_info(
8643 	mach_port_t             host_port,
8644 	mach_zone_name_array_t  *namesp,
8645 	mach_msg_type_number_t  *namesCntp,
8646 	mach_zone_info_array_t  *infop,
8647 	mach_msg_type_number_t  *infoCntp,
8648 	mach_memory_info_array_t *memoryInfop,
8649 	mach_msg_type_number_t   *memoryInfoCntp)
8650 {
8651 	bool redact_info = false;
8652 	host_t host = HOST_NULL;
8653 
8654 	host = convert_port_to_host_priv(host_port);
8655 	if (host == HOST_NULL) {
8656 		redact_info = true;
8657 		host = convert_port_to_host(host_port);
8658 	}
8659 
8660 	return mach_memory_info_internal(host, namesp, namesCntp, infop, infoCntp, memoryInfop, memoryInfoCntp, redact_info);
8661 }
8662 
8663 static void
8664 zone_info_redact(mach_zone_info_t *zi)
8665 {
8666 	zi->mzi_cur_size = 0;
8667 	zi->mzi_max_size = 0;
8668 	zi->mzi_alloc_size = 0;
8669 	zi->mzi_sum_size = 0;
8670 	zi->mzi_collectable = 0;
8671 }
8672 
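/*
 * Redaction support: when the caller is only entitled to redacted info,
 * per-zone sizes are zeroed (zone_info_redact) and kalloc_type /
 * KHEAP_ID_KT_VAR zones are folded into a single "kalloc.<size>" entry per
 * element size, so only aggregate element counts per size class are exposed
 * instead of the per-type breakdown.
 */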
8673 static bool
8674 zone_info_needs_to_be_coalesced(int zone_index)
8675 {
8676 	zone_security_flags_t zsflags = zone_security_array[zone_index];
8677 	if (zsflags.z_kalloc_type || zsflags.z_kheap_id == KHEAP_ID_KT_VAR) {
8678 		return true;
8679 	}
8680 	return false;
8681 }
8682 
8683 static bool
8684 zone_info_find_coalesce_zone(
8685 	mach_zone_info_t *zi,
8686 	mach_zone_info_t *info,
8687 	int              *coalesce,
8688 	int              coalesce_count,
8689 	int              *coalesce_index)
8690 {
8691 	for (int i = 0; i < coalesce_count; i++) {
8692 		if (zi->mzi_elem_size == info[coalesce[i]].mzi_elem_size) {
8693 			*coalesce_index = coalesce[i];
8694 			return true;
8695 		}
8696 	}
8697 
8698 	return false;
8699 }
8700 
8701 static void
8702 zone_info_coalesce(
8703 	mach_zone_info_t *info,
8704 	int coalesce_index,
8705 	mach_zone_info_t *zi)
8706 {
8707 	info[coalesce_index].mzi_count += zi->mzi_count;
8708 }
8709 
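/*
 * Workhorse behind mach_zone_info() and mach_memory_info(): allocates
 * pageable buffers in ipc_kernel_map, fills one mach_zone_name_t /
 * mach_zone_info_t pair per zone (optionally redacted and coalesced, see
 * above), hands the results back as vm_map_copy objects, and, when
 * requested, appends the vm_page_diagnose() wired-memory breakdown.
 */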
8710 static kern_return_t
8711 mach_memory_info_internal(
8712 	host_t                  host,
8713 	mach_zone_name_array_t  *namesp,
8714 	mach_msg_type_number_t  *namesCntp,
8715 	mach_zone_info_array_t  *infop,
8716 	mach_msg_type_number_t  *infoCntp,
8717 	mach_memory_info_array_t *memoryInfop,
8718 	mach_msg_type_number_t   *memoryInfoCntp,
8719 	bool                     redact_info)
8720 {
8721 	mach_zone_name_t        *names;
8722 	vm_offset_t             names_addr;
8723 	vm_size_t               names_size;
8724 
8725 	mach_zone_info_t        *info;
8726 	vm_offset_t             info_addr;
8727 	vm_size_t               info_size;
8728 
8729 	int                     *coalesce;
8730 	vm_offset_t             coalesce_addr;
8731 	vm_size_t               coalesce_size;
8732 	int                     coalesce_count = 0;
8733 
8734 	mach_memory_info_t      *memory_info;
8735 	vm_offset_t             memory_info_addr;
8736 	vm_size_t               memory_info_size;
8737 	vm_size_t               memory_info_vmsize;
8738 	unsigned int            num_info;
8739 
8740 	unsigned int            max_zones, used_zones, i;
8741 	mach_zone_name_t        *zn;
8742 	mach_zone_info_t        *zi;
8743 	kern_return_t           kr;
8744 
8745 	uint64_t                zones_collectable_bytes = 0;
8746 
8747 	if (host == HOST_NULL) {
8748 		return KERN_INVALID_HOST;
8749 	}
8750 
8751 	kr = mach_memory_info_security_check(redact_info);
8752 	if (kr != KERN_SUCCESS) {
8753 		return kr;
8754 	}
8755 
8756 	/*
8757 	 *	We assume that zones aren't freed once allocated.
8758 	 *	We won't pick up any zones that are allocated later.
8759 	 */
8760 
8761 	max_zones = os_atomic_load(&num_zones, relaxed);
8762 
8763 	names_size = round_page(max_zones * sizeof *names);
8764 	kr = kmem_alloc(ipc_kernel_map, &names_addr, names_size,
8765 	    KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC);
8766 	if (kr != KERN_SUCCESS) {
8767 		return kr;
8768 	}
8769 	names = (mach_zone_name_t *) names_addr;
8770 
8771 	info_size = round_page(max_zones * sizeof *info);
8772 	kr = kmem_alloc(ipc_kernel_map, &info_addr, info_size,
8773 	    KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC);
8774 	if (kr != KERN_SUCCESS) {
8775 		kmem_free(ipc_kernel_map,
8776 		    names_addr, names_size);
8777 		return kr;
8778 	}
8779 	info = (mach_zone_info_t *) info_addr;
8780 
8781 	if (redact_info) {
8782 		coalesce_size = round_page(max_zones * sizeof *coalesce);
8783 		kr = kmem_alloc(ipc_kernel_map, &coalesce_addr, coalesce_size,
8784 		    KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC);
8785 		if (kr != KERN_SUCCESS) {
8786 			kmem_free(ipc_kernel_map,
8787 			    names_addr, names_size);
8788 			kmem_free(ipc_kernel_map,
8789 			    info_addr, info_size);
8790 			return kr;
8791 		}
8792 		coalesce = (int *)coalesce_addr;
8793 	}
8794 
8795 	zn = &names[0];
8796 	zi = &info[0];
8797 
8798 	used_zones = 0;
8799 	for (i = 0; i < max_zones; i++) {
8800 		if (!get_zone_info(&(zone_array[i]), zn, zi)) {
8801 			continue;
8802 		}
8803 
8804 		if (!redact_info) {
8805 			zones_collectable_bytes += GET_MZI_COLLECTABLE_BYTES(zi->mzi_collectable);
8806 			zn++;
8807 			zi++;
8808 			used_zones++;
8809 			continue;
8810 		}
8811 
8812 		zone_info_redact(zi);
8813 		if (!zone_info_needs_to_be_coalesced(i)) {
8814 			zn++;
8815 			zi++;
8816 			used_zones++;
8817 			continue;
8818 		}
8819 
8820 		int coalesce_index;
8821 		bool found_coalesce_zone = zone_info_find_coalesce_zone(zi, info,
8822 		    coalesce, coalesce_count, &coalesce_index);
8823 
8824 		/* Didn't find a zone to coalesce */
8825 		if (!found_coalesce_zone) {
8826 			/* Updates the zone name */
8827 			__nosan_bzero(zn->mzn_name, MAX_ZONE_NAME);
8828 			snprintf(zn->mzn_name, MAX_ZONE_NAME, "kalloc.%d",
8829 			    (int)zi->mzi_elem_size);
8830 
8831 			coalesce[coalesce_count] = used_zones;
8832 			coalesce_count++;
8833 			zn++;
8834 			zi++;
8835 			used_zones++;
8836 			continue;
8837 		}
8838 
8839 		zone_info_coalesce(info, coalesce_index, zi);
8840 	}
8841 
8842 	if (redact_info) {
8843 		kmem_free(ipc_kernel_map, coalesce_addr, coalesce_size);
8844 	}
8845 
8846 	*namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
8847 	*namesCntp = used_zones;
8848 
8849 	*infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
8850 	*infoCntp = used_zones;
8851 
8852 	num_info = 0;
8853 	memory_info_addr = 0;
8854 
8855 	if (memoryInfop && memoryInfoCntp) {
8856 		vm_map_copy_t           copy;
8857 		num_info = vm_page_diagnose_estimate();
8858 		memory_info_size = num_info * sizeof(*memory_info);
8859 		memory_info_vmsize = round_page(memory_info_size);
8860 		kr = kmem_alloc(ipc_kernel_map, &memory_info_addr, memory_info_vmsize,
8861 		    KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC);
8862 		if (kr != KERN_SUCCESS) {
8863 			return kr;
8864 		}
8865 
8866 		kr = vm_map_wire_kernel(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize,
8867 		    VM_PROT_READ | VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
8868 		assert(kr == KERN_SUCCESS);
8869 
8870 		memory_info = (mach_memory_info_t *) memory_info_addr;
8871 		vm_page_diagnose(memory_info, num_info, zones_collectable_bytes, redact_info);
8872 
8873 		kr = vm_map_unwire(ipc_kernel_map, memory_info_addr, memory_info_addr + memory_info_vmsize, FALSE);
8874 		assert(kr == KERN_SUCCESS);
8875 
8876 		kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)memory_info_addr,
8877 		    (vm_map_size_t)memory_info_size, TRUE, &copy);
8878 		assert(kr == KERN_SUCCESS);
8879 
8880 		*memoryInfop = (mach_memory_info_t *) copy;
8881 		*memoryInfoCntp = num_info;
8882 	}
8883 
8884 	return KERN_SUCCESS;
8885 }
8886 
8887 kern_return_t
8888 mach_zone_info_for_zone(
8889 	host_priv_t                     host,
8890 	mach_zone_name_t        name,
8891 	mach_zone_info_t        *infop)
8892 {
8893 	zone_t zone_ptr;
8894 
8895 	if (host == HOST_NULL) {
8896 		return KERN_INVALID_HOST;
8897 	}
8898 
8899 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
8900 	if (!PE_i_can_has_debugger(NULL)) {
8901 		return KERN_INVALID_HOST;
8902 	}
8903 #endif
8904 
8905 	if (infop == NULL) {
8906 		return KERN_INVALID_ARGUMENT;
8907 	}
8908 
8909 	zone_ptr = ZONE_NULL;
8910 	zone_foreach(z) {
8911 		/*
8912 		 * Append kalloc heap name to zone name (if zone is used by kalloc)
8913 		 */
8914 		char temp_zone_name[MAX_ZONE_NAME] = "";
8915 		snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
8916 		    zone_heap_name(z), z->z_name);
8917 
8918 		/* Find the requested zone by name */
8919 		if (track_this_zone(temp_zone_name, name.mzn_name)) {
8920 			zone_ptr = z;
8921 			break;
8922 		}
8923 	}
8924 
8925 	/* No zones found with the requested zone name */
8926 	if (zone_ptr == ZONE_NULL) {
8927 		return KERN_INVALID_ARGUMENT;
8928 	}
8929 
8930 	if (get_zone_info(zone_ptr, NULL, infop)) {
8931 		return KERN_SUCCESS;
8932 	}
8933 	return KERN_FAILURE;
8934 }
8935 
8936 kern_return_t
8937 mach_zone_info_for_largest_zone(
8938 	host_priv_t                     host,
8939 	mach_zone_name_t        *namep,
8940 	mach_zone_info_t        *infop)
8941 {
8942 	if (host == HOST_NULL) {
8943 		return KERN_INVALID_HOST;
8944 	}
8945 
8946 #if CONFIG_DEBUGGER_FOR_ZONE_INFO
8947 	if (!PE_i_can_has_debugger(NULL)) {
8948 		return KERN_INVALID_HOST;
8949 	}
8950 #endif
8951 
8952 	if (namep == NULL || infop == NULL) {
8953 		return KERN_INVALID_ARGUMENT;
8954 	}
8955 
8956 	if (get_zone_info(zone_find_largest(NULL), namep, infop)) {
8957 		return KERN_SUCCESS;
8958 	}
8959 	return KERN_FAILURE;
8960 }
8961 
8962 uint64_t
8963 get_zones_collectable_bytes(void)
8964 {
8965 	uint64_t zones_collectable_bytes = 0;
8966 	mach_zone_info_t zi;
8967 
8968 	zone_foreach(z) {
8969 		if (get_zone_info(z, NULL, &zi)) {
8970 			zones_collectable_bytes +=
8971 			    GET_MZI_COLLECTABLE_BYTES(zi.mzi_collectable);
8972 		}
8973 	}
8974 
8975 	return zones_collectable_bytes;
8976 }
8977 
8978 kern_return_t
8979 mach_zone_get_zlog_zones(
8980 	host_priv_t                             host,
8981 	mach_zone_name_array_t  *namesp,
8982 	mach_msg_type_number_t  *namesCntp)
8983 {
8984 #if ZALLOC_ENABLE_LOGGING
8985 	unsigned int max_zones, logged_zones, i;
8986 	kern_return_t kr;
8987 	zone_t zone_ptr;
8988 	mach_zone_name_t *names;
8989 	vm_offset_t names_addr;
8990 	vm_size_t names_size;
8991 
8992 	if (host == HOST_NULL) {
8993 		return KERN_INVALID_HOST;
8994 	}
8995 
8996 	if (namesp == NULL || namesCntp == NULL) {
8997 		return KERN_INVALID_ARGUMENT;
8998 	}
8999 
9000 	max_zones = os_atomic_load(&num_zones, relaxed);
9001 
9002 	names_size = round_page(max_zones * sizeof *names);
9003 	kr = kmem_alloc(ipc_kernel_map, &names_addr, names_size,
9004 	    KMA_PAGEABLE | KMA_DATA, VM_KERN_MEMORY_IPC);
9005 	if (kr != KERN_SUCCESS) {
9006 		return kr;
9007 	}
9008 	names = (mach_zone_name_t *) names_addr;
9009 
9010 	zone_ptr = ZONE_NULL;
9011 	logged_zones = 0;
9012 	for (i = 0; i < max_zones; i++) {
9013 		zone_t z = &(zone_array[i]);
9014 		assert(z != ZONE_NULL);
9015 
9016 		/* Copy out the zone name if zone logging is enabled */
9017 		if (z->z_btlog) {
9018 			get_zone_info(z, &names[logged_zones], NULL);
9019 			logged_zones++;
9020 		}
9021 	}
9022 
9023 	*namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
9024 	*namesCntp = logged_zones;
9025 
9026 	return KERN_SUCCESS;
9027 
9028 #else /* ZALLOC_ENABLE_LOGGING */
9029 #pragma unused(host, namesp, namesCntp)
9030 	return KERN_FAILURE;
9031 #endif /* ZALLOC_ENABLE_LOGGING */
9032 }
9033 
9034 kern_return_t
9035 mach_zone_get_btlog_records(
9036 	host_priv_t             host,
9037 	mach_zone_name_t        name,
9038 	zone_btrecord_array_t  *recsp,
9039 	mach_msg_type_number_t *numrecs)
9040 {
9041 #if ZALLOC_ENABLE_LOGGING
9042 	zone_btrecord_t *recs;
9043 	kern_return_t    kr;
9044 	vm_address_t     addr;
9045 	vm_size_t        size;
9046 	zone_t           zone_ptr;
9047 	vm_map_copy_t    copy;
9048 
9049 	if (host == HOST_NULL) {
9050 		return KERN_INVALID_HOST;
9051 	}
9052 
9053 	if (recsp == NULL || numrecs == NULL) {
9054 		return KERN_INVALID_ARGUMENT;
9055 	}
9056 
9057 	zone_ptr = ZONE_NULL;
9058 	zone_foreach(z) {
9059 		/*
9060 		 * Append kalloc heap name to zone name (if zone is used by kalloc)
9061 		 */
9062 		char temp_zone_name[MAX_ZONE_NAME] = "";
9063 		snprintf(temp_zone_name, MAX_ZONE_NAME, "%s%s",
9064 		    zone_heap_name(z), z->z_name);
9065 
9066 		/* Find the requested zone by name */
9067 		if (track_this_zone(temp_zone_name, name.mzn_name)) {
9068 			zone_ptr = z;
9069 			break;
9070 		}
9071 	}
9072 
9073 	/* No zones found with the requested zone name */
9074 	if (zone_ptr == ZONE_NULL) {
9075 		return KERN_INVALID_ARGUMENT;
9076 	}
9077 
9078 	/* Logging not turned on for the requested zone */
9079 	if (!zone_ptr->z_btlog) {
9080 		return KERN_FAILURE;
9081 	}
9082 
9083 	kr = btlog_get_records(zone_ptr->z_btlog, &recs, numrecs);
9084 	if (kr != KERN_SUCCESS) {
9085 		return kr;
9086 	}
9087 
9088 	addr = (vm_address_t)recs;
9089 	size = sizeof(zone_btrecord_t) * *numrecs;
9090 
9091 	kr = vm_map_copyin(ipc_kernel_map, addr, size, TRUE, &copy);
9092 	assert(kr == KERN_SUCCESS);
9093 
9094 	*recsp = (zone_btrecord_t *)copy;
9095 	return KERN_SUCCESS;
9096 
9097 #else /* !ZALLOC_ENABLE_LOGGING */
9098 #pragma unused(host, name, recsp, numrecs)
9099 	return KERN_FAILURE;
9100 #endif /* !ZALLOC_ENABLE_LOGGING */
9101 }
9102 
9103 
9104 kern_return_t
9105 mach_zone_force_gc(
9106 	host_t host)
9107 {
9108 	if (host == HOST_NULL) {
9109 		return KERN_INVALID_HOST;
9110 	}
9111 
9112 #if DEBUG || DEVELOPMENT
9113 	extern boolean_t(*volatile consider_buffer_cache_collect)(int);
9114 	/* Callout to buffer cache GC to drop elements in the apfs zones */
9115 	if (consider_buffer_cache_collect != NULL) {
9116 		(void)(*consider_buffer_cache_collect)(0);
9117 	}
9118 	zone_gc(ZONE_GC_DRAIN);
9119 #endif /* DEBUG || DEVELOPMENT */
9120 	return KERN_SUCCESS;
9121 }
9122 
9123 zone_t
9124 zone_find_largest(uint64_t *zone_size)
9125 {
9126 	zone_t    largest_zone  = 0;
9127 	uint64_t  largest_zone_size = 0;
9128 	zone_find_n_largest(1, &largest_zone, &largest_zone_size);
9129 	if (zone_size) {
9130 		*zone_size = largest_zone_size;
9131 	}
9132 	return largest_zone;
9133 }
9134 
9135 void
9136 zone_get_stats(
9137 	zone_t                  zone,
9138 	struct zone_basic_stats *stats)
9139 {
9140 	stats->zbs_avail = zone->z_elems_avail;
9141 
9142 	stats->zbs_alloc_fail = 0;
9143 	zpercpu_foreach(zs, zone->z_stats) {
9144 		stats->zbs_alloc_fail += zs->zs_alloc_fail;
9145 	}
9146 
9147 	stats->zbs_cached = 0;
9148 	if (zone->z_pcpu_cache) {
9149 		zpercpu_foreach(zc, zone->z_pcpu_cache) {
9150 			stats->zbs_cached += zc->zc_alloc_cur +
9151 			    zc->zc_free_cur +
9152 			    zc->zc_depot.zd_full * zc_mag_size();
9153 		}
9154 	}
9155 
9156 	stats->zbs_free = zone_count_free(zone) + stats->zbs_cached;
9157 
9158 	/*
9159 	 * Since we don't take any locks, deal with possible inconsistencies
9160 	 * as the counters may have changed.
9161 	 */
9162 	if (os_sub_overflow(stats->zbs_avail, stats->zbs_free,
9163 	    &stats->zbs_alloc)) {
9164 		stats->zbs_avail = stats->zbs_free;
9165 		stats->zbs_alloc = 0;
9166 	}
9167 }
9168 
9169 #endif /* !ZALLOC_TEST */
9170 #pragma mark zone creation, configuration, destruction
9171 #if !ZALLOC_TEST
9172 
9173 static zone_t
9174 zone_init_defaults(zone_id_t zid)
9175 {
9176 	zone_t z = &zone_array[zid];
9177 
9178 	z->z_wired_max = ~0u;
9179 	z->collectable = true;
9180 
9181 	hw_lck_ticket_init(&z->z_lock, &zone_locks_grp);
9182 	hw_lck_ticket_init(&z->z_recirc_lock, &zone_locks_grp);
9183 	zone_depot_init(&z->z_recirc);
9184 	return z;
9185 }
9186 
9187 void
9188 zone_set_exhaustible(zone_t zone, vm_size_t nelems, bool exhausts_by_design)
9189 {
9190 	zone_lock(zone);
9191 	zone->z_wired_max = zone_alloc_pages_for_nelems(zone, nelems);
9192 	zone->z_exhausts = exhausts_by_design;
9193 	zone_unlock(zone);
9194 }
9195 
9196 void
9197 zone_raise_reserve(union zone_or_view zov, uint16_t min_elements)
9198 {
9199 	zone_t zone = zov.zov_zone;
9200 
9201 	if (zone < zone_array || zone > &zone_array[MAX_ZONES]) {
9202 		zone = zov.zov_view->zv_zone;
9203 	} else {
9204 		zone = zov.zov_zone;
9205 	}
9206 
9207 	os_atomic_max(&zone->z_elems_rsv, min_elements, relaxed);
9208 }
9209 
9210 /**
9211  * @function zone_create_find
9212  *
9213  * @abstract
9214  * Finds an unused zone for the given name and element size.
9215  *
9216  * @param name          the zone name
9217  * @param size          the element size (including redzones, ...)
9218  * @param flags         the flags passed to @c zone_create*
9219  * @param zid_inout     the desired zone ID or ZONE_ID_ANY
9220  *
9221  * @returns             a zone to initialize further.
9222  */
9223 static zone_t
9224 zone_create_find(
9225 	const char             *name,
9226 	vm_size_t               size,
9227 	zone_create_flags_t     flags,
9228 	zone_id_t              *zid_inout)
9229 {
9230 	zone_id_t nzones, zid = *zid_inout;
9231 	zone_t z;
9232 
9233 	simple_lock(&all_zones_lock, &zone_locks_grp);
9234 
9235 	nzones = (zone_id_t)os_atomic_load(&num_zones, relaxed);
9236 	assert(num_zones_in_use <= nzones && nzones < MAX_ZONES);
9237 
9238 	if (__improbable(nzones < ZONE_ID__FIRST_DYNAMIC)) {
9239 		/*
9240 		 * The first time around, make sure the reserved zone IDs
9241 		 * have an initialized lock as zone_index_foreach() will
9242 		 * enumerate them.
9243 		 */
9244 		while (nzones < ZONE_ID__FIRST_DYNAMIC) {
9245 			zone_init_defaults(nzones++);
9246 		}
9247 
9248 		os_atomic_store(&num_zones, nzones, release);
9249 	}
9250 
9251 	if (zid != ZONE_ID_ANY) {
9252 		if (zid >= ZONE_ID__FIRST_DYNAMIC) {
9253 			panic("zone_create: invalid desired zone ID %d for %s",
9254 			    zid, name);
9255 		}
9256 		if (flags & ZC_DESTRUCTIBLE) {
9257 			panic("zone_create: ID %d (%s) must be permanent", zid, name);
9258 		}
9259 		if (zone_array[zid].z_self) {
9260 			panic("zone_create: creating zone ID %d (%s) twice", zid, name);
9261 		}
9262 		z = &zone_array[zid];
9263 	} else {
9264 		if (flags & ZC_DESTRUCTIBLE) {
9265 			/*
9266 			 * If possible, find a previously zdestroy'ed zone in the
9267 			 * zone_array that we can reuse.
9268 			 */
9269 			for (int i = bitmap_first(zone_destroyed_bitmap, MAX_ZONES);
9270 			    i >= 0; i = bitmap_next(zone_destroyed_bitmap, i)) {
9271 				z = &zone_array[i];
9272 
9273 				/*
9274 				 * If the zone name and the element size are the
9275 				 * same, we can just reuse the old zone struct.
9276 				 */
9277 				if (strcmp(z->z_name, name) ||
9278 				    zone_elem_outer_size(z) != size) {
9279 					continue;
9280 				}
9281 				bitmap_clear(zone_destroyed_bitmap, i);
9282 				z->z_destroyed = false;
9283 				z->z_self = z;
9284 				zid = (zone_id_t)i;
9285 				goto out;
9286 			}
9287 		}
9288 
9289 		zid = nzones++;
9290 		z = zone_init_defaults(zid);
9291 
9292 		/*
9293 		 * The release barrier pairs with the acquire in
9294 		 * zone_index_foreach() and makes sure that enumeration loops
9295 		 * always see an initialized zone lock.
9296 		 */
9297 		os_atomic_store(&num_zones, nzones, release);
9298 	}
9299 
9300 out:
9301 	num_zones_in_use++;
9302 	simple_unlock(&all_zones_lock);
9303 
9304 	*zid_inout = zid;
9305 	return z;
9306 }
9307 
9308 __abortlike
9309 static void
9310 zone_create_panic(const char *name, const char *f1, const char *f2)
9311 {
9312 	panic("zone_create: creating zone %s: flag %s and %s are incompatible",
9313 	    name, f1, f2);
9314 }
9315 #define zone_create_assert_not_both(name, flags, current_flag, forbidden_flag) \
9316 	if ((flags) & forbidden_flag) { \
9317 	        zone_create_panic(name, #current_flag, #forbidden_flag); \
9318 	}
9319 
9320 /*
9321  * Adjusts the size of the element based on minimum size, alignment
9322  * and kasan redzones
9323  */
9324 static vm_size_t
9325 zone_elem_adjust_size(
9326 	const char             *name __unused,
9327 	vm_size_t               elem_size,
9328 	zone_create_flags_t     flags __unused,
9329 	uint16_t               *redzone __unused)
9330 {
9331 	vm_size_t size;
9332 
9333 	/*
9334 	 * Adjust element size for minimum size and pointer alignment
9335 	 */
9336 	size = (elem_size + ZONE_ALIGN_SIZE - 1) & -ZONE_ALIGN_SIZE;
9337 	if (size < ZONE_MIN_ELEM_SIZE) {
9338 		size = ZONE_MIN_ELEM_SIZE;
9339 	}
9340 
9341 #if KASAN_CLASSIC
9342 	/*
9343 	 * Expand the zone allocation size to include the redzones.
9344 	 *
9345 	 * For page-multiple zones add a full guard page because they
9346 	 * likely require alignment.
9347 	 */
9348 	uint16_t redzone_tmp;
9349 	if (flags & (ZC_KASAN_NOREDZONE | ZC_PERCPU | ZC_OBJ_CACHE)) {
9350 		redzone_tmp = 0;
9351 	} else if ((size & PAGE_MASK) == 0) {
9352 		if (size != PAGE_SIZE && (flags & ZC_ALIGNMENT_REQUIRED)) {
9353 			panic("zone_create: zone %s can't provide more than PAGE_SIZE"
9354 			    " alignment", name);
9355 		}
9356 		redzone_tmp = PAGE_SIZE;
9357 	} else if (flags & ZC_ALIGNMENT_REQUIRED) {
9358 		redzone_tmp = 0;
9359 	} else {
9360 		redzone_tmp = KASAN_GUARD_SIZE;
9361 	}
9362 	size += redzone_tmp;
9363 	if (redzone) {
9364 		*redzone = redzone_tmp;
9365 	}
9366 #endif
9367 	return size;
9368 }
9369 
9370 /*
9371  * Returns the allocation chunk size that has the least fragmentation
9372  */
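/*
 * Illustrative example (hypothetical sizes): with 4K pages and a 1152-byte
 * element, a single-page chunk wastes 4096 % 1152 = 640 bytes (~15%), while
 * a two-page chunk wastes 8192 % 1152 = 128 bytes (~1.5%), so the search
 * below settles on the two-page granule.
 */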
9373 static vm_size_t
9374 zone_get_min_alloc_granule(
9375 	vm_size_t               elem_size,
9376 	zone_create_flags_t     flags)
9377 {
9378 	vm_size_t alloc_granule = PAGE_SIZE;
9379 	if (flags & ZC_PERCPU) {
9380 		alloc_granule = PAGE_SIZE * zpercpu_count();
9381 		if (PAGE_SIZE % elem_size > 256) {
9382 			panic("zone_create: per-cpu zone has too much fragmentation");
9383 		}
9384 	} else if (flags & ZC_READONLY) {
9385 		alloc_granule = PAGE_SIZE;
9386 	} else if ((elem_size & PAGE_MASK) == 0) {
9387 		/* zero fragmentation by definition */
9388 		alloc_granule = elem_size;
9389 	} else if (alloc_granule % elem_size == 0) {
9390 		/* zero fragmentation by definition */
9391 	} else {
9392 		vm_size_t frag = (alloc_granule % elem_size) * 100 / alloc_granule;
9393 		vm_size_t alloc_tmp = PAGE_SIZE;
9394 		vm_size_t max_chunk_size = ZONE_MAX_ALLOC_SIZE;
9395 
9396 #if __arm64__
9397 		/*
9398 		 * Increase chunk size to 48K for sizes larger than 4K on 16k
9399 		 * machines, so as to reduce internal fragmentation for kalloc
9400 		 * zones with sizes 12K and 24K.
9401 		 */
9402 		if (elem_size > 4 * 1024 && PAGE_SIZE == 16 * 1024) {
9403 			max_chunk_size = 48 * 1024;
9404 		}
9405 #endif
9406 		while ((alloc_tmp += PAGE_SIZE) <= max_chunk_size) {
9407 			vm_size_t frag_tmp = (alloc_tmp % elem_size) * 100 / alloc_tmp;
9408 			if (frag_tmp < frag) {
9409 				frag = frag_tmp;
9410 				alloc_granule = alloc_tmp;
9411 			}
9412 		}
9413 	}
9414 	return alloc_granule;
9415 }
9416 
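/*
 * Rounds the requested element count up to whole chunks. For example
 * (hypothetical values, ignoring KASAN redzone padding): a 512-byte element
 * with a 4K granule gives chunk_elems = 8, so min_elems = 20 rounds up to
 * 3 chunks, i.e. 12K.
 */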
9417 vm_size_t
9418 zone_get_early_alloc_size(
9419 	const char             *name __unused,
9420 	vm_size_t               elem_size,
9421 	zone_create_flags_t     flags,
9422 	vm_size_t               min_elems)
9423 {
9424 	vm_size_t adjusted_size, alloc_granule, chunk_elems;
9425 
9426 	adjusted_size = zone_elem_adjust_size(name, elem_size, flags, NULL);
9427 	alloc_granule = zone_get_min_alloc_granule(adjusted_size, flags);
9428 	chunk_elems   = alloc_granule / adjusted_size;
9429 
9430 	return ((min_elems + chunk_elems - 1) / chunk_elems) * alloc_granule;
9431 }
9432 
9433 zone_t
9434 zone_create_ext(
9435 	const char             *name,
9436 	vm_size_t               size,
9437 	zone_create_flags_t     flags,
9438 	zone_id_t               zid,
9439 	void                  (^extra_setup)(zone_t))
9440 {
9441 	zone_security_flags_t *zsflags;
9442 	uint16_t redzone;
9443 	zone_t z;
9444 
9445 	if (size > ZONE_MAX_ALLOC_SIZE) {
9446 		panic("zone_create: element size too large: %zd", (size_t)size);
9447 	}
9448 
9449 	if (size < 2 * sizeof(vm_size_t)) {
9450 		/* Elements are too small for kasan. */
9451 		flags |= ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE;
9452 	}
9453 
9454 	size = zone_elem_adjust_size(name, size, flags, &redzone);
9455 
9456 	/*
9457 	 * Allocate the zone slot, return early if we found an older match.
9458 	 */
9459 	z = zone_create_find(name, size, flags, &zid);
9460 	if (__improbable(z->z_self)) {
9461 		/* We found a zone to reuse */
9462 		return z;
9463 	}
9464 	zsflags = &zone_security_array[zid];
9465 
9466 	/*
9467 	 * Initialize the zone properly.
9468 	 */
9469 
9470 	/*
9471 	 * If the kernel is post lockdown, copy the zone name passed in.
9472 	 * Else simply maintain a pointer to the name string as it can only
9473 	 * be a core XNU zone (no unloadable kext exists before lockdown).
9474 	 */
9475 	if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
9476 		size_t nsz = MIN(strlen(name) + 1, MACH_ZONE_NAME_MAX_LEN);
9477 		char *buf = zalloc_permanent(nsz, ZALIGN_NONE);
9478 		strlcpy(buf, name, nsz);
9479 		z->z_name = buf;
9480 	} else {
9481 		z->z_name = name;
9482 	}
9483 	if (__probable(zone_array[ZONE_ID_PERCPU_PERMANENT].z_self)) {
9484 		z->z_stats = zalloc_percpu_permanent_type(struct zone_stats);
9485 	} else {
9486 		/*
9487 		 * zone_init() hasn't run yet, use the storage provided by
9488 		 * zone_stats_startup(), and zone_init() will replace it
9489 		 * with the final value once the PERCPU zone exists.
9490 		 */
9491 		z->z_stats = __zpcpu_mangle_for_boot(&zone_stats_startup[zone_index(z)]);
9492 	}
9493 
9494 	if (flags & ZC_OBJ_CACHE) {
9495 		zone_create_assert_not_both(name, flags, ZC_OBJ_CACHE, ZC_NOCACHING);
9496 		zone_create_assert_not_both(name, flags, ZC_OBJ_CACHE, ZC_PERCPU);
9497 		zone_create_assert_not_both(name, flags, ZC_OBJ_CACHE, ZC_NOGC);
9498 		zone_create_assert_not_both(name, flags, ZC_OBJ_CACHE, ZC_DESTRUCTIBLE);
9499 
9500 		z->z_elem_size   = (uint16_t)size;
9501 		z->z_chunk_pages = 0;
9502 		z->z_quo_magic   = 0;
9503 		z->z_align_magic = 0;
9504 		z->z_chunk_elems = 0;
9505 		z->z_elem_offs   = 0;
9506 		z->no_callout    = true;
9507 		zsflags->z_lifo  = true;
9508 	} else {
9509 		vm_size_t alloc = zone_get_min_alloc_granule(size, flags);
9510 
9511 		z->z_elem_size   = (uint16_t)(size - redzone);
9512 		z->z_chunk_pages = (uint16_t)atop(alloc);
9513 		z->z_quo_magic   = Z_MAGIC_QUO(size);
9514 		z->z_align_magic = Z_MAGIC_ALIGNED(size);
9515 		if (flags & ZC_PERCPU) {
9516 			z->z_chunk_elems = (uint16_t)(PAGE_SIZE / size);
9517 			z->z_elem_offs = (uint16_t)(PAGE_SIZE % size) + redzone;
9518 		} else {
9519 			z->z_chunk_elems = (uint16_t)(alloc / size);
9520 			z->z_elem_offs = (uint16_t)(alloc % size) + redzone;
9521 		}
9522 	}
9523 
9524 	/*
9525 	 * Handle KPI flags
9526 	 */
9527 
9528 	/* ZC_CACHING applied after all configuration is done */
9529 	if (flags & ZC_NOCACHING) {
9530 		z->z_nocaching = true;
9531 	}
9532 
9533 	if (flags & ZC_READONLY) {
9534 		zone_create_assert_not_both(name, flags, ZC_READONLY, ZC_VM);
9535 		zone_create_assert_not_both(name, flags, ZC_READONLY, ZC_DATA);
9536 		assert(zid <= ZONE_ID__LAST_RO);
9537 #if ZSECURITY_CONFIG(READ_ONLY)
9538 		zsflags->z_submap_idx = Z_SUBMAP_IDX_READ_ONLY;
9539 #endif
9540 		zone_ro_size_params[zid].z_elem_size = z->z_elem_size;
9541 		zone_ro_size_params[zid].z_align_magic = z->z_align_magic;
9542 		assert(size <= PAGE_SIZE);
9543 		if ((PAGE_SIZE % size) * 10 >= PAGE_SIZE) {
9544 			panic("Fragmentation greater than 10%% with elem size %d zone %s%s",
9545 			    (uint32_t)size, zone_heap_name(z), z->z_name);
9546 		}
9547 	}
9548 
9549 	if (flags & ZC_PERCPU) {
9550 		zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_READONLY);
9551 		zone_create_assert_not_both(name, flags, ZC_PERCPU, ZC_PGZ_USE_GUARDS);
9552 		z->z_percpu = true;
9553 	}
9554 	if (flags & ZC_NOGC) {
9555 		z->collectable = false;
9556 	}
9557 	/*
9558 	 * Handle ZC_NOENCRYPT from xnu only
9559 	 */
9560 	if (startup_phase < STARTUP_SUB_LOCKDOWN && flags & ZC_NOENCRYPT) {
9561 		zsflags->z_noencrypt = true;
9562 	}
9563 	if (flags & ZC_NOCALLOUT) {
9564 		z->no_callout = true;
9565 	}
9566 	if (flags & ZC_DESTRUCTIBLE) {
9567 		zone_create_assert_not_both(name, flags, ZC_DESTRUCTIBLE, ZC_READONLY);
9568 		z->z_destructible = true;
9569 	}
9570 	/*
9571 	 * Handle Internal flags
9572 	 */
9573 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
9574 	if (flags & ZC_PGZ_USE_GUARDS) {
9575 		/*
9576 		 * Try to turn on guard pages only for zones
9577 		 * with a chance of OOB.
9578 		 */
9579 		if (startup_phase < STARTUP_SUB_LOCKDOWN) {
9580 			zsflags->z_pgz_use_guards = true;
9581 		}
9582 		z->z_pgz_use_guards = true;
9583 	}
9584 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
9585 	if (!(flags & ZC_NOTBITAG)) {
9586 		z->z_tbi_tag = true;
9587 	}
9588 	if (flags & ZC_KALLOC_TYPE) {
9589 		zsflags->z_kalloc_type = true;
9590 	}
9591 	if (flags & ZC_VM) {
9592 		zone_create_assert_not_both(name, flags, ZC_VM, ZC_DATA);
9593 		zsflags->z_submap_idx = Z_SUBMAP_IDX_VM;
9594 	}
9595 	if (flags & ZC_DATA) {
9596 		zsflags->z_kheap_id = KHEAP_ID_DATA_BUFFERS;
9597 	}
9598 #if KASAN_CLASSIC
9599 	if (redzone && !(flags & ZC_KASAN_NOQUARANTINE)) {
9600 		z->z_kasan_quarantine = true;
9601 	}
9602 	z->z_kasan_redzone = redzone;
9603 #endif /* KASAN_CLASSIC */
9604 #if KASAN_FAKESTACK
9605 	if (strncmp(name, "fakestack.", sizeof("fakestack.") - 1) == 0) {
9606 		z->z_kasan_fakestacks = true;
9607 	}
9608 #endif /* KASAN_FAKESTACK */
9609 
9610 	/*
9611 	 * Then if there's extra tuning, do it
9612 	 */
9613 	if (extra_setup) {
9614 		extra_setup(z);
9615 	}
9616 
9617 	/*
9618 	 * Configure debugging features
9619 	 */
9620 #if CONFIG_PROB_GZALLOC
9621 	if ((flags & (ZC_READONLY | ZC_PERCPU | ZC_OBJ_CACHE | ZC_NOPGZ)) == 0) {
9622 		pgz_zone_init(z);
9623 	}
9624 #endif
9625 	if (zc_magazine_zone) { /* proxy for "has zone_init run" */
9626 #if ZALLOC_ENABLE_LOGGING
9627 		/*
9628 		 * Check for and set up zone leak detection
9629 		 * if requested via boot-args.
9630 		 */
9631 		zone_setup_logging(z);
9632 #endif /* ZALLOC_ENABLE_LOGGING */
9633 #if KASAN_TBI
9634 		zone_setup_kasan_logging(z);
9635 #endif /* KASAN_TBI */
9636 	}
9637 
9638 #if VM_TAG_SIZECLASSES
9639 	if ((zsflags->z_kheap_id || zsflags->z_kalloc_type) && zone_tagging_on) {
9640 		static uint16_t sizeclass_idx;
9641 
9642 		assert(startup_phase < STARTUP_SUB_LOCKDOWN);
9643 		z->z_uses_tags = true;
9644 		if (zsflags->z_kheap_id == KHEAP_ID_DATA_BUFFERS) {
9645 			zone_tags_sizeclasses[sizeclass_idx] = (uint16_t)size;
9646 			z->z_tags_sizeclass = sizeclass_idx++;
9647 		} else {
9648 			uint16_t i = 0;
9649 			for (; i < sizeclass_idx; i++) {
9650 				if (size == zone_tags_sizeclasses[i]) {
9651 					z->z_tags_sizeclass = i;
9652 					break;
9653 				}
9654 			}
9655 
9656 			/*
9657 			 * Size class wasn't found, add it to zone_tags_sizeclasses
9658 			 */
9659 			if (i == sizeclass_idx) {
9660 				assert(i < VM_TAG_SIZECLASSES);
9661 				zone_tags_sizeclasses[i] = (uint16_t)size;
9662 				z->z_tags_sizeclass = sizeclass_idx++;
9663 			}
9664 		}
9665 		assert(z->z_tags_sizeclass < VM_TAG_SIZECLASSES);
9666 	}
9667 #endif
9668 
9669 	/*
9670 	 * Finally, fixup properties based on security policies, boot-args, ...
9671 	 */
9672 	if (zsflags->z_kheap_id == KHEAP_ID_DATA_BUFFERS) {
9673 		/*
9674 		 * We use LIFO in the data map, because workloads like network
9675 		 * usage or similar tend to rotate through allocations very
9676 		 * quickly with sometimes exploding working-sets, and using
9677 		 * a FIFO policy might cause massive TLB thrashing with rather
9678 		 * dramatic performance impacts.
9679 		 */
9680 		zsflags->z_submap_idx = Z_SUBMAP_IDX_DATA;
9681 		zsflags->z_lifo = true;
9682 	}
9683 
9684 	if ((flags & (ZC_CACHING | ZC_OBJ_CACHE)) && !z->z_nocaching) {
9685 		/*
9686 		 * No zone made before zone_init() can have ZC_CACHING set.
9687 		 */
9688 		assert(zc_magazine_zone);
9689 		zone_enable_caching(z);
9690 	}
9691 
9692 	zone_lock(z);
9693 	z->z_self = z;
9694 	zone_unlock(z);
9695 
9696 	return z;
9697 }
9698 
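/*
 * Get/set the signature-equivalence zone ID (z_sig_eq) recorded in this
 * zone's zone_security_array entry.
 */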
9699 void
9700 zone_set_sig_eq(zone_t zone, zone_id_t sig_eq)
9701 {
9702 	zone_security_array[zone_index(zone)].z_sig_eq = sig_eq;
9703 }
9704 
9705 zone_id_t
9706 zone_get_sig_eq(zone_t zone)
9707 {
9708 	return zone_security_array[zone_index(zone)].z_sig_eq;
9709 }
9710 
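/*
 * Associate an SMR domain and free callback with each of the zone's per-CPU
 * caches and mark the zone as SMR-enabled; caching is enabled first if it
 * wasn't already. As asserted below, this must happen before the zone is
 * ever used.
 */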
9711 void
9712 zone_enable_smr(zone_t zone, struct smr *smr, zone_smr_free_cb_t free_cb)
9713 {
9714 	/* moving to SMR must be done before the zone has ever been used */
9715 	assert(zone->z_va_cur == 0 && !zone->z_smr && !zone->z_nocaching);
9716 	assert(!zone_security_array[zone_index(zone)].z_lifo);
9717 	assert((smr->smr_flags & SMR_SLEEPABLE) == 0);
9718 
9719 	if (!zone->z_pcpu_cache) {
9720 		zone_enable_caching(zone);
9721 	}
9722 
9723 	zone_lock(zone);
9724 
9725 	zpercpu_foreach(it, zone->z_pcpu_cache) {
9726 		it->zc_smr = smr;
9727 		it->zc_free = free_cb;
9728 	}
9729 	zone->z_smr = true;
9730 
9731 	zone_unlock(zone);
9732 }
9733 
9734 __startup_func
9735 void
9736 zone_create_startup(struct zone_create_startup_spec *spec)
9737 {
9738 	zone_t z;
9739 
9740 	z = zone_create_ext(spec->z_name, spec->z_size,
9741 	    spec->z_flags, spec->z_zid, spec->z_setup);
9742 	if (spec->z_var) {
9743 		*spec->z_var = z;
9744 	}
9745 }
9746 
9747 /*
9748  * The first 4 fields of a zone_view and a zone alias each other, so that
9749  * the zone_or_view_t union works. Trust but verify.
9750  */
9751 #define zalloc_check_zov_alias(f1, f2) \
9752     static_assert(offsetof(struct zone, f1) == offsetof(struct zone_view, f2))
9753 zalloc_check_zov_alias(z_self, zv_zone);
9754 zalloc_check_zov_alias(z_stats, zv_stats);
9755 zalloc_check_zov_alias(z_name, zv_name);
9756 zalloc_check_zov_alias(z_views, zv_next);
9757 #undef zalloc_check_zov_alias
9758 
9759 __startup_func
9760 void
9761 zone_view_startup_init(struct zone_view_startup_spec *spec)
9762 {
9763 	struct kalloc_heap *heap = NULL;
9764 	zone_view_t zv = spec->zv_view;
9765 	zone_t z;
9766 	zone_security_flags_t zsflags;
9767 
9768 	switch (spec->zv_heapid) {
9769 	case KHEAP_ID_DATA_BUFFERS:
9770 		heap = KHEAP_DATA_BUFFERS;
9771 		break;
9772 	default:
9773 		heap = NULL;
9774 	}
9775 
9776 	if (heap) {
9777 		z = kalloc_zone_for_size(heap->kh_zstart, spec->zv_size);
9778 	} else {
9779 		z = *spec->zv_zone;
9780 		assert(spec->zv_size <= zone_elem_inner_size(z));
9781 	}
9782 
9783 	assert(z);
9784 
9785 	zv->zv_zone  = z;
9786 	zv->zv_stats = zalloc_percpu_permanent_type(struct zone_stats);
9787 	zv->zv_next  = z->z_views;
9788 	zsflags = zone_security_config(z);
9789 	if (z->z_views == NULL && zsflags.z_kheap_id == KHEAP_ID_NONE) {
9790 		/*
9791 		 * count the raw view for zones not in a heap,
9792 		 * kalloc_heap_init() already counts it for its members.
9793 		 */
9794 		zone_view_count += 2;
9795 	} else {
9796 		zone_view_count += 1;
9797 	}
9798 	z->z_views = zv;
9799 }
9800 
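/*
 * Typical in-kernel usage (sketch; "struct thing" and the zone name are
 * hypothetical):
 *
 *	static zone_t thing_zone;
 *
 *	thing_zone = zone_create("things", sizeof(struct thing), ZC_NONE);
 *	struct thing *t = zalloc_flags(thing_zone, Z_WAITOK | Z_ZERO);
 *	...
 *	zfree(thing_zone, t);
 */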
9801 zone_t
9802 zone_create(
9803 	const char             *name,
9804 	vm_size_t               size,
9805 	zone_create_flags_t     flags)
9806 {
9807 	return zone_create_ext(name, size, flags, ZONE_ID_ANY, NULL);
9808 }
9809 
9810 static_assert(ZONE_ID__LAST_RO_EXT - ZONE_ID__FIRST_RO_EXT == ZC_RO_ID__LAST);
9811 
9812 zone_id_t
9813 zone_create_ro(
9814 	const char             *name,
9815 	vm_size_t               size,
9816 	zone_create_flags_t     flags,
9817 	zone_create_ro_id_t     zc_ro_id)
9818 {
9819 	assert(zc_ro_id <= ZC_RO_ID__LAST);
9820 	zone_id_t reserved_zid = ZONE_ID__FIRST_RO_EXT + zc_ro_id;
9821 	(void)zone_create_ext(name, size, ZC_READONLY | flags, reserved_zid, NULL);
9822 	return reserved_zid;
9823 }
9824 
9825 zone_t
9826 zinit(
9827 	vm_size_t       size,           /* the size of an element */
9828 	vm_size_t       max __unused,   /* maximum memory to use */
9829 	vm_size_t       alloc __unused, /* allocation size */
9830 	const char      *name)          /* a name for the zone */
9831 {
9832 	return zone_create(name, size, ZC_DESTRUCTIBLE);
9833 }
9834 
9835 void
9836 zdestroy(zone_t z)
9837 {
9838 	unsigned int zindex = zone_index(z);
9839 	zone_security_flags_t zsflags = zone_security_array[zindex];
9840 
9841 	current_thread()->options |= TH_OPT_ZONE_PRIV;
9842 	lck_mtx_lock(&zone_gc_lock);
9843 
9844 	zone_reclaim(z, ZONE_RECLAIM_DESTROY);
9845 
9846 	lck_mtx_unlock(&zone_gc_lock);
9847 	current_thread()->options &= ~TH_OPT_ZONE_PRIV;
9848 
9849 	zone_lock(z);
9850 
9851 	if (!zone_submap_is_sequestered(zsflags)) {
9852 		while (!zone_pva_is_null(z->z_pageq_va)) {
9853 			struct zone_page_metadata *meta;
9854 
9855 			zone_counter_sub(z, z_va_cur, z->z_percpu ? 1 : z->z_chunk_pages);
9856 			meta = zone_meta_queue_pop(z, &z->z_pageq_va);
9857 			assert(meta->zm_chunk_len <= ZM_CHUNK_LEN_MAX);
9858 			bzero(meta, sizeof(*meta) * z->z_chunk_pages);
9859 			zone_unlock(z);
9860 			kmem_free(zone_submap(zsflags), zone_meta_to_addr(meta),
9861 			    ptoa(z->z_chunk_pages));
9862 			zone_lock(z);
9863 		}
9864 	}
9865 
9866 #if !KASAN_CLASSIC
9867 	/* Assert that all counts are zero */
9868 	if (z->z_elems_avail || z->z_elems_free || zone_size_wired(z) ||
9869 	    (z->z_va_cur && !zone_submap_is_sequestered(zsflags))) {
9870 		panic("zdestroy: Zone %s%s isn't empty at zdestroy() time",
9871 		    zone_heap_name(z), z->z_name);
9872 	}
9873 
9874 	/* consistency check: make sure everything is indeed empty */
9875 	assert(zone_pva_is_null(z->z_pageq_empty));
9876 	assert(zone_pva_is_null(z->z_pageq_partial));
9877 	assert(zone_pva_is_null(z->z_pageq_full));
9878 	if (!zone_submap_is_sequestered(zsflags)) {
9879 		assert(zone_pva_is_null(z->z_pageq_va));
9880 	}
9881 #endif
9882 
9883 	zone_unlock(z);
9884 
9885 	simple_lock(&all_zones_lock, &zone_locks_grp);
9886 
9887 	assert(!bitmap_test(zone_destroyed_bitmap, zindex));
9888 	/* Mark the zone as destroyed in the bitmap */
9889 	bitmap_set(zone_destroyed_bitmap, zindex);
9890 	num_zones_in_use--;
9891 	assert(num_zones_in_use > 0);
9892 
9893 	simple_unlock(&all_zones_lock);
9894 }
9895 
9896 #endif /* !ZALLOC_TEST */
9897 #pragma mark zalloc module init
9898 #if !ZALLOC_TEST
9899 
9900 /*
9901  *	Initialize the "zone of zones" which uses fixed memory allocated
9902  *	earlier in memory initialization.  zone_bootstrap is called
9903  *	before zone_init.
9904  */
9905 __startup_func
9906 void
9907 zone_bootstrap(void)
9908 {
9909 #if DEBUG || DEVELOPMENT
9910 #if __x86_64__
9911 	if (PE_parse_boot_argn("kernPOST", NULL, 0)) {
9912 		/*
9913 		 * rdar://79781535 Disable early gaps while running kernPOST on Intel;
9914 		 * the fp faulting code gets triggered and deadlocks.
9915 		 */
9916 		zone_caching_disabled = 1;
9917 	}
9918 #endif /* __x86_64__ */
9919 #endif /* DEBUG || DEVELOPMENT */
9920 
9921 	/* Validate struct zone_packed_virtual_address expectations */
9922 	static_assert((intptr_t)VM_MIN_KERNEL_ADDRESS < 0, "the top bit must be 1");
9923 	if (VM_KERNEL_POINTER_SIGNIFICANT_BITS - PAGE_SHIFT > 31) {
9924 		panic("zone_pva_t can't pack a kernel page address in 31 bits");
9925 	}
9926 
9927 	zpercpu_early_count = ml_early_cpu_max_number() + 1;
9928 	if (!PE_parse_boot_argn("zc_mag_size", NULL, 0)) {
9929 		/*
9930 		 * Scale zc_mag_size() per machine.
9931 		 *
9932 		 * - wide machines get 128B magazines to avoid all false sharing
9933 		 * - smaller machines but with enough RAM get a bit bigger
9934 		 *   buckets (empirically affects networking performance)
9935 		 */
9936 		if (zpercpu_early_count >= 10) {
9937 			_zc_mag_size = 14;
9938 		} else if ((sane_size >> 30) >= 4) {
9939 			_zc_mag_size = 10;
9940 		}
9941 	}
9942 
9943 	/*
9944 	 * Initialize random used to scramble early allocations
9945 	 */
9946 	zpercpu_foreach_cpu(cpu) {
9947 		random_bool_init(&zone_bool_gen[cpu].zbg_bg);
9948 	}
9949 
9950 #if CONFIG_PROB_GZALLOC
9951 	/*
9952 	 * Set pgz_sample_counter on the boot CPU so that we do not sample
9953 	 * any allocation until PGZ has been properly setup (in pgz_init()).
9954 	 */
9955 	*PERCPU_GET_MASTER(pgz_sample_counter) = INT32_MAX;
9956 #endif /* CONFIG_PROB_GZALLOC */
9957 
9958 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
9959 	/*
9960 	 * Randomly assign zones to one of the 4 general submaps,
9961 	 * and pick whether they allocate from the beginning
9962 	 * or the end of it.
9963 	 *
9964 	 * A lot of OOB exploitation relies on precise interleaving
9965 	 * of specific types in the heap.
9966 	 *
9967 	 * Woops, you can't guarantee that anymore.
9968 	 */
9969 	for (zone_id_t i = 1; i < MAX_ZONES; i++) {
9970 		uint32_t r = zalloc_random_uniform32(0,
9971 		    ZSECURITY_CONFIG_GENERAL_SUBMAPS * 2);
9972 
9973 		zone_security_array[i].z_submap_from_end = (r & 1);
9974 		zone_security_array[i].z_submap_idx += (r >> 1);
9975 	}
9976 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
9977 
9978 	thread_call_setup_with_options(&zone_expand_callout,
9979 	    zone_expand_async, NULL, THREAD_CALL_PRIORITY_HIGH,
9980 	    THREAD_CALL_OPTIONS_ONCE);
9981 
9982 	thread_call_setup_with_options(&zone_trim_callout,
9983 	    zone_trim_async, NULL, THREAD_CALL_PRIORITY_USER,
9984 	    THREAD_CALL_OPTIONS_ONCE);
9985 }
9986 
9987 #define ZONE_GUARD_SIZE                 (64UL << 10)
9988 
9989 __startup_func
9990 static void
9991 zone_tunables_fixup(void)
9992 {
9993 	int wdt = 0;
9994 
9995 #if CONFIG_PROB_GZALLOC && (DEVELOPMENT || DEBUG)
9996 	if (!PE_parse_boot_argn("pgz", NULL, 0) &&
9997 	    PE_parse_boot_argn("pgz1", NULL, 0)) {
9998 		/*
9999 		 * if pgz1= was used, but pgz= was not,
10000 		 * then the more specific pgz1 takes precedence.
10001 		 */
10002 		pgz_all = false;
10003 	}
10004 #endif
10005 
10006 	if (zone_map_jetsam_limit == 0 || zone_map_jetsam_limit > 100) {
10007 		zone_map_jetsam_limit = ZONE_MAP_JETSAM_LIMIT_DEFAULT;
10008 	}
10009 	if (PE_parse_boot_argn("wdt", &wdt, sizeof(wdt)) && wdt == -1 &&
10010 	    !PE_parse_boot_argn("zet", NULL, 0)) {
10011 		zone_exhausted_timeout = -1;
10012 	}
10013 }
10014 STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, zone_tunables_fixup);
10015 
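/*
 * Carve one submap out of the zone VA range: the submap receives
 * (zone_sub_map_numer / *remaining_denom) of the remaining VA, guard
 * entries are installed at both ends, and for the read-only submap the
 * range is recorded in zone_info and its guard entries are entered with
 * VM_PROT_NONE.
 */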
10016 __startup_func
10017 static void
10018 zone_submap_init(
10019 	mach_vm_offset_t       *submap_min,
10020 	zone_submap_idx_t       idx,
10021 	uint64_t                zone_sub_map_numer,
10022 	uint64_t               *remaining_denom,
10023 	vm_offset_t            *remaining_size)
10024 {
10025 	vm_map_create_options_t vmco;
10026 	vm_map_address_t addr;
10027 	vm_offset_t submap_start, submap_end;
10028 	vm_size_t submap_size;
10029 	vm_map_t  submap;
10030 	vm_prot_t prot = VM_PROT_DEFAULT;
10031 	vm_prot_t prot_max = VM_PROT_ALL;
10032 	kern_return_t kr;
10033 
10034 	submap_size = trunc_page(zone_sub_map_numer * *remaining_size /
10035 	    *remaining_denom);
10036 	submap_start = *submap_min;
10037 
10038 	if (idx == Z_SUBMAP_IDX_READ_ONLY) {
10039 		vm_offset_t submap_padding = pmap_ro_zone_align(submap_start) - submap_start;
10040 		submap_start += submap_padding;
10041 		submap_size = pmap_ro_zone_align(submap_size);
10042 		assert(*remaining_size >= (submap_padding + submap_size));
10043 		*remaining_size -= submap_padding;
10044 		*submap_min = submap_start;
10045 	}
10046 
10047 	submap_end = submap_start + submap_size;
10048 	if (idx == Z_SUBMAP_IDX_VM) {
10049 		vm_packing_verify_range("vm_compressor",
10050 		    submap_start, submap_end, VM_PACKING_PARAMS(C_SLOT_PACKED_PTR));
10051 		vm_packing_verify_range("vm_page",
10052 		    submap_start, submap_end, VM_PACKING_PARAMS(VM_PAGE_PACKED_PTR));
10053 	}
10054 
10055 	vmco = VM_MAP_CREATE_NEVER_FAULTS;
10056 	if (!zone_submap_is_sequestered(idx)) {
10057 		vmco |= VM_MAP_CREATE_DISABLE_HOLELIST;
10058 	}
10059 
10060 	vm_map_will_allocate_early_map(&zone_submaps[idx]);
10061 	submap = kmem_suballoc(kernel_map, submap_min, submap_size, vmco,
10062 	    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, KMS_PERMANENT | KMS_NOFAIL,
10063 	    VM_KERN_MEMORY_ZONE).kmr_submap;
10064 
10065 	if (idx == Z_SUBMAP_IDX_READ_ONLY) {
10066 		zone_info.zi_ro_range.min_address = submap_start;
10067 		zone_info.zi_ro_range.max_address = submap_end;
10068 		prot_max = prot = VM_PROT_NONE;
10069 	}
10070 
10071 	addr = submap_start;
10072 	vm_object_t kobject = kernel_object_default;
10073 	kr = vm_map_enter(submap, &addr, ZONE_GUARD_SIZE / 2, 0,
10074 	    VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(.vm_tag = VM_KERN_MEMORY_ZONE),
10075 	    kobject, addr, FALSE, prot, prot_max, VM_INHERIT_NONE);
10076 	if (kr != KERN_SUCCESS) {
10077 		panic("ksubmap[%s]: failed to make first entry (%d)",
10078 		    zone_submaps_names[idx], kr);
10079 	}
10080 
10081 	addr = submap_end - ZONE_GUARD_SIZE / 2;
10082 	kr = vm_map_enter(submap, &addr, ZONE_GUARD_SIZE / 2, 0,
10083 	    VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(.vm_tag = VM_KERN_MEMORY_ZONE),
10084 	    kobject, addr, FALSE, prot, prot_max, VM_INHERIT_NONE);
10085 	if (kr != KERN_SUCCESS) {
10086 		panic("ksubmap[%s]: failed to make last entry (%d)",
10087 		    zone_submaps_names[idx], kr);
10088 	}
10089 
10090 #if DEBUG || DEVELOPMENT
10091 	printf("zone_init: map %-5s %p:%p (%u%c)\n",
10092 	    zone_submaps_names[idx], (void *)submap_start, (void *)submap_end,
10093 	    mach_vm_size_pretty(submap_size), mach_vm_size_unit(submap_size));
10094 #endif /* DEBUG || DEVELOPMENT */
10095 
10096 	zone_submaps[idx] = submap;
10097 	*submap_min       = submap_end;
10098 	*remaining_size  -= submap_size;
10099 	*remaining_denom -= zone_sub_map_numer;
10100 }
10101 
10102 static inline void
10103 zone_pva_relocate(zone_pva_t *pva, uint32_t delta)
10104 {
10105 	if (!zone_pva_is_null(*pva) && !zone_pva_is_queue(*pva)) {
10106 		pva->packed_address += delta;
10107 	}
10108 }
10109 
10110 /*
10111  * Allocate metadata array and migrate bootstrap initial metadata and memory.
10112  */
10113 __startup_func
10114 static void
10115 zone_metadata_init(void)
10116 {
10117 	vm_map_t vm_map = zone_submaps[Z_SUBMAP_IDX_VM];
10118 	vm_map_entry_t first;
10119 
10120 	struct mach_vm_range meta_r, bits_r, xtra_r, early_r;
10121 	vm_size_t early_sz;
10122 	vm_offset_t reloc_base;
10123 
10124 	/*
10125 	 * Step 1: Allocate the metadata + bitmaps range
10126 	 *
10127 	 * Allocations can't be smaller than 8 bytes, which is 128b / 16B per 1k
10128 	 * of physical memory (16M per 1G).
10129 	 *
10130 	 * Let's preallocate for the worst to avoid weird panics.
10131 	 */
10132 	vm_map_will_allocate_early_map(&zone_meta_map);
10133 	meta_r = zone_kmem_suballoc(zone_info.zi_meta_range.min_address,
10134 	    zone_meta_size + zone_bits_size + zone_xtra_size,
10135 	    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
10136 	    VM_KERN_MEMORY_ZONE, &zone_meta_map);
10137 	meta_r.min_address += ZONE_GUARD_SIZE;
10138 	meta_r.max_address -= ZONE_GUARD_SIZE;
10139 	if (zone_xtra_size) {
10140 		xtra_r.max_address  = meta_r.max_address;
10141 		meta_r.max_address -= zone_xtra_size;
10142 		xtra_r.min_address  = meta_r.max_address;
10143 	} else {
10144 		xtra_r.min_address  = xtra_r.max_address = 0;
10145 	}
10146 	bits_r.max_address  = meta_r.max_address;
10147 	meta_r.max_address -= zone_bits_size;
10148 	bits_r.min_address  = meta_r.max_address;
10149 
10150 #if DEBUG || DEVELOPMENT
10151 	printf("zone_init: metadata  %p:%p (%u%c)\n",
10152 	    (void *)meta_r.min_address, (void *)meta_r.max_address,
10153 	    mach_vm_size_pretty(mach_vm_range_size(&meta_r)),
10154 	    mach_vm_size_unit(mach_vm_range_size(&meta_r)));
10155 	printf("zone_init: metabits  %p:%p (%u%c)\n",
10156 	    (void *)bits_r.min_address, (void *)bits_r.max_address,
10157 	    mach_vm_size_pretty(mach_vm_range_size(&bits_r)),
10158 	    mach_vm_size_unit(mach_vm_range_size(&bits_r)));
10159 	printf("zone_init: extra     %p:%p (%u%c)\n",
10160 	    (void *)xtra_r.min_address, (void *)xtra_r.max_address,
10161 	    mach_vm_size_pretty(mach_vm_range_size(&xtra_r)),
10162 	    mach_vm_size_unit(mach_vm_range_size(&xtra_r)));
10163 #endif /* DEBUG || DEVELOPMENT */
10164 
10165 	bits_r.min_address = (bits_r.min_address + ZBA_CHUNK_SIZE - 1) & -ZBA_CHUNK_SIZE;
10166 	bits_r.max_address = bits_r.max_address & -ZBA_CHUNK_SIZE;
10167 
10168 	/*
10169 	 * Step 2: Install new ranges.
10170 	 *         Relocate metadata and bits.
10171 	 */
10172 	early_r  = zone_info.zi_map_range;
10173 	early_sz = mach_vm_range_size(&early_r);
10174 
10175 	zone_info.zi_map_range  = zone_map_range;
10176 	zone_info.zi_meta_range = meta_r;
10177 	zone_info.zi_bits_range = bits_r;
10178 	zone_info.zi_xtra_range = xtra_r;
10179 	zone_info.zi_meta_base  = (struct zone_page_metadata *)meta_r.min_address -
10180 	    zone_pva_from_addr(zone_map_range.min_address).packed_address;
10181 
10182 	vm_map_lock(vm_map);
10183 	first = vm_map_first_entry(vm_map);
10184 	reloc_base = first->vme_end;
10185 	first->vme_end += early_sz;
10186 	vm_map->size += early_sz;
10187 	vm_map_unlock(vm_map);
10188 
10189 	struct zone_page_metadata *early_meta = zone_early_meta_array_startup;
10190 	struct zone_page_metadata *new_meta = zone_meta_from_addr(reloc_base);
10191 	vm_offset_t reloc_delta = reloc_base - early_r.min_address;
10192 	/* this needs to sign extend */
10193 	uint32_t pva_delta = (uint32_t)((intptr_t)reloc_delta >> PAGE_SHIFT);
10194 
10195 	zone_meta_populate(reloc_base, early_sz);
10196 	memcpy(new_meta, early_meta,
10197 	    atop(early_sz) * sizeof(struct zone_page_metadata));
10198 	for (uint32_t i = 0; i < atop(early_sz); i++) {
10199 		zone_pva_relocate(&new_meta[i].zm_page_next, pva_delta);
10200 		zone_pva_relocate(&new_meta[i].zm_page_prev, pva_delta);
10201 	}
10202 
10203 	static_assert(ZONE_ID_VM_MAP_ENTRY == ZONE_ID_VM_MAP + 1);
10204 	static_assert(ZONE_ID_VM_MAP_HOLES == ZONE_ID_VM_MAP + 2);
10205 
10206 	for (zone_id_t zid = ZONE_ID_VM_MAP; zid <= ZONE_ID_VM_MAP_HOLES; zid++) {
10207 		zone_pva_relocate(&zone_array[zid].z_pageq_partial, pva_delta);
10208 		zone_pva_relocate(&zone_array[zid].z_pageq_full, pva_delta);
10209 	}
10210 
10211 	zba_populate(0, false);
10212 	memcpy(zba_base_header(), zba_chunk_startup, sizeof(zba_chunk_startup));
10213 	zba_meta()->zbam_right = (uint32_t)atop(zone_bits_size);
10214 
10215 	/*
10216 	 * Step 3: Relocate the bootstrap VM structs
10217 	 *         (including rewriting their content).
10218 	 */
10219 
10220 	kernel_memory_populate(reloc_base, early_sz,
10221 	    KMA_KOBJECT | KMA_NOENCRYPT | KMA_NOFAIL | KMA_TAG,
10222 	    VM_KERN_MEMORY_OSFMK);
10223 	__nosan_memcpy((void *)reloc_base, (void *)early_r.min_address, early_sz);
10224 
10225 #if KASAN
10226 	kasan_notify_address(reloc_base, early_sz);
10227 #if KASAN_TBI
10228 	kasan_tbi_copy_tags(reloc_base, early_r.min_address, early_sz);
10229 #endif /* KASAN_TBI */
10230 #endif /* KASAN */
10231 
10232 	vm_map_relocate_early_maps(reloc_delta);
10233 
10234 	for (uint32_t i = 0; i < atop(early_sz); i++) {
10235 		zone_id_t zid = new_meta[i].zm_index;
10236 		zone_t z = &zone_array[zid];
10237 		vm_size_t esize = zone_elem_outer_size(z);
10238 		vm_address_t base = reloc_base + ptoa(i) + zone_elem_inner_offs(z);
10239 		vm_address_t addr;
10240 
10241 		if (new_meta[i].zm_chunk_len >= ZM_SECONDARY_PAGE) {
10242 			continue;
10243 		}
10244 
10245 		for (uint32_t eidx = 0; eidx < z->z_chunk_elems; eidx++) {
10246 			if (zone_meta_is_free(&new_meta[i], eidx)) {
10247 				continue;
10248 			}
10249 
10250 			addr = vm_memtag_fixup_ptr(base + eidx * esize);
10251 #if KASAN_CLASSIC
10252 			kasan_alloc(addr,
10253 			    zone_elem_inner_size(z), zone_elem_inner_size(z),
10254 			    zone_elem_redzone(z), false,
10255 			    __builtin_frame_address(0));
10256 #endif
10257 			vm_map_relocate_early_elem(zid, addr, reloc_delta);
10258 		}
10259 	}
10260 }
10261 
10262 __startup_data
10263 static uint16_t submap_ratios[Z_SUBMAP_IDX_COUNT] = {
10264 #if ZSECURITY_CONFIG(READ_ONLY)
10265 	[Z_SUBMAP_IDX_VM]               = 15,
10266 	[Z_SUBMAP_IDX_READ_ONLY]        =  5,
10267 #else
10268 	[Z_SUBMAP_IDX_VM]               = 20,
10269 #endif /* !ZSECURITY_CONFIG(READ_ONLY) */
10270 #if ZSECURITY_CONFIG(SAD_FENG_SHUI)
10271 	[Z_SUBMAP_IDX_GENERAL_0]        = 15,
10272 	[Z_SUBMAP_IDX_GENERAL_1]        = 15,
10273 	[Z_SUBMAP_IDX_GENERAL_2]        = 15,
10274 	[Z_SUBMAP_IDX_GENERAL_3]        = 15,
10275 	[Z_SUBMAP_IDX_DATA]             = 20,
10276 #else
10277 	[Z_SUBMAP_IDX_GENERAL_0]        = 60,
10278 	[Z_SUBMAP_IDX_DATA]             = 20,
10279 #endif /* ZSECURITY_CONFIG(SAD_FENG_SHUI) */
10280 };
10281 
10282 __startup_func
10283 static inline uint16_t
10284 zone_submap_ratios_denom(void)
10285 {
10286 	uint16_t denom = 0;
10287 
10288 	for (unsigned idx = 0; idx < Z_SUBMAP_IDX_COUNT; idx++) {
10289 		denom += submap_ratios[idx];
10290 	}
10291 
10292 	assert(denom == 100);
10293 
10294 	return denom;
10295 }
10296 
10297 __startup_func
10298 static inline vm_offset_t
10299 zone_restricted_va_max(void)
10300 {
10301 	vm_offset_t compressor_max = VM_PACKING_MAX_PACKABLE(C_SLOT_PACKED_PTR);
10302 	vm_offset_t vm_page_max    = VM_PACKING_MAX_PACKABLE(VM_PAGE_PACKED_PTR);
10303 
10304 	return trunc_page(MIN(compressor_max, vm_page_max));
10305 }
10306 
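/*
 * Compute the wired-size budget for the zone map: zsize comes from the
 * zsize= boot-arg (in MB) when present, otherwise 3/8 of physical memory,
 * clamped to at least CONFIG_ZONE_MAP_MIN and at most half of RAM (and to
 * ZONE_MAP_MAX when no boot-arg was given). The metadata, bitmap and extra
 * ranges are then sized from that budget; e.g. with 8GB of DRAM and no
 * boot-arg, zsize starts out at 3GB before clamping.
 */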
10307 __startup_func
10308 static void
10309 zone_set_map_sizes(void)
10310 {
10311 	vm_size_t zsize;
10312 	vm_size_t zsizearg;
10313 
10314 	/*
10315 	 * Compute the physical limits for the zone map
10316 	 */
10317 
10318 	if (PE_parse_boot_argn("zsize", &zsizearg, sizeof(zsizearg))) {
10319 		zsize = zsizearg * (1024ULL * 1024);
10320 	} else {
10321 		/* Set target zone size as 1/4 of physical memory */
10322 		zsize = (vm_size_t)(sane_size >> 2);
10323 		zsize += zsize >> 1;
10324 	}
10325 
10326 	if (zsize < CONFIG_ZONE_MAP_MIN) {
10327 		zsize = CONFIG_ZONE_MAP_MIN;   /* Clamp to min */
10328 	}
10329 	if (zsize > sane_size >> 1) {
10330 		zsize = (vm_size_t)(sane_size >> 1); /* Clamp to half of RAM max */
10331 	}
10332 	if (zsizearg == 0 && zsize > ZONE_MAP_MAX) {
10333 		/* if zsize boot-arg not present and zsize exceeds platform maximum, clip zsize */
10334 		printf("NOTE: zonemap size reduced from 0x%lx to 0x%lx\n",
10335 		    (uintptr_t)zsize, (uintptr_t)ZONE_MAP_MAX);
10336 		zsize = ZONE_MAP_MAX;
10337 	}
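	/*
	 * For example, on a hypothetical machine with 8GB of usable memory
	 * and no "zsize" boot-arg: zsize starts at 2GB (1/4 of RAM), grows
	 * to 3GB after the 50% bump, clears the CONFIG_ZONE_MAP_MIN and
	 * half-of-RAM (4GB) clamps, and may still be clipped to ZONE_MAP_MAX
	 * depending on the platform.
	 */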
10338 
10339 	zone_pages_wired_max = (uint32_t)atop(trunc_page(zsize));
10340 
10341 
10342 	/*
10343 	 * Declare restrictions on zone max
10344 	 */
10345 	vm_offset_t vm_submap_size = round_page(
10346 		(submap_ratios[Z_SUBMAP_IDX_VM] * ZONE_MAP_VA_SIZE) /
10347 		zone_submap_ratios_denom());
10348 
10349 #if CONFIG_PROB_GZALLOC
10350 	vm_submap_size += pgz_get_size();
10351 #endif /* CONFIG_PROB_GZALLOC */
10352 	if (os_sub_overflow(zone_restricted_va_max(), vm_submap_size,
10353 	    &zone_map_range.min_address)) {
10354 		zone_map_range.min_address = 0;
10355 	}
10356 
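	/*
	 * One struct zone_page_metadata is reserved for every page of zone
	 * map VA, plus two guard regions of ZONE_GUARD_SIZE each.
	 */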
10357 	zone_meta_size = round_page(atop(ZONE_MAP_VA_SIZE) *
10358 	    sizeof(struct zone_page_metadata)) + ZONE_GUARD_SIZE * 2;
10359 
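	/*
	 * The bitmap arena is sized for one bit per KALLOC_MINSIZE granule
	 * of wired zone memory, i.e. the worst case of every wired page
	 * being filled with the smallest possible elements.
	 */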
10360 	static_assert(ZONE_MAP_MAX / (CHAR_BIT * KALLOC_MINSIZE) <=
10361 	    ZBA_PTR_MASK + 1);
10362 	zone_bits_size = round_page(ptoa(zone_pages_wired_max) /
10363 	    (CHAR_BIT * KALLOC_MINSIZE));
10364 
10365 #if VM_TAG_SIZECLASSES
10366 	if (zone_tagging_on) {
10367 		zba_xtra_shift = (uint8_t)fls(sizeof(vm_tag_t) - 1);
10368 	}
10369 	if (zba_xtra_shift) {
10370 		/*
10371 		 * if we need the extra space range, then limit the size of the
10372 		 * bitmaps to something reasonable instead of a theoretical
10373 		 * worst case scenario of all zones being for the smallest
10374 		 * allocation granule, in order to avoid fake VA pressure on
10375 		 * other parts of the system.
10376 		 */
10377 		zone_bits_size = round_page(zone_bits_size / 8);
10378 		zone_xtra_size = round_page(zone_bits_size * CHAR_BIT << zba_xtra_shift);
10379 	}
10380 #endif /* VM_TAG_SIZECLASSES */
10381 }
10382 STARTUP(KMEM, STARTUP_RANK_FIRST, zone_set_map_sizes);
10383 
10384 /*
10385  * Can't use zone_info.zi_map_range at this point as it is being used to
10386  * store the range of early pmap memory that was stolen to bootstrap the
10387  * necessary VM zones.
10388  */
10389 KMEM_RANGE_REGISTER_STATIC(zones, &zone_map_range, ZONE_MAP_VA_SIZE);
10390 KMEM_RANGE_REGISTER_DYNAMIC(zone_meta, &zone_info.zi_meta_range, ^{
10391 	return zone_meta_size + zone_bits_size + zone_xtra_size;
10392 });
10393 
10394 /*
10395  * Global initialization of Zone Allocator.
10396  * Runs after zone_bootstrap.
10397  */
10398 __startup_func
10399 static void
10400 zone_init(void)
10401 {
10402 	vm_size_t           remaining_size = ZONE_MAP_VA_SIZE;
10403 	mach_vm_offset_t    submap_min = 0;
10404 	uint64_t            denom = zone_submap_ratios_denom();
10405 	/*
10406 	 * And now allocate the various pieces of VA and submaps.
10407 	 */
10408 
10409 	submap_min = zone_map_range.min_address;
10410 
10411 #if CONFIG_PROB_GZALLOC
10412 	vm_size_t pgz_size = pgz_get_size();
10413 
10414 	vm_map_will_allocate_early_map(&pgz_submap);
10415 	zone_info.zi_pgz_range = zone_kmem_suballoc(submap_min, pgz_size,
10416 	    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
10417 	    VM_KERN_MEMORY_ZONE, &pgz_submap);
10418 
10419 	submap_min     += pgz_size;
10420 	remaining_size -= pgz_size;
10421 #if DEBUG || DEVELOPMENT
10422 	printf("zone_init: pgzalloc  %p:%p (%u%c) [%d slots]\n",
10423 	    (void *)zone_info.zi_pgz_range.min_address,
10424 	    (void *)zone_info.zi_pgz_range.max_address,
10425 	    mach_vm_size_pretty(pgz_size), mach_vm_size_unit(pgz_size),
10426 	    pgz_slots);
10427 #endif /* DEBUG || DEVELOPMENT */
10428 #endif /* CONFIG_PROB_GZALLOC */
10429 
10430 	/*
10431 	 * Allocate the submaps, each taking its ratio's share of the remaining VA
10432 	 */
10433 	for (zone_submap_idx_t idx = 0; idx < Z_SUBMAP_IDX_COUNT; idx++) {
10434 		if (submap_ratios[idx] == 0) {
10435 			zone_submaps[idx] = VM_MAP_NULL;
10436 		} else {
10437 			zone_submap_init(&submap_min, idx, submap_ratios[idx],
10438 			    &denom, &remaining_size);
10439 		}
10440 	}
10441 
10442 	zone_metadata_init();
10443 
10444 #if VM_TAG_SIZECLASSES
10445 	if (zone_tagging_on) {
10446 		vm_allocation_zones_init();
10447 	}
10448 #endif /* VM_TAG_SIZECLASSES */
10449 
10450 	zone_create_flags_t kma_flags = ZC_NOCACHING | ZC_NOGC | ZC_NOCALLOUT |
10451 	    ZC_KASAN_NOQUARANTINE | ZC_KASAN_NOREDZONE | ZC_VM;
10452 
10453 	(void)zone_create_ext("vm.permanent", 1, kma_flags | ZC_NOTBITAG,
10454 	    ZONE_ID_PERMANENT, ^(zone_t z) {
10455 		z->z_permanent = true;
10456 		z->z_elem_size = 1;
10457 	});
10458 	(void)zone_create_ext("vm.permanent.percpu", 1,
10459 	    kma_flags | ZC_PERCPU | ZC_NOTBITAG, ZONE_ID_PERCPU_PERMANENT, ^(zone_t z) {
10460 		z->z_permanent = true;
10461 		z->z_elem_size = 1;
10462 	});
10463 
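	/*
	 * Each magazine stores zc_mag_size() element pointers after its
	 * header; a reserve of two magazines per CPU is kept for the
	 * per-CPU caching layer.
	 */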
10464 	zc_magazine_zone = zone_create("zcc_magazine_zone", sizeof(struct zone_magazine) +
10465 	    zc_mag_size() * sizeof(vm_offset_t),
10466 	    ZC_VM | ZC_NOCACHING | ZC_ZFREE_CLEARMEM | ZC_PGZ_USE_GUARDS);
10467 	zone_raise_reserve(zc_magazine_zone, (uint16_t)(2 * zpercpu_count()));
10468 
10469 	/*
10470 	 * Now migrate the startup statistics into their final storage,
10471 	 * and enable logging for early zones (that zone_create_ext() skipped).
10472 	 */
10473 	int cpu = cpu_number();
10474 	zone_index_foreach(idx) {
10475 		zone_t tz = &zone_array[idx];
10476 
10477 		if (tz->z_stats == __zpcpu_mangle_for_boot(&zone_stats_startup[idx])) {
10478 			zone_stats_t zs = zalloc_percpu_permanent_type(struct zone_stats);
10479 
10480 			*zpercpu_get_cpu(zs, cpu) = *zpercpu_get_cpu(tz->z_stats, cpu);
10481 			tz->z_stats = zs;
10482 		}
10483 		if (tz->z_self == tz) {
10484 #if ZALLOC_ENABLE_LOGGING
10485 			zone_setup_logging(tz);
10486 #endif /* ZALLOC_ENABLE_LOGGING */
10487 #if KASAN_TBI
10488 			zone_setup_kasan_logging(tz);
10489 #endif /* KASAN_TBI */
10490 		}
10491 	}
10492 }
10493 STARTUP(ZALLOC, STARTUP_RANK_FIRST, zone_init);
10494 
10495 void
10496 zalloc_iokit_lockdown(void)
10497 {
10498 	zone_share_always = false;
10499 }
10500 
10501 void
10502 zalloc_first_proc_made(void)
10503 {
10504 	zone_caching_disabled = 0;
10505 	zone_early_thres_mul = 1;
10506 }
10507 
10508 __startup_func
10509 vm_offset_t
10510 zone_early_mem_init(vm_size_t size)
10511 {
10512 	vm_offset_t mem;
10513 
10514 	assert3u(atop(size), <=, ZONE_EARLY_META_INLINE_COUNT);
10515 
10516 	/*
10517 	 * The memory used by the zones that bring up the VM early is stolen here.
10518 	 *
10519 	 * When the zone subsystem is actually initialized,
10520 	 * zone_metadata_init() will be called, and those pages,
10521 	 * and the elements they contain, will be relocated into
10522 	 * the VM submap (even on architectures where those zones
10523 	 * do not live there).
10524 	 */
10525 	assert3u(size, <=, sizeof(zone_early_pages_to_cram));
10526 	mem = (vm_offset_t)zone_early_pages_to_cram;
10527 
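	/*
	 * Bias zi_meta_base so that indexing it with the packed VA of the
	 * stolen pages lands in the static startup metadata array
	 * (zone_early_meta_array_startup).
	 */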
10528 	zone_info.zi_meta_base = zone_early_meta_array_startup -
10529 	    zone_pva_from_addr(mem).packed_address;
10530 	zone_info.zi_map_range.min_address = mem;
10531 	zone_info.zi_map_range.max_address = mem + size;
10532 
10533 	zone_info.zi_bits_range = (struct mach_vm_range){
10534 		.min_address = (mach_vm_offset_t)zba_chunk_startup,
10535 		.max_address = (mach_vm_offset_t)zba_chunk_startup +
10536 	    sizeof(zba_chunk_startup),
10537 	};
10538 
10539 	zba_meta()->zbam_left  = 1;
10540 	zba_meta()->zbam_right = 1;
10541 	zba_init_chunk(0, false);
10542 
10543 	return mem;
10544 }
10545 
10546 #endif /* !ZALLOC_TEST */
10547 #pragma mark - tests
10548 #if DEBUG || DEVELOPMENT
10549 
10550 /*
10551  * Used for sysctl zone tests that aren't thread-safe. Ensure only one
10552  * thread goes through at a time.
10553  *
10554  * Otherwise we can end up with multiple test zones (if a second zinit() comes
10555  * through before zdestroy()), which could make us run out of zones.
10556  */
10557 static bool any_zone_test_running = false;
10558 
10559 static uintptr_t *
10560 zone_copy_allocations(zone_t z, uintptr_t *elems, zone_pva_t page_index)
10561 {
10562 	vm_offset_t elem_size = zone_elem_outer_size(z);
10563 	vm_offset_t base;
10564 	struct zone_page_metadata *meta;
10565 
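	/*
	 * Walk every chunk on the queue and emit one element address per set
	 * bit in its bitmaps; "map &= map - 1" clears one set bit per
	 * iteration, so each inner loop runs once per set bit.
	 */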
10566 	while (!zone_pva_is_null(page_index)) {
10567 		base  = zone_pva_to_addr(page_index) + zone_elem_inner_offs(z);
10568 		meta  = zone_pva_to_meta(page_index);
10569 
10570 		if (meta->zm_inline_bitmap) {
10571 			for (size_t i = 0; i < meta->zm_chunk_len; i++) {
10572 				uint32_t map = meta[i].zm_bitmap;
10573 
10574 				for (; map; map &= map - 1) {
10575 					*elems++ = INSTANCE_PUT(base +
10576 					    elem_size * __builtin_clz(map));
10577 				}
10578 				base += elem_size * 32;
10579 			}
10580 		} else {
10581 			uint32_t order = zba_bits_ref_order(meta->zm_bitmap);
10582 			bitmap_t *bits = zba_bits_ref_ptr(meta->zm_bitmap);
10583 			for (size_t i = 0; i < (1u << order); i++) {
10584 				uint64_t map = bits[i];
10585 
10586 				for (; map; map &= map - 1) {
10587 					*elems++ = INSTANCE_PUT(base +
10588 					    elem_size * __builtin_clzll(map));
10589 				}
10590 				base += elem_size * 64;
10591 			}
10592 		}
10593 
10594 		page_index = meta->zm_page_next;
10595 	}
10596 	return elems;
10597 }
10598 
10599 kern_return_t
10600 zone_leaks(const char * zoneName, uint32_t nameLen, leak_site_proc proc)
10601 {
10602 	zone_t        zone = NULL;
10603 	uintptr_t *   array;
10604 	uintptr_t *   next;
10605 	uintptr_t     element;
10606 	uint32_t      idx, count, found;
10607 	uint32_t      nobtcount;
10608 	uint32_t      elemSize;
10609 	size_t        maxElems;
10610 
10611 	zone_foreach(z) {
10612 		if (!z->z_name) {
10613 			continue;
10614 		}
10615 		if (!strncmp(zoneName, z->z_name, nameLen)) {
10616 			zone = z;
10617 			break;
10618 		}
10619 	}
10620 	if (zone == NULL) {
10621 		return KERN_INVALID_NAME;
10622 	}
10623 
10624 	elemSize = (uint32_t)zone_elem_inner_size(zone);
10625 	maxElems = (zone->z_elems_avail + 1) & ~1ul;
10626 
10627 	array = kalloc_type_tag(vm_offset_t, maxElems, VM_KERN_MEMORY_DIAG);
10628 	if (array == NULL) {
10629 		return KERN_RESOURCE_SHORTAGE;
10630 	}
10631 
10632 	zone_lock(zone);
10633 
10634 	next = array;
10635 	next = zone_copy_allocations(zone, next, zone->z_pageq_partial);
10636 	next = zone_copy_allocations(zone, next, zone->z_pageq_full);
10637 	count = (uint32_t)(next - array);
10638 
10639 	zone_unlock(zone);
10640 
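	/*
	 * zone_leaks_scan() looks for outstanding references to the collected
	 * elements and marks the ones it finds with kInstanceFlagReferenced,
	 * so that they are skipped when leak candidates are reported below.
	 */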
10641 	zone_leaks_scan(array, count, (uint32_t)zone_elem_outer_size(zone), &found);
10642 	assert(found <= count);
10643 
10644 	for (idx = 0; idx < count; idx++) {
10645 		element = array[idx];
10646 		if (kInstanceFlagReferenced & element) {
10647 			continue;
10648 		}
10649 		element = INSTANCE_PUT(element) & ~kInstanceFlags;
10650 	}
10651 
10652 #if ZALLOC_ENABLE_LOGGING
10653 	if (zone->z_btlog && !corruption_debug_flag) {
10654 		// btlog_copy_backtraces_for_elements will set kInstanceFlagReferenced on elements it found
10655 		static_assert(sizeof(vm_address_t) == sizeof(uintptr_t));
10656 		btlog_copy_backtraces_for_elements(zone->z_btlog,
10657 		    (vm_address_t *)array, &count, elemSize, proc);
10658 	}
10659 #endif /* ZALLOC_ENABLE_LOGGING */
10660 
10661 	for (nobtcount = idx = 0; idx < count; idx++) {
10662 		element = array[idx];
10663 		if (!element) {
10664 			continue;
10665 		}
10666 		if (kInstanceFlagReferenced & element) {
10667 			continue;
10668 		}
10669 		nobtcount++;
10670 	}
10671 	if (nobtcount) {
10672 		proc(nobtcount, elemSize, BTREF_NULL);
10673 	}
10674 
10675 	kfree_type(vm_offset_t, maxElems, array);
10676 	return KERN_SUCCESS;
10677 }
10678 
10679 static int
10680 zone_ro_basic_test_run(__unused int64_t in, int64_t *out)
10681 {
10682 	zone_security_flags_t zsflags;
10683 	uint32_t x = 4;
10684 	uint32_t *test_ptr;
10685 
10686 	if (os_atomic_xchg(&any_zone_test_running, true, relaxed)) {
10687 		printf("zone_ro_basic_test: Test already running.\n");
10688 		return EALREADY;
10689 	}
10690 
10691 	zsflags = zone_security_array[ZONE_ID__FIRST_RO];
10692 
10693 	for (int i = 0; i < 3; i++) {
10694 #if ZSECURITY_CONFIG(READ_ONLY)
10695 		/* Basic Test: Create int zone, zalloc int, modify value, free int */
10696 		printf("zone_ro_basic_test: Basic Test iteration %d\n", i);
10697 		printf("zone_ro_basic_test: create a sub-page size zone\n");
10698 
10699 		printf("zone_ro_basic_test: verify flags were set\n");
10700 		assert(zsflags.z_submap_idx == Z_SUBMAP_IDX_READ_ONLY);
10701 
10702 		printf("zone_ro_basic_test: zalloc an element\n");
10703 		test_ptr = (zalloc_ro)(ZONE_ID__FIRST_RO, Z_WAITOK);
10704 		assert(test_ptr);
10705 
10706 		printf("zone_ro_basic_test: verify we can't write to it\n");
10707 		assert(verify_write(&x, test_ptr, sizeof(x)) == EFAULT);
10708 
10709 		x = 4;
10710 		printf("zone_ro_basic_test: test zalloc_ro_mut to assign value\n");
10711 		zalloc_ro_mut(ZONE_ID__FIRST_RO, test_ptr, 0, &x, sizeof(uint32_t));
10712 		assert(test_ptr);
10713 		assert(*(uint32_t*)test_ptr == x);
10714 
10715 		x = 5;
10716 		printf("zone_ro_basic_test: test zalloc_ro_update_elem to assign value\n");
10717 		zalloc_ro_update_elem(ZONE_ID__FIRST_RO, test_ptr, &x);
10718 		assert(test_ptr);
10719 		assert(*(uint32_t*)test_ptr == x);
10720 
10721 		printf("zone_ro_basic_test: verify we can't write to it after assigning value\n");
10722 		assert(verify_write(&x, test_ptr, sizeof(x)) == EFAULT);
10723 
10724 		printf("zone_ro_basic_test: free elem\n");
10725 		zfree_ro(ZONE_ID__FIRST_RO, test_ptr);
10726 		assert(!test_ptr);
10727 #else
10728 		printf("zone_ro_basic_test: Read-only allocator n/a on 32bit platforms, test functionality of API\n");
10729 
10730 		printf("zone_ro_basic_test: verify flags were set\n");
10731 		assert(zsflags.z_submap_idx != Z_SUBMAP_IDX_READ_ONLY);
10732 
10733 		printf("zone_ro_basic_test: zalloc an element\n");
10734 		test_ptr = (zalloc_ro)(ZONE_ID__FIRST_RO, Z_WAITOK);
10735 		assert(test_ptr);
10736 
10737 		x = 4;
10738 		printf("zone_ro_basic_test: test zalloc_ro_mut to assign value\n");
10739 		zalloc_ro_mut(ZONE_ID__FIRST_RO, test_ptr, 0, &x, sizeof(uint32_t));
10740 		assert(test_ptr);
10741 		assert(*(uint32_t*)test_ptr == x);
10742 
10743 		x = 5;
10744 		printf("zone_ro_basic_test: test zalloc_ro_update_elem to assign value\n");
10745 		zalloc_ro_update_elem(ZONE_ID__FIRST_RO, test_ptr, &x);
10746 		assert(test_ptr);
10747 		assert(*(uint32_t*)test_ptr == x);
10748 
10749 		printf("zone_ro_basic_test: free elem\n");
10750 		zfree_ro(ZONE_ID__FIRST_RO, test_ptr);
10751 		assert(!test_ptr);
10752 #endif /* !ZSECURITY_CONFIG(READ_ONLY) */
10753 	}
10754 
10755 	printf("zone_ro_basic_test: garbage collection\n");
10756 	zone_gc(ZONE_GC_DRAIN);
10757 
10758 	printf("zone_ro_basic_test: Test passed\n");
10759 
10760 	*out = 1;
10761 	os_atomic_store(&any_zone_test_running, false, relaxed);
10762 	return 0;
10763 }
10764 SYSCTL_TEST_REGISTER(zone_ro_basic_test, zone_ro_basic_test_run);
10765 
10766 static int
10767 zone_basic_test_run(__unused int64_t in, int64_t *out)
10768 {
10769 	static zone_t test_zone_ptr = NULL;
10770 
10771 	unsigned int i = 0, max_iter = 5;
10772 	void * test_ptr;
10773 	zone_t test_zone;
10774 	int rc = 0;
10775 
10776 	if (os_atomic_xchg(&any_zone_test_running, true, relaxed)) {
10777 		printf("zone_basic_test: Test already running.\n");
10778 		return EALREADY;
10779 	}
10780 
10781 	printf("zone_basic_test: Testing zinit(), zalloc(), zfree() and zdestroy() on zone \"test_zone_sysctl\"\n");
10782 
10783 	/* zinit() and zdestroy() a zone with the same name a bunch of times, verify that we get back the same zone each time */
10784 	do {
10785 		test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_zone_sysctl");
10786 		assert(test_zone);
10787 
10788 #if KASAN_CLASSIC
10789 		if (test_zone_ptr == NULL && test_zone->z_elems_free != 0)
10790 #else
10791 		if (test_zone->z_elems_free != 0)
10792 #endif
10793 		{
10794 			printf("zone_basic_test: free count is not zero\n");
10795 			rc = EIO;
10796 			goto out;
10797 		}
10798 
10799 		if (test_zone_ptr == NULL) {
10800 			/* Stash the zone pointer returned by the first zinit() */
10801 			printf("zone_basic_test: zone created for the first time\n");
10802 			test_zone_ptr = test_zone;
10803 		} else if (test_zone != test_zone_ptr) {
10804 			printf("zone_basic_test: old zone pointer and new zone pointer don't match\n");
10805 			rc = EIO;
10806 			goto out;
10807 		}
10808 
10809 		test_ptr = zalloc_flags(test_zone, Z_WAITOK | Z_NOFAIL);
10810 		zfree(test_zone, test_ptr);
10811 
10812 		zdestroy(test_zone);
10813 		i++;
10814 
10815 		printf("zone_basic_test: Iteration %d successful\n", i);
10816 	} while (i < max_iter);
10817 
10818 #if !KASAN_CLASSIC /* because of the quarantine and redzones */
10819 	/* test Z_VA_SEQUESTER */
10820 	{
10821 		zone_t test_pcpu_zone;
10822 		kern_return_t kr;
10823 		int idx, num_allocs = 8;
10824 		vm_size_t elem_size = 2 * PAGE_SIZE / num_allocs;
10825 		void *allocs[num_allocs];
10826 		void **allocs_pcpu;
10827 		vm_offset_t phys_pages = os_atomic_load(&zone_pages_wired, relaxed);
10828 
10829 		test_zone = zone_create("test_zone_sysctl", elem_size,
10830 		    ZC_DESTRUCTIBLE);
10831 		assert(test_zone);
10832 
10833 		test_pcpu_zone = zone_create("test_zone_sysctl.pcpu", sizeof(uint64_t),
10834 		    ZC_DESTRUCTIBLE | ZC_PERCPU);
10835 		assert(test_pcpu_zone);
10836 
10837 		for (idx = 0; idx < num_allocs; idx++) {
10838 			allocs[idx] = zalloc(test_zone);
10839 			assert(NULL != allocs[idx]);
10840 			printf("alloc[%d] %p\n", idx, allocs[idx]);
10841 		}
10842 		for (idx = 0; idx < num_allocs; idx++) {
10843 			zfree(test_zone, allocs[idx]);
10844 		}
10845 		assert(!zone_pva_is_null(test_zone->z_pageq_empty));
10846 
10847 		kr = kmem_alloc(kernel_map, (vm_address_t *)&allocs_pcpu, PAGE_SIZE,
10848 		    KMA_ZERO | KMA_KOBJECT, VM_KERN_MEMORY_DIAG);
10849 		assert(kr == KERN_SUCCESS);
10850 
10851 		for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
10852 			allocs_pcpu[idx] = zalloc_percpu(test_pcpu_zone,
10853 			    Z_WAITOK | Z_ZERO);
10854 			assert(NULL != allocs_pcpu[idx]);
10855 		}
10856 		for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
10857 			zfree_percpu(test_pcpu_zone, allocs_pcpu[idx]);
10858 		}
10859 		assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_empty));
10860 
10861 		printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %ld%%\n",
10862 		    vm_page_wire_count, vm_page_free_count,
10863 		    100L * phys_pages / zone_pages_wired_max);
10864 		zone_gc(ZONE_GC_DRAIN);
10865 		printf("vm_page_wire_count %d, vm_page_free_count %d, p to v %ld%%\n",
10866 		    vm_page_wire_count, vm_page_free_count,
10867 		    100L * phys_pages / zone_pages_wired_max);
10868 
10869 		unsigned int allva = 0;
10870 
10871 		zone_foreach(z) {
10872 			zone_lock(z);
10873 			allva += z->z_wired_cur;
10874 			if (zone_pva_is_null(z->z_pageq_va)) {
10875 				zone_unlock(z);
10876 				continue;
10877 			}
10878 			unsigned count = 0;
10879 			uint64_t size;
10880 			zone_pva_t pg = z->z_pageq_va;
10881 			struct zone_page_metadata *page_meta;
10882 			while (pg.packed_address) {
10883 				page_meta = zone_pva_to_meta(pg);
10884 				count += z->z_percpu ? 1 : z->z_chunk_pages;
10885 				if (page_meta->zm_chunk_len == ZM_SECONDARY_PAGE) {
10886 					count -= page_meta->zm_page_index;
10887 				}
10888 				pg = page_meta->zm_page_next;
10889 			}
10890 			size = zone_size_wired(z);
10891 			if (!size) {
10892 				size = 1;
10893 			}
10894 			printf("%s%s: seq %d, res %d, %qd %%\n",
10895 			    zone_heap_name(z), z->z_name, z->z_va_cur - z->z_wired_cur,
10896 			    z->z_wired_cur, zone_size_allocated(z) * 100ULL / size);
10897 			zone_unlock(z);
10898 		}
10899 
10900 		printf("total va: %d\n", allva);
10901 
10902 		assert(zone_pva_is_null(test_zone->z_pageq_empty));
10903 		assert(zone_pva_is_null(test_zone->z_pageq_partial));
10904 		assert(!zone_pva_is_null(test_zone->z_pageq_va));
10905 		assert(zone_pva_is_null(test_pcpu_zone->z_pageq_empty));
10906 		assert(zone_pva_is_null(test_pcpu_zone->z_pageq_partial));
10907 		assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_va));
10908 
10909 		for (idx = 0; idx < num_allocs; idx++) {
10910 			assert(0 == pmap_find_phys(kernel_pmap, (addr64_t)(uintptr_t) allocs[idx]));
10911 		}
10912 
10913 		/* make sure the zone is still usable after a GC */
10914 
10915 		for (idx = 0; idx < num_allocs; idx++) {
10916 			allocs[idx] = zalloc(test_zone);
10917 			assert(allocs[idx]);
10918 			printf("alloc[%d] %p\n", idx, allocs[idx]);
10919 		}
10920 		for (idx = 0; idx < num_allocs; idx++) {
10921 			zfree(test_zone, allocs[idx]);
10922 		}
10923 
10924 		for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
10925 			allocs_pcpu[idx] = zalloc_percpu(test_pcpu_zone,
10926 			    Z_WAITOK | Z_ZERO);
10927 			assert(NULL != allocs_pcpu[idx]);
10928 		}
10929 		for (idx = 0; idx < PAGE_SIZE / sizeof(uint64_t); idx++) {
10930 			zfree_percpu(test_pcpu_zone, allocs_pcpu[idx]);
10931 		}
10932 
10933 		assert(!zone_pva_is_null(test_pcpu_zone->z_pageq_empty));
10934 
10935 		kmem_free(kernel_map, (vm_address_t)allocs_pcpu, PAGE_SIZE);
10936 
10937 		zdestroy(test_zone);
10938 		zdestroy(test_pcpu_zone);
10939 	}
10940 #endif /* KASAN_CLASSIC */
10941 
10942 	printf("zone_basic_test: Test passed\n");
10943 
10944 
10945 	*out = 1;
10946 out:
10947 	os_atomic_store(&any_zone_test_running, false, relaxed);
10948 	return rc;
10949 }
10950 SYSCTL_TEST_REGISTER(zone_basic_test, zone_basic_test_run);
10951 
10952 struct zone_stress_obj {
10953 	TAILQ_ENTRY(zone_stress_obj) zso_link;
10954 };
10955 
10956 struct zone_stress_ctx {
10957 	thread_t  zsc_leader;
10958 	lck_mtx_t zsc_lock;
10959 	zone_t    zsc_zone;
10960 	uint64_t  zsc_end;
10961 	uint32_t  zsc_workers;
10962 };
10963 
10964 static void
10965 zone_stress_worker(void *arg, wait_result_t __unused wr)
10966 {
10967 	struct zone_stress_ctx *ctx = arg;
10968 	bool leader = ctx->zsc_leader == current_thread();
10969 	TAILQ_HEAD(zone_stress_head, zone_stress_obj) head = TAILQ_HEAD_INITIALIZER(head);
10970 	struct zone_bool_gen bg = { };
10971 	struct zone_stress_obj *obj;
10972 	uint32_t allocs = 0;
10973 
10974 	random_bool_init(&bg.zbg_bg);
10975 
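	/*
	 * Until the deadline expires, each worker randomly mixes allocations
	 * inserted at the head or tail of its local list, frees from the head
	 * or tail, and (on the leader only) an occasional zone_gc().
	 */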
10976 	do {
10977 		for (int i = 0; i < 2000; i++) {
10978 			uint32_t what = random_bool_gen_bits(&bg.zbg_bg,
10979 			    bg.zbg_entropy, ZONE_ENTROPY_CNT, 3);
10980 			switch (what) {
10981 			case 0:
10982 			case 1:
10983 				if (allocs < 10000) {
10984 					obj = zalloc(ctx->zsc_zone);
10985 					TAILQ_INSERT_HEAD(&head, obj, zso_link);
10986 					allocs++;
10987 				}
10988 				break;
10989 			case 2:
10990 			case 3:
10991 				if (allocs < 10000) {
10992 					obj = zalloc(ctx->zsc_zone);
10993 					TAILQ_INSERT_TAIL(&head, obj, zso_link);
10994 					allocs++;
10995 				}
10996 				break;
10997 			case 4:
10998 				if (leader) {
10999 					zone_gc(ZONE_GC_DRAIN);
11000 				}
11001 				break;
11002 			case 5:
11003 			case 6:
11004 				if (!TAILQ_EMPTY(&head)) {
11005 					obj = TAILQ_FIRST(&head);
11006 					TAILQ_REMOVE(&head, obj, zso_link);
11007 					zfree(ctx->zsc_zone, obj);
11008 					allocs--;
11009 				}
11010 				break;
11011 			case 7:
11012 				if (!TAILQ_EMPTY(&head)) {
11013 					obj = TAILQ_LAST(&head, zone_stress_head);
11014 					TAILQ_REMOVE(&head, obj, zso_link);
11015 					zfree(ctx->zsc_zone, obj);
11016 					allocs--;
11017 				}
11018 				break;
11019 			}
11020 		}
11021 	} while (mach_absolute_time() < ctx->zsc_end);
11022 
11023 	while (!TAILQ_EMPTY(&head)) {
11024 		obj = TAILQ_FIRST(&head);
11025 		TAILQ_REMOVE(&head, obj, zso_link);
11026 		zfree(ctx->zsc_zone, obj);
11027 	}
11028 
11029 	lck_mtx_lock(&ctx->zsc_lock);
11030 	if (--ctx->zsc_workers == 0) {
11031 		thread_wakeup(ctx);
11032 	} else if (leader) {
11033 		while (ctx->zsc_workers) {
11034 			lck_mtx_sleep(&ctx->zsc_lock, LCK_SLEEP_DEFAULT, ctx,
11035 			    THREAD_UNINT);
11036 		}
11037 	}
11038 	lck_mtx_unlock(&ctx->zsc_lock);
11039 
11040 	if (!leader) {
11041 		thread_terminate_self();
11042 		__builtin_unreachable();
11043 	}
11044 }
11045 
11046 static int
11047 zone_stress_test_run(__unused int64_t in, int64_t *out)
11048 {
11049 	struct zone_stress_ctx ctx = {
11050 		.zsc_leader  = current_thread(),
11051 		.zsc_workers = 3,
11052 	};
11053 	kern_return_t kr;
11054 	thread_t th;
11055 
11056 	if (os_atomic_xchg(&any_zone_test_running, true, relaxed)) {
11057 		printf("zone_stress_test: Test already running.\n");
11058 		return EALREADY;
11059 	}
11060 
11061 	lck_mtx_init(&ctx.zsc_lock, &zone_locks_grp, LCK_ATTR_NULL);
11062 	ctx.zsc_zone = zone_create("test_zone_344", 344,
11063 	    ZC_DESTRUCTIBLE | ZC_NOCACHING);
11064 	assert(ctx.zsc_zone->z_chunk_pages > 1);
11065 
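	/* Run the stress for roughly five seconds of wall-clock time. */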
11066 	clock_interval_to_deadline(5, NSEC_PER_SEC, &ctx.zsc_end);
11067 
11068 	printf("zone_stress_test: Starting (leader %p)\n", current_thread());
11069 
11070 	os_atomic_inc(&zalloc_simulate_vm_pressure, relaxed);
11071 
11072 	for (uint32_t i = 1; i < ctx.zsc_workers; i++) {
11073 		kr = kernel_thread_start_priority(zone_stress_worker, &ctx,
11074 		    BASEPRI_DEFAULT, &th);
11075 		if (kr == KERN_SUCCESS) {
11076 			printf("zone_stress_test: thread %d: %p\n", i, th);
11077 			thread_deallocate(th);
11078 		} else {
11079 			ctx.zsc_workers--;
11080 		}
11081 	}
11082 
11083 	zone_stress_worker(&ctx, 0);
11084 
11085 	lck_mtx_destroy(&ctx.zsc_lock, &zone_locks_grp);
11086 
11087 	zdestroy(ctx.zsc_zone);
11088 
11089 	printf("zone_stress_test: Done\n");
11090 
11091 	*out = 1;
11092 	os_atomic_dec(&zalloc_simulate_vm_pressure, relaxed);
11093 	os_atomic_store(&any_zone_test_running, false, relaxed);
11094 	return 0;
11095 }
11096 SYSCTL_TEST_REGISTER(zone_stress_test, zone_stress_test_run);
11097 
11098 struct zone_gc_stress_obj {
11099 	STAILQ_ENTRY(zone_gc_stress_obj) zgso_link;
11100 	uintptr_t                        zgso_pad[63];
11101 };
11102 STAILQ_HEAD(zone_gc_stress_head, zone_gc_stress_obj);
11103 
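/*
 * With the 63-word pad, each object is 64 pointer-sized words
 * (512 bytes on LP64), so ZONE_GC_OBJ_PER_PAGE works out to 8 objects
 * per 4K page and 32 per 16K page.
 */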
11104 #define ZONE_GC_OBJ_PER_PAGE  (PAGE_SIZE / sizeof(struct zone_gc_stress_obj))
11105 
11106 KALLOC_TYPE_DEFINE(zone_gc_stress_zone, struct zone_gc_stress_obj, KT_DEFAULT);
11107 
11108 struct zone_gc_stress_ctx {
11109 	bool      zgsc_done;
11110 	lck_mtx_t zgsc_lock;
11111 	zone_t    zgsc_zone;
11112 	uint64_t  zgsc_end;
11113 	uint32_t  zgsc_workers;
11114 };
11115 
11116 static void
11117 zone_gc_stress_test_alloc_n(struct zone_gc_stress_head *head, size_t n)
11118 {
11119 	struct zone_gc_stress_obj *obj;
11120 
11121 	for (size_t i = 0; i < n; i++) {
11122 		obj = zalloc_flags(zone_gc_stress_zone, Z_WAITOK);
11123 		STAILQ_INSERT_TAIL(head, obj, zgso_link);
11124 	}
11125 }
11126 
11127 static void
11128 zone_gc_stress_test_free_n(struct zone_gc_stress_head *head)
11129 {
11130 	struct zone_gc_stress_obj *obj;
11131 
11132 	while ((obj = STAILQ_FIRST(head))) {
11133 		STAILQ_REMOVE_HEAD(head, zgso_link);
11134 		zfree(zone_gc_stress_zone, obj);
11135 	}
11136 }
11137 
11138 __dead2
11139 static void
11140 zone_gc_stress_worker(void *arg, wait_result_t __unused wr)
11141 {
11142 	struct zone_gc_stress_ctx *ctx = arg;
11143 	struct zone_gc_stress_head head = STAILQ_HEAD_INITIALIZER(head);
11144 
11145 	while (!ctx->zgsc_done) {
11146 		zone_gc_stress_test_alloc_n(&head, ZONE_GC_OBJ_PER_PAGE * 4);
11147 		zone_gc_stress_test_free_n(&head);
11148 	}
11149 
11150 	lck_mtx_lock(&ctx->zgsc_lock);
11151 	if (--ctx->zgsc_workers == 0) {
11152 		thread_wakeup(ctx);
11153 	}
11154 	lck_mtx_unlock(&ctx->zgsc_lock);
11155 
11156 	thread_terminate_self();
11157 	__builtin_unreachable();
11158 }
11159 
11160 static int
11161 zone_gc_stress_test_run(__unused int64_t in, int64_t *out)
11162 {
11163 	struct zone_gc_stress_head head = STAILQ_HEAD_INITIALIZER(head);
11164 	struct zone_gc_stress_ctx ctx = {
11165 		.zgsc_workers = 3,
11166 	};
11167 	kern_return_t kr;
11168 	thread_t th;
11169 
11170 	if (os_atomic_xchg(&any_zone_test_running, true, relaxed)) {
11171 		printf("zone_gc_stress_test: Test already running.\n");
11172 		return EALREADY;
11173 	}
11174 
11175 	lck_mtx_init(&ctx.zgsc_lock, &zone_locks_grp, LCK_ATTR_NULL);
11176 	lck_mtx_lock(&ctx.zgsc_lock);
11177 
11178 	printf("zone_gc_stress_test: Starting (leader %p)\n", current_thread());
11179 
11180 	os_atomic_inc(&zalloc_simulate_vm_pressure, relaxed);
11181 
11182 	for (uint32_t i = 0; i < ctx.zgsc_workers; i++) {
11183 		kr = kernel_thread_start_priority(zone_gc_stress_worker, &ctx,
11184 		    BASEPRI_DEFAULT, &th);
11185 		if (kr == KERN_SUCCESS) {
11186 			printf("zone_gc_stress_test: thread %d: %p\n", i, th);
11187 			thread_deallocate(th);
11188 		} else {
11189 			ctx.zgsc_workers--;
11190 		}
11191 	}
11192 
11193 	for (uint64_t i = 0; i < in; i++) {
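		/*
		 * Churn at least 20 pages worth of objects (and many free
		 * batches) per round so the TRIM reclaim below has work to do.
		 */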
11194 		size_t count = zc_mag_size() * zc_free_batch_size() * 10;
11195 
11196 		if (count < ZONE_GC_OBJ_PER_PAGE * 20) {
11197 			count = ZONE_GC_OBJ_PER_PAGE * 20;
11198 		}
11199 
11200 		zone_gc_stress_test_alloc_n(&head, count);
11201 		zone_gc_stress_test_free_n(&head);
11202 
11203 		lck_mtx_lock(&zone_gc_lock);
11204 		zone_reclaim(zone_gc_stress_zone->kt_zv.zv_zone,
11205 		    ZONE_RECLAIM_TRIM);
11206 		lck_mtx_unlock(&zone_gc_lock);
11207 
11208 		printf("zone_gc_stress_test: round %lld/%lld\n", i + 1, in);
11209 	}
11210 
11211 	os_atomic_thread_fence(seq_cst);
11212 	ctx.zgsc_done = true;
11213 	lck_mtx_sleep(&ctx.zgsc_lock, LCK_SLEEP_DEFAULT, &ctx, THREAD_UNINT);
11214 	lck_mtx_unlock(&ctx.zgsc_lock);
11215 
11216 	lck_mtx_destroy(&ctx.zgsc_lock, &zone_locks_grp);
11217 
11218 	lck_mtx_lock(&zone_gc_lock);
11219 	zone_reclaim(zone_gc_stress_zone->kt_zv.zv_zone,
11220 	    ZONE_RECLAIM_DRAIN);
11221 	lck_mtx_unlock(&zone_gc_lock);
11222 
11223 	printf("zone_gc_stress_test: Done\n");
11224 
11225 	*out = 1;
11226 	os_atomic_dec(&zalloc_simulate_vm_pressure, relaxed);
11227 	os_atomic_store(&any_zone_test_running, false, relaxed);
11228 	return 0;
11229 }
11230 SYSCTL_TEST_REGISTER(zone_gc_stress_test, zone_gc_stress_test_run);
11231 
11232 /*
11233  * Routines to test that zone garbage collection and zone replenish threads
11234  * running at the same time don't cause problems.
11235  */
11236 
11237 static int
11238 zone_gc_replenish_test(__unused int64_t in, int64_t *out)
11239 {
11240 	zone_gc(ZONE_GC_DRAIN);
11241 	*out = 1;
11242 	return 0;
11243 }
11244 SYSCTL_TEST_REGISTER(zone_gc_replenish_test, zone_gc_replenish_test);
11245 
11246 static int
11247 zone_alloc_replenish_test(__unused int64_t in, int64_t *out)
11248 {
11249 	zone_t z = vm_map_entry_zone;
11250 	struct data { struct data *next; } *node, *list = NULL;
11251 
11252 	if (z == NULL) {
11253 		printf("Couldn't find a replenish zone\n");
11254 		return EIO;
11255 	}
11256 
11257 	/* big enough to go past replenishment */
11258 	for (uint32_t i = 0; i < 10 * z->z_elems_rsv; ++i) {
11259 		node = zalloc(z);
11260 		node->next = list;
11261 		list = node;
11262 	}
11263 
11264 	/*
11265 	 * release the memory we allocated
11266 	 */
11267 	while (list != NULL) {
11268 		node = list;
11269 		list = list->next;
11270 		zfree(z, node);
11271 	}
11272 
11273 	*out = 1;
11274 	return 0;
11275 }
11276 SYSCTL_TEST_REGISTER(zone_alloc_replenish_test, zone_alloc_replenish_test);
11277 
11278 #endif /* DEBUG || DEVELOPMENT */
11279