/*
 * Copyright (c) 2016-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/* BEGIN CSTYLED */
/*
 * A region represents a collection of one or more similarly-sized memory
 * segments, each of which is a contiguous range of memory.  A segment
 * is either allocated or free, and is treated as disjoint from all other
 * segments.  That is, the contiguity applies only at the segment level,
 * and a region with multiple segments is not contiguous at the region level.
 * A segment always belongs to the segment freelist, or the allocated-address
 * hash chain, as described below.
 *
 * The optional SKMEM_REGION_CR_NOREDIRECT flag indicates that the region
 * stays intact even after a defunct.  Otherwise, the segments belonging
 * to the region will be freed at defunct time, and the span covered by
 * the region will be redirected to zero-filled anonymous memory.
 *
 * Memory for a region is always created as pageable and purgeable.  It is
 * the client's responsibility to prepare (wire) it, and optionally insert
 * it into the IOMMU, at segment construction time.  When the segment is
 * freed, the client is responsible for removing it from the IOMMU (if
 * needed), and completing (unwiring) it.
 *
 * When the region is created with SKMEM_REGION_CR_PERSISTENT, the memory
 * is immediately wired upon allocation (segment removed from freelist).
 * It gets unwired when memory is discarded (segment inserted to freelist).
 *
 * The chronological life cycle of a segment is as such:
 *
 *    SKSEG_STATE_DETACHED
 *        SKSEG_STATE_{MAPPED,MAPPED_WIRED}
 *            [segment allocated, useable by client]
 *              ...
 *            [client frees segment]
 *        SKSEG_STATE_{MAPPED,MAPPED_WIRED}
 *          [reclaim]
 *    SKSEG_STATE_DETACHED
 *
 * The region can also be marked as user-mappable (SKMEM_REGION_CR_MMAPOK);
 * this allows it to be further marked with SKMEM_REGION_CR_UREADONLY to
 * prevent modifications by the user task.  Only user-mappable regions will
 * be considered for inclusion during skmem_arena_mmap().
 *
 * Every skmem allocator has a region as its slab supplier.  Each slab is
 * exactly a segment.  The allocator uses skmem_region_{alloc,free}() to
 * create and destroy slabs.
 *
 * A region may be mirrored by another region; the latter acts as the master
 * controller for both regions.  Mirrored (slave) regions cannot be used
 * directly by the skmem allocator.  The region mirroring technique is used
 * for managing shadow objects {umd,kmd} and {usd,ksd}, where an object in
 * one region has the same size and lifetime as its shadow counterpart.
 *
 * CREATION/DESTRUCTION:
 *
 *   At creation time, all segments are allocated and are immediately inserted
 *   into the freelist.  Allocating a purgeable segment has very little cost,
 *   as it is not backed by physical memory until it is accessed.  Immediate
 *   insertion into the freelist causes the mapping to be further torn down.
 *
 *   At destruction time, the freelist is emptied, and each segment is then
 *   destroyed.  The system will assert if it detects there are outstanding
 *   segments not yet returned to the region (not freed by the client).
 *
 * ALLOCATION:
 *
 *   Allocating involves searching the freelist for a segment; if found, the
 *   segment is removed from the freelist and is inserted into the allocated-
 *   address hash chain.  The address of the memory object represented by
 *   the segment is used as the hash key.  The allocated-address hash chain
 *   is needed since we return the address of the memory object, and not
 *   the segment itself, to the client.
 *
 * DEALLOCATION:
 *
 *   Freeing a memory object causes the chain to be searched for a matching
 *   segment.  The system will assert if a segment cannot be found, since
 *   that indicates that the memory object address is invalid.  Once found,
 *   the segment is removed from the allocated-address hash chain, and is
 *   inserted to the freelist.
 *
 * Segment allocation and deallocation can be expensive.  Because of this,
 * we expect that most clients will utilize the skmem_cache slab allocator
 * as the frontend instead.
 */
/* END CSTYLED */
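
/* BEGIN CSTYLED */
/*
 * Illustrative sketch of the client contract described above; this is
 * not part of the implementation.  The "example" label is a placeholder,
 * the segment ctor/dtor callbacks are elided as NULL, and the trailing
 * -fbounds-safety object/mirror size arguments are omitted for brevity:
 *
 *    struct skmem_region_params srp;     // seeded from skmem_regions[]
 *    struct skmem_region *skr;
 *    struct sksegment *sg = NULL;
 *    void *obj;
 *
 *    skmem_region_params_config(&srp);   // compute effective sizes/counts
 *    skr = skmem_region_create("example", &srp, NULL, NULL, NULL);
 *    obj = skmem_region_alloc(skr, NULL, &sg, NULL, SKMEM_SLEEP, ...);
 *    ...                                 // obj spans one whole segment
 *    skmem_region_free(skr, obj, NULL);
 *    (void) skmem_region_release(skr);   // drops the caller's reference
 */
/* END CSTYLED */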

#include <skywalk/os_skywalk_private.h>
#define _FN_KPRINTF             /* don't redefine kprintf() */
#include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */

static void skmem_region_destroy(struct skmem_region *skr);
static void skmem_region_depopulate(struct skmem_region *);
static int sksegment_cmp(const struct sksegment *, const struct sksegment *);
static struct sksegment *sksegment_create(struct skmem_region *, uint32_t);
static void sksegment_destroy(struct skmem_region *, struct sksegment *);
static void sksegment_freelist_insert(struct skmem_region *,
    struct sksegment *, boolean_t);
static struct sksegment *sksegment_freelist_remove(struct skmem_region *,
    struct sksegment *, uint32_t, boolean_t);
static struct sksegment *sksegment_freelist_grow(struct skmem_region *);
static struct sksegment *sksegment_alloc_with_idx(struct skmem_region *,
    uint32_t);
static void *__sized_by(seg_size) skmem_region_alloc_common(struct skmem_region *,
    struct sksegment *, uint32_t seg_size);
static void *__sized_by(seg_size) skmem_region_mirror_alloc(struct skmem_region *,
    struct sksegment *, uint32_t seg_size, struct sksegment **);
static void skmem_region_applyall(void (*)(struct skmem_region *));
static void skmem_region_update(struct skmem_region *);
static void skmem_region_update_func(thread_call_param_t, thread_call_param_t);
static inline void skmem_region_retain_locked(struct skmem_region *);
static inline boolean_t skmem_region_release_locked(struct skmem_region *);
static int skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS;

RB_PROTOTYPE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp);
RB_GENERATE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp);

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, region,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, skmem_region_mib_get_sysctl, "S,sk_stats_region",
    "Skywalk region statistics");

static LCK_ATTR_DECLARE(skmem_region_lock_attr, 0, 0);
static LCK_GRP_DECLARE(skmem_region_lock_grp, "skmem_region");
static LCK_MTX_DECLARE_ATTR(skmem_region_lock, &skmem_region_lock_grp,
    &skmem_region_lock_attr);

/* protected by skmem_region_lock */
static TAILQ_HEAD(, skmem_region) skmem_region_head;

static thread_call_t skmem_region_update_tc;

#define SKMEM_REGION_UPDATE_INTERVAL    13      /* 13 seconds */
static uint32_t skmem_region_update_interval = SKMEM_REGION_UPDATE_INTERVAL;

#define SKMEM_WDT_MAXTIME               30      /* # of secs before watchdog */
#define SKMEM_WDT_PURGE                 3       /* retry purge threshold */

#if (DEVELOPMENT || DEBUG)
/* Mean Time Between Failures (ms) */
static volatile uint64_t skmem_region_mtbf;

static int skmem_region_mtbf_sysctl(struct sysctl_oid *, void *, int,
    struct sysctl_req *);

SYSCTL_PROC(_kern_skywalk_mem, OID_AUTO, region_mtbf,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
    skmem_region_mtbf_sysctl, "Q", "Region MTBF (ms)");

SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, region_update_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_region_update_interval,
    SKMEM_REGION_UPDATE_INTERVAL, "Region update interval (sec)");
#endif /* (DEVELOPMENT || DEBUG) */

#define SKMEM_REGION_LOCK()                     \
	lck_mtx_lock(&skmem_region_lock)
#define SKMEM_REGION_LOCK_ASSERT_HELD()         \
	LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_OWNED)
#define SKMEM_REGION_LOCK_ASSERT_NOTHELD()      \
	LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKMEM_REGION_UNLOCK()                   \
	lck_mtx_unlock(&skmem_region_lock)

/*
 * Hash table bounds.  Start with the initial value, and rescale up to
 * the specified limit.  Ideally we don't need a limit, but in practice
 * this helps guard against runaways.  These values should be revisited
 * in the future and adjusted as needed.
 */
#define SKMEM_REGION_HASH_INITIAL       32      /* initial hash table size */
#define SKMEM_REGION_HASH_LIMIT         4096    /* hash table size limit */

#define SKMEM_REGION_HASH_INDEX(_a, _s, _m)     \
	(((_a) + ((_a) >> (_s)) + ((_a) >> ((_s) << 1))) & (_m))
#define SKMEM_REGION_HASH(_skr, _addr)                                     \
	(&(_skr)->skr_hash_table[SKMEM_REGION_HASH_INDEX((uintptr_t)_addr, \
	    (_skr)->skr_hash_shift, (_skr)->skr_hash_mask)])
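
/*
 * Example with assumed values: for a 32 KB segment size, skr_hash_shift
 * is flsll(32768) - 1 == 15, so a segment address A is bucketed at
 * (A + (A >> 15) + (A >> 30)) & skr_hash_mask.  The shifted terms fold
 * the segment-granularity bits of A into the low bits used for bucket
 * selection, so addresses that differ only by whole segments don't all
 * collide in one bucket.
 */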

static SKMEM_TYPE_DEFINE(skr_zone, struct skmem_region);

/*
 * XXX: This is used in only one function (skmem_region_init) after the
 * -fbounds-safety changes were made for Skmem. We can remove this global and
 * just make it a local variable to the function (skmem_region_init).
 */
static unsigned int sg_size;                    /* size of zone element */
static struct skmem_cache *skmem_sg_cache;      /* cache for sksegment */

static uint32_t skmem_seg_size = SKMEM_SEG_SIZE;
static uint32_t skmem_md_seg_size = SKMEM_MD_SEG_SIZE;
static uint32_t skmem_drv_buf_seg_size = SKMEM_DRV_BUF_SEG_SIZE;
static uint32_t skmem_drv_buf_seg_eff_size = SKMEM_DRV_BUF_SEG_SIZE;
uint32_t skmem_usr_buf_seg_size = SKMEM_USR_BUF_SEG_SIZE;

#define SKMEM_TAG_SEGMENT_BMAP  "com.apple.skywalk.segment.bmap"
static SKMEM_TAG_DEFINE(skmem_tag_segment_bmap, SKMEM_TAG_SEGMENT_BMAP);

#define SKMEM_TAG_SEGMENT_HASH  "com.apple.skywalk.segment.hash"
static SKMEM_TAG_DEFINE(skmem_tag_segment_hash, SKMEM_TAG_SEGMENT_HASH);

#define SKMEM_TAG_REGION_MIB     "com.apple.skywalk.region.mib"
static SKMEM_TAG_DEFINE(skmem_tag_region_mib, SKMEM_TAG_REGION_MIB);

#define BMAPSZ  64

/* 64-bit mask with range */
#define BMASK64(_beg, _end)     \
	((((uint64_t)-1) >> ((BMAPSZ - 1) - (_end))) & ~((1ULL << (_beg)) - 1))
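
/*
 * For example, BMASK64(2, 5) == 0x3c: bits 2 through 5 (inclusive on
 * both ends) are set, and everything below _beg or above _end is clear.
 */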

static int __skmem_region_inited = 0;

/*
 * XXX -fbounds-safety: we added seg_size to skmem_region_alloc_common(), but
 * this is only used by -fbounds-safety, so we add __unused if -fbounds-safety
 * is disabled. The utility macro for that is SK_FB_ARG().
 * We do the same for skmem_region_alloc(), with objsize.
 */
#if !__has_ptrcheck
#define SK_FB_ARG __unused
#else
#define SK_FB_ARG
#endif

void
skmem_region_init(void)
{
	boolean_t randomize_seg_size;

	_CASSERT(sizeof(bitmap_t) == sizeof(uint64_t));
	_CASSERT(BMAPSZ == (sizeof(bitmap_t) << 3));
	_CASSERT((SKMEM_SEG_SIZE % SKMEM_PAGE_SIZE) == 0);
	_CASSERT(SKMEM_REGION_HASH_LIMIT >= SKMEM_REGION_HASH_INITIAL);
	ASSERT(!__skmem_region_inited);

	/* enforce the ordering here */
	_CASSERT(SKMEM_REGION_GUARD_HEAD == 0);
	_CASSERT(SKMEM_REGION_SCHEMA == 1);
	_CASSERT(SKMEM_REGION_RING == 2);
	_CASSERT(SKMEM_REGION_BUF_DEF == 3);
	_CASSERT(SKMEM_REGION_BUF_LARGE == 4);
	_CASSERT(SKMEM_REGION_RXBUF_DEF == 5);
	_CASSERT(SKMEM_REGION_RXBUF_LARGE == 6);
	_CASSERT(SKMEM_REGION_TXBUF_DEF == 7);
	_CASSERT(SKMEM_REGION_TXBUF_LARGE == 8);
	_CASSERT(SKMEM_REGION_UMD == 9);
	_CASSERT(SKMEM_REGION_TXAUSD == 10);
	_CASSERT(SKMEM_REGION_RXFUSD == 11);
	_CASSERT(SKMEM_REGION_UBFT == 12);
	_CASSERT(SKMEM_REGION_USTATS == 13);
	_CASSERT(SKMEM_REGION_FLOWADV == 14);
	_CASSERT(SKMEM_REGION_NEXUSADV == 15);
	_CASSERT(SKMEM_REGION_SYSCTLS == 16);
	_CASSERT(SKMEM_REGION_GUARD_TAIL == 17);
	_CASSERT(SKMEM_REGION_KMD == 18);
	_CASSERT(SKMEM_REGION_RXKMD == 19);
	_CASSERT(SKMEM_REGION_TXKMD == 20);
	_CASSERT(SKMEM_REGION_KBFT == 21);
	_CASSERT(SKMEM_REGION_RXKBFT == 22);
	_CASSERT(SKMEM_REGION_TXKBFT == 23);
	_CASSERT(SKMEM_REGION_TXAKSD == 24);
	_CASSERT(SKMEM_REGION_RXFKSD == 25);
	_CASSERT(SKMEM_REGION_KSTATS == 26);
	_CASSERT(SKMEM_REGION_INTRINSIC == 27);

	_CASSERT(SREG_GUARD_HEAD == SKMEM_REGION_GUARD_HEAD);
	_CASSERT(SREG_SCHEMA == SKMEM_REGION_SCHEMA);
	_CASSERT(SREG_RING == SKMEM_REGION_RING);
	_CASSERT(SREG_BUF_DEF == SKMEM_REGION_BUF_DEF);
	_CASSERT(SREG_BUF_LARGE == SKMEM_REGION_BUF_LARGE);
	_CASSERT(SREG_RXBUF_DEF == SKMEM_REGION_RXBUF_DEF);
	_CASSERT(SREG_RXBUF_LARGE == SKMEM_REGION_RXBUF_LARGE);
	_CASSERT(SREG_TXBUF_DEF == SKMEM_REGION_TXBUF_DEF);
	_CASSERT(SREG_TXBUF_LARGE == SKMEM_REGION_TXBUF_LARGE);
	_CASSERT(SREG_UMD == SKMEM_REGION_UMD);
	_CASSERT(SREG_TXAUSD == SKMEM_REGION_TXAUSD);
	_CASSERT(SREG_RXFUSD == SKMEM_REGION_RXFUSD);
	_CASSERT(SREG_UBFT == SKMEM_REGION_UBFT);
	_CASSERT(SREG_USTATS == SKMEM_REGION_USTATS);
	_CASSERT(SREG_FLOWADV == SKMEM_REGION_FLOWADV);
	_CASSERT(SREG_NEXUSADV == SKMEM_REGION_NEXUSADV);
	_CASSERT(SREG_SYSCTLS == SKMEM_REGION_SYSCTLS);
	_CASSERT(SREG_GUARD_TAIL == SKMEM_REGION_GUARD_TAIL);
	_CASSERT(SREG_KMD == SKMEM_REGION_KMD);
	_CASSERT(SREG_RXKMD == SKMEM_REGION_RXKMD);
	_CASSERT(SREG_TXKMD == SKMEM_REGION_TXKMD);
	_CASSERT(SREG_KBFT == SKMEM_REGION_KBFT);
	_CASSERT(SREG_RXKBFT == SKMEM_REGION_RXKBFT);
	_CASSERT(SREG_TXKBFT == SKMEM_REGION_TXKBFT);
	_CASSERT(SREG_TXAKSD == SKMEM_REGION_TXAKSD);
	_CASSERT(SREG_RXFKSD == SKMEM_REGION_RXFKSD);
	_CASSERT(SREG_KSTATS == SKMEM_REGION_KSTATS);

	_CASSERT(SKR_MODE_NOREDIRECT == SREG_MODE_NOREDIRECT);
	_CASSERT(SKR_MODE_MMAPOK == SREG_MODE_MMAPOK);
	_CASSERT(SKR_MODE_UREADONLY == SREG_MODE_UREADONLY);
	_CASSERT(SKR_MODE_KREADONLY == SREG_MODE_KREADONLY);
	_CASSERT(SKR_MODE_PERSISTENT == SREG_MODE_PERSISTENT);
	_CASSERT(SKR_MODE_MONOLITHIC == SREG_MODE_MONOLITHIC);
	_CASSERT(SKR_MODE_NOMAGAZINES == SREG_MODE_NOMAGAZINES);
	_CASSERT(SKR_MODE_NOCACHE == SREG_MODE_NOCACHE);
	_CASSERT(SKR_MODE_IODIR_IN == SREG_MODE_IODIR_IN);
	_CASSERT(SKR_MODE_IODIR_OUT == SREG_MODE_IODIR_OUT);
	_CASSERT(SKR_MODE_GUARD == SREG_MODE_GUARD);
	_CASSERT(SKR_MODE_SEGPHYSCONTIG == SREG_MODE_SEGPHYSCONTIG);
	_CASSERT(SKR_MODE_SHAREOK == SREG_MODE_SHAREOK);
	_CASSERT(SKR_MODE_PUREDATA == SREG_MODE_PUREDATA);
	_CASSERT(SKR_MODE_PSEUDO == SREG_MODE_PSEUDO);
	_CASSERT(SKR_MODE_THREADSAFE == SREG_MODE_THREADSAFE);
	_CASSERT(SKR_MODE_SLAB == SREG_MODE_SLAB);
	_CASSERT(SKR_MODE_MIRRORED == SREG_MODE_MIRRORED);

	(void) PE_parse_boot_argn("skmem_seg_size", &skmem_seg_size,
	    sizeof(skmem_seg_size));
	if (skmem_seg_size < SKMEM_MIN_SEG_SIZE) {
		skmem_seg_size = SKMEM_MIN_SEG_SIZE;
	}
	skmem_seg_size = (uint32_t)P2ROUNDUP(skmem_seg_size,
	    SKMEM_MIN_SEG_SIZE);
	VERIFY(skmem_seg_size != 0 && (skmem_seg_size % SKMEM_PAGE_SIZE) == 0);

	(void) PE_parse_boot_argn("skmem_md_seg_size", &skmem_md_seg_size,
	    sizeof(skmem_md_seg_size));
	if (skmem_md_seg_size < skmem_seg_size) {
		skmem_md_seg_size = skmem_seg_size;
	}
	skmem_md_seg_size = (uint32_t)P2ROUNDUP(skmem_md_seg_size,
	    SKMEM_MIN_SEG_SIZE);
	VERIFY((skmem_md_seg_size % SKMEM_PAGE_SIZE) == 0);

	/*
	 * If set via boot-args, honor it and don't randomize.
	 */
	randomize_seg_size = !PE_parse_boot_argn("skmem_drv_buf_seg_size",
	    &skmem_drv_buf_seg_size, sizeof(skmem_drv_buf_seg_size));
	if (skmem_drv_buf_seg_size < skmem_seg_size) {
		skmem_drv_buf_seg_size = skmem_seg_size;
	}
	skmem_drv_buf_seg_size = skmem_drv_buf_seg_eff_size =
	    (uint32_t)P2ROUNDUP(skmem_drv_buf_seg_size, SKMEM_MIN_SEG_SIZE);
	VERIFY((skmem_drv_buf_seg_size % SKMEM_PAGE_SIZE) == 0);

	/*
	 * Randomize the driver buffer segment size; here we choose
	 * a SKMEM_MIN_SEG_SIZE multiplier to bump up the value to.
	 * Set this as the effective driver buffer segment size.
	 */
	if (randomize_seg_size) {
		uint32_t sm;
		read_frandom(&sm, sizeof(sm));
		skmem_drv_buf_seg_eff_size +=
		    (SKMEM_MIN_SEG_SIZE * (sm % SKMEM_DRV_BUF_SEG_MULTIPLIER));
		VERIFY((skmem_drv_buf_seg_eff_size % SKMEM_MIN_SEG_SIZE) == 0);
	}
	VERIFY(skmem_drv_buf_seg_eff_size >= skmem_drv_buf_seg_size);

	(void) PE_parse_boot_argn("skmem_usr_buf_seg_size",
	    &skmem_usr_buf_seg_size, sizeof(skmem_usr_buf_seg_size));
	if (skmem_usr_buf_seg_size < skmem_seg_size) {
		skmem_usr_buf_seg_size = skmem_seg_size;
	}
	skmem_usr_buf_seg_size = (uint32_t)P2ROUNDUP(skmem_usr_buf_seg_size,
	    SKMEM_MIN_SEG_SIZE);
	VERIFY((skmem_usr_buf_seg_size % SKMEM_PAGE_SIZE) == 0);

	SK_ERR("seg_size %u, md_seg_size %u, drv_buf_seg_size %u [eff %u], "
	    "usr_buf_seg_size %u", skmem_seg_size, skmem_md_seg_size,
	    skmem_drv_buf_seg_size, skmem_drv_buf_seg_eff_size,
	    skmem_usr_buf_seg_size);

	TAILQ_INIT(&skmem_region_head);

	skmem_region_update_tc =
	    thread_call_allocate_with_options(skmem_region_update_func,
	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (skmem_region_update_tc == NULL) {
		panic("%s: thread_call_allocate failed", __func__);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	sg_size = sizeof(struct sksegment);
	skmem_sg_cache = skmem_cache_create("sg", sg_size,
	    sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);

	/* and start the periodic region update machinery */
	skmem_dispatch(skmem_region_update_tc, NULL,
	    (skmem_region_update_interval * NSEC_PER_SEC));

	__skmem_region_inited = 1;
}

void
skmem_region_fini(void)
{
	if (__skmem_region_inited) {
		ASSERT(TAILQ_EMPTY(&skmem_region_head));

		if (skmem_region_update_tc != NULL) {
			(void) thread_call_cancel_wait(skmem_region_update_tc);
			(void) thread_call_free(skmem_region_update_tc);
			skmem_region_update_tc = NULL;
		}

		if (skmem_sg_cache != NULL) {
			skmem_cache_destroy(skmem_sg_cache);
			skmem_sg_cache = NULL;
		}

		__skmem_region_inited = 0;
	}
}

/*
 * Reap internal caches.
 */
void
skmem_region_reap_caches(boolean_t purge)
{
	skmem_cache_reap_now(skmem_sg_cache, purge);
}

/*
 * Configure and compute the parameters of a region.
 */
void
skmem_region_params_config(struct skmem_region_params *srp)
{
	uint32_t cache_line_size = skmem_cpu_cache_line_size();
	size_t seglim, segsize, segcnt;
	size_t objsize, objcnt;

	ASSERT(srp->srp_id < SKMEM_REGIONS);

	/*
	 * If the magazines layer is disabled system-wide, override
	 * the region parameter here.  This will effectively reduce
	 * the number of requested objects computed below.  Note that
	 * the region may have already been configured to exclude
	 * magazines in the default skmem_regions[] array.
	 */
	if (!skmem_allow_magazines()) {
		srp->srp_cflags |= SKMEM_REGION_CR_NOMAGAZINES;
	}

	objsize = srp->srp_r_obj_size;
	ASSERT(objsize != 0);
	objcnt = srp->srp_r_obj_cnt;
	ASSERT(objcnt != 0);

	if (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO) {
		size_t align = srp->srp_align;

		VERIFY(align != 0 && (align % SKMEM_CACHE_ALIGN) == 0);
		VERIFY(powerof2(align));
		objsize = MAX(objsize, sizeof(uint64_t));
#if KASAN
		/*
		 * When KASAN is enabled, the zone allocator adjusts the
		 * element size to include the redzone regions, in which
		 * case we assume that the elements won't start on the
		 * alignment boundary and thus need to do some fix-ups.
		 * These include increasing the effective object size
		 * which adds at least 16 bytes to the original size.
		 */
		objsize += sizeof(uint64_t) + align;
#endif /* KASAN */
		objsize = P2ROUNDUP(objsize, align);

		segsize = objsize;
		srp->srp_r_seg_size = (uint32_t)segsize;
		segcnt = objcnt;
		goto done;
	} else {
		/* objects are always aligned at CPU cache line size */
		srp->srp_align = cache_line_size;
	}

	/*
	 * Start with default segment size for the region, and compute the
	 * effective segment size (to nearest SKMEM_MIN_SEG_SIZE).  If the
	 * object size is greater, then we adjust the segment size to next
	 * multiple of the effective size larger than the object size.
	 */
	if (srp->srp_r_seg_size == 0) {
		switch (srp->srp_id) {
		case SKMEM_REGION_UMD:
		case SKMEM_REGION_KMD:
		case SKMEM_REGION_RXKMD:
		case SKMEM_REGION_TXKMD:
			srp->srp_r_seg_size = skmem_md_seg_size;
			break;

		case SKMEM_REGION_BUF_DEF:
		case SKMEM_REGION_RXBUF_DEF:
		case SKMEM_REGION_TXBUF_DEF:
			/*
			 * Use the effective driver buffer segment size,
			 * since it reflects any randomization done at
			 * skmem_region_init() time.
			 */
			srp->srp_r_seg_size = skmem_drv_buf_seg_eff_size;
			break;

		default:
			srp->srp_r_seg_size = skmem_seg_size;
			break;
		}
	} else {
		srp->srp_r_seg_size = (uint32_t)P2ROUNDUP(srp->srp_r_seg_size,
		    SKMEM_MIN_SEG_SIZE);
	}

	seglim = srp->srp_r_seg_size;
	VERIFY(seglim != 0 && (seglim % SKMEM_PAGE_SIZE) == 0);

	SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu objcnt %zu",
	    srp->srp_name, seglim, objsize, objcnt);

	/*
	 * Make sure the object size is a multiple of the CPU cache line
	 * size, and that it evenly divides the segment size.
	 */
	if (!((objsize < cache_line_size) && (objsize < seglim) &&
	    ((cache_line_size % objsize) == 0) && ((seglim % objsize) == 0))) {
		objsize = P2ROUNDUP(objsize, cache_line_size);
		while (objsize < seglim && (seglim % objsize) != 0) {
			SK_DF(SK_VERB_MEM, "%s: objsize %zu -> %zu",
			    srp->srp_name, objsize, objsize + cache_line_size);
			objsize += cache_line_size;
		}
	}

	/* segment must be at least as large as the object */
	while (objsize > seglim) {
		SK_DF(SK_VERB_MEM, "%s: seglim %zu -> %zu", srp->srp_name,
		    seglim, seglim + SKMEM_MIN_SEG_SIZE);
		seglim += SKMEM_MIN_SEG_SIZE;
	}

	/*
	 * Take into account worst-case per-CPU cached
	 * objects if this region is configured for it.
	 */
	if (!(srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES)) {
		uint32_t magazine_max_objs =
		    skmem_cache_magazine_max((uint32_t)objsize);
		SK_DF(SK_VERB_MEM, "%s: objcnt %zu -> %zu", srp->srp_name,
		    objcnt, objcnt + magazine_max_objs);
		objcnt += magazine_max_objs;
	}

	SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu "
	    "objcnt %zu", srp->srp_name, seglim, objsize, objcnt);

	segsize = P2ROUNDUP(objsize * objcnt, SKMEM_MIN_SEG_SIZE);
	if (seglim > segsize) {
		/*
		 * If the segment limit is larger than what we need,
		 * avoid memory wastage by shrinking it.
		 */
		while (seglim > segsize && seglim > SKMEM_MIN_SEG_SIZE) {
			VERIFY(seglim >= SKMEM_MIN_SEG_SIZE);
			SK_DF(SK_VERB_MEM,
			    "%s: segsize %zu (%zu*%zu) seglim [-] %zu -> %zu",
			    srp->srp_name, segsize, objsize, objcnt, seglim,
			    P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE,
			    SKMEM_MIN_SEG_SIZE));
			seglim = P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE,
			    SKMEM_MIN_SEG_SIZE);
		}

		/* adjust segment size */
		segsize = seglim;
	} else if (seglim < segsize) {
		size_t oseglim = seglim;
		/*
		 * If the segment limit is less than the segment size,
		 * see if increasing it slightly (up to 1.5x the segment
		 * size) would allow us to avoid allocating too many
		 * extra objects (due to excessive segment count).
		 */
		while (seglim < segsize && (segsize % seglim) != 0) {
			SK_DF(SK_VERB_MEM,
			    "%s: segsize %zu (%zu*%zu) seglim [+] %zu -> %zu",
			    srp->srp_name, segsize, objsize, objcnt, seglim,
			    (seglim + SKMEM_MIN_SEG_SIZE));
			seglim += SKMEM_MIN_SEG_SIZE;
			if (seglim >= (oseglim + (oseglim >> 1))) {
				break;
			}
		}

		/* can't use P2ROUNDUP since seglim may not be power of 2 */
		segsize = SK_ROUNDUP(segsize, seglim);
	}
	ASSERT(segsize != 0 && (segsize % seglim) == 0);

	SK_DF(SK_VERB_MEM, "%s: segsize %zu seglim %zu",
	    srp->srp_name, segsize, seglim);

	/* compute segment count, and recompute segment size */
	if (srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) {
		segcnt = 1;
	} else {
		/*
		 * The adjustments above were done in increments of
		 * SKMEM_MIN_SEG_SIZE.  If the object size is greater
		 * than that, ensure that the segment size is a multiple
		 * of the object size.
		 */
		if (objsize > SKMEM_MIN_SEG_SIZE) {
			ASSERT(seglim >= objsize);
			if ((seglim % objsize) != 0) {
				seglim += (seglim - objsize);
			}
			/* recompute segsize; see SK_ROUNDUP comment above */
			segsize = SK_ROUNDUP(segsize, seglim);
		}

		segcnt = MAX(1, (segsize / seglim));
		segsize /= segcnt;
	}

	SK_DF(SK_VERB_MEM, "%s: segcnt %zu segsize %zu",
	    srp->srp_name, segcnt, segsize);

	/* recompute object count to avoid wastage */
	objcnt = (segsize * segcnt) / objsize;
	ASSERT(objcnt != 0);
done:
	srp->srp_c_obj_size = (uint32_t)objsize;
	srp->srp_c_obj_cnt = (uint32_t)objcnt;
	srp->srp_c_seg_size = (uint32_t)segsize;
	srp->srp_seg_cnt = (uint32_t)segcnt;

	SK_DF(SK_VERB_MEM, "%s: objsize %zu objcnt %zu segcnt %zu segsize %zu",
	    srp->srp_name, objsize, objcnt, segcnt, segsize);

#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		char label[32];
		(void) snprintf(label, sizeof(label), "REGION_%s:",
		    skmem_region_id2name(srp->srp_id));
		SK_D("%-16s o:[%4u x %6u -> %4u x %6u]", label,
		    (uint32_t)srp->srp_r_obj_cnt,
		    (uint32_t)srp->srp_r_obj_size,
		    (uint32_t)srp->srp_c_obj_cnt,
		    (uint32_t)srp->srp_c_obj_size);
	}
#endif /* SK_LOG */
}

/*
 * Create a region.
 */
struct skmem_region *
skmem_region_create(const char *name, struct skmem_region_params *srp,
    sksegment_ctor_fn_t ctor, sksegment_dtor_fn_t dtor, void *private)
{
	boolean_t pseudo = (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO);
	uint32_t cflags = srp->srp_cflags;
	struct skmem_region *skr;
	uint32_t i;

	ASSERT(srp->srp_id < SKMEM_REGIONS);
	ASSERT(srp->srp_c_seg_size != 0 &&
	    (pseudo || (srp->srp_c_seg_size % SKMEM_PAGE_SIZE) == 0));
	ASSERT(srp->srp_seg_cnt != 0);
	ASSERT(srp->srp_c_obj_cnt == 1 ||
	    (srp->srp_c_seg_size % srp->srp_c_obj_size) == 0);
	ASSERT(srp->srp_c_obj_size <= srp->srp_c_seg_size);

	skr = zalloc_flags(skr_zone, Z_WAITOK | Z_ZERO);
	skr->skr_params.srp_r_seg_size = srp->srp_r_seg_size;
	skr->skr_seg_size = srp->srp_c_seg_size;
	skr->skr_size = (srp->srp_c_seg_size * srp->srp_seg_cnt);
	skr->skr_seg_objs = (srp->srp_c_seg_size / srp->srp_c_obj_size);

	if (!pseudo) {
		skr->skr_seg_max_cnt = srp->srp_seg_cnt;

		/* set alignment to CPU cache line size */
		skr->skr_params.srp_align = skmem_cpu_cache_line_size();

		/* allocate the allocated-address hash chain */
		skr->skr_hash_initial = SKMEM_REGION_HASH_INITIAL;
		skr->skr_hash_limit = SKMEM_REGION_HASH_LIMIT;
		uint32_t size = skr->skr_hash_initial;
		skr->skr_hash_table = sk_alloc_type_array(struct sksegment_bkt,
		    size, Z_WAITOK | Z_NOFAIL,
		    skmem_tag_segment_hash);
		skr->skr_hash_size = size;
		skr->skr_hash_mask = (skr->skr_hash_initial - 1);
		skr->skr_hash_shift = flsll(srp->srp_c_seg_size) - 1;

		for (i = 0; i < (skr->skr_hash_mask + 1); i++) {
			TAILQ_INIT(&skr->skr_hash_table[i].sgb_head);
		}
	} else {
		/* this upper bound doesn't apply */
		skr->skr_seg_max_cnt = 0;

		/* pick up value set by skmem_region_params_config() */
		skr->skr_params.srp_align = srp->srp_align;
	}

	skr->skr_r_obj_size = srp->srp_r_obj_size;
	skr->skr_r_obj_cnt = srp->srp_r_obj_cnt;
	skr->skr_c_obj_size = srp->srp_c_obj_size;
	skr->skr_c_obj_cnt = srp->srp_c_obj_cnt;

	skr->skr_params.srp_md_type = srp->srp_md_type;
	skr->skr_params.srp_md_subtype = srp->srp_md_subtype;
	skr->skr_params.srp_max_frags = srp->srp_max_frags;

	skr->skr_seg_ctor = ctor;
	skr->skr_seg_dtor = dtor;
	skr->skr_private = private;

	lck_mtx_init(&skr->skr_lock, &skmem_region_lock_grp,
	    &skmem_region_lock_attr);

	TAILQ_INIT(&skr->skr_seg_free);
	RB_INIT(&skr->skr_seg_tfree);

	skr->skr_id = srp->srp_id;
	uuid_generate_random(skr->skr_uuid);
	(void) snprintf(skr->skr_name, sizeof(skr->skr_name),
	    "%s.%s.%s", SKMEM_REGION_PREFIX, srp->srp_name, name);

	SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ",
	    skr->skr_name, SK_KVA(skr));

	/* sanity check */
	ASSERT(!(cflags & SKMEM_REGION_CR_GUARD) ||
	    !(cflags & (SKMEM_REGION_CR_KREADONLY | SKMEM_REGION_CR_UREADONLY |
	    SKMEM_REGION_CR_PERSISTENT | SKMEM_REGION_CR_SHAREOK |
	    SKMEM_REGION_CR_IODIR_IN | SKMEM_REGION_CR_IODIR_OUT |
	    SKMEM_REGION_CR_PUREDATA)));

	skr->skr_cflags = cflags;
	if (cflags & SKMEM_REGION_CR_NOREDIRECT) {
		skr->skr_mode |= SKR_MODE_NOREDIRECT;
	}
	if (cflags & SKMEM_REGION_CR_MMAPOK) {
		skr->skr_mode |= SKR_MODE_MMAPOK;
	}
	if ((cflags & SKMEM_REGION_CR_MMAPOK) &&
	    (cflags & SKMEM_REGION_CR_UREADONLY)) {
		skr->skr_mode |= SKR_MODE_UREADONLY;
	}
	if (cflags & SKMEM_REGION_CR_KREADONLY) {
		skr->skr_mode |= SKR_MODE_KREADONLY;
	}
	if (cflags & SKMEM_REGION_CR_PERSISTENT) {
		skr->skr_mode |= SKR_MODE_PERSISTENT;
	}
	if (cflags & SKMEM_REGION_CR_MONOLITHIC) {
		skr->skr_mode |= SKR_MODE_MONOLITHIC;
	}
	if (cflags & SKMEM_REGION_CR_NOMAGAZINES) {
		skr->skr_mode |= SKR_MODE_NOMAGAZINES;
	}
	if (cflags & SKMEM_REGION_CR_NOCACHE) {
		skr->skr_mode |= SKR_MODE_NOCACHE;
	}
	if (cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) {
		skr->skr_mode |= SKR_MODE_SEGPHYSCONTIG;
	}
	if (cflags & SKMEM_REGION_CR_SHAREOK) {
		skr->skr_mode |= SKR_MODE_SHAREOK;
	}
	if (cflags & SKMEM_REGION_CR_IODIR_IN) {
		skr->skr_mode |= SKR_MODE_IODIR_IN;
	}
	if (cflags & SKMEM_REGION_CR_IODIR_OUT) {
		skr->skr_mode |= SKR_MODE_IODIR_OUT;
	}
	if (cflags & SKMEM_REGION_CR_GUARD) {
		skr->skr_mode |= SKR_MODE_GUARD;
	}
	if (cflags & SKMEM_REGION_CR_PUREDATA) {
		skr->skr_mode |= SKR_MODE_PUREDATA;
	}
	if (cflags & SKMEM_REGION_CR_PSEUDO) {
		skr->skr_mode |= SKR_MODE_PSEUDO;
	}
	if (cflags & SKMEM_REGION_CR_THREADSAFE) {
		skr->skr_mode |= SKR_MODE_THREADSAFE;
	}
	if (cflags & SKMEM_REGION_CR_MEMTAG) {
		skr->skr_mode |= SKR_MODE_MEMTAG;
	}

#if XNU_TARGET_OS_OSX
	/*
	 * Mark all regions as persistent except for the guard and Intrinsic
	 * regions.
	 * This is to ensure that kernel threads won't be faulting-in while
	 * accessing these memory regions. We have observed various kinds of
	 * kernel panics due to kernel threads faulting on non-wired memory
	 * access when the VM subsystem is not in a state to swap-in the page.
	 */
	if (!((skr->skr_mode & SKR_MODE_PSEUDO) ||
	    (skr->skr_mode & SKR_MODE_GUARD))) {
		skr->skr_mode |= SKR_MODE_PERSISTENT;
	}
#endif /* XNU_TARGET_OS_OSX */

	/* SKR_MODE_UREADONLY only takes effect for user task mapping */
	skr->skr_bufspec.user_writable = !(skr->skr_mode & SKR_MODE_UREADONLY);
	skr->skr_bufspec.kernel_writable = !(skr->skr_mode & SKR_MODE_KREADONLY);
	/* Regions containing pointers are wired (i.e. neither pageable nor purgeable) */
	skr->skr_bufspec.purgeable = !(skr->skr_mode & SKR_MODE_MEMTAG);
	skr->skr_bufspec.inhibitCache = !!(skr->skr_mode & SKR_MODE_NOCACHE);
	skr->skr_bufspec.physcontig = (skr->skr_mode & SKR_MODE_SEGPHYSCONTIG);
	skr->skr_bufspec.iodir_in = !!(skr->skr_mode & SKR_MODE_IODIR_IN);
	skr->skr_bufspec.iodir_out = !!(skr->skr_mode & SKR_MODE_IODIR_OUT);
	skr->skr_bufspec.puredata = !!(skr->skr_mode & SKR_MODE_PUREDATA);
	skr->skr_bufspec.threadSafe = !!(skr->skr_mode & SKR_MODE_THREADSAFE);
	skr->skr_regspec.noRedirect = !!(skr->skr_mode & SKR_MODE_NOREDIRECT);
	skr->skr_bufspec.memtag = !!(skr->skr_mode & SKR_MODE_MEMTAG);
	/* allocate segment bitmaps */
	if (!(skr->skr_mode & SKR_MODE_PSEUDO)) {
		ASSERT(skr->skr_seg_max_cnt != 0);
		skr->skr_seg_bmap_len = BITMAP_LEN(skr->skr_seg_max_cnt);
		size_t size = BITMAP_SIZE(skr->skr_seg_max_cnt);
		skr->skr_seg_bmap = sk_alloc_data(size,
		    Z_WAITOK | Z_NOFAIL, skmem_tag_segment_bmap);
		skr->skr_seg_bmap_size = size;
		ASSERT(BITMAP_SIZE(skr->skr_seg_max_cnt) ==
		    (skr->skr_seg_bmap_len * sizeof(*skr->skr_seg_bmap)));

		/* mark all bitmaps as free (bit set) */
		bitmap_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt);
	}

	/*
	 * Populate the freelist by allocating all segments for the
	 * region, which will be mapped but not faulted-in, and then
	 * immediately insert each to the freelist.  That will in
	 * turn unmap the segment's memory object.
	 */
	SKR_LOCK(skr);
	if (skr->skr_mode & SKR_MODE_PSEUDO) {
		char zone_name[64];
		(void) snprintf(zone_name, sizeof(zone_name), "%s.reg.%s",
		    SKMEM_ZONE_PREFIX, name);
		skr->skr_zreg = zone_create(zone_name, skr->skr_c_obj_size,
		    ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
	} else {
		/* create a backing IOSKRegion object */
		if ((skr->skr_reg = IOSKRegionCreate(&skr->skr_regspec,
		    (IOSKSize)skr->skr_seg_size,
		    (IOSKCount)skr->skr_seg_max_cnt)) == NULL) {
			SK_ERR("\"%s\": [%u * %u] cflags 0x%b skr_reg failed",
			    skr->skr_name, (uint32_t)skr->skr_seg_size,
			    (uint32_t)skr->skr_seg_max_cnt, skr->skr_cflags,
			    SKMEM_REGION_CR_BITS);
			goto failed;
		}
	}

	ASSERT(skr->skr_seg_objs != 0);

	++skr->skr_refcnt;      /* for caller */
	SKR_UNLOCK(skr);

	SKMEM_REGION_LOCK();
	TAILQ_INSERT_TAIL(&skmem_region_head, skr, skr_link);
	SKMEM_REGION_UNLOCK();

	SK_DF(SK_VERB_MEM_REGION,
	    "  [TOTAL] seg (%u*%u) obj (%u*%u) cflags 0x%b",
	    (uint32_t)skr->skr_seg_size, (uint32_t)skr->skr_seg_max_cnt,
	    (uint32_t)skr->skr_c_obj_size, (uint32_t)skr->skr_c_obj_cnt,
	    skr->skr_cflags, SKMEM_REGION_CR_BITS);

	return skr;

failed:
	SKR_LOCK_ASSERT_HELD(skr);
	skmem_region_destroy(skr);

	return NULL;
}

/*
 * Destroy a region.
 */
static void
skmem_region_destroy(struct skmem_region *skr)
{
	struct skmem_region *mskr;

	SKR_LOCK_ASSERT_HELD(skr);

	SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx",
	    skr->skr_name, SK_KVA(skr));

	/*
	 * Panic if we detect there are unfreed segments; the caller
	 * destroying this region is responsible for ensuring that all
	 * allocated segments have been freed prior to getting here.
	 */
	ASSERT(skr->skr_refcnt == 0);
	if (skr->skr_seginuse != 0) {
		panic("%s: '%s' (%p) not empty (%u unfreed)",
		    __func__, skr->skr_name, (void *)skr, skr->skr_seginuse);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (skr->skr_link.tqe_next != NULL || skr->skr_link.tqe_prev != NULL) {
		SKR_UNLOCK(skr);
		SKMEM_REGION_LOCK();
		TAILQ_REMOVE(&skmem_region_head, skr, skr_link);
		SKMEM_REGION_UNLOCK();
		SKR_LOCK(skr);
		ASSERT(skr->skr_refcnt == 0);
	}

	/*
	 * Undo what's done earlier at region creation time.
	 */
	skmem_region_depopulate(skr);
	ASSERT(TAILQ_EMPTY(&skr->skr_seg_free));
	ASSERT(RB_EMPTY(&skr->skr_seg_tfree));
	ASSERT(skr->skr_seg_free_cnt == 0);

	if (skr->skr_reg != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
		IOSKRegionDestroy(skr->skr_reg);
		skr->skr_reg = NULL;
	}

	if (skr->skr_zreg != NULL) {
		ASSERT(skr->skr_mode & SKR_MODE_PSEUDO);
		zdestroy(skr->skr_zreg);
		skr->skr_zreg = NULL;
	}

	if (skr->skr_seg_bmap != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
#if (DEBUG || DEVELOPMENT)
		ASSERT(skr->skr_seg_bmap_len != 0);
		/* must have been set to vacant (bit set) by now */
		assert(bitmap_is_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt));
#endif /* DEBUG || DEVELOPMENT */

		bitmap_t *__indexable bmap = skr->skr_seg_bmap;
		sk_free_data(bmap, skr->skr_seg_bmap_size);
		skr->skr_seg_bmap = NULL;
		skr->skr_seg_bmap_size = 0;
		skr->skr_seg_bmap_len = 0;
	}
	ASSERT(skr->skr_seg_bmap_len == 0);

	if (skr->skr_hash_table != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
#if (DEBUG || DEVELOPMENT)
		for (uint32_t i = 0; i < (skr->skr_hash_mask + 1); i++) {
			ASSERT(TAILQ_EMPTY(&skr->skr_hash_table[i].sgb_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		struct sksegment_bkt *__indexable htable = skr->skr_hash_table;
		sk_free_type_array(struct sksegment_bkt, skr->skr_hash_size,
		    htable);
		skr->skr_hash_table = NULL;
		skr->skr_hash_size = 0;
		htable = NULL;
	}
	if ((mskr = skr->skr_mirror) != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
		skr->skr_mirror = NULL;
		mskr->skr_mode &= ~SKR_MODE_MIRRORED;
	}
	SKR_UNLOCK(skr);

	if (mskr != NULL) {
		skmem_region_release(mskr);
	}

	lck_mtx_destroy(&skr->skr_lock, &skmem_region_lock_grp);

	zfree(skr_zone, skr);
}

/*
 * Mirror mskr (slave) to skr (master).
 */
void
skmem_region_mirror(struct skmem_region *skr, struct skmem_region *mskr)
{
	ASSERT(mskr != NULL);
	SK_DF(SK_VERB_MEM_REGION, "skr master 0x%llx, slave 0x%llx ",
	    SK_KVA(skr), SK_KVA(mskr));

	SKR_LOCK(skr);
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	ASSERT(!(mskr->skr_mode & SKR_MODE_MIRRORED));
	ASSERT(skr->skr_mirror == NULL);

	/* both regions must share identical parameters */
	ASSERT(skr->skr_size == mskr->skr_size);
	ASSERT(skr->skr_seg_size == mskr->skr_seg_size);
	ASSERT(skr->skr_seg_free_cnt == mskr->skr_seg_free_cnt);

	skr->skr_mirror = mskr;
	skmem_region_retain(mskr);
	mskr->skr_mode |= SKR_MODE_MIRRORED;
	SKR_UNLOCK(skr);
}

void
skmem_region_slab_config(struct skmem_region *skr, struct skmem_cache *skm,
    bool attach)
{
	int i;

	SKR_LOCK(skr);
	if (attach) {
		for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] != NULL;
		    i++) {
			;
		}
		VERIFY(i < SKR_MAX_CACHES);
		ASSERT(skr->skr_cache[i] == NULL);
		skr->skr_mode |= SKR_MODE_SLAB;
		skr->skr_cache[i] = skm;
		skmem_region_retain_locked(skr);
		SKR_UNLOCK(skr);
	} else {
		ASSERT(skr->skr_mode & SKR_MODE_SLAB);
		for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] != skm;
		    i++) {
			;
		}
		VERIFY(i < SKR_MAX_CACHES);
		ASSERT(skr->skr_cache[i] == skm);
		skr->skr_cache[i] = NULL;
		for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] == NULL;
		    i++) {
			;
		}
		if (i == SKR_MAX_CACHES) {
			skr->skr_mode &= ~SKR_MODE_SLAB;
		}
		if (!skmem_region_release_locked(skr)) {
			SKR_UNLOCK(skr);
		}
	}
}

/*
 * Common routines for skmem_region_{alloc,mirror_alloc}.
 */
static void *
__sized_by(objsize)
skmem_region_alloc_common(struct skmem_region *skr, struct sksegment *sg,
    uint32_t SK_FB_ARG objsize)
{
	struct sksegment_bkt *sgb;
	uint32_t SK_FB_ARG seg_sz = 0;
	void *__sized_by(seg_sz) addr;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(sg->sg_md != NULL);
	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
	addr = __unsafe_forge_bidi_indexable(void *, (void *)sg->sg_start, objsize);
	seg_sz = objsize;
	sgb = SKMEM_REGION_HASH(skr, addr);
	ASSERT(sg->sg_link.tqe_next == NULL);
	ASSERT(sg->sg_link.tqe_prev == NULL);
	TAILQ_INSERT_HEAD(&sgb->sgb_head, sg, sg_link);

	skr->skr_seginuse++;
	skr->skr_meminuse += skr->skr_seg_size;
	if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) {
		skr->skr_w_meminuse += skr->skr_seg_size;
	}
	skr->skr_alloc++;

	return addr;
}

/*
 * Allocate a segment from the region.
 * XXX -fbounds-safety: there are only 5 callers of this function, so it was
 * easier to just add objsize to the function signature.
 * XXX -fbounds-safety: until we have __sized_by_or_null (rdar://75598414), we
 * can't pass NULL, but instead create a variable whose value is NULL. Also,
 * once rdar://83900556 lands, -fbounds-safety will do size checking at return.
 * So we need to come back to this once rdar://75598414 and rdar://83900556
 * land.
 */
void *
__sized_by(objsize)
skmem_region_alloc(struct skmem_region *skr, void *__sized_by(*msize) * maddr,
    struct sksegment **retsg, struct sksegment **retsgm, uint32_t skmflag,
    uint32_t SK_FB_ARG objsize, uint32_t *SK_FB_ARG msize)
{
	struct sksegment *sg = NULL;
	struct sksegment *__single sg1 = NULL;
	void *__indexable addr = NULL, *__indexable addr1 = NULL;
	uint32_t retries = 0;

	VERIFY(!(skr->skr_mode & SKR_MODE_GUARD));

	if (retsg != NULL) {
		*retsg = NULL;
	}
	if (retsgm != NULL) {
		*retsgm = NULL;
	}

	/* SKMEM_NOSLEEP and SKMEM_FAILOK are mutually exclusive */
	VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) !=
	    (SKMEM_NOSLEEP | SKMEM_FAILOK));

	SKR_LOCK(skr);
	while (sg == NULL) {
		/* see if there's a segment in the freelist */
		sg = TAILQ_FIRST(&skr->skr_seg_free);
		if (sg == NULL) {
			/* see if we can grow the freelist */
			sg = sksegment_freelist_grow(skr);
			if (sg != NULL) {
				break;
			}

			if (skr->skr_mode & SKR_MODE_SLAB) {
				SKR_UNLOCK(skr);
				/*
				 * None found; it's possible that the slab
				 * layer is caching an extra amount, so ask
				 * skmem_cache to reap/purge its caches.
				 */
				for (int i = 0; i < SKR_MAX_CACHES; i++) {
					if (skr->skr_cache[i] == NULL) {
						continue;
					}
					skmem_cache_reap_now(skr->skr_cache[i],
					    TRUE);
				}
				SKR_LOCK(skr);
				/*
				 * If we manage to get some freed, try again.
				 */
				if (TAILQ_FIRST(&skr->skr_seg_free) != NULL) {
					continue;
				}
			}

			/*
			 * Give up if this is a non-blocking allocation,
			 * or if this is a blocking allocation but the
			 * caller is willing to retry.
			 */
			if (skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) {
				break;
			}

			/* otherwise we wait until one is available */
			++skr->skr_seg_waiters;
			(void) msleep(&skr->skr_seg_free, &skr->skr_lock,
			    (PZERO - 1), skr->skr_name, NULL);
		}
	}

	SKR_LOCK_ASSERT_HELD(skr);

	if (sg != NULL) {
retry:
		/*
		 * We have a segment; remove it from the freelist and
		 * insert it into the allocated-address hash chain.
		 * Note that this may return NULL if we can't allocate
		 * the memory descriptor.
		 */
		if (sksegment_freelist_remove(skr, sg, skmflag,
		    FALSE) == NULL) {
			ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
			ASSERT(sg->sg_md == NULL);
			ASSERT(sg->sg_start == 0 && sg->sg_end == 0);

			/*
			 * If it's a non-blocking allocation, simply give
			 * up and let the caller decide when to retry.  Else,
			 * it gets a bit complicated due to the contract we
			 * have for blocking allocations with the client; the
			 * most sensible thing to do here is to retry the
			 * allocation ourselves.  Note that we keep using the
			 * same segment we originally got, since we only need
			 * the memory descriptor to be allocated for it; thus
			 * we make sure we don't release the region lock when
			 * retrying allocation.  Doing so is crucial when the
			 * region is mirrored, since the segment indices on
			 * both regions need to match.
			 */
			if (skmflag & SKMEM_NOSLEEP) {
				SK_ERR("\"%s\": failed to allocate segment "
				    "(non-sleeping mode)", skr->skr_name);
				sg = NULL;
			} else {
				if (++retries > SKMEM_WDT_MAXTIME) {
					panic_plain("\"%s\": failed to "
					    "allocate segment (sleeping mode) "
					    "after %u retries\n\n%s",
					    skr->skr_name, SKMEM_WDT_MAXTIME,
					    skmem_dump(skr));
					/* NOTREACHED */
					__builtin_unreachable();
				} else {
					SK_ERR("\"%s\": failed to allocate "
					    "segment (sleeping mode): %u "
					    "retries", skr->skr_name, retries);
				}
				if (skr->skr_mode & SKR_MODE_SLAB) {
					/*
					 * We can't get any memory descriptor
					 * for this segment; reap extra cached
					 * objects from the slab layer and hope
					 * that we get lucky next time around.
					 *
					 * XXX [email protected]: perhaps also
					 * trigger the zone allocator to do
					 * its garbage collection here?
					 */
					skmem_cache_reap();
				}
				delay(1 * USEC_PER_SEC);        /* 1 sec */
				goto retry;
			}
		}

		if (sg != NULL) {
			/* insert to allocated-address hash chain */
			addr = skmem_region_alloc_common(skr, sg,
			    skr->skr_seg_size);
		}
	}

	if (sg == NULL) {
		VERIFY(skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK));
		if (skmflag & SKMEM_PANIC) {
			VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) ==
			    SKMEM_NOSLEEP);
			/*
			 * If this is a failed non-blocking alloc and the
			 * caller insists that it must be successful, then
			 * panic.
			 */
			panic_plain("\"%s\": skr 0x%p unable to satisfy "
			    "mandatory allocation\n", skr->skr_name, skr);
			/* NOTREACHED */
			__builtin_unreachable();
		} else {
			/*
			 * Give up if this is a non-blocking allocation,
			 * or one where the caller is willing to handle
			 * allocation failures.
			 */
			goto done;
		}
	}

	ASSERT((mach_vm_address_t)addr == sg->sg_start);

#if SK_LOG
	SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
	    SK_KVA(skr), SK_KVA(sg));
	if (skr->skr_mirror == NULL ||
	    !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) {
		SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx)",
		    sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
	} else {
		SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx) mirrored",
		    sg->sg_index, SK_KVA(sg), SK_KVA(sg->sg_start),
		    SK_KVA(sg->sg_end));
	}
#endif /* SK_LOG */

	/*
	 * If mirroring, allocate shadow object from slave region.
	 */
	if (skr->skr_mirror != NULL) {
		ASSERT(skr->skr_mirror != skr);
		ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
		ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED);
		addr1 = skmem_region_mirror_alloc(skr->skr_mirror, sg,
		    skr->skr_mirror->skr_seg_size, &sg1);
		ASSERT(addr1 != NULL);
		ASSERT(sg1 != NULL && sg1 != sg);
		ASSERT(sg1->sg_index == sg->sg_index);
	}

done:
	SKR_UNLOCK(skr);

	/* return segment metadata to caller if asked (reference not needed) */
	if (addr != NULL) {
		if (retsg != NULL) {
			*retsg = sg;
		}
		if (retsgm != NULL) {
			*retsgm = sg1;
		}
	}

	if (maddr != NULL) {
		if (addr1) {
			*maddr = addr1;
			*msize = skr->skr_mirror->skr_seg_size;
		} else {
			*maddr = addr1;
			*msize = 0;
		}
	}

	return addr;
}

/*
 * Allocate a segment from a mirror region at the same index.  While it
 * is a somewhat simplified variant of skmem_region_alloc, keeping it
 * separate allows us to avoid further convoluting that routine.
 */
static void *
__sized_by(seg_size)
skmem_region_mirror_alloc(struct skmem_region *skr, struct sksegment *sg0,
    uint32_t SK_FB_ARG seg_size, struct sksegment **__single retsg)
{
	struct sksegment sg_key = { .sg_index = sg0->sg_index };
	struct sksegment *sg = NULL;
	void *addr = NULL;

	ASSERT(skr->skr_mode & SKR_MODE_MIRRORED);
	ASSERT(skr->skr_mirror == NULL);
	ASSERT(sg0->sg_type == SKSEG_TYPE_ALLOC);

	if (retsg != NULL) {
		*retsg = NULL;
	}

	SKR_LOCK(skr);

	/*
	 * See if we can find one in the freelist first.  Otherwise,
	 * create a new segment of the same index and add that to the
	 * freelist.  We would always get a segment since both regions
	 * are synchronized when it comes to the indices of allocated
	 * segments.
	 */
	sg = RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key);
	if (sg == NULL) {
		sg = sksegment_alloc_with_idx(skr, sg0->sg_index);
		VERIFY(sg != NULL);
	}
	VERIFY(sg->sg_index == sg0->sg_index);

	/*
	 * We have a segment; remove it from the freelist and insert
	 * it into the allocated-address hash chain.  This either
	 * succeeds or panics (SKMEM_PANIC) when a memory descriptor
	 * can't be allocated.
	 *
	 * TODO: consider retrying IOBMD allocation attempts if needed.
	 */
	sg = sksegment_freelist_remove(skr, sg, SKMEM_PANIC, FALSE);
	VERIFY(sg != NULL);

	/* insert to allocated-address hash chain */
	addr = skmem_region_alloc_common(skr, sg, skr->skr_seg_size);

#if SK_LOG
	SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
	    SK_KVA(skr), SK_KVA(sg));
	SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx)",
	    sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
#endif /* SK_LOG */

	SKR_UNLOCK(skr);

	/* return segment metadata to caller if asked (reference not needed) */
	if (retsg != NULL) {
		*retsg = sg;
	}

	return addr;
}

/*
 * Free a segment to the region.
 */
void
skmem_region_free(struct skmem_region *skr, void *addr, void *maddr)
{
	struct sksegment_bkt *sgb;
	struct sksegment *sg, *tsg;

	VERIFY(!(skr->skr_mode & SKR_MODE_GUARD));

	/*
	 * Search the hash chain to find a matching segment for the
	 * given address.  If found, remove the segment from the
	 * hash chain and insert it into the freelist.  Otherwise,
	 * we panic since the caller has given us a bogus address.
	 */
	SKR_LOCK(skr);
	sgb = SKMEM_REGION_HASH(skr, addr);
	TAILQ_FOREACH_SAFE(sg, &sgb->sgb_head, sg_link, tsg) {
		ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
		if (sg->sg_start == (mach_vm_address_t)addr) {
			TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link);
			sg->sg_link.tqe_next = NULL;
			sg->sg_link.tqe_prev = NULL;
			break;
		}
	}

	ASSERT(sg != NULL);
	if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) {
		ASSERT(skr->skr_w_meminuse >= skr->skr_seg_size);
		skr->skr_w_meminuse -= skr->skr_seg_size;
	}
	sksegment_freelist_insert(skr, sg, FALSE);

	ASSERT(skr->skr_seginuse != 0);
	skr->skr_seginuse--;
	skr->skr_meminuse -= skr->skr_seg_size;
	skr->skr_free++;

#if SK_LOG
	SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
	    SK_KVA(skr), SK_KVA(sg));
	if (skr->skr_mirror == NULL ||
	    !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) {
		SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx)",
		    sg->sg_index, SK_KVA(addr),
		    SK_KVA((uintptr_t)addr + skr->skr_seg_size));
	} else {
		SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx) mirrored",
		    sg->sg_index, SK_KVA(sg), SK_KVA(addr),
		    SK_KVA((uintptr_t)addr + skr->skr_seg_size));
	}
#endif /* SK_LOG */

	/*
	 * If mirroring, also free shadow object in slave region.
	 */
	if (skr->skr_mirror != NULL) {
		ASSERT(maddr != NULL);
		ASSERT(skr->skr_mirror != skr);
		ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
		ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED);
		skmem_region_free(skr->skr_mirror, maddr, NULL);
	}

	/* wake up any blocked threads waiting for a segment */
	if (skr->skr_seg_waiters != 0) {
		SK_DF(SK_VERB_MEM_REGION,
		    "sg 0x%llx waking up %u waiters", SK_KVA(sg),
		    skr->skr_seg_waiters);
		skr->skr_seg_waiters = 0;
		wakeup(&skr->skr_seg_free);
	}
	SKR_UNLOCK(skr);
}

__attribute__((always_inline))
static inline void
skmem_region_retain_locked(struct skmem_region *skr)
{
	SKR_LOCK_ASSERT_HELD(skr);
	skr->skr_refcnt++;
	ASSERT(skr->skr_refcnt != 0);
}

1507 /*
1508  * Retain a region.
1509  */
1510 void
1511 skmem_region_retain(struct skmem_region *skr)
1512 {
1513 	SKR_LOCK(skr);
1514 	skmem_region_retain_locked(skr);
1515 	SKR_UNLOCK(skr);
1516 }
1517 
1518 __attribute__((always_inline))
1519 static inline boolean_t
1520 skmem_region_release_locked(struct skmem_region *skr)
1521 {
1522 	SKR_LOCK_ASSERT_HELD(skr);
1523 	ASSERT(skr->skr_refcnt != 0);
1524 	if (--skr->skr_refcnt == 0) {
1525 		skmem_region_destroy(skr);
1526 		return TRUE;
1527 	}
1528 	return FALSE;
1529 }
1530 
1531 /*
1532  * Release (and potentially destroy) a region.
1533  */
1534 boolean_t
1535 skmem_region_release(struct skmem_region *skr)
1536 {
1537 	boolean_t lastref;
1538 
1539 	SKR_LOCK(skr);
1540 	if (!(lastref = skmem_region_release_locked(skr))) {
1541 		SKR_UNLOCK(skr);
1542 	}
1543 
1544 	return lastref;
1545 }
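/*
 * EXAMPLE (editorial note, not part of the original source): the pair
 * above is a plain reference count guarded by SKR_LOCK.  On the final
 * release, skmem_region_destroy() consumes the region (and its lock),
 * which is why skmem_region_release() only unlocks when the dropped
 * reference was not the last one.  A hypothetical caller:
 */
#if 0 /* illustrative only */
	skmem_region_retain(skr);		/* take a reference */
	/* ... use skr ... */
	if (skmem_region_release(skr)) {
		/* last reference dropped; skr is gone, do not touch it */
	}
#endif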
1546 
1547 /*
1548  * Depopulate the segment freelist.
1549  */
1550 static void
1551 skmem_region_depopulate(struct skmem_region *skr)
1552 {
1553 	struct sksegment *sg, *tsg;
1554 
1555 	SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ",
1556 	    skr->skr_name, SK_KVA(skr));
1557 
1558 	SKR_LOCK_ASSERT_HELD(skr);
1559 	ASSERT(skr->skr_seg_bmap_len != 0 || (skr->skr_mode & SKR_MODE_PSEUDO));
1560 
1561 	TAILQ_FOREACH_SAFE(sg, &skr->skr_seg_free, sg_link, tsg) {
1562 		struct sksegment *sg0;
1563 		uint32_t i;
1564 
1565 		i = sg->sg_index;
1566 		sg0 = sksegment_freelist_remove(skr, sg, 0, TRUE);
1567 		VERIFY(sg0 == sg);
1568 
1569 		sksegment_destroy(skr, sg);
1570 		ASSERT(bit_test(skr->skr_seg_bmap[i / BMAPSZ], i % BMAPSZ));
1571 	}
1572 }
1573 
1574 /*
1575  * Free-segment tree compare routine.
1576  */
1577 static int
1578 sksegment_cmp(const struct sksegment *sg1, const struct sksegment *sg2)
1579 {
1580 	return sg1->sg_index - sg2->sg_index;
1581 }
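/*
 * EXAMPLE (editorial note, not part of the original source): this
 * comparator keys the red-black tree of free segments by sg_index, so a
 * segment at a known index can be located with a stack-local key, exactly
 * as the DEVELOPMENT/DEBUG check in sksegment_alloc_with_idx() later in
 * this file does:
 */
#if 0 /* illustrative only */
	struct sksegment key = { .sg_index = idx };
	struct sksegment *found = RB_FIND(segtfreehead, &skr->skr_seg_tfree, &key);
#endif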
1582 
1583 /*
1584  * Create a segment.
1585  *
1586  * Upon success, clear the bit for the segment's index in the skr_seg_bmap bitmap.
1587  */
1588 static struct sksegment *
1589 sksegment_create(struct skmem_region *skr, uint32_t i)
1590 {
1591 	struct sksegment *__single sg = NULL;
1592 	bitmap_t *bmap;
1593 
1594 	SKR_LOCK_ASSERT_HELD(skr);
1595 
1596 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1597 	ASSERT(i < skr->skr_seg_max_cnt);
1598 	ASSERT(skr->skr_reg != NULL);
1599 	ASSERT(skr->skr_seg_size == round_page(skr->skr_seg_size));
1600 
1601 	bmap = &skr->skr_seg_bmap[i / BMAPSZ];
1602 	ASSERT(bit_test(*bmap, i % BMAPSZ));
1603 
1604 	sg = skmem_cache_alloc(skmem_sg_cache, SKMEM_SLEEP);
1605 	bzero(sg, sizeof(*sg));
1606 
1607 	sg->sg_region = skr;
1608 	sg->sg_index = i;
1609 	sg->sg_state = SKSEG_STATE_DETACHED;
1610 
1611 	/* claim it (clear bit) */
1612 	bit_clear(*bmap, i % BMAPSZ);
1613 
1614 	SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx) 0x%b", i,
1615 	    SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), skr->skr_mode,
1616 	    SKR_MODE_BITS);
1617 
1618 	return sg;
1619 }
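/*
 * EXAMPLE (editorial sketch, not part of the original source): the
 * index-to-bitmap arithmetic used above.  Index i maps to word
 * (i / BMAPSZ) and bit (i % BMAPSZ); a set bit means vacant, a cleared
 * bit means claimed.  A standalone rendition, assuming BMAPSZ is the
 * 64-bit width of bitmap_t:
 */
#if 0 /* illustrative only */
#include <stdbool.h>
#include <stdint.h>

#define BMAPSZ	64

static bool
seg_is_vacant(const uint64_t *bmap, uint32_t i)
{
	return (bmap[i / BMAPSZ] >> (i % BMAPSZ)) & 1;
}

static void
seg_claim(uint64_t *bmap, uint32_t i)
{
	bmap[i / BMAPSZ] &= ~(1ULL << (i % BMAPSZ));	/* clear: claimed */
}

static void
seg_release(uint64_t *bmap, uint32_t i)
{
	bmap[i / BMAPSZ] |= (1ULL << (i % BMAPSZ));	/* set: vacant */
}
#endif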
1620 
1621 /*
1622  * Destroy a segment.
1623  *
1624  * Set the bit for the segment's index in the skr_seg_bmap bitmap,
1625  * indicating that it is now vacant.
1626  */
1627 static void
1628 sksegment_destroy(struct skmem_region *skr, struct sksegment *sg)
1629 {
1630 	uint32_t i = sg->sg_index;
1631 	bitmap_t *bmap;
1632 
1633 	SKR_LOCK_ASSERT_HELD(skr);
1634 
1635 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1636 	ASSERT(skr == sg->sg_region);
1637 	ASSERT(skr->skr_reg != NULL);
1638 	ASSERT(sg->sg_type == SKSEG_TYPE_DESTROYED);
1639 	ASSERT(i < skr->skr_seg_max_cnt);
1640 
1641 	bmap = &skr->skr_seg_bmap[i / BMAPSZ];
1642 	ASSERT(!bit_test(*bmap, i % BMAPSZ));
1643 
1644 	SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx) 0x%b",
1645 	    i, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end),
1646 	    skr->skr_mode, SKR_MODE_BITS);
1647 
1648 	/*
1649 	 * Undo what's done earlier at segment creation time.
1650 	 */
1651 
1652 	ASSERT(sg->sg_md == NULL);
1653 	ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1654 	ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1655 
1656 	/* release it (set bit) */
1657 	bit_set(*bmap, i % BMAPSZ);
1658 
1659 	skmem_cache_free(skmem_sg_cache, sg);
1660 }
1661 
1662 /*
1663  * Insert a segment into the freelist (freeing the segment).
1664  */
1665 static void
1666 sksegment_freelist_insert(struct skmem_region *skr, struct sksegment *sg,
1667     boolean_t populating)
1668 {
1669 	SKR_LOCK_ASSERT_HELD(skr);
1670 
1671 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1672 	ASSERT(sg->sg_type != SKSEG_TYPE_FREE);
1673 	ASSERT(skr == sg->sg_region);
1674 	ASSERT(skr->skr_reg != NULL);
1675 	ASSERT(sg->sg_index < skr->skr_seg_max_cnt);
1676 
1677 	/*
1678 	 * If the region is being populated, then we're done.
1679 	 */
1680 	if (__improbable(populating)) {
1681 		ASSERT(sg->sg_md == NULL);
1682 		ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1683 		ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1684 	} else {
1685 		IOSKMemoryBufferRef __single md;
1686 		IOReturn err;
1687 
1688 		ASSERT(sg->sg_md != NULL);
1689 		ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1690 
1691 		/*
1692 		 * Let the client remove the memory from the IOMMU, and unwire it.
1693 		 */
1694 		if (skr->skr_seg_dtor != NULL) {
1695 			skr->skr_seg_dtor(sg, sg->sg_md, skr->skr_private);
1696 		}
1697 
1698 		ASSERT(sg->sg_state == SKSEG_STATE_MAPPED ||
1699 		    sg->sg_state == SKSEG_STATE_MAPPED_WIRED);
1700 
1701 		IOSKRegionClearBufferDebug(skr->skr_reg, sg->sg_index, &md);
1702 		VERIFY(sg->sg_md == md);
1703 
1704 		/*
1705 		 * If persistent, unwire this memory now. But do not unwire
1706 		 * memtag regions, as they come from zalloc.
1707 		 */
1708 		if ((skr->skr_mode & SKR_MODE_PERSISTENT) &&
1709 		    !(skr->skr_mode & SKR_MODE_MEMTAG)) {
1710 			err = IOSKMemoryUnwire(md);
1711 			if (err != kIOReturnSuccess) {
1712 				panic("Failed to unwire md %p, err %d", md, err);
1713 			}
1714 		}
1715 
1716 		/* mark memory as empty/discarded for consistency */
1717 		if (!(skr->skr_mode & SKR_MODE_MEMTAG)) {
1718 			err = IOSKMemoryDiscard(md);
1719 			if (err != kIOReturnSuccess) {
1720 				panic("Failed to discard md %p, err %d", md, err);
1721 			}
1722 		}
1723 
1724 		IOSKMemoryDestroy(md);
1725 		sg->sg_md = NULL;
1726 		sg->sg_start = sg->sg_end = 0;
1727 		sg->sg_state = SKSEG_STATE_DETACHED;
1728 
1729 		ASSERT(skr->skr_memtotal >= skr->skr_seg_size);
1730 		skr->skr_memtotal -= skr->skr_seg_size;
1731 	}
1732 
1733 	sg->sg_type = SKSEG_TYPE_FREE;
1734 	ASSERT(sg->sg_link.tqe_next == NULL);
1735 	ASSERT(sg->sg_link.tqe_prev == NULL);
1736 	TAILQ_INSERT_TAIL(&skr->skr_seg_free, sg, sg_link);
1737 	ASSERT(sg->sg_node.rbe_left == NULL);
1738 	ASSERT(sg->sg_node.rbe_right == NULL);
1739 	ASSERT(sg->sg_node.rbe_parent == NULL);
1740 	RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
1741 	++skr->skr_seg_free_cnt;
1742 	ASSERT(skr->skr_seg_free_cnt <= skr->skr_seg_max_cnt);
1743 }
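/*
 * EXAMPLE (editorial note, not part of the original source): a free
 * segment is indexed twice -- on the skr_seg_free tailq for cheap FIFO
 * reuse, and in the skr_seg_tfree red-black tree for ordered lookup by
 * sg_index.  The two containers and the counter must always be updated
 * together, as both the insert above and the remove below do:
 */
#if 0 /* illustrative only */
	/* free: enqueue on both indexes */
	TAILQ_INSERT_TAIL(&skr->skr_seg_free, sg, sg_link);
	RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
	++skr->skr_seg_free_cnt;

	/* allocate: dequeue from both indexes */
	TAILQ_REMOVE(&skr->skr_seg_free, sg, sg_link);
	RB_REMOVE(segtfreehead, &skr->skr_seg_tfree, sg);
	--skr->skr_seg_free_cnt;
#endif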
1744 
1745 /*
1746  * Remove a segment from the freelist (allocating the segment).
1747  */
1748 static struct sksegment *
1749 sksegment_freelist_remove(struct skmem_region *skr, struct sksegment *sg,
1750     uint32_t skmflag, boolean_t purging)
1751 {
1752 #pragma unused(skmflag)
1753 	mach_vm_address_t segstart;
1754 	IOReturn err;
1755 
1756 	SKR_LOCK_ASSERT_HELD(skr);
1757 
1758 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1759 	ASSERT(sg != NULL);
1760 	ASSERT(skr == sg->sg_region);
1761 	ASSERT(skr->skr_reg != NULL);
1762 	ASSERT(sg->sg_type == SKSEG_TYPE_FREE);
1763 	ASSERT(sg->sg_index < skr->skr_seg_max_cnt);
1764 
1765 #if (DEVELOPMENT || DEBUG)
1766 	uint64_t mtbf = skmem_region_get_mtbf();
1767 	/*
1768 	 * MTBF doesn't apply when SKMEM_PANIC is set as caller would assert.
1769 	 * MTBF doesn't apply when SKMEM_PANIC is set, as the caller would assert.
1770 	if (__improbable(mtbf != 0 && !purging &&
1771 	    (net_uptime_ms() % mtbf) == 0 &&
1772 	    !(skmflag & SKMEM_PANIC))) {
1773 		SK_ERR("skr \"%s\" 0x%llx sg 0x%llx MTBF failure",
1774 		    skr->skr_name, SK_KVA(skr), SK_KVA(sg));
1775 		net_update_uptime();
1776 		return NULL;
1777 	}
1778 #endif /* (DEVELOPMENT || DEBUG) */
1779 
1780 	TAILQ_REMOVE(&skr->skr_seg_free, sg, sg_link);
1781 	sg->sg_link.tqe_next = NULL;
1782 	sg->sg_link.tqe_prev = NULL;
1783 	RB_REMOVE(segtfreehead, &skr->skr_seg_tfree, sg);
1784 	sg->sg_node.rbe_left = NULL;
1785 	sg->sg_node.rbe_right = NULL;
1786 	sg->sg_node.rbe_parent = NULL;
1787 
1788 	ASSERT(skr->skr_seg_free_cnt != 0);
1789 	--skr->skr_seg_free_cnt;
1790 
1791 	/*
1792 	 * If the region is being depopulated, then we're done.
1793 	 */
1794 	if (__improbable(purging)) {
1795 		ASSERT(sg->sg_md == NULL);
1796 		ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1797 		ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1798 		sg->sg_type = SKSEG_TYPE_DESTROYED;
1799 		return sg;
1800 	}
1801 
1802 	ASSERT(sg->sg_md == NULL);
1803 	ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1804 	ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1805 
1806 	/* created as non-volatile (mapped) upon success */
1807 	if ((sg->sg_md = IOSKMemoryBufferCreate(skr->skr_seg_size,
1808 	    &skr->skr_bufspec, &segstart)) == NULL) {
1809 		ASSERT(sg->sg_type == SKSEG_TYPE_FREE);
1810 		if (skmflag & SKMEM_PANIC) {
1811 			/* if the caller insists on success, then panic */
1812 			panic_plain("\"%s\": skr 0x%p sg 0x%p (idx %u) unable "
1813 			    "to satisfy mandatory allocation\n", skr->skr_name,
1814 			    skr, sg, sg->sg_index);
1815 			/* NOTREACHED */
1816 			__builtin_unreachable();
1817 		}
1818 		/* reinsert this segment to freelist */
1819 		ASSERT(sg->sg_link.tqe_next == NULL);
1820 		ASSERT(sg->sg_link.tqe_prev == NULL);
1821 		TAILQ_INSERT_HEAD(&skr->skr_seg_free, sg, sg_link);
1822 		ASSERT(sg->sg_node.rbe_left == NULL);
1823 		ASSERT(sg->sg_node.rbe_right == NULL);
1824 		ASSERT(sg->sg_node.rbe_parent == NULL);
1825 		RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
1826 		++skr->skr_seg_free_cnt;
1827 		return NULL;
1828 	}
1829 
1830 	sg->sg_start = segstart;
1831 	sg->sg_end = (segstart + skr->skr_seg_size);
1832 	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1833 
1834 	/* mark memory as non-volatile just to be consistent */
1835 	if (!(skr->skr_mode & SKR_MODE_MEMTAG)) {
1836 		err = IOSKMemoryReclaim(sg->sg_md);
1837 		if (err != kIOReturnSuccess) {
1838 			panic("Failed to reclaim md %p, err %d", sg->sg_md, err);
1839 		}
1840 	}
1841 
1842 	/*
1843 	 * If persistent, wire down its memory now. But do not wire memtag
1844 	 * regions, as they come from zalloc.
1845 	 */
1846 	if ((skr->skr_mode & SKR_MODE_PERSISTENT) &&
1847 	    !(skr->skr_mode & SKR_MODE_MEMTAG)) {
1848 		err = IOSKMemoryWire(sg->sg_md);
1849 		if (err != kIOReturnSuccess) {
1850 			panic("Failed to wire md %p, err %d", sg->sg_md, err);
1851 		}
1852 	}
1853 
1854 	err = IOSKRegionSetBuffer(skr->skr_reg, sg->sg_index, sg->sg_md);
1855 	if (err != kIOReturnSuccess) {
1856 		panic("Failed to set md %p, err %d", sg->sg_md, err);
1857 	}
1858 
1859 	/*
1860 	 * Let the client wire it and insert it into the IOMMU, if applicable.
1861 	 * Try to find out if it's wired and set the right state.
1862 	 */
1863 	if (skr->skr_seg_ctor != NULL) {
1864 		skr->skr_seg_ctor(sg, sg->sg_md, skr->skr_private);
1865 	}
1866 
1867 	sg->sg_state = IOSKBufferIsWired(sg->sg_md) ?
1868 	    SKSEG_STATE_MAPPED_WIRED : SKSEG_STATE_MAPPED;
1869 
1870 	skr->skr_memtotal += skr->skr_seg_size;
1871 
1872 	ASSERT(sg->sg_md != NULL);
1873 	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1874 
1875 	sg->sg_type = SKSEG_TYPE_ALLOC;
1876 	return sg;
1877 }
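/*
 * EXAMPLE (editorial sketch, not part of the original source): the
 * DEVELOPMENT/DEBUG MTBF hook above injects an allocation failure
 * whenever the uptime in milliseconds is an exact multiple of the knob,
 * so failures occur roughly once per mtbf period across allocation
 * attempts.  The predicate, restated standalone:
 */
#if 0 /* illustrative only */
#include <stdbool.h>
#include <stdint.h>

static bool
mtbf_should_fail(uint64_t uptime_ms, uint64_t mtbf, bool purging,
    bool must_succeed)
{
	/* never inject while purging, or when the caller demands success */
	return (mtbf != 0) && !purging && !must_succeed &&
	    (uptime_ms % mtbf) == 0;
}
#endif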
1878 
1879 /*
1880  * Find the first available index and allocate a segment at that index.
1881  */
1882 static struct sksegment *
1883 sksegment_freelist_grow(struct skmem_region *skr)
1884 {
1885 	struct sksegment *sg = NULL;
1886 	uint32_t i, j, idx;
1887 
1888 	SKR_LOCK_ASSERT_HELD(skr);
1889 
1890 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1891 	ASSERT(skr->skr_seg_bmap_len != 0);
1892 	ASSERT(skr->skr_seg_max_cnt != 0);
1893 
1894 	for (i = 0; i < skr->skr_seg_bmap_len; i++) {
1895 		bitmap_t *bmap, mask;
1896 		uint32_t end = (BMAPSZ - 1);
1897 
1898 		if (i == (skr->skr_seg_bmap_len - 1)) {
1899 			end = (skr->skr_seg_max_cnt - 1) % BMAPSZ;
1900 		}
1901 
1902 		bmap = &skr->skr_seg_bmap[i];
1903 		mask = BMASK64(0, end);
1904 
1905 		j = ffsll((*bmap) & mask);
1906 		if (j == 0) {
1907 			continue;
1908 		}
1909 
1910 		--j;
1911 		idx = (i * BMAPSZ) + j;
1912 
1913 		sg = sksegment_alloc_with_idx(skr, idx);
1914 
1915 		/* we're done */
1916 		break;
1917 	}
1918 
1919 	ASSERT((sg != NULL) || (skr->skr_seginuse == skr->skr_seg_max_cnt));
1920 	return sg;
1921 }
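/*
 * EXAMPLE (editorial sketch, not part of the original source): the scan
 * above relies on ffsll() returning the 1-based position of the least
 * significant set bit (0 if none).  A standalone rendition using the
 * equivalent builtin, assuming BMASK64(0, end) selects bits 0..end
 * inclusive:
 */
#if 0 /* illustrative only */
#include <stdint.h>

#define BMAPSZ	64

static int64_t
first_vacant_index(const uint64_t *bmap, uint32_t nwords, uint32_t max_cnt)
{
	for (uint32_t i = 0; i < nwords; i++) {
		uint64_t word = bmap[i];

		if (i == nwords - 1) {
			/* mask off bits past the last valid index */
			uint32_t end = (max_cnt - 1) % BMAPSZ;
			if (end < 63)
				word &= (1ULL << (end + 1)) - 1;
		}

		int j = __builtin_ffsll((long long)word);
		if (j != 0)
			return ((int64_t)i * BMAPSZ) + (j - 1);
	}
	return -1;	/* every segment index is already claimed */
}
#endif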
1922 
1923 /*
1924  * Create a single segment at a specific index and add it to the freelist.
1925  */
1926 static struct sksegment *
1927 sksegment_alloc_with_idx(struct skmem_region *skr, uint32_t idx)
1928 {
1929 	struct sksegment *sg;
1930 
1931 	SKR_LOCK_ASSERT_HELD(skr);
1932 
1933 	if (!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ)) {
1934 		panic("%s: '%s' (%p) idx %u (out of %u) is already allocated",
1935 		    __func__, skr->skr_name, (void *)skr, idx,
1936 		    (skr->skr_seg_max_cnt - 1));
1937 		/* NOTREACHED */
1938 		__builtin_unreachable();
1939 	}
1940 
1941 	/* must not fail, blocking alloc */
1942 	sg = sksegment_create(skr, idx);
1943 	VERIFY(sg != NULL);
1944 	VERIFY(!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ));
1945 
1946 	/* populate the freelist */
1947 	sksegment_freelist_insert(skr, sg, TRUE);
1948 	ASSERT(sg == TAILQ_LAST(&skr->skr_seg_free, segfreehead));
1949 #if (DEVELOPMENT || DEBUG)
1950 	struct sksegment sg_key = { .sg_index = sg->sg_index };
1951 	ASSERT(sg == RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key));
1952 #endif /* (DEVELOPMENT || DEBUG) */
1953 
1954 	SK_DF(SK_VERB_MEM_REGION, "sg %u/%u", (idx + 1), skr->skr_seg_max_cnt);
1955 
1956 	return sg;
1957 }
1958 
1959 /*
1960  * Rescale the region's allocated-address hash table.
1961  */
1962 static void
1963 skmem_region_hash_rescale(struct skmem_region *skr)
1964 {
1965 	struct sksegment_bkt *__indexable old_table, *new_table;
1966 	size_t old_size, new_size;
1967 	uint32_t i, moved = 0;
1968 
1969 	if (skr->skr_mode & SKR_MODE_PSEUDO) {
1970 		ASSERT(skr->skr_hash_table == NULL);
1971 		/* this is a no-op for a pseudo region */
1972 		return;
1973 	}
1974 
1975 	ASSERT(skr->skr_hash_table != NULL);
1976 	/* insist that we are executing in the update thread call context */
1977 	ASSERT(sk_is_region_update_protected());
1978 
1979 	/*
1980 	 * To get small average lookup time (lookup depth near 1.0), the hash
1981 	 * table size should be roughly the same (not necessarily equivalent)
1982 	 * table size should be roughly comparable (though not necessarily
1983 	 * equal) to the region size.
1984 	new_size = MAX(skr->skr_hash_initial,
1985 	    (1 << (flsll(3 * skr->skr_seginuse + 4) - 2)));
1986 	new_size = MIN(skr->skr_hash_limit, new_size);
1987 	old_size = (skr->skr_hash_mask + 1);
1988 
1989 	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
1990 		return;
1991 	}
1992 
1993 	new_table = sk_alloc_type_array(struct sksegment_bkt, new_size,
1994 	    Z_NOWAIT, skmem_tag_segment_hash);
1995 	if (__improbable(new_table == NULL)) {
1996 		return;
1997 	}
1998 
1999 	for (i = 0; i < new_size; i++) {
2000 		TAILQ_INIT(&new_table[i].sgb_head);
2001 	}
2002 
2003 	SKR_LOCK(skr);
2004 
2005 	old_size = (skr->skr_hash_mask + 1);
2006 	old_table = skr->skr_hash_table;
2007 
2008 	skr->skr_hash_mask = (uint32_t)(new_size - 1);
2009 	skr->skr_hash_table = new_table;
2010 	skr->skr_hash_size = new_size;
2011 	skr->skr_rescale++;
2012 
2013 	for (i = 0; i < old_size; i++) {
2014 		struct sksegment_bkt *sgb = &old_table[i];
2015 		struct sksegment_bkt *new_sgb;
2016 		struct sksegment *sg;
2017 
2018 		while ((sg = TAILQ_FIRST(&sgb->sgb_head)) != NULL) {
2019 			TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link);
2020 			ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
2021 			new_sgb = SKMEM_REGION_HASH(skr, sg->sg_start);
2022 			TAILQ_INSERT_TAIL(&new_sgb->sgb_head, sg, sg_link);
2023 			++moved;
2024 		}
2025 		ASSERT(TAILQ_EMPTY(&sgb->sgb_head));
2026 	}
2027 
2028 	SK_DF(SK_VERB_MEM_REGION,
2029 	    "skr 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skr),
2030 	    (uint32_t)old_size, (uint32_t)new_size, moved);
2031 
2032 	SKR_UNLOCK(skr);
2033 
2034 	sk_free_type_array(struct sksegment_bkt, old_size, old_table);
2035 }
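/*
 * EXAMPLE (editorial note, not part of the original source): a worked
 * instance of the sizing rule above.  With skr_seginuse = 100:
 * 3 * 100 + 4 = 304 and flsll(304) = 9, so the target is
 * 1 << (9 - 2) = 128 buckets -- about one bucket per allocated segment,
 * i.e. an expected lookup depth near 1.  The subsequent hysteresis check
 * then skips the rescale unless the table would grow or shrink by more
 * than a factor of two.
 */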
2036 
2037 /*
2038  * Apply a function to operate on all regions.
2039  */
2040 static void
2041 skmem_region_applyall(void (*func)(struct skmem_region *))
2042 {
2043 	struct skmem_region *skr;
2044 
2045 	net_update_uptime();
2046 
2047 	SKMEM_REGION_LOCK();
2048 	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
2049 		func(skr);
2050 	}
2051 	SKMEM_REGION_UNLOCK();
2052 }
2053 
2054 static void
2055 skmem_region_update(struct skmem_region *skr)
2056 {
2057 	SKMEM_REGION_LOCK_ASSERT_HELD();
2058 
2059 	/* insist that we are executing in the update thread call context */
2060 	ASSERT(sk_is_region_update_protected());
2061 
2062 	SKR_LOCK(skr);
2063 	/*
2064 	 * If there are threads blocked waiting for an available
2065 	 * segment, wake them up periodically so they can issue
2066 	 * another skmem_cache_reap() to reclaim resources cached
2067 	 * by skmem_cache.
2068 	 */
2069 	if (skr->skr_seg_waiters != 0) {
2070 		SK_DF(SK_VERB_MEM_REGION,
2071 		    "waking up %u waiters to reclaim", skr->skr_seg_waiters);
2072 		skr->skr_seg_waiters = 0;
2073 		wakeup(&skr->skr_seg_free);
2074 	}
2075 	SKR_UNLOCK(skr);
2076 
2077 	/*
2078 	 * Rescale the hash table if needed.
2079 	 */
2080 	skmem_region_hash_rescale(skr);
2081 }
2082 
2083 /*
2084  * Thread call callback for update.
2085  */
2086 static void
2087 skmem_region_update_func(thread_call_param_t dummy, thread_call_param_t arg)
2088 {
2089 #pragma unused(dummy, arg)
2090 	sk_protect_t protect;
2091 
2092 	protect = sk_region_update_protect();
2093 	skmem_region_applyall(skmem_region_update);
2094 	sk_region_update_unprotect(protect);
2095 
2096 	skmem_dispatch(skmem_region_update_tc, NULL,
2097 	    (skmem_region_update_interval * NSEC_PER_SEC));
2098 }
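/*
 * EXAMPLE (editorial note, not part of the original source): the update
 * is self-perpetuating -- each invocation walks all regions under the
 * update-protection bracket, then re-arms itself via skmem_dispatch()
 * to fire again skmem_region_update_interval seconds later.  That
 * bracket is what the sk_is_region_update_protected() assertions in
 * skmem_region_update() and skmem_region_hash_rescale() verify.
 */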
2099 
2100 boolean_t
2101 skmem_region_for_pp(skmem_region_id_t id)
2102 {
2103 	int i;
2104 
2105 	for (i = 0; i < SKMEM_PP_REGIONS; i++) {
2106 		if (id == skmem_pp_region_ids[i]) {
2107 			return TRUE;
2108 		}
2109 	}
2110 	return FALSE;
2111 }
2112 
2113 void
2114 skmem_region_get_stats(struct skmem_region *skr, struct sk_stats_region *sreg)
2115 {
2116 	bzero(sreg, sizeof(*sreg));
2117 
2118 	(void) snprintf(sreg->sreg_name, sizeof(sreg->sreg_name),
2119 	    "%s", skr->skr_name);
2120 	uuid_copy(sreg->sreg_uuid, skr->skr_uuid);
2121 	sreg->sreg_id = (sk_stats_region_id_t)skr->skr_id;
2122 	sreg->sreg_mode = skr->skr_mode;
2123 
2124 	sreg->sreg_r_seg_size = skr->skr_params.srp_r_seg_size;
2125 	sreg->sreg_c_seg_size = skr->skr_seg_size;
2126 	sreg->sreg_seg_cnt = skr->skr_seg_max_cnt;
2127 	sreg->sreg_seg_objs = skr->skr_seg_objs;
2128 	sreg->sreg_r_obj_size = skr->skr_r_obj_size;
2129 	sreg->sreg_r_obj_cnt = skr->skr_r_obj_cnt;
2130 	sreg->sreg_c_obj_size = skr->skr_c_obj_size;
2131 	sreg->sreg_c_obj_cnt = skr->skr_c_obj_cnt;
2132 	sreg->sreg_align = skr->skr_align;
2133 	sreg->sreg_max_frags = skr->skr_max_frags;
2134 
2135 	sreg->sreg_meminuse = skr->skr_meminuse;
2136 	sreg->sreg_w_meminuse = skr->skr_w_meminuse;
2137 	sreg->sreg_memtotal = skr->skr_memtotal;
2138 	sreg->sreg_seginuse = skr->skr_seginuse;
2139 	sreg->sreg_rescale = skr->skr_rescale;
2140 	sreg->sreg_hash_size = (skr->skr_hash_mask + 1);
2141 	sreg->sreg_alloc = skr->skr_alloc;
2142 	sreg->sreg_free = skr->skr_free;
2143 }
2144 
2145 static size_t
2146 skmem_region_mib_get_stats(struct skmem_region *skr, void *__sized_by(len) out,
2147     size_t len)
2148 {
2149 	size_t actual_space = sizeof(struct sk_stats_region);
2150 	struct sk_stats_region *__single sreg;
2151 
2152 	if (out == NULL || len < actual_space) {
2153 		goto done;
2154 	}
2155 	sreg = out;
2156 
2157 	skmem_region_get_stats(skr, sreg);
2158 
2159 done:
2160 	return actual_space;
2161 }
2162 
2163 static int
2164 skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS
2165 {
2166 #pragma unused(arg1, arg2, oidp)
2167 	struct skmem_region *skr;
2168 	size_t actual_space;
2169 	size_t buffer_space;
2170 	size_t allocated_space = 0;
2171 	caddr_t __sized_by(allocated_space) buffer = NULL;
2172 	caddr_t scan;
2173 	int error = 0;
2174 
2175 	if (!kauth_cred_issuser(kauth_cred_get())) {
2176 		return EPERM;
2177 	}
2178 
2179 	net_update_uptime();
2180 	buffer_space = req->oldlen;
2181 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
2182 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
2183 			buffer_space = SK_SYSCTL_ALLOC_MAX;
2184 		}
2185 		caddr_t temp;
2186 		temp = sk_alloc_data(buffer_space, Z_WAITOK, skmem_tag_region_mib);
2187 		if (__improbable(temp == NULL)) {
2188 			return ENOBUFS;
2189 		}
2190 		buffer = temp;
2191 		allocated_space = buffer_space;
2192 	} else if (req->oldptr == USER_ADDR_NULL) {
2193 		buffer_space = 0;
2194 	}
2195 	actual_space = 0;
2196 	scan = buffer;
2197 
2198 	SKMEM_REGION_LOCK();
2199 	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
2200 		size_t size = skmem_region_mib_get_stats(skr, scan, buffer_space);
2201 		if (scan != NULL) {
2202 			if (buffer_space < size) {
2203 				/* supplied buffer too small, stop copying */
2204 				error = ENOMEM;
2205 				break;
2206 			}
2207 			scan += size;
2208 			buffer_space -= size;
2209 		}
2210 		actual_space += size;
2211 	}
2212 	SKMEM_REGION_UNLOCK();
2213 
2214 	if (actual_space != 0) {
2215 		int out_error = SYSCTL_OUT(req, buffer, actual_space);
2216 		if (out_error != 0) {
2217 			error = out_error;
2218 		}
2219 	}
2220 	if (buffer != NULL) {
2221 		sk_free_data_sized_by(buffer, allocated_space);
2222 	}
2223 
2224 	return error;
2225 }
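/*
 * EXAMPLE (editorial sketch, not part of the original source): like most
 * stats sysctls, the handler above supports a sizing probe -- a request
 * with oldptr == NULL copies nothing but still accumulates actual_space,
 * which the sysctl layer reports back.  Hypothetical userspace usage
 * (the MIB name below is assumed for illustration):
 */
#if 0 /* illustrative only */
#include <stdlib.h>
#include <sys/sysctl.h>

static void
example_fetch_region_stats(void)
{
	size_t len = 0;

	/* pass 1: probe for the required buffer length */
	if (sysctlbyname("kern.skywalk.stats.region", NULL, &len, NULL, 0) != 0 ||
	    len == 0)
		return;

	void *buf = malloc(len);

	/* pass 2: fetch the sk_stats_region records */
	if (buf != NULL &&
	    sysctlbyname("kern.skywalk.stats.region", buf, &len, NULL, 0) == 0) {
		/* len now holds the bytes actually copied out */
	}
	free(buf);
}
#endif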
2226 
2227 #if SK_LOG
2228 const char *
2229 skmem_region_id2name(skmem_region_id_t id)
2230 {
2231 	const char *name;
2232 	switch (id) {
2233 	case SKMEM_REGION_SCHEMA:
2234 		name = "SCHEMA";
2235 		break;
2236 
2237 	case SKMEM_REGION_RING:
2238 		name = "RING";
2239 		break;
2240 
2241 	case SKMEM_REGION_BUF_DEF:
2242 		name = "BUF_DEF";
2243 		break;
2244 
2245 	case SKMEM_REGION_BUF_LARGE:
2246 		name = "BUF_LARGE";
2247 		break;
2248 
2249 	case SKMEM_REGION_RXBUF_DEF:
2250 		name = "RXBUF_DEF";
2251 		break;
2252 
2253 	case SKMEM_REGION_RXBUF_LARGE:
2254 		name = "RXBUF_LARGE";
2255 		break;
2256 
2257 	case SKMEM_REGION_TXBUF_DEF:
2258 		name = "TXBUF_DEF";
2259 		break;
2260 
2261 	case SKMEM_REGION_TXBUF_LARGE:
2262 		name = "TXBUF_LARGE";
2263 		break;
2264 
2265 	case SKMEM_REGION_UMD:
2266 		name = "UMD";
2267 		break;
2268 
2269 	case SKMEM_REGION_TXAUSD:
2270 		name = "TXAUSD";
2271 		break;
2272 
2273 	case SKMEM_REGION_RXFUSD:
2274 		name = "RXFUSD";
2275 		break;
2276 
2277 	case SKMEM_REGION_USTATS:
2278 		name = "USTATS";
2279 		break;
2280 
2281 	case SKMEM_REGION_FLOWADV:
2282 		name = "FLOWADV";
2283 		break;
2284 
2285 	case SKMEM_REGION_NEXUSADV:
2286 		name = "NEXUSADV";
2287 		break;
2288 
2289 	case SKMEM_REGION_SYSCTLS:
2290 		name = "SYSCTLS";
2291 		break;
2292 
2293 	case SKMEM_REGION_GUARD_HEAD:
2294 		name = "HEADGUARD";
2295 		break;
2296 
2297 	case SKMEM_REGION_GUARD_TAIL:
2298 		name = "TAILGUARD";
2299 		break;
2300 
2301 	case SKMEM_REGION_KMD:
2302 		name = "KMD";
2303 		break;
2304 
2305 	case SKMEM_REGION_RXKMD:
2306 		name = "RXKMD";
2307 		break;
2308 
2309 	case SKMEM_REGION_TXKMD:
2310 		name = "TXKMD";
2311 		break;
2312 
2313 	case SKMEM_REGION_TXAKSD:
2314 		name = "TXAKSD";
2315 		break;
2316 
2317 	case SKMEM_REGION_RXFKSD:
2318 		name = "RXFKSD";
2319 		break;
2320 
2321 	case SKMEM_REGION_KSTATS:
2322 		name = "KSTATS";
2323 		break;
2324 
2325 	case SKMEM_REGION_KBFT:
2326 		name = "KBFT";
2327 		break;
2328 
2329 	case SKMEM_REGION_UBFT:
2330 		name = "UBFT";
2331 		break;
2332 
2333 	case SKMEM_REGION_RXKBFT:
2334 		name = "RXKBFT";
2335 		break;
2336 
2337 	case SKMEM_REGION_TXKBFT:
2338 		name = "TXKBFT";
2339 		break;
2340 
2341 	case SKMEM_REGION_INTRINSIC:
2342 		name = "INTRINSIC";
2343 		break;
2344 
2345 	default:
2346 		name = "UNKNOWN";
2347 		break;
2348 	}
2349 
2350 	const char *__null_terminated s = __unsafe_null_terminated_from_indexable(name);
2351 
2352 	return s;
2353 }
2354 #endif /* SK_LOG */
2355 
2356 #if (DEVELOPMENT || DEBUG)
2357 uint64_t
2358 skmem_region_get_mtbf(void)
2359 {
2360 	return skmem_region_mtbf;
2361 }
2362 
2363 void
2364 skmem_region_set_mtbf(uint64_t newval)
2365 {
2366 	if (newval < SKMEM_REGION_MTBF_MIN) {
2367 		if (newval != 0) {
2368 			newval = SKMEM_REGION_MTBF_MIN;
2369 		}
2370 	} else if (newval > SKMEM_REGION_MTBF_MAX) {
2371 		newval = SKMEM_REGION_MTBF_MAX;
2372 	}
2373 
2374 	if (skmem_region_mtbf != newval) {
2375 		os_atomic_store(&skmem_region_mtbf, newval, release);
2376 		SK_ERR("MTBF set to %llu msec", skmem_region_mtbf);
2377 	}
2378 }
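/*
 * EXAMPLE (editorial note, not part of the original source): the clamp
 * above treats 0 as "fault injection disabled" and pins every other
 * value into [SKMEM_REGION_MTBF_MIN, SKMEM_REGION_MTBF_MAX]; e.g.
 * writing 1 through the sysctl below yields SKMEM_REGION_MTBF_MIN,
 * while writing 0 turns the MTBF mechanism off entirely.
 */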
2379 
2380 static int
2381 skmem_region_mtbf_sysctl(struct sysctl_oid *oidp, void *arg1, int arg2,
2382     struct sysctl_req *req)
2383 {
2384 #pragma unused(oidp, arg1, arg2)
2385 	int changed, error;
2386 	uint64_t newval;
2387 
2388 	_CASSERT(sizeof(skmem_region_mtbf) == sizeof(uint64_t));
2389 	if ((error = sysctl_io_number(req, skmem_region_mtbf,
2390 	    sizeof(uint64_t), &newval, &changed)) == 0) {
2391 		if (changed) {
2392 			skmem_region_set_mtbf(newval);
2393 		}
2394 	}
2395 	return error;
2396 }
2397 #endif /* (DEVELOPMENT || DEBUG) */
2398