/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/* BEGIN CSTYLED */
/*
 * A region represents a collection of one or more similarly-sized memory
 * segments, each of which is a contiguous range of integers.  A segment
 * is either allocated or free, and is treated as disjoint from all other
 * segments.  That is, the contiguity applies only at the segment level,
 * and a region with multiple segments is not contiguous at the region level.
 * A segment always belongs to either the segment freelist or the
 * allocated-address hash chain, as described below.
 *
 * The optional SKMEM_REGION_CR_NOREDIRECT flag indicates that the region
 * stays intact even after a defunct.  Otherwise, the segments belonging
 * to the region will be freed at defunct time, and the span covered by
 * the region will be redirected to zero-filled anonymous memory.
 *
 * Memory for a region is always created as pageable and purgeable.  It is
 * the client's responsibility to prepare (wire) it, and optionally insert
 * it into the IOMMU, at segment construction time.  When the segment is
 * freed, the client is responsible for removing it from the IOMMU (if
 * needed) and completing (unwiring) it.
 *
 * When the region is created with SKMEM_REGION_CR_PERSISTENT, the memory
 * is immediately wired upon allocation (segment removed from the freelist).
 * It gets unwired when the memory is discarded (segment inserted into the
 * freelist).
 *
 * The chronological life cycle of a segment is as such:
 *
 *    SKSEG_STATE_DETACHED
 *        SKSEG_STATE_{MAPPED,MAPPED_WIRED}
 *            [segment allocated, usable by client]
 *              ...
 *            [client frees segment]
 *        SKSEG_STATE_{MAPPED,MAPPED_WIRED}
 *      [reclaim]
 *    SKSEG_STATE_DETACHED
 *
 * The region can also be marked as user-mappable (SKMEM_REGION_CR_MMAPOK);
 * this allows it to be further marked with SKMEM_REGION_CR_UREADONLY to
 * prevent modifications by the user task.  Only user-mappable regions will
 * be considered for inclusion during skmem_arena_mmap().
 *
 * Every skmem allocator has a region as its slab supplier.  Each slab is
 * exactly a segment.  The allocator uses skmem_region_{alloc,free}() to
 * create and destroy slabs.
 *
 * A region may be mirrored by another region; the latter acts as the master
 * controller for both regions.  Mirrored (slave) regions cannot be used
 * directly by the skmem allocator.  The region mirroring technique is used
 * for managing shadow objects {umd,kmd} and {usd,ksd}, where an object in
 * one region has the same size and lifetime as its shadow counterpart.
 *
 * CREATION/DESTRUCTION:
 *
 *   At creation time, all segments are allocated and are immediately inserted
 *   into the freelist.  Allocating a purgeable segment has very little cost,
 *   as it is not backed by physical memory until it is accessed.  Immediate
 *   insertion into the freelist causes the underlying mapping to be torn
 *   down as well.
 *
 *   At destruction time, the freelist is emptied, and each segment is then
 *   destroyed.  The system will assert if it detects there are outstanding
 *   segments not yet returned to the region (not freed by the client).
 *
 * ALLOCATION:
 *
 *   Allocating involves searching the freelist for a segment; if found, the
 *   segment is removed from the freelist and is inserted into the allocated-
 *   address hash chain.  The address of the memory object represented by
 *   the segment is used as the hash key.  The allocated-address hash chain
 *   is needed since we return the address of the memory object, not the
 *   segment itself, to the client.
 *
 * DEALLOCATION:
 *
 *   Freeing a memory object causes the chain to be searched for a matching
 *   segment.  The system will assert if a segment cannot be found, since
 *   that indicates that the memory object address is invalid.  Once found,
 *   the segment is removed from the allocated-address hash chain, and is
 *   inserted into the freelist.
 *
 * Segment allocation and deallocation can be expensive.  Because of this,
 * we expect that most clients will use the skmem_cache slab allocator
 * as the frontend instead.
 */
/* END CSTYLED */
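
/*
 * Illustrative sketch of the typical client flow described above (not
 * compiled here; "example_seg_ctor", "example_seg_dtor" and the parameter
 * values are hypothetical placeholders):
 *
 *	struct skmem_region_params srp = ...;	// e.g. a default entry
 *	struct skmem_region *skr;
 *	struct sksegment *sg = NULL;
 *	void *addr, *maddr = NULL;
 *
 *	srp.srp_r_obj_size = 2048;		// requested object size
 *	srp.srp_r_obj_cnt = 512;		// requested object count
 *	skmem_region_params_config(&srp);	// computes srp_c_* values
 *
 *	skr = skmem_region_create("example", &srp, example_seg_ctor,
 *	    example_seg_dtor, NULL);
 *	addr = skmem_region_alloc(skr, &maddr, &sg, NULL, SKMEM_NOSLEEP);
 *	if (addr != NULL) {
 *		...				// use the segment's memory
 *		skmem_region_free(skr, addr, maddr);
 *	}
 *	(void) skmem_region_release(skr);	// drop the caller's reference
 */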

#include <skywalk/os_skywalk_private.h>
#define _FN_KPRINTF             /* don't redefine kprintf() */
#include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */

static void skmem_region_destroy(struct skmem_region *skr);
static void skmem_region_depopulate(struct skmem_region *);
static int sksegment_cmp(const struct sksegment *, const struct sksegment *);
static struct sksegment *sksegment_create(struct skmem_region *, uint32_t);
static void sksegment_destroy(struct skmem_region *, struct sksegment *);
static void sksegment_freelist_insert(struct skmem_region *,
    struct sksegment *, boolean_t);
static struct sksegment *sksegment_freelist_remove(struct skmem_region *,
    struct sksegment *, uint32_t, boolean_t);
static struct sksegment *sksegment_freelist_grow(struct skmem_region *);
static struct sksegment *sksegment_alloc_with_idx(struct skmem_region *,
    uint32_t);
static void *skmem_region_alloc_common(struct skmem_region *,
    struct sksegment *);
static void *skmem_region_mirror_alloc(struct skmem_region *,
    struct sksegment *, struct sksegment **);
static void skmem_region_applyall(void (*)(struct skmem_region *));
static void skmem_region_update(struct skmem_region *);
static void skmem_region_update_func(thread_call_param_t, thread_call_param_t);
static inline void skmem_region_retain_locked(struct skmem_region *);
static inline boolean_t skmem_region_release_locked(struct skmem_region *);
static int skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS;

RB_PROTOTYPE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp);
RB_GENERATE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp);

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, region,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, skmem_region_mib_get_sysctl, "S,sk_stats_region",
    "Skywalk region statistics");

static LCK_ATTR_DECLARE(skmem_region_lock_attr, 0, 0);
static LCK_GRP_DECLARE(skmem_region_lock_grp, "skmem_region");
static LCK_MTX_DECLARE_ATTR(skmem_region_lock, &skmem_region_lock_grp,
    &skmem_region_lock_attr);

/* protected by skmem_region_lock */
static TAILQ_HEAD(, skmem_region) skmem_region_head;

static thread_call_t skmem_region_update_tc;

#define SKMEM_REGION_UPDATE_INTERVAL    13      /* 13 seconds */
static uint32_t skmem_region_update_interval = SKMEM_REGION_UPDATE_INTERVAL;

#define SKMEM_WDT_MAXTIME               30      /* # of secs before watchdog */
#define SKMEM_WDT_PURGE                 3       /* retry purge threshold */

#if (DEVELOPMENT || DEBUG)
/* Mean Time Between Failures (ms) */
static volatile uint64_t skmem_region_mtbf;

static int skmem_region_mtbf_sysctl(struct sysctl_oid *, void *, int,
    struct sysctl_req *);

SYSCTL_PROC(_kern_skywalk_mem, OID_AUTO, region_mtbf,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
    skmem_region_mtbf_sysctl, "Q", "Region MTBF (ms)");

SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, region_update_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_region_update_interval,
    SKMEM_REGION_UPDATE_INTERVAL, "Region update interval (sec)");
#endif /* (DEVELOPMENT || DEBUG) */

#define SKMEM_REGION_LOCK()                     \
	lck_mtx_lock(&skmem_region_lock)
#define SKMEM_REGION_LOCK_ASSERT_HELD()         \
	LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_OWNED)
#define SKMEM_REGION_LOCK_ASSERT_NOTHELD()      \
	LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKMEM_REGION_UNLOCK()                   \
	lck_mtx_unlock(&skmem_region_lock)

/*
 * Hash table bounds.  Start with the initial value, and rescale up to
 * the specified limit.  Ideally we don't need a limit, but in practice
 * this helps guard against runaways.  These values should be revisited
 * in the future and adjusted as needed.
 */
#define SKMEM_REGION_HASH_INITIAL       32      /* initial hash table size */
#define SKMEM_REGION_HASH_LIMIT         4096    /* hash table size limit */

#define SKMEM_REGION_HASH_INDEX(_a, _s, _m)     \
	(((_a) + ((_a) >> (_s)) + ((_a) >> ((_s) << 1))) & (_m))
#define SKMEM_REGION_HASH(_skr, _addr)                                     \
	(&(_skr)->skr_hash_table[SKMEM_REGION_HASH_INDEX((uintptr_t)_addr, \
	    (_skr)->skr_hash_shift, (_skr)->skr_hash_mask)])
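
/*
 * Worked example of the hash fold above (hypothetical values): with a
 * 32KB segment size, skr_hash_shift = flsll(0x8000) - 1 = 15 (see
 * skmem_region_create()), and with the initial 32-bucket table
 * skr_hash_mask = 0x1f.  Segment addresses 0x10000 and 0x18000 then
 * hash to (0x10000 + 2 + 0) & 0x1f = 2 and (0x18000 + 3 + 0) & 0x1f = 3,
 * i.e. consecutive segments land in consecutive buckets instead of all
 * colliding on their zeroed low bits.
 */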

static ZONE_DEFINE(skr_zone, SKMEM_ZONE_PREFIX ".mem.skr",
    sizeof(struct skmem_region), ZC_ZFREE_CLEARMEM);

static unsigned int sg_size;                    /* size of zone element */
static struct skmem_cache *skmem_sg_cache;      /* cache for sksegment */

static uint32_t skmem_seg_size = SKMEM_SEG_SIZE;
static uint32_t skmem_md_seg_size = SKMEM_MD_SEG_SIZE;
static uint32_t skmem_drv_buf_seg_size = SKMEM_DRV_BUF_SEG_SIZE;
static uint32_t skmem_drv_buf_seg_eff_size = SKMEM_DRV_BUF_SEG_SIZE;
uint32_t skmem_usr_buf_seg_size = SKMEM_USR_BUF_SEG_SIZE;

#define SKMEM_TAG_SEGMENT_BMAP  "com.apple.skywalk.segment.bmap"
static SKMEM_TAG_DEFINE(skmem_tag_segment_bmap, SKMEM_TAG_SEGMENT_BMAP);

#define SKMEM_TAG_SEGMENT_HASH  "com.apple.skywalk.segment.hash"
static SKMEM_TAG_DEFINE(skmem_tag_segment_hash, SKMEM_TAG_SEGMENT_HASH);

#define SKMEM_TAG_REGION_MIB     "com.apple.skywalk.region.mib"
static SKMEM_TAG_DEFINE(skmem_tag_region_mib, SKMEM_TAG_REGION_MIB);

#define BMAPSZ  64

/* 64-bit mask with range */
#define BMASK64(_beg, _end)     \
	((((uint64_t)-1) >> ((BMAPSZ - 1) - (_end))) & ~((1ULL << (_beg)) - 1))
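
/*
 * For example, BMASK64(2, 5) == 0x3c: the first term keeps bits 0..5
 * ((uint64_t)-1 >> 58 == 0x3f), the second clears bits 0..1 (~0x3),
 * leaving bits 2 through 5 (inclusive) set.
 */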

static int __skmem_region_inited = 0;

void
skmem_region_init(void)
{
	boolean_t randomize_seg_size;

	_CASSERT(sizeof(bitmap_t) == sizeof(uint64_t));
	_CASSERT(BMAPSZ == (sizeof(bitmap_t) << 3));
	_CASSERT((SKMEM_SEG_SIZE % SKMEM_PAGE_SIZE) == 0);
	_CASSERT(SKMEM_REGION_HASH_LIMIT >= SKMEM_REGION_HASH_INITIAL);
	ASSERT(!__skmem_region_inited);

	/* enforce the ordering here */
	_CASSERT(SKMEM_REGION_GUARD_HEAD == 0);
	_CASSERT(SKMEM_REGION_SCHEMA == 1);
	_CASSERT(SKMEM_REGION_RING == 2);
	_CASSERT(SKMEM_REGION_BUF == 3);
	_CASSERT(SKMEM_REGION_RXBUF == 4);
	_CASSERT(SKMEM_REGION_TXBUF == 5);
	_CASSERT(SKMEM_REGION_UMD == 6);
	_CASSERT(SKMEM_REGION_TXAUSD == 7);
	_CASSERT(SKMEM_REGION_RXFUSD == 8);
	_CASSERT(SKMEM_REGION_UBFT == 9);
	_CASSERT(SKMEM_REGION_USTATS == 10);
	_CASSERT(SKMEM_REGION_FLOWADV == 11);
	_CASSERT(SKMEM_REGION_NEXUSADV == 12);
	_CASSERT(SKMEM_REGION_SYSCTLS == 13);
	_CASSERT(SKMEM_REGION_GUARD_TAIL == 14);
	_CASSERT(SKMEM_REGION_KMD == 15);
	_CASSERT(SKMEM_REGION_RXKMD == 16);
	_CASSERT(SKMEM_REGION_TXKMD == 17);
	_CASSERT(SKMEM_REGION_KBFT == 18);
	_CASSERT(SKMEM_REGION_RXKBFT == 19);
	_CASSERT(SKMEM_REGION_TXKBFT == 20);
	_CASSERT(SKMEM_REGION_TXAKSD == 21);
	_CASSERT(SKMEM_REGION_RXFKSD == 22);
	_CASSERT(SKMEM_REGION_KSTATS == 23);
	_CASSERT(SKMEM_REGION_INTRINSIC == 24);

	_CASSERT(SREG_GUARD_HEAD == SKMEM_REGION_GUARD_HEAD);
	_CASSERT(SREG_SCHEMA == SKMEM_REGION_SCHEMA);
	_CASSERT(SREG_RING == SKMEM_REGION_RING);
	_CASSERT(SREG_BUF == SKMEM_REGION_BUF);
	_CASSERT(SREG_RXBUF == SKMEM_REGION_RXBUF);
	_CASSERT(SREG_TXBUF == SKMEM_REGION_TXBUF);
	_CASSERT(SREG_UMD == SKMEM_REGION_UMD);
	_CASSERT(SREG_TXAUSD == SKMEM_REGION_TXAUSD);
	_CASSERT(SREG_RXFUSD == SKMEM_REGION_RXFUSD);
	_CASSERT(SREG_UBFT == SKMEM_REGION_UBFT);
	_CASSERT(SREG_USTATS == SKMEM_REGION_USTATS);
	_CASSERT(SREG_FLOWADV == SKMEM_REGION_FLOWADV);
	_CASSERT(SREG_NEXUSADV == SKMEM_REGION_NEXUSADV);
	_CASSERT(SREG_SYSCTLS == SKMEM_REGION_SYSCTLS);
	_CASSERT(SREG_GUARD_TAIL == SKMEM_REGION_GUARD_TAIL);
	_CASSERT(SREG_KMD == SKMEM_REGION_KMD);
	_CASSERT(SREG_RXKMD == SKMEM_REGION_RXKMD);
	_CASSERT(SREG_TXKMD == SKMEM_REGION_TXKMD);
	_CASSERT(SREG_KBFT == SKMEM_REGION_KBFT);
	_CASSERT(SREG_RXKBFT == SKMEM_REGION_RXKBFT);
	_CASSERT(SREG_TXKBFT == SKMEM_REGION_TXKBFT);
	_CASSERT(SREG_TXAKSD == SKMEM_REGION_TXAKSD);
	_CASSERT(SREG_RXFKSD == SKMEM_REGION_RXFKSD);
	_CASSERT(SREG_KSTATS == SKMEM_REGION_KSTATS);

	_CASSERT(SKR_MODE_NOREDIRECT == SREG_MODE_NOREDIRECT);
	_CASSERT(SKR_MODE_MMAPOK == SREG_MODE_MMAPOK);
	_CASSERT(SKR_MODE_UREADONLY == SREG_MODE_UREADONLY);
	_CASSERT(SKR_MODE_KREADONLY == SREG_MODE_KREADONLY);
	_CASSERT(SKR_MODE_PERSISTENT == SREG_MODE_PERSISTENT);
	_CASSERT(SKR_MODE_MONOLITHIC == SREG_MODE_MONOLITHIC);
	_CASSERT(SKR_MODE_NOMAGAZINES == SREG_MODE_NOMAGAZINES);
	_CASSERT(SKR_MODE_NOCACHE == SREG_MODE_NOCACHE);
	_CASSERT(SKR_MODE_IODIR_IN == SREG_MODE_IODIR_IN);
	_CASSERT(SKR_MODE_IODIR_OUT == SREG_MODE_IODIR_OUT);
	_CASSERT(SKR_MODE_GUARD == SREG_MODE_GUARD);
	_CASSERT(SKR_MODE_SEGPHYSCONTIG == SREG_MODE_SEGPHYSCONTIG);
	_CASSERT(SKR_MODE_SHAREOK == SREG_MODE_SHAREOK);
	_CASSERT(SKR_MODE_PUREDATA == SREG_MODE_PUREDATA);
	_CASSERT(SKR_MODE_PSEUDO == SREG_MODE_PSEUDO);
	_CASSERT(SKR_MODE_SLAB == SREG_MODE_SLAB);
	_CASSERT(SKR_MODE_MIRRORED == SREG_MODE_MIRRORED);

	(void) PE_parse_boot_argn("skmem_seg_size", &skmem_seg_size,
	    sizeof(skmem_seg_size));
	if (skmem_seg_size < SKMEM_MIN_SEG_SIZE) {
		skmem_seg_size = SKMEM_MIN_SEG_SIZE;
	}
	skmem_seg_size = (uint32_t)P2ROUNDUP(skmem_seg_size,
	    SKMEM_MIN_SEG_SIZE);
	VERIFY(skmem_seg_size != 0 && (skmem_seg_size % SKMEM_PAGE_SIZE) == 0);

	(void) PE_parse_boot_argn("skmem_md_seg_size", &skmem_md_seg_size,
	    sizeof(skmem_md_seg_size));
	if (skmem_md_seg_size < skmem_seg_size) {
		skmem_md_seg_size = skmem_seg_size;
	}
	skmem_md_seg_size = (uint32_t)P2ROUNDUP(skmem_md_seg_size,
	    SKMEM_MIN_SEG_SIZE);
	VERIFY((skmem_md_seg_size % SKMEM_PAGE_SIZE) == 0);

	/*
	 * If set via boot-args, honor it and don't randomize.
	 */
	randomize_seg_size = !PE_parse_boot_argn("skmem_drv_buf_seg_size",
	    &skmem_drv_buf_seg_size, sizeof(skmem_drv_buf_seg_size));
	if (skmem_drv_buf_seg_size < skmem_seg_size) {
		skmem_drv_buf_seg_size = skmem_seg_size;
	}
	skmem_drv_buf_seg_size = skmem_drv_buf_seg_eff_size =
	    (uint32_t)P2ROUNDUP(skmem_drv_buf_seg_size, SKMEM_MIN_SEG_SIZE);
	VERIFY((skmem_drv_buf_seg_size % SKMEM_PAGE_SIZE) == 0);

	/*
	 * Randomize the driver buffer segment size; here we choose
	 * a random multiple of SKMEM_MIN_SEG_SIZE to bump the value
	 * up by.  Set this as the effective driver buffer segment size.
	 */
	if (randomize_seg_size) {
		uint32_t sm;
		read_frandom(&sm, sizeof(sm));
		skmem_drv_buf_seg_eff_size +=
		    (SKMEM_MIN_SEG_SIZE * (sm % SKMEM_DRV_BUF_SEG_MULTIPLIER));
		VERIFY((skmem_drv_buf_seg_eff_size % SKMEM_MIN_SEG_SIZE) == 0);
	}
	VERIFY(skmem_drv_buf_seg_eff_size >= skmem_drv_buf_seg_size);

	(void) PE_parse_boot_argn("skmem_usr_buf_seg_size",
	    &skmem_usr_buf_seg_size, sizeof(skmem_usr_buf_seg_size));
	if (skmem_usr_buf_seg_size < skmem_seg_size) {
		skmem_usr_buf_seg_size = skmem_seg_size;
	}
	skmem_usr_buf_seg_size = (uint32_t)P2ROUNDUP(skmem_usr_buf_seg_size,
	    SKMEM_MIN_SEG_SIZE);
	VERIFY((skmem_usr_buf_seg_size % SKMEM_PAGE_SIZE) == 0);

	SK_ERR("seg_size %u, md_seg_size %u, drv_buf_seg_size %u [eff %u], "
	    "usr_buf_seg_size %u", skmem_seg_size, skmem_md_seg_size,
	    skmem_drv_buf_seg_size, skmem_drv_buf_seg_eff_size,
	    skmem_usr_buf_seg_size);

	TAILQ_INIT(&skmem_region_head);

	skmem_region_update_tc =
	    thread_call_allocate_with_options(skmem_region_update_func,
	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (skmem_region_update_tc == NULL) {
		panic("%s: thread_call_allocate failed", __func__);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	sg_size = sizeof(struct sksegment);
	skmem_sg_cache = skmem_cache_create("sg", sg_size,
	    sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);

	/* and start the periodic region update machinery */
	skmem_dispatch(skmem_region_update_tc, NULL,
	    (skmem_region_update_interval * NSEC_PER_SEC));

	__skmem_region_inited = 1;
}

void
skmem_region_fini(void)
{
	if (__skmem_region_inited) {
		ASSERT(TAILQ_EMPTY(&skmem_region_head));

		if (skmem_region_update_tc != NULL) {
			(void) thread_call_cancel_wait(skmem_region_update_tc);
			(void) thread_call_free(skmem_region_update_tc);
			skmem_region_update_tc = NULL;
		}

		if (skmem_sg_cache != NULL) {
			skmem_cache_destroy(skmem_sg_cache);
			skmem_sg_cache = NULL;
		}

		__skmem_region_inited = 0;
	}
}

/*
 * Reap internal caches.
 */
void
skmem_region_reap_caches(boolean_t purge)
{
	skmem_cache_reap_now(skmem_sg_cache, purge);
}

/*
 * Configure and compute the parameters of a region.
 */
void
skmem_region_params_config(struct skmem_region_params *srp)
{
	uint32_t cache_line_size = skmem_cpu_cache_line_size();
	size_t seglim, segsize, segcnt;
	size_t objsize, objcnt;

	ASSERT(srp->srp_id < SKMEM_REGIONS);

	/*
	 * If the magazines layer is disabled system-wide, override
	 * the region parameter here.  This will effectively reduce
	 * the number of requested objects computed below.  Note that
	 * the region may have already been configured to exclude
	 * magazines in the default skmem_regions[] array.
	 */
	if (!skmem_allow_magazines()) {
		srp->srp_cflags |= SKMEM_REGION_CR_NOMAGAZINES;
	}

	objsize = srp->srp_r_obj_size;
	ASSERT(objsize != 0);
	objcnt = srp->srp_r_obj_cnt;
	ASSERT(objcnt != 0);

	if (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO) {
		size_t align = srp->srp_align;

		VERIFY(align != 0 && (align % SKMEM_CACHE_ALIGN) == 0);
		VERIFY(powerof2(align));
		objsize = MAX(objsize, sizeof(uint64_t));
#if KASAN
		/*
		 * When KASAN is enabled, the zone allocator adjusts the
		 * element size to include the redzone regions, in which
		 * case we assume that the elements won't start on the
		 * alignment boundary and thus need to do some fix-ups.
		 * These include increasing the effective object size
		 * which adds at least 16 bytes to the original size.
		 */
		objsize += sizeof(uint64_t) + align;
#endif /* KASAN */
		objsize = P2ROUNDUP(objsize, align);

		segsize = objsize;
		srp->srp_r_seg_size = (uint32_t)segsize;
		segcnt = objcnt;
		goto done;
	} else {
		/* objects are always aligned at CPU cache line size */
		srp->srp_align = cache_line_size;
	}

	/*
	 * Start with the default segment size for the region, and compute
	 * the effective segment size (to the nearest SKMEM_MIN_SEG_SIZE).
	 * If the object size is greater, then we adjust the segment size
	 * to the next multiple of the effective size larger than the
	 * object size.
	 */
	if (srp->srp_r_seg_size == 0) {
		switch (srp->srp_id) {
		case SKMEM_REGION_UMD:
		case SKMEM_REGION_KMD:
		case SKMEM_REGION_RXKMD:
		case SKMEM_REGION_TXKMD:
			srp->srp_r_seg_size = skmem_md_seg_size;
			break;

		case SKMEM_REGION_BUF:
		case SKMEM_REGION_RXBUF:
		case SKMEM_REGION_TXBUF:
			/*
			 * Use the effective driver buffer segment size,
			 * since it reflects any randomization done at
			 * skmem_region_init() time.
			 */
			srp->srp_r_seg_size = skmem_drv_buf_seg_eff_size;
			break;

		default:
			srp->srp_r_seg_size = skmem_seg_size;
			break;
		}
	} else {
		srp->srp_r_seg_size = (uint32_t)P2ROUNDUP(srp->srp_r_seg_size,
		    SKMEM_MIN_SEG_SIZE);
	}

	seglim = srp->srp_r_seg_size;
	VERIFY(seglim != 0 && (seglim % SKMEM_PAGE_SIZE) == 0);

	SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu objcnt %zu",
	    srp->srp_name, seglim, objsize, objcnt);

	/*
	 * Make sure the object size is a multiple of the CPU cache line
	 * size, and that it evenly divides the segment size.
	 */
	if (!((objsize < cache_line_size) && (objsize < seglim) &&
	    ((cache_line_size % objsize) == 0) && ((seglim % objsize) == 0))) {
		objsize = P2ROUNDUP(objsize, cache_line_size);
		while (objsize < seglim && (seglim % objsize) != 0) {
			SK_DF(SK_VERB_MEM, "%s: objsize %zu -> %zu",
			    srp->srp_name, objsize, objsize + cache_line_size);
			objsize += cache_line_size;
		}
	}

	/* segment must be larger than object */
	while (objsize > seglim) {
		SK_DF(SK_VERB_MEM, "%s: seglim %zu -> %zu", srp->srp_name,
		    seglim, seglim + SKMEM_MIN_SEG_SIZE);
		seglim += SKMEM_MIN_SEG_SIZE;
	}

	/*
	 * Take into account worst-case per-CPU cached
	 * objects if this region is configured for it.
	 */
	if (!(srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES)) {
		uint32_t magazine_max_objs =
		    skmem_cache_magazine_max((uint32_t)objsize);
		SK_DF(SK_VERB_MEM, "%s: objcnt %zu -> %zu", srp->srp_name,
		    objcnt, objcnt + magazine_max_objs);
		objcnt += magazine_max_objs;
	}

	SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu "
	    "objcnt %zu", srp->srp_name, seglim, objsize, objcnt);

	segsize = P2ROUNDUP(objsize * objcnt, SKMEM_MIN_SEG_SIZE);
	if (seglim > segsize) {
		/*
		 * If the segment limit is larger than what we need,
		 * avoid memory wastage by shrinking it.
		 */
		while (seglim > segsize && seglim > SKMEM_MIN_SEG_SIZE) {
			VERIFY(seglim >= SKMEM_MIN_SEG_SIZE);
			SK_DF(SK_VERB_MEM,
			    "%s: segsize %zu (%zu*%zu) seglim [-] %zu -> %zu",
			    srp->srp_name, segsize, objsize, objcnt, seglim,
			    P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE,
			    SKMEM_MIN_SEG_SIZE));
			seglim = P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE,
			    SKMEM_MIN_SEG_SIZE);
		}

		/* adjust segment size */
		segsize = seglim;
	} else if (seglim < segsize) {
		size_t oseglim = seglim;
		/*
		 * If the segment limit is less than the segment size,
		 * see if increasing it slightly (up to 1.5x the segment
		 * size) would allow us to avoid allocating too many
		 * extra objects (due to excessive segment count).
		 */
		while (seglim < segsize && (segsize % seglim) != 0) {
			SK_DF(SK_VERB_MEM,
			    "%s: segsize %zu (%zu*%zu) seglim [+] %zu -> %zu",
			    srp->srp_name, segsize, objsize, objcnt, seglim,
			    (seglim + SKMEM_MIN_SEG_SIZE));
			seglim += SKMEM_MIN_SEG_SIZE;
			if (seglim >= (oseglim + (oseglim >> 1))) {
				break;
			}
		}

		/* can't use P2ROUNDUP since seglim may not be power of 2 */
		segsize = SK_ROUNDUP(segsize, seglim);
	}
	ASSERT(segsize != 0 && (segsize % seglim) == 0);

	SK_DF(SK_VERB_MEM, "%s: segsize %zu seglim %zu",
	    srp->srp_name, segsize, seglim);

	/* compute segment count, and recompute segment size */
	if (srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) {
		segcnt = 1;
	} else {
		/*
		 * The adjustments above were done in increments of
		 * SKMEM_MIN_SEG_SIZE.  If the object size is greater
		 * than that, ensure that the segment size is a multiple
		 * of the object size.
		 */
		if (objsize > SKMEM_MIN_SEG_SIZE) {
			ASSERT(seglim >= objsize);
			if ((seglim % objsize) != 0) {
				seglim += (seglim - objsize);
			}
			/* recompute segsize; see SK_ROUNDUP comment above */
			segsize = SK_ROUNDUP(segsize, seglim);
		}

		segcnt = MAX(1, (segsize / seglim));
		segsize /= segcnt;
	}

	SK_DF(SK_VERB_MEM, "%s: segcnt %zu segsize %zu",
	    srp->srp_name, segcnt, segsize);

	/* recompute object count to avoid wastage */
	objcnt = (segsize * segcnt) / objsize;
	ASSERT(objcnt != 0);
done:
	srp->srp_c_obj_size = (uint32_t)objsize;
	srp->srp_c_obj_cnt = (uint32_t)objcnt;
	srp->srp_c_seg_size = (uint32_t)segsize;
	srp->srp_seg_cnt = (uint32_t)segcnt;

	SK_DF(SK_VERB_MEM, "%s: objsize %zu objcnt %zu segcnt %zu segsize %zu",
	    srp->srp_name, objsize, objcnt, segcnt, segsize);

#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		char label[32];
		(void) snprintf(label, sizeof(label), "REGION_%s:",
		    skmem_region_id2name(srp->srp_id));
		SK_D("%-16s o:[%4u x %6u -> %4u x %6u]", label,
		    (uint32_t)srp->srp_r_obj_cnt,
		    (uint32_t)srp->srp_r_obj_size,
		    (uint32_t)srp->srp_c_obj_cnt,
		    (uint32_t)srp->srp_c_obj_size);
	}
#endif /* SK_LOG */
}
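
/*
 * Worked example of the sizing logic above (hypothetical values, and
 * assuming a 32KB minimum/default segment size with magazines disabled):
 * requesting 100 objects of 2048 bytes gives
 * segsize = P2ROUNDUP(2048 * 100, 32KB) = 224KB, which splits evenly
 * into segcnt = 7 segments of 32KB each; the object count is then
 * recomputed as (32768 * 7) / 2048 = 112 to consume the tail space
 * that would otherwise be wasted.
 */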

/*
 * Create a region.
 */
struct skmem_region *
skmem_region_create(const char *name, struct skmem_region_params *srp,
    sksegment_ctor_fn_t ctor, sksegment_dtor_fn_t dtor, void *private)
{
	boolean_t pseudo = (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO);
	uint32_t cflags = srp->srp_cflags;
	struct skmem_region *skr;
	uint32_t i;

	ASSERT(srp->srp_id < SKMEM_REGIONS);
	ASSERT(srp->srp_c_seg_size != 0 &&
	    (pseudo || (srp->srp_c_seg_size % SKMEM_PAGE_SIZE) == 0));
	ASSERT(srp->srp_seg_cnt != 0);
	ASSERT(srp->srp_c_obj_cnt == 1 ||
	    (srp->srp_c_seg_size % srp->srp_c_obj_size) == 0);
	ASSERT(srp->srp_c_obj_size <= srp->srp_c_seg_size);

	skr = zalloc_flags(skr_zone, Z_WAITOK | Z_ZERO);
	skr->skr_params.srp_r_seg_size = srp->srp_r_seg_size;
	skr->skr_seg_size = srp->srp_c_seg_size;
	skr->skr_size = (srp->srp_c_seg_size * srp->srp_seg_cnt);
	skr->skr_seg_objs = (srp->srp_c_seg_size / srp->srp_c_obj_size);

	if (!pseudo) {
		skr->skr_seg_max_cnt = srp->srp_seg_cnt;

		/* set alignment to CPU cache line size */
		skr->skr_params.srp_align = skmem_cpu_cache_line_size();

		/* allocate the allocated-address hash chain */
		skr->skr_hash_initial = SKMEM_REGION_HASH_INITIAL;
		skr->skr_hash_limit = SKMEM_REGION_HASH_LIMIT;
		skr->skr_hash_table = sk_alloc_type_array(struct sksegment_bkt,
		    skr->skr_hash_initial, Z_WAITOK | Z_NOFAIL,
		    skmem_tag_segment_hash);
		skr->skr_hash_mask = (skr->skr_hash_initial - 1);
		skr->skr_hash_shift = flsll(srp->srp_c_seg_size) - 1;

		for (i = 0; i < (skr->skr_hash_mask + 1); i++) {
			TAILQ_INIT(&skr->skr_hash_table[i].sgb_head);
		}
	} else {
		/* this upper bound doesn't apply */
		skr->skr_seg_max_cnt = 0;

		/* pick up value set by skmem_region_params_config() */
		skr->skr_params.srp_align = srp->srp_align;
	}

	skr->skr_r_obj_size = srp->srp_r_obj_size;
	skr->skr_r_obj_cnt = srp->srp_r_obj_cnt;
	skr->skr_c_obj_size = srp->srp_c_obj_size;
	skr->skr_c_obj_cnt = srp->srp_c_obj_cnt;

	skr->skr_params.srp_md_type = srp->srp_md_type;
	skr->skr_params.srp_md_subtype = srp->srp_md_subtype;
	skr->skr_params.srp_max_frags = srp->srp_max_frags;

	skr->skr_seg_ctor = ctor;
	skr->skr_seg_dtor = dtor;
	skr->skr_private = private;

	lck_mtx_init(&skr->skr_lock, &skmem_region_lock_grp,
	    &skmem_region_lock_attr);

	TAILQ_INIT(&skr->skr_seg_free);
	RB_INIT(&skr->skr_seg_tfree);

	skr->skr_id = srp->srp_id;
	uuid_generate_random(skr->skr_uuid);
	(void) snprintf(skr->skr_name, sizeof(skr->skr_name),
	    "%s.%s.%s", SKMEM_REGION_PREFIX, srp->srp_name, name);

	SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ",
	    skr->skr_name, SK_KVA(skr));

	/* sanity check */
	ASSERT(!(cflags & SKMEM_REGION_CR_GUARD) ||
	    !(cflags & (SKMEM_REGION_CR_KREADONLY | SKMEM_REGION_CR_UREADONLY |
	    SKMEM_REGION_CR_PERSISTENT | SKMEM_REGION_CR_SHAREOK |
	    SKMEM_REGION_CR_IODIR_IN | SKMEM_REGION_CR_IODIR_OUT |
	    SKMEM_REGION_CR_PUREDATA)));

	skr->skr_cflags = cflags;
	if (cflags & SKMEM_REGION_CR_NOREDIRECT) {
		skr->skr_mode |= SKR_MODE_NOREDIRECT;
	}
	if (cflags & SKMEM_REGION_CR_MMAPOK) {
		skr->skr_mode |= SKR_MODE_MMAPOK;
	}
	if ((cflags & SKMEM_REGION_CR_MMAPOK) &&
	    (cflags & SKMEM_REGION_CR_UREADONLY)) {
		skr->skr_mode |= SKR_MODE_UREADONLY;
	}
	if (cflags & SKMEM_REGION_CR_KREADONLY) {
		skr->skr_mode |= SKR_MODE_KREADONLY;
	}
	if (cflags & SKMEM_REGION_CR_PERSISTENT) {
		skr->skr_mode |= SKR_MODE_PERSISTENT;
	}
	if (cflags & SKMEM_REGION_CR_MONOLITHIC) {
		skr->skr_mode |= SKR_MODE_MONOLITHIC;
	}
	if (cflags & SKMEM_REGION_CR_NOMAGAZINES) {
		skr->skr_mode |= SKR_MODE_NOMAGAZINES;
	}
	if (cflags & SKMEM_REGION_CR_NOCACHE) {
		skr->skr_mode |= SKR_MODE_NOCACHE;
	}
	if (cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) {
		skr->skr_mode |= SKR_MODE_SEGPHYSCONTIG;
	}
	if (cflags & SKMEM_REGION_CR_SHAREOK) {
		skr->skr_mode |= SKR_MODE_SHAREOK;
	}
	if (cflags & SKMEM_REGION_CR_IODIR_IN) {
		skr->skr_mode |= SKR_MODE_IODIR_IN;
	}
	if (cflags & SKMEM_REGION_CR_IODIR_OUT) {
		skr->skr_mode |= SKR_MODE_IODIR_OUT;
	}
	if (cflags & SKMEM_REGION_CR_GUARD) {
		skr->skr_mode |= SKR_MODE_GUARD;
	}
	if (cflags & SKMEM_REGION_CR_PUREDATA) {
		skr->skr_mode |= SKR_MODE_PUREDATA;
	}
	if (cflags & SKMEM_REGION_CR_PSEUDO) {
		skr->skr_mode |= SKR_MODE_PSEUDO;
	}

#if XNU_TARGET_OS_OSX
	/*
	 * Mark all regions as persistent except for the guard and Intrinsic
	 * regions.  This is to ensure that kernel threads won't be
	 * faulting-in while accessing these memory regions.  We have
	 * observed various kinds of kernel panics due to kernel threads
	 * faulting on non-wired memory access when the VM subsystem is not
	 * in a state to swap-in the page.
	 */
	if (!((skr->skr_mode & SKR_MODE_PSEUDO) ||
	    (skr->skr_mode & SKR_MODE_GUARD))) {
		skr->skr_mode |= SKR_MODE_PERSISTENT;
	}
#endif /* XNU_TARGET_OS_OSX */

	/* SKR_MODE_UREADONLY only takes effect for user task mapping */
	skr->skr_bufspec.user_writable = !(skr->skr_mode & SKR_MODE_UREADONLY);
	skr->skr_bufspec.kernel_writable = !(skr->skr_mode & SKR_MODE_KREADONLY);
	skr->skr_bufspec.purgeable = TRUE;
	skr->skr_bufspec.inhibitCache = !!(skr->skr_mode & SKR_MODE_NOCACHE);
	skr->skr_bufspec.physcontig = (skr->skr_mode & SKR_MODE_SEGPHYSCONTIG);
	skr->skr_bufspec.iodir_in = !!(skr->skr_mode & SKR_MODE_IODIR_IN);
	skr->skr_bufspec.iodir_out = !!(skr->skr_mode & SKR_MODE_IODIR_OUT);
	skr->skr_bufspec.puredata = !!(skr->skr_mode & SKR_MODE_PUREDATA);
	skr->skr_regspec.noRedirect = !!(skr->skr_mode & SKR_MODE_NOREDIRECT);

	/* allocate segment bitmaps */
	if (!(skr->skr_mode & SKR_MODE_PSEUDO)) {
		ASSERT(skr->skr_seg_max_cnt != 0);
		skr->skr_seg_bmap_len = BITMAP_LEN(skr->skr_seg_max_cnt);
		skr->skr_seg_bmap = sk_alloc_data(BITMAP_SIZE(skr->skr_seg_max_cnt),
		    Z_WAITOK | Z_NOFAIL, skmem_tag_segment_bmap);
		ASSERT(BITMAP_SIZE(skr->skr_seg_max_cnt) ==
		    (skr->skr_seg_bmap_len * sizeof(*skr->skr_seg_bmap)));

		/* mark all bitmaps as free (bit set) */
		bitmap_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt);
	}

	/*
	 * Populate the freelist by allocating all segments for the
	 * region, which will be mapped but not faulted-in, and then
	 * immediately insert each into the freelist.  That will in
	 * turn unmap the segment's memory object.
	 */
	SKR_LOCK(skr);
	if (skr->skr_mode & SKR_MODE_PSEUDO) {
		char zone_name[64];
		(void) snprintf(zone_name, sizeof(zone_name), "%s.reg.%s",
		    SKMEM_ZONE_PREFIX, name);
		skr->skr_zreg = zone_create(zone_name, skr->skr_c_obj_size,
		    ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
	} else {
		/* create a backing IOSKRegion object */
		if ((skr->skr_reg = IOSKRegionCreate(&skr->skr_regspec,
		    (IOSKSize)skr->skr_seg_size,
		    (IOSKCount)skr->skr_seg_max_cnt)) == NULL) {
			SK_ERR("\"%s\": [%u * %u] cflags 0x%b skr_reg failed",
			    skr->skr_name, (uint32_t)skr->skr_seg_size,
			    (uint32_t)skr->skr_seg_max_cnt, skr->skr_cflags,
			    SKMEM_REGION_CR_BITS);
			goto failed;
		}
	}

	ASSERT(skr->skr_seg_objs != 0);

	++skr->skr_refcnt;      /* for caller */
	SKR_UNLOCK(skr);

	SKMEM_REGION_LOCK();
	TAILQ_INSERT_TAIL(&skmem_region_head, skr, skr_link);
	SKMEM_REGION_UNLOCK();

	SK_DF(SK_VERB_MEM_REGION,
	    "  [TOTAL] seg (%u*%u) obj (%u*%u) cflags 0x%b",
	    (uint32_t)skr->skr_seg_size, (uint32_t)skr->skr_seg_max_cnt,
	    (uint32_t)skr->skr_c_obj_size, (uint32_t)skr->skr_c_obj_cnt,
	    skr->skr_cflags, SKMEM_REGION_CR_BITS);

	return skr;

failed:
	SKR_LOCK_ASSERT_HELD(skr);
	skmem_region_destroy(skr);

	return NULL;
}

/*
 * Destroy a region.
 */
static void
skmem_region_destroy(struct skmem_region *skr)
{
	struct skmem_region *mskr;

	SKR_LOCK_ASSERT_HELD(skr);

	SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx",
	    skr->skr_name, SK_KVA(skr));

	/*
	 * Panic if we detect there are unfreed segments; the caller
	 * destroying this region is responsible for ensuring that all
	 * allocated segments have been freed prior to getting here.
	 */
	ASSERT(skr->skr_refcnt == 0);
	if (skr->skr_seginuse != 0) {
		panic("%s: '%s' (%p) not empty (%u unfreed)",
		    __func__, skr->skr_name, (void *)skr, skr->skr_seginuse);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (skr->skr_link.tqe_next != NULL || skr->skr_link.tqe_prev != NULL) {
		SKR_UNLOCK(skr);
		SKMEM_REGION_LOCK();
		TAILQ_REMOVE(&skmem_region_head, skr, skr_link);
		SKMEM_REGION_UNLOCK();
		SKR_LOCK(skr);
		ASSERT(skr->skr_refcnt == 0);
	}

	/*
	 * Undo what's done earlier at region creation time.
	 */
	skmem_region_depopulate(skr);
	ASSERT(TAILQ_EMPTY(&skr->skr_seg_free));
	ASSERT(RB_EMPTY(&skr->skr_seg_tfree));
	ASSERT(skr->skr_seg_free_cnt == 0);

	if (skr->skr_reg != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
		IOSKRegionDestroy(skr->skr_reg);
		skr->skr_reg = NULL;
	}

	if (skr->skr_zreg != NULL) {
		ASSERT(skr->skr_mode & SKR_MODE_PSEUDO);
		zdestroy(skr->skr_zreg);
		skr->skr_zreg = NULL;
	}

	if (skr->skr_seg_bmap != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
#if (DEBUG || DEVELOPMENT)
		ASSERT(skr->skr_seg_bmap_len != 0);
		/* must have been set to vacant (bit set) by now */
		assert(bitmap_is_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt));
#endif /* DEBUG || DEVELOPMENT */

		sk_free_data(skr->skr_seg_bmap, BITMAP_SIZE(skr->skr_seg_max_cnt));
		skr->skr_seg_bmap = NULL;
		skr->skr_seg_bmap_len = 0;
	}
	ASSERT(skr->skr_seg_bmap_len == 0);

	if (skr->skr_hash_table != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
#if (DEBUG || DEVELOPMENT)
		for (uint32_t i = 0; i < (skr->skr_hash_mask + 1); i++) {
			ASSERT(TAILQ_EMPTY(&skr->skr_hash_table[i].sgb_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		sk_free_type_array(struct sksegment_bkt, skr->skr_hash_mask + 1,
		    skr->skr_hash_table);
		skr->skr_hash_table = NULL;
	}
	if ((mskr = skr->skr_mirror) != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
		skr->skr_mirror = NULL;
		mskr->skr_mode &= ~SKR_MODE_MIRRORED;
	}
	SKR_UNLOCK(skr);

	if (mskr != NULL) {
		skmem_region_release(mskr);
	}

	lck_mtx_destroy(&skr->skr_lock, &skmem_region_lock_grp);

	zfree(skr_zone, skr);
}

/*
 * Mirror mskr (slave) to skr (master).
 */
void
skmem_region_mirror(struct skmem_region *skr, struct skmem_region *mskr)
{
	SK_DF(SK_VERB_MEM_REGION, "skr master 0x%llx, slave 0x%llx ",
	    SK_KVA(skr), SK_KVA(mskr));

	SKR_LOCK(skr);
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	ASSERT(!(mskr->skr_mode & SKR_MODE_MIRRORED));
	ASSERT(skr->skr_mirror == NULL);

	/* both regions must share identical parameters */
	ASSERT(skr->skr_size == mskr->skr_size);
	ASSERT(skr->skr_seg_size == mskr->skr_seg_size);
	ASSERT(skr->skr_seg_free_cnt == mskr->skr_seg_free_cnt);

	skr->skr_mirror = mskr;
	skmem_region_retain(mskr);
	mskr->skr_mode |= SKR_MODE_MIRRORED;
	SKR_UNLOCK(skr);
}

void
skmem_region_slab_config(struct skmem_region *skr, struct skmem_cache *skm)
{
	SKR_LOCK(skr);
	if (skm != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_SLAB));
		skr->skr_mode |= SKR_MODE_SLAB;
		ASSERT(skr->skr_cache == NULL);
		skr->skr_cache = skm;
		skmem_region_retain_locked(skr);
		SKR_UNLOCK(skr);
	} else {
		ASSERT(skr->skr_mode & SKR_MODE_SLAB);
		skr->skr_mode &= ~SKR_MODE_SLAB;
		ASSERT(skr->skr_cache != NULL);
		skr->skr_cache = NULL;
		if (!skmem_region_release_locked(skr)) {
			SKR_UNLOCK(skr);
		}
	}
}

/*
 * Common routines for skmem_region_{alloc,mirror_alloc}.
 */
static void *
skmem_region_alloc_common(struct skmem_region *skr, struct sksegment *sg)
{
	struct sksegment_bkt *sgb;
	void *addr;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(sg->sg_md != NULL);
	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
	addr = (void *)sg->sg_start;
	sgb = SKMEM_REGION_HASH(skr, addr);
	ASSERT(sg->sg_link.tqe_next == NULL);
	ASSERT(sg->sg_link.tqe_prev == NULL);
	TAILQ_INSERT_HEAD(&sgb->sgb_head, sg, sg_link);

	skr->skr_seginuse++;
	skr->skr_meminuse += skr->skr_seg_size;
	if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) {
		skr->skr_w_meminuse += skr->skr_seg_size;
	}
	skr->skr_alloc++;

	return addr;
}

/*
 * Allocate a segment from the region.
 */
void *
skmem_region_alloc(struct skmem_region *skr, void **maddr,
    struct sksegment **retsg, struct sksegment **retsgm, uint32_t skmflag)
{
	struct sksegment *sg = NULL;
	struct sksegment *sg1 = NULL;
	void *addr = NULL, *addr1 = NULL;
	uint32_t retries = 0;

	VERIFY(!(skr->skr_mode & SKR_MODE_GUARD));

	if (retsg != NULL) {
		*retsg = NULL;
	}
	if (retsgm != NULL) {
		*retsgm = NULL;
	}

	/* SKMEM_NOSLEEP and SKMEM_FAILOK are mutually exclusive */
	VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) !=
	    (SKMEM_NOSLEEP | SKMEM_FAILOK));

	SKR_LOCK(skr);
	while (sg == NULL) {
		/* see if there's a segment in the freelist */
		sg = TAILQ_FIRST(&skr->skr_seg_free);
		if (sg == NULL) {
			/* see if we can grow the freelist */
			sg = sksegment_freelist_grow(skr);
			if (sg != NULL) {
				break;
			}

			if (skr->skr_mode & SKR_MODE_SLAB) {
				SKR_UNLOCK(skr);
				/*
				 * None found; it's possible that the slab
				 * layer is caching an extra amount, so ask
				 * skmem_cache to reap/purge its caches.
				 */
				skmem_cache_reap_now(skr->skr_cache, TRUE);
				SKR_LOCK(skr);
				/*
				 * If we manage to get some freed, try again.
				 */
				if (TAILQ_FIRST(&skr->skr_seg_free) != NULL) {
					continue;
				}
			}

			/*
			 * Give up if this is a non-blocking allocation,
			 * or if this is a blocking allocation but the
			 * caller is willing to handle failure.
			 */
			if (skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) {
				break;
			}

			/* otherwise we wait until one is available */
			++skr->skr_seg_waiters;
			(void) msleep(&skr->skr_seg_free, &skr->skr_lock,
			    (PZERO - 1), skr->skr_name, NULL);
		}
	}

	SKR_LOCK_ASSERT_HELD(skr);

	if (sg != NULL) {
retry:
		/*
		 * We have a segment; remove it from the freelist and
		 * insert it into the allocated-address hash chain.
		 * Note that this may return NULL if we can't allocate
		 * the memory descriptor.
		 */
		if (sksegment_freelist_remove(skr, sg, skmflag,
		    FALSE) == NULL) {
			ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
			ASSERT(sg->sg_md == NULL);
			ASSERT(sg->sg_start == 0 && sg->sg_end == 0);

			/*
			 * If it's a non-blocking allocation, simply give
			 * up and let the caller decide when to retry.  Else,
			 * it gets a bit complicated due to the contract we
			 * have for blocking allocations with the client; the
			 * most sensible thing to do here is to retry the
			 * allocation ourselves.  Note that we keep using the
			 * same segment we originally got, since we only need
			 * the memory descriptor to be allocated for it; thus
			 * we make sure we don't release the region lock when
			 * retrying allocation.  Doing so is crucial when the
			 * region is mirrored, since the segment indices on
			 * both regions need to match.
			 */
			if (skmflag & SKMEM_NOSLEEP) {
				SK_ERR("\"%s\": failed to allocate segment "
				    "(non-sleeping mode)", skr->skr_name);
				sg = NULL;
			} else {
				if (++retries > SKMEM_WDT_MAXTIME) {
					panic_plain("\"%s\": failed to "
					    "allocate segment (sleeping mode) "
					    "after %u retries\n\n%s",
					    skr->skr_name, SKMEM_WDT_MAXTIME,
					    skmem_dump(skr));
					/* NOTREACHED */
					__builtin_unreachable();
				} else {
					SK_ERR("\"%s\": failed to allocate "
					    "segment (sleeping mode): %u "
					    "retries", skr->skr_name, retries);
				}
				if (skr->skr_mode & SKR_MODE_SLAB) {
					/*
					 * We can't get any memory descriptor
					 * for this segment; reap extra cached
					 * objects from the slab layer and hope
					 * that we get lucky next time around.
					 *
					 * XXX [email protected]: perhaps also
					 * trigger the zone allocator to do
					 * its garbage collection here?
					 */
					skmem_cache_reap();
				}
				delay(1 * USEC_PER_SEC);        /* 1 sec */
				goto retry;
			}
		}

		if (sg != NULL) {
			/* insert to allocated-address hash chain */
			addr = skmem_region_alloc_common(skr, sg);
		}
	}

	if (sg == NULL) {
		VERIFY(skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK));
		if (skmflag & SKMEM_PANIC) {
			VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) ==
			    SKMEM_NOSLEEP);
			/*
			 * If this is a failed non-blocking alloc and the
			 * caller insists that it must be successful, then
			 * panic.
			 */
			panic_plain("\"%s\": skr 0x%p unable to satisfy "
			    "mandatory allocation\n", skr->skr_name, skr);
			/* NOTREACHED */
			__builtin_unreachable();
		} else {
			/*
			 * Give up if this is a non-blocking allocation,
			 * or one where the caller is willing to handle
			 * allocation failures.
			 */
			goto done;
		}
	}

	ASSERT((mach_vm_address_t)addr == sg->sg_start);

#if SK_LOG
	SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
	    SK_KVA(skr), SK_KVA(sg));
	if (skr->skr_mirror == NULL ||
	    !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) {
		SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx)",
		    sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
	} else {
		SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx) mirrored",
		    sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
	}
#endif /* SK_LOG */

	/*
	 * If mirroring, allocate the shadow object from the slave region.
	 */
	if (skr->skr_mirror != NULL) {
		ASSERT(skr->skr_mirror != skr);
		ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
		ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED);
		addr1 = skmem_region_mirror_alloc(skr->skr_mirror, sg, &sg1);
		ASSERT(addr1 != NULL);
		ASSERT(sg1 != NULL && sg1 != sg);
		ASSERT(sg1->sg_index == sg->sg_index);
	}

done:
	SKR_UNLOCK(skr);

	/* return segment metadata to caller if asked (reference not needed) */
	if (addr != NULL) {
		if (retsg != NULL) {
			*retsg = sg;
		}
		if (retsgm != NULL) {
			*retsgm = sg1;
		}
	}

	if (maddr != NULL) {
		*maddr = addr1;
	}

	return addr;
}

/*
 * Allocate a segment from a mirror region at the same index.  While it
 * is a somewhat simplified variant of skmem_region_alloc(), keeping it
 * separate allows us to avoid further convoluting that routine.
 */
static void *
skmem_region_mirror_alloc(struct skmem_region *skr, struct sksegment *sg0,
    struct sksegment **retsg)
{
	struct sksegment sg_key = { .sg_index = sg0->sg_index };
	struct sksegment *sg = NULL;
	void *addr = NULL;

	ASSERT(skr->skr_mode & SKR_MODE_MIRRORED);
	ASSERT(skr->skr_mirror == NULL);
	ASSERT(sg0->sg_type == SKSEG_TYPE_ALLOC);

	if (retsg != NULL) {
		*retsg = NULL;
	}

	SKR_LOCK(skr);

	/*
	 * See if we can find one in the freelist first.  Otherwise,
	 * create a new segment of the same index and add that to the
	 * freelist.  We will always get a segment since both regions
	 * are synchronized when it comes to the indices of allocated
	 * segments.
	 */
	sg = RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key);
	if (sg == NULL) {
		sg = sksegment_alloc_with_idx(skr, sg0->sg_index);
		VERIFY(sg != NULL);
	}
	VERIFY(sg->sg_index == sg0->sg_index);

	/*
	 * We have a segment; remove it from the freelist and insert
	 * it into the allocated-address hash chain.  This either
	 * succeeds or panics (SKMEM_PANIC) when a memory descriptor
	 * can't be allocated.
	 *
	 * TODO: consider retrying IOBMD allocation attempts if needed.
	 */
	sg = sksegment_freelist_remove(skr, sg, SKMEM_PANIC, FALSE);
	VERIFY(sg != NULL);

	/* insert to allocated-address hash chain */
	addr = skmem_region_alloc_common(skr, sg);

#if SK_LOG
	SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
	    SK_KVA(skr), SK_KVA(sg));
	SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx)",
	    sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
#endif /* SK_LOG */

	SKR_UNLOCK(skr);

	/* return segment metadata to caller if asked (reference not needed) */
	if (retsg != NULL) {
		*retsg = sg;
	}

	return addr;
}

/*
 * Free a segment to the region.
 */
void
skmem_region_free(struct skmem_region *skr, void *addr, void *maddr)
{
	struct sksegment_bkt *sgb;
	struct sksegment *sg, *tsg;

	VERIFY(!(skr->skr_mode & SKR_MODE_GUARD));

	/*
	 * Search the hash chain to find a matching segment for the
	 * given address.  If found, remove the segment from the
	 * hash chain and insert it into the freelist.  Otherwise,
	 * we panic since the caller has given us a bogus address.
	 */
	SKR_LOCK(skr);
	sgb = SKMEM_REGION_HASH(skr, addr);
	TAILQ_FOREACH_SAFE(sg, &sgb->sgb_head, sg_link, tsg) {
		ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
		if (sg->sg_start == (mach_vm_address_t)addr) {
			TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link);
			sg->sg_link.tqe_next = NULL;
			sg->sg_link.tqe_prev = NULL;
			break;
		}
	}

	ASSERT(sg != NULL);
	if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) {
		ASSERT(skr->skr_w_meminuse >= skr->skr_seg_size);
		skr->skr_w_meminuse -= skr->skr_seg_size;
	}
	sksegment_freelist_insert(skr, sg, FALSE);

	ASSERT(skr->skr_seginuse != 0);
	skr->skr_seginuse--;
	skr->skr_meminuse -= skr->skr_seg_size;
	skr->skr_free++;

#if SK_LOG
	SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
	    SK_KVA(skr), SK_KVA(sg));
	if (skr->skr_mirror == NULL ||
	    !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) {
		SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx)",
		    sg->sg_index, SK_KVA(addr),
		    SK_KVA((uintptr_t)addr + skr->skr_seg_size));
	} else {
		SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx) mirrored",
		    sg->sg_index, SK_KVA(addr),
		    SK_KVA((uintptr_t)addr + skr->skr_seg_size));
	}
#endif /* SK_LOG */

	/*
	 * If mirroring, also free the shadow object in the slave region.
	 */
	if (skr->skr_mirror != NULL) {
		ASSERT(maddr != NULL);
		ASSERT(skr->skr_mirror != skr);
		ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
		ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED);
		skmem_region_free(skr->skr_mirror, maddr, NULL);
	}

	/* wake up any blocked threads waiting for a segment */
	if (skr->skr_seg_waiters != 0) {
		SK_DF(SK_VERB_MEM_REGION,
		    "sg 0x%llx waking up %u waiters", SK_KVA(sg),
		    skr->skr_seg_waiters);
		skr->skr_seg_waiters = 0;
		wakeup(&skr->skr_seg_free);
	}
	SKR_UNLOCK(skr);
}

__attribute__((always_inline))
static inline void
skmem_region_retain_locked(struct skmem_region *skr)
{
	SKR_LOCK_ASSERT_HELD(skr);
	skr->skr_refcnt++;
	ASSERT(skr->skr_refcnt != 0);
}

/*
 * Retain a region.
 */
void
skmem_region_retain(struct skmem_region *skr)
{
	SKR_LOCK(skr);
	skmem_region_retain_locked(skr);
	SKR_UNLOCK(skr);
}

__attribute__((always_inline))
static inline boolean_t
skmem_region_release_locked(struct skmem_region *skr)
{
	SKR_LOCK_ASSERT_HELD(skr);
	ASSERT(skr->skr_refcnt != 0);
	if (--skr->skr_refcnt == 0) {
		skmem_region_destroy(skr);
		return TRUE;
	}
	return FALSE;
}

/*
 * Release (and potentially destroy) a region.
 */
boolean_t
skmem_region_release(struct skmem_region *skr)
{
	boolean_t lastref;

	SKR_LOCK(skr);
	if (!(lastref = skmem_region_release_locked(skr))) {
		SKR_UNLOCK(skr);
	}

	return lastref;
}

/*
 * Depopulate the segment freelist.
 */
static void
skmem_region_depopulate(struct skmem_region *skr)
{
	struct sksegment *sg, *tsg;

	SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ",
	    skr->skr_name, SK_KVA(skr));

	SKR_LOCK_ASSERT_HELD(skr);
	ASSERT(skr->skr_seg_bmap_len != 0 || (skr->skr_mode & SKR_MODE_PSEUDO));

	TAILQ_FOREACH_SAFE(sg, &skr->skr_seg_free, sg_link, tsg) {
		struct sksegment *sg0;
		uint32_t i;

		i = sg->sg_index;
		sg0 = sksegment_freelist_remove(skr, sg, 0, TRUE);
		VERIFY(sg0 == sg);

		sksegment_destroy(skr, sg);
		ASSERT(bit_test(skr->skr_seg_bmap[i / BMAPSZ], i % BMAPSZ));
	}
}

/*
 * Free tree segment compare routine.
 */
static int
sksegment_cmp(const struct sksegment *sg1, const struct sksegment *sg2)
{
	return sg1->sg_index - sg2->sg_index;
}
1495 
1496 /*
1497  * Create a segment.
1498  *
1499  * Upon success, clear the bit for the segment's index in skr_seg_bmap bitmap.
1500  */
1501 static struct sksegment *
sksegment_create(struct skmem_region * skr,uint32_t i)1502 sksegment_create(struct skmem_region *skr, uint32_t i)
1503 {
1504 	struct sksegment *sg = NULL;
1505 	bitmap_t *bmap;
1506 
1507 	SKR_LOCK_ASSERT_HELD(skr);
1508 
1509 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1510 	ASSERT(i < skr->skr_seg_max_cnt);
1511 	ASSERT(skr->skr_reg != NULL);
1512 	ASSERT(skr->skr_seg_size == round_page(skr->skr_seg_size));
1513 
1514 	bmap = &skr->skr_seg_bmap[i / BMAPSZ];
1515 	ASSERT(bit_test(*bmap, i % BMAPSZ));
1516 
1517 	sg = skmem_cache_alloc(skmem_sg_cache, SKMEM_SLEEP);
1518 	bzero(sg, sg_size);
1519 
1520 	sg->sg_region = skr;
1521 	sg->sg_index = i;
1522 	sg->sg_state = SKSEG_STATE_DETACHED;
1523 
1524 	/* claim it (clear bit) */
1525 	bit_clear(*bmap, i % BMAPSZ);
1526 
1527 	SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx) 0x%b", i,
1528 	    SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), skr->skr_mode,
1529 	    SKR_MODE_BITS);
1530 
1531 	return sg;
1532 }
1533 
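/*
 * The segment bitmap convention used by sksegment_create() above and
 * sksegment_destroy() below: segment index i lives in word i / BMAPSZ at
 * bit i % BMAPSZ, and a set bit means "vacant" while a clear bit means
 * "allocated". A standalone sketch of that two-level indexing, assuming
 * 64-bit bitmap words as in this file (the ex_ names are local to the
 * sketch):
 */
#include <stdint.h>

typedef uint64_t ex_bitmap_t;
#define EX_BMAPSZ	64

static inline int
seg_is_vacant(const ex_bitmap_t *bmap, uint32_t i)
{
	return (bmap[i / EX_BMAPSZ] >> (i % EX_BMAPSZ)) & 1;
}

static inline void
seg_claim(ex_bitmap_t *bmap, uint32_t i)
{
	/* clear the bit: the segment is now allocated */
	bmap[i / EX_BMAPSZ] &= ~(1ULL << (i % EX_BMAPSZ));
}

static inline void
seg_vacate(ex_bitmap_t *bmap, uint32_t i)
{
	/* set the bit: the segment is vacant again */
	bmap[i / EX_BMAPSZ] |= (1ULL << (i % EX_BMAPSZ));
}
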
1534 /*
1535  * Destroy a segment.
1536  *
1537  * Set the bit for the segment's index in skr_seg_bmap bitmap,
1538  * indicating that it is now vacant.
1539  */
1540 static void
1541 sksegment_destroy(struct skmem_region *skr, struct sksegment *sg)
1542 {
1543 	uint32_t i = sg->sg_index;
1544 	bitmap_t *bmap;
1545 
1546 	SKR_LOCK_ASSERT_HELD(skr);
1547 
1548 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1549 	ASSERT(skr == sg->sg_region);
1550 	ASSERT(skr->skr_reg != NULL);
1551 	ASSERT(sg->sg_type == SKSEG_TYPE_DESTROYED);
1552 	ASSERT(i < skr->skr_seg_max_cnt);
1553 
1554 	bmap = &skr->skr_seg_bmap[i / BMAPSZ];
1555 	ASSERT(!bit_test(*bmap, i % BMAPSZ));
1556 
1557 	SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx) 0x%b",
1558 	    i, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end),
1559 	    skr->skr_mode, SKR_MODE_BITS);
1560 
1561 	/*
1562 	 * Undo what was done earlier at segment creation time.
1563 	 */
1564 
1565 	ASSERT(sg->sg_md == NULL);
1566 	ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1567 	ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1568 
1569 	/* release it (set bit) */
1570 	bit_set(*bmap, i % BMAPSZ);
1571 
1572 	skmem_cache_free(skmem_sg_cache, sg);
1573 }
1574 
1575 /*
1576  * Insert a segment into freelist (freeing the segment).
1577  */
1578 static void
1579 sksegment_freelist_insert(struct skmem_region *skr, struct sksegment *sg,
1580     boolean_t populating)
1581 {
1582 	SKR_LOCK_ASSERT_HELD(skr);
1583 
1584 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1585 	ASSERT(sg->sg_type != SKSEG_TYPE_FREE);
1586 	ASSERT(skr == sg->sg_region);
1587 	ASSERT(skr->skr_reg != NULL);
1588 	ASSERT(sg->sg_index < skr->skr_seg_max_cnt);
1589 
1590 	/*
1591 	 * If the region is being populated, then we're done.
1592 	 */
1593 	if (__improbable(populating)) {
1594 		ASSERT(sg->sg_md == NULL);
1595 		ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1596 		ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1597 	} else {
1598 		IOSKMemoryBufferRef md;
1599 		IOReturn err;
1600 
1601 		ASSERT(sg->sg_md != NULL);
1602 		ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1603 
1604 		/*
1605 		 * Let the client remove the memory from the IOMMU and unwire it.
1606 		 */
1607 		if (skr->skr_seg_dtor != NULL) {
1608 			skr->skr_seg_dtor(sg, sg->sg_md, skr->skr_private);
1609 		}
1610 
1611 		ASSERT(sg->sg_state == SKSEG_STATE_MAPPED ||
1612 		    sg->sg_state == SKSEG_STATE_MAPPED_WIRED);
1613 
1614 		IOSKRegionClearBufferDebug(skr->skr_reg, sg->sg_index, &md);
1615 		VERIFY(sg->sg_md == md);
1616 
1617 		/* if persistent, unwire this memory now */
1618 		if (skr->skr_mode & SKR_MODE_PERSISTENT) {
1619 			err = IOSKMemoryUnwire(md);
1620 			if (err != kIOReturnSuccess) {
1621 				panic("Failed to unwire md %p, err %d", md, err);
1622 			}
1623 		}
1624 
1625 		/* mark memory as empty/discarded for consistency */
1626 		err = IOSKMemoryDiscard(md);
1627 		if (err != kIOReturnSuccess) {
1628 			panic("Failed to discard md %p, err %d", md, err);
1629 		}
1630 
1631 		IOSKMemoryDestroy(md);
1632 		sg->sg_md = NULL;
1633 		sg->sg_start = sg->sg_end = 0;
1634 		sg->sg_state = SKSEG_STATE_DETACHED;
1635 
1636 		ASSERT(skr->skr_memtotal >= skr->skr_seg_size);
1637 		skr->skr_memtotal -= skr->skr_seg_size;
1638 	}
1639 
1640 	sg->sg_type = SKSEG_TYPE_FREE;
1641 	ASSERT(sg->sg_link.tqe_next == NULL);
1642 	ASSERT(sg->sg_link.tqe_prev == NULL);
1643 	TAILQ_INSERT_TAIL(&skr->skr_seg_free, sg, sg_link);
1644 	ASSERT(sg->sg_node.rbe_left == NULL);
1645 	ASSERT(sg->sg_node.rbe_right == NULL);
1646 	ASSERT(sg->sg_node.rbe_parent == NULL);
1647 	RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
1648 	++skr->skr_seg_free_cnt;
1649 	ASSERT(skr->skr_seg_free_cnt <= skr->skr_seg_max_cnt);
1650 }
1651 
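/*
 * A free segment is kept on two containers at once: the tail queue
 * preserves insertion (FIFO) order for general reuse, while the
 * red-black tree keyed by segment index lets a free segment at a
 * specific index be found quickly (see the RB_FIND check in
 * sksegment_alloc_with_idx() below). A standalone sketch of keeping the
 * two in sync, using the same <sys/queue.h> and <sys/tree.h> macros
 * this file relies on:
 */
#include <sys/queue.h>
#include <sys/tree.h>
#include <stdint.h>

struct fseg {
	uint32_t		fs_index;
	TAILQ_ENTRY(fseg)	fs_link;	/* freelist linkage */
	RB_ENTRY(fseg)		fs_node;	/* index-tree linkage */
};

static int
fseg_cmp(const struct fseg *a, const struct fseg *b)
{
	return (a->fs_index > b->fs_index) - (a->fs_index < b->fs_index);
}

TAILQ_HEAD(fseg_list, fseg);
RB_HEAD(fseg_tree, fseg);
RB_PROTOTYPE(fseg_tree, fseg, fs_node, fseg_cmp);
RB_GENERATE(fseg_tree, fseg, fs_node, fseg_cmp);

/* freeing: append to the FIFO and index the segment, in lockstep */
static void
fseg_insert_free(struct fseg_list *fl, struct fseg_tree *ft, struct fseg *f)
{
	TAILQ_INSERT_TAIL(fl, f, fs_link);
	RB_INSERT(fseg_tree, ft, f);
}

/* allocating: take the oldest free segment and remove it from both */
static struct fseg *
fseg_remove_free(struct fseg_list *fl, struct fseg_tree *ft)
{
	struct fseg *f = TAILQ_FIRST(fl);

	if (f != NULL) {
		TAILQ_REMOVE(fl, f, fs_link);
		RB_REMOVE(fseg_tree, ft, f);
	}
	return f;
}
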
1652 /*
1653  * Remove a segment from the freelist (allocating the segment).
1654  */
1655 static struct sksegment *
1656 sksegment_freelist_remove(struct skmem_region *skr, struct sksegment *sg,
1657     uint32_t skmflag, boolean_t purging)
1658 {
1659 #pragma unused(skmflag)
1660 	mach_vm_address_t segstart;
1661 	IOReturn err;
1662 
1663 	SKR_LOCK_ASSERT_HELD(skr);
1664 
1665 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1666 	ASSERT(sg != NULL);
1667 	ASSERT(skr == sg->sg_region);
1668 	ASSERT(skr->skr_reg != NULL);
1669 	ASSERT(sg->sg_type == SKSEG_TYPE_FREE);
1670 	ASSERT(sg->sg_index < skr->skr_seg_max_cnt);
1671 
1672 #if (DEVELOPMENT || DEBUG)
1673 	uint64_t mtbf = skmem_region_get_mtbf();
1674 	/*
1675 	 * MTBF doesn't apply when SKMEM_PANIC is set, as the caller would assert.
1676 	 */
1677 	if (__improbable(mtbf != 0 && !purging &&
1678 	    (net_uptime_ms() % mtbf) == 0 &&
1679 	    !(skmflag & SKMEM_PANIC))) {
1680 		SK_ERR("skr \"%s\" 0x%llx sg 0x%llx MTBF failure",
1681 		    skr->skr_name, SK_KVA(skr), SK_KVA(sg));
1682 		net_update_uptime();
1683 		return NULL;
1684 	}
1685 #endif /* (DEVELOPMENT || DEBUG) */
1686 
1687 	TAILQ_REMOVE(&skr->skr_seg_free, sg, sg_link);
1688 	sg->sg_link.tqe_next = NULL;
1689 	sg->sg_link.tqe_prev = NULL;
1690 	RB_REMOVE(segtfreehead, &skr->skr_seg_tfree, sg);
1691 	sg->sg_node.rbe_left = NULL;
1692 	sg->sg_node.rbe_right = NULL;
1693 	sg->sg_node.rbe_parent = NULL;
1694 
1695 	ASSERT(skr->skr_seg_free_cnt != 0);
1696 	--skr->skr_seg_free_cnt;
1697 
1698 	/*
1699 	 * If the region is being depopulated, then we're done.
1700 	 */
1701 	if (__improbable(purging)) {
1702 		ASSERT(sg->sg_md == NULL);
1703 		ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1704 		ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1705 		sg->sg_type = SKSEG_TYPE_DESTROYED;
1706 		return sg;
1707 	}
1708 
1709 	ASSERT(sg->sg_md == NULL);
1710 	ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1711 	ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1712 
1713 	/* created as non-volatile (mapped) upon success */
1714 	if ((sg->sg_md = IOSKMemoryBufferCreate(skr->skr_seg_size,
1715 	    &skr->skr_bufspec, &segstart)) == NULL) {
1716 		ASSERT(sg->sg_type == SKSEG_TYPE_FREE);
1717 		if (skmflag & SKMEM_PANIC) {
1718 			/* if the caller insists on success, then panic */
1719 			panic_plain("\"%s\": skr 0x%p sg 0x%p (idx %u) unable "
1720 			    "to satisfy mandatory allocation\n", skr->skr_name,
1721 			    skr, sg, sg->sg_index);
1722 			/* NOTREACHED */
1723 			__builtin_unreachable();
1724 		}
1725 		/* reinsert this segment to freelist */
1726 		ASSERT(sg->sg_link.tqe_next == NULL);
1727 		ASSERT(sg->sg_link.tqe_prev == NULL);
1728 		TAILQ_INSERT_HEAD(&skr->skr_seg_free, sg, sg_link);
1729 		ASSERT(sg->sg_node.rbe_left == NULL);
1730 		ASSERT(sg->sg_node.rbe_right == NULL);
1731 		ASSERT(sg->sg_node.rbe_parent == NULL);
1732 		RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
1733 		++skr->skr_seg_free_cnt;
1734 		return NULL;
1735 	}
1736 
1737 	sg->sg_start = segstart;
1738 	sg->sg_end = (segstart + skr->skr_seg_size);
1739 	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1740 
1741 	/* mark memory as non-volatile just to be consistent */
1742 	err = IOSKMemoryReclaim(sg->sg_md);
1743 	if (err != kIOReturnSuccess) {
1744 		panic("Failed to reclaim md %p, err %d", sg->sg_md, err);
1745 	}
1746 
1747 	/* if persistent, wire down its memory now */
1748 	if (skr->skr_mode & SKR_MODE_PERSISTENT) {
1749 		err = IOSKMemoryWire(sg->sg_md);
1750 		if (err != kIOReturnSuccess) {
1751 			panic("Failed to wire md %p, err %d", sg->sg_md, err);
1752 		}
1753 	}
1754 
1755 	err = IOSKRegionSetBuffer(skr->skr_reg, sg->sg_index, sg->sg_md);
1756 	if (err != kIOReturnSuccess) {
1757 		panic("Failed to set md %p, err %d", sg->sg_md, err);
1758 	}
1759 
1760 	/*
1761 	 * Let the client wire it and insert it into the IOMMU, if applicable.
1762 	 * Then determine whether it is wired and set the state accordingly.
1763 	 */
1764 	if (skr->skr_seg_ctor != NULL) {
1765 		skr->skr_seg_ctor(sg, sg->sg_md, skr->skr_private);
1766 	}
1767 
1768 	sg->sg_state = IOSKBufferIsWired(sg->sg_md) ?
1769 	    SKSEG_STATE_MAPPED_WIRED : SKSEG_STATE_MAPPED;
1770 
1771 	skr->skr_memtotal += skr->skr_seg_size;
1772 
1773 	ASSERT(sg->sg_md != NULL);
1774 	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1775 
1776 	sg->sg_type = SKSEG_TYPE_ALLOC;
1777 	return sg;
1778 }
1779 
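/*
 * The skr_seg_ctor/skr_seg_dtor hooks used above bracket a segment's
 * usable life: the constructor runs after the buffer has been created,
 * reclaimed, optionally wired, and set on the IOSKRegion, and is the
 * client's chance to wire the memory and insert it into the IOMMU; the
 * destructor, called from sksegment_freelist_insert(), undoes that work
 * before the buffer is discarded. A standalone sketch of the callback
 * contract, with generic types standing in for the kernel ones:
 */
#include <stddef.h>

struct seg;				/* opaque segment handle */
typedef void (*seg_hook_t)(struct seg *sg, void *buf, void *priv);

struct seg_hooks {
	seg_hook_t	sh_ctor;	/* after buffer creation: map/wire */
	seg_hook_t	sh_dtor;	/* before buffer teardown: unmap */
	void		*sh_priv;	/* client cookie, like skr_private */
};

static void
seg_construct(const struct seg_hooks *h, struct seg *sg, void *buf)
{
	if (h->sh_ctor != NULL) {
		h->sh_ctor(sg, buf, h->sh_priv);
	}
}

static void
seg_destruct(const struct seg_hooks *h, struct seg *sg, void *buf)
{
	if (h->sh_dtor != NULL) {
		h->sh_dtor(sg, buf, h->sh_priv);
	}
}
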
1780 /*
1781  * Find the first available index and allocate a segment at that index.
1782  */
1783 static struct sksegment *
1784 sksegment_freelist_grow(struct skmem_region *skr)
1785 {
1786 	struct sksegment *sg = NULL;
1787 	uint32_t i, j, idx;
1788 
1789 	SKR_LOCK_ASSERT_HELD(skr);
1790 
1791 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1792 	ASSERT(skr->skr_seg_bmap_len != 0);
1793 	ASSERT(skr->skr_seg_max_cnt != 0);
1794 
1795 	for (i = 0; i < skr->skr_seg_bmap_len; i++) {
1796 		bitmap_t *bmap, mask;
1797 		uint32_t end = (BMAPSZ - 1);
1798 
1799 		if (i == (skr->skr_seg_bmap_len - 1)) {
1800 			end = (skr->skr_seg_max_cnt - 1) % BMAPSZ;
1801 		}
1802 
1803 		bmap = &skr->skr_seg_bmap[i];
1804 		mask = BMASK64(0, end);
1805 
1806 		j = ffsll((*bmap) & mask);
1807 		if (j == 0) {
1808 			continue;
1809 		}
1810 
1811 		--j;
1812 		idx = (i * BMAPSZ) + j;
1813 
1814 		sg = sksegment_alloc_with_idx(skr, idx);
1815 
1816 		/* we're done */
1817 		break;
1818 	}
1819 
1820 	ASSERT((sg != NULL) || (skr->skr_seginuse == skr->skr_seg_max_cnt));
1821 	return sg;
1822 }
1823 
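/*
 * A standalone sketch of the first-vacant-index scan above: mask off the
 * bits beyond the last valid segment index in the final word, then use
 * ffsll() (1-based find-first-set, returning 0 when no bit is set) to
 * locate the lowest vacant index. EX_BMASK64(lo, hi) is assumed here to
 * build a mask with bits lo through hi set, mirroring the kernel's
 * BMASK64():
 */
#include <stdint.h>
#include <strings.h>		/* ffsll() */

#define EX_BMASK64(lo, hi) \
	((~0ULL >> (63 - (hi))) & (~0ULL << (lo)))

/* returns the lowest vacant index, or -1 if all valid bits are clear */
static int
first_vacant_index(const uint64_t *bmap, uint32_t nwords, uint32_t max_cnt)
{
	for (uint32_t i = 0; i < nwords; i++) {
		uint32_t end = 63;
		int j;

		if (i == (nwords - 1)) {
			end = (max_cnt - 1) % 64;
		}

		j = ffsll((long long)(bmap[i] & EX_BMASK64(0, end)));
		if (j != 0) {
			return (int)(i * 64 + (uint32_t)(j - 1));
		}
	}
	return -1;
}
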
1824 /*
1825  * Create a single segment at a specific index and add it to the freelist.
1826  */
1827 static struct sksegment *
1828 sksegment_alloc_with_idx(struct skmem_region *skr, uint32_t idx)
1829 {
1830 	struct sksegment *sg;
1831 
1832 	SKR_LOCK_ASSERT_HELD(skr);
1833 
1834 	if (!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ)) {
1835 		panic("%s: '%s' (%p) idx %u (out of %u) is already allocated",
1836 		    __func__, skr->skr_name, (void *)skr, idx,
1837 		    (skr->skr_seg_max_cnt - 1));
1838 		/* NOTREACHED */
1839 		__builtin_unreachable();
1840 	}
1841 
1842 	/* must not fail, blocking alloc */
1843 	sg = sksegment_create(skr, idx);
1844 	VERIFY(sg != NULL);
1845 	VERIFY(!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ));
1846 
1847 	/* populate the freelist */
1848 	sksegment_freelist_insert(skr, sg, TRUE);
1849 	ASSERT(sg == TAILQ_LAST(&skr->skr_seg_free, segfreehead));
1850 #if (DEVELOPMENT || DEBUG)
1851 	struct sksegment sg_key = { .sg_index = sg->sg_index };
1852 	ASSERT(sg == RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key));
1853 #endif /* (DEVELOPMENT || DEBUG) */
1854 
1855 	SK_DF(SK_VERB_MEM_REGION, "sg %u/%u", (idx + 1), skr->skr_seg_max_cnt);
1856 
1857 	return sg;
1858 }
1859 
1860 /*
1861  * Rescale the region's allocated-address hash table.
1862  */
1863 static void
1864 skmem_region_hash_rescale(struct skmem_region *skr)
1865 {
1866 	struct sksegment_bkt *old_table, *new_table;
1867 	size_t old_size, new_size;
1868 	uint32_t i, moved = 0;
1869 
1870 	if (skr->skr_mode & SKR_MODE_PSEUDO) {
1871 		ASSERT(skr->skr_hash_table == NULL);
1872 		/* this is a no-op for pseudo regions */
1873 		return;
1874 	}
1875 
1876 	ASSERT(skr->skr_hash_table != NULL);
1877 	/* insist that we are executing in the update thread call context */
1878 	ASSERT(sk_is_region_update_protected());
1879 
1880 	/*
1881 	 * To keep the average lookup time small (lookup depth near 1.0),
1882 	 * the hash table size should be roughly the same as (though not
1883 	 * necessarily equal to) the number of segments in use.
1884 	 */
1885 	new_size = MAX(skr->skr_hash_initial,
1886 	    (1 << (flsll(3 * skr->skr_seginuse + 4) - 2)));
1887 	new_size = MIN(skr->skr_hash_limit, new_size);
1888 	old_size = (skr->skr_hash_mask + 1);
1889 
1890 	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
1891 		return;
1892 	}
1893 
1894 	new_table = sk_alloc_type_array(struct sksegment_bkt, new_size,
1895 	    Z_NOWAIT, skmem_tag_segment_hash);
1896 	if (__improbable(new_table == NULL)) {
1897 		return;
1898 	}
1899 
1900 	for (i = 0; i < new_size; i++) {
1901 		TAILQ_INIT(&new_table[i].sgb_head);
1902 	}
1903 
1904 	SKR_LOCK(skr);
1905 
1906 	old_size = (skr->skr_hash_mask + 1);
1907 	old_table = skr->skr_hash_table;
1908 
1909 	skr->skr_hash_mask = (uint32_t)(new_size - 1);
1910 	skr->skr_hash_table = new_table;
1911 	skr->skr_rescale++;
1912 
1913 	for (i = 0; i < old_size; i++) {
1914 		struct sksegment_bkt *sgb = &old_table[i];
1915 		struct sksegment_bkt *new_sgb;
1916 		struct sksegment *sg;
1917 
1918 		while ((sg = TAILQ_FIRST(&sgb->sgb_head)) != NULL) {
1919 			TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link);
1920 			ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1921 			new_sgb = SKMEM_REGION_HASH(skr, sg->sg_start);
1922 			TAILQ_INSERT_TAIL(&new_sgb->sgb_head, sg, sg_link);
1923 			++moved;
1924 		}
1925 		ASSERT(TAILQ_EMPTY(&sgb->sgb_head));
1926 	}
1927 
1928 	SK_DF(SK_VERB_MEM_REGION,
1929 	    "skr 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skr),
1930 	    (uint32_t)old_size, (uint32_t)new_size, moved);
1931 
1932 	SKR_UNLOCK(skr);
1933 
1934 	sk_free_type_array(struct sksegment_bkt, old_size, old_table);
1935 }
1936 
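/*
 * The sizing rule above, worked out: the target is the largest power of
 * two not exceeding (3 * seginuse + 4) / 2, clamped to the region's
 * [initial, limit] bounds, and a rescale is skipped while the target
 * stays within the half-to-double hysteresis band around the current
 * size. For example, seginuse = 100 gives 3 * 100 + 4 = 304,
 * flsll(304) = 9, hence 1 << (9 - 2) = 128 buckets. A standalone sketch:
 */
#include <stddef.h>
#include <strings.h>		/* flsll() */

static size_t
hash_target_size(size_t seginuse, size_t initial, size_t limit)
{
	size_t sz = (size_t)1 << (flsll((long long)(3 * seginuse + 4)) - 2);

	if (sz < initial) {
		sz = initial;
	}
	if (sz > limit) {
		sz = limit;
	}
	return sz;
}

static int
hash_should_rescale(size_t old_size, size_t new_size)
{
	/* rescale only when outside [old_size / 2, old_size * 2] */
	return !((old_size >> 1) <= new_size && new_size <= (old_size << 1));
}
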
1937 /*
1938  * Apply a function to operate on all regions.
1939  */
1940 static void
1941 skmem_region_applyall(void (*func)(struct skmem_region *))
1942 {
1943 	struct skmem_region *skr;
1944 
1945 	net_update_uptime();
1946 
1947 	SKMEM_REGION_LOCK();
1948 	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
1949 		func(skr);
1950 	}
1951 	SKMEM_REGION_UNLOCK();
1952 }
1953 
1954 static void
1955 skmem_region_update(struct skmem_region *skr)
1956 {
1957 	SKMEM_REGION_LOCK_ASSERT_HELD();
1958 
1959 	/* insist that we are executing in the update thread call context */
1960 	ASSERT(sk_is_region_update_protected());
1961 
1962 	SKR_LOCK(skr);
1963 	/*
1964 	 * If there are threads blocked waiting for an available
1965 	 * segment, wake them up periodically so they can issue
1966 	 * another skmem_cache_reap() to reclaim resources cached
1967 	 * by skmem_cache.
1968 	 */
1969 	if (skr->skr_seg_waiters != 0) {
1970 		SK_DF(SK_VERB_MEM_REGION,
1971 		    "waking up %u waiters to reclaim", skr->skr_seg_waiters);
1972 		skr->skr_seg_waiters = 0;
1973 		wakeup(&skr->skr_seg_free);
1974 	}
1975 	SKR_UNLOCK(skr);
1976 
1977 	/*
1978 	 * Rescale the hash table if needed.
1979 	 */
1980 	skmem_region_hash_rescale(skr);
1981 }
1982 
1983 /*
1984  * Thread call callback for update.
1985  */
1986 static void
1987 skmem_region_update_func(thread_call_param_t dummy, thread_call_param_t arg)
1988 {
1989 #pragma unused(dummy, arg)
1990 	sk_protect_t protect;
1991 
1992 	protect = sk_region_update_protect();
1993 	skmem_region_applyall(skmem_region_update);
1994 	sk_region_update_unprotect(protect);
1995 
1996 	skmem_dispatch(skmem_region_update_tc, NULL,
1997 	    (skmem_region_update_interval * NSEC_PER_SEC));
1998 }
1999 
2000 boolean_t
2001 skmem_region_for_pp(skmem_region_id_t id)
2002 {
2003 	int i;
2004 
2005 	for (i = 0; i < SKMEM_PP_REGIONS; i++) {
2006 		if (id == skmem_pp_region_ids[i]) {
2007 			return TRUE;
2008 		}
2009 	}
2010 	return FALSE;
2011 }
2012 
2013 void
2014 skmem_region_get_stats(struct skmem_region *skr, struct sk_stats_region *sreg)
2015 {
2016 	bzero(sreg, sizeof(*sreg));
2017 
2018 	(void) snprintf(sreg->sreg_name, sizeof(sreg->sreg_name),
2019 	    "%s", skr->skr_name);
2020 	uuid_copy(sreg->sreg_uuid, skr->skr_uuid);
2021 	sreg->sreg_id = (sk_stats_region_id_t)skr->skr_id;
2022 	sreg->sreg_mode = skr->skr_mode;
2023 
2024 	sreg->sreg_r_seg_size = skr->skr_params.srp_r_seg_size;
2025 	sreg->sreg_c_seg_size = skr->skr_seg_size;
2026 	sreg->sreg_seg_cnt = skr->skr_seg_max_cnt;
2027 	sreg->sreg_seg_objs = skr->skr_seg_objs;
2028 	sreg->sreg_r_obj_size = skr->skr_r_obj_size;
2029 	sreg->sreg_r_obj_cnt = skr->skr_r_obj_cnt;
2030 	sreg->sreg_c_obj_size = skr->skr_c_obj_size;
2031 	sreg->sreg_c_obj_cnt = skr->skr_c_obj_cnt;
2032 	sreg->sreg_align = skr->skr_align;
2033 	sreg->sreg_max_frags = skr->skr_max_frags;
2034 
2035 	sreg->sreg_meminuse = skr->skr_meminuse;
2036 	sreg->sreg_w_meminuse = skr->skr_w_meminuse;
2037 	sreg->sreg_memtotal = skr->skr_memtotal;
2038 	sreg->sreg_seginuse = skr->skr_seginuse;
2039 	sreg->sreg_rescale = skr->skr_rescale;
2040 	sreg->sreg_hash_size = (skr->skr_hash_mask + 1);
2041 	sreg->sreg_alloc = skr->skr_alloc;
2042 	sreg->sreg_free = skr->skr_free;
2043 }
2044 
2045 static size_t
2046 skmem_region_mib_get_stats(struct skmem_region *skr, void *out, size_t len)
2047 {
2048 	size_t actual_space = sizeof(struct sk_stats_region);
2049 	struct sk_stats_region *sreg = out;
2050 
2051 	if (out == NULL || len < actual_space) {
2052 		goto done;
2053 	}
2054 
2055 	skmem_region_get_stats(skr, sreg);
2056 
2057 done:
2058 	return actual_space;
2059 }
2060 
2061 static int
2062 skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS
2063 {
2064 #pragma unused(arg1, arg2, oidp)
2065 	struct skmem_region *skr;
2066 	size_t actual_space;
2067 	size_t buffer_space;
2068 	size_t allocated_space;
2069 	caddr_t buffer = NULL;
2070 	caddr_t scan;
2071 	int error = 0;
2072 
2073 	if (!kauth_cred_issuser(kauth_cred_get())) {
2074 		return EPERM;
2075 	}
2076 
2077 	net_update_uptime();
2078 	buffer_space = req->oldlen;
2079 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
2080 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
2081 			buffer_space = SK_SYSCTL_ALLOC_MAX;
2082 		}
2083 		allocated_space = buffer_space;
2084 		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_region_mib);
2085 		if (__improbable(buffer == NULL)) {
2086 			return ENOBUFS;
2087 		}
2088 	} else if (req->oldptr == USER_ADDR_NULL) {
2089 		buffer_space = 0;
2090 	}
2091 	actual_space = 0;
2092 	scan = buffer;
2093 
2094 	SKMEM_REGION_LOCK();
2095 	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
2096 		size_t size = skmem_region_mib_get_stats(skr, scan, buffer_space);
2097 		if (scan != NULL) {
2098 			if (buffer_space < size) {
2099 				/* supplied buffer too small, stop copying */
2100 				error = ENOMEM;
2101 				break;
2102 			}
2103 			scan += size;
2104 			buffer_space -= size;
2105 		}
2106 		actual_space += size;
2107 	}
2108 	SKMEM_REGION_UNLOCK();
2109 
2110 	if (actual_space != 0) {
2111 		int out_error = SYSCTL_OUT(req, buffer, actual_space);
2112 		if (out_error != 0) {
2113 			error = out_error;
2114 		}
2115 	}
2116 	if (buffer != NULL) {
2117 		sk_free_data(buffer, allocated_space);
2118 	}
2119 
2120 	return error;
2121 }
2122 
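/*
 * From userspace, the handler above is consumed with the usual two-pass
 * sysctl protocol: probe with a NULL buffer to learn the required size,
 * then fetch into an allocated buffer. A hedged sketch; the node name
 * "kern.skywalk.stats.region" is an assumption here, as the SYSCTL
 * declaration lives elsewhere:
 */
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

static int
dump_region_stats(void)
{
	size_t len = 0;
	void *buf;

	/* pass 1: NULL oldp asks the handler for the total size */
	if (sysctlbyname("kern.skywalk.stats.region", NULL, &len,
	    NULL, 0) == -1 || len == 0) {
		return -1;
	}

	if ((buf = malloc(len)) == NULL) {
		return -1;
	}

	/* pass 2: copy out the records; len returns the bytes written */
	if (sysctlbyname("kern.skywalk.stats.region", buf, &len,
	    NULL, 0) == -1) {
		free(buf);
		return -1;
	}

	printf("%zu bytes of sk_stats_region records\n", len);
	free(buf);
	return 0;
}
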
2123 #if SK_LOG
2124 const char *
2125 skmem_region_id2name(skmem_region_id_t id)
2126 {
2127 	const char *name;
2128 	switch (id) {
2129 	case SKMEM_REGION_SCHEMA:
2130 		name = "SCHEMA";
2131 		break;
2132 
2133 	case SKMEM_REGION_RING:
2134 		name = "RING";
2135 		break;
2136 
2137 	case SKMEM_REGION_BUF:
2138 		name = "BUF";
2139 		break;
2140 
2141 	case SKMEM_REGION_RXBUF:
2142 		name = "RXBUF";
2143 		break;
2144 
2145 	case SKMEM_REGION_TXBUF:
2146 		name = "TXBUF";
2147 		break;
2148 
2149 	case SKMEM_REGION_UMD:
2150 		name = "UMD";
2151 		break;
2152 
2153 	case SKMEM_REGION_TXAUSD:
2154 		name = "TXAUSD";
2155 		break;
2156 
2157 	case SKMEM_REGION_RXFUSD:
2158 		name = "RXFUSD";
2159 		break;
2160 
2161 	case SKMEM_REGION_USTATS:
2162 		name = "USTATS";
2163 		break;
2164 
2165 	case SKMEM_REGION_FLOWADV:
2166 		name = "FLOWADV";
2167 		break;
2168 
2169 	case SKMEM_REGION_NEXUSADV:
2170 		name = "NEXUSADV";
2171 		break;
2172 
2173 	case SKMEM_REGION_SYSCTLS:
2174 		name = "SYSCTLS";
2175 		break;
2176 
2177 	case SKMEM_REGION_GUARD_HEAD:
2178 		name = "HEADGUARD";
2179 		break;
2180 
2181 	case SKMEM_REGION_GUARD_TAIL:
2182 		name = "TAILGUARD";
2183 		break;
2184 
2185 	case SKMEM_REGION_KMD:
2186 		name = "KMD";
2187 		break;
2188 
2189 	case SKMEM_REGION_RXKMD:
2190 		name = "RXKMD";
2191 		break;
2192 
2193 	case SKMEM_REGION_TXKMD:
2194 		name = "TXKMD";
2195 		break;
2196 
2197 	case SKMEM_REGION_TXAKSD:
2198 		name = "TXAKSD";
2199 		break;
2200 
2201 	case SKMEM_REGION_RXFKSD:
2202 		name = "RXFKSD";
2203 		break;
2204 
2205 	case SKMEM_REGION_KSTATS:
2206 		name = "KSTATS";
2207 		break;
2208 
2209 	case SKMEM_REGION_KBFT:
2210 		name = "KBFT";
2211 		break;
2212 
2213 	case SKMEM_REGION_UBFT:
2214 		name = "UBFT";
2215 		break;
2216 
2217 	case SKMEM_REGION_RXKBFT:
2218 		name = "RXKBFT";
2219 		break;
2220 
2221 	case SKMEM_REGION_TXKBFT:
2222 		name = "TXKBFT";
2223 		break;
2224 
2225 	case SKMEM_REGION_INTRINSIC:
2226 		name = "INTRINSIC";
2227 		break;
2228 
2229 	default:
2230 		name = "UNKNOWN";
2231 		break;
2232 	}
2233 
2234 	return name;
2235 }
2236 #endif /* SK_LOG */
2237 
2238 #if (DEVELOPMENT || DEBUG)
2239 uint64_t
2240 skmem_region_get_mtbf(void)
2241 {
2242 	return skmem_region_mtbf;
2243 }
2244 
2245 void
2246 skmem_region_set_mtbf(uint64_t newval)
2247 {
2248 	if (newval < SKMEM_REGION_MTBF_MIN) {
2249 		if (newval != 0) {
2250 			newval = SKMEM_REGION_MTBF_MIN;
2251 		}
2252 	} else if (newval > SKMEM_REGION_MTBF_MAX) {
2253 		newval = SKMEM_REGION_MTBF_MAX;
2254 	}
2255 
2256 	if (skmem_region_mtbf != newval) {
2257 		atomic_set_64(&skmem_region_mtbf, newval);
2258 		SK_ERR("MTBF set to %llu msec", skmem_region_mtbf);
2259 	}
2260 }
2261 
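/*
 * MTBF here is a fault-injection knob (DEVELOPMENT/DEBUG only): with a
 * clamped value of N msec, any segment allocation whose uptime lands on
 * a multiple of N fails artificially, exercising callers' error paths
 * (see the check in sksegment_freelist_remove() above). A standalone
 * sketch of the same gate, assuming a monotonic millisecond clock:
 */
#include <stdint.h>
#include <time.h>

static uint64_t
uptime_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000) +
	    ((uint64_t)ts.tv_nsec / 1000000);
}

/* returns nonzero when an artificial failure should be injected */
static int
mtbf_should_fail(uint64_t mtbf_msec)
{
	return mtbf_msec != 0 && (uptime_ms() % mtbf_msec) == 0;
}
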
2262 static int
2263 skmem_region_mtbf_sysctl(struct sysctl_oid *oidp, void *arg1, int arg2,
2264     struct sysctl_req *req)
2265 {
2266 #pragma unused(oidp, arg1, arg2)
2267 	int changed, error;
2268 	uint64_t newval;
2269 
2270 	_CASSERT(sizeof(skmem_region_mtbf) == sizeof(uint64_t));
2271 	if ((error = sysctl_io_number(req, skmem_region_mtbf,
2272 	    sizeof(uint64_t), &newval, &changed)) == 0) {
2273 		if (changed) {
2274 			skmem_region_set_mtbf(newval);
2275 		}
2276 	}
2277 	return error;
2278 }
2279 #endif /* (DEVELOPMENT || DEBUG) */
2280