/*
 * Copyright (c) 2016-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/* BEGIN CSTYLED */
/*
 * A region represents a collection of one or more similarly-sized memory
 * segments, each of which is a contiguous range of integers.  A segment
 * is either allocated or free, and is treated as disjoint from all other
 * segments.  That is, the contiguity applies only at the segment level,
 * and a region with multiple segments is not contiguous at the region level.
 * A segment always belongs to either the segment freelist or the
 * allocated-address hash chain, as described below.
 *
 * The optional SKMEM_REGION_CR_NOREDIRECT flag indicates that the region
 * stays intact even after a defunct.  Otherwise, the segments belonging
 * to the region will be freed at defunct time, and the span covered by
 * the region will be redirected to zero-filled anonymous memory.
 *
 * Memory for a region is always created as pageable and purgeable.  It is
 * the client's responsibility to prepare (wire) it, and optionally insert
 * it into the IOMMU, at segment construction time.  When the segment is
 * freed, the client is responsible for removing it from the IOMMU (if
 * needed) and completing (unwiring) it.
 *
 * When the region is created with SKMEM_REGION_CR_PERSISTENT, the memory
 * is immediately wired upon allocation (segment removed from freelist).
 * It gets unwired when memory is discarded (segment inserted to freelist).
 *
 * The chronological life cycle of a segment is as such:
 *
 *    SKSEG_STATE_DETACHED
 *        SKSEG_STATE_{MAPPED,MAPPED_WIRED}
 *            [segment allocated, useable by client]
 *              ...
 *            [client frees segment]
 *        SKSEG_STATE_{MAPPED,MAPPED_WIRED}
 *      [reclaim]
 *    SKSEG_STATE_DETACHED
 *
 * The region can also be marked as user-mappable (SKMEM_REGION_CR_MMAPOK);
 * this allows it to be further marked with SKMEM_REGION_CR_UREADONLY to
 * prevent modifications by the user task.  Only user-mappable regions will
 * be considered for inclusion during skmem_arena_mmap().
 *
 * Every skmem allocator has a region as its slab supplier.  Each slab is
 * exactly a segment.  The allocator uses skmem_region_{alloc,free}() to
 * create and destroy slabs.
 *
 * A region may be mirrored by another region; the latter acts as the master
 * controller for both regions.  Mirrored (slave) regions cannot be used
 * directly by the skmem allocator.  The region mirroring technique is used
 * for managing shadow objects {umd,kmd} and {usd,ksd}, where an object in
 * one region has the same size and lifetime as its shadow counterpart.
 *
 * CREATION/DESTRUCTION:
 *
 *   At creation time, all segments are allocated and are immediately inserted
 *   into the freelist.  Allocating a purgeable segment has very little cost,
 *   as it is not backed by physical memory until it is accessed.  Immediately
 *   inserting it into the freelist then also tears down its mapping.
 *
 *   At destruction time, the freelist is emptied, and each segment is then
 *   destroyed.  The system will assert if it detects there are outstanding
 *   segments not yet returned to the region (not freed by the client).
 *
 * ALLOCATION:
 *
 *   Allocating involves searching the freelist for a segment; if found, the
 *   segment is removed from the freelist and is inserted into the allocated-
 *   address hash chain.  The address of the memory object represented by
 *   the segment is used as the hash key.  The allocated-address hash chain
 *   is needed because we return the address of the memory object, not the
 *   segment itself, to the client.
 *
 * DEALLOCATION:
 *
 *   Freeing a memory object causes the chain to be searched for a matching
 *   segment.  The system will assert if a segment cannot be found, since
 *   that indicates that the memory object address is invalid.  Once found,
 *   the segment is removed from the allocated-address hash chain, and is
 *   inserted into the freelist.
 *
 * Segment allocation and deallocation can be expensive.  Because of this,
 * we expect that most clients will utilize the skmem_cache slab allocator
 * as the frontend instead.
 */
/* END CSTYLED */
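
/*
 * Illustrative sketch (comment only, not compiled): how a hypothetical
 * client might drive this API directly.  The parameter values below are
 * made up, the segment ctor/dtor are elided (NULL), and real callers
 * would normally start from a template in the default skmem_regions[]
 * array and front the region with skmem_cache instead:
 *
 *	struct skmem_region_params srp;		// preinitialized template
 *	struct skmem_region *skr;
 *	struct sksegment *sg = NULL;
 *	void *obj;
 *
 *	srp.srp_r_obj_size = 2048;		// requested object size
 *	srp.srp_r_obj_cnt = 1024;		// requested object count
 *	skmem_region_params_config(&srp);	// compute effective values
 *
 *	skr = skmem_region_create("example", &srp, NULL, NULL, NULL);
 *	obj = skmem_region_alloc(skr, NULL, &sg, NULL, SKMEM_SLEEP);
 *	// ... use [obj, obj + srp.srp_c_seg_size), i.e. one segment ...
 *	skmem_region_free(skr, obj, NULL);
 *	(void) skmem_region_release(skr);	// drop the creator's reference
 */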

#include <skywalk/os_skywalk_private.h>
#define _FN_KPRINTF             /* don't redefine kprintf() */
#include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */

static void skmem_region_destroy(struct skmem_region *skr);
static void skmem_region_depopulate(struct skmem_region *);
static int sksegment_cmp(const struct sksegment *, const struct sksegment *);
static struct sksegment *sksegment_create(struct skmem_region *, uint32_t);
static void sksegment_destroy(struct skmem_region *, struct sksegment *);
static void sksegment_freelist_insert(struct skmem_region *,
    struct sksegment *, boolean_t);
static struct sksegment *sksegment_freelist_remove(struct skmem_region *,
    struct sksegment *, uint32_t, boolean_t);
static struct sksegment *sksegment_freelist_grow(struct skmem_region *);
static struct sksegment *sksegment_alloc_with_idx(struct skmem_region *,
    uint32_t);
static void *skmem_region_alloc_common(struct skmem_region *,
    struct sksegment *);
static void *skmem_region_mirror_alloc(struct skmem_region *,
    struct sksegment *, struct sksegment **);
static void skmem_region_applyall(void (*)(struct skmem_region *));
static void skmem_region_update(struct skmem_region *);
static void skmem_region_update_func(thread_call_param_t, thread_call_param_t);
static inline void skmem_region_retain_locked(struct skmem_region *);
static inline boolean_t skmem_region_release_locked(struct skmem_region *);
static int skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS;

RB_PROTOTYPE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp);
RB_GENERATE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp);

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, region,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, skmem_region_mib_get_sysctl, "S,sk_stats_region",
    "Skywalk region statistics");

static LCK_ATTR_DECLARE(skmem_region_lock_attr, 0, 0);
static LCK_GRP_DECLARE(skmem_region_lock_grp, "skmem_region");
static LCK_MTX_DECLARE_ATTR(skmem_region_lock, &skmem_region_lock_grp,
    &skmem_region_lock_attr);

/* protected by skmem_region_lock */
static TAILQ_HEAD(, skmem_region) skmem_region_head;

static thread_call_t skmem_region_update_tc;

#define SKMEM_REGION_UPDATE_INTERVAL    13      /* 13 seconds */
static uint32_t skmem_region_update_interval = SKMEM_REGION_UPDATE_INTERVAL;

#define SKMEM_WDT_MAXTIME               30      /* # of secs before watchdog */
#define SKMEM_WDT_PURGE                 3       /* retry purge threshold */

#if (DEVELOPMENT || DEBUG)
/* Mean Time Between Failures (ms) */
static volatile uint64_t skmem_region_mtbf;

static int skmem_region_mtbf_sysctl(struct sysctl_oid *, void *, int,
    struct sysctl_req *);

SYSCTL_PROC(_kern_skywalk_mem, OID_AUTO, region_mtbf,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
    skmem_region_mtbf_sysctl, "Q", "Region MTBF (ms)");

SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, region_update_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_region_update_interval,
    SKMEM_REGION_UPDATE_INTERVAL, "Region update interval (sec)");
#endif /* (DEVELOPMENT || DEBUG) */

#define SKMEM_REGION_LOCK()                     \
	lck_mtx_lock(&skmem_region_lock)
#define SKMEM_REGION_LOCK_ASSERT_HELD()         \
	LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_OWNED)
#define SKMEM_REGION_LOCK_ASSERT_NOTHELD()      \
	LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKMEM_REGION_UNLOCK()                   \
	lck_mtx_unlock(&skmem_region_lock)

/*
 * Hash table bounds.  Start with the initial value, and rescale up to
 * the specified limit.  Ideally we don't need a limit, but in practice
 * this helps guard against runaways.  These values should be revisited
 * in the future and adjusted as needed.
 */
#define SKMEM_REGION_HASH_INITIAL       32      /* initial hash table size */
#define SKMEM_REGION_HASH_LIMIT         4096    /* hash table size limit */

#define SKMEM_REGION_HASH_INDEX(_a, _s, _m)     \
	(((_a) + ((_a) >> (_s)) + ((_a) >> ((_s) << 1))) & (_m))
#define SKMEM_REGION_HASH(_skr, _addr)                                     \
	(&(_skr)->skr_hash_table[SKMEM_REGION_HASH_INDEX((uintptr_t)_addr, \
	    (_skr)->skr_hash_shift, (_skr)->skr_hash_mask)])
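
/*
 * Illustrative sketch: for a region with 32KB segments, skr_hash_shift
 * is flsll(32768) - 1 = 15 (set in skmem_region_create()), so an
 * address A is folded as (A + (A >> 15) + (A >> 30)) before masking;
 * segment addresses that differ only in their upper bits thus still
 * spread across the hash buckets.
 */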

static SKMEM_TYPE_DEFINE(skr_zone, struct skmem_region);

static unsigned int sg_size;                    /* size of zone element */
static struct skmem_cache *skmem_sg_cache;      /* cache for sksegment */

static uint32_t skmem_seg_size = SKMEM_SEG_SIZE;
static uint32_t skmem_md_seg_size = SKMEM_MD_SEG_SIZE;
static uint32_t skmem_drv_buf_seg_size = SKMEM_DRV_BUF_SEG_SIZE;
static uint32_t skmem_drv_buf_seg_eff_size = SKMEM_DRV_BUF_SEG_SIZE;
uint32_t skmem_usr_buf_seg_size = SKMEM_USR_BUF_SEG_SIZE;

#define SKMEM_TAG_SEGMENT_BMAP  "com.apple.skywalk.segment.bmap"
static SKMEM_TAG_DEFINE(skmem_tag_segment_bmap, SKMEM_TAG_SEGMENT_BMAP);

#define SKMEM_TAG_SEGMENT_HASH  "com.apple.skywalk.segment.hash"
static SKMEM_TAG_DEFINE(skmem_tag_segment_hash, SKMEM_TAG_SEGMENT_HASH);

#define SKMEM_TAG_REGION_MIB     "com.apple.skywalk.region.mib"
static SKMEM_TAG_DEFINE(skmem_tag_region_mib, SKMEM_TAG_REGION_MIB);

#define BMAPSZ  64

/* 64-bit mask with range */
#define BMASK64(_beg, _end)     \
	((((uint64_t)-1) >> ((BMAPSZ - 1) - (_end))) & ~((1ULL << (_beg)) - 1))
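
/*
 * For instance, BMASK64(2, 5) evaluates to 0x3c (bits 2 through 5,
 * inclusive, set); both endpoints must lie within [0, BMAPSZ - 1].
 */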

static int __skmem_region_inited = 0;

void
skmem_region_init(void)
{
	boolean_t randomize_seg_size;

	_CASSERT(sizeof(bitmap_t) == sizeof(uint64_t));
	_CASSERT(BMAPSZ == (sizeof(bitmap_t) << 3));
	_CASSERT((SKMEM_SEG_SIZE % SKMEM_PAGE_SIZE) == 0);
	_CASSERT(SKMEM_REGION_HASH_LIMIT >= SKMEM_REGION_HASH_INITIAL);
	ASSERT(!__skmem_region_inited);

	/* enforce the ordering here */
	_CASSERT(SKMEM_REGION_GUARD_HEAD == 0);
	_CASSERT(SKMEM_REGION_SCHEMA == 1);
	_CASSERT(SKMEM_REGION_RING == 2);
	_CASSERT(SKMEM_REGION_BUF_DEF == 3);
	_CASSERT(SKMEM_REGION_BUF_LARGE == 4);
	_CASSERT(SKMEM_REGION_RXBUF_DEF == 5);
	_CASSERT(SKMEM_REGION_RXBUF_LARGE == 6);
	_CASSERT(SKMEM_REGION_TXBUF_DEF == 7);
	_CASSERT(SKMEM_REGION_TXBUF_LARGE == 8);
	_CASSERT(SKMEM_REGION_UMD == 9);
	_CASSERT(SKMEM_REGION_TXAUSD == 10);
	_CASSERT(SKMEM_REGION_RXFUSD == 11);
	_CASSERT(SKMEM_REGION_UBFT == 12);
	_CASSERT(SKMEM_REGION_USTATS == 13);
	_CASSERT(SKMEM_REGION_FLOWADV == 14);
	_CASSERT(SKMEM_REGION_NEXUSADV == 15);
	_CASSERT(SKMEM_REGION_SYSCTLS == 16);
	_CASSERT(SKMEM_REGION_GUARD_TAIL == 17);
	_CASSERT(SKMEM_REGION_KMD == 18);
	_CASSERT(SKMEM_REGION_RXKMD == 19);
	_CASSERT(SKMEM_REGION_TXKMD == 20);
	_CASSERT(SKMEM_REGION_KBFT == 21);
	_CASSERT(SKMEM_REGION_RXKBFT == 22);
	_CASSERT(SKMEM_REGION_TXKBFT == 23);
	_CASSERT(SKMEM_REGION_TXAKSD == 24);
	_CASSERT(SKMEM_REGION_RXFKSD == 25);
	_CASSERT(SKMEM_REGION_KSTATS == 26);
	_CASSERT(SKMEM_REGION_INTRINSIC == 27);

	_CASSERT(SREG_GUARD_HEAD == SKMEM_REGION_GUARD_HEAD);
	_CASSERT(SREG_SCHEMA == SKMEM_REGION_SCHEMA);
	_CASSERT(SREG_RING == SKMEM_REGION_RING);
	_CASSERT(SREG_BUF_DEF == SKMEM_REGION_BUF_DEF);
	_CASSERT(SREG_BUF_LARGE == SKMEM_REGION_BUF_LARGE);
	_CASSERT(SREG_RXBUF_DEF == SKMEM_REGION_RXBUF_DEF);
	_CASSERT(SREG_RXBUF_LARGE == SKMEM_REGION_RXBUF_LARGE);
	_CASSERT(SREG_TXBUF_DEF == SKMEM_REGION_TXBUF_DEF);
	_CASSERT(SREG_TXBUF_LARGE == SKMEM_REGION_TXBUF_LARGE);
	_CASSERT(SREG_UMD == SKMEM_REGION_UMD);
	_CASSERT(SREG_TXAUSD == SKMEM_REGION_TXAUSD);
	_CASSERT(SREG_RXFUSD == SKMEM_REGION_RXFUSD);
	_CASSERT(SREG_UBFT == SKMEM_REGION_UBFT);
	_CASSERT(SREG_USTATS == SKMEM_REGION_USTATS);
	_CASSERT(SREG_FLOWADV == SKMEM_REGION_FLOWADV);
	_CASSERT(SREG_NEXUSADV == SKMEM_REGION_NEXUSADV);
	_CASSERT(SREG_SYSCTLS == SKMEM_REGION_SYSCTLS);
	_CASSERT(SREG_GUARD_TAIL == SKMEM_REGION_GUARD_TAIL);
	_CASSERT(SREG_KMD == SKMEM_REGION_KMD);
	_CASSERT(SREG_RXKMD == SKMEM_REGION_RXKMD);
	_CASSERT(SREG_TXKMD == SKMEM_REGION_TXKMD);
	_CASSERT(SREG_KBFT == SKMEM_REGION_KBFT);
	_CASSERT(SREG_RXKBFT == SKMEM_REGION_RXKBFT);
	_CASSERT(SREG_TXKBFT == SKMEM_REGION_TXKBFT);
	_CASSERT(SREG_TXAKSD == SKMEM_REGION_TXAKSD);
	_CASSERT(SREG_RXFKSD == SKMEM_REGION_RXFKSD);
	_CASSERT(SREG_KSTATS == SKMEM_REGION_KSTATS);

	_CASSERT(SKR_MODE_NOREDIRECT == SREG_MODE_NOREDIRECT);
	_CASSERT(SKR_MODE_MMAPOK == SREG_MODE_MMAPOK);
	_CASSERT(SKR_MODE_UREADONLY == SREG_MODE_UREADONLY);
	_CASSERT(SKR_MODE_KREADONLY == SREG_MODE_KREADONLY);
	_CASSERT(SKR_MODE_PERSISTENT == SREG_MODE_PERSISTENT);
	_CASSERT(SKR_MODE_MONOLITHIC == SREG_MODE_MONOLITHIC);
	_CASSERT(SKR_MODE_NOMAGAZINES == SREG_MODE_NOMAGAZINES);
	_CASSERT(SKR_MODE_NOCACHE == SREG_MODE_NOCACHE);
	_CASSERT(SKR_MODE_IODIR_IN == SREG_MODE_IODIR_IN);
	_CASSERT(SKR_MODE_IODIR_OUT == SREG_MODE_IODIR_OUT);
	_CASSERT(SKR_MODE_GUARD == SREG_MODE_GUARD);
	_CASSERT(SKR_MODE_SEGPHYSCONTIG == SREG_MODE_SEGPHYSCONTIG);
	_CASSERT(SKR_MODE_SHAREOK == SREG_MODE_SHAREOK);
	_CASSERT(SKR_MODE_PUREDATA == SREG_MODE_PUREDATA);
	_CASSERT(SKR_MODE_PSEUDO == SREG_MODE_PSEUDO);
	_CASSERT(SKR_MODE_THREADSAFE == SREG_MODE_THREADSAFE);
	_CASSERT(SKR_MODE_SLAB == SREG_MODE_SLAB);
	_CASSERT(SKR_MODE_MIRRORED == SREG_MODE_MIRRORED);

	(void) PE_parse_boot_argn("skmem_seg_size", &skmem_seg_size,
	    sizeof(skmem_seg_size));
	if (skmem_seg_size < SKMEM_MIN_SEG_SIZE) {
		skmem_seg_size = SKMEM_MIN_SEG_SIZE;
	}
	skmem_seg_size = (uint32_t)P2ROUNDUP(skmem_seg_size,
	    SKMEM_MIN_SEG_SIZE);
	VERIFY(skmem_seg_size != 0 && (skmem_seg_size % SKMEM_PAGE_SIZE) == 0);

	(void) PE_parse_boot_argn("skmem_md_seg_size", &skmem_md_seg_size,
	    sizeof(skmem_md_seg_size));
	if (skmem_md_seg_size < skmem_seg_size) {
		skmem_md_seg_size = skmem_seg_size;
	}
	skmem_md_seg_size = (uint32_t)P2ROUNDUP(skmem_md_seg_size,
	    SKMEM_MIN_SEG_SIZE);
	VERIFY((skmem_md_seg_size % SKMEM_PAGE_SIZE) == 0);

	/*
	 * If set via boot-args, honor it and don't randomize.
	 */
	randomize_seg_size = !PE_parse_boot_argn("skmem_drv_buf_seg_size",
	    &skmem_drv_buf_seg_size, sizeof(skmem_drv_buf_seg_size));
	if (skmem_drv_buf_seg_size < skmem_seg_size) {
		skmem_drv_buf_seg_size = skmem_seg_size;
	}
	skmem_drv_buf_seg_size = skmem_drv_buf_seg_eff_size =
	    (uint32_t)P2ROUNDUP(skmem_drv_buf_seg_size, SKMEM_MIN_SEG_SIZE);
	VERIFY((skmem_drv_buf_seg_size % SKMEM_PAGE_SIZE) == 0);

	/*
	 * Randomize the driver buffer segment size; here we choose
	 * a SKMEM_MIN_SEG_SIZE multiplier to bump up the value to.
	 * Set this as the effective driver buffer segment size.
	 */
	if (randomize_seg_size) {
		uint32_t sm;
		read_frandom(&sm, sizeof(sm));
		skmem_drv_buf_seg_eff_size +=
		    (SKMEM_MIN_SEG_SIZE * (sm % SKMEM_DRV_BUF_SEG_MULTIPLIER));
		VERIFY((skmem_drv_buf_seg_eff_size % SKMEM_MIN_SEG_SIZE) == 0);
	}
	VERIFY(skmem_drv_buf_seg_eff_size >= skmem_drv_buf_seg_size);

	(void) PE_parse_boot_argn("skmem_usr_buf_seg_size",
	    &skmem_usr_buf_seg_size, sizeof(skmem_usr_buf_seg_size));
	if (skmem_usr_buf_seg_size < skmem_seg_size) {
		skmem_usr_buf_seg_size = skmem_seg_size;
	}
	skmem_usr_buf_seg_size = (uint32_t)P2ROUNDUP(skmem_usr_buf_seg_size,
	    SKMEM_MIN_SEG_SIZE);
	VERIFY((skmem_usr_buf_seg_size % SKMEM_PAGE_SIZE) == 0);

	SK_ERR("seg_size %u, md_seg_size %u, drv_buf_seg_size %u [eff %u], "
	    "usr_buf_seg_size %u", skmem_seg_size, skmem_md_seg_size,
	    skmem_drv_buf_seg_size, skmem_drv_buf_seg_eff_size,
	    skmem_usr_buf_seg_size);

	TAILQ_INIT(&skmem_region_head);

	skmem_region_update_tc =
	    thread_call_allocate_with_options(skmem_region_update_func,
	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (skmem_region_update_tc == NULL) {
		panic("%s: thread_call_allocate failed", __func__);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	sg_size = sizeof(struct sksegment);
	skmem_sg_cache = skmem_cache_create("sg", sg_size,
	    sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);

	/* and start the periodic region update machinery */
	skmem_dispatch(skmem_region_update_tc, NULL,
	    (skmem_region_update_interval * NSEC_PER_SEC));

	__skmem_region_inited = 1;
}

void
skmem_region_fini(void)
{
	if (__skmem_region_inited) {
		ASSERT(TAILQ_EMPTY(&skmem_region_head));

		if (skmem_region_update_tc != NULL) {
			(void) thread_call_cancel_wait(skmem_region_update_tc);
			(void) thread_call_free(skmem_region_update_tc);
			skmem_region_update_tc = NULL;
		}

		if (skmem_sg_cache != NULL) {
			skmem_cache_destroy(skmem_sg_cache);
			skmem_sg_cache = NULL;
		}

		__skmem_region_inited = 0;
	}
}

/*
 * Reap internal caches.
 */
void
skmem_region_reap_caches(boolean_t purge)
{
	skmem_cache_reap_now(skmem_sg_cache, purge);
}

/*
 * Configure and compute the parameters of a region.
 */
void
skmem_region_params_config(struct skmem_region_params *srp)
{
	uint32_t cache_line_size = skmem_cpu_cache_line_size();
	size_t seglim, segsize, segcnt;
	size_t objsize, objcnt;

	ASSERT(srp->srp_id < SKMEM_REGIONS);

	/*
	 * If the magazines layer is disabled system-wide, override
	 * the region parameter here.  This will effectively reduce
	 * the number of requested objects computed below.  Note that
	 * the region may have already been configured to exclude
	 * magazines in the default skmem_regions[] array.
	 */
	if (!skmem_allow_magazines()) {
		srp->srp_cflags |= SKMEM_REGION_CR_NOMAGAZINES;
	}

	objsize = srp->srp_r_obj_size;
	ASSERT(objsize != 0);
	objcnt = srp->srp_r_obj_cnt;
	ASSERT(objcnt != 0);

	if (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO) {
		size_t align = srp->srp_align;

		VERIFY(align != 0 && (align % SKMEM_CACHE_ALIGN) == 0);
		VERIFY(powerof2(align));
		objsize = MAX(objsize, sizeof(uint64_t));
#if KASAN
		/*
		 * When KASAN is enabled, the zone allocator adjusts the
		 * element size to include the redzone regions, in which
		 * case we assume that the elements won't start on the
		 * alignment boundary and thus need to do some fix-ups.
		 * These include increasing the effective object size
		 * which adds at least 16 bytes to the original size.
		 */
		objsize += sizeof(uint64_t) + align;
#endif /* KASAN */
		objsize = P2ROUNDUP(objsize, align);

		segsize = objsize;
		srp->srp_r_seg_size = (uint32_t)segsize;
		segcnt = objcnt;
		goto done;
	} else {
		/* objects are always aligned at CPU cache line size */
		srp->srp_align = cache_line_size;
	}

	/*
	 * Start with the default segment size for the region, and compute
	 * the effective segment size (to the nearest SKMEM_MIN_SEG_SIZE).
	 * If the object size is greater, then we adjust the segment size
	 * to the next multiple of the effective size larger than the
	 * object size.
	 */
	if (srp->srp_r_seg_size == 0) {
		switch (srp->srp_id) {
		case SKMEM_REGION_UMD:
		case SKMEM_REGION_KMD:
		case SKMEM_REGION_RXKMD:
		case SKMEM_REGION_TXKMD:
			srp->srp_r_seg_size = skmem_md_seg_size;
			break;

		case SKMEM_REGION_BUF_DEF:
		case SKMEM_REGION_RXBUF_DEF:
		case SKMEM_REGION_TXBUF_DEF:
			/*
			 * Use the effective driver buffer segment size,
			 * since it reflects any randomization done at
			 * skmem_region_init() time.
			 */
			srp->srp_r_seg_size = skmem_drv_buf_seg_eff_size;
			break;

		default:
			srp->srp_r_seg_size = skmem_seg_size;
			break;
		}
	} else {
		srp->srp_r_seg_size = (uint32_t)P2ROUNDUP(srp->srp_r_seg_size,
		    SKMEM_MIN_SEG_SIZE);
	}

	seglim = srp->srp_r_seg_size;
	VERIFY(seglim != 0 && (seglim % SKMEM_PAGE_SIZE) == 0);

	SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu objcnt %zu",
	    srp->srp_name, seglim, objsize, objcnt);

	/*
	 * Make sure the object size is a multiple of the CPU cache line
	 * size, and that it evenly divides the segment size.
	 */
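	/*
	 * Illustrative numbers: with a 64-byte cache line and a 32KB
	 * seglim, a 100-byte object is first rounded up to 128 bytes;
	 * 128 evenly divides 32768, so the loop below adds no further
	 * padding.
	 */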
	if (!((objsize < cache_line_size) && (objsize < seglim) &&
	    ((cache_line_size % objsize) == 0) && ((seglim % objsize) == 0))) {
		objsize = P2ROUNDUP(objsize, cache_line_size);
		while (objsize < seglim && (seglim % objsize) != 0) {
			SK_DF(SK_VERB_MEM, "%s: objsize %zu -> %zu",
			    srp->srp_name, objsize, objsize + cache_line_size);
			objsize += cache_line_size;
		}
	}

	/* segment must be larger than object */
	while (objsize > seglim) {
		SK_DF(SK_VERB_MEM, "%s: seglim %zu -> %zu", srp->srp_name,
		    seglim, seglim + SKMEM_MIN_SEG_SIZE);
		seglim += SKMEM_MIN_SEG_SIZE;
	}

	/*
	 * Take into account worst-case per-CPU cached
	 * objects if this region is configured for it.
	 */
	if (!(srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES)) {
		uint32_t magazine_max_objs =
		    skmem_cache_magazine_max((uint32_t)objsize);
		SK_DF(SK_VERB_MEM, "%s: objcnt %zu -> %zu", srp->srp_name,
		    objcnt, objcnt + magazine_max_objs);
		objcnt += magazine_max_objs;
	}

	SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu "
	    "objcnt %zu", srp->srp_name, seglim, objsize, objcnt);

	segsize = P2ROUNDUP(objsize * objcnt, SKMEM_MIN_SEG_SIZE);
	if (seglim > segsize) {
		/*
		 * If the segment limit is larger than what we need,
		 * avoid memory wastage by shrinking it.
		 */
		while (seglim > segsize && seglim > SKMEM_MIN_SEG_SIZE) {
			VERIFY(seglim >= SKMEM_MIN_SEG_SIZE);
			SK_DF(SK_VERB_MEM,
			    "%s: segsize %zu (%zu*%zu) seglim [-] %zu -> %zu",
			    srp->srp_name, segsize, objsize, objcnt, seglim,
			    P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE,
			    SKMEM_MIN_SEG_SIZE));
			seglim = P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE,
			    SKMEM_MIN_SEG_SIZE);
		}

		/* adjust segment size */
		segsize = seglim;
	} else if (seglim < segsize) {
		size_t oseglim = seglim;
		/*
		 * If the segment limit is less than the segment size,
		 * see if increasing it slightly (up to 1.5x the segment
		 * size) would allow us to avoid allocating too many
		 * extra objects (due to excessive segment count).
		 */
		while (seglim < segsize && (segsize % seglim) != 0) {
			SK_DF(SK_VERB_MEM,
			    "%s: segsize %zu (%zu*%zu) seglim [+] %zu -> %zu",
			    srp->srp_name, segsize, objsize, objcnt, seglim,
			    (seglim + SKMEM_MIN_SEG_SIZE));
			seglim += SKMEM_MIN_SEG_SIZE;
			if (seglim >= (oseglim + (oseglim >> 1))) {
				break;
			}
		}

		/* can't use P2ROUNDUP since seglim may not be power of 2 */
		segsize = SK_ROUNDUP(segsize, seglim);
	}
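
	/*
	 * Worked example (illustrative numbers, with magazines disabled
	 * so objcnt is not bumped): objsize 128 and objcnt 2000 yield
	 * segsize = P2ROUNDUP(256000, 32K) = 262144; a 32KB seglim
	 * divides that evenly, so below this becomes segcnt 8 with
	 * segsize 32768, and objcnt is recomputed to
	 * (32768 * 8) / 128 = 2048.
	 */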
	ASSERT(segsize != 0 && (segsize % seglim) == 0);

	SK_DF(SK_VERB_MEM, "%s: segsize %zu seglim %zu",
	    srp->srp_name, segsize, seglim);

	/* compute segment count, and recompute segment size */
	if (srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) {
		segcnt = 1;
	} else {
		/*
		 * The adjustments above were done in increments of
		 * SKMEM_MIN_SEG_SIZE.  If the object size is greater
		 * than that, ensure that the segment size is a multiple
		 * of the object size.
		 */
		if (objsize > SKMEM_MIN_SEG_SIZE) {
			ASSERT(seglim >= objsize);
			if ((seglim % objsize) != 0) {
				seglim += (seglim - objsize);
			}
			/* recompute segsize; see SK_ROUNDUP comment above */
			segsize = SK_ROUNDUP(segsize, seglim);
		}

		segcnt = MAX(1, (segsize / seglim));
		segsize /= segcnt;
	}

	SK_DF(SK_VERB_MEM, "%s: segcnt %zu segsize %zu",
	    srp->srp_name, segcnt, segsize);

	/* recompute object count to avoid wastage */
	objcnt = (segsize * segcnt) / objsize;
	ASSERT(objcnt != 0);
done:
	srp->srp_c_obj_size = (uint32_t)objsize;
	srp->srp_c_obj_cnt = (uint32_t)objcnt;
	srp->srp_c_seg_size = (uint32_t)segsize;
	srp->srp_seg_cnt = (uint32_t)segcnt;

	SK_DF(SK_VERB_MEM, "%s: objsize %zu objcnt %zu segcnt %zu segsize %zu",
	    srp->srp_name, objsize, objcnt, segcnt, segsize);

#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		char label[32];
		(void) snprintf(label, sizeof(label), "REGION_%s:",
		    skmem_region_id2name(srp->srp_id));
		SK_D("%-16s o:[%4u x %6u -> %4u x %6u]", label,
		    (uint32_t)srp->srp_r_obj_cnt,
		    (uint32_t)srp->srp_r_obj_size,
		    (uint32_t)srp->srp_c_obj_cnt,
		    (uint32_t)srp->srp_c_obj_size);
	}
#endif /* SK_LOG */
}

/*
 * Create a region.
 */
struct skmem_region *
skmem_region_create(const char *name, struct skmem_region_params *srp,
    sksegment_ctor_fn_t ctor, sksegment_dtor_fn_t dtor, void *private)
{
	boolean_t pseudo = (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO);
	uint32_t cflags = srp->srp_cflags;
	struct skmem_region *skr;
	uint32_t i;

	ASSERT(srp->srp_id < SKMEM_REGIONS);
	ASSERT(srp->srp_c_seg_size != 0 &&
	    (pseudo || (srp->srp_c_seg_size % SKMEM_PAGE_SIZE) == 0));
	ASSERT(srp->srp_seg_cnt != 0);
	ASSERT(srp->srp_c_obj_cnt == 1 ||
	    (srp->srp_c_seg_size % srp->srp_c_obj_size) == 0);
	ASSERT(srp->srp_c_obj_size <= srp->srp_c_seg_size);

	skr = zalloc_flags(skr_zone, Z_WAITOK | Z_ZERO);
	skr->skr_params.srp_r_seg_size = srp->srp_r_seg_size;
	skr->skr_seg_size = srp->srp_c_seg_size;
	skr->skr_size = (srp->srp_c_seg_size * srp->srp_seg_cnt);
	skr->skr_seg_objs = (srp->srp_c_seg_size / srp->srp_c_obj_size);

	if (!pseudo) {
		skr->skr_seg_max_cnt = srp->srp_seg_cnt;

		/* set alignment to CPU cache line size */
		skr->skr_params.srp_align = skmem_cpu_cache_line_size();

		/* allocate the allocated-address hash chain */
		skr->skr_hash_initial = SKMEM_REGION_HASH_INITIAL;
		skr->skr_hash_limit = SKMEM_REGION_HASH_LIMIT;
		skr->skr_hash_table = sk_alloc_type_array(struct sksegment_bkt,
		    skr->skr_hash_initial, Z_WAITOK | Z_NOFAIL,
		    skmem_tag_segment_hash);
		skr->skr_hash_mask = (skr->skr_hash_initial - 1);
		skr->skr_hash_shift = flsll(srp->srp_c_seg_size) - 1;

		for (i = 0; i < (skr->skr_hash_mask + 1); i++) {
			TAILQ_INIT(&skr->skr_hash_table[i].sgb_head);
		}
	} else {
		/* this upper bound doesn't apply */
		skr->skr_seg_max_cnt = 0;

		/* pick up value set by skmem_regions_params_config() */
		skr->skr_params.srp_align = srp->srp_align;
	}

	skr->skr_r_obj_size = srp->srp_r_obj_size;
	skr->skr_r_obj_cnt = srp->srp_r_obj_cnt;
	skr->skr_c_obj_size = srp->srp_c_obj_size;
	skr->skr_c_obj_cnt = srp->srp_c_obj_cnt;

	skr->skr_params.srp_md_type = srp->srp_md_type;
	skr->skr_params.srp_md_subtype = srp->srp_md_subtype;
	skr->skr_params.srp_max_frags = srp->srp_max_frags;

	skr->skr_seg_ctor = ctor;
	skr->skr_seg_dtor = dtor;
	skr->skr_private = private;

	lck_mtx_init(&skr->skr_lock, &skmem_region_lock_grp,
	    &skmem_region_lock_attr);

	TAILQ_INIT(&skr->skr_seg_free);
	RB_INIT(&skr->skr_seg_tfree);

	skr->skr_id = srp->srp_id;
	uuid_generate_random(skr->skr_uuid);
	(void) snprintf(skr->skr_name, sizeof(skr->skr_name),
	    "%s.%s.%s", SKMEM_REGION_PREFIX, srp->srp_name, name);

	SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ",
	    skr->skr_name, SK_KVA(skr));

	/* sanity check */
	ASSERT(!(cflags & SKMEM_REGION_CR_GUARD) ||
	    !(cflags & (SKMEM_REGION_CR_KREADONLY | SKMEM_REGION_CR_UREADONLY |
	    SKMEM_REGION_CR_PERSISTENT | SKMEM_REGION_CR_SHAREOK |
	    SKMEM_REGION_CR_IODIR_IN | SKMEM_REGION_CR_IODIR_OUT |
	    SKMEM_REGION_CR_PUREDATA)));

	skr->skr_cflags = cflags;
	if (cflags & SKMEM_REGION_CR_NOREDIRECT) {
		skr->skr_mode |= SKR_MODE_NOREDIRECT;
	}
	if (cflags & SKMEM_REGION_CR_MMAPOK) {
		skr->skr_mode |= SKR_MODE_MMAPOK;
	}
	if ((cflags & SKMEM_REGION_CR_MMAPOK) &&
	    (cflags & SKMEM_REGION_CR_UREADONLY)) {
		skr->skr_mode |= SKR_MODE_UREADONLY;
	}
	if (cflags & SKMEM_REGION_CR_KREADONLY) {
		skr->skr_mode |= SKR_MODE_KREADONLY;
	}
	if (cflags & SKMEM_REGION_CR_PERSISTENT) {
		skr->skr_mode |= SKR_MODE_PERSISTENT;
	}
	if (cflags & SKMEM_REGION_CR_MONOLITHIC) {
		skr->skr_mode |= SKR_MODE_MONOLITHIC;
	}
	if (cflags & SKMEM_REGION_CR_NOMAGAZINES) {
		skr->skr_mode |= SKR_MODE_NOMAGAZINES;
	}
	if (cflags & SKMEM_REGION_CR_NOCACHE) {
		skr->skr_mode |= SKR_MODE_NOCACHE;
	}
	if (cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) {
		skr->skr_mode |= SKR_MODE_SEGPHYSCONTIG;
	}
	if (cflags & SKMEM_REGION_CR_SHAREOK) {
		skr->skr_mode |= SKR_MODE_SHAREOK;
	}
	if (cflags & SKMEM_REGION_CR_IODIR_IN) {
		skr->skr_mode |= SKR_MODE_IODIR_IN;
	}
	if (cflags & SKMEM_REGION_CR_IODIR_OUT) {
		skr->skr_mode |= SKR_MODE_IODIR_OUT;
	}
	if (cflags & SKMEM_REGION_CR_GUARD) {
		skr->skr_mode |= SKR_MODE_GUARD;
	}
	if (cflags & SKMEM_REGION_CR_PUREDATA) {
		skr->skr_mode |= SKR_MODE_PUREDATA;
	}
	if (cflags & SKMEM_REGION_CR_PSEUDO) {
		skr->skr_mode |= SKR_MODE_PSEUDO;
	}
	if (cflags & SKMEM_REGION_CR_THREADSAFE) {
		skr->skr_mode |= SKR_MODE_THREADSAFE;
	}
	if (cflags & SKMEM_REGION_CR_MEMTAG) {
		skr->skr_mode |= SKR_MODE_MEMTAG;
	}

#if XNU_TARGET_OS_OSX
	/*
	 * Mark all regions as persistent except for the guard and
	 * intrinsic regions.  This is to ensure that kernel threads
	 * won't fault while accessing these memory regions.  We have
	 * observed various kinds of kernel panics due to kernel threads
	 * faulting on non-wired memory access when the VM subsystem is
	 * not in a state to swap-in the page.
	 */
	if (!((skr->skr_mode & SKR_MODE_PSEUDO) ||
	    (skr->skr_mode & SKR_MODE_GUARD))) {
		skr->skr_mode |= SKR_MODE_PERSISTENT;
	}
#endif /* XNU_TARGET_OS_OSX */

	/* SKR_MODE_UREADONLY only takes effect for user task mapping */
	skr->skr_bufspec.user_writable = !(skr->skr_mode & SKR_MODE_UREADONLY);
	skr->skr_bufspec.kernel_writable = !(skr->skr_mode & SKR_MODE_KREADONLY);
	skr->skr_bufspec.purgeable = TRUE;
	skr->skr_bufspec.inhibitCache = !!(skr->skr_mode & SKR_MODE_NOCACHE);
	skr->skr_bufspec.physcontig = (skr->skr_mode & SKR_MODE_SEGPHYSCONTIG);
	skr->skr_bufspec.iodir_in = !!(skr->skr_mode & SKR_MODE_IODIR_IN);
	skr->skr_bufspec.iodir_out = !!(skr->skr_mode & SKR_MODE_IODIR_OUT);
	skr->skr_bufspec.puredata = !!(skr->skr_mode & SKR_MODE_PUREDATA);
	skr->skr_bufspec.threadSafe = !!(skr->skr_mode & SKR_MODE_THREADSAFE);
	skr->skr_regspec.noRedirect = !!(skr->skr_mode & SKR_MODE_NOREDIRECT);

	/* allocate segment bitmaps */
	if (!(skr->skr_mode & SKR_MODE_PSEUDO)) {
		ASSERT(skr->skr_seg_max_cnt != 0);
		skr->skr_seg_bmap_len = BITMAP_LEN(skr->skr_seg_max_cnt);
		skr->skr_seg_bmap = sk_alloc_data(BITMAP_SIZE(skr->skr_seg_max_cnt),
		    Z_WAITOK | Z_NOFAIL, skmem_tag_segment_bmap);
		ASSERT(BITMAP_SIZE(skr->skr_seg_max_cnt) ==
		    (skr->skr_seg_bmap_len * sizeof(*skr->skr_seg_bmap)));

		/* mark all bitmaps as free (bit set) */
		bitmap_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt);
	}

	/*
	 * Populate the freelist by allocating all segments for the
	 * region, which will be mapped but not faulted-in, and then
	 * immediately inserting each into the freelist.  That will in
	 * turn unmap the segment's memory object.
	 */
	SKR_LOCK(skr);
	if (skr->skr_mode & SKR_MODE_PSEUDO) {
		char zone_name[64];
		(void) snprintf(zone_name, sizeof(zone_name), "%s.reg.%s",
		    SKMEM_ZONE_PREFIX, name);
		skr->skr_zreg = zone_create(zone_name, skr->skr_c_obj_size,
		    ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
	} else {
		/* create a backing IOSKRegion object */
		if ((skr->skr_reg = IOSKRegionCreate(&skr->skr_regspec,
		    (IOSKSize)skr->skr_seg_size,
		    (IOSKCount)skr->skr_seg_max_cnt)) == NULL) {
			SK_ERR("\"%s\": [%u * %u] cflags 0x%b skr_reg failed",
			    skr->skr_name, (uint32_t)skr->skr_seg_size,
			    (uint32_t)skr->skr_seg_max_cnt, skr->skr_cflags,
			    SKMEM_REGION_CR_BITS);
			goto failed;
		}
	}

	ASSERT(skr->skr_seg_objs != 0);

	++skr->skr_refcnt;      /* for caller */
	SKR_UNLOCK(skr);

	SKMEM_REGION_LOCK();
	TAILQ_INSERT_TAIL(&skmem_region_head, skr, skr_link);
	SKMEM_REGION_UNLOCK();

	SK_DF(SK_VERB_MEM_REGION,
	    "  [TOTAL] seg (%u*%u) obj (%u*%u) cflags 0x%b",
	    (uint32_t)skr->skr_seg_size, (uint32_t)skr->skr_seg_max_cnt,
	    (uint32_t)skr->skr_c_obj_size, (uint32_t)skr->skr_c_obj_cnt,
	    skr->skr_cflags, SKMEM_REGION_CR_BITS);

	return skr;

failed:
	SKR_LOCK_ASSERT_HELD(skr);
	skmem_region_destroy(skr);

	return NULL;
}

/*
 * Destroy a region.
 */
static void
skmem_region_destroy(struct skmem_region *skr)
{
	struct skmem_region *mskr;

	SKR_LOCK_ASSERT_HELD(skr);

	SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx",
	    skr->skr_name, SK_KVA(skr));

	/*
	 * Panic if we detect there are unfreed segments; the caller
	 * destroying this region is responsible for ensuring that all
	 * allocated segments have been freed prior to getting here.
	 */
	ASSERT(skr->skr_refcnt == 0);
	if (skr->skr_seginuse != 0) {
		panic("%s: '%s' (%p) not empty (%u unfreed)",
		    __func__, skr->skr_name, (void *)skr, skr->skr_seginuse);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (skr->skr_link.tqe_next != NULL || skr->skr_link.tqe_prev != NULL) {
		SKR_UNLOCK(skr);
		SKMEM_REGION_LOCK();
		TAILQ_REMOVE(&skmem_region_head, skr, skr_link);
		SKMEM_REGION_UNLOCK();
		SKR_LOCK(skr);
		ASSERT(skr->skr_refcnt == 0);
	}

	/*
	 * Undo what's done earlier at region creation time.
	 */
	skmem_region_depopulate(skr);
	ASSERT(TAILQ_EMPTY(&skr->skr_seg_free));
	ASSERT(RB_EMPTY(&skr->skr_seg_tfree));
	ASSERT(skr->skr_seg_free_cnt == 0);

	if (skr->skr_reg != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
		IOSKRegionDestroy(skr->skr_reg);
		skr->skr_reg = NULL;
	}

	if (skr->skr_zreg != NULL) {
		ASSERT(skr->skr_mode & SKR_MODE_PSEUDO);
		zdestroy(skr->skr_zreg);
		skr->skr_zreg = NULL;
	}

	if (skr->skr_seg_bmap != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
#if (DEBUG || DEVELOPMENT)
		ASSERT(skr->skr_seg_bmap_len != 0);
		/* must have been set to vacant (bit set) by now */
		assert(bitmap_is_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt));
#endif /* DEBUG || DEVELOPMENT */

		sk_free_data(skr->skr_seg_bmap, BITMAP_SIZE(skr->skr_seg_max_cnt));
		skr->skr_seg_bmap = NULL;
		skr->skr_seg_bmap_len = 0;
	}
	ASSERT(skr->skr_seg_bmap_len == 0);

	if (skr->skr_hash_table != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
#if (DEBUG || DEVELOPMENT)
		for (uint32_t i = 0; i < (skr->skr_hash_mask + 1); i++) {
			ASSERT(TAILQ_EMPTY(&skr->skr_hash_table[i].sgb_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		sk_free_type_array(struct sksegment_bkt, skr->skr_hash_mask + 1,
		    skr->skr_hash_table);
		skr->skr_hash_table = NULL;
	}
	if ((mskr = skr->skr_mirror) != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
		skr->skr_mirror = NULL;
		mskr->skr_mode &= ~SKR_MODE_MIRRORED;
	}
	SKR_UNLOCK(skr);

	if (mskr != NULL) {
		skmem_region_release(mskr);
	}

	lck_mtx_destroy(&skr->skr_lock, &skmem_region_lock_grp);

	zfree(skr_zone, skr);
}

/*
 * Mirror mskr (slave) to skr (master).
 */
void
skmem_region_mirror(struct skmem_region *skr, struct skmem_region *mskr)
{
	SK_DF(SK_VERB_MEM_REGION, "skr master 0x%llx, slave 0x%llx ",
	    SK_KVA(skr), SK_KVA(mskr));

	SKR_LOCK(skr);
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	ASSERT(!(mskr->skr_mode & SKR_MODE_MIRRORED));
	ASSERT(skr->skr_mirror == NULL);

	/* both regions must share identical parameters */
	ASSERT(skr->skr_size == mskr->skr_size);
	ASSERT(skr->skr_seg_size == mskr->skr_seg_size);
	ASSERT(skr->skr_seg_free_cnt == mskr->skr_seg_free_cnt);

	skr->skr_mirror = mskr;
	skmem_region_retain(mskr);
	mskr->skr_mode |= SKR_MODE_MIRRORED;
	SKR_UNLOCK(skr);
}

void
skmem_region_slab_config(struct skmem_region *skr, struct skmem_cache *skm,
    bool attach)
{
	int i;

	SKR_LOCK(skr);
	if (attach) {
		for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] != NULL;
		    i++) {
			;
		}
		VERIFY(i < SKR_MAX_CACHES);
		ASSERT(skr->skr_cache[i] == NULL);
		skr->skr_mode |= SKR_MODE_SLAB;
		skr->skr_cache[i] = skm;
		skmem_region_retain_locked(skr);
		SKR_UNLOCK(skr);
	} else {
		ASSERT(skr->skr_mode & SKR_MODE_SLAB);
		for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] != skm;
		    i++) {
			;
		}
		VERIFY(i < SKR_MAX_CACHES);
		ASSERT(skr->skr_cache[i] == skm);
		skr->skr_cache[i] = NULL;
		for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] == NULL;
		    i++) {
			;
		}
		if (i == SKR_MAX_CACHES) {
			skr->skr_mode &= ~SKR_MODE_SLAB;
		}
		if (!skmem_region_release_locked(skr)) {
			SKR_UNLOCK(skr);
		}
	}
}

/*
 * Common routines for skmem_region_{alloc,mirror_alloc}.
 */
static void *
skmem_region_alloc_common(struct skmem_region *skr, struct sksegment *sg)
{
	struct sksegment_bkt *sgb;
	void *addr;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(sg->sg_md != NULL);
	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
	addr = (void *)sg->sg_start;
	sgb = SKMEM_REGION_HASH(skr, addr);
	ASSERT(sg->sg_link.tqe_next == NULL);
	ASSERT(sg->sg_link.tqe_prev == NULL);
	TAILQ_INSERT_HEAD(&sgb->sgb_head, sg, sg_link);

	skr->skr_seginuse++;
	skr->skr_meminuse += skr->skr_seg_size;
	if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) {
		skr->skr_w_meminuse += skr->skr_seg_size;
	}
	skr->skr_alloc++;

	return addr;
}

/*
 * Allocate a segment from the region.
 */
void *
skmem_region_alloc(struct skmem_region *skr, void **maddr,
    struct sksegment **retsg, struct sksegment **retsgm, uint32_t skmflag)
{
	struct sksegment *sg = NULL;
	struct sksegment *sg1 = NULL;
	void *addr = NULL, *addr1 = NULL;
	uint32_t retries = 0;

	VERIFY(!(skr->skr_mode & SKR_MODE_GUARD));

	if (retsg != NULL) {
		*retsg = NULL;
	}
	if (retsgm != NULL) {
		*retsgm = NULL;
	}

	/* SKMEM_NOSLEEP and SKMEM_FAILOK are mutually exclusive */
	VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) !=
	    (SKMEM_NOSLEEP | SKMEM_FAILOK));

	SKR_LOCK(skr);
	while (sg == NULL) {
		/* see if there's a segment in the freelist */
		sg = TAILQ_FIRST(&skr->skr_seg_free);
		if (sg == NULL) {
			/* see if we can grow the freelist */
			sg = sksegment_freelist_grow(skr);
			if (sg != NULL) {
				break;
			}

			if (skr->skr_mode & SKR_MODE_SLAB) {
				SKR_UNLOCK(skr);
				/*
				 * None found; it's possible that the slab
				 * layer is caching an extra amount, so ask
				 * skmem_cache to reap/purge its caches.
				 */
				for (int i = 0; i < SKR_MAX_CACHES; i++) {
					if (skr->skr_cache[i] == NULL) {
						continue;
					}
					skmem_cache_reap_now(skr->skr_cache[i],
					    TRUE);
				}
				SKR_LOCK(skr);
				/*
				 * If we manage to get some freed, try again.
				 */
				if (TAILQ_FIRST(&skr->skr_seg_free) != NULL) {
					continue;
				}
			}

			/*
			 * Give up if this is a non-blocking allocation,
			 * or if this is a blocking allocation but the
			 * caller is willing to retry.
			 */
			if (skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) {
				break;
			}

			/* otherwise we wait until one is available */
			++skr->skr_seg_waiters;
			(void) msleep(&skr->skr_seg_free, &skr->skr_lock,
			    (PZERO - 1), skr->skr_name, NULL);
		}
	}

	SKR_LOCK_ASSERT_HELD(skr);

	if (sg != NULL) {
retry:
		/*
		 * We have a segment; remove it from the freelist and
		 * insert it into the allocated-address hash chain.
		 * Note that this may return NULL if we can't allocate
		 * the memory descriptor.
		 */
		if (sksegment_freelist_remove(skr, sg, skmflag,
		    FALSE) == NULL) {
			ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
			ASSERT(sg->sg_md == NULL);
			ASSERT(sg->sg_start == 0 && sg->sg_end == 0);

			/*
			 * If it's a non-blocking allocation, simply give
			 * up and let the caller decide when to retry.  Else,
			 * it gets a bit complicated due to the contract we
			 * have for blocking allocations with the client; the
			 * most sensible thing to do here is to retry the
			 * allocation ourselves.  Note that we keep using the
			 * same segment we originally got, since we only need
			 * the memory descriptor to be allocated for it; thus
			 * we make sure we don't release the region lock when
			 * retrying allocation.  Doing so is crucial when the
			 * region is mirrored, since the segment indices on
			 * both regions need to match.
			 */
			if (skmflag & SKMEM_NOSLEEP) {
				SK_ERR("\"%s\": failed to allocate segment "
				    "(non-sleeping mode)", skr->skr_name);
				sg = NULL;
			} else {
				if (++retries > SKMEM_WDT_MAXTIME) {
					panic_plain("\"%s\": failed to "
					    "allocate segment (sleeping mode) "
					    "after %u retries\n\n%s",
					    skr->skr_name, SKMEM_WDT_MAXTIME,
					    skmem_dump(skr));
					/* NOTREACHED */
					__builtin_unreachable();
				} else {
					SK_ERR("\"%s\": failed to allocate "
					    "segment (sleeping mode): %u "
					    "retries", skr->skr_name, retries);
				}
				if (skr->skr_mode & SKR_MODE_SLAB) {
					/*
					 * We can't get any memory descriptor
					 * for this segment; reap extra cached
					 * objects from the slab layer and hope
					 * that we get lucky next time around.
					 *
					 * XXX [email protected]: perhaps also
					 * trigger the zone allocator to do
					 * its garbage collection here?
					 */
					skmem_cache_reap();
				}
				delay(1 * USEC_PER_SEC);        /* 1 sec */
				goto retry;
			}
		}

		if (sg != NULL) {
			/* insert to allocated-address hash chain */
			addr = skmem_region_alloc_common(skr, sg);
		}
	}

	if (sg == NULL) {
		VERIFY(skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK));
		if (skmflag & SKMEM_PANIC) {
			VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) ==
			    SKMEM_NOSLEEP);
			/*
			 * If this is a failed non-blocking alloc and the
			 * caller insists that it must be successful, then
			 * panic.
			 */
			panic_plain("\"%s\": skr 0x%p unable to satisfy "
			    "mandatory allocation\n", skr->skr_name, skr);
			/* NOTREACHED */
			__builtin_unreachable();
		} else {
			/*
			 * Give up if this is a non-blocking allocation,
			 * or one where the caller is willing to handle
			 * allocation failures.
			 */
			goto done;
		}
	}

	ASSERT((mach_vm_address_t)addr == sg->sg_start);

#if SK_LOG
	SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
	    SK_KVA(skr), SK_KVA(sg));
	if (skr->skr_mirror == NULL ||
	    !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) {
		SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx)",
		    sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
	} else {
		SK_DF(SK_VERB_MEM_REGION, "  [%u] sg 0x%llx [0x%llx-0x%llx) "
		    "mirrored", sg->sg_index, SK_KVA(sg), SK_KVA(sg->sg_start),
		    SK_KVA(sg->sg_end));
	}
#endif /* SK_LOG */

	/*
	 * If mirroring, allocate shadow object from slave region.
	 */
	if (skr->skr_mirror != NULL) {
		ASSERT(skr->skr_mirror != skr);
		ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
		ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED);
		addr1 = skmem_region_mirror_alloc(skr->skr_mirror, sg, &sg1);
		ASSERT(addr1 != NULL);
		ASSERT(sg1 != NULL && sg1 != sg);
		ASSERT(sg1->sg_index == sg->sg_index);
	}

done:
	SKR_UNLOCK(skr);

	/* return segment metadata to caller if asked (reference not needed) */
	if (addr != NULL) {
		if (retsg != NULL) {
			*retsg = sg;
		}
		if (retsgm != NULL) {
			*retsgm = sg1;
		}
	}

	if (maddr != NULL) {
		*maddr = addr1;
	}

	return addr;
}

/*
 * Allocate a segment from a mirror region at the same index.  While this
 * is essentially a simplified variant of skmem_region_alloc, keeping it
 * separate allows us to avoid further convoluting that routine.
 */
static void *
skmem_region_mirror_alloc(struct skmem_region *skr, struct sksegment *sg0,
    struct sksegment **retsg)
{
	struct sksegment sg_key = { .sg_index = sg0->sg_index };
	struct sksegment *sg = NULL;
	void *addr = NULL;

	ASSERT(skr->skr_mode & SKR_MODE_MIRRORED);
	ASSERT(skr->skr_mirror == NULL);
	ASSERT(sg0->sg_type == SKSEG_TYPE_ALLOC);

	if (retsg != NULL) {
		*retsg = NULL;
	}

	SKR_LOCK(skr);

	/*
	 * See if we can find one in the freelist first.  Otherwise,
	 * create a new segment of the same index and add that to the
	 * freelist.  We would always get a segment since both regions
	 * are synchronized when it comes to the indices of allocated
	 * segments.
	 */
	sg = RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key);
	if (sg == NULL) {
		sg = sksegment_alloc_with_idx(skr, sg0->sg_index);
		VERIFY(sg != NULL);
	}
	VERIFY(sg->sg_index == sg0->sg_index);

	/*
	 * We have a segment; remove it from the freelist and insert
	 * it into the allocated-address hash chain.  This either
	 * succeeds or panics (SKMEM_PANIC) when a memory descriptor
	 * can't be allocated.
	 *
	 * TODO: consider retrying IOBMD allocation attempts if needed.
	 */
	sg = sksegment_freelist_remove(skr, sg, SKMEM_PANIC, FALSE);
	VERIFY(sg != NULL);

	/* insert to allocated-address hash chain */
	addr = skmem_region_alloc_common(skr, sg);

#if SK_LOG
	SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
	    SK_KVA(skr), SK_KVA(sg));
	SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx)",
	    sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
#endif /* SK_LOG */

	SKR_UNLOCK(skr);

	/* return segment metadata to caller if asked (reference not needed) */
	if (retsg != NULL) {
		*retsg = sg;
	}

	return addr;
}

/*
 * Free a segment to the region.
 */
void
skmem_region_free(struct skmem_region *skr, void *addr, void *maddr)
{
	struct sksegment_bkt *sgb;
	struct sksegment *sg, *tsg;

	VERIFY(!(skr->skr_mode & SKR_MODE_GUARD));

	/*
	 * Search the hash chain to find a matching segment for the
	 * given address.  If found, remove the segment from the
	 * hash chain and insert it into the freelist.  Otherwise,
	 * we panic since the caller has given us a bogus address.
	 */
	SKR_LOCK(skr);
	sgb = SKMEM_REGION_HASH(skr, addr);
	TAILQ_FOREACH_SAFE(sg, &sgb->sgb_head, sg_link, tsg) {
		ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
		if (sg->sg_start == (mach_vm_address_t)addr) {
			TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link);
			sg->sg_link.tqe_next = NULL;
			sg->sg_link.tqe_prev = NULL;
			break;
		}
	}

	ASSERT(sg != NULL);
	if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) {
		ASSERT(skr->skr_w_meminuse >= skr->skr_seg_size);
		skr->skr_w_meminuse -= skr->skr_seg_size;
	}
	sksegment_freelist_insert(skr, sg, FALSE);

	ASSERT(skr->skr_seginuse != 0);
	skr->skr_seginuse--;
	skr->skr_meminuse -= skr->skr_seg_size;
	skr->skr_free++;

#if SK_LOG
	SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
	    SK_KVA(skr), SK_KVA(sg));
	if (skr->skr_mirror == NULL ||
	    !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) {
		SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx)",
		    sg->sg_index, SK_KVA(addr),
		    SK_KVA((uintptr_t)addr + skr->skr_seg_size));
	} else {
		SK_DF(SK_VERB_MEM_REGION, "  [%u] sg 0x%llx [0x%llx-0x%llx) "
		    "mirrored", sg->sg_index, SK_KVA(sg), SK_KVA(addr),
		    SK_KVA((uintptr_t)addr + skr->skr_seg_size));
	}
#endif /* SK_LOG */

	/*
	 * If mirroring, also free shadow object in slave region.
	 */
	if (skr->skr_mirror != NULL) {
		ASSERT(maddr != NULL);
		ASSERT(skr->skr_mirror != skr);
		ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
		ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED);
		skmem_region_free(skr->skr_mirror, maddr, NULL);
	}

	/* wake up any blocked threads waiting for a segment */
	if (skr->skr_seg_waiters != 0) {
		SK_DF(SK_VERB_MEM_REGION,
		    "sg 0x%llx waking up %u waiters", SK_KVA(sg),
		    skr->skr_seg_waiters);
		skr->skr_seg_waiters = 0;
		wakeup(&skr->skr_seg_free);
	}
	SKR_UNLOCK(skr);
}

__attribute__((always_inline))
static inline void
skmem_region_retain_locked(struct skmem_region *skr)
{
	SKR_LOCK_ASSERT_HELD(skr);
	skr->skr_refcnt++;
	ASSERT(skr->skr_refcnt != 0);
}

/*
 * Retain a region.
 */
void
skmem_region_retain(struct skmem_region *skr)
{
	SKR_LOCK(skr);
	skmem_region_retain_locked(skr);
	SKR_UNLOCK(skr);
}

__attribute__((always_inline))
static inline boolean_t
skmem_region_release_locked(struct skmem_region *skr)
{
	SKR_LOCK_ASSERT_HELD(skr);
	ASSERT(skr->skr_refcnt != 0);
	if (--skr->skr_refcnt == 0) {
		skmem_region_destroy(skr);
		return TRUE;
	}
	return FALSE;
}

/*
 * Release (and potentially destroy) a region.
 */
boolean_t
skmem_region_release(struct skmem_region *skr)
{
	boolean_t lastref;

	SKR_LOCK(skr);
	if (!(lastref = skmem_region_release_locked(skr))) {
		SKR_UNLOCK(skr);
	}

	return lastref;
}
1496 
1497 /*
1498  * Depopulate the segment freelist.
1499  */
1500 static void
skmem_region_depopulate(struct skmem_region * skr)1501 skmem_region_depopulate(struct skmem_region *skr)
1502 {
1503 	struct sksegment *sg, *tsg;
1504 
1505 	SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ",
1506 	    skr->skr_name, SK_KVA(skr));
1507 
1508 	SKR_LOCK_ASSERT_HELD(skr);
1509 	ASSERT(skr->skr_seg_bmap_len != 0 || (skr->skr_mode & SKR_MODE_PSEUDO));
1510 
1511 	TAILQ_FOREACH_SAFE(sg, &skr->skr_seg_free, sg_link, tsg) {
1512 		struct sksegment *sg0;
1513 		uint32_t i;
1514 
1515 		i = sg->sg_index;
1516 		sg0 = sksegment_freelist_remove(skr, sg, 0, TRUE);
1517 		VERIFY(sg0 == sg);
1518 
1519 		sksegment_destroy(skr, sg);
1520 		ASSERT(bit_test(skr->skr_seg_bmap[i / BMAPSZ], i % BMAPSZ));
1521 	}
1522 }
1523 
1524 /*
1525  * Free tree segment compare routine.
1526  */
1527 static int
sksegment_cmp(const struct sksegment * sg1,const struct sksegment * sg2)1528 sksegment_cmp(const struct sksegment *sg1, const struct sksegment *sg2)
1529 {
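	/*
	 * sg_index is uint32_t, so this subtraction is performed in
	 * unsigned arithmetic and relies on the conversion back to int
	 * (two's complement on all supported targets) to go negative
	 * when sg1 < sg2.  Indices are bounded by skr_seg_max_cnt,
	 * well below INT_MAX, so the result is unambiguous in practice.
	 */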
1530 	return sg1->sg_index - sg2->sg_index;
1531 }
1532 
1533 /*
1534  * Create a segment.
1535  *
1536  * Upon success, clear the bit for the segment's index in skr_seg_bmap bitmap.
1537  */
1538 static struct sksegment *
1539 sksegment_create(struct skmem_region *skr, uint32_t i)
1540 {
1541 	struct sksegment *sg = NULL;
1542 	bitmap_t *bmap;
1543 
1544 	SKR_LOCK_ASSERT_HELD(skr);
1545 
1546 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1547 	ASSERT(i < skr->skr_seg_max_cnt);
1548 	ASSERT(skr->skr_reg != NULL);
1549 	ASSERT(skr->skr_seg_size == round_page(skr->skr_seg_size));
1550 
1551 	bmap = &skr->skr_seg_bmap[i / BMAPSZ];
1552 	ASSERT(bit_test(*bmap, i % BMAPSZ));
1553 
1554 	sg = skmem_cache_alloc(skmem_sg_cache, SKMEM_SLEEP);
1555 	bzero(sg, sg_size);
1556 
1557 	sg->sg_region = skr;
1558 	sg->sg_index = i;
1559 	sg->sg_state = SKSEG_STATE_DETACHED;
1560 
1561 	/* claim it (clear bit) */
1562 	bit_clear(*bmap, i % BMAPSZ);
1563 
1564 	SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx) 0x%b", i,
1565 	    SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), skr->skr_mode,
1566 	    SKR_MODE_BITS);
1567 
1568 	return sg;
1569 }
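/*
 * Worked example of the bitmap indexing above (assuming the usual
 * BMAPSZ of 64 bits per bitmap_t word): segment index i = 131 lives
 * in word skr_seg_bmap[131 / 64] = skr_seg_bmap[2] at bit position
 * 131 % 64 = 3.  A set bit means the index is vacant; creation
 * clears the bit to claim it, and sksegment_destroy() sets it again.
 */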
1570 
1571 /*
1572  * Destroy a segment.
1573  *
1574  * Set the bit for the segment's index in skr_seg_bmap bitmap,
1575  * indicating that it is now vacant.
1576  */
1577 static void
1578 sksegment_destroy(struct skmem_region *skr, struct sksegment *sg)
1579 {
1580 	uint32_t i = sg->sg_index;
1581 	bitmap_t *bmap;
1582 
1583 	SKR_LOCK_ASSERT_HELD(skr);
1584 
1585 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1586 	ASSERT(skr == sg->sg_region);
1587 	ASSERT(skr->skr_reg != NULL);
1588 	ASSERT(sg->sg_type == SKSEG_TYPE_DESTROYED);
1589 	ASSERT(i < skr->skr_seg_max_cnt);
1590 
1591 	bmap = &skr->skr_seg_bmap[i / BMAPSZ];
1592 	ASSERT(!bit_test(*bmap, i % BMAPSZ));
1593 
1594 	SK_DF(SK_VERB_MEM_REGION, "  [%u] [0x%llx-0x%llx) 0x%b",
1595 	    i, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end),
1596 	    skr->skr_mode, SKR_MODE_BITS);
1597 
1598 	/*
1599 	 * Undo what's done earlier at segment creation time.
1600 	 */
1601 
1602 	ASSERT(sg->sg_md == NULL);
1603 	ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1604 	ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1605 
1606 	/* release it (set bit) */
1607 	bit_set(*bmap, i % BMAPSZ);
1608 
1609 	skmem_cache_free(skmem_sg_cache, sg);
1610 }
1611 
1612 /*
1613  * Insert a segment into the freelist (freeing the segment).
1614  */
1615 static void
1616 sksegment_freelist_insert(struct skmem_region *skr, struct sksegment *sg,
1617     boolean_t populating)
1618 {
1619 	SKR_LOCK_ASSERT_HELD(skr);
1620 
1621 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1622 	ASSERT(sg->sg_type != SKSEG_TYPE_FREE);
1623 	ASSERT(skr == sg->sg_region);
1624 	ASSERT(skr->skr_reg != NULL);
1625 	ASSERT(sg->sg_index < skr->skr_seg_max_cnt);
1626 
1627 	/*
1628 	 * If the region is being populated, then we're done.
1629 	 */
1630 	if (__improbable(populating)) {
1631 		ASSERT(sg->sg_md == NULL);
1632 		ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1633 		ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1634 	} else {
1635 		IOSKMemoryBufferRef md;
1636 		IOReturn err;
1637 
1638 		ASSERT(sg->sg_md != NULL);
1639 		ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1640 
1641 		/*
1642 		 * Let the client remove the memory from the IOMMU, and unwire it.
1643 		 */
1644 		if (skr->skr_seg_dtor != NULL) {
1645 			skr->skr_seg_dtor(sg, sg->sg_md, skr->skr_private);
1646 		}
1647 
1648 		ASSERT(sg->sg_state == SKSEG_STATE_MAPPED ||
1649 		    sg->sg_state == SKSEG_STATE_MAPPED_WIRED);
1650 
1651 		IOSKRegionClearBufferDebug(skr->skr_reg, sg->sg_index, &md);
1652 		VERIFY(sg->sg_md == md);
1653 
1654 		/* if persistent, unwire this memory now */
1655 		if (skr->skr_mode & SKR_MODE_PERSISTENT) {
1656 			err = IOSKMemoryUnwire(md);
1657 			if (err != kIOReturnSuccess) {
1658 				panic("Failed to unwire md %p, err %d", md, err);
1659 			}
1660 		}
1661 
1662 		/* mark memory as empty/discarded for consistency */
1663 		err = IOSKMemoryDiscard(md);
1664 		if (err != kIOReturnSuccess) {
1665 			panic("Failed to discard md %p, err %d", md, err);
1666 		}
1667 
1668 		IOSKMemoryDestroy(md);
1669 		sg->sg_md = NULL;
1670 		sg->sg_start = sg->sg_end = 0;
1671 		sg->sg_state = SKSEG_STATE_DETACHED;
1672 
1673 		ASSERT(skr->skr_memtotal >= skr->skr_seg_size);
1674 		skr->skr_memtotal -= skr->skr_seg_size;
1675 	}
1676 
1677 	sg->sg_type = SKSEG_TYPE_FREE;
1678 	ASSERT(sg->sg_link.tqe_next == NULL);
1679 	ASSERT(sg->sg_link.tqe_prev == NULL);
1680 	TAILQ_INSERT_TAIL(&skr->skr_seg_free, sg, sg_link);
1681 	ASSERT(sg->sg_node.rbe_left == NULL);
1682 	ASSERT(sg->sg_node.rbe_right == NULL);
1683 	ASSERT(sg->sg_node.rbe_parent == NULL);
1684 	RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
1685 	++skr->skr_seg_free_cnt;
1686 	ASSERT(skr->skr_seg_free_cnt <= skr->skr_seg_max_cnt);
1687 }
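/*
 * The freelist is maintained in two structures in parallel: the
 * TAILQ (skr_seg_free) gives cheap FIFO insertion and removal, while
 * the red-black tree (skr_seg_tfree, keyed on sg_index through
 * sksegment_cmp) presumably lets callers locate a free segment by
 * index in O(log n) instead of a linear walk.  The two must always
 * be updated together, as done above and in
 * sksegment_freelist_remove() below.
 */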
1688 
1689 /*
1690  * Remove a segment from the freelist (allocating the segment).
1691  */
1692 static struct sksegment *
1693 sksegment_freelist_remove(struct skmem_region *skr, struct sksegment *sg,
1694     uint32_t skmflag, boolean_t purging)
1695 {
1696 #pragma unused(skmflag)
1697 	mach_vm_address_t segstart;
1698 	IOReturn err;
1699 
1700 	SKR_LOCK_ASSERT_HELD(skr);
1701 
1702 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1703 	ASSERT(sg != NULL);
1704 	ASSERT(skr == sg->sg_region);
1705 	ASSERT(skr->skr_reg != NULL);
1706 	ASSERT(sg->sg_type == SKSEG_TYPE_FREE);
1707 	ASSERT(sg->sg_index < skr->skr_seg_max_cnt);
1708 
1709 #if (DEVELOPMENT || DEBUG)
1710 	uint64_t mtbf = skmem_region_get_mtbf();
1711 	/*
1712 	 * MTBF doesn't apply when SKMEM_PANIC is set as caller would assert.
1713 	 * MTBF doesn't apply when SKMEM_PANIC is set, as the caller would assert.
1714 	if (__improbable(mtbf != 0 && !purging &&
1715 	    (net_uptime_ms() % mtbf) == 0 &&
1716 	    !(skmflag & SKMEM_PANIC))) {
1717 		SK_ERR("skr \"%s\" 0x%llx sg 0x%llx MTBF failure",
1718 		    skr->skr_name, SK_KVA(skr), SK_KVA(sg));
1719 		net_update_uptime();
1720 		return NULL;
1721 	}
1722 #endif /* (DEVELOPMENT || DEBUG) */
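	/*
	 * Illustrative reading of the MTBF check above: with
	 * skmem_region_mtbf set to, say, 1000 (msec), any allocation
	 * whose uptime in milliseconds is an exact multiple of 1000
	 * fails artificially, i.e. roughly one forced failure per
	 * second of allocation activity for fault-injection testing.
	 * SKMEM_PANIC and purging requests are exempt, since those
	 * callers cannot tolerate failure.
	 */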
1723 
1724 	TAILQ_REMOVE(&skr->skr_seg_free, sg, sg_link);
1725 	sg->sg_link.tqe_next = NULL;
1726 	sg->sg_link.tqe_prev = NULL;
1727 	RB_REMOVE(segtfreehead, &skr->skr_seg_tfree, sg);
1728 	sg->sg_node.rbe_left = NULL;
1729 	sg->sg_node.rbe_right = NULL;
1730 	sg->sg_node.rbe_parent = NULL;
1731 
1732 	ASSERT(skr->skr_seg_free_cnt != 0);
1733 	--skr->skr_seg_free_cnt;
1734 
1735 	/*
1736 	 * If the region is being depopulated, then we're done.
1737 	 */
1738 	if (__improbable(purging)) {
1739 		ASSERT(sg->sg_md == NULL);
1740 		ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1741 		ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1742 		sg->sg_type = SKSEG_TYPE_DESTROYED;
1743 		return sg;
1744 	}
1745 
1746 	ASSERT(sg->sg_md == NULL);
1747 	ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1748 	ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1749 
1750 	/* created as non-volatile (mapped) upon success */
1751 	if ((sg->sg_md = IOSKMemoryBufferCreate(skr->skr_seg_size,
1752 	    &skr->skr_bufspec, &segstart)) == NULL) {
1753 		ASSERT(sg->sg_type == SKSEG_TYPE_FREE);
1754 		if (skmflag & SKMEM_PANIC) {
1755 			/* if the caller insists on success, then panic */
1756 			panic_plain("\"%s\": skr 0x%p sg 0x%p (idx %u) unable "
1757 			    "to satisfy mandatory allocation\n", skr->skr_name,
1758 			    skr, sg, sg->sg_index);
1759 			/* NOTREACHED */
1760 			__builtin_unreachable();
1761 		}
1762 		/* reinsert this segment to freelist */
1763 		ASSERT(sg->sg_link.tqe_next == NULL);
1764 		ASSERT(sg->sg_link.tqe_prev == NULL);
1765 		TAILQ_INSERT_HEAD(&skr->skr_seg_free, sg, sg_link);
1766 		ASSERT(sg->sg_node.rbe_left == NULL);
1767 		ASSERT(sg->sg_node.rbe_right == NULL);
1768 		ASSERT(sg->sg_node.rbe_parent == NULL);
1769 		RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
1770 		++skr->skr_seg_free_cnt;
1771 		return NULL;
1772 	}
1773 
1774 	sg->sg_start = segstart;
1775 	sg->sg_end = (segstart + skr->skr_seg_size);
1776 	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1777 
1778 	/* mark memory as non-volatile just to be consistent */
1779 	err = IOSKMemoryReclaim(sg->sg_md);
1780 	if (err != kIOReturnSuccess) {
1781 		panic("Failed to reclaim md %p, err %d", sg->sg_md, err);
1782 	}
1783 
1784 	/* if persistent, wire down its memory now */
1785 	if (skr->skr_mode & SKR_MODE_PERSISTENT) {
1786 		err = IOSKMemoryWire(sg->sg_md);
1787 		if (err != kIOReturnSuccess) {
1788 			panic("Failed to wire md %p, err %d", sg->sg_md, err);
1789 		}
1790 	}
1791 
1792 	err = IOSKRegionSetBuffer(skr->skr_reg, sg->sg_index, sg->sg_md);
1793 	if (err != kIOReturnSuccess) {
1794 		panic("Failed to set md %p, err %d", sg->sg_md, err);
1795 	}
1796 
1797 	/*
1798 	 * Let the client wire it and insert it into the IOMMU, if applicable.
1799 	 * Try to find out if it's wired and set the right state.
1800 	 */
1801 	if (skr->skr_seg_ctor != NULL) {
1802 		skr->skr_seg_ctor(sg, sg->sg_md, skr->skr_private);
1803 	}
1804 
1805 	sg->sg_state = IOSKBufferIsWired(sg->sg_md) ?
1806 	    SKSEG_STATE_MAPPED_WIRED : SKSEG_STATE_MAPPED;
1807 
1808 	skr->skr_memtotal += skr->skr_seg_size;
1809 
1810 	ASSERT(sg->sg_md != NULL);
1811 	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1812 
1813 	sg->sg_type = SKSEG_TYPE_ALLOC;
1814 	return sg;
1815 }
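/*
 * The allocation path above, summarized as a sketch (names as in
 * this file):
 *
 *	sg = sksegment_freelist_remove(skr, sg, skmflag, FALSE);
 *	    IOSKMemoryBufferCreate()    backing memory created
 *	    IOSKMemoryReclaim()         marked non-volatile
 *	    IOSKMemoryWire()            only if SKR_MODE_PERSISTENT
 *	    IOSKRegionSetBuffer()       attached to the IOSKRegion
 *	    skr_seg_ctor()              client wires / inserts to IOMMU
 *	    sg_state = MAPPED or MAPPED_WIRED, sg_type = ALLOC
 *
 * sksegment_freelist_insert() undoes these steps in roughly the
 * reverse order when the segment is freed.
 */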
1816 
1817 /*
1818  * Find the first available index and allocate a segment at that index.
1819  */
1820 static struct sksegment *
1821 sksegment_freelist_grow(struct skmem_region *skr)
1822 {
1823 	struct sksegment *sg = NULL;
1824 	uint32_t i, j, idx;
1825 
1826 	SKR_LOCK_ASSERT_HELD(skr);
1827 
1828 	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1829 	ASSERT(skr->skr_seg_bmap_len != 0);
1830 	ASSERT(skr->skr_seg_max_cnt != 0);
1831 
1832 	for (i = 0; i < skr->skr_seg_bmap_len; i++) {
1833 		bitmap_t *bmap, mask;
1834 		uint32_t end = (BMAPSZ - 1);
1835 
1836 		if (i == (skr->skr_seg_bmap_len - 1)) {
1837 			end = (skr->skr_seg_max_cnt - 1) % BMAPSZ;
1838 		}
1839 
1840 		bmap = &skr->skr_seg_bmap[i];
1841 		mask = BMASK64(0, end);
1842 
1843 		j = ffsll((*bmap) & mask);
1844 		if (j == 0) {
1845 			continue;
1846 		}
1847 
1848 		--j;
1849 		idx = (i * BMAPSZ) + j;
1850 
1851 		sg = sksegment_alloc_with_idx(skr, idx);
1852 
1853 		/* we're done */
1854 		break;
1855 	}
1856 
1857 	ASSERT((sg != NULL) || (skr->skr_seginuse == skr->skr_seg_max_cnt));
1858 	return sg;
1859 }
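/*
 * Worked example of the scan above, assuming BMAPSZ == 64 and
 * skr_seg_max_cnt == 100: words 0 and 1 cover indices 0-63 and
 * 64-99, so for the last word (i == 1) end = (100 - 1) % 64 = 35 and
 * the mask keeps only the 36 valid bit positions.  ffsll() returns
 * the 1-based position of the lowest set (vacant) bit, hence the
 * --j before computing idx = (i * BMAPSZ) + j.
 */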
1860 
1861 /*
1862  * Create a single segment at a specific index and add it to the freelist.
1863  */
1864 static struct sksegment *
1865 sksegment_alloc_with_idx(struct skmem_region *skr, uint32_t idx)
1866 {
1867 	struct sksegment *sg;
1868 
1869 	SKR_LOCK_ASSERT_HELD(skr);
1870 
1871 	if (!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ)) {
1872 		panic("%s: '%s' (%p) idx %u (out of %u) is already allocated",
1873 		    __func__, skr->skr_name, (void *)skr, idx,
1874 		    (skr->skr_seg_max_cnt - 1));
1875 		/* NOTREACHED */
1876 		__builtin_unreachable();
1877 	}
1878 
1879 	/* must not fail, blocking alloc */
1880 	sg = sksegment_create(skr, idx);
1881 	VERIFY(sg != NULL);
1882 	VERIFY(!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ));
1883 
1884 	/* populate the freelist */
1885 	sksegment_freelist_insert(skr, sg, TRUE);
1886 	ASSERT(sg == TAILQ_LAST(&skr->skr_seg_free, segfreehead));
1887 #if (DEVELOPMENT || DEBUG)
1888 	struct sksegment sg_key = { .sg_index = sg->sg_index };
1889 	ASSERT(sg == RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key));
1890 #endif /* (DEVELOPMENT || DEBUG) */
1891 
1892 	SK_DF(SK_VERB_MEM_REGION, "sg %u/%u", (idx + 1), skr->skr_seg_max_cnt);
1893 
1894 	return sg;
1895 }
1896 
1897 /*
1898  * Rescale the region's allocated-address hash table.
1899  */
1900 static void
1901 skmem_region_hash_rescale(struct skmem_region *skr)
1902 {
1903 	struct sksegment_bkt *old_table, *new_table;
1904 	size_t old_size, new_size;
1905 	uint32_t i, moved = 0;
1906 
1907 	if (skr->skr_mode & SKR_MODE_PSEUDO) {
1908 		ASSERT(skr->skr_hash_table == NULL);
1909 		/* this is a no-op for pseudo regions */
1910 		return;
1911 	}
1912 
1913 	ASSERT(skr->skr_hash_table != NULL);
1914 	/* insist that we are executing in the update thread call context */
1915 	ASSERT(sk_is_region_update_protected());
1916 
1917 	/*
1918 	 * To get small average lookup time (lookup depth near 1.0), the hash
1919 	 * table size should be roughly the same as (though not necessarily
1920 	 * equal to) the region size.
1921 	 */
1922 	new_size = MAX(skr->skr_hash_initial,
1923 	    (1 << (flsll(3 * skr->skr_seginuse + 4) - 2)));
1924 	new_size = MIN(skr->skr_hash_limit, new_size);
1925 	old_size = (skr->skr_hash_mask + 1);
1926 
1927 	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
1928 		return;
1929 	}
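	/*
	 * Worked example of the sizing heuristic above: with 100
	 * segments in use, 3 * 100 + 4 = 304 and flsll(304) = 9, so
	 * the candidate size is 1 << (9 - 2) = 128 buckets, clamped
	 * to [skr_hash_initial, skr_hash_limit].  The early return
	 * above skips the rescale whenever the new size is within a
	 * factor of two of the current size, avoiding a rebuild for
	 * small drifts in usage.
	 */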
1930 
1931 	new_table = sk_alloc_type_array(struct sksegment_bkt, new_size,
1932 	    Z_NOWAIT, skmem_tag_segment_hash);
1933 	if (__improbable(new_table == NULL)) {
1934 		return;
1935 	}
1936 
1937 	for (i = 0; i < new_size; i++) {
1938 		TAILQ_INIT(&new_table[i].sgb_head);
1939 	}
1940 
1941 	SKR_LOCK(skr);
1942 
1943 	old_size = (skr->skr_hash_mask + 1);
1944 	old_table = skr->skr_hash_table;
1945 
1946 	skr->skr_hash_mask = (uint32_t)(new_size - 1);
1947 	skr->skr_hash_table = new_table;
1948 	skr->skr_rescale++;
1949 
1950 	for (i = 0; i < old_size; i++) {
1951 		struct sksegment_bkt *sgb = &old_table[i];
1952 		struct sksegment_bkt *new_sgb;
1953 		struct sksegment *sg;
1954 
1955 		while ((sg = TAILQ_FIRST(&sgb->sgb_head)) != NULL) {
1956 			TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link);
1957 			ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1958 			new_sgb = SKMEM_REGION_HASH(skr, sg->sg_start);
1959 			TAILQ_INSERT_TAIL(&new_sgb->sgb_head, sg, sg_link);
1960 			++moved;
1961 		}
1962 		ASSERT(TAILQ_EMPTY(&sgb->sgb_head));
1963 	}
1964 
1965 	SK_DF(SK_VERB_MEM_REGION,
1966 	    "skr 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skr),
1967 	    (uint32_t)old_size, (uint32_t)new_size, moved);
1968 
1969 	SKR_UNLOCK(skr);
1970 
1971 	sk_free_type_array(struct sksegment_bkt, old_size, old_table);
1972 }
1973 
1974 /*
1975  * Apply a function to operate on all regions.
1976  */
1977 static void
1978 skmem_region_applyall(void (*func)(struct skmem_region *))
1979 {
1980 	struct skmem_region *skr;
1981 
1982 	net_update_uptime();
1983 
1984 	SKMEM_REGION_LOCK();
1985 	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
1986 		func(skr);
1987 	}
1988 	SKMEM_REGION_UNLOCK();
1989 }
1990 
1991 static void
1992 skmem_region_update(struct skmem_region *skr)
1993 {
1994 	SKMEM_REGION_LOCK_ASSERT_HELD();
1995 
1996 	/* insist that we are executing in the update thread call context */
1997 	ASSERT(sk_is_region_update_protected());
1998 
1999 	SKR_LOCK(skr);
2000 	/*
2001 	 * If there are threads blocked waiting for an available
2002 	 * segment, wake them up periodically so they can issue
2003 	 * another skmem_cache_reap() to reclaim resources cached
2004 	 * by skmem_cache.
2005 	 */
2006 	if (skr->skr_seg_waiters != 0) {
2007 		SK_DF(SK_VERB_MEM_REGION,
2008 		    "waking up %u waiters to reclaim", skr->skr_seg_waiters);
2009 		skr->skr_seg_waiters = 0;
2010 		wakeup(&skr->skr_seg_free);
2011 	}
2012 	SKR_UNLOCK(skr);
2013 
2014 	/*
2015 	 * Rescale the hash table if needed.
2016 	 */
2017 	skmem_region_hash_rescale(skr);
2018 }
2019 
2020 /*
2021  * Thread call callback for update.
2022  */
2023 static void
2024 skmem_region_update_func(thread_call_param_t dummy, thread_call_param_t arg)
2025 {
2026 #pragma unused(dummy, arg)
2027 	sk_protect_t protect;
2028 
2029 	protect = sk_region_update_protect();
2030 	skmem_region_applyall(skmem_region_update);
2031 	sk_region_update_unprotect(protect);
2032 
2033 	skmem_dispatch(skmem_region_update_tc, NULL,
2034 	    (skmem_region_update_interval * NSEC_PER_SEC));
2035 }
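/*
 * The update thread call re-arms itself: each pass schedules the
 * next run skmem_region_update_interval seconds out (converted to
 * nanoseconds via NSEC_PER_SEC for skmem_dispatch), so region
 * maintenance keeps ticking without a dedicated thread.
 */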
2036 
2037 boolean_t
2038 skmem_region_for_pp(skmem_region_id_t id)
2039 {
2040 	int i;
2041 
2042 	for (i = 0; i < SKMEM_PP_REGIONS; i++) {
2043 		if (id == skmem_pp_region_ids[i]) {
2044 			return TRUE;
2045 		}
2046 	}
2047 	return FALSE;
2048 }
2049 
2050 void
2051 skmem_region_get_stats(struct skmem_region *skr, struct sk_stats_region *sreg)
2052 {
2053 	bzero(sreg, sizeof(*sreg));
2054 
2055 	(void) snprintf(sreg->sreg_name, sizeof(sreg->sreg_name),
2056 	    "%s", skr->skr_name);
2057 	uuid_copy(sreg->sreg_uuid, skr->skr_uuid);
2058 	sreg->sreg_id = (sk_stats_region_id_t)skr->skr_id;
2059 	sreg->sreg_mode = skr->skr_mode;
2060 
2061 	sreg->sreg_r_seg_size = skr->skr_params.srp_r_seg_size;
2062 	sreg->sreg_c_seg_size = skr->skr_seg_size;
2063 	sreg->sreg_seg_cnt = skr->skr_seg_max_cnt;
2064 	sreg->sreg_seg_objs = skr->skr_seg_objs;
2065 	sreg->sreg_r_obj_size = skr->skr_r_obj_size;
2066 	sreg->sreg_r_obj_cnt = skr->skr_r_obj_cnt;
2067 	sreg->sreg_c_obj_size = skr->skr_c_obj_size;
2068 	sreg->sreg_c_obj_cnt = skr->skr_c_obj_cnt;
2069 	sreg->sreg_align = skr->skr_align;
2070 	sreg->sreg_max_frags = skr->skr_max_frags;
2071 
2072 	sreg->sreg_meminuse = skr->skr_meminuse;
2073 	sreg->sreg_w_meminuse = skr->skr_w_meminuse;
2074 	sreg->sreg_memtotal = skr->skr_memtotal;
2075 	sreg->sreg_seginuse = skr->skr_seginuse;
2076 	sreg->sreg_rescale = skr->skr_rescale;
2077 	sreg->sreg_hash_size = (skr->skr_hash_mask + 1);
2078 	sreg->sreg_alloc = skr->skr_alloc;
2079 	sreg->sreg_free = skr->skr_free;
2080 }
2081 
2082 static size_t
2083 skmem_region_mib_get_stats(struct skmem_region *skr, void *out, size_t len)
2084 {
2085 	size_t actual_space = sizeof(struct sk_stats_region);
2086 	struct sk_stats_region *sreg = out;
2087 
2088 	if (out == NULL || len < actual_space) {
2089 		goto done;
2090 	}
2091 
2092 	skmem_region_get_stats(skr, sreg);
2093 
2094 done:
2095 	return actual_space;
2096 }
2097 
2098 static int
2099 skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS
2100 {
2101 #pragma unused(arg1, arg2, oidp)
2102 	struct skmem_region *skr;
2103 	size_t actual_space;
2104 	size_t buffer_space;
2105 	size_t allocated_space;
2106 	caddr_t buffer = NULL;
2107 	caddr_t scan;
2108 	int error = 0;
2109 
2110 	if (!kauth_cred_issuser(kauth_cred_get())) {
2111 		return EPERM;
2112 	}
2113 
2114 	net_update_uptime();
2115 	buffer_space = req->oldlen;
2116 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
2117 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
2118 			buffer_space = SK_SYSCTL_ALLOC_MAX;
2119 		}
2120 		allocated_space = buffer_space;
2121 		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_region_mib);
2122 		if (__improbable(buffer == NULL)) {
2123 			return ENOBUFS;
2124 		}
2125 	} else if (req->oldptr == USER_ADDR_NULL) {
2126 		buffer_space = 0;
2127 	}
2128 	actual_space = 0;
2129 	scan = buffer;
2130 
2131 	SKMEM_REGION_LOCK();
2132 	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
2133 		size_t size = skmem_region_mib_get_stats(skr, scan, buffer_space);
2134 		if (scan != NULL) {
2135 			if (buffer_space < size) {
2136 				/* supplied buffer too small, stop copying */
2137 				error = ENOMEM;
2138 				break;
2139 			}
2140 			scan += size;
2141 			buffer_space -= size;
2142 		}
2143 		actual_space += size;
2144 	}
2145 	SKMEM_REGION_UNLOCK();
2146 
2147 	if (actual_space != 0) {
2148 		int out_error = SYSCTL_OUT(req, buffer, actual_space);
2149 		if (out_error != 0) {
2150 			error = out_error;
2151 		}
2152 	}
2153 	if (buffer != NULL) {
2154 		sk_free_data(buffer, allocated_space);
2155 	}
2156 
2157 	return error;
2158 }
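/*
 * This handler follows the usual two-pass sysctl convention: a
 * caller first issues the request with a NULL old pointer to learn
 * the required size, then retries with a buffer at least that
 * large.  A hypothetical userland consumer (the sysctl name below
 * is assumed for illustration, not taken from this file) would look
 * like:
 *
 *	size_t len = 0;
 *	sysctlbyname("kern.skywalk.stats.region", NULL, &len, NULL, 0);
 *	void *buf = malloc(len);
 *	sysctlbyname("kern.skywalk.stats.region", buf, &len, NULL, 0);
 *
 * with each sk_stats_region record in the buffer corresponding to
 * one region on skmem_region_head.
 */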
2159 
2160 #if SK_LOG
2161 const char *
2162 skmem_region_id2name(skmem_region_id_t id)
2163 {
2164 	const char *name;
2165 	switch (id) {
2166 	case SKMEM_REGION_SCHEMA:
2167 		name = "SCHEMA";
2168 		break;
2169 
2170 	case SKMEM_REGION_RING:
2171 		name = "RING";
2172 		break;
2173 
2174 	case SKMEM_REGION_BUF_DEF:
2175 		name = "BUF_DEF";
2176 		break;
2177 
2178 	case SKMEM_REGION_BUF_LARGE:
2179 		name = "BUF_LARGE";
2180 		break;
2181 
2182 	case SKMEM_REGION_RXBUF_DEF:
2183 		name = "RXBUF_DEF";
2184 		break;
2185 
2186 	case SKMEM_REGION_RXBUF_LARGE:
2187 		name = "RXBUF_LARGE";
2188 		break;
2189 
2190 	case SKMEM_REGION_TXBUF_DEF:
2191 		name = "TXBUF_DEF";
2192 		break;
2193 
2194 	case SKMEM_REGION_TXBUF_LARGE:
2195 		name = "TXBUF_LARGE";
2196 		break;
2197 
2198 	case SKMEM_REGION_UMD:
2199 		name = "UMD";
2200 		break;
2201 
2202 	case SKMEM_REGION_TXAUSD:
2203 		name = "TXAUSD";
2204 		break;
2205 
2206 	case SKMEM_REGION_RXFUSD:
2207 		name = "RXFUSD";
2208 		break;
2209 
2210 	case SKMEM_REGION_USTATS:
2211 		name = "USTATS";
2212 		break;
2213 
2214 	case SKMEM_REGION_FLOWADV:
2215 		name = "FLOWADV";
2216 		break;
2217 
2218 	case SKMEM_REGION_NEXUSADV:
2219 		name = "NEXUSADV";
2220 		break;
2221 
2222 	case SKMEM_REGION_SYSCTLS:
2223 		name = "SYSCTLS";
2224 		break;
2225 
2226 	case SKMEM_REGION_GUARD_HEAD:
2227 		name = "HEADGUARD";
2228 		break;
2229 
2230 	case SKMEM_REGION_GUARD_TAIL:
2231 		name = "TAILGUARD";
2232 		break;
2233 
2234 	case SKMEM_REGION_KMD:
2235 		name = "KMD";
2236 		break;
2237 
2238 	case SKMEM_REGION_RXKMD:
2239 		name = "RXKMD";
2240 		break;
2241 
2242 	case SKMEM_REGION_TXKMD:
2243 		name = "TXKMD";
2244 		break;
2245 
2246 	case SKMEM_REGION_TXAKSD:
2247 		name = "TXAKSD";
2248 		break;
2249 
2250 	case SKMEM_REGION_RXFKSD:
2251 		name = "RXFKSD";
2252 		break;
2253 
2254 	case SKMEM_REGION_KSTATS:
2255 		name = "KSTATS";
2256 		break;
2257 
2258 	case SKMEM_REGION_KBFT:
2259 		name = "KBFT";
2260 		break;
2261 
2262 	case SKMEM_REGION_UBFT:
2263 		name = "UBFT";
2264 		break;
2265 
2266 	case SKMEM_REGION_RXKBFT:
2267 		name = "RXKBFT";
2268 		break;
2269 
2270 	case SKMEM_REGION_TXKBFT:
2271 		name = "TXKBFT";
2272 		break;
2273 
2274 	case SKMEM_REGION_INTRINSIC:
2275 		name = "INTRINSIC";
2276 		break;
2277 
2278 	default:
2279 		name = "UNKNOWN";
2280 		break;
2281 	}
2282 
2283 	return name;
2284 }
2285 #endif /* SK_LOG */
2286 
2287 #if (DEVELOPMENT || DEBUG)
2288 uint64_t
2289 skmem_region_get_mtbf(void)
2290 {
2291 	return skmem_region_mtbf;
2292 }
2293 
2294 void
2295 skmem_region_set_mtbf(uint64_t newval)
2296 {
2297 	if (newval < SKMEM_REGION_MTBF_MIN) {
2298 		if (newval != 0) {
2299 			newval = SKMEM_REGION_MTBF_MIN;
2300 		}
2301 	} else if (newval > SKMEM_REGION_MTBF_MAX) {
2302 		newval = SKMEM_REGION_MTBF_MAX;
2303 	}
2304 
2305 	if (skmem_region_mtbf != newval) {
2306 		os_atomic_store(&skmem_region_mtbf, newval, release);
2307 		SK_ERR("MTBF set to %llu msec", skmem_region_mtbf);
2308 	}
2309 }
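/*
 * For example, writing 1 via the sysctl below is clamped up to
 * SKMEM_REGION_MTBF_MIN and an oversized value is clamped down to
 * SKMEM_REGION_MTBF_MAX, while writing 0 bypasses the minimum clamp
 * and disables MTBF fault injection entirely.
 */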
2310 
2311 static int
2312 skmem_region_mtbf_sysctl(struct sysctl_oid *oidp, void *arg1, int arg2,
2313     struct sysctl_req *req)
2314 {
2315 #pragma unused(oidp, arg1, arg2)
2316 	int changed, error;
2317 	uint64_t newval;
2318 
2319 	_CASSERT(sizeof(skmem_region_mtbf) == sizeof(uint64_t));
2320 	if ((error = sysctl_io_number(req, skmem_region_mtbf,
2321 	    sizeof(uint64_t), &newval, &changed)) == 0) {
2322 		if (changed) {
2323 			skmem_region_set_mtbf(newval);
2324 		}
2325 	}
2326 	return error;
2327 }
2328 #endif /* (DEVELOPMENT || DEBUG) */
2329