/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/* BEGIN CSTYLED */
/*
 * A region represents a collection of one or more similarly-sized memory
 * segments, each of which is a contiguous range of integers. A segment
 * is either allocated or free, and is treated as disjoint from all other
 * segments. That is, the contiguity applies only at the segment level,
 * and a region with multiple segments is not contiguous at the region level.
 * A segment always belongs to either the segment freelist or the
 * allocated-address hash chain, as described below.
 *
 * The optional SKMEM_REGION_CR_NOREDIRECT flag indicates that the region
 * stays intact even after a defunct. Otherwise, the segments belonging
 * to the region will be freed at defunct time, and the span covered by
 * the region will be redirected to zero-filled anonymous memory.
 *
 * Memory for a region is always created as pageable and purgeable. It is
 * the client's responsibility to prepare (wire) it, and optionally insert
 * it into the IOMMU, at segment construction time. When the segment is
 * freed, the client is responsible for removing it from the IOMMU (if
 * needed) and completing (unwiring) it.
 *
 * When the region is created with SKMEM_REGION_CR_PERSISTENT, the memory
 * is immediately wired upon allocation (segment removed from the freelist),
 * and unwired when the memory is discarded (segment inserted into the
 * freelist).
 *
 * The chronological life cycle of a segment is as follows:
 *
 *     SKSEG_STATE_DETACHED
 *     SKSEG_STATE_{MAPPED,MAPPED_WIRED}
 *     [segment allocated, usable by client]
 *     ...
 *     [client frees segment]
 *     SKSEG_STATE_{MAPPED,MAPPED_WIRED}
 *     [reclaim]
 *     SKSEG_STATE_DETACHED
 *
 * The region can also be marked as user-mappable (SKMEM_REGION_CR_MMAPOK);
 * this allows it to be further marked with SKMEM_REGION_CR_UREADONLY to
 * prevent modifications by the user task. Only user-mappable regions will
 * be considered for inclusion during skmem_arena_mmap().
 *
 * Every skmem allocator has a region as its slab supplier. Each slab is
 * exactly a segment. The allocator uses skmem_region_{alloc,free}() to
 * create and destroy slabs.
 *
 * A region may be mirrored by another region; the latter acts as the master
 * controller for both regions. Mirrored (slave) regions cannot be used
 * directly by the skmem allocator. The region mirroring technique is used
 * for managing shadow objects {umd,kmd} and {usd,ksd}, where an object in
 * one region has the same size and lifetime as its shadow counterpart.
 *
 * CREATION/DESTRUCTION:
 *
 * At creation time, all segments are allocated and immediately inserted
 * into the freelist. Allocating a purgeable segment has very little cost,
 * as it is not backed by physical memory until it is accessed. Immediate
 * insertion into the freelist then causes the segment's mapping to be
 * torn down.
 *
 * At destruction time, the freelist is emptied, and each segment is then
 * destroyed. The system will assert if it detects outstanding segments
 * that have not been returned to the region (not freed by the client).
 *
 * ALLOCATION:
 *
 * Allocating involves searching the freelist for a segment; if found, the
 * segment is removed from the freelist and inserted into the allocated-
 * address hash chain. The address of the memory object represented by
 * the segment is used as the hash key. The allocated-address hash chain
 * is needed because we return the address of the memory object, not the
 * segment itself, to the client.
 *
 * DEALLOCATION:
 *
 * Freeing a memory object causes the hash chain to be searched for a
 * matching segment. The system will assert if a segment cannot be found,
 * since that indicates that the memory object address is invalid. Once
 * found, the segment is removed from the allocated-address hash chain and
 * inserted into the freelist.
 *
 * Segment allocation and deallocation can be expensive. Because of this,
 * we expect that most clients will utilize the skmem_cache slab allocator
 * as the frontend instead.
 */
/* END CSTYLED */
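/*
 * Illustrative sketch of the client-side flow described above (not part
 * of this file; the ctor/dtor names and error handling are assumptions
 * for illustration only):
 *
 *     struct skmem_region_params srp = ...;   // caller-filled template
 *     skmem_region_params_config(&srp);       // compute effective sizes
 *     struct skmem_region *skr = skmem_region_create("example", &srp,
 *         my_seg_ctor, my_seg_dtor, NULL);    // returns with one reference
 *
 *     struct sksegment *sg;
 *     void *addr = skmem_region_alloc(skr, NULL, &sg, NULL, SKMEM_SLEEP);
 *     ...                                     // use [addr, addr + seg_size)
 *     skmem_region_free(skr, addr, NULL);     // back to the freelist
 *
 *     (void) skmem_region_release(skr);       // last ref destroys region
 */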

#include <skywalk/os_skywalk_private.h>
#define _FN_KPRINTF             /* don't redefine kprintf() */
#include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */

static void skmem_region_destroy(struct skmem_region *skr);
static void skmem_region_depopulate(struct skmem_region *);
static int sksegment_cmp(const struct sksegment *, const struct sksegment *);
static struct sksegment *sksegment_create(struct skmem_region *, uint32_t);
static void sksegment_destroy(struct skmem_region *, struct sksegment *);
static void sksegment_freelist_insert(struct skmem_region *,
    struct sksegment *, boolean_t);
static struct sksegment *sksegment_freelist_remove(struct skmem_region *,
    struct sksegment *, uint32_t, boolean_t);
static struct sksegment *sksegment_freelist_grow(struct skmem_region *);
static struct sksegment *sksegment_alloc_with_idx(struct skmem_region *,
    uint32_t);
static void *skmem_region_alloc_common(struct skmem_region *,
    struct sksegment *);
static void *skmem_region_mirror_alloc(struct skmem_region *,
    struct sksegment *, struct sksegment **);
static void skmem_region_applyall(void (*)(struct skmem_region *));
static void skmem_region_update(struct skmem_region *);
static void skmem_region_update_func(thread_call_param_t, thread_call_param_t);
static inline void skmem_region_retain_locked(struct skmem_region *);
static inline boolean_t skmem_region_release_locked(struct skmem_region *);
static int skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS;

RB_PROTOTYPE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp);
RB_GENERATE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp);

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, region,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, skmem_region_mib_get_sysctl, "S,sk_stats_region",
    "Skywalk region statistics");

static LCK_ATTR_DECLARE(skmem_region_lock_attr, 0, 0);
static LCK_GRP_DECLARE(skmem_region_lock_grp, "skmem_region");
static LCK_MTX_DECLARE_ATTR(skmem_region_lock, &skmem_region_lock_grp,
    &skmem_region_lock_attr);

/* protected by skmem_region_lock */
static TAILQ_HEAD(, skmem_region) skmem_region_head;

static thread_call_t skmem_region_update_tc;

#define SKMEM_REGION_UPDATE_INTERVAL    13      /* 13 seconds */
static uint32_t skmem_region_update_interval = SKMEM_REGION_UPDATE_INTERVAL;

#define SKMEM_WDT_MAXTIME       30      /* # of secs before watchdog */
#define SKMEM_WDT_PURGE         3       /* retry purge threshold */

#if (DEVELOPMENT || DEBUG)
/* Mean Time Between Failures (ms) */
static volatile uint64_t skmem_region_mtbf;

static int skmem_region_mtbf_sysctl(struct sysctl_oid *, void *, int,
    struct sysctl_req *);

SYSCTL_PROC(_kern_skywalk_mem, OID_AUTO, region_mtbf,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
    skmem_region_mtbf_sysctl, "Q", "Region MTBF (ms)");

SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, region_update_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_region_update_interval,
    SKMEM_REGION_UPDATE_INTERVAL, "Region update interval (sec)");
#endif /* (DEVELOPMENT || DEBUG) */

#define SKMEM_REGION_LOCK()                     \
    lck_mtx_lock(&skmem_region_lock)
#define SKMEM_REGION_LOCK_ASSERT_HELD()         \
    LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_OWNED)
#define SKMEM_REGION_LOCK_ASSERT_NOTHELD()      \
    LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKMEM_REGION_UNLOCK()                   \
    lck_mtx_unlock(&skmem_region_lock)

/*
 * Hash table bounds. Start with the initial value, and rescale up to
 * the specified limit. Ideally we don't need a limit, but in practice
 * this helps guard against runaways. These values should be revisited
 * in the future and adjusted as needed.
 */
#define SKMEM_REGION_HASH_INITIAL       32      /* initial hash table size */
#define SKMEM_REGION_HASH_LIMIT         4096    /* hash table size limit */

#define SKMEM_REGION_HASH_INDEX(_a, _s, _m)     \
    (((_a) + ((_a) >> (_s)) + ((_a) >> ((_s) << 1))) & (_m))
#define SKMEM_REGION_HASH(_skr, _addr)          \
    (&(_skr)->skr_hash_table[SKMEM_REGION_HASH_INDEX((uintptr_t)_addr, \
    (_skr)->skr_hash_shift, (_skr)->skr_hash_mask)])
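/*
 * Worked example (illustrative only): with the initial 32-bucket table the
 * mask is 31 (0x1f), and skr_hash_shift is flsll(seg_size) - 1, i.e. log2
 * of the segment size in the power-of-2 case. For a 32KB segment size
 * (shift 15) and address 0x8000, the index is
 * (0x8000 + (0x8000 >> 15) + (0x8000 >> 30)) & 0x1f = (0x8000 + 1 + 0) &
 * 0x1f = 1, so adjacent segment addresses land in distinct buckets.
 */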

static ZONE_DECLARE(skr_zone, SKMEM_ZONE_PREFIX ".mem.skr",
    sizeof(struct skmem_region), ZC_ZFREE_CLEARMEM);

static unsigned int sg_size;                    /* size of zone element */
static struct skmem_cache *skmem_sg_cache;      /* cache for sksegment */

static uint32_t skmem_seg_size = SKMEM_SEG_SIZE;
static uint32_t skmem_md_seg_size = SKMEM_MD_SEG_SIZE;
static uint32_t skmem_drv_buf_seg_size = SKMEM_DRV_BUF_SEG_SIZE;
static uint32_t skmem_drv_buf_seg_eff_size = SKMEM_DRV_BUF_SEG_SIZE;
uint32_t skmem_usr_buf_seg_size = SKMEM_USR_BUF_SEG_SIZE;

#define SKMEM_TAG_SEGMENT_BMAP  "com.apple.skywalk.segment.bmap"
static kern_allocation_name_t skmem_tag_segment_bmap;

#define SKMEM_TAG_SEGMENT_HASH  "com.apple.skywalk.segment.hash"
static kern_allocation_name_t skmem_tag_segment_hash;

#define SKMEM_TAG_REGION_MIB    "com.apple.skywalk.region.mib"
static kern_allocation_name_t skmem_tag_region_mib;

#define BMAPSZ  64

/* 64-bit mask with range */
#define BMASK64(_beg, _end)     \
    ((((uint64_t)-1) >> ((BMAPSZ - 1) - (_end))) & ~((1ULL << (_beg)) - 1))
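/*
 * Worked example (illustrative only): BMASK64(4, 7) sets bits 4..7
 * inclusive. The first term, ((uint64_t)-1) >> (63 - 7), yields 0xff
 * (bits 0..7); the second term, ~((1ULL << 4) - 1), clears bits 0..3;
 * ANDing them leaves 0xf0.
 */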

static int __skmem_region_inited = 0;

void
skmem_region_init(void)
{
    boolean_t randomize_seg_size;

    _CASSERT(sizeof(bitmap_t) == sizeof(uint64_t));
    _CASSERT(BMAPSZ == (sizeof(bitmap_t) << 3));
    _CASSERT((SKMEM_SEG_SIZE % SKMEM_PAGE_SIZE) == 0);
    _CASSERT(SKMEM_REGION_HASH_LIMIT >= SKMEM_REGION_HASH_INITIAL);
    ASSERT(!__skmem_region_inited);

    /* enforce the ordering here */
    _CASSERT(SKMEM_REGION_GUARD_HEAD == 0);
    _CASSERT(SKMEM_REGION_SCHEMA == 1);
    _CASSERT(SKMEM_REGION_RING == 2);
    _CASSERT(SKMEM_REGION_BUF == 3);
    _CASSERT(SKMEM_REGION_RXBUF == 4);
    _CASSERT(SKMEM_REGION_TXBUF == 5);
    _CASSERT(SKMEM_REGION_UMD == 6);
    _CASSERT(SKMEM_REGION_TXAUSD == 7);
    _CASSERT(SKMEM_REGION_RXFUSD == 8);
    _CASSERT(SKMEM_REGION_UBFT == 9);
    _CASSERT(SKMEM_REGION_USTATS == 10);
    _CASSERT(SKMEM_REGION_FLOWADV == 11);
    _CASSERT(SKMEM_REGION_NEXUSADV == 12);
    _CASSERT(SKMEM_REGION_SYSCTLS == 13);
    _CASSERT(SKMEM_REGION_GUARD_TAIL == 14);
    _CASSERT(SKMEM_REGION_KMD == 15);
    _CASSERT(SKMEM_REGION_RXKMD == 16);
    _CASSERT(SKMEM_REGION_TXKMD == 17);
    _CASSERT(SKMEM_REGION_KBFT == 18);
    _CASSERT(SKMEM_REGION_RXKBFT == 19);
    _CASSERT(SKMEM_REGION_TXKBFT == 20);
    _CASSERT(SKMEM_REGION_TXAKSD == 21);
    _CASSERT(SKMEM_REGION_RXFKSD == 22);
    _CASSERT(SKMEM_REGION_KSTATS == 23);
    _CASSERT(SKMEM_REGION_INTRINSIC == 24);

    _CASSERT(SREG_GUARD_HEAD == SKMEM_REGION_GUARD_HEAD);
    _CASSERT(SREG_SCHEMA == SKMEM_REGION_SCHEMA);
    _CASSERT(SREG_RING == SKMEM_REGION_RING);
    _CASSERT(SREG_BUF == SKMEM_REGION_BUF);
    _CASSERT(SREG_RXBUF == SKMEM_REGION_RXBUF);
    _CASSERT(SREG_TXBUF == SKMEM_REGION_TXBUF);
    _CASSERT(SREG_UMD == SKMEM_REGION_UMD);
    _CASSERT(SREG_TXAUSD == SKMEM_REGION_TXAUSD);
    _CASSERT(SREG_RXFUSD == SKMEM_REGION_RXFUSD);
    _CASSERT(SREG_UBFT == SKMEM_REGION_UBFT);
    _CASSERT(SREG_USTATS == SKMEM_REGION_USTATS);
    _CASSERT(SREG_FLOWADV == SKMEM_REGION_FLOWADV);
    _CASSERT(SREG_NEXUSADV == SKMEM_REGION_NEXUSADV);
    _CASSERT(SREG_SYSCTLS == SKMEM_REGION_SYSCTLS);
    _CASSERT(SREG_GUARD_TAIL == SKMEM_REGION_GUARD_TAIL);
    _CASSERT(SREG_KMD == SKMEM_REGION_KMD);
    _CASSERT(SREG_RXKMD == SKMEM_REGION_RXKMD);
    _CASSERT(SREG_TXKMD == SKMEM_REGION_TXKMD);
    _CASSERT(SREG_KBFT == SKMEM_REGION_KBFT);
    _CASSERT(SREG_RXKBFT == SKMEM_REGION_RXKBFT);
    _CASSERT(SREG_TXKBFT == SKMEM_REGION_TXKBFT);
    _CASSERT(SREG_TXAKSD == SKMEM_REGION_TXAKSD);
    _CASSERT(SREG_RXFKSD == SKMEM_REGION_RXFKSD);
    _CASSERT(SREG_KSTATS == SKMEM_REGION_KSTATS);

    _CASSERT(SKR_MODE_NOREDIRECT == SREG_MODE_NOREDIRECT);
    _CASSERT(SKR_MODE_MMAPOK == SREG_MODE_MMAPOK);
    _CASSERT(SKR_MODE_UREADONLY == SREG_MODE_UREADONLY);
    _CASSERT(SKR_MODE_KREADONLY == SREG_MODE_KREADONLY);
    _CASSERT(SKR_MODE_PERSISTENT == SREG_MODE_PERSISTENT);
    _CASSERT(SKR_MODE_MONOLITHIC == SREG_MODE_MONOLITHIC);
    _CASSERT(SKR_MODE_NOMAGAZINES == SREG_MODE_NOMAGAZINES);
    _CASSERT(SKR_MODE_NOCACHE == SREG_MODE_NOCACHE);
    _CASSERT(SKR_MODE_IODIR_IN == SREG_MODE_IODIR_IN);
    _CASSERT(SKR_MODE_IODIR_OUT == SREG_MODE_IODIR_OUT);
    _CASSERT(SKR_MODE_GUARD == SREG_MODE_GUARD);
    _CASSERT(SKR_MODE_SEGPHYSCONTIG == SREG_MODE_SEGPHYSCONTIG);
    _CASSERT(SKR_MODE_SHAREOK == SREG_MODE_SHAREOK);
    _CASSERT(SKR_MODE_PUREDATA == SREG_MODE_PUREDATA);
    _CASSERT(SKR_MODE_PSEUDO == SREG_MODE_PSEUDO);
    _CASSERT(SKR_MODE_SLAB == SREG_MODE_SLAB);
    _CASSERT(SKR_MODE_MIRRORED == SREG_MODE_MIRRORED);

    (void) PE_parse_boot_argn("skmem_seg_size", &skmem_seg_size,
        sizeof(skmem_seg_size));
    if (skmem_seg_size < SKMEM_MIN_SEG_SIZE) {
        skmem_seg_size = SKMEM_MIN_SEG_SIZE;
    }
    skmem_seg_size = (uint32_t)P2ROUNDUP(skmem_seg_size,
        SKMEM_MIN_SEG_SIZE);
    VERIFY(skmem_seg_size != 0 && (skmem_seg_size % SKMEM_PAGE_SIZE) == 0);

    (void) PE_parse_boot_argn("skmem_md_seg_size", &skmem_md_seg_size,
        sizeof(skmem_md_seg_size));
    if (skmem_md_seg_size < skmem_seg_size) {
        skmem_md_seg_size = skmem_seg_size;
    }
    skmem_md_seg_size = (uint32_t)P2ROUNDUP(skmem_md_seg_size,
        SKMEM_MIN_SEG_SIZE);
    VERIFY((skmem_md_seg_size % SKMEM_PAGE_SIZE) == 0);

    /*
     * If set via boot-args, honor it and don't randomize.
     */
    randomize_seg_size = !PE_parse_boot_argn("skmem_drv_buf_seg_size",
        &skmem_drv_buf_seg_size, sizeof(skmem_drv_buf_seg_size));
    if (skmem_drv_buf_seg_size < skmem_seg_size) {
        skmem_drv_buf_seg_size = skmem_seg_size;
    }
    skmem_drv_buf_seg_size = skmem_drv_buf_seg_eff_size =
        (uint32_t)P2ROUNDUP(skmem_drv_buf_seg_size, SKMEM_MIN_SEG_SIZE);
    VERIFY((skmem_drv_buf_seg_size % SKMEM_PAGE_SIZE) == 0);

    /*
     * Randomize the driver buffer segment size by bumping it up by a
     * randomly-chosen multiple of SKMEM_MIN_SEG_SIZE. Use the result
     * as the effective driver buffer segment size.
     */
    if (randomize_seg_size) {
        uint32_t sm;
        read_frandom(&sm, sizeof(sm));
        skmem_drv_buf_seg_eff_size +=
            (SKMEM_MIN_SEG_SIZE * (sm % SKMEM_DRV_BUF_SEG_MULTIPLIER));
        VERIFY((skmem_drv_buf_seg_eff_size % SKMEM_MIN_SEG_SIZE) == 0);
    }
    VERIFY(skmem_drv_buf_seg_eff_size >= skmem_drv_buf_seg_size);

    (void) PE_parse_boot_argn("skmem_usr_buf_seg_size",
        &skmem_usr_buf_seg_size, sizeof(skmem_usr_buf_seg_size));
    if (skmem_usr_buf_seg_size < skmem_seg_size) {
        skmem_usr_buf_seg_size = skmem_seg_size;
    }
    skmem_usr_buf_seg_size = (uint32_t)P2ROUNDUP(skmem_usr_buf_seg_size,
        SKMEM_MIN_SEG_SIZE);
    VERIFY((skmem_usr_buf_seg_size % SKMEM_PAGE_SIZE) == 0);

    SK_ERR("seg_size %u, md_seg_size %u, drv_buf_seg_size %u [eff %u], "
        "usr_buf_seg_size %u", skmem_seg_size, skmem_md_seg_size,
        skmem_drv_buf_seg_size, skmem_drv_buf_seg_eff_size,
        skmem_usr_buf_seg_size);

    TAILQ_INIT(&skmem_region_head);

    ASSERT(skmem_tag_segment_hash == NULL);
    skmem_tag_segment_hash =
        kern_allocation_name_allocate(SKMEM_TAG_SEGMENT_HASH, 0);
    ASSERT(skmem_tag_segment_hash != NULL);

    ASSERT(skmem_tag_segment_bmap == NULL);
    skmem_tag_segment_bmap =
        kern_allocation_name_allocate(SKMEM_TAG_SEGMENT_BMAP, 0);
    ASSERT(skmem_tag_segment_bmap != NULL);

    ASSERT(skmem_tag_region_mib == NULL);
    skmem_tag_region_mib =
        kern_allocation_name_allocate(SKMEM_TAG_REGION_MIB, 0);
    ASSERT(skmem_tag_region_mib != NULL);

    skmem_region_update_tc =
        thread_call_allocate_with_options(skmem_region_update_func,
        NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
    if (skmem_region_update_tc == NULL) {
        panic("%s: thread_call_allocate failed", __func__);
        /* NOTREACHED */
        __builtin_unreachable();
    }

    sg_size = sizeof(struct sksegment);
    skmem_sg_cache = skmem_cache_create("sg", sg_size,
        sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);

    /* and start the periodic region update machinery */
    skmem_dispatch(skmem_region_update_tc, NULL,
        (skmem_region_update_interval * NSEC_PER_SEC));

    __skmem_region_inited = 1;
}

void
skmem_region_fini(void)
{
    if (__skmem_region_inited) {
        ASSERT(TAILQ_EMPTY(&skmem_region_head));

        if (skmem_region_update_tc != NULL) {
            (void) thread_call_cancel_wait(skmem_region_update_tc);
            (void) thread_call_free(skmem_region_update_tc);
            skmem_region_update_tc = NULL;
        }

        if (skmem_sg_cache != NULL) {
            skmem_cache_destroy(skmem_sg_cache);
            skmem_sg_cache = NULL;
        }

        if (skmem_tag_segment_hash != NULL) {
            kern_allocation_name_release(skmem_tag_segment_hash);
            skmem_tag_segment_hash = NULL;
        }
        if (skmem_tag_segment_bmap != NULL) {
            kern_allocation_name_release(skmem_tag_segment_bmap);
            skmem_tag_segment_bmap = NULL;
        }
        if (skmem_tag_region_mib != NULL) {
            kern_allocation_name_release(skmem_tag_region_mib);
            skmem_tag_region_mib = NULL;
        }

        __skmem_region_inited = 0;
    }
}

/*
 * Reap internal caches.
 */
void
skmem_region_reap_caches(boolean_t purge)
{
    skmem_cache_reap_now(skmem_sg_cache, purge);
}

/*
 * Configure and compute the parameters of a region.
 */
void
skmem_region_params_config(struct skmem_region_params *srp)
{
    uint32_t cache_line_size = skmem_cpu_cache_line_size();
    size_t seglim, segsize, segcnt;
    size_t objsize, objcnt;

    ASSERT(srp->srp_id < SKMEM_REGIONS);

    /*
     * If the magazines layer is disabled system-wide, override
     * the region parameter here. This effectively reduces
     * the number of requested objects computed below. Note that
     * the region may have already been configured to exclude
     * magazines in the default skmem_regions[] array.
     */
    if (!skmem_allow_magazines()) {
        srp->srp_cflags |= SKMEM_REGION_CR_NOMAGAZINES;
    }

    objsize = srp->srp_r_obj_size;
    ASSERT(objsize != 0);
    objcnt = srp->srp_r_obj_cnt;
    ASSERT(objcnt != 0);

    if (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO) {
        size_t align = srp->srp_align;

        VERIFY(align != 0 && (align % SKMEM_CACHE_ALIGN) == 0);
        VERIFY(powerof2(align));
        objsize = MAX(objsize, sizeof(uint64_t));
#if KASAN
        /*
         * When KASAN is enabled, the zone allocator adjusts the
         * element size to include the redzone regions, in which
         * case we assume that the elements won't start on the
         * alignment boundary and thus need to do some fix-ups.
         * These include increasing the effective object size
         * which adds at least 16 bytes to the original size.
         */
        objsize += sizeof(uint64_t) + align;
#endif /* KASAN */
        objsize = P2ROUNDUP(objsize, align);

        segsize = objsize;
        srp->srp_r_seg_size = (uint32_t)segsize;
        segcnt = objcnt;
        goto done;
    } else {
        /* objects are always aligned at CPU cache line size */
        srp->srp_align = cache_line_size;
    }

    /*
     * Start with the default segment size for the region, and compute
     * the effective segment size (rounded up to the nearest
     * SKMEM_MIN_SEG_SIZE). If the object size is greater, adjust the
     * segment size to the next multiple of the effective size that is
     * larger than the object size.
     */
    if (srp->srp_r_seg_size == 0) {
        switch (srp->srp_id) {
        case SKMEM_REGION_UMD:
        case SKMEM_REGION_KMD:
        case SKMEM_REGION_RXKMD:
        case SKMEM_REGION_TXKMD:
            srp->srp_r_seg_size = skmem_md_seg_size;
            break;

        case SKMEM_REGION_BUF:
        case SKMEM_REGION_RXBUF:
        case SKMEM_REGION_TXBUF:
            /*
             * Use the effective driver buffer segment size,
             * since it reflects any randomization done at
             * skmem_region_init() time.
             */
            srp->srp_r_seg_size = skmem_drv_buf_seg_eff_size;
            break;

        default:
            srp->srp_r_seg_size = skmem_seg_size;
            break;
        }
    } else {
        srp->srp_r_seg_size = (uint32_t)P2ROUNDUP(srp->srp_r_seg_size,
            SKMEM_MIN_SEG_SIZE);
    }

    seglim = srp->srp_r_seg_size;
    VERIFY(seglim != 0 && (seglim % SKMEM_PAGE_SIZE) == 0);

    SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu objcnt %zu",
        srp->srp_name, seglim, objsize, objcnt);

    /*
     * Make sure the object size is a multiple of the CPU cache line
     * size, and that it evenly divides the segment size.
     */
    if (!((objsize < cache_line_size) && (objsize < seglim) &&
        ((cache_line_size % objsize) == 0) && ((seglim % objsize) == 0))) {
        objsize = P2ROUNDUP(objsize, cache_line_size);
        while (objsize < seglim && (seglim % objsize) != 0) {
            SK_DF(SK_VERB_MEM, "%s: objsize %zu -> %zu",
                srp->srp_name, objsize, objsize + cache_line_size);
            objsize += cache_line_size;
        }
    }

    /* segment must be large enough to hold the object */
    while (objsize > seglim) {
        SK_DF(SK_VERB_MEM, "%s: seglim %zu -> %zu", srp->srp_name,
            seglim, seglim + SKMEM_MIN_SEG_SIZE);
        seglim += SKMEM_MIN_SEG_SIZE;
    }

    /*
     * Take into account worst-case per-CPU cached
     * objects if this region is configured for it.
     */
    if (!(srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES)) {
        uint32_t magazine_max_objs =
            skmem_cache_magazine_max((uint32_t)objsize);
        SK_DF(SK_VERB_MEM, "%s: objcnt %zu -> %zu", srp->srp_name,
            objcnt, objcnt + magazine_max_objs);
        objcnt += magazine_max_objs;
    }

    SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu "
        "objcnt %zu", srp->srp_name, seglim, objsize, objcnt);

    segsize = P2ROUNDUP(objsize * objcnt, SKMEM_MIN_SEG_SIZE);
    if (seglim > segsize) {
        /*
         * If the segment limit is larger than what we need,
         * avoid memory wastage by shrinking it.
         */
        while (seglim > segsize && seglim > SKMEM_MIN_SEG_SIZE) {
            VERIFY(seglim >= SKMEM_MIN_SEG_SIZE);
            SK_DF(SK_VERB_MEM,
                "%s: segsize %zu (%zu*%zu) seglim [-] %zu -> %zu",
                srp->srp_name, segsize, objsize, objcnt, seglim,
                P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE,
                SKMEM_MIN_SEG_SIZE));
            seglim = P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE,
                SKMEM_MIN_SEG_SIZE);
        }

        /* adjust segment size */
        segsize = seglim;
    } else if (seglim < segsize) {
        size_t oseglim = seglim;
        /*
         * If the segment limit is less than the segment size,
         * see if increasing it slightly (up to 1.5x the original
         * limit) would allow us to avoid allocating too many
         * extra objects (due to excessive segment count).
         */
        while (seglim < segsize && (segsize % seglim) != 0) {
            SK_DF(SK_VERB_MEM,
                "%s: segsize %zu (%zu*%zu) seglim [+] %zu -> %zu",
                srp->srp_name, segsize, objsize, objcnt, seglim,
                (seglim + SKMEM_MIN_SEG_SIZE));
            seglim += SKMEM_MIN_SEG_SIZE;
            if (seglim >= (oseglim + (oseglim >> 1))) {
                break;
            }
        }

        /* can't use P2ROUNDUP since seglim may not be a power of 2 */
        segsize = SK_ROUNDUP(segsize, seglim);
    }
    ASSERT(segsize != 0 && (segsize % seglim) == 0);

    SK_DF(SK_VERB_MEM, "%s: segsize %zu seglim %zu",
        srp->srp_name, segsize, seglim);

    /* compute segment count, and recompute segment size */
    if (srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) {
        segcnt = 1;
    } else {
        /*
         * The adjustments above were done in increments of
         * SKMEM_MIN_SEG_SIZE. If the object size is greater
         * than that, ensure that the segment size is a multiple
         * of the object size.
         */
        if (objsize > SKMEM_MIN_SEG_SIZE) {
            ASSERT(seglim >= objsize);
            if ((seglim % objsize) != 0) {
                seglim += (seglim - objsize);
            }
            /* recompute segsize; see SK_ROUNDUP comment above */
            segsize = SK_ROUNDUP(segsize, seglim);
        }

        segcnt = MAX(1, (segsize / seglim));
        segsize /= segcnt;
    }

    SK_DF(SK_VERB_MEM, "%s: segcnt %zu segsize %zu",
        srp->srp_name, segcnt, segsize);

    /* recompute object count to avoid wastage */
    objcnt = (segsize * segcnt) / objsize;
    ASSERT(objcnt != 0);
done:
    srp->srp_c_obj_size = (uint32_t)objsize;
    srp->srp_c_obj_cnt = (uint32_t)objcnt;
    srp->srp_c_seg_size = (uint32_t)segsize;
    srp->srp_seg_cnt = (uint32_t)segcnt;

    SK_DF(SK_VERB_MEM, "%s: objsize %zu objcnt %zu segcnt %zu segsize %zu",
        srp->srp_name, objsize, objcnt, segcnt, segsize);

#if SK_LOG
    if (__improbable(sk_verbose != 0)) {
        char label[32];
        (void) snprintf(label, sizeof(label), "REGION_%s:",
            skmem_region_id2name(srp->srp_id));
        SK_D("%-16s o:[%4u x %6u -> %4u x %6u]", label,
            (uint32_t)srp->srp_r_obj_cnt,
            (uint32_t)srp->srp_r_obj_size,
            (uint32_t)srp->srp_c_obj_cnt,
            (uint32_t)srp->srp_c_obj_size);
    }
#endif /* SK_LOG */
}
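/*
 * Illustrative walk-through of the sizing logic above, under assumed
 * constants (SKMEM_MIN_SEG_SIZE = 16KB, default segment size = 32KB,
 * 64-byte cache lines; all values hypothetical): a request for 64
 * objects of 1024 bytes with magazines disabled keeps objsize at 1024
 * (it is a multiple of 64 and divides 32KB evenly), computes segsize =
 * P2ROUNDUP(1024 * 64, 16KB) = 64KB, leaves seglim at 32KB since 64KB
 * is an exact multiple of it, and ends with segcnt = 2, segsize = 32KB,
 * and objcnt recomputed as (32KB * 2) / 1024 = 64.
 */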

/*
 * Create a region.
 */
struct skmem_region *
skmem_region_create(const char *name, struct skmem_region_params *srp,
    sksegment_ctor_fn_t ctor, sksegment_dtor_fn_t dtor, void *private)
{
    boolean_t pseudo = (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO);
    uint32_t cflags = srp->srp_cflags;
    struct skmem_region *skr;
    uint32_t i;

    ASSERT(srp->srp_id < SKMEM_REGIONS);
    ASSERT(srp->srp_c_seg_size != 0 &&
        (pseudo || (srp->srp_c_seg_size % SKMEM_PAGE_SIZE) == 0));
    ASSERT(srp->srp_seg_cnt != 0);
    ASSERT(srp->srp_c_obj_cnt == 1 ||
        (srp->srp_c_seg_size % srp->srp_c_obj_size) == 0);
    ASSERT(srp->srp_c_obj_size <= srp->srp_c_seg_size);

    skr = zalloc_flags(skr_zone, Z_WAITOK | Z_ZERO);
    skr->skr_params.srp_r_seg_size = srp->srp_r_seg_size;
    skr->skr_seg_size = srp->srp_c_seg_size;
    skr->skr_size = (srp->srp_c_seg_size * srp->srp_seg_cnt);
    skr->skr_seg_objs = (srp->srp_c_seg_size / srp->srp_c_obj_size);

    if (!pseudo) {
        skr->skr_seg_max_cnt = srp->srp_seg_cnt;

        /* set alignment to CPU cache line size */
        skr->skr_params.srp_align = skmem_cpu_cache_line_size();

        /* allocate the allocated-address hash chain */
        skr->skr_hash_initial = SKMEM_REGION_HASH_INITIAL;
        skr->skr_hash_limit = SKMEM_REGION_HASH_LIMIT;
        skr->skr_hash_table = sk_alloc_type_array(struct sksegment_bkt,
            skr->skr_hash_initial, Z_WAITOK | Z_NOFAIL,
            skmem_tag_segment_hash);
        skr->skr_hash_mask = (skr->skr_hash_initial - 1);
        skr->skr_hash_shift = flsll(srp->srp_c_seg_size) - 1;

        for (i = 0; i < (skr->skr_hash_mask + 1); i++) {
            TAILQ_INIT(&skr->skr_hash_table[i].sgb_head);
        }
    } else {
        /* this upper bound doesn't apply */
        skr->skr_seg_max_cnt = 0;

        /* pick up value set by skmem_region_params_config() */
        skr->skr_params.srp_align = srp->srp_align;
    }

    skr->skr_r_obj_size = srp->srp_r_obj_size;
    skr->skr_r_obj_cnt = srp->srp_r_obj_cnt;
    skr->skr_c_obj_size = srp->srp_c_obj_size;
    skr->skr_c_obj_cnt = srp->srp_c_obj_cnt;

    skr->skr_params.srp_md_type = srp->srp_md_type;
    skr->skr_params.srp_md_subtype = srp->srp_md_subtype;
    skr->skr_params.srp_max_frags = srp->srp_max_frags;

    skr->skr_seg_ctor = ctor;
    skr->skr_seg_dtor = dtor;
    skr->skr_private = private;

    lck_mtx_init(&skr->skr_lock, &skmem_region_lock_grp,
        &skmem_region_lock_attr);

    TAILQ_INIT(&skr->skr_seg_free);
    RB_INIT(&skr->skr_seg_tfree);

    skr->skr_id = srp->srp_id;
    uuid_generate_random(skr->skr_uuid);
    (void) snprintf(skr->skr_name, sizeof(skr->skr_name),
        "%s.%s.%s", SKMEM_REGION_PREFIX, srp->srp_name, name);

    SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ",
        skr->skr_name, SK_KVA(skr));

    /* sanity check */
    ASSERT(!(cflags & SKMEM_REGION_CR_GUARD) ||
        !(cflags & (SKMEM_REGION_CR_KREADONLY | SKMEM_REGION_CR_UREADONLY |
        SKMEM_REGION_CR_PERSISTENT | SKMEM_REGION_CR_SHAREOK |
        SKMEM_REGION_CR_IODIR_IN | SKMEM_REGION_CR_IODIR_OUT |
        SKMEM_REGION_CR_PUREDATA)));

    skr->skr_cflags = cflags;
    if (cflags & SKMEM_REGION_CR_NOREDIRECT) {
        skr->skr_mode |= SKR_MODE_NOREDIRECT;
    }
    if (cflags & SKMEM_REGION_CR_MMAPOK) {
        skr->skr_mode |= SKR_MODE_MMAPOK;
    }
    if ((cflags & SKMEM_REGION_CR_MMAPOK) &&
        (cflags & SKMEM_REGION_CR_UREADONLY)) {
        skr->skr_mode |= SKR_MODE_UREADONLY;
    }
    if (cflags & SKMEM_REGION_CR_KREADONLY) {
        skr->skr_mode |= SKR_MODE_KREADONLY;
    }
    if (cflags & SKMEM_REGION_CR_PERSISTENT) {
        skr->skr_mode |= SKR_MODE_PERSISTENT;
    }
    if (cflags & SKMEM_REGION_CR_MONOLITHIC) {
        skr->skr_mode |= SKR_MODE_MONOLITHIC;
    }
    if (cflags & SKMEM_REGION_CR_NOMAGAZINES) {
        skr->skr_mode |= SKR_MODE_NOMAGAZINES;
    }
    if (cflags & SKMEM_REGION_CR_NOCACHE) {
        skr->skr_mode |= SKR_MODE_NOCACHE;
    }
    if (cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) {
        skr->skr_mode |= SKR_MODE_SEGPHYSCONTIG;
    }
    if (cflags & SKMEM_REGION_CR_SHAREOK) {
        skr->skr_mode |= SKR_MODE_SHAREOK;
    }
    if (cflags & SKMEM_REGION_CR_IODIR_IN) {
        skr->skr_mode |= SKR_MODE_IODIR_IN;
    }
    if (cflags & SKMEM_REGION_CR_IODIR_OUT) {
        skr->skr_mode |= SKR_MODE_IODIR_OUT;
    }
    if (cflags & SKMEM_REGION_CR_GUARD) {
        skr->skr_mode |= SKR_MODE_GUARD;
    }
    if (cflags & SKMEM_REGION_CR_PUREDATA) {
        skr->skr_mode |= SKR_MODE_PUREDATA;
    }
    if (cflags & SKMEM_REGION_CR_PSEUDO) {
        skr->skr_mode |= SKR_MODE_PSEUDO;
    }

#if XNU_TARGET_OS_OSX
    /*
     * Mark all regions as persistent, except for the guard and pseudo
     * (e.g. intrinsic) regions.
     * This is to ensure that kernel threads won't be faulting-in while
     * accessing these memory regions. We have observed various kinds of
     * kernel panics due to kernel threads faulting on non-wired memory
     * access when the VM subsystem is not in a state to swap-in the page.
     */
    if (!((skr->skr_mode & SKR_MODE_PSEUDO) ||
        (skr->skr_mode & SKR_MODE_GUARD))) {
        skr->skr_mode |= SKR_MODE_PERSISTENT;
    }
#endif /* XNU_TARGET_OS_OSX */

    /* SKR_MODE_UREADONLY only takes effect for user task mapping */
    skr->skr_bufspec.user_writable = !(skr->skr_mode & SKR_MODE_UREADONLY);
    skr->skr_bufspec.kernel_writable = !(skr->skr_mode & SKR_MODE_KREADONLY);
    skr->skr_bufspec.purgeable = TRUE;
    skr->skr_bufspec.inhibitCache = !!(skr->skr_mode & SKR_MODE_NOCACHE);
    skr->skr_bufspec.physcontig = (skr->skr_mode & SKR_MODE_SEGPHYSCONTIG);
    skr->skr_bufspec.iodir_in = !!(skr->skr_mode & SKR_MODE_IODIR_IN);
    skr->skr_bufspec.iodir_out = !!(skr->skr_mode & SKR_MODE_IODIR_OUT);
    skr->skr_bufspec.puredata = !!(skr->skr_mode & SKR_MODE_PUREDATA);
    skr->skr_regspec.noRedirect = !!(skr->skr_mode & SKR_MODE_NOREDIRECT);

    /* allocate segment bitmaps */
    if (!(skr->skr_mode & SKR_MODE_PSEUDO)) {
        ASSERT(skr->skr_seg_max_cnt != 0);
        skr->skr_seg_bmap_len = BITMAP_LEN(skr->skr_seg_max_cnt);
        skr->skr_seg_bmap = sk_alloc_data(BITMAP_SIZE(skr->skr_seg_max_cnt),
            Z_WAITOK | Z_NOFAIL, skmem_tag_segment_bmap);
        ASSERT(BITMAP_SIZE(skr->skr_seg_max_cnt) ==
            (skr->skr_seg_bmap_len * sizeof(*skr->skr_seg_bmap)));

        /* mark all bitmaps as free (bit set) */
        bitmap_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt);
    }

    /*
     * Populate the freelist by allocating all segments for the
     * region, which will be mapped but not faulted-in, and then
     * immediately insert each into the freelist. That will in
     * turn unmap the segment's memory object.
     */
    SKR_LOCK(skr);
    if (skr->skr_mode & SKR_MODE_PSEUDO) {
        char zone_name[64];
        (void) snprintf(zone_name, sizeof(zone_name), "%s.reg.%s",
            SKMEM_ZONE_PREFIX, name);
        skr->skr_zreg = zone_create(zone_name, skr->skr_c_obj_size,
            ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
    } else {
        /* create a backing IOSKRegion object */
        if ((skr->skr_reg = IOSKRegionCreate(&skr->skr_regspec,
            (IOSKSize)skr->skr_seg_size,
            (IOSKCount)skr->skr_seg_max_cnt)) == NULL) {
            SK_ERR("\"%s\": [%u * %u] cflags 0x%b skr_reg failed",
                skr->skr_name, (uint32_t)skr->skr_seg_size,
                (uint32_t)skr->skr_seg_max_cnt, skr->skr_cflags,
                SKMEM_REGION_CR_BITS);
            goto failed;
        }
    }

    ASSERT(skr->skr_seg_objs != 0);

    ++skr->skr_refcnt;      /* for caller */
    SKR_UNLOCK(skr);

    SKMEM_REGION_LOCK();
    TAILQ_INSERT_TAIL(&skmem_region_head, skr, skr_link);
    SKMEM_REGION_UNLOCK();

    SK_DF(SK_VERB_MEM_REGION,
        " [TOTAL] seg (%u*%u) obj (%u*%u) cflags 0x%b",
        (uint32_t)skr->skr_seg_size, (uint32_t)skr->skr_seg_max_cnt,
        (uint32_t)skr->skr_c_obj_size, (uint32_t)skr->skr_c_obj_cnt,
        skr->skr_cflags, SKMEM_REGION_CR_BITS);

    return skr;

failed:
    SKR_LOCK_ASSERT_HELD(skr);
    skmem_region_destroy(skr);

    return NULL;
}

/*
 * Destroy a region.
 */
static void
skmem_region_destroy(struct skmem_region *skr)
{
    struct skmem_region *mskr;

    SKR_LOCK_ASSERT_HELD(skr);

    SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx",
        skr->skr_name, SK_KVA(skr));

    /*
     * Panic if we detect there are unfreed segments; the caller
     * destroying this region is responsible for ensuring that all
     * allocated segments have been freed prior to getting here.
     */
    ASSERT(skr->skr_refcnt == 0);
    if (skr->skr_seginuse != 0) {
        panic("%s: '%s' (%p) not empty (%u unfreed)",
            __func__, skr->skr_name, (void *)skr, skr->skr_seginuse);
        /* NOTREACHED */
        __builtin_unreachable();
    }

    if (skr->skr_link.tqe_next != NULL || skr->skr_link.tqe_prev != NULL) {
        SKR_UNLOCK(skr);
        SKMEM_REGION_LOCK();
        TAILQ_REMOVE(&skmem_region_head, skr, skr_link);
        SKMEM_REGION_UNLOCK();
        SKR_LOCK(skr);
        ASSERT(skr->skr_refcnt == 0);
    }

    /*
     * Undo what's done earlier at region creation time.
     */
    skmem_region_depopulate(skr);
    ASSERT(TAILQ_EMPTY(&skr->skr_seg_free));
    ASSERT(RB_EMPTY(&skr->skr_seg_tfree));
    ASSERT(skr->skr_seg_free_cnt == 0);

    if (skr->skr_reg != NULL) {
        ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
        IOSKRegionDestroy(skr->skr_reg);
        skr->skr_reg = NULL;
    }

    if (skr->skr_zreg != NULL) {
        ASSERT(skr->skr_mode & SKR_MODE_PSEUDO);
        zdestroy(skr->skr_zreg);
        skr->skr_zreg = NULL;
    }

    if (skr->skr_seg_bmap != NULL) {
        ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
#if (DEBUG || DEVELOPMENT)
        ASSERT(skr->skr_seg_bmap_len != 0);
        /* must have been set to vacant (bit set) by now */
        assert(bitmap_is_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt));
#endif /* DEBUG || DEVELOPMENT */

        sk_free_data(skr->skr_seg_bmap, BITMAP_SIZE(skr->skr_seg_max_cnt));
        skr->skr_seg_bmap = NULL;
        skr->skr_seg_bmap_len = 0;
    }
    ASSERT(skr->skr_seg_bmap_len == 0);

    if (skr->skr_hash_table != NULL) {
        ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
#if (DEBUG || DEVELOPMENT)
        for (uint32_t i = 0; i < (skr->skr_hash_mask + 1); i++) {
            ASSERT(TAILQ_EMPTY(&skr->skr_hash_table[i].sgb_head));
        }
#endif /* DEBUG || DEVELOPMENT */

        sk_free_type_array(struct sksegment_bkt, skr->skr_hash_mask + 1,
            skr->skr_hash_table);
        skr->skr_hash_table = NULL;
    }
    if ((mskr = skr->skr_mirror) != NULL) {
        ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
        skr->skr_mirror = NULL;
        mskr->skr_mode &= ~SKR_MODE_MIRRORED;
    }
    SKR_UNLOCK(skr);

    if (mskr != NULL) {
        skmem_region_release(mskr);
    }

    lck_mtx_destroy(&skr->skr_lock, &skmem_region_lock_grp);

    zfree(skr_zone, skr);
}

/*
 * Mirror mskr (slave) to skr (master).
 */
void
skmem_region_mirror(struct skmem_region *skr, struct skmem_region *mskr)
{
    SK_DF(SK_VERB_MEM_REGION, "skr master 0x%llx, slave 0x%llx ",
        SK_KVA(skr), SK_KVA(mskr));

    SKR_LOCK(skr);
    ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
    ASSERT(!(mskr->skr_mode & SKR_MODE_MIRRORED));
    ASSERT(skr->skr_mirror == NULL);

    /* both regions must share identical parameters */
    ASSERT(skr->skr_size == mskr->skr_size);
    ASSERT(skr->skr_seg_size == mskr->skr_seg_size);
    ASSERT(skr->skr_seg_free_cnt == mskr->skr_seg_free_cnt);

    skr->skr_mirror = mskr;
    skmem_region_retain(mskr);
    mskr->skr_mode |= SKR_MODE_MIRRORED;
    SKR_UNLOCK(skr);
}

void
skmem_region_slab_config(struct skmem_region *skr, struct skmem_cache *skm)
{
    SKR_LOCK(skr);
    if (skm != NULL) {
        ASSERT(!(skr->skr_mode & SKR_MODE_SLAB));
        skr->skr_mode |= SKR_MODE_SLAB;
        ASSERT(skr->skr_cache == NULL);
        skr->skr_cache = skm;
        skmem_region_retain_locked(skr);
        SKR_UNLOCK(skr);
    } else {
        ASSERT(skr->skr_mode & SKR_MODE_SLAB);
        skr->skr_mode &= ~SKR_MODE_SLAB;
        ASSERT(skr->skr_cache != NULL);
        skr->skr_cache = NULL;
        if (!skmem_region_release_locked(skr)) {
            SKR_UNLOCK(skr);
        }
    }
}

/*
 * Common routines for skmem_region_{alloc,mirror_alloc}.
 */
static void *
skmem_region_alloc_common(struct skmem_region *skr, struct sksegment *sg)
{
    struct sksegment_bkt *sgb;
    void *addr;

    SKR_LOCK_ASSERT_HELD(skr);

    ASSERT(sg->sg_md != NULL);
    ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
    addr = (void *)sg->sg_start;
    sgb = SKMEM_REGION_HASH(skr, addr);
    ASSERT(sg->sg_link.tqe_next == NULL);
    ASSERT(sg->sg_link.tqe_prev == NULL);
    TAILQ_INSERT_HEAD(&sgb->sgb_head, sg, sg_link);

    skr->skr_seginuse++;
    skr->skr_meminuse += skr->skr_seg_size;
    if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) {
        skr->skr_w_meminuse += skr->skr_seg_size;
    }
    skr->skr_alloc++;

    return addr;
}

/*
 * Allocate a segment from the region.
 */
void *
skmem_region_alloc(struct skmem_region *skr, void **maddr,
    struct sksegment **retsg, struct sksegment **retsgm, uint32_t skmflag)
{
    struct sksegment *sg = NULL;
    struct sksegment *sg1 = NULL;
    void *addr = NULL, *addr1 = NULL;
    uint32_t retries = 0;

    VERIFY(!(skr->skr_mode & SKR_MODE_GUARD));

    if (retsg != NULL) {
        *retsg = NULL;
    }
    if (retsgm != NULL) {
        *retsgm = NULL;
    }

    /* SKMEM_NOSLEEP and SKMEM_FAILOK are mutually exclusive */
    VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) !=
        (SKMEM_NOSLEEP | SKMEM_FAILOK));

    SKR_LOCK(skr);
    while (sg == NULL) {
        /* see if there's a segment in the freelist */
        sg = TAILQ_FIRST(&skr->skr_seg_free);
        if (sg == NULL) {
            /* see if we can grow the freelist */
            sg = sksegment_freelist_grow(skr);
            if (sg != NULL) {
                break;
            }

            if (skr->skr_mode & SKR_MODE_SLAB) {
                SKR_UNLOCK(skr);
                /*
                 * None found; it's possible that the slab
                 * layer is caching an extra amount, so ask
                 * skmem_cache to reap/purge its caches.
                 */
                skmem_cache_reap_now(skr->skr_cache, TRUE);
                SKR_LOCK(skr);
                /*
                 * If we manage to get some freed, try again.
                 */
                if (TAILQ_FIRST(&skr->skr_seg_free) != NULL) {
                    continue;
                }
            }

            /*
             * Give up if this is a non-blocking allocation,
             * or if this is a blocking allocation but the
             * caller is willing to handle failure.
             */
            if (skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) {
                break;
            }

            /* otherwise we wait until one is available */
            ++skr->skr_seg_waiters;
            (void) msleep(&skr->skr_seg_free, &skr->skr_lock,
                (PZERO - 1), skr->skr_name, NULL);
        }
    }

    SKR_LOCK_ASSERT_HELD(skr);

    if (sg != NULL) {
retry:
        /*
         * We have a segment; remove it from the freelist and
         * insert it into the allocated-address hash chain.
         * Note that this may return NULL if we can't allocate
         * the memory descriptor.
         */
        if (sksegment_freelist_remove(skr, sg, skmflag,
            FALSE) == NULL) {
            ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
            ASSERT(sg->sg_md == NULL);
            ASSERT(sg->sg_start == 0 && sg->sg_end == 0);

            /*
             * If it's a non-blocking allocation, simply give
             * up and let the caller decide when to retry. Else,
             * it gets a bit complicated due to the contract we
             * have for blocking allocations with the client; the
             * most sensible thing to do here is to retry the
             * allocation ourselves. Note that we keep using the
             * same segment we originally got, since we only need
             * the memory descriptor to be allocated for it; thus
             * we make sure we don't release the region lock when
             * retrying allocation. Doing so is crucial when the
             * region is mirrored, since the segment indices on
             * both regions need to match.
             */
            if (skmflag & SKMEM_NOSLEEP) {
                SK_ERR("\"%s\": failed to allocate segment "
                    "(non-sleeping mode)", skr->skr_name);
                sg = NULL;
            } else {
                if (++retries > SKMEM_WDT_MAXTIME) {
                    panic_plain("\"%s\": failed to "
                        "allocate segment (sleeping mode) "
                        "after %u retries\n\n%s",
                        skr->skr_name, SKMEM_WDT_MAXTIME,
                        skmem_dump(skr));
                    /* NOTREACHED */
                    __builtin_unreachable();
                } else {
                    SK_ERR("\"%s\": failed to allocate "
                        "segment (sleeping mode): %u "
                        "retries", skr->skr_name, retries);
                }
                if (skr->skr_mode & SKR_MODE_SLAB) {
                    /*
                     * We can't get any memory descriptor
                     * for this segment; reap extra cached
                     * objects from the slab layer and hope
                     * that we get lucky next time around.
                     *
                     * XXX [email protected]: perhaps also
                     * trigger the zone allocator to do
                     * its garbage collection here?
                     */
                    skmem_cache_reap();
                }
                delay(1 * USEC_PER_SEC);        /* 1 sec */
                goto retry;
            }
        }

        if (sg != NULL) {
            /* insert to allocated-address hash chain */
            addr = skmem_region_alloc_common(skr, sg);
        }
    }

    if (sg == NULL) {
        VERIFY(skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK));
        if (skmflag & SKMEM_PANIC) {
            VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) ==
                SKMEM_NOSLEEP);
            /*
             * If this is a failed non-blocking alloc and the
             * caller insists that it must be successful, then
             * panic.
             */
            panic_plain("\"%s\": skr 0x%p unable to satisfy "
                "mandatory allocation\n", skr->skr_name, skr);
            /* NOTREACHED */
            __builtin_unreachable();
        } else {
            /*
             * Give up if this is a non-blocking allocation,
             * or one where the caller is willing to handle
             * allocation failures.
             */
            goto done;
        }
    }

    ASSERT((mach_vm_address_t)addr == sg->sg_start);

#if SK_LOG
    SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
        SK_KVA(skr), SK_KVA(sg));
    if (skr->skr_mirror == NULL ||
        !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) {
        SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx)",
            sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
    } else {
        SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) mirrored",
            sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
    }
#endif /* SK_LOG */

    /*
     * If mirroring, allocate shadow object from slave region.
     */
    if (skr->skr_mirror != NULL) {
        ASSERT(skr->skr_mirror != skr);
        ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
        ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED);
        addr1 = skmem_region_mirror_alloc(skr->skr_mirror, sg, &sg1);
        ASSERT(addr1 != NULL);
        ASSERT(sg1 != NULL && sg1 != sg);
        ASSERT(sg1->sg_index == sg->sg_index);
    }

done:
    SKR_UNLOCK(skr);

    /* return segment metadata to caller if asked (reference not needed) */
    if (addr != NULL) {
        if (retsg != NULL) {
            *retsg = sg;
        }
        if (retsgm != NULL) {
            *retsgm = sg1;
        }
    }

    if (maddr != NULL) {
        *maddr = addr1;
    }

    return addr;
}
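/*
 * Note on the mirrored-allocation contract (restating the code above):
 * when a master region has a mirror attached, skmem_region_alloc() also
 * allocates the shadow segment at the same index and hands its address
 * back through *maddr; the caller must later pass that same address as
 * "maddr" to skmem_region_free(), since freeing on a mirrored master
 * asserts that the slave address is provided.
 */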

/*
 * Allocate a segment from a mirror region at the same index. While it
 * is somewhat a simplified variant of skmem_region_alloc, keeping it
 * separate allows us to avoid further convoluting that routine.
 */
static void *
skmem_region_mirror_alloc(struct skmem_region *skr, struct sksegment *sg0,
    struct sksegment **retsg)
{
    struct sksegment sg_key = { .sg_index = sg0->sg_index };
    struct sksegment *sg = NULL;
    void *addr = NULL;

    ASSERT(skr->skr_mode & SKR_MODE_MIRRORED);
    ASSERT(skr->skr_mirror == NULL);
    ASSERT(sg0->sg_type == SKSEG_TYPE_ALLOC);

    if (retsg != NULL) {
        *retsg = NULL;
    }

    SKR_LOCK(skr);

    /*
     * See if we can find one in the freelist first. Otherwise,
     * create a new segment of the same index and add that to the
     * freelist. We would always get a segment since both regions
     * are synchronized when it comes to the indices of allocated
     * segments.
     */
    sg = RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key);
    if (sg == NULL) {
        sg = sksegment_alloc_with_idx(skr, sg0->sg_index);
        VERIFY(sg != NULL);
    }
    VERIFY(sg->sg_index == sg0->sg_index);

    /*
     * We have a segment; remove it from the freelist and insert
     * it into the allocated-address hash chain. This either
     * succeeds or panics (SKMEM_PANIC) when a memory descriptor
     * can't be allocated.
     *
     * TODO: consider retrying IOBMD allocation attempts if needed.
     */
    sg = sksegment_freelist_remove(skr, sg, SKMEM_PANIC, FALSE);
    VERIFY(sg != NULL);

    /* insert to allocated-address hash chain */
    addr = skmem_region_alloc_common(skr, sg);

#if SK_LOG
    SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
        SK_KVA(skr), SK_KVA(sg));
    SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx)",
        sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
#endif /* SK_LOG */

    SKR_UNLOCK(skr);

    /* return segment metadata to caller if asked (reference not needed) */
    if (retsg != NULL) {
        *retsg = sg;
    }

    return addr;
}

/*
 * Free a segment to the region.
 */
void
skmem_region_free(struct skmem_region *skr, void *addr, void *maddr)
{
    struct sksegment_bkt *sgb;
    struct sksegment *sg, *tsg;

    VERIFY(!(skr->skr_mode & SKR_MODE_GUARD));

    /*
     * Search the hash chain to find a matching segment for the
     * given address. If found, remove the segment from the
     * hash chain and insert it into the freelist. Otherwise,
     * we panic since the caller has given us a bogus address.
     */
    SKR_LOCK(skr);
    sgb = SKMEM_REGION_HASH(skr, addr);
    TAILQ_FOREACH_SAFE(sg, &sgb->sgb_head, sg_link, tsg) {
        ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
        if (sg->sg_start == (mach_vm_address_t)addr) {
            TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link);
            sg->sg_link.tqe_next = NULL;
            sg->sg_link.tqe_prev = NULL;
            break;
        }
    }

    ASSERT(sg != NULL);
    if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) {
        ASSERT(skr->skr_w_meminuse >= skr->skr_seg_size);
        skr->skr_w_meminuse -= skr->skr_seg_size;
    }
    sksegment_freelist_insert(skr, sg, FALSE);

    ASSERT(skr->skr_seginuse != 0);
    skr->skr_seginuse--;
    skr->skr_meminuse -= skr->skr_seg_size;
    skr->skr_free++;

#if SK_LOG
    SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
        SK_KVA(skr), SK_KVA(sg));
    if (skr->skr_mirror == NULL ||
        !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) {
        SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx)",
            sg->sg_index, SK_KVA(addr),
            SK_KVA((uintptr_t)addr + skr->skr_seg_size));
    } else {
        SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) mirrored",
            sg->sg_index, SK_KVA(addr),
            SK_KVA((uintptr_t)addr + skr->skr_seg_size));
    }
#endif /* SK_LOG */

    /*
     * If mirroring, also free shadow object in slave region.
     */
    if (skr->skr_mirror != NULL) {
        ASSERT(maddr != NULL);
        ASSERT(skr->skr_mirror != skr);
        ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
        ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED);
        skmem_region_free(skr->skr_mirror, maddr, NULL);
    }

    /* wake up any blocked threads waiting for a segment */
    if (skr->skr_seg_waiters != 0) {
        SK_DF(SK_VERB_MEM_REGION,
            "sg 0x%llx waking up %u waiters", SK_KVA(sg),
            skr->skr_seg_waiters);
        skr->skr_seg_waiters = 0;
        wakeup(&skr->skr_seg_free);
    }
    SKR_UNLOCK(skr);
}

__attribute__((always_inline))
static inline void
skmem_region_retain_locked(struct skmem_region *skr)
{
    SKR_LOCK_ASSERT_HELD(skr);
    skr->skr_refcnt++;
    ASSERT(skr->skr_refcnt != 0);
}

/*
 * Retain a region.
 */
void
skmem_region_retain(struct skmem_region *skr)
{
    SKR_LOCK(skr);
    skmem_region_retain_locked(skr);
    SKR_UNLOCK(skr);
}

__attribute__((always_inline))
static inline boolean_t
skmem_region_release_locked(struct skmem_region *skr)
{
    SKR_LOCK_ASSERT_HELD(skr);
    ASSERT(skr->skr_refcnt != 0);
    if (--skr->skr_refcnt == 0) {
        skmem_region_destroy(skr);
        return TRUE;
    }
    return FALSE;
}

/*
 * Release (and potentially destroy) a region.
 */
boolean_t
skmem_region_release(struct skmem_region *skr)
{
    boolean_t lastref;

    SKR_LOCK(skr);
    if (!(lastref = skmem_region_release_locked(skr))) {
        SKR_UNLOCK(skr);
    }

    return lastref;
}
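/*
 * Note on reference pairing (descriptive, not new behavior):
 * skmem_region_create() returns with a single reference held for the
 * caller; additional references are taken by mirroring
 * (skmem_region_mirror) and by slab attachment (skmem_region_slab_config).
 * The final skmem_region_release() drops the last reference and destroys
 * the region; in that case the region lock has already been torn down,
 * which is why the locked variant only unlocks when the region survives.
 */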

/*
 * Depopulate the segment freelist.
 */
static void
skmem_region_depopulate(struct skmem_region *skr)
{
    struct sksegment *sg, *tsg;

    SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ",
        skr->skr_name, SK_KVA(skr));

    SKR_LOCK_ASSERT_HELD(skr);
    ASSERT(skr->skr_seg_bmap_len != 0 || (skr->skr_mode & SKR_MODE_PSEUDO));

    TAILQ_FOREACH_SAFE(sg, &skr->skr_seg_free, sg_link, tsg) {
        struct sksegment *sg0;
        uint32_t i;

        i = sg->sg_index;
        sg0 = sksegment_freelist_remove(skr, sg, 0, TRUE);
        VERIFY(sg0 == sg);

        sksegment_destroy(skr, sg);
        ASSERT(bit_test(skr->skr_seg_bmap[i / BMAPSZ], i % BMAPSZ));
    }
}

/*
 * Free tree segment compare routine.
 */
static int
sksegment_cmp(const struct sksegment *sg1, const struct sksegment *sg2)
{
    return sg1->sg_index - sg2->sg_index;
}
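/*
 * The free tree is keyed by segment index rather than address so that
 * skmem_region_mirror_alloc() can locate (via RB_FIND) the free segment
 * whose index matches its master counterpart; the freelist TAILQ serves
 * plain FIFO allocation, while the tree serves indexed lookups.
 */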

/*
 * Create a segment.
 *
 * Upon success, clear the bit for the segment's index in skr_seg_bmap bitmap.
 */
static struct sksegment *
sksegment_create(struct skmem_region *skr, uint32_t i)
{
    struct sksegment *sg = NULL;
    bitmap_t *bmap;

    SKR_LOCK_ASSERT_HELD(skr);

    ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
    ASSERT(i < skr->skr_seg_max_cnt);
    ASSERT(skr->skr_reg != NULL);
    ASSERT(skr->skr_seg_size == round_page(skr->skr_seg_size));

    bmap = &skr->skr_seg_bmap[i / BMAPSZ];
    ASSERT(bit_test(*bmap, i % BMAPSZ));

    sg = skmem_cache_alloc(skmem_sg_cache, SKMEM_SLEEP);
    bzero(sg, sg_size);

    sg->sg_region = skr;
    sg->sg_index = i;
    sg->sg_state = SKSEG_STATE_DETACHED;

    /* claim it (clear bit) */
    bit_clear(*bmap, i % BMAPSZ);

    SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) 0x%b", i,
        SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), skr->skr_mode,
        SKR_MODE_BITS);

    return sg;
}

/*
 * Destroy a segment.
 *
 * Set the bit for the segment's index in skr_seg_bmap bitmap,
 * indicating that it is now vacant.
 */
static void
sksegment_destroy(struct skmem_region *skr, struct sksegment *sg)
{
    uint32_t i = sg->sg_index;
    bitmap_t *bmap;

    SKR_LOCK_ASSERT_HELD(skr);

    ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
    ASSERT(skr == sg->sg_region);
    ASSERT(skr->skr_reg != NULL);
    ASSERT(sg->sg_type == SKSEG_TYPE_DESTROYED);
    ASSERT(i < skr->skr_seg_max_cnt);

    bmap = &skr->skr_seg_bmap[i / BMAPSZ];
    ASSERT(!bit_test(*bmap, i % BMAPSZ));

    SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) 0x%b",
        i, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end),
        skr->skr_mode, SKR_MODE_BITS);

    /*
     * Undo what's done earlier at segment creation time.
     */

    ASSERT(sg->sg_md == NULL);
    ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
    ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);

    /* release it (set bit) */
    bit_set(*bmap, i % BMAPSZ);

    skmem_cache_free(skmem_sg_cache, sg);
}

/*
 * Insert a segment into the freelist (freeing the segment).
 */
static void
sksegment_freelist_insert(struct skmem_region *skr, struct sksegment *sg,
    boolean_t populating)
{
    SKR_LOCK_ASSERT_HELD(skr);

    ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
    ASSERT(sg->sg_type != SKSEG_TYPE_FREE);
    ASSERT(skr == sg->sg_region);
    ASSERT(skr->skr_reg != NULL);
    ASSERT(sg->sg_index < skr->skr_seg_max_cnt);

    /*
     * If the region is being populated, then we're done.
     */
    if (__improbable(populating)) {
        ASSERT(sg->sg_md == NULL);
        ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
        ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
    } else {
        IOSKMemoryBufferRef md;
        IOReturn err;

        ASSERT(sg->sg_md != NULL);
        ASSERT(sg->sg_start != 0 && sg->sg_end != 0);

1632 /*
1633 * Let the client remove the memory from IOMMU, and unwire it.
1634 */
1635 if (skr->skr_seg_dtor != NULL) {
1636 skr->skr_seg_dtor(sg, sg->sg_md, skr->skr_private);
1637 }
1638
1639 ASSERT(sg->sg_state == SKSEG_STATE_MAPPED ||
1640 sg->sg_state == SKSEG_STATE_MAPPED_WIRED);
1641
1642 IOSKRegionClearBufferDebug(skr->skr_reg, sg->sg_index, &md);
1643 VERIFY(sg->sg_md == md);
1644
1645 /* if persistent, unwire this memory now */
1646 if (skr->skr_mode & SKR_MODE_PERSISTENT) {
1647 err = IOSKMemoryUnwire(md);
1648 if (err != kIOReturnSuccess) {
1649 panic("Fail to unwire md %p, err %d", md, err);
1650 }
1651 }
1652
1653 /* mark memory as empty/discarded for consistency */
1654 err = IOSKMemoryDiscard(md);
1655 if (err != kIOReturnSuccess) {
1656 panic("Fail to discard md %p, err %d", md, err);
1657 }
1658
1659 IOSKMemoryDestroy(md);
1660 sg->sg_md = NULL;
1661 sg->sg_start = sg->sg_end = 0;
1662 sg->sg_state = SKSEG_STATE_DETACHED;
1663
1664 ASSERT(skr->skr_memtotal >= skr->skr_seg_size);
1665 skr->skr_memtotal -= skr->skr_seg_size;
1666 }
1667
1668 sg->sg_type = SKSEG_TYPE_FREE;
1669 ASSERT(sg->sg_link.tqe_next == NULL);
1670 ASSERT(sg->sg_link.tqe_prev == NULL);
1671 TAILQ_INSERT_TAIL(&skr->skr_seg_free, sg, sg_link);
1672 ASSERT(sg->sg_node.rbe_left == NULL);
1673 ASSERT(sg->sg_node.rbe_right == NULL);
1674 ASSERT(sg->sg_node.rbe_parent == NULL);
1675 RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
1676 ++skr->skr_seg_free_cnt;
1677 ASSERT(skr->skr_seg_free_cnt <= skr->skr_seg_max_cnt);
1678 }
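
/*
 * For reference, the teardown sequence performed above when returning
 * a fully-constructed segment to the freelist is:
 *
 *	skr_seg_dtor()			client removes IOMMU mapping, unwires
 *	IOSKRegionClearBufferDebug()	detach the buffer from the region
 *	IOSKMemoryUnwire()		only if SKR_MODE_PERSISTENT
 *	IOSKMemoryDiscard()		mark volatile/empty
 *	IOSKMemoryDestroy()		release the memory descriptor
 *
 * This mirrors the construction sequence in sksegment_freelist_remove().
 */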

/*
 * Remove a segment from the freelist (allocating the segment).
 */
static struct sksegment *
sksegment_freelist_remove(struct skmem_region *skr, struct sksegment *sg,
    uint32_t skmflag, boolean_t purging)
{
#pragma unused(skmflag)
	mach_vm_address_t segstart;
	IOReturn err;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
	ASSERT(sg != NULL);
	ASSERT(skr == sg->sg_region);
	ASSERT(skr->skr_reg != NULL);
	ASSERT(sg->sg_type == SKSEG_TYPE_FREE);
	ASSERT(sg->sg_index < skr->skr_seg_max_cnt);

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF does not apply when SKMEM_PANIC is set, since the
	 * caller insists on a successful allocation.
	 */
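	/*
	 * With mtbf set to N, an allocation is artificially failed
	 * whenever the millisecond uptime is an exact multiple of N,
	 * yielding a mean time between injected failures of roughly
	 * N ms.
	 */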
	if (__improbable(mtbf != 0 && !purging &&
	    (net_uptime_ms() % mtbf) == 0 &&
	    !(skmflag & SKMEM_PANIC))) {
		SK_ERR("skr \"%s\" 0x%llx sg 0x%llx MTBF failure",
		    skr->skr_name, SK_KVA(skr), SK_KVA(sg));
		net_update_uptime();
		return NULL;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	TAILQ_REMOVE(&skr->skr_seg_free, sg, sg_link);
	sg->sg_link.tqe_next = NULL;
	sg->sg_link.tqe_prev = NULL;
	RB_REMOVE(segtfreehead, &skr->skr_seg_tfree, sg);
	sg->sg_node.rbe_left = NULL;
	sg->sg_node.rbe_right = NULL;
	sg->sg_node.rbe_parent = NULL;

	ASSERT(skr->skr_seg_free_cnt != 0);
	--skr->skr_seg_free_cnt;

	/*
	 * If the region is being depopulated, then we're done.
	 */
	if (__improbable(purging)) {
		ASSERT(sg->sg_md == NULL);
		ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
		ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
		sg->sg_type = SKSEG_TYPE_DESTROYED;
		return sg;
	}

	ASSERT(sg->sg_md == NULL);
	ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
	ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);

	/* created as non-volatile (mapped) upon success */
	if ((sg->sg_md = IOSKMemoryBufferCreate(skr->skr_seg_size,
	    &skr->skr_bufspec, &segstart)) == NULL) {
		ASSERT(sg->sg_type == SKSEG_TYPE_FREE);
		if (skmflag & SKMEM_PANIC) {
			/* if the caller insists on success, then panic */
			panic_plain("\"%s\": skr 0x%p sg 0x%p (idx %u) unable "
			    "to satisfy mandatory allocation\n", skr->skr_name,
			    skr, sg, sg->sg_index);
			/* NOTREACHED */
			__builtin_unreachable();
		}
		/* reinsert this segment into the freelist */
		ASSERT(sg->sg_link.tqe_next == NULL);
		ASSERT(sg->sg_link.tqe_prev == NULL);
		TAILQ_INSERT_HEAD(&skr->skr_seg_free, sg, sg_link);
		ASSERT(sg->sg_node.rbe_left == NULL);
		ASSERT(sg->sg_node.rbe_right == NULL);
		ASSERT(sg->sg_node.rbe_parent == NULL);
		RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
		++skr->skr_seg_free_cnt;
		return NULL;
	}

	sg->sg_start = segstart;
	sg->sg_end = (segstart + skr->skr_seg_size);
	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);

	/* mark memory as non-volatile just to be consistent */
	err = IOSKMemoryReclaim(sg->sg_md);
	if (err != kIOReturnSuccess) {
		panic("Failed to reclaim md %p, err %d", sg->sg_md, err);
	}

	/* if persistent, wire down its memory now */
	if (skr->skr_mode & SKR_MODE_PERSISTENT) {
		err = IOSKMemoryWire(sg->sg_md);
		if (err != kIOReturnSuccess) {
			panic("Failed to wire md %p, err %d", sg->sg_md, err);
		}
	}

	err = IOSKRegionSetBuffer(skr->skr_reg, sg->sg_index, sg->sg_md);
	if (err != kIOReturnSuccess) {
		panic("Failed to set md %p, err %d", sg->sg_md, err);
	}

	/*
	 * Let the client wire it and insert it into the IOMMU, if
	 * applicable, then determine whether it is wired and set
	 * the state accordingly.
	 */
	if (skr->skr_seg_ctor != NULL) {
		skr->skr_seg_ctor(sg, sg->sg_md, skr->skr_private);
	}

	sg->sg_state = IOSKBufferIsWired(sg->sg_md) ?
	    SKSEG_STATE_MAPPED_WIRED : SKSEG_STATE_MAPPED;

	skr->skr_memtotal += skr->skr_seg_size;

	ASSERT(sg->sg_md != NULL);
	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);

	sg->sg_type = SKSEG_TYPE_ALLOC;
	return sg;
}
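
/*
 * For reference, the construction sequence performed above for a newly
 * allocated segment is:
 *
 *	IOSKMemoryBufferCreate()	allocate the memory descriptor
 *	IOSKMemoryReclaim()		mark non-volatile
 *	IOSKMemoryWire()		only if SKR_MODE_PERSISTENT
 *	IOSKRegionSetBuffer()		attach the buffer to the region
 *	skr_seg_ctor()			client wires, inserts IOMMU mapping
 *
 * after which sg_state is derived from IOSKBufferIsWired().
 */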

/*
 * Find the first available index and allocate a segment at that index.
 */
static struct sksegment *
sksegment_freelist_grow(struct skmem_region *skr)
{
	struct sksegment *sg = NULL;
	uint32_t i, j, idx;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
	ASSERT(skr->skr_seg_bmap_len != 0);
	ASSERT(skr->skr_seg_max_cnt != 0);

	for (i = 0; i < skr->skr_seg_bmap_len; i++) {
		bitmap_t *bmap, mask;
		uint32_t end = (BMAPSZ - 1);

		/* the last bitmap word may cover only a partial range */
		if (i == (skr->skr_seg_bmap_len - 1)) {
			end = (skr->skr_seg_max_cnt - 1) % BMAPSZ;
		}

		bmap = &skr->skr_seg_bmap[i];
		mask = BMASK64(0, end);

		j = ffsll((*bmap) & mask);
		if (j == 0) {
			continue;
		}

		--j;
		idx = (i * BMAPSZ) + j;

		sg = sksegment_alloc_with_idx(skr, idx);

		/* we're done */
		break;
	}

	ASSERT((sg != NULL) || (skr->skr_seginuse == skr->skr_seg_max_cnt));
	return sg;
}
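
/*
 * Worked example of the scan above (a sketch; BMAPSZ is the bit width
 * of one bitmap_t word, typically 64): with skr_seg_max_cnt = 70 there
 * are two bitmap words, and the last one covers only indices 64-69, so
 * end = (70 - 1) % 64 = 5 and the mask keeps bits 0-5.  If word 0 is
 * all zeroes (fully claimed) and word 1 has bit 3 set, ffsll() returns
 * 4, hence j = 3 and idx = (1 * 64) + 3 = 67.
 */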

/*
 * Create a single segment at a specific index and add it to the freelist.
 */
static struct sksegment *
sksegment_alloc_with_idx(struct skmem_region *skr, uint32_t idx)
{
	struct sksegment *sg;

	SKR_LOCK_ASSERT_HELD(skr);

	if (!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ)) {
		panic("%s: '%s' (%p) idx %u (out of %u) is already allocated",
		    __func__, skr->skr_name, (void *)skr, idx,
		    (skr->skr_seg_max_cnt - 1));
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* must not fail, since this is a blocking allocation */
	sg = sksegment_create(skr, idx);
	VERIFY(sg != NULL);
	VERIFY(!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ));

	/* populate the freelist */
	sksegment_freelist_insert(skr, sg, TRUE);
	ASSERT(sg == TAILQ_LAST(&skr->skr_seg_free, segfreehead));
#if (DEVELOPMENT || DEBUG)
	struct sksegment sg_key = { .sg_index = sg->sg_index };
	ASSERT(sg == RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key));
#endif /* (DEVELOPMENT || DEBUG) */

	SK_DF(SK_VERB_MEM_REGION, "sg %u/%u", (idx + 1), skr->skr_seg_max_cnt);

	return sg;
}

/*
 * Rescale the region's allocated-address hash table.
 */
static void
skmem_region_hash_rescale(struct skmem_region *skr)
{
	struct sksegment_bkt *old_table, *new_table;
	size_t old_size, new_size;
	uint32_t i, moved = 0;

	if (skr->skr_mode & SKR_MODE_PSEUDO) {
		ASSERT(skr->skr_hash_table == NULL);
		/* this is a no-op for a pseudo region */
		return;
	}

	ASSERT(skr->skr_hash_table != NULL);
	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_region_update_protected());

	/*
	 * To get a small average lookup time (lookup depth near 1.0),
	 * the hash table size should be roughly comparable to (though
	 * not necessarily equal to) the number of segments in use.
	 */
	new_size = MAX(skr->skr_hash_initial,
	    (1 << (flsll(3 * skr->skr_seginuse + 4) - 2)));
	new_size = MIN(skr->skr_hash_limit, new_size);
	old_size = (skr->skr_hash_mask + 1);
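	/*
	 * Worked example of the sizing heuristic: with skr_seginuse =
	 * 100, 3 * 100 + 4 = 304, flsll(304) = 9 (position of the
	 * highest set bit), so the computed term is 1 << 7 = 128, i.e.
	 * a table size of the same order as the segments in use.
	 */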

	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
		/* within a factor of two of the current size; leave as-is */
		return;
	}

	new_table = sk_alloc_type_array(struct sksegment_bkt, new_size,
	    Z_NOWAIT, skmem_tag_segment_hash);
	if (__improbable(new_table == NULL)) {
		return;
	}

	for (i = 0; i < new_size; i++) {
		TAILQ_INIT(&new_table[i].sgb_head);
	}

	SKR_LOCK(skr);

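	/*
	 * Re-read the current size under the region lock before
	 * swapping in the new table; the unlocked read above fed only
	 * the resize heuristic.
	 */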
	old_size = (skr->skr_hash_mask + 1);
	old_table = skr->skr_hash_table;

	skr->skr_hash_mask = (uint32_t)(new_size - 1);
	skr->skr_hash_table = new_table;
	skr->skr_rescale++;

	for (i = 0; i < old_size; i++) {
		struct sksegment_bkt *sgb = &old_table[i];
		struct sksegment_bkt *new_sgb;
		struct sksegment *sg;

		while ((sg = TAILQ_FIRST(&sgb->sgb_head)) != NULL) {
			TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link);
			ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
			new_sgb = SKMEM_REGION_HASH(skr, sg->sg_start);
			TAILQ_INSERT_TAIL(&new_sgb->sgb_head, sg, sg_link);
			++moved;
		}
		ASSERT(TAILQ_EMPTY(&sgb->sgb_head));
	}

	SK_DF(SK_VERB_MEM_REGION,
	    "skr 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skr),
	    (uint32_t)old_size, (uint32_t)new_size, moved);

	SKR_UNLOCK(skr);

	sk_free_type_array(struct sksegment_bkt, old_size, old_table);
}

/*
 * Apply a function to all regions.
 */
static void
skmem_region_applyall(void (*func)(struct skmem_region *))
{
	struct skmem_region *skr;

	net_update_uptime();

	SKMEM_REGION_LOCK();
	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
		func(skr);
	}
	SKMEM_REGION_UNLOCK();
}

static void
skmem_region_update(struct skmem_region *skr)
{
	SKMEM_REGION_LOCK_ASSERT_HELD();

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_region_update_protected());

	SKR_LOCK(skr);
	/*
	 * If there are threads blocked waiting for an available
	 * segment, wake them up periodically so they can issue
	 * another skmem_cache_reap() to reclaim resources cached
	 * by skmem_cache.
	 */
	if (skr->skr_seg_waiters != 0) {
		SK_DF(SK_VERB_MEM_REGION,
		    "waking up %u waiters to reclaim", skr->skr_seg_waiters);
		skr->skr_seg_waiters = 0;
		wakeup(&skr->skr_seg_free);
	}
	SKR_UNLOCK(skr);

	/*
	 * Rescale the hash table if needed.
	 */
	skmem_region_hash_rescale(skr);
}

/*
 * Thread call callback for update.
 */
static void
skmem_region_update_func(thread_call_param_t dummy, thread_call_param_t arg)
{
#pragma unused(dummy, arg)
	sk_protect_t protect;

	protect = sk_region_update_protect();
	skmem_region_applyall(skmem_region_update);
	sk_region_update_unprotect(protect);

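	/* re-arm the thread call to run again after the update interval */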
	skmem_dispatch(skmem_region_update_tc, NULL,
	    (skmem_region_update_interval * NSEC_PER_SEC));
}

boolean_t
skmem_region_for_pp(skmem_region_id_t id)
{
	int i;

	for (i = 0; i < SKMEM_PP_REGIONS; i++) {
		if (id == skmem_pp_region_ids[i]) {
			return TRUE;
		}
	}
	return FALSE;
}

void
skmem_region_get_stats(struct skmem_region *skr, struct sk_stats_region *sreg)
{
	bzero(sreg, sizeof(*sreg));

	(void) snprintf(sreg->sreg_name, sizeof(sreg->sreg_name),
	    "%s", skr->skr_name);
	uuid_copy(sreg->sreg_uuid, skr->skr_uuid);
	sreg->sreg_id = (sk_stats_region_id_t)skr->skr_id;
	sreg->sreg_mode = skr->skr_mode;

	sreg->sreg_r_seg_size = skr->skr_params.srp_r_seg_size;
	sreg->sreg_c_seg_size = skr->skr_seg_size;
	sreg->sreg_seg_cnt = skr->skr_seg_max_cnt;
	sreg->sreg_seg_objs = skr->skr_seg_objs;
	sreg->sreg_r_obj_size = skr->skr_r_obj_size;
	sreg->sreg_r_obj_cnt = skr->skr_r_obj_cnt;
	sreg->sreg_c_obj_size = skr->skr_c_obj_size;
	sreg->sreg_c_obj_cnt = skr->skr_c_obj_cnt;
	sreg->sreg_align = skr->skr_align;
	sreg->sreg_max_frags = skr->skr_max_frags;

	sreg->sreg_meminuse = skr->skr_meminuse;
	sreg->sreg_w_meminuse = skr->skr_w_meminuse;
	sreg->sreg_memtotal = skr->skr_memtotal;
	sreg->sreg_seginuse = skr->skr_seginuse;
	sreg->sreg_rescale = skr->skr_rescale;
	sreg->sreg_hash_size = (skr->skr_hash_mask + 1);
	sreg->sreg_alloc = skr->skr_alloc;
	sreg->sreg_free = skr->skr_free;
}

static size_t
skmem_region_mib_get_stats(struct skmem_region *skr, void *out, size_t len)
{
	size_t actual_space = sizeof(struct sk_stats_region);
	struct sk_stats_region *sreg = out;

	if (out == NULL || len < actual_space) {
		goto done;
	}

	skmem_region_get_stats(skr, sreg);

done:
	return actual_space;
}

static int
skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	struct skmem_region *skr;
	size_t actual_space;
	size_t buffer_space;
	size_t allocated_space;
	caddr_t buffer = NULL;
	caddr_t scan;
	int error = 0;

	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
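	/*
	 * Standard two-pass sysctl protocol: a NULL oldptr means the
	 * caller is only probing for the required size, so we skip the
	 * bounce-buffer allocation and merely tally actual_space, which
	 * SYSCTL_OUT() then reports back.
	 */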
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		allocated_space = buffer_space;
		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_region_mib);
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;

	SKMEM_REGION_LOCK();
	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
		size_t size = skmem_region_mib_get_stats(skr, scan, buffer_space);
		if (scan != NULL) {
			if (buffer_space < size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += size;
			buffer_space -= size;
		}
		actual_space += size;
	}
	SKMEM_REGION_UNLOCK();

	if (actual_space != 0) {
		int out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data(buffer, allocated_space);
	}

	return error;
}

#if SK_LOG
const char *
skmem_region_id2name(skmem_region_id_t id)
{
	const char *name;
	switch (id) {
	case SKMEM_REGION_SCHEMA:
		name = "SCHEMA";
		break;

	case SKMEM_REGION_RING:
		name = "RING";
		break;

	case SKMEM_REGION_BUF:
		name = "BUF";
		break;

	case SKMEM_REGION_RXBUF:
		name = "RXBUF";
		break;

	case SKMEM_REGION_TXBUF:
		name = "TXBUF";
		break;

	case SKMEM_REGION_UMD:
		name = "UMD";
		break;

	case SKMEM_REGION_TXAUSD:
		name = "TXAUSD";
		break;

	case SKMEM_REGION_RXFUSD:
		name = "RXFUSD";
		break;

	case SKMEM_REGION_USTATS:
		name = "USTATS";
		break;

	case SKMEM_REGION_FLOWADV:
		name = "FLOWADV";
		break;

	case SKMEM_REGION_NEXUSADV:
		name = "NEXUSADV";
		break;

	case SKMEM_REGION_SYSCTLS:
		name = "SYSCTLS";
		break;

	case SKMEM_REGION_GUARD_HEAD:
		name = "HEADGUARD";
		break;

	case SKMEM_REGION_GUARD_TAIL:
		name = "TAILGUARD";
		break;

	case SKMEM_REGION_KMD:
		name = "KMD";
		break;

	case SKMEM_REGION_RXKMD:
		name = "RXKMD";
		break;

	case SKMEM_REGION_TXKMD:
		name = "TXKMD";
		break;

	case SKMEM_REGION_TXAKSD:
		name = "TXAKSD";
		break;

	case SKMEM_REGION_RXFKSD:
		name = "RXFKSD";
		break;

	case SKMEM_REGION_KSTATS:
		name = "KSTATS";
		break;

	case SKMEM_REGION_KBFT:
		name = "KBFT";
		break;

	case SKMEM_REGION_UBFT:
		name = "UBFT";
		break;

	case SKMEM_REGION_RXKBFT:
		name = "RXKBFT";
		break;

	case SKMEM_REGION_TXKBFT:
		name = "TXKBFT";
		break;

	case SKMEM_REGION_INTRINSIC:
		name = "INTRINSIC";
		break;

	default:
		name = "UNKNOWN";
		break;
	}

	return name;
}
#endif /* SK_LOG */

#if (DEVELOPMENT || DEBUG)
uint64_t
skmem_region_get_mtbf(void)
{
	return skmem_region_mtbf;
}

void
skmem_region_set_mtbf(uint64_t newval)
{
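	/*
	 * Clamp nonzero values to [SKMEM_REGION_MTBF_MIN,
	 * SKMEM_REGION_MTBF_MAX]; zero is passed through untouched
	 * and disables MTBF-induced failures entirely.
	 */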
	if (newval < SKMEM_REGION_MTBF_MIN) {
		if (newval != 0) {
			newval = SKMEM_REGION_MTBF_MIN;
		}
	} else if (newval > SKMEM_REGION_MTBF_MAX) {
		newval = SKMEM_REGION_MTBF_MAX;
	}

	if (skmem_region_mtbf != newval) {
		atomic_set_64(&skmem_region_mtbf, newval);
		SK_ERR("MTBF set to %llu msec", skmem_region_mtbf);
	}
}

static int
skmem_region_mtbf_sysctl(struct sysctl_oid *oidp, void *arg1, int arg2,
    struct sysctl_req *req)
{
#pragma unused(oidp, arg1, arg2)
	int changed, error;
	uint64_t newval;

	_CASSERT(sizeof(skmem_region_mtbf) == sizeof(uint64_t));
	if ((error = sysctl_io_number(req, skmem_region_mtbf,
	    sizeof(uint64_t), &newval, &changed)) == 0) {
		if (changed) {
			skmem_region_set_mtbf(newval);
		}
	}
	return error;
}
#endif /* (DEVELOPMENT || DEBUG) */
