1 /*
2 * Copyright (c) 2016-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /* BEGIN CSTYLED */
30 /*
31 * A region represents a collection of one or more similarly-sized memory
32 * segments, each of which is a contiguous range of integers. A segment
33 * is either allocated or free, and is treated as disjoint from all other
34 * segments. That is, the contiguity applies only at the segment level,
35 * and a region with multiple segments is not contiguous at the region level.
36 * A segment always belongs to either the segment freelist or the
37 * allocated-address hash chain, as described below.
38 *
39 * The optional SKMEM_REGION_CR_NOREDIRECT flag indicates that the region
40 * stays intact even after a defunct. Otherwise, the segments belonging
41 * to the region will be freed at defunct time, and the span covered by
42 * the region will be redirected to zero-filled anonymous memory.
43 *
44 * Memory for a region is always created as pageable and purgeable. It is
45 * the client's responsibility to prepare (wire) it, and optionally insert
46 * it into the IOMMU, at segment construction time. When the segment is
47 * freed, the client is responsible for removing it from the IOMMU (if
48 * needed) and for completing (unwiring) it.
49 *
50 * When the region is created with SKMEM_REGION_CR_PERSISTENT, the memory
51 * is immediately wired upon allocation (segment removed from freelist).
52 * It gets unwired when memory is discarded (segment inserted to freelist).
53 *
54 * The chronological life cycle of a segment is as follows:
55 *
56 * SKSEG_STATE_DETACHED
57 * SKSEG_STATE_{MAPPED,MAPPED_WIRED}
58 * [segment allocated, useable by client]
59 * ...
60 * [client frees segment]
61 * SKSEG_STATE_{MAPPED,MAPPED_WIRED}
62 * [reclaim]
63 * SKSEG_STATE_DETACHED
64 *
65 * The region can also be marked as user-mappable (SKMEM_REGION_CR_MMAPOK);
66 * this allows it to be further marked with SKMEM_REGION_CR_UREADONLY to
67 * prevent modifications by the user task. Only user-mappable regions will
68 * be considered for inclusion during skmem_arena_mmap().
69 *
70 * Every skmem allocator has a region as its slab supplier. Each slab is
71 * exactly a segment. The allocator uses skmem_region_{alloc,free}() to
72 * create and destroy slabs.
73 *
74 * A region may be mirrored by another region; the latter acts as the master
75 * controller for both regions. Mirrored (slave) regions cannot be used
76 * directly by the skmem allocator. Region mirroring technique is used for
77 * managing shadow objects {umd,kmd} and {usd,ksd}, where an object in one
78 * region has the same size and lifetime as its shadow counterpart.
79 *
80 * CREATION/DESTRUCTION:
81 *
82 * At creation time, all segments are allocated and are immediately inserted
83 * into the freelist. Allocating a purgeable segment has very little cost,
84 * as it is not backed by physical memory until it is accessed. Immediate
85 * insertion into the freelist causes the mapping to be further torn down.
86 *
87 * At destruction time, the freelist is emptied, and each segment is then
88 * destroyed. The system will assert if it detects there are outstanding
89 * segments not yet returned to the region (not freed by the client).
90 *
91 * ALLOCATION:
92 *
93 * Allocating involves searching the freelist for a segment; if found, the
94 * segment is removed from the freelist and is inserted into the allocated-
95 * address hash chain. The address of the memory object represented by
96 * the segment is used as the hash key. The allocated-address hash chain
97 * is needed because we return the address of the memory object, and not
98 * the segment itself, to the client.
99 *
100 * DEALLOCATION:
101 *
102 * Freeing a memory object causes the chain to be searched for a matching
103 * segment. The system will assert if a segment cannot be found, since
104 * that indicates that the memory object address is invalid. Once found,
105 * the segment is removed from the allocated-address hash chain, and is
106 * inserted into the freelist.
107 *
108 * Segment allocation and deallocation can be expensive. Because of this,
109 * we expect that most clients will utilize the skmem_cache slab allocator
110 * as the frontend instead. A simplified usage sketch follows this block.
111 */
112 /* END CSTYLED */
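
/* BEGIN CSTYLED */
/*
 * Illustrative usage sketch (not compiled; the "example" name, my_seg_ctor,
 * my_seg_dtor and ctx below are hypothetical).  A client configures the
 * parameters, creates the region, carves segments out of it, and drops
 * its reference when done:
 *
 *	struct skmem_region_params srp;	// normally seeded from the defaults
 *					// for the chosen region ID
 *	srp.srp_r_obj_size = 2048;	// requested object size
 *	srp.srp_r_obj_cnt  = 1024;	// requested object count
 *	srp.srp_cflags    |= SKMEM_REGION_CR_PERSISTENT;
 *	skmem_region_params_config(&srp);	// compute effective sizes
 *
 *	struct skmem_region *skr;
 *	skr = skmem_region_create("example", &srp, my_seg_ctor, my_seg_dtor, ctx);
 *
 *	struct sksegment *sg = NULL;
 *	void *seg = skmem_region_alloc(skr, NULL, &sg, NULL, 0,
 *	    srp.srp_c_seg_size, NULL);	// 0 == blocking allocation
 *	...
 *	skmem_region_free(skr, seg, NULL);
 *	skmem_region_release(skr);	// drops the creation reference
 */
/* END CSTYLED */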
113
114 #include <skywalk/os_skywalk_private.h>
115 #define _FN_KPRINTF /* don't redefine kprintf() */
116 #include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
117
118 static void skmem_region_destroy(struct skmem_region *skr);
119 static void skmem_region_depopulate(struct skmem_region *);
120 static int sksegment_cmp(const struct sksegment *, const struct sksegment *);
121 static struct sksegment *sksegment_create(struct skmem_region *, uint32_t);
122 static void sksegment_destroy(struct skmem_region *, struct sksegment *);
123 static void sksegment_freelist_insert(struct skmem_region *,
124 struct sksegment *, boolean_t);
125 static struct sksegment *sksegment_freelist_remove(struct skmem_region *,
126 struct sksegment *, uint32_t, boolean_t);
127 static struct sksegment *sksegment_freelist_grow(struct skmem_region *);
128 static struct sksegment *sksegment_alloc_with_idx(struct skmem_region *,
129 uint32_t);
130 static void *__sized_by(seg_size) skmem_region_alloc_common(struct skmem_region *,
131 struct sksegment *, uint32_t seg_size);
132 static void *__sized_by(seg_size) skmem_region_mirror_alloc(struct skmem_region *,
133 struct sksegment *, uint32_t seg_size, struct sksegment **);
134 static void skmem_region_applyall(void (*)(struct skmem_region *));
135 static void skmem_region_update(struct skmem_region *);
136 static void skmem_region_update_func(thread_call_param_t, thread_call_param_t);
137 static inline void skmem_region_retain_locked(struct skmem_region *);
138 static inline boolean_t skmem_region_release_locked(struct skmem_region *);
139 static int skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS;
140
141 RB_PROTOTYPE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp);
142 RB_GENERATE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp);
143
144 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, region,
145 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
146 0, 0, skmem_region_mib_get_sysctl, "S,sk_stats_region",
147 "Skywalk region statistics");
148
149 static LCK_ATTR_DECLARE(skmem_region_lock_attr, 0, 0);
150 static LCK_GRP_DECLARE(skmem_region_lock_grp, "skmem_region");
151 static LCK_MTX_DECLARE_ATTR(skmem_region_lock, &skmem_region_lock_grp,
152 &skmem_region_lock_attr);
153
154 /* protected by skmem_region_lock */
155 static TAILQ_HEAD(, skmem_region) skmem_region_head;
156
157 static thread_call_t skmem_region_update_tc;
158
159 #define SKMEM_REGION_UPDATE_INTERVAL 13 /* 13 seconds */
160 static uint32_t skmem_region_update_interval = SKMEM_REGION_UPDATE_INTERVAL;
161
162 #define SKMEM_WDT_MAXTIME 30 /* # of secs before watchdog */
163 #define SKMEM_WDT_PURGE 3 /* retry purge threshold */
164
165 #if (DEVELOPMENT || DEBUG)
166 /* Mean Time Between Failures (ms) */
167 static volatile uint64_t skmem_region_mtbf;
168
169 static int skmem_region_mtbf_sysctl(struct sysctl_oid *, void *, int,
170 struct sysctl_req *);
171
172 SYSCTL_PROC(_kern_skywalk_mem, OID_AUTO, region_mtbf,
173 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
174 skmem_region_mtbf_sysctl, "Q", "Region MTBF (ms)");
175
176 SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, region_update_interval,
177 CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_region_update_interval,
178 SKMEM_REGION_UPDATE_INTERVAL, "Region update interval (sec)");
179 #endif /* (DEVELOPMENT || DEBUG) */
180
181 #define SKMEM_REGION_LOCK() \
182 lck_mtx_lock(&skmem_region_lock)
183 #define SKMEM_REGION_LOCK_ASSERT_HELD() \
184 LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_OWNED)
185 #define SKMEM_REGION_LOCK_ASSERT_NOTHELD() \
186 LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_NOTOWNED)
187 #define SKMEM_REGION_UNLOCK() \
188 lck_mtx_unlock(&skmem_region_lock)
189
190 /*
191 * Hash table bounds. Start with the initial value, and rescale up to
192 * the specified limit. Ideally we don't need a limit, but in practice
193 * this helps guard against runaways. These values should be revisited
194 * in the future and adjusted as needed.
195 */
196 #define SKMEM_REGION_HASH_INITIAL 32 /* initial hash table size */
197 #define SKMEM_REGION_HASH_LIMIT 4096 /* hash table size limit */
198
199 #define SKMEM_REGION_HASH_INDEX(_a, _s, _m) \
200 (((_a) + ((_a) >> (_s)) + ((_a) >> ((_s) << 1))) & (_m))
201 #define SKMEM_REGION_HASH(_skr, _addr) \
202 (&(_skr)->skr_hash_table[SKMEM_REGION_HASH_INDEX((uintptr_t)_addr, \
203 (_skr)->skr_hash_shift, (_skr)->skr_hash_mask)])
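
/*
 * Worked example (illustrative): with a 32KB segment size, skr_hash_shift
 * is flsll(0x8000) - 1 = 15, and the initial 32-bucket table gives a mask
 * of 0x1f.  For a segment address _a = 0x230000000:
 *
 *	_a         = 0x230000000
 *	_a >> 15   = 0x000046000
 *	_a >> 30   = 0x000000008
 *	sum & 0x1f = 8			-> bucket 8
 *
 * Folding in the address shifted by the segment-size order (and by twice
 * that) matters because consecutive segment addresses have their low
 * log2(seg_size) bits all zero; without the folding they would all hash
 * to bucket 0.
 */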
204
205 static SKMEM_TYPE_DEFINE(skr_zone, struct skmem_region);
206
207 /*
208 * XXX: This is used in only one function (skmem_region_init) after the
209 * -fbounds-safety changes were made for Skmem. We can remove this global and
210 * just make it a local variable to the function (skmem_region_init).
211 */
212 static unsigned int sg_size; /* size of zone element */
213 static struct skmem_cache *skmem_sg_cache; /* cache for sksegment */
214
215 static uint32_t skmem_seg_size = SKMEM_SEG_SIZE;
216 static uint32_t skmem_md_seg_size = SKMEM_MD_SEG_SIZE;
217 static uint32_t skmem_drv_buf_seg_size = SKMEM_DRV_BUF_SEG_SIZE;
218 static uint32_t skmem_drv_buf_seg_eff_size = SKMEM_DRV_BUF_SEG_SIZE;
219 uint32_t skmem_usr_buf_seg_size = SKMEM_USR_BUF_SEG_SIZE;
220
221 #define SKMEM_TAG_SEGMENT_BMAP "com.apple.skywalk.segment.bmap"
222 static SKMEM_TAG_DEFINE(skmem_tag_segment_bmap, SKMEM_TAG_SEGMENT_BMAP);
223
224 #define SKMEM_TAG_SEGMENT_HASH "com.apple.skywalk.segment.hash"
225 static SKMEM_TAG_DEFINE(skmem_tag_segment_hash, SKMEM_TAG_SEGMENT_HASH);
226
227 #define SKMEM_TAG_REGION_MIB "com.apple.skywalk.region.mib"
228 static SKMEM_TAG_DEFINE(skmem_tag_region_mib, SKMEM_TAG_REGION_MIB);
229
230 #define BMAPSZ 64
231
232 /* 64-bit mask with range */
233 #define BMASK64(_beg, _end) \
234 ((((uint64_t)-1) >> ((BMAPSZ - 1) - (_end))) & ~((1ULL << (_beg)) - 1))
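
/*
 * For example, BMASK64(2, 5) evaluates to
 * (0xffffffffffffffff >> (63 - 5)) & ~((1ULL << 2) - 1) = 0x3f & ~0x3 = 0x3c,
 * i.e. bits 2 through 5 (inclusive) set.
 */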
235
236 static int __skmem_region_inited = 0;
237
238 /*
239 * XXX -fbounds-safety: we added seg_size to skmem_region_alloc_common(), but
240 * this is only used by -fbounds-safety, so we add __unused if -fbounds-safety
241 * is disabled. The utility macro for that is SK_FB_ARG().
242 * We do the same for skmem_region_alloc(), with objsize.
243 */
244 #if !__has_ptrcheck
245 #define SK_FB_ARG __unused
246 #else
247 #define SK_FB_ARG
248 #endif
249
250 void
251 skmem_region_init(void)
252 {
253 boolean_t randomize_seg_size;
254
255 _CASSERT(sizeof(bitmap_t) == sizeof(uint64_t));
256 _CASSERT(BMAPSZ == (sizeof(bitmap_t) << 3));
257 _CASSERT((SKMEM_SEG_SIZE % SKMEM_PAGE_SIZE) == 0);
258 _CASSERT(SKMEM_REGION_HASH_LIMIT >= SKMEM_REGION_HASH_INITIAL);
259 ASSERT(!__skmem_region_inited);
260
261 /* enforce the ordering here */
262 _CASSERT(SKMEM_REGION_GUARD_HEAD == 0);
263 _CASSERT(SKMEM_REGION_SCHEMA == 1);
264 _CASSERT(SKMEM_REGION_RING == 2);
265 _CASSERT(SKMEM_REGION_BUF_DEF == 3);
266 _CASSERT(SKMEM_REGION_BUF_LARGE == 4);
267 _CASSERT(SKMEM_REGION_RXBUF_DEF == 5);
268 _CASSERT(SKMEM_REGION_RXBUF_LARGE == 6);
269 _CASSERT(SKMEM_REGION_TXBUF_DEF == 7);
270 _CASSERT(SKMEM_REGION_TXBUF_LARGE == 8);
271 _CASSERT(SKMEM_REGION_UMD == 9);
272 _CASSERT(SKMEM_REGION_TXAUSD == 10);
273 _CASSERT(SKMEM_REGION_RXFUSD == 11);
274 _CASSERT(SKMEM_REGION_UBFT == 12);
275 _CASSERT(SKMEM_REGION_USTATS == 13);
276 _CASSERT(SKMEM_REGION_FLOWADV == 14);
277 _CASSERT(SKMEM_REGION_NEXUSADV == 15);
278 _CASSERT(SKMEM_REGION_SYSCTLS == 16);
279 _CASSERT(SKMEM_REGION_GUARD_TAIL == 17);
280 _CASSERT(SKMEM_REGION_KMD == 18);
281 _CASSERT(SKMEM_REGION_RXKMD == 19);
282 _CASSERT(SKMEM_REGION_TXKMD == 20);
283 _CASSERT(SKMEM_REGION_KBFT == 21);
284 _CASSERT(SKMEM_REGION_RXKBFT == 22);
285 _CASSERT(SKMEM_REGION_TXKBFT == 23);
286 _CASSERT(SKMEM_REGION_TXAKSD == 24);
287 _CASSERT(SKMEM_REGION_RXFKSD == 25);
288 _CASSERT(SKMEM_REGION_KSTATS == 26);
289 _CASSERT(SKMEM_REGION_INTRINSIC == 27);
290
291 _CASSERT(SREG_GUARD_HEAD == SKMEM_REGION_GUARD_HEAD);
292 _CASSERT(SREG_SCHEMA == SKMEM_REGION_SCHEMA);
293 _CASSERT(SREG_RING == SKMEM_REGION_RING);
294 _CASSERT(SREG_BUF_DEF == SKMEM_REGION_BUF_DEF);
295 _CASSERT(SREG_BUF_LARGE == SKMEM_REGION_BUF_LARGE);
296 _CASSERT(SREG_RXBUF_DEF == SKMEM_REGION_RXBUF_DEF);
297 _CASSERT(SREG_RXBUF_LARGE == SKMEM_REGION_RXBUF_LARGE);
298 _CASSERT(SREG_TXBUF_DEF == SKMEM_REGION_TXBUF_DEF);
299 _CASSERT(SREG_TXBUF_LARGE == SKMEM_REGION_TXBUF_LARGE);
300 _CASSERT(SREG_UMD == SKMEM_REGION_UMD);
301 _CASSERT(SREG_TXAUSD == SKMEM_REGION_TXAUSD);
302 _CASSERT(SREG_RXFUSD == SKMEM_REGION_RXFUSD);
303 _CASSERT(SREG_UBFT == SKMEM_REGION_UBFT);
304 _CASSERT(SREG_USTATS == SKMEM_REGION_USTATS);
305 _CASSERT(SREG_FLOWADV == SKMEM_REGION_FLOWADV);
306 _CASSERT(SREG_NEXUSADV == SKMEM_REGION_NEXUSADV);
307 _CASSERT(SREG_SYSCTLS == SKMEM_REGION_SYSCTLS);
308 _CASSERT(SREG_GUARD_TAIL == SKMEM_REGION_GUARD_TAIL);
309 _CASSERT(SREG_KMD == SKMEM_REGION_KMD);
310 _CASSERT(SREG_RXKMD == SKMEM_REGION_RXKMD);
311 _CASSERT(SREG_TXKMD == SKMEM_REGION_TXKMD);
312 _CASSERT(SREG_KBFT == SKMEM_REGION_KBFT);
313 _CASSERT(SREG_RXKBFT == SKMEM_REGION_RXKBFT);
314 _CASSERT(SREG_TXKBFT == SKMEM_REGION_TXKBFT);
315 _CASSERT(SREG_TXAKSD == SKMEM_REGION_TXAKSD);
316 _CASSERT(SREG_RXFKSD == SKMEM_REGION_RXFKSD);
317 _CASSERT(SREG_KSTATS == SKMEM_REGION_KSTATS);
318
319 _CASSERT(SKR_MODE_NOREDIRECT == SREG_MODE_NOREDIRECT);
320 _CASSERT(SKR_MODE_MMAPOK == SREG_MODE_MMAPOK);
321 _CASSERT(SKR_MODE_UREADONLY == SREG_MODE_UREADONLY);
322 _CASSERT(SKR_MODE_KREADONLY == SREG_MODE_KREADONLY);
323 _CASSERT(SKR_MODE_PERSISTENT == SREG_MODE_PERSISTENT);
324 _CASSERT(SKR_MODE_MONOLITHIC == SREG_MODE_MONOLITHIC);
325 _CASSERT(SKR_MODE_NOMAGAZINES == SREG_MODE_NOMAGAZINES);
326 _CASSERT(SKR_MODE_NOCACHE == SREG_MODE_NOCACHE);
327 _CASSERT(SKR_MODE_IODIR_IN == SREG_MODE_IODIR_IN);
328 _CASSERT(SKR_MODE_IODIR_OUT == SREG_MODE_IODIR_OUT);
329 _CASSERT(SKR_MODE_GUARD == SREG_MODE_GUARD);
330 _CASSERT(SKR_MODE_SEGPHYSCONTIG == SREG_MODE_SEGPHYSCONTIG);
331 _CASSERT(SKR_MODE_SHAREOK == SREG_MODE_SHAREOK);
332 _CASSERT(SKR_MODE_PUREDATA == SREG_MODE_PUREDATA);
333 _CASSERT(SKR_MODE_PSEUDO == SREG_MODE_PSEUDO);
334 _CASSERT(SKR_MODE_THREADSAFE == SREG_MODE_THREADSAFE);
335 _CASSERT(SKR_MODE_SLAB == SREG_MODE_SLAB);
336 _CASSERT(SKR_MODE_MIRRORED == SREG_MODE_MIRRORED);
337
338 (void) PE_parse_boot_argn("skmem_seg_size", &skmem_seg_size,
339 sizeof(skmem_seg_size));
340 if (skmem_seg_size < SKMEM_MIN_SEG_SIZE) {
341 skmem_seg_size = SKMEM_MIN_SEG_SIZE;
342 }
343 skmem_seg_size = (uint32_t)P2ROUNDUP(skmem_seg_size,
344 SKMEM_MIN_SEG_SIZE);
345 VERIFY(skmem_seg_size != 0 && (skmem_seg_size % SKMEM_PAGE_SIZE) == 0);
346
347 (void) PE_parse_boot_argn("skmem_md_seg_size", &skmem_md_seg_size,
348 sizeof(skmem_md_seg_size));
349 if (skmem_md_seg_size < skmem_seg_size) {
350 skmem_md_seg_size = skmem_seg_size;
351 }
352 skmem_md_seg_size = (uint32_t)P2ROUNDUP(skmem_md_seg_size,
353 SKMEM_MIN_SEG_SIZE);
354 VERIFY((skmem_md_seg_size % SKMEM_PAGE_SIZE) == 0);
355
356 /*
357 * If set via boot-args, honor it and don't randomize.
358 */
359 randomize_seg_size = !PE_parse_boot_argn("skmem_drv_buf_seg_size",
360 &skmem_drv_buf_seg_size, sizeof(skmem_drv_buf_seg_size));
361 if (skmem_drv_buf_seg_size < skmem_seg_size) {
362 skmem_drv_buf_seg_size = skmem_seg_size;
363 }
364 skmem_drv_buf_seg_size = skmem_drv_buf_seg_eff_size =
365 (uint32_t)P2ROUNDUP(skmem_drv_buf_seg_size, SKMEM_MIN_SEG_SIZE);
366 VERIFY((skmem_drv_buf_seg_size % SKMEM_PAGE_SIZE) == 0);
367
368 /*
369 * Randomize the driver buffer segment size; here we choose
370 * a SKMEM_MIN_SEG_SIZE multiplier to bump the value up by.
371 * Set this as the effective driver buffer segment size.
372 */
373 if (randomize_seg_size) {
374 uint32_t sm;
375 read_frandom(&sm, sizeof(sm));
376 skmem_drv_buf_seg_eff_size +=
377 (SKMEM_MIN_SEG_SIZE * (sm % SKMEM_DRV_BUF_SEG_MULTIPLIER));
378 VERIFY((skmem_drv_buf_seg_eff_size % SKMEM_MIN_SEG_SIZE) == 0);
379 }
380 VERIFY(skmem_drv_buf_seg_eff_size >= skmem_drv_buf_seg_size);
381
382 (void) PE_parse_boot_argn("skmem_usr_buf_seg_size",
383 &skmem_usr_buf_seg_size, sizeof(skmem_usr_buf_seg_size));
384 if (skmem_usr_buf_seg_size < skmem_seg_size) {
385 skmem_usr_buf_seg_size = skmem_seg_size;
386 }
387 skmem_usr_buf_seg_size = (uint32_t)P2ROUNDUP(skmem_usr_buf_seg_size,
388 SKMEM_MIN_SEG_SIZE);
389 VERIFY((skmem_usr_buf_seg_size % SKMEM_PAGE_SIZE) == 0);
390
391 SK_ERR("seg_size %u, md_seg_size %u, drv_buf_seg_size %u [eff %u], "
392 "usr_buf_seg_size %u", skmem_seg_size, skmem_md_seg_size,
393 skmem_drv_buf_seg_size, skmem_drv_buf_seg_eff_size,
394 skmem_usr_buf_seg_size);
395
396 TAILQ_INIT(&skmem_region_head);
397
398 skmem_region_update_tc =
399 thread_call_allocate_with_options(skmem_region_update_func,
400 NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
401 if (skmem_region_update_tc == NULL) {
402 panic("%s: thread_call_allocate failed", __func__);
403 /* NOTREACHED */
404 __builtin_unreachable();
405 }
406
407 sg_size = sizeof(struct sksegment);
408 skmem_sg_cache = skmem_cache_create("sg", sg_size,
409 sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);
410
411 /* and start the periodic region update machinery */
412 skmem_dispatch(skmem_region_update_tc, NULL,
413 (skmem_region_update_interval * NSEC_PER_SEC));
414
415 __skmem_region_inited = 1;
416 }
417
418 void
419 skmem_region_fini(void)
420 {
421 if (__skmem_region_inited) {
422 ASSERT(TAILQ_EMPTY(&skmem_region_head));
423
424 if (skmem_region_update_tc != NULL) {
425 (void) thread_call_cancel_wait(skmem_region_update_tc);
426 (void) thread_call_free(skmem_region_update_tc);
427 skmem_region_update_tc = NULL;
428 }
429
430 if (skmem_sg_cache != NULL) {
431 skmem_cache_destroy(skmem_sg_cache);
432 skmem_sg_cache = NULL;
433 }
434
435 __skmem_region_inited = 0;
436 }
437 }
438
439 /*
440 * Reap internal caches.
441 */
442 void
443 skmem_region_reap_caches(boolean_t purge)
444 {
445 skmem_cache_reap_now(skmem_sg_cache, purge);
446 }
447
448 /*
449 * Configure and compute the parameters of a region.
450 */
451 void
452 skmem_region_params_config(struct skmem_region_params *srp)
453 {
454 uint32_t cache_line_size = skmem_cpu_cache_line_size();
455 size_t seglim, segsize, segcnt;
456 size_t objsize, objcnt;
457
458 ASSERT(srp->srp_id < SKMEM_REGIONS);
459
460 /*
461 * If the magazines layer is disabled system-wide, override
462 * the region parameter here. This will effectively reduce
463 * the number of requested objects computed below. Note that
464 * the region may have already been configured to exclude
465 * magazines in the default skmem_regions[] array.
466 */
467 if (!skmem_allow_magazines()) {
468 srp->srp_cflags |= SKMEM_REGION_CR_NOMAGAZINES;
469 }
470
471 objsize = srp->srp_r_obj_size;
472 ASSERT(objsize != 0);
473 objcnt = srp->srp_r_obj_cnt;
474 ASSERT(objcnt != 0);
475
476 if (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO) {
477 size_t align = srp->srp_align;
478
479 VERIFY(align != 0 && (align % SKMEM_CACHE_ALIGN) == 0);
480 VERIFY(powerof2(align));
481 objsize = MAX(objsize, sizeof(uint64_t));
482 #if KASAN
483 /*
484 * When KASAN is enabled, the zone allocator adjusts the
485 * element size to include the redzone regions, in which
486 * case we assume that the elements won't start on the
487 * alignment boundary and thus need to do some fix-ups.
488 * These include increasing the effective object size
489 * which adds at least 16 bytes to the original size.
490 */
491 objsize += sizeof(uint64_t) + align;
492 #endif /* KASAN */
493 objsize = P2ROUNDUP(objsize, align);
494
495 segsize = objsize;
496 srp->srp_r_seg_size = (uint32_t)segsize;
497 segcnt = objcnt;
498 goto done;
499 } else {
500 /* objects are always aligned at CPU cache line size */
501 srp->srp_align = cache_line_size;
502 }
503
504 /*
505 * Start with the default segment size for the region, and compute the
506 * effective segment size (to the nearest SKMEM_MIN_SEG_SIZE). If the
507 * object size is greater, then we adjust the segment size to the next
508 * multiple of the effective size larger than the object size.
509 */
510 if (srp->srp_r_seg_size == 0) {
511 switch (srp->srp_id) {
512 case SKMEM_REGION_UMD:
513 case SKMEM_REGION_KMD:
514 case SKMEM_REGION_RXKMD:
515 case SKMEM_REGION_TXKMD:
516 srp->srp_r_seg_size = skmem_md_seg_size;
517 break;
518
519 case SKMEM_REGION_BUF_DEF:
520 case SKMEM_REGION_RXBUF_DEF:
521 case SKMEM_REGION_TXBUF_DEF:
522 /*
523 * Use the effective driver buffer segment size,
524 * since it reflects any randomization done at
525 * skmem_region_init() time.
526 */
527 srp->srp_r_seg_size = skmem_drv_buf_seg_eff_size;
528 break;
529
530 default:
531 srp->srp_r_seg_size = skmem_seg_size;
532 break;
533 }
534 } else {
535 srp->srp_r_seg_size = (uint32_t)P2ROUNDUP(srp->srp_r_seg_size,
536 SKMEM_MIN_SEG_SIZE);
537 }
538
539 seglim = srp->srp_r_seg_size;
540 VERIFY(seglim != 0 && (seglim % SKMEM_PAGE_SIZE) == 0);
541
542 SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu objcnt %zu",
543 srp->srp_name, seglim, objsize, objcnt);
544
545 /*
546 * Make sure the object size is a multiple of the CPU cache
547 * line size, and that we can evenly divide the segment size.
548 */
549 if (!((objsize < cache_line_size) && (objsize < seglim) &&
550 ((cache_line_size % objsize) == 0) && ((seglim % objsize) == 0))) {
551 objsize = P2ROUNDUP(objsize, cache_line_size);
552 while (objsize < seglim && (seglim % objsize) != 0) {
553 SK_DF(SK_VERB_MEM, "%s: objsize %zu -> %zu",
554 srp->srp_name, objsize, objsize + cache_line_size);
555 objsize += cache_line_size;
556 }
557 }
558
559 /* segment must be larger than object */
560 while (objsize > seglim) {
561 SK_DF(SK_VERB_MEM, "%s: seglim %zu -> %zu", srp->srp_name,
562 seglim, seglim + SKMEM_MIN_SEG_SIZE);
563 seglim += SKMEM_MIN_SEG_SIZE;
564 }
565
566 /*
567 * Take into account worst-case per-CPU cached
568 * objects if this region is configured for it.
569 */
570 if (!(srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES)) {
571 uint32_t magazine_max_objs =
572 skmem_cache_magazine_max((uint32_t)objsize);
573 SK_DF(SK_VERB_MEM, "%s: objcnt %zu -> %zu", srp->srp_name,
574 objcnt, objcnt + magazine_max_objs);
575 objcnt += magazine_max_objs;
576 }
577
578 SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu "
579 "objcnt %zu", srp->srp_name, seglim, objsize, objcnt);
580
581 segsize = P2ROUNDUP(objsize * objcnt, SKMEM_MIN_SEG_SIZE);
582 if (seglim > segsize) {
583 /*
584 * If the segment limit is larger than what we need,
585 * avoid memory wastage by shrinking it.
586 */
587 while (seglim > segsize && seglim > SKMEM_MIN_SEG_SIZE) {
588 VERIFY(seglim >= SKMEM_MIN_SEG_SIZE);
589 SK_DF(SK_VERB_MEM,
590 "%s: segsize %zu (%zu*%zu) seglim [-] %zu -> %zu",
591 srp->srp_name, segsize, objsize, objcnt, seglim,
592 P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE,
593 SKMEM_MIN_SEG_SIZE));
594 seglim = P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE,
595 SKMEM_MIN_SEG_SIZE);
596 }
597
598 /* adjust segment size */
599 segsize = seglim;
600 } else if (seglim < segsize) {
601 size_t oseglim = seglim;
602 /*
603 * If the segment limit is less than the segment size,
604 * see if increasing it slightly (up to 1.5x the segment
605 * size) would allow us to avoid allocating too many
606 * extra objects (due to excessive segment count).
607 */
608 while (seglim < segsize && (segsize % seglim) != 0) {
609 SK_DF(SK_VERB_MEM,
610 "%s: segsize %zu (%zu*%zu) seglim [+] %zu -> %zu",
611 srp->srp_name, segsize, objsize, objcnt, seglim,
612 (seglim + SKMEM_MIN_SEG_SIZE));
613 seglim += SKMEM_MIN_SEG_SIZE;
614 if (seglim >= (oseglim + (oseglim >> 1))) {
615 break;
616 }
617 }
618
619 /* can't use P2ROUNDUP since seglim may not be power of 2 */
620 segsize = SK_ROUNDUP(segsize, seglim);
621 }
622 ASSERT(segsize != 0 && (segsize % seglim) == 0);
623
624 SK_DF(SK_VERB_MEM, "%s: segsize %zu seglim %zu",
625 srp->srp_name, segsize, seglim);
626
627 /* compute segment count, and recompute segment size */
628 if (srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) {
629 segcnt = 1;
630 } else {
631 /*
632 * The adjustments above were done in increments of
633 * SKMEM_MIN_SEG_SIZE. If the object size is greater
634 * than that, ensure that the segment size is a multiple
635 * of the object size.
636 */
637 if (objsize > SKMEM_MIN_SEG_SIZE) {
638 ASSERT(seglim >= objsize);
639 if ((seglim % objsize) != 0) {
640 seglim += (seglim - objsize);
641 }
642 /* recompute segsize; see SK_ROUNDUP comment above */
643 segsize = SK_ROUNDUP(segsize, seglim);
644 }
645
646 segcnt = MAX(1, (segsize / seglim));
647 segsize /= segcnt;
648 }
649
650 SK_DF(SK_VERB_MEM, "%s: segcnt %zu segsize %zu",
651 srp->srp_name, segcnt, segsize);
652
653 /* recompute object count to avoid wastage */
654 objcnt = (segsize * segcnt) / objsize;
655 ASSERT(objcnt != 0);
656 done:
657 srp->srp_c_obj_size = (uint32_t)objsize;
658 srp->srp_c_obj_cnt = (uint32_t)objcnt;
659 srp->srp_c_seg_size = (uint32_t)segsize;
660 srp->srp_seg_cnt = (uint32_t)segcnt;
661
662 SK_DF(SK_VERB_MEM, "%s: objsize %zu objcnt %zu segcnt %zu segsize %zu",
663 srp->srp_name, objsize, objcnt, segcnt, segsize);
664
665 #if SK_LOG
666 if (__improbable(sk_verbose != 0)) {
667 char label[32];
668 (void) snprintf(label, sizeof(label), "REGION_%s:",
669 skmem_region_id2name(srp->srp_id));
670 SK_D("%-16s o:[%4u x %6u -> %4u x %6u]", label,
671 (uint32_t)srp->srp_r_obj_cnt,
672 (uint32_t)srp->srp_r_obj_size,
673 (uint32_t)srp->srp_c_obj_cnt,
674 (uint32_t)srp->srp_c_obj_size);
675 }
676 #endif /* SK_LOG */
677 }
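
/*
 * Worked example (illustrative; the constants involved vary by platform).
 * Assume a request for 1000 objects of 2048 bytes, a 128-byte CPU cache
 * line, a 32KB default segment size, a 16KB SKMEM_MIN_SEG_SIZE and
 * magazines disabled:
 *
 *	objsize stays 2048	(cache-line aligned, evenly divides 32KB)
 *	segsize = P2ROUNDUP(2048 * 1000, 16KB)	= 2,048,000
 *	seglim < segsize and 2,048,000 % 32KB != 0, so seglim grows to 48KB
 *	segsize = SK_ROUNDUP(2,048,000, 48KB)	= 2,064,384
 *	segcnt  = 2,064,384 / 48KB = 42;  segsize becomes 48KB per segment
 *	objcnt  = (48KB * 42) / 2048 = 1008
 *
 * i.e. the request is satisfied by 42 segments of 48KB each, yielding
 * 1008 objects rather than the 1000 requested, so no partial segment is
 * left unused.
 */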
678
679 /*
680 * Create a region.
681 */
682 struct skmem_region *
683 skmem_region_create(const char *name, struct skmem_region_params *srp,
684 sksegment_ctor_fn_t ctor, sksegment_dtor_fn_t dtor, void *private)
685 {
686 boolean_t pseudo = (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO);
687 uint32_t cflags = srp->srp_cflags;
688 struct skmem_region *skr;
689 uint32_t i;
690
691 ASSERT(srp->srp_id < SKMEM_REGIONS);
692 ASSERT(srp->srp_c_seg_size != 0 &&
693 (pseudo || (srp->srp_c_seg_size % SKMEM_PAGE_SIZE) == 0));
694 ASSERT(srp->srp_seg_cnt != 0);
695 ASSERT(srp->srp_c_obj_cnt == 1 ||
696 (srp->srp_c_seg_size % srp->srp_c_obj_size) == 0);
697 ASSERT(srp->srp_c_obj_size <= srp->srp_c_seg_size);
698
699 skr = zalloc_flags(skr_zone, Z_WAITOK | Z_ZERO);
700 skr->skr_params.srp_r_seg_size = srp->srp_r_seg_size;
701 skr->skr_seg_size = srp->srp_c_seg_size;
702 skr->skr_size = (srp->srp_c_seg_size * srp->srp_seg_cnt);
703 skr->skr_seg_objs = (srp->srp_c_seg_size / srp->srp_c_obj_size);
704
705 if (!pseudo) {
706 skr->skr_seg_max_cnt = srp->srp_seg_cnt;
707
708 /* set alignment to CPU cache line size */
709 skr->skr_params.srp_align = skmem_cpu_cache_line_size();
710
711 /* allocate the allocated-address hash chain */
712 skr->skr_hash_initial = SKMEM_REGION_HASH_INITIAL;
713 skr->skr_hash_limit = SKMEM_REGION_HASH_LIMIT;
714 uint32_t size = skr->skr_hash_initial;
715 skr->skr_hash_table = sk_alloc_type_array(struct sksegment_bkt,
716 size, Z_WAITOK | Z_NOFAIL,
717 skmem_tag_segment_hash);
718 skr->skr_hash_size = size;
719 skr->skr_hash_mask = (skr->skr_hash_initial - 1);
720 skr->skr_hash_shift = flsll(srp->srp_c_seg_size) - 1;
721
722 for (i = 0; i < (skr->skr_hash_mask + 1); i++) {
723 TAILQ_INIT(&skr->skr_hash_table[i].sgb_head);
724 }
725 } else {
726 /* this upper bound doesn't apply */
727 skr->skr_seg_max_cnt = 0;
728
729 /* pick up value set by skmem_region_params_config() */
730 skr->skr_params.srp_align = srp->srp_align;
731 }
732
733 skr->skr_r_obj_size = srp->srp_r_obj_size;
734 skr->skr_r_obj_cnt = srp->srp_r_obj_cnt;
735 skr->skr_c_obj_size = srp->srp_c_obj_size;
736 skr->skr_c_obj_cnt = srp->srp_c_obj_cnt;
737
738 skr->skr_params.srp_md_type = srp->srp_md_type;
739 skr->skr_params.srp_md_subtype = srp->srp_md_subtype;
740 skr->skr_params.srp_max_frags = srp->srp_max_frags;
741
742 skr->skr_seg_ctor = ctor;
743 skr->skr_seg_dtor = dtor;
744 skr->skr_private = private;
745
746 lck_mtx_init(&skr->skr_lock, &skmem_region_lock_grp,
747 &skmem_region_lock_attr);
748
749 TAILQ_INIT(&skr->skr_seg_free);
750 RB_INIT(&skr->skr_seg_tfree);
751
752 skr->skr_id = srp->srp_id;
753 uuid_generate_random(skr->skr_uuid);
754 (void) snprintf(skr->skr_name, sizeof(skr->skr_name),
755 "%s.%s.%s", SKMEM_REGION_PREFIX, srp->srp_name, name);
756
757 SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ",
758 skr->skr_name, SK_KVA(skr));
759
760 /* sanity check */
761 ASSERT(!(cflags & SKMEM_REGION_CR_GUARD) ||
762 !(cflags & (SKMEM_REGION_CR_KREADONLY | SKMEM_REGION_CR_UREADONLY |
763 SKMEM_REGION_CR_PERSISTENT | SKMEM_REGION_CR_SHAREOK |
764 SKMEM_REGION_CR_IODIR_IN | SKMEM_REGION_CR_IODIR_OUT |
765 SKMEM_REGION_CR_PUREDATA)));
766
767 skr->skr_cflags = cflags;
768 if (cflags & SKMEM_REGION_CR_NOREDIRECT) {
769 skr->skr_mode |= SKR_MODE_NOREDIRECT;
770 }
771 if (cflags & SKMEM_REGION_CR_MMAPOK) {
772 skr->skr_mode |= SKR_MODE_MMAPOK;
773 }
774 if ((cflags & SKMEM_REGION_CR_MMAPOK) &&
775 (cflags & SKMEM_REGION_CR_UREADONLY)) {
776 skr->skr_mode |= SKR_MODE_UREADONLY;
777 }
778 if (cflags & SKMEM_REGION_CR_KREADONLY) {
779 skr->skr_mode |= SKR_MODE_KREADONLY;
780 }
781 if (cflags & SKMEM_REGION_CR_PERSISTENT) {
782 skr->skr_mode |= SKR_MODE_PERSISTENT;
783 }
784 if (cflags & SKMEM_REGION_CR_MONOLITHIC) {
785 skr->skr_mode |= SKR_MODE_MONOLITHIC;
786 }
787 if (cflags & SKMEM_REGION_CR_NOMAGAZINES) {
788 skr->skr_mode |= SKR_MODE_NOMAGAZINES;
789 }
790 if (cflags & SKMEM_REGION_CR_NOCACHE) {
791 skr->skr_mode |= SKR_MODE_NOCACHE;
792 }
793 if (cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) {
794 skr->skr_mode |= SKR_MODE_SEGPHYSCONTIG;
795 }
796 if (cflags & SKMEM_REGION_CR_SHAREOK) {
797 skr->skr_mode |= SKR_MODE_SHAREOK;
798 }
799 if (cflags & SKMEM_REGION_CR_IODIR_IN) {
800 skr->skr_mode |= SKR_MODE_IODIR_IN;
801 }
802 if (cflags & SKMEM_REGION_CR_IODIR_OUT) {
803 skr->skr_mode |= SKR_MODE_IODIR_OUT;
804 }
805 if (cflags & SKMEM_REGION_CR_GUARD) {
806 skr->skr_mode |= SKR_MODE_GUARD;
807 }
808 if (cflags & SKMEM_REGION_CR_PUREDATA) {
809 skr->skr_mode |= SKR_MODE_PUREDATA;
810 }
811 if (cflags & SKMEM_REGION_CR_PSEUDO) {
812 skr->skr_mode |= SKR_MODE_PSEUDO;
813 }
814 if (cflags & SKMEM_REGION_CR_THREADSAFE) {
815 skr->skr_mode |= SKR_MODE_THREADSAFE;
816 }
817
818 if (cflags & SKMEM_REGION_CR_MEMTAG) {
819 skr->skr_mode |= SKR_MODE_MEMTAG;
820 }
821
822 #if XNU_TARGET_OS_OSX
823 /*
824 * Mark all regions as persistent except for the guard and Intrinsic
825 * regions.
826 * This is to ensure that kernel threads won't be faulting-in while
827 * accessing these memory regions. We have observed various kinds of
828 * kernel panics due to kernel threads faulting on non-wired memory
829 * access when the VM subsystem is not in a state to swap-in the page.
830 */
831 if (!((skr->skr_mode & SKR_MODE_PSEUDO) ||
832 (skr->skr_mode & SKR_MODE_GUARD))) {
833 skr->skr_mode |= SKR_MODE_PERSISTENT;
834 }
835 #endif /* XNU_TARGET_OS_OSX */
836
837 /* SKR_MODE_UREADONLY only takes effect for user task mapping */
838 skr->skr_bufspec.user_writable = !(skr->skr_mode & SKR_MODE_UREADONLY);
839 skr->skr_bufspec.kernel_writable = !(skr->skr_mode & SKR_MODE_KREADONLY);
840 skr->skr_bufspec.purgeable = TRUE;
841 skr->skr_bufspec.inhibitCache = !!(skr->skr_mode & SKR_MODE_NOCACHE);
842 skr->skr_bufspec.physcontig = (skr->skr_mode & SKR_MODE_SEGPHYSCONTIG);
843 skr->skr_bufspec.iodir_in = !!(skr->skr_mode & SKR_MODE_IODIR_IN);
844 skr->skr_bufspec.iodir_out = !!(skr->skr_mode & SKR_MODE_IODIR_OUT);
845 skr->skr_bufspec.puredata = !!(skr->skr_mode & SKR_MODE_PUREDATA);
846 skr->skr_bufspec.threadSafe = !!(skr->skr_mode & SKR_MODE_THREADSAFE);
847 skr->skr_regspec.noRedirect = !!(skr->skr_mode & SKR_MODE_NOREDIRECT);
848
849 /* allocate segment bitmaps */
850 if (!(skr->skr_mode & SKR_MODE_PSEUDO)) {
851 ASSERT(skr->skr_seg_max_cnt != 0);
852 skr->skr_seg_bmap_len = BITMAP_LEN(skr->skr_seg_max_cnt);
853 size_t size = BITMAP_SIZE(skr->skr_seg_max_cnt);
854 skr->skr_seg_bmap = sk_alloc_data(size,
855 Z_WAITOK | Z_NOFAIL, skmem_tag_segment_bmap);
856 skr->skr_seg_bmap_size = size;
857 ASSERT(BITMAP_SIZE(skr->skr_seg_max_cnt) ==
858 (skr->skr_seg_bmap_len * sizeof(*skr->skr_seg_bmap)));
859
860 /* mark all bitmaps as free (bit set) */
861 bitmap_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt);
862 }
863
864 /*
865 * Populate the freelist by allocating all segments for the
866 * region, which will be mapped but not faulted-in, and then
867 * immediately insert each into the freelist. That will in
868 * turn unmap the segment's memory object.
869 */
870 SKR_LOCK(skr);
871 if (skr->skr_mode & SKR_MODE_PSEUDO) {
872 char zone_name[64];
873 (void) snprintf(zone_name, sizeof(zone_name), "%s.reg.%s",
874 SKMEM_ZONE_PREFIX, name);
875 skr->skr_zreg = zone_create(zone_name, skr->skr_c_obj_size,
876 ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
877 } else {
878 /* create a backing IOSKRegion object */
879 if ((skr->skr_reg = IOSKRegionCreate(&skr->skr_regspec,
880 (IOSKSize)skr->skr_seg_size,
881 (IOSKCount)skr->skr_seg_max_cnt)) == NULL) {
882 SK_ERR("\%s\": [%u * %u] cflags 0x%b skr_reg failed",
883 skr->skr_name, (uint32_t)skr->skr_seg_size,
884 (uint32_t)skr->skr_seg_max_cnt, skr->skr_cflags,
885 SKMEM_REGION_CR_BITS);
886 goto failed;
887 }
888 }
889
890 ASSERT(skr->skr_seg_objs != 0);
891
892 ++skr->skr_refcnt; /* for caller */
893 SKR_UNLOCK(skr);
894
895 SKMEM_REGION_LOCK();
896 TAILQ_INSERT_TAIL(&skmem_region_head, skr, skr_link);
897 SKMEM_REGION_UNLOCK();
898
899 SK_DF(SK_VERB_MEM_REGION,
900 " [TOTAL] seg (%u*%u) obj (%u*%u) cflags 0x%b",
901 (uint32_t)skr->skr_seg_size, (uint32_t)skr->skr_seg_max_cnt,
902 (uint32_t)skr->skr_c_obj_size, (uint32_t)skr->skr_c_obj_cnt,
903 skr->skr_cflags, SKMEM_REGION_CR_BITS);
904
905 return skr;
906
907 failed:
908 SKR_LOCK_ASSERT_HELD(skr);
909 skmem_region_destroy(skr);
910
911 return NULL;
912 }
913
914 /*
915 * Destroy a region.
916 */
917 static void
918 skmem_region_destroy(struct skmem_region *skr)
919 {
920 struct skmem_region *mskr;
921
922 SKR_LOCK_ASSERT_HELD(skr);
923
924 SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx",
925 skr->skr_name, SK_KVA(skr));
926
927 /*
928 * Panic if we detect there are unfreed segments; the caller
929 * destroying this region is responsible for ensuring that all
930 * allocated segments have been freed prior to getting here.
931 */
932 ASSERT(skr->skr_refcnt == 0);
933 if (skr->skr_seginuse != 0) {
934 panic("%s: '%s' (%p) not empty (%u unfreed)",
935 __func__, skr->skr_name, (void *)skr, skr->skr_seginuse);
936 /* NOTREACHED */
937 __builtin_unreachable();
938 }
939
940 if (skr->skr_link.tqe_next != NULL || skr->skr_link.tqe_prev != NULL) {
941 SKR_UNLOCK(skr);
942 SKMEM_REGION_LOCK();
943 TAILQ_REMOVE(&skmem_region_head, skr, skr_link);
944 SKMEM_REGION_UNLOCK();
945 SKR_LOCK(skr);
946 ASSERT(skr->skr_refcnt == 0);
947 }
948
949 /*
950 * Undo what's done earlier at region creation time.
951 */
952 skmem_region_depopulate(skr);
953 ASSERT(TAILQ_EMPTY(&skr->skr_seg_free));
954 ASSERT(RB_EMPTY(&skr->skr_seg_tfree));
955 ASSERT(skr->skr_seg_free_cnt == 0);
956
957 if (skr->skr_reg != NULL) {
958 ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
959 IOSKRegionDestroy(skr->skr_reg);
960 skr->skr_reg = NULL;
961 }
962
963 if (skr->skr_zreg != NULL) {
964 ASSERT(skr->skr_mode & SKR_MODE_PSEUDO);
965 zdestroy(skr->skr_zreg);
966 skr->skr_zreg = NULL;
967 }
968
969 if (skr->skr_seg_bmap != NULL) {
970 ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
971 #if (DEBUG || DEVELOPMENT)
972 ASSERT(skr->skr_seg_bmap_len != 0);
973 /* must have been set to vacant (bit set) by now */
974 assert(bitmap_is_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt));
975 #endif /* DEBUG || DEVELOPMENT */
976
977 bitmap_t *__indexable bmap = skr->skr_seg_bmap;
978 sk_free_data(bmap, skr->skr_seg_bmap_size);
979 skr->skr_seg_bmap = NULL;
980 skr->skr_seg_bmap_size = 0;
981 skr->skr_seg_bmap_len = 0;
982 }
983 ASSERT(skr->skr_seg_bmap_len == 0);
984
985 if (skr->skr_hash_table != NULL) {
986 ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
987 #if (DEBUG || DEVELOPMENT)
988 for (uint32_t i = 0; i < (skr->skr_hash_mask + 1); i++) {
989 ASSERT(TAILQ_EMPTY(&skr->skr_hash_table[i].sgb_head));
990 }
991 #endif /* DEBUG || DEVELOPMENT */
992
993 struct sksegment_bkt *__indexable htable = skr->skr_hash_table;
994 sk_free_type_array(struct sksegment_bkt, skr->skr_hash_size,
995 htable);
996 skr->skr_hash_table = NULL;
997 skr->skr_hash_size = 0;
998 htable = NULL;
999 }
1000 if ((mskr = skr->skr_mirror) != NULL) {
1001 ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1002 skr->skr_mirror = NULL;
1003 mskr->skr_mode &= ~SKR_MODE_MIRRORED;
1004 }
1005 SKR_UNLOCK(skr);
1006
1007 if (mskr != NULL) {
1008 skmem_region_release(mskr);
1009 }
1010
1011 lck_mtx_destroy(&skr->skr_lock, &skmem_region_lock_grp);
1012
1013 zfree(skr_zone, skr);
1014 }
1015
1016 /*
1017 * Mirror mskr (slave) to skr (master).
1018 */
1019 void
1020 skmem_region_mirror(struct skmem_region *skr, struct skmem_region *mskr)
1021 {
1022 ASSERT(mskr != NULL);
1023 SK_DF(SK_VERB_MEM_REGION, "skr master 0x%llx, slave 0x%llx ",
1024 SK_KVA(skr), SK_KVA(mskr));
1025
1026 SKR_LOCK(skr);
1027 ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
1028 ASSERT(!(mskr->skr_mode & SKR_MODE_MIRRORED));
1029 ASSERT(skr->skr_mirror == NULL);
1030
1031 /* both regions must share identical parameters */
1032 ASSERT(skr->skr_size == mskr->skr_size);
1033 ASSERT(skr->skr_seg_size == mskr->skr_seg_size);
1034 ASSERT(skr->skr_seg_free_cnt == mskr->skr_seg_free_cnt);
1035
1036 skr->skr_mirror = mskr;
1037 skmem_region_retain(mskr);
1038 mskr->skr_mode |= SKR_MODE_MIRRORED;
1039 SKR_UNLOCK(skr);
1040 }
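
/*
 * Hypothetical sketch: pairing a kernel slot-descriptor region with its
 * user-visible counterpart so that objects are carved at matching indices
 * (the names and variables below are illustrative):
 *
 *	struct skmem_region *ksd = skmem_region_create("ksd", &ksd_srp, ...);
 *	struct skmem_region *usd = skmem_region_create("usd", &usd_srp, ...);
 *	skmem_region_mirror(ksd, usd);	// ksd is master, usd is slave
 *
 * From this point on, allocations are made only against the master (ksd);
 * each skmem_region_alloc() on it also returns, via maddr/retsgm, the
 * shadow object carved from the slave (usd) at the same segment index.
 */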
1041
1042 void
1043 skmem_region_slab_config(struct skmem_region *skr, struct skmem_cache *skm,
1044 bool attach)
1045 {
1046 int i;
1047
1048 SKR_LOCK(skr);
1049 if (attach) {
1050 for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] != NULL;
1051 i++) {
1052 ;
1053 }
1054 VERIFY(i < SKR_MAX_CACHES);
1055 ASSERT(skr->skr_cache[i] == NULL);
1056 skr->skr_mode |= SKR_MODE_SLAB;
1057 skr->skr_cache[i] = skm;
1058 skmem_region_retain_locked(skr);
1059 SKR_UNLOCK(skr);
1060 } else {
1061 ASSERT(skr->skr_mode & SKR_MODE_SLAB);
1062 for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] != skm;
1063 i++) {
1064 ;
1065 }
1066 VERIFY(i < SKR_MAX_CACHES);
1067 ASSERT(skr->skr_cache[i] == skm);
1068 skr->skr_cache[i] = NULL;
1069 for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] == NULL;
1070 i++) {
1071 ;
1072 }
1073 if (i == SKR_MAX_CACHES) {
1074 skr->skr_mode &= ~SKR_MODE_SLAB;
1075 }
1076 if (!skmem_region_release_locked(skr)) {
1077 SKR_UNLOCK(skr);
1078 }
1079 }
1080 }
1081
1082 /*
1083 * Common routines for skmem_region_{alloc,mirror_alloc}.
1084 */
1085 static void *
__sized_by(objsize)1086 __sized_by(objsize)
1087 skmem_region_alloc_common(struct skmem_region *skr, struct sksegment *sg,
1088 uint32_t SK_FB_ARG objsize)
1089 {
1090 struct sksegment_bkt *sgb;
1091 uint32_t SK_FB_ARG seg_sz = 0;
1092 void *__sized_by(seg_sz) addr;
1093
1094 SKR_LOCK_ASSERT_HELD(skr);
1095
1096 ASSERT(sg->sg_md != NULL);
1097 ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1098 addr = __unsafe_forge_bidi_indexable(void *, (void *)sg->sg_start, objsize);
1099 seg_sz = objsize;
1100 sgb = SKMEM_REGION_HASH(skr, addr);
1101 ASSERT(sg->sg_link.tqe_next == NULL);
1102 ASSERT(sg->sg_link.tqe_prev == NULL);
1103 TAILQ_INSERT_HEAD(&sgb->sgb_head, sg, sg_link);
1104
1105 skr->skr_seginuse++;
1106 skr->skr_meminuse += skr->skr_seg_size;
1107 if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) {
1108 skr->skr_w_meminuse += skr->skr_seg_size;
1109 }
1110 skr->skr_alloc++;
1111
1112 return addr;
1113 }
1114
1115 /*
1116 * Allocate a segment from the region.
1117 * XXX -fbounds-safety: there are only 5 callers of this function, so it was
1118 * easier to just add objsize to the function signature.
1119 * XXX -fbounds-safety: until we have __sized_by_or_null (rdar://75598414), we
1120 * can't pass NULL, but instead create a variable whose value is NULL. Also,
1121 * once rdar://83900556 lands, -fbounds-safety will do size checking at return.
1122 * So we need to come back to this once rdar://75598414 and rdar://83900556
1123 * land.
1124 */
1125 void *
__sized_by(objsize)1126 __sized_by(objsize)
1127 skmem_region_alloc(struct skmem_region *skr, void *__sized_by(*msize) * maddr,
1128 struct sksegment **retsg, struct sksegment **retsgm, uint32_t skmflag,
1129 uint32_t SK_FB_ARG objsize, uint32_t *SK_FB_ARG msize)
1130 {
1131 struct sksegment *sg = NULL;
1132 struct sksegment *__single sg1 = NULL;
1133 void *__indexable addr = NULL, *__indexable addr1 = NULL;
1134 uint32_t retries = 0;
1135
1136 VERIFY(!(skr->skr_mode & SKR_MODE_GUARD));
1137
1138 if (retsg != NULL) {
1139 *retsg = NULL;
1140 }
1141 if (retsgm != NULL) {
1142 *retsgm = NULL;
1143 }
1144
1145 /* SKMEM_NOSLEEP and SKMEM_FAILOK are mutually exclusive */
1146 VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) !=
1147 (SKMEM_NOSLEEP | SKMEM_FAILOK));
1148
1149 SKR_LOCK(skr);
1150 while (sg == NULL) {
1151 /* see if there's a segment in the freelist */
1152 sg = TAILQ_FIRST(&skr->skr_seg_free);
1153 if (sg == NULL) {
1154 /* see if we can grow the freelist */
1155 sg = sksegment_freelist_grow(skr);
1156 if (sg != NULL) {
1157 break;
1158 }
1159
1160 if (skr->skr_mode & SKR_MODE_SLAB) {
1161 SKR_UNLOCK(skr);
1162 /*
1163 * None found; it's possible that the slab
1164 * layer is caching extra amount, so ask
1165 * skmem_cache to reap/purge its caches.
1166 */
1167 for (int i = 0; i < SKR_MAX_CACHES; i++) {
1168 if (skr->skr_cache[i] == NULL) {
1169 continue;
1170 }
1171 skmem_cache_reap_now(skr->skr_cache[i],
1172 TRUE);
1173 }
1174 SKR_LOCK(skr);
1175 /*
1176 * If we manage to get some freed, try again.
1177 */
1178 if (TAILQ_FIRST(&skr->skr_seg_free) != NULL) {
1179 continue;
1180 }
1181 }
1182
1183 /*
1184 * Give up if this is a non-blocking allocation,
1185 * or if this is a blocking allocation but the
1186 * caller is willing to retry.
1187 */
1188 if (skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) {
1189 break;
1190 }
1191
1192 /* otherwise we wait until one is available */
1193 ++skr->skr_seg_waiters;
1194 (void) msleep(&skr->skr_seg_free, &skr->skr_lock,
1195 (PZERO - 1), skr->skr_name, NULL);
1196 }
1197 }
1198
1199 SKR_LOCK_ASSERT_HELD(skr);
1200
1201 if (sg != NULL) {
1202 retry:
1203 /*
1204 * We have a segment; remove it from the freelist and
1205 * insert it into the allocated-address hash chain.
1206 * Note that this may return NULL if we can't allocate
1207 * the memory descriptor.
1208 */
1209 if (sksegment_freelist_remove(skr, sg, skmflag,
1210 FALSE) == NULL) {
1211 ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
1212 ASSERT(sg->sg_md == NULL);
1213 ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
1214
1215 /*
1216 * If it's a non-blocking allocation, simply give
1217 * up and let the caller decide when to retry. Else,
1218 * it gets a bit complicated due to the contract we
1219 * have for blocking allocations with the client; the
1220 * most sensible thing to do here is to retry the
1221 * allocation ourselves. Note that we keep using the
1222 * same segment we originally got, since we only need
1223 * the memory descriptor to be allocated for it; thus
1224 * we make sure we don't release the region lock when
1225 * retrying allocation. Doing so is crucial when the
1226 * region is mirrored, since the segment indices on
1227 * both regions need to match.
1228 */
1229 if (skmflag & SKMEM_NOSLEEP) {
1230 SK_ERR("\"%s\": failed to allocate segment "
1231 "(non-sleeping mode)", skr->skr_name);
1232 sg = NULL;
1233 } else {
1234 if (++retries > SKMEM_WDT_MAXTIME) {
1235 panic_plain("\"%s\": failed to "
1236 "allocate segment (sleeping mode) "
1237 "after %u retries\n\n%s",
1238 skr->skr_name, SKMEM_WDT_MAXTIME,
1239 skmem_dump(skr));
1240 /* NOTREACHED */
1241 __builtin_unreachable();
1242 } else {
1243 SK_ERR("\"%s\": failed to allocate "
1244 "segment (sleeping mode): %u "
1245 "retries", skr->skr_name, retries);
1246 }
1247 if (skr->skr_mode & SKR_MODE_SLAB) {
1248 /*
1249 * We can't get any memory descriptor
1250 * for this segment; reap extra cached
1251 * objects from the slab layer and hope
1252 * that we get lucky next time around.
1253 *
1254 * XXX [email protected]: perhaps also
1255 * trigger the zone allocator to do
1256 * its garbage collection here?
1257 */
1258 skmem_cache_reap();
1259 }
1260 delay(1 * USEC_PER_SEC); /* 1 sec */
1261 goto retry;
1262 }
1263 }
1264
1265 if (sg != NULL) {
1266 /* insert to allocated-address hash chain */
1267 addr = skmem_region_alloc_common(skr, sg,
1268 skr->skr_seg_size);
1269 }
1270 }
1271
1272 if (sg == NULL) {
1273 VERIFY(skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK));
1274 if (skmflag & SKMEM_PANIC) {
1275 VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) ==
1276 SKMEM_NOSLEEP);
1277 /*
1278 * If this is a failed non-blocking alloc and the caller
1279 * insists that it must be successful, then panic.
1280 */
1281 panic_plain("\"%s\": skr 0x%p unable to satisfy "
1282 "mandatory allocation\n", skr->skr_name, skr);
1283 /* NOTREACHED */
1284 __builtin_unreachable();
1285 } else {
1286 /*
1287 * Give up if this is a non-blocking allocation,
1288 * or one where the caller is willing to handle
1289 * allocation failures.
1290 */
1291 goto done;
1292 }
1293 }
1294
1295 ASSERT((mach_vm_address_t)addr == sg->sg_start);
1296
1297 #if SK_LOG
1298 SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
1299 SK_KVA(skr), SK_KVA(sg));
1300 if (skr->skr_mirror == NULL ||
1301 !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) {
1302 SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx)",
1303 sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
1304 } else {
1305 SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) mirrored",
1306 sg->sg_index, SK_KVA(sg), SK_KVA(sg->sg_start),
1307 SK_KVA(sg->sg_end));
1308 }
1309 #endif /* SK_LOG */
1310
1311 /*
1312 * If mirroring, allocate shadow object from slave region.
1313 */
1314 if (skr->skr_mirror != NULL) {
1315 ASSERT(skr->skr_mirror != skr);
1316 ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
1317 ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED);
1318 addr1 = skmem_region_mirror_alloc(skr->skr_mirror, sg,
1319 skr->skr_mirror->skr_seg_size, &sg1);
1320 ASSERT(addr1 != NULL);
1321 ASSERT(sg1 != NULL && sg1 != sg);
1322 ASSERT(sg1->sg_index == sg->sg_index);
1323 }
1324
1325 done:
1326 SKR_UNLOCK(skr);
1327
1328 /* return segment metadata to caller if asked (reference not needed) */
1329 if (addr != NULL) {
1330 if (retsg != NULL) {
1331 *retsg = sg;
1332 }
1333 if (retsgm != NULL) {
1334 *retsgm = sg1;
1335 }
1336 }
1337
1338 if (maddr != NULL) {
1339 if (addr1) {
1340 *maddr = addr1;
1341 *msize = skr->skr_mirror->skr_seg_size;
1342 } else {
1343 *maddr = addr1;
1344 *msize = 0;
1345 }
1346 }
1347
1348 return addr;
1349 }
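
/*
 * Illustrative call patterns (the call sites are hypothetical).  A blocking
 * allocation (skmflag 0) returns only once a segment has been obtained,
 * sleeping on the freelist if necessary; a non-blocking one may fail:
 *
 *	struct sksegment *sg = NULL;
 *	void *obj;
 *
 *	// may sleep; does not return NULL
 *	obj = skmem_region_alloc(skr, NULL, &sg, NULL, 0,
 *	    skr->skr_seg_size, NULL);
 *
 *	// never sleeps; caller must cope with failure
 *	obj = skmem_region_alloc(skr, NULL, &sg, NULL, SKMEM_NOSLEEP,
 *	    skr->skr_seg_size, NULL);
 *	if (obj == NULL) {
 *		// back off and retry later, or fall back
 *	}
 *
 * For a mirrored master region, passing non-NULL maddr/msize additionally
 * returns the shadow object allocated from the slave region.
 */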
1350
1351 /*
1352 * Allocate a segment from a mirror region at the same index. While it
1353 * is a somewhat simplified variant of skmem_region_alloc, keeping it
1354 * separate allows us to avoid further convoluting that routine.
1355 */
1356 static void *
__sized_by(seg_size)1357 __sized_by(seg_size)
1358 skmem_region_mirror_alloc(struct skmem_region *skr, struct sksegment *sg0,
1359 uint32_t SK_FB_ARG seg_size, struct sksegment **__single retsg)
1360 {
1361 struct sksegment sg_key = { .sg_index = sg0->sg_index };
1362 struct sksegment *sg = NULL;
1363 void *addr = NULL;
1364
1365 ASSERT(skr->skr_mode & SKR_MODE_MIRRORED);
1366 ASSERT(skr->skr_mirror == NULL);
1367 ASSERT(sg0->sg_type == SKSEG_TYPE_ALLOC);
1368
1369 if (retsg != NULL) {
1370 *retsg = NULL;
1371 }
1372
1373 SKR_LOCK(skr);
1374
1375 /*
1376 * See if we can find one in the freelist first. Otherwise,
1377 * create a new segment of the same index and add that to the
1378 * freelist. We would always get a segment since both regions
1379 * are synchronized when it comes to the indices of allocated
1380 * segments.
1381 */
1382 sg = RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key);
1383 if (sg == NULL) {
1384 sg = sksegment_alloc_with_idx(skr, sg0->sg_index);
1385 VERIFY(sg != NULL);
1386 }
1387 VERIFY(sg->sg_index == sg0->sg_index);
1388
1389 /*
1390 * We have a segment; remove it from the freelist and insert
1391 * it into the allocated-address hash chain. This either
1392 * succeeds or panics (SKMEM_PANIC) when a memory descriptor
1393 * can't be allocated.
1394 *
1395 * TODO: consider retrying IOBMD allocation attempts if needed.
1396 */
1397 sg = sksegment_freelist_remove(skr, sg, SKMEM_PANIC, FALSE);
1398 VERIFY(sg != NULL);
1399
1400 /* insert to allocated-address hash chain */
1401 addr = skmem_region_alloc_common(skr, sg, skr->skr_seg_size);
1402
1403 #if SK_LOG
1404 SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
1405 SK_KVA(skr), SK_KVA(sg));
1406 SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx)",
1407 sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
1408 #endif /* SK_LOG */
1409
1410 SKR_UNLOCK(skr);
1411
1412 /* return segment metadata to caller if asked (reference not needed) */
1413 if (retsg != NULL) {
1414 *retsg = sg;
1415 }
1416
1417 return addr;
1418 }
1419
1420 /*
1421 * Free a segment to the region.
1422 */
1423 void
1424 skmem_region_free(struct skmem_region *skr, void *addr, void *maddr)
1425 {
1426 struct sksegment_bkt *sgb;
1427 struct sksegment *sg, *tsg;
1428
1429 VERIFY(!(skr->skr_mode & SKR_MODE_GUARD));
1430
1431 /*
1432 * Search the hash chain to find a matching segment for the
1433 * given address. If found, remove the segment from the
1434 * hash chain and insert it into the freelist. Otherwise,
1435 * we panic since the caller has given us a bogus address.
1436 */
1437 SKR_LOCK(skr);
1438 sgb = SKMEM_REGION_HASH(skr, addr);
1439 TAILQ_FOREACH_SAFE(sg, &sgb->sgb_head, sg_link, tsg) {
1440 ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
1441 if (sg->sg_start == (mach_vm_address_t)addr) {
1442 TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link);
1443 sg->sg_link.tqe_next = NULL;
1444 sg->sg_link.tqe_prev = NULL;
1445 break;
1446 }
1447 }
1448
1449 ASSERT(sg != NULL);
1450 if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) {
1451 ASSERT(skr->skr_w_meminuse >= skr->skr_seg_size);
1452 skr->skr_w_meminuse -= skr->skr_seg_size;
1453 }
1454 sksegment_freelist_insert(skr, sg, FALSE);
1455
1456 ASSERT(skr->skr_seginuse != 0);
1457 skr->skr_seginuse--;
1458 skr->skr_meminuse -= skr->skr_seg_size;
1459 skr->skr_free++;
1460
1461 #if SK_LOG
1462 SK_DF(SK_VERB_MEM_REGION, "skr 0x%llx sg 0x%llx",
1463 SK_KVA(skr), SK_KVA(sg));
1464 if (skr->skr_mirror == NULL ||
1465 !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) {
1466 SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx)",
1467 sg->sg_index, SK_KVA(addr),
1468 SK_KVA((uintptr_t)addr + skr->skr_seg_size));
1469 } else {
1470 SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) mirrored",
1471 sg->sg_index, SK_KVA(sg), SK_KVA(addr),
1472 SK_KVA((uintptr_t)addr + skr->skr_seg_size));
1473 }
1474 #endif /* SK_LOG */
1475
1476 /*
1477 * If mirroring, also free shadow object in slave region.
1478 */
1479 if (skr->skr_mirror != NULL) {
1480 ASSERT(maddr != NULL);
1481 ASSERT(skr->skr_mirror != skr);
1482 ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
1483 ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED);
1484 skmem_region_free(skr->skr_mirror, maddr, NULL);
1485 }
1486
1487 /* wake up any blocked threads waiting for a segment */
1488 if (skr->skr_seg_waiters != 0) {
1489 SK_DF(SK_VERB_MEM_REGION,
1490 "sg 0x%llx waking up %u waiters", SK_KVA(sg),
1491 skr->skr_seg_waiters);
1492 skr->skr_seg_waiters = 0;
1493 wakeup(&skr->skr_seg_free);
1494 }
1495 SKR_UNLOCK(skr);
1496 }
1497
1498 __attribute__((always_inline))
1499 static inline void
1500 skmem_region_retain_locked(struct skmem_region *skr)
1501 {
1502 SKR_LOCK_ASSERT_HELD(skr);
1503 skr->skr_refcnt++;
1504 ASSERT(skr->skr_refcnt != 0);
1505 }
1506
1507 /*
1508 * Retain a region.
1509 */
1510 void
1511 skmem_region_retain(struct skmem_region *skr)
1512 {
1513 SKR_LOCK(skr);
1514 skmem_region_retain_locked(skr);
1515 SKR_UNLOCK(skr);
1516 }
1517
1518 __attribute__((always_inline))
1519 static inline boolean_t
1520 skmem_region_release_locked(struct skmem_region *skr)
1521 {
1522 SKR_LOCK_ASSERT_HELD(skr);
1523 ASSERT(skr->skr_refcnt != 0);
1524 if (--skr->skr_refcnt == 0) {
1525 skmem_region_destroy(skr);
1526 return TRUE;
1527 }
1528 return FALSE;
1529 }
1530
1531 /*
1532 * Release (and potentially destroy) a region.
1533 */
1534 boolean_t
1535 skmem_region_release(struct skmem_region *skr)
1536 {
1537 boolean_t lastref;
1538
1539 SKR_LOCK(skr);
1540 if (!(lastref = skmem_region_release_locked(skr))) {
1541 SKR_UNLOCK(skr);
1542 }
1543
1544 return lastref;
1545 }
1546
1547 /*
1548 * Depopulate the segment freelist.
1549 */
1550 static void
1551 skmem_region_depopulate(struct skmem_region *skr)
1552 {
1553 struct sksegment *sg, *tsg;
1554
1555 SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr 0x%llx ",
1556 skr->skr_name, SK_KVA(skr));
1557
1558 SKR_LOCK_ASSERT_HELD(skr);
1559 ASSERT(skr->skr_seg_bmap_len != 0 || (skr->skr_mode & SKR_MODE_PSEUDO));
1560
1561 TAILQ_FOREACH_SAFE(sg, &skr->skr_seg_free, sg_link, tsg) {
1562 struct sksegment *sg0;
1563 uint32_t i;
1564
1565 i = sg->sg_index;
1566 sg0 = sksegment_freelist_remove(skr, sg, 0, TRUE);
1567 VERIFY(sg0 == sg);
1568
1569 sksegment_destroy(skr, sg);
1570 ASSERT(bit_test(skr->skr_seg_bmap[i / BMAPSZ], i % BMAPSZ));
1571 }
1572 }
1573
1574 /*
1575 * Free tree segment compare routine.
1576 */
1577 static int
1578 sksegment_cmp(const struct sksegment *sg1, const struct sksegment *sg2)
1579 {
1580 return sg1->sg_index - sg2->sg_index;
1581 }
1582
1583 /*
1584 * Create a segment.
1585 *
1586 * Upon success, clear the bit for the segment's index in skr_seg_bmap bitmap.
1587 */
1588 static struct sksegment *
1589 sksegment_create(struct skmem_region *skr, uint32_t i)
1590 {
1591 struct sksegment *__single sg = NULL;
1592 bitmap_t *bmap;
1593
1594 SKR_LOCK_ASSERT_HELD(skr);
1595
1596 ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
1597 ASSERT(i < skr->skr_seg_max_cnt);
1598 ASSERT(skr->skr_reg != NULL);
1599 ASSERT(skr->skr_seg_size == round_page(skr->skr_seg_size));
1600
1601 bmap = &skr->skr_seg_bmap[i / BMAPSZ];
1602 ASSERT(bit_test(*bmap, i % BMAPSZ));
1603
1604 sg = skmem_cache_alloc(skmem_sg_cache, SKMEM_SLEEP);
1605 bzero(sg, sizeof(*sg));
1606
1607 sg->sg_region = skr;
1608 sg->sg_index = i;
1609 sg->sg_state = SKSEG_STATE_DETACHED;
1610
1611 /* claim it (clear bit) */
1612 bit_clear(*bmap, i % BMAPSZ);
1613
1614 SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) 0x%b", i,
1615 SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), skr->skr_mode,
1616 SKR_MODE_BITS);
1617
1618 return sg;
1619 }
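
/*
 * Illustrative note on the bitmap convention used above and in
 * sksegment_destroy() below: each region keeps one bit per possible segment
 * index, grouped into BMAPSZ-bit words.  A set bit means the index is vacant
 * (no sksegment exists there); a cleared bit means the index is claimed.
 * For example, assuming BMAPSZ is 64, segment index 133 maps to word
 * 133 / 64 == 2 and bit 133 % 64 == 5:
 *
 *	uint32_t i = 133;
 *	bitmap_t *bmap = &skr->skr_seg_bmap[i / BMAPSZ];	// word 2
 *
 *	if (bit_test(*bmap, i % BMAPSZ)) {	// bit 5 set: index vacant
 *		bit_clear(*bmap, i % BMAPSZ);	// claim it
 *	}
 */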

/*
 * Destroy a segment.
 *
 * Set the bit for the segment's index in skr_seg_bmap bitmap,
 * indicating that it is now vacant.
 */
static void
sksegment_destroy(struct skmem_region *skr, struct sksegment *sg)
{
	uint32_t i = sg->sg_index;
	bitmap_t *bmap;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
	ASSERT(skr == sg->sg_region);
	ASSERT(skr->skr_reg != NULL);
	ASSERT(sg->sg_type == SKSEG_TYPE_DESTROYED);
	ASSERT(i < skr->skr_seg_max_cnt);

	bmap = &skr->skr_seg_bmap[i / BMAPSZ];
	ASSERT(!bit_test(*bmap, i % BMAPSZ));

	SK_DF(SK_VERB_MEM_REGION, " [%u] [0x%llx-0x%llx) 0x%b",
	    i, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end),
	    skr->skr_mode, SKR_MODE_BITS);

	/*
	 * Undo what's done earlier at segment creation time.
	 */

	ASSERT(sg->sg_md == NULL);
	ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
	ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);

	/* release it (set bit) */
	bit_set(*bmap, i % BMAPSZ);

	skmem_cache_free(skmem_sg_cache, sg);
}

/*
 * Insert a segment into the freelist (freeing the segment).
 */
static void
sksegment_freelist_insert(struct skmem_region *skr, struct sksegment *sg,
    boolean_t populating)
{
	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
	ASSERT(sg->sg_type != SKSEG_TYPE_FREE);
	ASSERT(skr == sg->sg_region);
	ASSERT(skr->skr_reg != NULL);
	ASSERT(sg->sg_index < skr->skr_seg_max_cnt);

	/*
	 * If the region is being populated, then we're done.
	 */
	if (__improbable(populating)) {
		ASSERT(sg->sg_md == NULL);
		ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
		ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
	} else {
		IOSKMemoryBufferRef __single md;
		IOReturn err;

		ASSERT(sg->sg_md != NULL);
		ASSERT(sg->sg_start != 0 && sg->sg_end != 0);

		/*
		 * Let the client remove the memory from the IOMMU, and unwire it.
		 */
		if (skr->skr_seg_dtor != NULL) {
			skr->skr_seg_dtor(sg, sg->sg_md, skr->skr_private);
		}

		ASSERT(sg->sg_state == SKSEG_STATE_MAPPED ||
		    sg->sg_state == SKSEG_STATE_MAPPED_WIRED);

		IOSKRegionClearBufferDebug(skr->skr_reg, sg->sg_index, &md);
		VERIFY(sg->sg_md == md);

		/* if persistent, unwire this memory now */
		if (skr->skr_mode & SKR_MODE_PERSISTENT) {
			err = IOSKMemoryUnwire(md);
			if (err != kIOReturnSuccess) {
				panic("Failed to unwire md %p, err %d", md, err);
			}
		}

		/* mark memory as empty/discarded for consistency */
		err = IOSKMemoryDiscard(md);
		if (err != kIOReturnSuccess) {
			panic("Failed to discard md %p, err %d", md, err);
		}

		IOSKMemoryDestroy(md);
		sg->sg_md = NULL;
		sg->sg_start = sg->sg_end = 0;
		sg->sg_state = SKSEG_STATE_DETACHED;

		ASSERT(skr->skr_memtotal >= skr->skr_seg_size);
		skr->skr_memtotal -= skr->skr_seg_size;
	}

	sg->sg_type = SKSEG_TYPE_FREE;
	ASSERT(sg->sg_link.tqe_next == NULL);
	ASSERT(sg->sg_link.tqe_prev == NULL);
	TAILQ_INSERT_TAIL(&skr->skr_seg_free, sg, sg_link);
	ASSERT(sg->sg_node.rbe_left == NULL);
	ASSERT(sg->sg_node.rbe_right == NULL);
	ASSERT(sg->sg_node.rbe_parent == NULL);
	RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
	++skr->skr_seg_free_cnt;
	ASSERT(skr->skr_seg_free_cnt <= skr->skr_seg_max_cnt);
}
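
/*
 * Illustrative sketch of a client-supplied segment destructor, as invoked
 * from sksegment_freelist_insert() above.  The callback signature is
 * inferred from the call site; the body is a hypothetical example only, and
 * the my_driver_* helpers named here are placeholders, not existing APIs.
 *
 *	static void
 *	my_seg_dtor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
 *	{
 *		struct my_driver *drv = arg;		// from skr_private
 *
 *		my_driver_iommu_remove(drv, md);	// hypothetical helper
 *		my_driver_unwire(drv, md);		// hypothetical helper;
 *							// only if this client
 *							// wired it in its ctor
 *	}
 */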

/*
 * Remove a segment from the freelist (allocating the segment).
 */
static struct sksegment *
sksegment_freelist_remove(struct skmem_region *skr, struct sksegment *sg,
    uint32_t skmflag, boolean_t purging)
{
#pragma unused(skmflag)
	mach_vm_address_t segstart;
	IOReturn err;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
	ASSERT(sg != NULL);
	ASSERT(skr == sg->sg_region);
	ASSERT(skr->skr_reg != NULL);
	ASSERT(sg->sg_type == SKSEG_TYPE_FREE);
	ASSERT(sg->sg_index < skr->skr_seg_max_cnt);

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF doesn't apply when SKMEM_PANIC is set, as the caller would assert.
	 */
	if (__improbable(mtbf != 0 && !purging &&
	    (net_uptime_ms() % mtbf) == 0 &&
	    !(skmflag & SKMEM_PANIC))) {
		SK_ERR("skr \"%s\" 0x%llx sg 0x%llx MTBF failure",
		    skr->skr_name, SK_KVA(skr), SK_KVA(sg));
		net_update_uptime();
		return NULL;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	TAILQ_REMOVE(&skr->skr_seg_free, sg, sg_link);
	sg->sg_link.tqe_next = NULL;
	sg->sg_link.tqe_prev = NULL;
	RB_REMOVE(segtfreehead, &skr->skr_seg_tfree, sg);
	sg->sg_node.rbe_left = NULL;
	sg->sg_node.rbe_right = NULL;
	sg->sg_node.rbe_parent = NULL;

	ASSERT(skr->skr_seg_free_cnt != 0);
	--skr->skr_seg_free_cnt;

	/*
	 * If the region is being depopulated, then we're done.
	 */
	if (__improbable(purging)) {
		ASSERT(sg->sg_md == NULL);
		ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
		ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
		sg->sg_type = SKSEG_TYPE_DESTROYED;
		return sg;
	}

	ASSERT(sg->sg_md == NULL);
	ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
	ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);

	/* created as non-volatile (mapped) upon success */
	if ((sg->sg_md = IOSKMemoryBufferCreate(skr->skr_seg_size,
	    &skr->skr_bufspec, &segstart)) == NULL) {
		ASSERT(sg->sg_type == SKSEG_TYPE_FREE);
		if (skmflag & SKMEM_PANIC) {
			/* if the caller insists on success, then panic */
			panic_plain("\"%s\": skr 0x%p sg 0x%p (idx %u) unable "
			    "to satisfy mandatory allocation\n", skr->skr_name,
			    skr, sg, sg->sg_index);
			/* NOTREACHED */
			__builtin_unreachable();
		}
		/* reinsert this segment into the freelist */
		ASSERT(sg->sg_link.tqe_next == NULL);
		ASSERT(sg->sg_link.tqe_prev == NULL);
		TAILQ_INSERT_HEAD(&skr->skr_seg_free, sg, sg_link);
		ASSERT(sg->sg_node.rbe_left == NULL);
		ASSERT(sg->sg_node.rbe_right == NULL);
		ASSERT(sg->sg_node.rbe_parent == NULL);
		RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
		++skr->skr_seg_free_cnt;
		return NULL;
	}

	sg->sg_start = segstart;
	sg->sg_end = (segstart + skr->skr_seg_size);
	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);

	/* mark memory as non-volatile just to be consistent */
	err = IOSKMemoryReclaim(sg->sg_md);
	if (err != kIOReturnSuccess) {
		panic("Failed to reclaim md %p, err %d", sg->sg_md, err);
	}

	/* if persistent, wire down its memory now */
	if (skr->skr_mode & SKR_MODE_PERSISTENT) {
		err = IOSKMemoryWire(sg->sg_md);
		if (err != kIOReturnSuccess) {
			panic("Failed to wire md %p, err %d", sg->sg_md, err);
		}
	}

	err = IOSKRegionSetBuffer(skr->skr_reg, sg->sg_index, sg->sg_md);
	if (err != kIOReturnSuccess) {
		panic("Failed to set md %p, err %d", sg->sg_md, err);
	}

	/*
	 * Let the client wire it and insert it into the IOMMU, if applicable.
	 * Try to find out if it's wired and set the right state.
	 */
	if (skr->skr_seg_ctor != NULL) {
		skr->skr_seg_ctor(sg, sg->sg_md, skr->skr_private);
	}

	sg->sg_state = IOSKBufferIsWired(sg->sg_md) ?
	    SKSEG_STATE_MAPPED_WIRED : SKSEG_STATE_MAPPED;

	skr->skr_memtotal += skr->skr_seg_size;

	ASSERT(sg->sg_md != NULL);
	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);

	sg->sg_type = SKSEG_TYPE_ALLOC;
	return sg;
}
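
/*
 * Illustrative sketch of a client-supplied segment constructor, the
 * counterpart of the destructor example further above and invoked from
 * sksegment_freelist_remove().  The signature is inferred from the call
 * site; IOSKMemoryWire() is the same call used for persistent regions
 * above, while the IOMMU-insertion helper is a hypothetical placeholder.
 * Whatever the constructor does, the caller re-checks the wired state
 * afterwards via IOSKBufferIsWired() to pick SKSEG_STATE_MAPPED vs.
 * SKSEG_STATE_MAPPED_WIRED.
 *
 *	static void
 *	my_seg_ctor(struct sksegment *sg, IOSKMemoryBufferRef md, void *arg)
 *	{
 *		struct my_driver *drv = arg;		// from skr_private
 *
 *		if (IOSKMemoryWire(md) != kIOReturnSuccess) {
 *			return;			// left unwired (MAPPED)
 *		}
 *		my_driver_iommu_insert(drv, md);	// hypothetical helper
 *	}
 */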

/*
 * Find the first available index and allocate a segment at that index.
 */
static struct sksegment *
sksegment_freelist_grow(struct skmem_region *skr)
{
	struct sksegment *sg = NULL;
	uint32_t i, j, idx;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
	ASSERT(skr->skr_seg_bmap_len != 0);
	ASSERT(skr->skr_seg_max_cnt != 0);

	for (i = 0; i < skr->skr_seg_bmap_len; i++) {
		bitmap_t *bmap, mask;
		uint32_t end = (BMAPSZ - 1);

		if (i == (skr->skr_seg_bmap_len - 1)) {
			end = (skr->skr_seg_max_cnt - 1) % BMAPSZ;
		}

		bmap = &skr->skr_seg_bmap[i];
		mask = BMASK64(0, end);

		j = ffsll((*bmap) & mask);
		if (j == 0) {
			continue;
		}

		--j;
		idx = (i * BMAPSZ) + j;

		sg = sksegment_alloc_with_idx(skr, idx);

		/* we're done */
		break;
	}

	ASSERT((sg != NULL) || (skr->skr_seginuse == skr->skr_seg_max_cnt));
	return sg;
}
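
/*
 * Worked example of the scan above, assuming BMAPSZ is 64: if the first two
 * bitmap words are fully cleared (all indices claimed) and word 2 has bit 5
 * as its lowest set bit, then ffsll() on words 0 and 1 returns 0 and the
 * loop continues; on word 2 it returns 6 (ffsll() is 1-based), so j becomes
 * 5 and the chosen index is
 *
 *	idx = (2 * BMAPSZ) + 5 == 133
 *
 * which is handed to sksegment_alloc_with_idx().  The trailing mask
 * (BMASK64(0, end)) exists so that, in the last word, bits beyond
 * skr_seg_max_cnt - 1 can never be picked.
 */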

/*
 * Create a single segment at a specific index and add it to the freelist.
 */
static struct sksegment *
sksegment_alloc_with_idx(struct skmem_region *skr, uint32_t idx)
{
	struct sksegment *sg;

	SKR_LOCK_ASSERT_HELD(skr);

	if (!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ)) {
		panic("%s: '%s' (%p) idx %u (out of %u) is already allocated",
		    __func__, skr->skr_name, (void *)skr, idx,
		    (skr->skr_seg_max_cnt - 1));
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* must not fail, blocking alloc */
	sg = sksegment_create(skr, idx);
	VERIFY(sg != NULL);
	VERIFY(!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ));

	/* populate the freelist */
	sksegment_freelist_insert(skr, sg, TRUE);
	ASSERT(sg == TAILQ_LAST(&skr->skr_seg_free, segfreehead));
#if (DEVELOPMENT || DEBUG)
	struct sksegment sg_key = { .sg_index = sg->sg_index };
	ASSERT(sg == RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key));
#endif /* (DEVELOPMENT || DEBUG) */

	SK_DF(SK_VERB_MEM_REGION, "sg %u/%u", (idx + 1), skr->skr_seg_max_cnt);

	return sg;
}

/*
 * Rescale the region's allocated-address hash table.
 */
static void
skmem_region_hash_rescale(struct skmem_region *skr)
{
	struct sksegment_bkt *__indexable old_table, *new_table;
	size_t old_size, new_size;
	uint32_t i, moved = 0;

	if (skr->skr_mode & SKR_MODE_PSEUDO) {
		ASSERT(skr->skr_hash_table == NULL);
		/* this is a no-op for a pseudo region */
		return;
	}

	ASSERT(skr->skr_hash_table != NULL);
	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_region_update_protected());

	/*
	 * To get a small average lookup time (lookup depth near 1.0), the
	 * hash table size should roughly track the region size, i.e. the
	 * number of segments in use, though it need not match it exactly.
	 */
	new_size = MAX(skr->skr_hash_initial,
	    (1 << (flsll(3 * skr->skr_seginuse + 4) - 2)));
	new_size = MIN(skr->skr_hash_limit, new_size);
	old_size = (skr->skr_hash_mask + 1);

	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
		return;
	}

	new_table = sk_alloc_type_array(struct sksegment_bkt, new_size,
	    Z_NOWAIT, skmem_tag_segment_hash);
	if (__improbable(new_table == NULL)) {
		return;
	}

	for (i = 0; i < new_size; i++) {
		TAILQ_INIT(&new_table[i].sgb_head);
	}

	SKR_LOCK(skr);

	old_size = (skr->skr_hash_mask + 1);
	old_table = skr->skr_hash_table;

	skr->skr_hash_mask = (uint32_t)(new_size - 1);
	skr->skr_hash_table = new_table;
	skr->skr_hash_size = new_size;
	skr->skr_rescale++;

	for (i = 0; i < old_size; i++) {
		struct sksegment_bkt *sgb = &old_table[i];
		struct sksegment_bkt *new_sgb;
		struct sksegment *sg;

		while ((sg = TAILQ_FIRST(&sgb->sgb_head)) != NULL) {
			TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link);
			ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
			new_sgb = SKMEM_REGION_HASH(skr, sg->sg_start);
			TAILQ_INSERT_TAIL(&new_sgb->sgb_head, sg, sg_link);
			++moved;
		}
		ASSERT(TAILQ_EMPTY(&sgb->sgb_head));
	}

	SK_DF(SK_VERB_MEM_REGION,
	    "skr 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skr),
	    (uint32_t)old_size, (uint32_t)new_size, moved);

	SKR_UNLOCK(skr);

	sk_free_type_array(struct sksegment_bkt, old_size, old_table);
}
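
/*
 * Worked example of the sizing heuristic above: with 100 segments in use,
 * 3 * 100 + 4 == 304, flsll(304) == 9 (304 needs nine bits), and
 * 1 << (9 - 2) == 128 buckets, clamped to [skr_hash_initial,
 * skr_hash_limit].  If the current table already has 64, 128 or 256
 * buckets, the (old_size >> 1) <= new_size <= (old_size << 1) check treats
 * 128 as close enough and skips the rescale, so the table only moves when
 * the region's population changes by roughly a factor of two.
 */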

/*
 * Apply a function to operate on all regions.
 */
static void
skmem_region_applyall(void (*func)(struct skmem_region *))
{
	struct skmem_region *skr;

	net_update_uptime();

	SKMEM_REGION_LOCK();
	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
		func(skr);
	}
	SKMEM_REGION_UNLOCK();
}

static void
skmem_region_update(struct skmem_region *skr)
{
	SKMEM_REGION_LOCK_ASSERT_HELD();

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_region_update_protected());

	SKR_LOCK(skr);
	/*
	 * If there are threads blocked waiting for an available
	 * segment, wake them up periodically so they can issue
	 * another skmem_cache_reap() to reclaim resources cached
	 * by skmem_cache.
	 */
	if (skr->skr_seg_waiters != 0) {
		SK_DF(SK_VERB_MEM_REGION,
		    "waking up %u waiters to reclaim", skr->skr_seg_waiters);
		skr->skr_seg_waiters = 0;
		wakeup(&skr->skr_seg_free);
	}
	SKR_UNLOCK(skr);

	/*
	 * Rescale the hash table if needed.
	 */
	skmem_region_hash_rescale(skr);
}

/*
 * Thread call callback for update.
 */
static void
skmem_region_update_func(thread_call_param_t dummy, thread_call_param_t arg)
{
#pragma unused(dummy, arg)
	sk_protect_t protect;

	protect = sk_region_update_protect();
	skmem_region_applyall(skmem_region_update);
	sk_region_update_unprotect(protect);

	skmem_dispatch(skmem_region_update_tc, NULL,
	    (skmem_region_update_interval * NSEC_PER_SEC));
}

boolean_t
skmem_region_for_pp(skmem_region_id_t id)
{
	int i;

	for (i = 0; i < SKMEM_PP_REGIONS; i++) {
		if (id == skmem_pp_region_ids[i]) {
			return TRUE;
		}
	}
	return FALSE;
}

void
skmem_region_get_stats(struct skmem_region *skr, struct sk_stats_region *sreg)
{
	bzero(sreg, sizeof(*sreg));

	(void) snprintf(sreg->sreg_name, sizeof(sreg->sreg_name),
	    "%s", skr->skr_name);
	uuid_copy(sreg->sreg_uuid, skr->skr_uuid);
	sreg->sreg_id = (sk_stats_region_id_t)skr->skr_id;
	sreg->sreg_mode = skr->skr_mode;

	sreg->sreg_r_seg_size = skr->skr_params.srp_r_seg_size;
	sreg->sreg_c_seg_size = skr->skr_seg_size;
	sreg->sreg_seg_cnt = skr->skr_seg_max_cnt;
	sreg->sreg_seg_objs = skr->skr_seg_objs;
	sreg->sreg_r_obj_size = skr->skr_r_obj_size;
	sreg->sreg_r_obj_cnt = skr->skr_r_obj_cnt;
	sreg->sreg_c_obj_size = skr->skr_c_obj_size;
	sreg->sreg_c_obj_cnt = skr->skr_c_obj_cnt;
	sreg->sreg_align = skr->skr_align;
	sreg->sreg_max_frags = skr->skr_max_frags;

	sreg->sreg_meminuse = skr->skr_meminuse;
	sreg->sreg_w_meminuse = skr->skr_w_meminuse;
	sreg->sreg_memtotal = skr->skr_memtotal;
	sreg->sreg_seginuse = skr->skr_seginuse;
	sreg->sreg_rescale = skr->skr_rescale;
	sreg->sreg_hash_size = (skr->skr_hash_mask + 1);
	sreg->sreg_alloc = skr->skr_alloc;
	sreg->sreg_free = skr->skr_free;
}

static size_t
skmem_region_mib_get_stats(struct skmem_region *skr, void *__sized_by(len) out,
    size_t len)
{
	size_t actual_space = sizeof(struct sk_stats_region);
	struct sk_stats_region *__single sreg;

	if (out == NULL || len < actual_space) {
		goto done;
	}
	sreg = out;

	skmem_region_get_stats(skr, sreg);

done:
	return actual_space;
}

static int
skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	struct skmem_region *skr;
	size_t actual_space;
	size_t buffer_space;
	size_t allocated_space = 0;
	caddr_t __sized_by(allocated_space) buffer = NULL;
	caddr_t scan;
	int error = 0;

	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		caddr_t temp;
		temp = sk_alloc_data(buffer_space, Z_WAITOK, skmem_tag_region_mib);
		if (__improbable(temp == NULL)) {
			return ENOBUFS;
		}
		buffer = temp;
		allocated_space = buffer_space;
	} else if (req->oldptr == USER_ADDR_NULL) {
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;

	SKMEM_REGION_LOCK();
	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
		size_t size = skmem_region_mib_get_stats(skr, scan, buffer_space);
		if (scan != NULL) {
			if (buffer_space < size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += size;
			buffer_space -= size;
		}
		actual_space += size;
	}
	SKMEM_REGION_UNLOCK();

	if (actual_space != 0) {
		int out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data_sized_by(buffer, allocated_space);
	}

	return error;
}
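
/*
 * Illustrative userland consumer of the handler above.  Like most sysctl
 * exporters it supports two-pass sizing: a first call with a NULL old
 * pointer returns only the required length, and a second call copies out an
 * array of struct sk_stats_region.  The node name below is a placeholder;
 * use whatever name this handler is actually registered under elsewhere in
 * this file.
 *
 *	size_t len = 0;
 *	// pass 1: learn the required size
 *	if (sysctlbyname("kern.skywalk.stats.region_placeholder",
 *	    NULL, &len, NULL, 0) != 0 || len == 0) {
 *		return;
 *	}
 *	struct sk_stats_region *all = malloc(len);
 *	// pass 2: fetch one sk_stats_region per region
 *	if (all != NULL && sysctlbyname(
 *	    "kern.skywalk.stats.region_placeholder", all, &len,
 *	    NULL, 0) == 0) {
 *		size_t n = len / sizeof(*all);
 *		// ... consume n entries ...
 *	}
 *	free(all);
 */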

#if SK_LOG
const char *
skmem_region_id2name(skmem_region_id_t id)
{
	const char *name;
	switch (id) {
	case SKMEM_REGION_SCHEMA:
		name = "SCHEMA";
		break;

	case SKMEM_REGION_RING:
		name = "RING";
		break;

	case SKMEM_REGION_BUF_DEF:
		name = "BUF_DEF";
		break;

	case SKMEM_REGION_BUF_LARGE:
		name = "BUF_LARGE";
		break;

	case SKMEM_REGION_RXBUF_DEF:
		name = "RXBUF_DEF";
		break;

	case SKMEM_REGION_RXBUF_LARGE:
		name = "RXBUF_LARGE";
		break;

	case SKMEM_REGION_TXBUF_DEF:
		name = "TXBUF_DEF";
		break;

	case SKMEM_REGION_TXBUF_LARGE:
		name = "TXBUF_LARGE";
		break;

	case SKMEM_REGION_UMD:
		name = "UMD";
		break;

	case SKMEM_REGION_TXAUSD:
		name = "TXAUSD";
		break;

	case SKMEM_REGION_RXFUSD:
		name = "RXFUSD";
		break;

	case SKMEM_REGION_USTATS:
		name = "USTATS";
		break;

	case SKMEM_REGION_FLOWADV:
		name = "FLOWADV";
		break;

	case SKMEM_REGION_NEXUSADV:
		name = "NEXUSADV";
		break;

	case SKMEM_REGION_SYSCTLS:
		name = "SYSCTLS";
		break;

	case SKMEM_REGION_GUARD_HEAD:
		name = "HEADGUARD";
		break;

	case SKMEM_REGION_GUARD_TAIL:
		name = "TAILGUARD";
		break;

	case SKMEM_REGION_KMD:
		name = "KMD";
		break;

	case SKMEM_REGION_RXKMD:
		name = "RXKMD";
		break;

	case SKMEM_REGION_TXKMD:
		name = "TXKMD";
		break;

	case SKMEM_REGION_TXAKSD:
		name = "TXAKSD";
		break;

	case SKMEM_REGION_RXFKSD:
		name = "RXFKSD";
		break;

	case SKMEM_REGION_KSTATS:
		name = "KSTATS";
		break;

	case SKMEM_REGION_KBFT:
		name = "KBFT";
		break;

	case SKMEM_REGION_UBFT:
		name = "UBFT";
		break;

	case SKMEM_REGION_RXKBFT:
		name = "RXKBFT";
		break;

	case SKMEM_REGION_TXKBFT:
		name = "TXKBFT";
		break;

	case SKMEM_REGION_INTRINSIC:
		name = "INTRINSIC";
		break;

	default:
		name = "UNKNOWN";
		break;
	}

	const char *__null_terminated s = __unsafe_null_terminated_from_indexable(name);

	return s;
}
#endif /* SK_LOG */

#if (DEVELOPMENT || DEBUG)
uint64_t
skmem_region_get_mtbf(void)
{
	return skmem_region_mtbf;
}

void
skmem_region_set_mtbf(uint64_t newval)
{
	if (newval < SKMEM_REGION_MTBF_MIN) {
		if (newval != 0) {
			newval = SKMEM_REGION_MTBF_MIN;
		}
	} else if (newval > SKMEM_REGION_MTBF_MAX) {
		newval = SKMEM_REGION_MTBF_MAX;
	}

	if (skmem_region_mtbf != newval) {
		os_atomic_store(&skmem_region_mtbf, newval, release);
		SK_ERR("MTBF set to %llu msec", skmem_region_mtbf);
	}
}
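
/*
 * Illustrative note on the clamping above: a nonzero request below
 * SKMEM_REGION_MTBF_MIN is raised to the minimum, a request above
 * SKMEM_REGION_MTBF_MAX is lowered to the maximum, and 0 is passed through
 * unchanged to disable fault injection entirely.  With MTBF set to, say,
 * 1000, sksegment_freelist_remove() fails any non-purging, non-SKMEM_PANIC
 * allocation that happens to land on an uptime that is a whole multiple of
 * 1000 ms, which is what makes this a coarse mean-time-between-failures
 * knob for exercising allocation failure paths.
 */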

static int
skmem_region_mtbf_sysctl(struct sysctl_oid *oidp, void *arg1, int arg2,
    struct sysctl_req *req)
{
#pragma unused(oidp, arg1, arg2)
	int changed, error;
	uint64_t newval;

	_CASSERT(sizeof(skmem_region_mtbf) == sizeof(uint64_t));
	if ((error = sysctl_io_number(req, skmem_region_mtbf,
	    sizeof(uint64_t), &newval, &changed)) == 0) {
		if (changed) {
			skmem_region_set_mtbf(newval);
		}
	}
	return error;
}
#endif /* (DEVELOPMENT || DEBUG) */
