/*
 * Copyright (c) 2016-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/* BEGIN CSTYLED */
/*
 * A region represents a collection of one or more similarly-sized memory
 * segments, each of which is a contiguous range of integers. A segment
 * is either allocated or free, and is treated as disjoint from all other
 * segments. That is, the contiguity applies only at the segment level,
 * and a region with multiple segments is not contiguous at the region level.
 * A segment always belongs to either the segment freelist or the
 * allocated-address hash chain, as described below.
 *
 * The optional SKMEM_REGION_CR_NOREDIRECT flag indicates that the region
 * stays intact even after a defunct. Otherwise, the segments belonging
 * to the region will be freed at defunct time, and the span covered by
 * the region will be redirected to zero-filled anonymous memory.
 *
 * Memory for a region is always created as pageable and purgeable. It is
 * the client's responsibility to prepare (wire) it, and optionally insert
 * it to the IOMMU, at segment construction time. When the segment is
 * freed, the client is responsible for removing it from the IOMMU (if
 * needed), and for completing (unwiring) it.
 *
 * When the region is created with SKMEM_REGION_CR_PERSISTENT, the memory
 * is immediately wired upon allocation (segment removed from freelist).
 * It gets unwired when memory is discarded (segment inserted to freelist).
 *
 * The chronological life cycle of a segment is as follows:
 *
 *	SKSEG_STATE_DETACHED
 *	SKSEG_STATE_{MAPPED,MAPPED_WIRED}
 *	[segment allocated, usable by client]
 *	...
 *	[client frees segment]
 *	SKSEG_STATE_{MAPPED,MAPPED_WIRED}
 *	[reclaim]
 *	SKSEG_STATE_DETACHED
 *
 * The region can also be marked as user-mappable (SKMEM_REGION_CR_MMAPOK);
 * this allows it to be further marked with SKMEM_REGION_CR_UREADONLY to
 * prevent modifications by the user task. Only user-mappable regions will
 * be considered for inclusion during skmem_arena_mmap().
 *
 * Every skmem allocator has a region as its slab supplier. Each slab is
 * exactly a segment. The allocator uses skmem_region_{alloc,free}() to
 * create and destroy slabs.
 *
 * A region may be mirrored by another region; the latter acts as the master
 * controller for both regions. Mirrored (slave) regions cannot be used
 * directly by the skmem allocator. The region-mirroring technique is used
 * for managing shadow objects {umd,kmd} and {usd,ksd}, where an object in
 * one region has the same size and lifetime as its shadow counterpart.
 *
 * CREATION/DESTRUCTION:
 *
 * At creation time, all segments are allocated and are immediately inserted
 * into the freelist. Allocating a purgeable segment has very little cost,
 * as it is not backed by physical memory until it is accessed. Immediate
 * insertion into the freelist causes the mapping to be further torn down.
 *
 * At destruction time, the freelist is emptied, and each segment is then
 * destroyed. The system will assert if it detects there are outstanding
 * segments not yet returned to the region (not freed by the client.)
 *
 * ALLOCATION:
 *
 * Allocating involves searching the freelist for a segment; if found, the
 * segment is removed from the freelist and is inserted into the allocated-
 * address hash chain. The address of the memory object represented by
 * the segment is used as the hash key. The allocated-address hash chain
 * is needed since we return the address of the memory object, and not the
 * segment itself, to the client.
 *
 * DEALLOCATION:
 *
 * Freeing a memory object causes the chain to be searched for a matching
 * segment. The system will assert if a segment cannot be found, since
 * that indicates that the memory object address is invalid. Once found,
 * the segment is removed from the allocated-address hash chain, and is
 * inserted to the freelist.
 *
 * Segment allocation and deallocation can be expensive. Because of this,
 * we expect that most clients will utilize the skmem_cache slab allocator
 * as the frontend instead.
 */
/* END CSTYLED */
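
/*
 * Illustrative sketch (not part of the original flow): the life cycle
 * implied by the comment block above, using the routines defined in this
 * file. The parameter values and the my_ctor/my_dtor callbacks are
 * hypothetical placeholders.
 *
 *	struct skmem_region_params srp;		// filled out by the caller
 *	struct skmem_region *skr;
 *	struct sksegment *sg = NULL;
 *	void *obj;
 *
 *	srp.srp_id = SKMEM_REGION_RING;		// region class
 *	srp.srp_r_obj_size = 2048;		// requested object size
 *	srp.srp_r_obj_cnt = 512;		// requested object count
 *	srp.srp_cflags = SKMEM_REGION_CR_NOMAGAZINES;
 *	skmem_region_params_config(&srp);	// compute effective parameters
 *
 *	skr = skmem_region_create("example", &srp, my_ctor, my_dtor, NULL);
 *	obj = skmem_region_alloc(skr, NULL, &sg, NULL, SKMEM_SLEEP,
 *	    skr->skr_seg_size, NULL);		// segment -> hash chain
 *	...
 *	skmem_region_free(skr, obj, NULL);	// segment -> freelist
 *	skmem_region_release(skr);		// drop the caller's reference
 */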

#include <skywalk/os_skywalk_private.h>
#define _FN_KPRINTF /* don't redefine kprintf() */
#include <pexpert/pexpert.h> /* for PE_parse_boot_argn */

#include <kern/uipc_domain.h>

static void skmem_region_destroy(struct skmem_region *skr);
static void skmem_region_depopulate(struct skmem_region *);
static int sksegment_cmp(const struct sksegment *, const struct sksegment *);
static struct sksegment *sksegment_create(struct skmem_region *, uint32_t);
static void sksegment_destroy(struct skmem_region *, struct sksegment *);
static void sksegment_freelist_insert(struct skmem_region *,
    struct sksegment *, boolean_t);
static struct sksegment *sksegment_freelist_remove(struct skmem_region *,
    struct sksegment *, uint32_t, boolean_t);
static struct sksegment *sksegment_freelist_grow(struct skmem_region *);
static struct sksegment *sksegment_alloc_with_idx(struct skmem_region *,
    uint32_t);
static void *__sized_by(seg_size) skmem_region_alloc_common(struct skmem_region *,
    struct sksegment *, uint32_t seg_size);
static void *__sized_by(seg_size) skmem_region_mirror_alloc(struct skmem_region *,
    struct sksegment *, uint32_t seg_size, struct sksegment **);
static void skmem_region_applyall(void (*)(struct skmem_region *));
static void skmem_region_update(struct skmem_region *);
static void skmem_region_update_func(thread_call_param_t, thread_call_param_t);
static inline void skmem_region_retain_locked(struct skmem_region *);
static inline boolean_t skmem_region_release_locked(struct skmem_region *);
static int skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS;

RB_PROTOTYPE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp);
RB_GENERATE_PREV(segtfreehead, sksegment, sg_node, sksegment_cmp);

SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, region,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, skmem_region_mib_get_sysctl, "S,sk_stats_region",
    "Skywalk region statistics");

static LCK_ATTR_DECLARE(skmem_region_lock_attr, 0, 0);
static LCK_GRP_DECLARE(skmem_region_lock_grp, "skmem_region");
static LCK_MTX_DECLARE_ATTR(skmem_region_lock, &skmem_region_lock_grp,
    &skmem_region_lock_attr);

/* protected by skmem_region_lock */
static TAILQ_HEAD(, skmem_region) skmem_region_head;

static thread_call_t skmem_region_update_tc;

#define SKMEM_REGION_UPDATE_INTERVAL 13 /* 13 seconds */
static uint32_t skmem_region_update_interval = SKMEM_REGION_UPDATE_INTERVAL;

#define SKMEM_WDT_MAXTIME 30 /* # of secs before watchdog */
#define SKMEM_WDT_PURGE 3 /* retry purge threshold */

#if (DEVELOPMENT || DEBUG)
/* Mean Time Between Failures (ms) */
static volatile uint64_t skmem_region_mtbf;

static int skmem_region_mtbf_sysctl(struct sysctl_oid *, void *, int,
    struct sysctl_req *);

SYSCTL_PROC(_kern_skywalk_mem, OID_AUTO, region_mtbf,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
    skmem_region_mtbf_sysctl, "Q", "Region MTBF (ms)");

SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, region_update_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_region_update_interval,
    SKMEM_REGION_UPDATE_INTERVAL, "Region update interval (sec)");
#endif /* (DEVELOPMENT || DEBUG) */

#define SKMEM_REGION_LOCK() \
	lck_mtx_lock(&skmem_region_lock)
#define SKMEM_REGION_LOCK_ASSERT_HELD() \
	LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_OWNED)
#define SKMEM_REGION_LOCK_ASSERT_NOTHELD() \
	LCK_MTX_ASSERT(&skmem_region_lock, LCK_MTX_ASSERT_NOTOWNED)
#define SKMEM_REGION_UNLOCK() \
	lck_mtx_unlock(&skmem_region_lock)

/*
 * Hash table bounds. Start with the initial value, and rescale up to
 * the specified limit. Ideally we don't need a limit, but in practice
 * this helps guard against runaways. These values should be revisited
 * in the future and adjusted as needed.
 */
#define SKMEM_REGION_HASH_INITIAL 32 /* initial hash table size */
#define SKMEM_REGION_HASH_LIMIT 4096 /* hash table size limit */

#define SKMEM_REGION_HASH_INDEX(_a, _s, _m) \
	(((_a) + ((_a) >> (_s)) + ((_a) >> ((_s) << 1))) & (_m))
#define SKMEM_REGION_HASH(_skr, _addr) \
	(&(_skr)->skr_hash_table[SKMEM_REGION_HASH_INDEX((uintptr_t)_addr, \
	(_skr)->skr_hash_shift, (_skr)->skr_hash_mask)])
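
/*
 * For example (illustrative values): with a 32 KB segment size,
 * skr_hash_shift is flsll(32768) - 1 = 15, and the initial mask is
 * SKMEM_REGION_HASH_INITIAL - 1 = 31; a segment address of 0x470008000
 * then hashes to bucket (0x470008000 + 0x8e001 + 0x11) & 0x1f = 18.
 */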

static SKMEM_TYPE_DEFINE(skr_zone, struct skmem_region);

/*
 * XXX: This is used in only one function (skmem_region_init) after the
 * -fbounds-safety changes were made for Skmem. We can remove this global and
 * just make it a local variable to the function (skmem_region_init).
 */
static unsigned int sg_size; /* size of zone element */
static struct skmem_cache *skmem_sg_cache; /* cache for sksegment */

static uint32_t skmem_seg_size = SKMEM_SEG_SIZE;
static uint32_t skmem_md_seg_size = SKMEM_MD_SEG_SIZE;
static uint32_t skmem_drv_buf_seg_size = SKMEM_DRV_BUF_SEG_SIZE;
static uint32_t skmem_drv_buf_seg_eff_size = SKMEM_DRV_BUF_SEG_SIZE;
uint32_t skmem_usr_buf_seg_size = SKMEM_USR_BUF_SEG_SIZE;

#define SKMEM_TAG_SEGMENT_BMAP "com.apple.skywalk.segment.bmap"
static SKMEM_TAG_DEFINE(skmem_tag_segment_bmap, SKMEM_TAG_SEGMENT_BMAP);

#define SKMEM_TAG_SEGMENT_HASH "com.apple.skywalk.segment.hash"
static SKMEM_TAG_DEFINE(skmem_tag_segment_hash, SKMEM_TAG_SEGMENT_HASH);

#define SKMEM_TAG_REGION_MIB "com.apple.skywalk.region.mib"
static SKMEM_TAG_DEFINE(skmem_tag_region_mib, SKMEM_TAG_REGION_MIB);

#define BMAPSZ 64

/* 64-bit mask with range */
#define BMASK64(_beg, _end) \
	((((uint64_t)-1) >> ((BMAPSZ - 1) - (_end))) & ~((1ULL << (_beg)) - 1))
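
/*
 * For example, BMASK64(4, 7) == (~0ULL >> 56) & ~0xfULL == 0xf0,
 * i.e. bits 4 through 7 inclusive are set.
 */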

static int __skmem_region_inited = 0;

/*
 * XXX -fbounds-safety: we added seg_size to skmem_region_alloc_common(), but
 * this is only used by -fbounds-safety, so we add __unused if -fbounds-safety
 * is disabled. The utility macro for that is SK_FB_ARG().
 * We do the same for skmem_region_alloc(), with objsize.
 */
#if !__has_ptrcheck
#define SK_FB_ARG __unused
#else
#define SK_FB_ARG
#endif

void
skmem_region_init(void)
{
	boolean_t randomize_seg_size;

	static_assert(sizeof(bitmap_t) == sizeof(uint64_t));
	static_assert(BMAPSZ == (sizeof(bitmap_t) << 3));
	static_assert((SKMEM_SEG_SIZE % SKMEM_PAGE_SIZE) == 0);
	static_assert(SKMEM_REGION_HASH_LIMIT >= SKMEM_REGION_HASH_INITIAL);
	ASSERT(!__skmem_region_inited);

	/* enforce the ordering here */
	static_assert(SKMEM_REGION_GUARD_HEAD == 0);
	static_assert(SKMEM_REGION_SCHEMA == 1);
	static_assert(SKMEM_REGION_RING == 2);
	static_assert(SKMEM_REGION_BUF_DEF == 3);
	static_assert(SKMEM_REGION_BUF_LARGE == 4);
	static_assert(SKMEM_REGION_RXBUF_DEF == 5);
	static_assert(SKMEM_REGION_RXBUF_LARGE == 6);
	static_assert(SKMEM_REGION_TXBUF_DEF == 7);
	static_assert(SKMEM_REGION_TXBUF_LARGE == 8);
	static_assert(SKMEM_REGION_UMD == 9);
	static_assert(SKMEM_REGION_TXAUSD == 10);
	static_assert(SKMEM_REGION_RXFUSD == 11);
	static_assert(SKMEM_REGION_UBFT == 12);
	static_assert(SKMEM_REGION_USTATS == 13);
	static_assert(SKMEM_REGION_FLOWADV == 14);
	static_assert(SKMEM_REGION_NEXUSADV == 15);
	static_assert(SKMEM_REGION_SYSCTLS == 16);
	static_assert(SKMEM_REGION_GUARD_TAIL == 17);
	static_assert(SKMEM_REGION_KMD == 18);
	static_assert(SKMEM_REGION_RXKMD == 19);
	static_assert(SKMEM_REGION_TXKMD == 20);
	static_assert(SKMEM_REGION_KBFT == 21);
	static_assert(SKMEM_REGION_RXKBFT == 22);
	static_assert(SKMEM_REGION_TXKBFT == 23);
	static_assert(SKMEM_REGION_TXAKSD == 24);
	static_assert(SKMEM_REGION_RXFKSD == 25);
	static_assert(SKMEM_REGION_KSTATS == 26);
	static_assert(SKMEM_REGION_INTRINSIC == 27);

	static_assert(SREG_GUARD_HEAD == SKMEM_REGION_GUARD_HEAD);
	static_assert(SREG_SCHEMA == SKMEM_REGION_SCHEMA);
	static_assert(SREG_RING == SKMEM_REGION_RING);
	static_assert(SREG_BUF_DEF == SKMEM_REGION_BUF_DEF);
	static_assert(SREG_BUF_LARGE == SKMEM_REGION_BUF_LARGE);
	static_assert(SREG_RXBUF_DEF == SKMEM_REGION_RXBUF_DEF);
	static_assert(SREG_RXBUF_LARGE == SKMEM_REGION_RXBUF_LARGE);
	static_assert(SREG_TXBUF_DEF == SKMEM_REGION_TXBUF_DEF);
	static_assert(SREG_TXBUF_LARGE == SKMEM_REGION_TXBUF_LARGE);
	static_assert(SREG_UMD == SKMEM_REGION_UMD);
	static_assert(SREG_TXAUSD == SKMEM_REGION_TXAUSD);
	static_assert(SREG_RXFUSD == SKMEM_REGION_RXFUSD);
	static_assert(SREG_UBFT == SKMEM_REGION_UBFT);
	static_assert(SREG_USTATS == SKMEM_REGION_USTATS);
	static_assert(SREG_FLOWADV == SKMEM_REGION_FLOWADV);
	static_assert(SREG_NEXUSADV == SKMEM_REGION_NEXUSADV);
	static_assert(SREG_SYSCTLS == SKMEM_REGION_SYSCTLS);
	static_assert(SREG_GUARD_TAIL == SKMEM_REGION_GUARD_TAIL);
	static_assert(SREG_KMD == SKMEM_REGION_KMD);
	static_assert(SREG_RXKMD == SKMEM_REGION_RXKMD);
	static_assert(SREG_TXKMD == SKMEM_REGION_TXKMD);
	static_assert(SREG_KBFT == SKMEM_REGION_KBFT);
	static_assert(SREG_RXKBFT == SKMEM_REGION_RXKBFT);
	static_assert(SREG_TXKBFT == SKMEM_REGION_TXKBFT);
	static_assert(SREG_TXAKSD == SKMEM_REGION_TXAKSD);
	static_assert(SREG_RXFKSD == SKMEM_REGION_RXFKSD);
	static_assert(SREG_KSTATS == SKMEM_REGION_KSTATS);

	static_assert(SKR_MODE_NOREDIRECT == SREG_MODE_NOREDIRECT);
	static_assert(SKR_MODE_MMAPOK == SREG_MODE_MMAPOK);
	static_assert(SKR_MODE_UREADONLY == SREG_MODE_UREADONLY);
	static_assert(SKR_MODE_KREADONLY == SREG_MODE_KREADONLY);
	static_assert(SKR_MODE_PERSISTENT == SREG_MODE_PERSISTENT);
	static_assert(SKR_MODE_MONOLITHIC == SREG_MODE_MONOLITHIC);
	static_assert(SKR_MODE_NOMAGAZINES == SREG_MODE_NOMAGAZINES);
	static_assert(SKR_MODE_NOCACHE == SREG_MODE_NOCACHE);
	static_assert(SKR_MODE_IODIR_IN == SREG_MODE_IODIR_IN);
	static_assert(SKR_MODE_IODIR_OUT == SREG_MODE_IODIR_OUT);
	static_assert(SKR_MODE_GUARD == SREG_MODE_GUARD);
	static_assert(SKR_MODE_SEGPHYSCONTIG == SREG_MODE_SEGPHYSCONTIG);
	static_assert(SKR_MODE_SHAREOK == SREG_MODE_SHAREOK);
	static_assert(SKR_MODE_PUREDATA == SREG_MODE_PUREDATA);
	static_assert(SKR_MODE_PSEUDO == SREG_MODE_PSEUDO);
	static_assert(SKR_MODE_THREADSAFE == SREG_MODE_THREADSAFE);
	static_assert(SKR_MODE_SLAB == SREG_MODE_SLAB);
	static_assert(SKR_MODE_MIRRORED == SREG_MODE_MIRRORED);

	(void) PE_parse_boot_argn("skmem_seg_size", &skmem_seg_size,
	    sizeof(skmem_seg_size));
	if (skmem_seg_size < SKMEM_MIN_SEG_SIZE) {
		skmem_seg_size = SKMEM_MIN_SEG_SIZE;
	}
	skmem_seg_size = (uint32_t)P2ROUNDUP(skmem_seg_size,
	    SKMEM_MIN_SEG_SIZE);
	VERIFY(skmem_seg_size != 0 && (skmem_seg_size % SKMEM_PAGE_SIZE) == 0);

	(void) PE_parse_boot_argn("skmem_md_seg_size", &skmem_md_seg_size,
	    sizeof(skmem_md_seg_size));
	if (skmem_md_seg_size < skmem_seg_size) {
		skmem_md_seg_size = skmem_seg_size;
	}
	skmem_md_seg_size = (uint32_t)P2ROUNDUP(skmem_md_seg_size,
	    SKMEM_MIN_SEG_SIZE);
	VERIFY((skmem_md_seg_size % SKMEM_PAGE_SIZE) == 0);

	/*
	 * If set via boot-args, honor it and don't randomize.
	 */
	randomize_seg_size = !PE_parse_boot_argn("skmem_drv_buf_seg_size",
	    &skmem_drv_buf_seg_size, sizeof(skmem_drv_buf_seg_size));
	if (skmem_drv_buf_seg_size < skmem_seg_size) {
		skmem_drv_buf_seg_size = skmem_seg_size;
	}
	skmem_drv_buf_seg_size = skmem_drv_buf_seg_eff_size =
	    (uint32_t)P2ROUNDUP(skmem_drv_buf_seg_size, SKMEM_MIN_SEG_SIZE);
	VERIFY((skmem_drv_buf_seg_size % SKMEM_PAGE_SIZE) == 0);

	/*
	 * Randomize the driver buffer segment size; here we choose
	 * a SKMEM_MIN_SEG_SIZE multiplier to bump up the value to.
	 * Set this as the effective driver buffer segment size.
	 */
	if (randomize_seg_size) {
		uint32_t sm;
		read_frandom(&sm, sizeof(sm));
		skmem_drv_buf_seg_eff_size +=
		    (SKMEM_MIN_SEG_SIZE * (sm % SKMEM_DRV_BUF_SEG_MULTIPLIER));
		VERIFY((skmem_drv_buf_seg_eff_size % SKMEM_MIN_SEG_SIZE) == 0);
	}
	VERIFY(skmem_drv_buf_seg_eff_size >= skmem_drv_buf_seg_size);

	(void) PE_parse_boot_argn("skmem_usr_buf_seg_size",
	    &skmem_usr_buf_seg_size, sizeof(skmem_usr_buf_seg_size));
	if (skmem_usr_buf_seg_size < skmem_seg_size) {
		skmem_usr_buf_seg_size = skmem_seg_size;
	}
	skmem_usr_buf_seg_size = (uint32_t)P2ROUNDUP(skmem_usr_buf_seg_size,
	    SKMEM_MIN_SEG_SIZE);
	VERIFY((skmem_usr_buf_seg_size % SKMEM_PAGE_SIZE) == 0);

	SK_D("seg_size %u, md_seg_size %u, drv_buf_seg_size %u [eff %u], "
	    "usr_buf_seg_size %u", skmem_seg_size, skmem_md_seg_size,
	    skmem_drv_buf_seg_size, skmem_drv_buf_seg_eff_size,
	    skmem_usr_buf_seg_size);
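
	/*
	 * For example (assuming an illustrative SKMEM_MIN_SEG_SIZE of
	 * 32 KB): a "skmem_drv_buf_seg_size=40000" boot-arg would be
	 * rounded up to 65536 above with no randomization, whereas
	 * without the boot-arg a random multiplier of 3 would yield an
	 * effective size of skmem_drv_buf_seg_size + 3 * 32768.
	 */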

	TAILQ_INIT(&skmem_region_head);

	skmem_region_update_tc =
	    thread_call_allocate_with_options(skmem_region_update_func,
	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (skmem_region_update_tc == NULL) {
		panic("%s: thread_call_allocate failed", __func__);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	sg_size = sizeof(struct sksegment);
	skmem_sg_cache = skmem_cache_create("sg", sg_size,
	    sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);

	/* and start the periodic region update machinery */
	skmem_dispatch(skmem_region_update_tc, NULL,
	    (skmem_region_update_interval * NSEC_PER_SEC));

	__skmem_region_inited = 1;
}

void
skmem_region_fini(void)
{
	if (__skmem_region_inited) {
		ASSERT(TAILQ_EMPTY(&skmem_region_head));

		if (skmem_region_update_tc != NULL) {
			(void) thread_call_cancel_wait(skmem_region_update_tc);
			(void) thread_call_free(skmem_region_update_tc);
			skmem_region_update_tc = NULL;
		}

		if (skmem_sg_cache != NULL) {
			skmem_cache_destroy(skmem_sg_cache);
			skmem_sg_cache = NULL;
		}

		__skmem_region_inited = 0;
	}
}

/*
 * Reap internal caches.
 */
void
skmem_region_reap_caches(boolean_t purge)
{
	skmem_cache_reap_now(skmem_sg_cache, purge);
}

/*
 * Configure and compute the parameters of a region.
 */
void
skmem_region_params_config(struct skmem_region_params *srp)
{
	uint32_t cache_line_size = skmem_cpu_cache_line_size();
	size_t seglim, segsize, segcnt;
	size_t objsize, objcnt;

	ASSERT(srp->srp_id < SKMEM_REGIONS);

	/*
	 * If the magazines layer is disabled system-wide, override
	 * the region parameter here. This will effectively reduce
	 * the number of requested objects computed below. Note that
	 * the region may have already been configured to exclude
	 * magazines in the default skmem_regions[] array.
	 */
	if (!skmem_allow_magazines()) {
		srp->srp_cflags |= SKMEM_REGION_CR_NOMAGAZINES;
	}

	objsize = srp->srp_r_obj_size;
	ASSERT(objsize != 0);
	objcnt = srp->srp_r_obj_cnt;
	ASSERT(objcnt != 0);

	if (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO) {
		size_t align = srp->srp_align;

		VERIFY(align != 0 && (align % SKMEM_CACHE_ALIGN) == 0);
		VERIFY(powerof2(align));
		objsize = MAX(objsize, sizeof(uint64_t));
#if KASAN
		/*
		 * When KASAN is enabled, the zone allocator adjusts the
		 * element size to include the redzone regions, in which
		 * case we assume that the elements won't start on the
		 * alignment boundary and thus need to do some fix-ups.
		 * These include increasing the effective object size
		 * which adds at least 16 bytes to the original size.
		 */
		objsize += sizeof(uint64_t) + align;
#endif /* KASAN */
		objsize = P2ROUNDUP(objsize, align);

		segsize = objsize;
		srp->srp_r_seg_size = (uint32_t)segsize;
		segcnt = objcnt;
		goto done;
	} else {
		/* objects are always aligned at CPU cache line size */
		srp->srp_align = cache_line_size;
	}

	/*
	 * Start with the default segment size for the region, and compute
	 * the effective segment size (to the nearest SKMEM_MIN_SEG_SIZE).
	 * If the object size is greater, then we adjust the segment size
	 * to the next multiple of the effective size larger than the
	 * object size.
	 */
	if (srp->srp_r_seg_size == 0) {
		switch (srp->srp_id) {
		case SKMEM_REGION_UMD:
		case SKMEM_REGION_KMD:
		case SKMEM_REGION_RXKMD:
		case SKMEM_REGION_TXKMD:
			srp->srp_r_seg_size = skmem_md_seg_size;
			break;

		case SKMEM_REGION_BUF_DEF:
		case SKMEM_REGION_RXBUF_DEF:
		case SKMEM_REGION_TXBUF_DEF:
			/*
			 * Use the effective driver buffer segment size,
			 * since it reflects any randomization done at
			 * skmem_region_init() time.
			 */
			srp->srp_r_seg_size = skmem_drv_buf_seg_eff_size;
			break;

		default:
			srp->srp_r_seg_size = skmem_seg_size;
			break;
		}
	} else {
		srp->srp_r_seg_size = (uint32_t)P2ROUNDUP(srp->srp_r_seg_size,
		    SKMEM_MIN_SEG_SIZE);
	}

	seglim = srp->srp_r_seg_size;
	VERIFY(seglim != 0 && (seglim % SKMEM_PAGE_SIZE) == 0);

	SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu objcnt %zu",
	    srp->srp_name, seglim, objsize, objcnt);

	/*
	 * Make sure the object size is a multiple of the CPU cache line
	 * size, and that we can evenly divide the segment size.
	 */
	if (!((objsize < cache_line_size) && (objsize < seglim) &&
	    ((cache_line_size % objsize) == 0) && ((seglim % objsize) == 0))) {
		objsize = P2ROUNDUP(objsize, cache_line_size);
		while (objsize < seglim && (seglim % objsize) != 0) {
			SK_DF(SK_VERB_MEM, "%s: objsize %zu -> %zu",
			    srp->srp_name, objsize, objsize + cache_line_size);
			objsize += cache_line_size;
		}
	}

	/* segment must be larger than object */
	while (objsize > seglim) {
		SK_DF(SK_VERB_MEM, "%s: seglim %zu -> %zu", srp->srp_name,
		    seglim, seglim + SKMEM_MIN_SEG_SIZE);
		seglim += SKMEM_MIN_SEG_SIZE;
	}

	/*
	 * Take into account worst-case per-CPU cached
	 * objects if this region is configured for it.
	 */
	if (!(srp->srp_cflags & SKMEM_REGION_CR_NOMAGAZINES)) {
		uint32_t magazine_max_objs =
		    skmem_cache_magazine_max((uint32_t)objsize);
		SK_DF(SK_VERB_MEM, "%s: objcnt %zu -> %zu", srp->srp_name,
		    objcnt, objcnt + magazine_max_objs);
		objcnt += magazine_max_objs;
	}

	SK_DF(SK_VERB_MEM, "%s: seglim %zu objsize %zu "
	    "objcnt %zu", srp->srp_name, seglim, objsize, objcnt);

	segsize = P2ROUNDUP(objsize * objcnt, SKMEM_MIN_SEG_SIZE);
	if (seglim > segsize) {
		/*
		 * If the segment limit is larger than what we need,
		 * avoid memory wastage by shrinking it.
		 */
		while (seglim > segsize && seglim > SKMEM_MIN_SEG_SIZE) {
			VERIFY(seglim >= SKMEM_MIN_SEG_SIZE);
			SK_DF(SK_VERB_MEM,
			    "%s: segsize %zu (%zu*%zu) seglim [-] %zu -> %zu",
			    srp->srp_name, segsize, objsize, objcnt, seglim,
			    P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE,
			    SKMEM_MIN_SEG_SIZE));
			seglim = P2ROUNDUP(seglim - SKMEM_MIN_SEG_SIZE,
			    SKMEM_MIN_SEG_SIZE);
		}

		/* adjust segment size */
		segsize = seglim;
	} else if (seglim < segsize) {
		size_t oseglim = seglim;
		/*
		 * If the segment limit is less than the segment size,
		 * see if increasing it slightly (up to 1.5x the segment
		 * size) would allow us to avoid allocating too many
		 * extra objects (due to excessive segment count).
		 */
		while (seglim < segsize && (segsize % seglim) != 0) {
			SK_DF(SK_VERB_MEM,
			    "%s: segsize %zu (%zu*%zu) seglim [+] %zu -> %zu",
			    srp->srp_name, segsize, objsize, objcnt, seglim,
			    (seglim + SKMEM_MIN_SEG_SIZE));
			seglim += SKMEM_MIN_SEG_SIZE;
			if (seglim >= (oseglim + (oseglim >> 1))) {
				break;
			}
		}

		/* can't use P2ROUNDUP since seglim may not be power of 2 */
		segsize = SK_ROUNDUP(segsize, seglim);
	}
	ASSERT(segsize != 0 && (segsize % seglim) == 0);

	SK_DF(SK_VERB_MEM, "%s: segsize %zu seglim %zu",
	    srp->srp_name, segsize, seglim);

	/* compute segment count, and recompute segment size */
	if (srp->srp_cflags & SKMEM_REGION_CR_MONOLITHIC) {
		segcnt = 1;
	} else {
		/*
		 * The adjustments above were done in increments of
		 * SKMEM_MIN_SEG_SIZE. If the object size is greater
		 * than that, ensure that the segment size is a multiple
		 * of the object size.
		 */
		if (objsize > SKMEM_MIN_SEG_SIZE) {
			ASSERT(seglim >= objsize);
			if ((seglim % objsize) != 0) {
				seglim += (seglim - objsize);
			}
			/* recompute segsize; see SK_ROUNDUP comment above */
			segsize = SK_ROUNDUP(segsize, seglim);
		}

		segcnt = MAX(1, (segsize / seglim));
		segsize /= segcnt;
	}

	SK_DF(SK_VERB_MEM, "%s: segcnt %zu segsize %zu",
	    srp->srp_name, segcnt, segsize);

	/* recompute object count to avoid wastage */
	objcnt = (segsize * segcnt) / objsize;
	ASSERT(objcnt != 0);
done:
	srp->srp_c_obj_size = (uint32_t)objsize;
	srp->srp_c_obj_cnt = (uint32_t)objcnt;
	srp->srp_c_seg_size = (uint32_t)segsize;
	srp->srp_seg_cnt = (uint32_t)segcnt;

	SK_DF(SK_VERB_MEM, "%s: objsize %zu objcnt %zu segcnt %zu segsize %zu",
	    srp->srp_name, objsize, objcnt, segcnt, segsize);

#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		char label[32];
		(void) snprintf(label, sizeof(label), "REGION_%s:",
		    skmem_region_id2name(srp->srp_id));
		SK_D("%-16s o:[%4u x %6u -> %4u x %6u]", label,
		    (uint32_t)srp->srp_r_obj_cnt,
		    (uint32_t)srp->srp_r_obj_size,
		    (uint32_t)srp->srp_c_obj_cnt,
		    (uint32_t)srp->srp_c_obj_size);
	}
#endif /* SK_LOG */
}
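
/*
 * A worked example of the computation above (illustrative values, assuming
 * a 128-byte CPU cache line and a 32 KB SKMEM_MIN_SEG_SIZE): requesting
 * 512 objects of 2048 bytes with magazines disabled keeps objsize at 2048
 * (it evenly divides the 32 KB segment limit), computes segsize =
 * P2ROUNDUP(2048 * 512, 32768) = 1 MB, splits that into segcnt = 32
 * segments of 32 KB each, and ends up with (32768 * 32) / 2048 = 512
 * objects.
 */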

/*
 * Create a region.
 */
struct skmem_region *
skmem_region_create(const char *name, struct skmem_region_params *srp,
    sksegment_ctor_fn_t ctor, sksegment_dtor_fn_t dtor, void *private)
{
	boolean_t pseudo = (srp->srp_cflags & SKMEM_REGION_CR_PSEUDO);
	uint32_t cflags = srp->srp_cflags;
	struct skmem_region *skr;
	uint32_t i;

	ASSERT(srp->srp_id < SKMEM_REGIONS);
	ASSERT(srp->srp_c_seg_size != 0 &&
	    (pseudo || (srp->srp_c_seg_size % SKMEM_PAGE_SIZE) == 0));
	ASSERT(srp->srp_seg_cnt != 0);
	ASSERT(srp->srp_c_obj_cnt == 1 ||
	    (srp->srp_c_seg_size % srp->srp_c_obj_size) == 0);
	ASSERT(srp->srp_c_obj_size <= srp->srp_c_seg_size);

	skr = zalloc_flags(skr_zone, Z_WAITOK | Z_ZERO);
	skr->skr_params.srp_r_seg_size = srp->srp_r_seg_size;
	skr->skr_seg_size = srp->srp_c_seg_size;
	skr->skr_size = (srp->srp_c_seg_size * srp->srp_seg_cnt);
	skr->skr_seg_objs = (srp->srp_c_seg_size / srp->srp_c_obj_size);

	if (!pseudo) {
		skr->skr_seg_max_cnt = srp->srp_seg_cnt;

		/* set alignment to CPU cache line size */
		skr->skr_params.srp_align = skmem_cpu_cache_line_size();

		/* allocate the allocated-address hash chain */
		skr->skr_hash_initial = SKMEM_REGION_HASH_INITIAL;
		skr->skr_hash_limit = SKMEM_REGION_HASH_LIMIT;
		uint32_t size = skr->skr_hash_initial;
		skr->skr_hash_table = sk_alloc_type_array(struct sksegment_bkt,
		    size, Z_WAITOK | Z_NOFAIL,
		    skmem_tag_segment_hash);
		skr->skr_hash_size = size;
		skr->skr_hash_mask = (skr->skr_hash_initial - 1);
		skr->skr_hash_shift = flsll(srp->srp_c_seg_size) - 1;

		for (i = 0; i < (skr->skr_hash_mask + 1); i++) {
			TAILQ_INIT(&skr->skr_hash_table[i].sgb_head);
		}
	} else {
		/* this upper bound doesn't apply */
		skr->skr_seg_max_cnt = 0;

		/* pick up value set by skmem_region_params_config() */
		skr->skr_params.srp_align = srp->srp_align;
	}

	skr->skr_r_obj_size = srp->srp_r_obj_size;
	skr->skr_r_obj_cnt = srp->srp_r_obj_cnt;
	skr->skr_c_obj_size = srp->srp_c_obj_size;
	skr->skr_c_obj_cnt = srp->srp_c_obj_cnt;
	skr->skr_memtotal = skr->skr_seg_size * srp->srp_seg_cnt;

	skr->skr_params.srp_md_type = srp->srp_md_type;
	skr->skr_params.srp_md_subtype = srp->srp_md_subtype;
	skr->skr_params.srp_max_frags = srp->srp_max_frags;

	skr->skr_seg_ctor = ctor;
	skr->skr_seg_dtor = dtor;
	skr->skr_private = private;

	lck_mtx_init(&skr->skr_lock, &skmem_region_lock_grp,
	    &skmem_region_lock_attr);

	TAILQ_INIT(&skr->skr_seg_free);
	RB_INIT(&skr->skr_seg_tfree);

	skr->skr_id = srp->srp_id;
	uuid_generate_random(skr->skr_uuid);
	(void) snprintf(skr->skr_name, sizeof(skr->skr_name),
	    "%s.%s.%s", SKMEM_REGION_PREFIX, srp->srp_name, name);

	SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr %p ",
	    skr->skr_name, SK_KVA(skr));

	/* sanity check */
	ASSERT(!(cflags & SKMEM_REGION_CR_GUARD) ||
	    !(cflags & (SKMEM_REGION_CR_KREADONLY | SKMEM_REGION_CR_UREADONLY |
	    SKMEM_REGION_CR_PERSISTENT | SKMEM_REGION_CR_SHAREOK |
	    SKMEM_REGION_CR_IODIR_IN | SKMEM_REGION_CR_IODIR_OUT |
	    SKMEM_REGION_CR_PUREDATA)));

	skr->skr_cflags = cflags;
	if (cflags & SKMEM_REGION_CR_NOREDIRECT) {
		skr->skr_mode |= SKR_MODE_NOREDIRECT;
	}
	if (cflags & SKMEM_REGION_CR_MMAPOK) {
		skr->skr_mode |= SKR_MODE_MMAPOK;
	}
	if ((cflags & SKMEM_REGION_CR_MMAPOK) &&
	    (cflags & SKMEM_REGION_CR_UREADONLY)) {
		skr->skr_mode |= SKR_MODE_UREADONLY;
	}
	if (cflags & SKMEM_REGION_CR_KREADONLY) {
		skr->skr_mode |= SKR_MODE_KREADONLY;
	}
	if (cflags & SKMEM_REGION_CR_PERSISTENT) {
		skr->skr_mode |= SKR_MODE_PERSISTENT;
	}
	if (cflags & SKMEM_REGION_CR_MONOLITHIC) {
		skr->skr_mode |= SKR_MODE_MONOLITHIC;
	}
	if (cflags & SKMEM_REGION_CR_NOMAGAZINES) {
		skr->skr_mode |= SKR_MODE_NOMAGAZINES;
	}
	if (cflags & SKMEM_REGION_CR_NOCACHE) {
		skr->skr_mode |= SKR_MODE_NOCACHE;
	}
	if (cflags & SKMEM_REGION_CR_SEGPHYSCONTIG) {
		skr->skr_mode |= SKR_MODE_SEGPHYSCONTIG;
	}
	if (cflags & SKMEM_REGION_CR_SHAREOK) {
		skr->skr_mode |= SKR_MODE_SHAREOK;
	}
	if (cflags & SKMEM_REGION_CR_IODIR_IN) {
		skr->skr_mode |= SKR_MODE_IODIR_IN;
	}
	if (cflags & SKMEM_REGION_CR_IODIR_OUT) {
		skr->skr_mode |= SKR_MODE_IODIR_OUT;
	}
	if (cflags & SKMEM_REGION_CR_GUARD) {
		skr->skr_mode |= SKR_MODE_GUARD;
	}
	if (cflags & SKMEM_REGION_CR_PUREDATA) {
		skr->skr_mode |= SKR_MODE_PUREDATA;
	}
	if (cflags & SKMEM_REGION_CR_PSEUDO) {
		skr->skr_mode |= SKR_MODE_PSEUDO;
	}
	if (cflags & SKMEM_REGION_CR_THREADSAFE) {
		skr->skr_mode |= SKR_MODE_THREADSAFE;
	}
	if (cflags & SKMEM_REGION_CR_MEMTAG) {
		skr->skr_mode |= SKR_MODE_MEMTAG;
	}

#if XNU_TARGET_OS_OSX
	/*
	 * Mark all regions as persistent except for the guard and Intrinsic
	 * regions.
	 * This is to ensure that kernel threads won't be faulting-in while
	 * accessing these memory regions. We have observed various kinds of
	 * kernel panics due to kernel threads faulting on non-wired memory
	 * access when the VM subsystem is not in a state to swap-in the page.
	 */
	if (!((skr->skr_mode & SKR_MODE_PSEUDO) ||
	    (skr->skr_mode & SKR_MODE_GUARD))) {
		skr->skr_mode |= SKR_MODE_PERSISTENT;
	}
#endif /* XNU_TARGET_OS_OSX */

	/* SKR_MODE_UREADONLY only takes effect for user task mapping */
	skr->skr_bufspec.user_writable = !(skr->skr_mode & SKR_MODE_UREADONLY);
	skr->skr_bufspec.kernel_writable = !(skr->skr_mode & SKR_MODE_KREADONLY);
	/* Regions containing pointers are wired (i.e. not pageable nor purgeable) */
	skr->skr_bufspec.purgeable = !(skr->skr_mode & SKR_MODE_MEMTAG);
	skr->skr_bufspec.inhibitCache = !!(skr->skr_mode & SKR_MODE_NOCACHE);
	skr->skr_bufspec.physcontig = (skr->skr_mode & SKR_MODE_SEGPHYSCONTIG);
	skr->skr_bufspec.iodir_in = !!(skr->skr_mode & SKR_MODE_IODIR_IN);
	skr->skr_bufspec.iodir_out = !!(skr->skr_mode & SKR_MODE_IODIR_OUT);
	skr->skr_bufspec.puredata = !!(skr->skr_mode & SKR_MODE_PUREDATA);
	skr->skr_bufspec.threadSafe = !!(skr->skr_mode & SKR_MODE_THREADSAFE);
	skr->skr_regspec.noRedirect = !!(skr->skr_mode & SKR_MODE_NOREDIRECT);
	skr->skr_bufspec.memtag = !!(skr->skr_mode & SKR_MODE_MEMTAG);
	/* allocate segment bitmaps */
	if (!(skr->skr_mode & SKR_MODE_PSEUDO)) {
		ASSERT(skr->skr_seg_max_cnt != 0);
		skr->skr_seg_bmap_len = BITMAP_LEN(skr->skr_seg_max_cnt);
		size_t size = BITMAP_SIZE(skr->skr_seg_max_cnt);
		skr->skr_seg_bmap = sk_alloc_data(size,
		    Z_WAITOK | Z_NOFAIL, skmem_tag_segment_bmap);
		skr->skr_seg_bmap_size = size;
		ASSERT(BITMAP_SIZE(skr->skr_seg_max_cnt) ==
		    (skr->skr_seg_bmap_len * sizeof(*skr->skr_seg_bmap)));

		/* mark all bitmaps as free (bit set) */
		bitmap_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt);
	}

	/*
	 * Populate the freelist by allocating all segments for the
	 * region, which will be mapped but not faulted-in, and then
	 * immediately insert each to the freelist. That will in
	 * turn unmap the segment's memory object.
	 */
	SKR_LOCK(skr);
	if (skr->skr_mode & SKR_MODE_PSEUDO) {
		char zone_name[64];
		(void) snprintf(zone_name, sizeof(zone_name), "%s.reg.%s",
		    SKMEM_ZONE_PREFIX, name);
		skr->skr_zreg = zone_create(zone_name, skr->skr_c_obj_size,
		    ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
	} else {
		/* create a backing IOSKRegion object */
		if ((skr->skr_reg = IOSKRegionCreate(&skr->skr_regspec,
		    (IOSKSize)skr->skr_seg_size,
		    (IOSKCount)skr->skr_seg_max_cnt)) == NULL) {
			SK_ERR("\"%s\": [%u * %u] cflags 0x%x skr_reg failed",
			    skr->skr_name, (uint32_t)skr->skr_seg_size,
			    (uint32_t)skr->skr_seg_max_cnt, skr->skr_cflags);
			goto failed;
		}
	}

	ASSERT(skr->skr_seg_objs != 0);

	++skr->skr_refcnt; /* for caller */
	SKR_UNLOCK(skr);

	SKMEM_REGION_LOCK();
	TAILQ_INSERT_TAIL(&skmem_region_head, skr, skr_link);
	SKMEM_REGION_UNLOCK();

	SK_DF(SK_VERB_MEM_REGION,
	    " [TOTAL] seg (%u*%u) obj (%u*%u) cflags 0x%x",
	    (uint32_t)skr->skr_seg_size, (uint32_t)skr->skr_seg_max_cnt,
	    (uint32_t)skr->skr_c_obj_size, (uint32_t)skr->skr_c_obj_cnt,
	    skr->skr_cflags);

	return skr;

failed:
	SKR_LOCK_ASSERT_HELD(skr);
	skmem_region_destroy(skr);

	return NULL;
}

/*
 * Destroy a region.
 */
static void
skmem_region_destroy(struct skmem_region *skr)
{
	struct skmem_region *mskr;

	SKR_LOCK_ASSERT_HELD(skr);

	SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr %p",
	    skr->skr_name, SK_KVA(skr));

	/*
	 * Panic if we detect there are unfreed segments; the caller
	 * destroying this region is responsible for ensuring that all
	 * allocated segments have been freed prior to getting here.
	 */
	ASSERT(skr->skr_refcnt == 0);
	if (skr->skr_seginuse != 0) {
		panic("%s: '%s' (%p) not empty (%u unfreed)",
		    __func__, skr->skr_name, (void *)skr, skr->skr_seginuse);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (skr->skr_link.tqe_next != NULL || skr->skr_link.tqe_prev != NULL) {
		SKR_UNLOCK(skr);
		SKMEM_REGION_LOCK();
		TAILQ_REMOVE(&skmem_region_head, skr, skr_link);
		SKMEM_REGION_UNLOCK();
		SKR_LOCK(skr);
		ASSERT(skr->skr_refcnt == 0);
	}

	/*
	 * Undo what's done earlier at region creation time.
	 */
	skmem_region_depopulate(skr);
	ASSERT(TAILQ_EMPTY(&skr->skr_seg_free));
	ASSERT(RB_EMPTY(&skr->skr_seg_tfree));
	ASSERT(skr->skr_seg_free_cnt == 0);

	if (skr->skr_reg != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
		IOSKRegionDestroy(skr->skr_reg);
		skr->skr_reg = NULL;
	}

	if (skr->skr_zreg != NULL) {
		ASSERT(skr->skr_mode & SKR_MODE_PSEUDO);
		zdestroy(skr->skr_zreg);
		skr->skr_zreg = NULL;
	}

	if (skr->skr_seg_bmap != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
#if (DEBUG || DEVELOPMENT)
		ASSERT(skr->skr_seg_bmap_len != 0);
		/* must have been set to vacant (bit set) by now */
		assert(bitmap_is_full(skr->skr_seg_bmap, skr->skr_seg_max_cnt));
#endif /* DEBUG || DEVELOPMENT */

		bitmap_t *__indexable bmap = skr->skr_seg_bmap;
		sk_free_data(bmap, skr->skr_seg_bmap_size);
		skr->skr_seg_bmap = NULL;
		skr->skr_seg_bmap_size = 0;
		skr->skr_seg_bmap_len = 0;
	}
	ASSERT(skr->skr_seg_bmap_len == 0);

	if (skr->skr_hash_table != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
#if (DEBUG || DEVELOPMENT)
		for (uint32_t i = 0; i < (skr->skr_hash_mask + 1); i++) {
			ASSERT(TAILQ_EMPTY(&skr->skr_hash_table[i].sgb_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		struct sksegment_bkt *__indexable htable = skr->skr_hash_table;
		sk_free_type_array(struct sksegment_bkt, skr->skr_hash_size,
		    htable);
		skr->skr_hash_table = NULL;
		skr->skr_hash_size = 0;
		htable = NULL;
	}
	if ((mskr = skr->skr_mirror) != NULL) {
		ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
		skr->skr_mirror = NULL;
		mskr->skr_mode &= ~SKR_MODE_MIRRORED;
	}
	SKR_UNLOCK(skr);

	if (mskr != NULL) {
		skmem_region_release(mskr);
	}

	lck_mtx_destroy(&skr->skr_lock, &skmem_region_lock_grp);

	zfree(skr_zone, skr);
}

/*
 * Mirror mskr (slave) to skr (master).
 */
void
skmem_region_mirror(struct skmem_region *skr, struct skmem_region *mskr)
{
	ASSERT(mskr != NULL);
	SK_DF(SK_VERB_MEM_REGION, "skr master %p, slave %p ",
	    SK_KVA(skr), SK_KVA(mskr));

	SKR_LOCK(skr);
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	ASSERT(!(mskr->skr_mode & SKR_MODE_MIRRORED));
	ASSERT(skr->skr_mirror == NULL);

	/* both regions must share identical parameters */
	ASSERT(skr->skr_size == mskr->skr_size);
	ASSERT(skr->skr_seg_size == mskr->skr_seg_size);
	ASSERT(skr->skr_seg_free_cnt == mskr->skr_seg_free_cnt);

	skr->skr_mirror = mskr;
	skmem_region_retain(mskr);
	mskr->skr_mode |= SKR_MODE_MIRRORED;
	SKR_UNLOCK(skr);
}
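
/*
 * Illustrative sketch (hypothetical region names): given a master kernel
 * metadata region kmd_skr and a user counterpart umd_skr created with
 * identical size parameters,
 *
 *	skmem_region_mirror(kmd_skr, umd_skr);
 *
 * makes subsequent skmem_region_alloc() calls on kmd_skr also allocate
 * the shadow segment at the same index from umd_skr, and
 * skmem_region_free() on kmd_skr frees both.
 */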

void
skmem_region_slab_config(struct skmem_region *skr, struct skmem_cache *skm,
    bool attach)
{
	int i;

	SKR_LOCK(skr);
	if (attach) {
		for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] != NULL;
		    i++) {
			;
		}
		VERIFY(i < SKR_MAX_CACHES);
		ASSERT(skr->skr_cache[i] == NULL);
		skr->skr_mode |= SKR_MODE_SLAB;
		skr->skr_cache[i] = skm;
		skmem_region_retain_locked(skr);
		SKR_UNLOCK(skr);
	} else {
		ASSERT(skr->skr_mode & SKR_MODE_SLAB);
		for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] != skm;
		    i++) {
			;
		}
		VERIFY(i < SKR_MAX_CACHES);
		ASSERT(skr->skr_cache[i] == skm);
		skr->skr_cache[i] = NULL;
		for (i = 0; i < SKR_MAX_CACHES && skr->skr_cache[i] == NULL;
		    i++) {
			;
		}
		if (i == SKR_MAX_CACHES) {
			skr->skr_mode &= ~SKR_MODE_SLAB;
		}
		if (!skmem_region_release_locked(skr)) {
			SKR_UNLOCK(skr);
		}
	}
}

/*
 * Common routines for skmem_region_{alloc,mirror_alloc}.
 */
static void *
__sized_by(objsize)
skmem_region_alloc_common(struct skmem_region *skr, struct sksegment *sg,
    uint32_t SK_FB_ARG objsize)
{
	struct sksegment_bkt *sgb;
	uint32_t SK_FB_ARG seg_sz = 0;
	void *__sized_by(seg_sz) addr;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(sg->sg_md != NULL);
	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
	addr = __unsafe_forge_bidi_indexable(void *, (void *)sg->sg_start, objsize);
	seg_sz = objsize;
	sgb = SKMEM_REGION_HASH(skr, addr);
	ASSERT(sg->sg_link.tqe_next == NULL);
	ASSERT(sg->sg_link.tqe_prev == NULL);
	TAILQ_INSERT_HEAD(&sgb->sgb_head, sg, sg_link);

	skr->skr_seginuse++;
	skr->skr_meminuse += skr->skr_seg_size;
	if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) {
		skr->skr_w_meminuse += skr->skr_seg_size;
	}
	skr->skr_alloc++;

	return addr;
}

/*
 * Allocate a segment from the region.
 * XXX -fbounds-safety: there are only 5 callers of this function, so it
 * was easier to just add objsize to the function signature.
 * XXX -fbounds-safety: until we have __sized_by_or_null (rdar://75598414), we
 * can't pass NULL, but instead create a variable whose value is NULL. Also,
 * once rdar://83900556 lands, -fbounds-safety will do size checking at return.
 * So we need to come back to this once rdar://75598414 and rdar://83900556
 * land.
 */
void *
__sized_by(objsize)
skmem_region_alloc(struct skmem_region *skr, void *__sized_by(*msize) *maddr,
    struct sksegment **retsg, struct sksegment **retsgm, uint32_t skmflag,
    uint32_t SK_FB_ARG objsize, uint32_t *SK_FB_ARG msize)
{
	struct sksegment *sg = NULL;
	struct sksegment *__single sg1 = NULL;
	void *__indexable addr = NULL, *__indexable addr1 = NULL;
	uint32_t retries = 0;

	VERIFY(!(skr->skr_mode & SKR_MODE_GUARD));

	if (retsg != NULL) {
		*retsg = NULL;
	}
	if (retsgm != NULL) {
		*retsgm = NULL;
	}

	/* SKMEM_NOSLEEP and SKMEM_FAILOK are mutually exclusive */
	VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) !=
	    (SKMEM_NOSLEEP | SKMEM_FAILOK));

	SKR_LOCK(skr);
	while (sg == NULL) {
		/* see if there's a segment in the freelist */
		sg = TAILQ_FIRST(&skr->skr_seg_free);
		if (sg == NULL) {
			/* see if we can grow the freelist */
			sg = sksegment_freelist_grow(skr);
			if (sg != NULL) {
				break;
			}

			if (skr->skr_mode & SKR_MODE_SLAB) {
				SKR_UNLOCK(skr);
				/*
				 * None found; it's possible that the slab
				 * layer is caching an extra amount, so ask
				 * skmem_cache to reap/purge its caches.
				 */
				for (int i = 0; i < SKR_MAX_CACHES; i++) {
					if (skr->skr_cache[i] == NULL) {
						continue;
					}
					skmem_cache_reap_now(skr->skr_cache[i],
					    TRUE);
				}
				SKR_LOCK(skr);
				/*
				 * If we manage to get some freed, try again.
				 */
				if (TAILQ_FIRST(&skr->skr_seg_free) != NULL) {
					continue;
				}
			}

			/*
			 * Give up if this is a non-blocking allocation,
			 * or if this is a blocking allocation but the
			 * caller is willing to retry.
			 */
			if (skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) {
				break;
			}

			/* otherwise we wait until one is available */
			++skr->skr_seg_waiters;
			(void) msleep(&skr->skr_seg_free, &skr->skr_lock,
			    (PZERO - 1), skr->skr_name, NULL);
		}
	}

	SKR_LOCK_ASSERT_HELD(skr);

	if (sg != NULL) {
retry:
		/*
		 * We have a segment; remove it from the freelist and
		 * insert it into the allocated-address hash chain.
		 * Note that this may return NULL if we can't allocate
		 * the memory descriptor.
		 */
		if (sksegment_freelist_remove(skr, sg, skmflag,
		    FALSE) == NULL) {
			ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
			ASSERT(sg->sg_md == NULL);
			ASSERT(sg->sg_start == 0 && sg->sg_end == 0);

			/*
			 * If it's a non-blocking allocation, simply give
			 * up and let the caller decide when to retry. Else,
			 * it gets a bit complicated due to the contract we
			 * have for blocking allocations with the client; the
			 * most sensible thing to do here is to retry the
			 * allocation ourselves. Note that we keep using the
			 * same segment we originally got, since we only need
			 * the memory descriptor to be allocated for it; thus
			 * we make sure we don't release the region lock when
			 * retrying allocation. Doing so is crucial when the
			 * region is mirrored, since the segment indices on
			 * both regions need to match.
			 */
			if (skmflag & SKMEM_NOSLEEP) {
				SK_ERR("\"%s\": failed to allocate segment "
				    "(non-sleeping mode)", skr->skr_name);
				sg = NULL;
			} else {
				if (++retries > SKMEM_WDT_MAXTIME) {
					panic_plain("\"%s\": failed to "
					    "allocate segment (sleeping mode) "
					    "after %u retries\n\n%s",
					    skr->skr_name, SKMEM_WDT_MAXTIME,
					    skmem_dump(skr));
					/* NOTREACHED */
					__builtin_unreachable();
				} else {
					SK_ERR("\"%s\": failed to allocate "
					    "segment (sleeping mode): %u "
					    "retries", skr->skr_name, retries);
				}
				if (skr->skr_mode & SKR_MODE_SLAB) {
					/*
					 * We can't get any memory descriptor
					 * for this segment; reap extra cached
					 * objects from the slab layer and hope
					 * that we get lucky next time around.
					 *
					 * XXX [email protected]: perhaps also
					 * trigger the zone allocator to do
					 * its garbage collection here?
					 */
					skmem_cache_reap();
				}
				delay(1 * USEC_PER_SEC); /* 1 sec */
				goto retry;
			}
		}

		if (sg != NULL) {
			/* insert to allocated-address hash chain */
			addr = skmem_region_alloc_common(skr, sg,
			    skr->skr_seg_size);
		}
	}

	if (sg == NULL) {
		VERIFY(skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK));
		if (skmflag & SKMEM_PANIC) {
			VERIFY((skmflag & (SKMEM_NOSLEEP | SKMEM_FAILOK)) ==
			    SKMEM_NOSLEEP);
			/*
			 * If this is a failed non-blocking alloc and the
			 * caller insists that it must be successful, then
			 * panic.
			 */
			panic_plain("\"%s\": skr 0x%p unable to satisfy "
			    "mandatory allocation\n", skr->skr_name, skr);
			/* NOTREACHED */
			__builtin_unreachable();
		} else {
			/*
			 * Give up if this is a non-blocking allocation,
			 * or one where the caller is willing to handle
			 * allocation failures.
			 */
			goto done;
		}
	}

	ASSERT((mach_vm_address_t)addr == sg->sg_start);

#if SK_LOG
	SK_DF(SK_VERB_MEM_REGION, "skr %p sg %p",
	    SK_KVA(skr), SK_KVA(sg));
	if (skr->skr_mirror == NULL ||
	    !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) {
		SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p)",
		    sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
	} else {
		SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p) mirrored",
		    sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
	}
#endif /* SK_LOG */

	/*
	 * If mirroring, allocate shadow object from slave region.
	 */
	if (skr->skr_mirror != NULL) {
		ASSERT(skr->skr_mirror != skr);
		ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
		ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED);
		addr1 = skmem_region_mirror_alloc(skr->skr_mirror, sg,
		    skr->skr_mirror->skr_seg_size, &sg1);
		ASSERT(addr1 != NULL);
		ASSERT(sg1 != NULL && sg1 != sg);
		ASSERT(sg1->sg_index == sg->sg_index);
	}

done:
	SKR_UNLOCK(skr);

	/* return segment metadata to caller if asked (reference not needed) */
	if (addr != NULL) {
		if (retsg != NULL) {
			*retsg = sg;
		}
		if (retsgm != NULL) {
			*retsgm = sg1;
		}
	}

	if (maddr != NULL) {
		if (addr1) {
			*maddr = addr1;
			*msize = skr->skr_mirror->skr_seg_size;
		} else {
			*maddr = addr1;
			*msize = 0;
		}
	}

	return addr;
}
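
/*
 * Illustrative sketch (not part of the original flow): a non-blocking
 * allocation with the failure case handled by the caller; the variable
 * names are hypothetical.
 *
 *	struct sksegment *sg = NULL;
 *	void *buf;
 *
 *	buf = skmem_region_alloc(skr, NULL, &sg, NULL, SKMEM_NOSLEEP,
 *	    skr->skr_seg_size, NULL);
 *	if (buf == NULL) {
 *		return ENOMEM;		// freelist empty; retry later
 *	}
 */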

/*
 * Allocate a segment from a mirror region at the same index. While it
 * is a somewhat simplified variant of skmem_region_alloc, keeping it
 * separate allows us to avoid further convoluting that routine.
 */
static void *
__sized_by(seg_size)
skmem_region_mirror_alloc(struct skmem_region *skr, struct sksegment *sg0,
    uint32_t SK_FB_ARG seg_size, struct sksegment **__single retsg)
{
	struct sksegment sg_key = { .sg_index = sg0->sg_index };
	struct sksegment *sg = NULL;
	void *addr = NULL;

	ASSERT(skr->skr_mode & SKR_MODE_MIRRORED);
	ASSERT(skr->skr_mirror == NULL);
	ASSERT(sg0->sg_type == SKSEG_TYPE_ALLOC);

	if (retsg != NULL) {
		*retsg = NULL;
	}

	SKR_LOCK(skr);

	/*
	 * See if we can find one in the freelist first. Otherwise,
	 * create a new segment of the same index and add that to the
	 * freelist. We would always get a segment since both regions
	 * are synchronized when it comes to the indices of allocated
	 * segments.
	 */
	sg = RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key);
	if (sg == NULL) {
		sg = sksegment_alloc_with_idx(skr, sg0->sg_index);
		VERIFY(sg != NULL);
	}
	VERIFY(sg->sg_index == sg0->sg_index);

	/*
	 * We have a segment; remove it from the freelist and insert
	 * it into the allocated-address hash chain. This either
	 * succeeds or panics (SKMEM_PANIC) when a memory descriptor
	 * can't be allocated.
	 *
	 * TODO: consider retrying IOBMD allocation attempts if needed.
	 */
	sg = sksegment_freelist_remove(skr, sg, SKMEM_PANIC, FALSE);
	VERIFY(sg != NULL);

	/* insert to allocated-address hash chain */
	addr = skmem_region_alloc_common(skr, sg, skr->skr_seg_size);

#if SK_LOG
	SK_DF(SK_VERB_MEM_REGION, "skr %p sg %p",
	    SK_KVA(skr), SK_KVA(sg));
	SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p)",
	    sg->sg_index, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end));
#endif /* SK_LOG */

	SKR_UNLOCK(skr);

	/* return segment metadata to caller if asked (reference not needed) */
	if (retsg != NULL) {
		*retsg = sg;
	}

	return addr;
}

/*
 * Free a segment to the region.
 */
void
skmem_region_free(struct skmem_region *skr, void *addr, void *maddr)
{
	struct sksegment_bkt *sgb;
	struct sksegment *sg, *tsg;

	VERIFY(!(skr->skr_mode & SKR_MODE_GUARD));

	/*
	 * Search the hash chain to find a matching segment for the
	 * given address. If found, remove the segment from the
	 * hash chain and insert it into the freelist. Otherwise,
	 * we panic since the caller has given us a bogus address.
	 */
	SKR_LOCK(skr);
	sgb = SKMEM_REGION_HASH(skr, addr);
	TAILQ_FOREACH_SAFE(sg, &sgb->sgb_head, sg_link, tsg) {
		ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
		if (sg->sg_start == (mach_vm_address_t)addr) {
			TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link);
			sg->sg_link.tqe_next = NULL;
			sg->sg_link.tqe_prev = NULL;
			break;
		}
	}

	ASSERT(sg != NULL);
	if (sg->sg_state == SKSEG_STATE_MAPPED_WIRED) {
		ASSERT(skr->skr_w_meminuse >= skr->skr_seg_size);
		skr->skr_w_meminuse -= skr->skr_seg_size;
	}
	sksegment_freelist_insert(skr, sg, FALSE);

	ASSERT(skr->skr_seginuse != 0);
	skr->skr_seginuse--;
	skr->skr_meminuse -= skr->skr_seg_size;
	skr->skr_free++;

#if SK_LOG
	SK_DF(SK_VERB_MEM_REGION, "skr %p sg %p",
	    SK_KVA(skr), SK_KVA(sg));
	if (skr->skr_mirror == NULL ||
	    !(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED)) {
		SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p)",
		    sg->sg_index, SK_KVA(addr),
		    SK_KVA((uintptr_t)addr + skr->skr_seg_size));
	} else {
		SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p) mirrored",
		    sg->sg_index, SK_KVA(addr),
		    SK_KVA((uintptr_t)addr + skr->skr_seg_size));
	}
#endif /* SK_LOG */

	/*
	 * If mirroring, also free shadow object in slave region.
	 */
	if (skr->skr_mirror != NULL) {
		ASSERT(maddr != NULL);
		ASSERT(skr->skr_mirror != skr);
		ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
		ASSERT(skr->skr_mirror->skr_mode & SKR_MODE_MIRRORED);
		skmem_region_free(skr->skr_mirror, maddr, NULL);
	}

	/* wake up any blocked threads waiting for a segment */
	if (skr->skr_seg_waiters != 0) {
		SK_DF(SK_VERB_MEM_REGION,
		    "sg %p waking up %u waiters", SK_KVA(sg),
		    skr->skr_seg_waiters);
		skr->skr_seg_waiters = 0;
		wakeup(&skr->skr_seg_free);
	}
	SKR_UNLOCK(skr);
}

__attribute__((always_inline))
static inline void
skmem_region_retain_locked(struct skmem_region *skr)
{
	SKR_LOCK_ASSERT_HELD(skr);
	skr->skr_refcnt++;
	ASSERT(skr->skr_refcnt != 0);
}

/*
 * Retain a region.
 */
void
skmem_region_retain(struct skmem_region *skr)
{
	SKR_LOCK(skr);
	skmem_region_retain_locked(skr);
	SKR_UNLOCK(skr);
}

__attribute__((always_inline))
static inline boolean_t
skmem_region_release_locked(struct skmem_region *skr)
{
	SKR_LOCK_ASSERT_HELD(skr);
	ASSERT(skr->skr_refcnt != 0);
	if (--skr->skr_refcnt == 0) {
		skmem_region_destroy(skr);
		return TRUE;
	}
	return FALSE;
}

/*
 * Release (and potentially destroy) a region.
 */
boolean_t
skmem_region_release(struct skmem_region *skr)
{
	boolean_t lastref;

	SKR_LOCK(skr);
	if (!(lastref = skmem_region_release_locked(skr))) {
		SKR_UNLOCK(skr);
	}

	return lastref;
}
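
/*
 * Usage note: every skmem_region_retain() must be balanced by a
 * skmem_region_release(); the reference taken on behalf of the caller
 * at creation time is dropped the same way, and the final release tears
 * the region down via skmem_region_destroy().
 */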

/*
 * Depopulate the segment freelist.
 */
static void
skmem_region_depopulate(struct skmem_region *skr)
{
	struct sksegment *sg, *tsg;

	SK_DF(SK_VERB_MEM_REGION, "\"%s\": skr %p ",
	    skr->skr_name, SK_KVA(skr));

	SKR_LOCK_ASSERT_HELD(skr);
	ASSERT(skr->skr_seg_bmap_len != 0 || (skr->skr_mode & SKR_MODE_PSEUDO));

	TAILQ_FOREACH_SAFE(sg, &skr->skr_seg_free, sg_link, tsg) {
		struct sksegment *sg0;
		uint32_t i;

		i = sg->sg_index;
		sg0 = sksegment_freelist_remove(skr, sg, 0, TRUE);
		VERIFY(sg0 == sg);

		sksegment_destroy(skr, sg);
		ASSERT(bit_test(skr->skr_seg_bmap[i / BMAPSZ], i % BMAPSZ));
	}
}

/*
 * Free tree segment compare routine.
 */
static int
sksegment_cmp(const struct sksegment *sg1, const struct sksegment *sg2)
{
	return sg1->sg_index - sg2->sg_index;
}
1583
/*
 * Create a segment.
 *
 * Upon success, clear the bit for the segment's index in skr_seg_bmap bitmap.
 */
static struct sksegment *
sksegment_create(struct skmem_region *skr, uint32_t i)
{
	struct sksegment *__single sg = NULL;
	bitmap_t *bmap;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
	ASSERT(i < skr->skr_seg_max_cnt);
	ASSERT(skr->skr_reg != NULL);
	ASSERT(skr->skr_seg_size == round_page(skr->skr_seg_size));

	bmap = &skr->skr_seg_bmap[i / BMAPSZ];
	ASSERT(bit_test(*bmap, i % BMAPSZ));

	sg = skmem_cache_alloc(skmem_sg_cache, SKMEM_SLEEP);
	bzero(sg, sizeof(*sg));

	sg->sg_region = skr;
	sg->sg_index = i;
	sg->sg_state = SKSEG_STATE_DETACHED;

	/* claim it (clear bit) */
	bit_clear(*bmap, i % BMAPSZ);

	SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p) 0x%x", i,
	    SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), skr->skr_mode);

	return sg;
}

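/*
 * Bitmap layout example (illustrative, assuming BMAPSZ == 64, i.e. one
 * bit per index in 64-bit words): segment index 70 lives in word
 * skr_seg_bmap[70 / 64] == skr_seg_bmap[1], at bit 70 % 64 == 6. A set
 * bit means the index is vacant; sksegment_create() clears it to claim
 * the index, and sksegment_destroy() below sets it again on release.
 */
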
/*
 * Destroy a segment.
 *
 * Set the bit for the segment's index in skr_seg_bmap bitmap,
 * indicating that it is now vacant.
 */
static void
sksegment_destroy(struct skmem_region *skr, struct sksegment *sg)
{
	uint32_t i = sg->sg_index;
	bitmap_t *bmap;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
	ASSERT(skr == sg->sg_region);
	ASSERT(skr->skr_reg != NULL);
	ASSERT(sg->sg_type == SKSEG_TYPE_DESTROYED);
	ASSERT(i < skr->skr_seg_max_cnt);

	bmap = &skr->skr_seg_bmap[i / BMAPSZ];
	ASSERT(!bit_test(*bmap, i % BMAPSZ));

	SK_DF(SK_VERB_MEM_REGION, " [%u] [%p-%p) 0x%x",
	    i, SK_KVA(sg->sg_start), SK_KVA(sg->sg_end), skr->skr_mode);

	/*
	 * Undo what's done earlier at segment creation time.
	 */

	ASSERT(sg->sg_md == NULL);
	ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
	ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);

	/* release it (set bit) */
	bit_set(*bmap, i % BMAPSZ);

	skmem_cache_free(skmem_sg_cache, sg);
}

/*
 * Insert a segment into the freelist (freeing the segment).
 */
static void
sksegment_freelist_insert(struct skmem_region *skr, struct sksegment *sg,
    boolean_t populating)
{
	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
	ASSERT(sg->sg_type != SKSEG_TYPE_FREE);
	ASSERT(skr == sg->sg_region);
	ASSERT(skr->skr_reg != NULL);
	ASSERT(sg->sg_index < skr->skr_seg_max_cnt);

	/*
	 * If the region is being populated, then we're done.
	 */
	if (__improbable(populating)) {
		ASSERT(sg->sg_md == NULL);
		ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
		ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
	} else {
		IOSKMemoryBufferRef __single md;
		IOReturn err;

		ASSERT(sg->sg_md != NULL);
		ASSERT(sg->sg_start != 0 && sg->sg_end != 0);

		/*
		 * Let the client remove the memory from the IOMMU,
		 * and unwire it.
		 */
		if (skr->skr_seg_dtor != NULL) {
			skr->skr_seg_dtor(sg, sg->sg_md, skr->skr_private);
		}

		ASSERT(sg->sg_state == SKSEG_STATE_MAPPED ||
		    sg->sg_state == SKSEG_STATE_MAPPED_WIRED);

		IOSKRegionClearBufferDebug(skr->skr_reg, sg->sg_index, &md);
		VERIFY(sg->sg_md == md);

		/*
		 * If persistent, unwire this memory now. But do not unwire
		 * memtag regions, as they come from zalloc.
		 */
		if ((skr->skr_mode & SKR_MODE_PERSISTENT) &&
		    !(skr->skr_mode & SKR_MODE_MEMTAG)) {
			err = IOSKMemoryUnwire(md);
			if (err != kIOReturnSuccess) {
				panic("Failed to unwire md %p, err %d", md, err);
			}
		}

		/* mark memory as empty/discarded for consistency */
		if (!(skr->skr_mode & SKR_MODE_MEMTAG)) {
			err = IOSKMemoryDiscard(md);
			if (err != kIOReturnSuccess) {
				panic("Failed to discard md %p, err %d", md, err);
			}
		}

		IOSKMemoryDestroy(md);
		sg->sg_md = NULL;
		sg->sg_start = sg->sg_end = 0;
		sg->sg_state = SKSEG_STATE_DETACHED;
	}

	sg->sg_type = SKSEG_TYPE_FREE;
	ASSERT(sg->sg_link.tqe_next == NULL);
	ASSERT(sg->sg_link.tqe_prev == NULL);
	TAILQ_INSERT_TAIL(&skr->skr_seg_free, sg, sg_link);
	ASSERT(sg->sg_node.rbe_left == NULL);
	ASSERT(sg->sg_node.rbe_right == NULL);
	ASSERT(sg->sg_node.rbe_parent == NULL);
	RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
	++skr->skr_seg_free_cnt;
	ASSERT(skr->skr_seg_free_cnt <= skr->skr_seg_max_cnt);
}

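/*
 * Sketch of a matching client-supplied segment destructor as invoked
 * above (the driver names are hypothetical): it must undo whatever the
 * constructor did, i.e. remove the buffer from the device IOMMU and
 * unwire it if the client wired it at construction time.
 *
 *	static void
 *	mydrv_seg_dtor(struct sksegment *sg, IOSKMemoryBufferRef md,
 *	    void *arg)
 *	{
 *		struct mydrv_softc *sc = arg;
 *
 *		mydrv_iommu_unmap(sc, md);
 *	}
 */
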
/*
 * Remove a segment from the freelist (allocating the segment).
 */
static struct sksegment *
sksegment_freelist_remove(struct skmem_region *skr, struct sksegment *sg,
    uint32_t skmflag, boolean_t purging)
{
	mach_vm_address_t segstart;
	IOReturn err;
	int ret;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
	ASSERT(sg != NULL);
	ASSERT(skr == sg->sg_region);
	ASSERT(skr->skr_reg != NULL);
	ASSERT(sg->sg_type == SKSEG_TYPE_FREE);
	ASSERT(sg->sg_index < skr->skr_seg_max_cnt);

#if (DEVELOPMENT || DEBUG)
	uint64_t mtbf = skmem_region_get_mtbf();
	/*
	 * MTBF doesn't apply when SKMEM_PANIC is set, as the caller
	 * would assert.
	 */
	if (__improbable(mtbf != 0 && !purging &&
	    (net_uptime_ms() % mtbf) == 0 &&
	    !(skmflag & SKMEM_PANIC))) {
		SK_ERR("skr \"%s\" %p sg %p MTBF failure",
		    skr->skr_name, SK_KVA(skr), SK_KVA(sg));
		net_update_uptime();
		return NULL;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	TAILQ_REMOVE(&skr->skr_seg_free, sg, sg_link);
	sg->sg_link.tqe_next = NULL;
	sg->sg_link.tqe_prev = NULL;
	RB_REMOVE(segtfreehead, &skr->skr_seg_tfree, sg);
	sg->sg_node.rbe_left = NULL;
	sg->sg_node.rbe_right = NULL;
	sg->sg_node.rbe_parent = NULL;

	ASSERT(skr->skr_seg_free_cnt != 0);
	--skr->skr_seg_free_cnt;

	/*
	 * If the region is being depopulated, then we're done.
	 */
	if (__improbable(purging)) {
		ASSERT(sg->sg_md == NULL);
		ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
		ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);
		sg->sg_type = SKSEG_TYPE_DESTROYED;
		return sg;
	}

	ASSERT(sg->sg_md == NULL);
	ASSERT(sg->sg_start == 0 && sg->sg_end == 0);
	ASSERT(sg->sg_state == SKSEG_STATE_DETACHED);

	/* created as non-volatile (mapped) upon success */
	if ((sg->sg_md = IOSKMemoryBufferCreate(skr->skr_seg_size,
	    &skr->skr_bufspec, &segstart)) == NULL) {
		ASSERT(sg->sg_type == SKSEG_TYPE_FREE);
		if (skmflag & SKMEM_PANIC) {
			/* if the caller insists on success, then panic */
			panic_plain("\"%s\": skr 0x%p sg 0x%p (idx %u) unable "
			    "to satisfy mandatory allocation\n", skr->skr_name,
			    skr, sg, sg->sg_index);
			/* NOTREACHED */
			__builtin_unreachable();
		}
		/* reinsert this segment into the freelist */
		ASSERT(sg->sg_link.tqe_next == NULL);
		ASSERT(sg->sg_link.tqe_prev == NULL);
		TAILQ_INSERT_HEAD(&skr->skr_seg_free, sg, sg_link);
		ASSERT(sg->sg_node.rbe_left == NULL);
		ASSERT(sg->sg_node.rbe_right == NULL);
		ASSERT(sg->sg_node.rbe_parent == NULL);
		RB_INSERT(segtfreehead, &skr->skr_seg_tfree, sg);
		++skr->skr_seg_free_cnt;
		return NULL;
	}

	sg->sg_start = segstart;
	sg->sg_end = (segstart + skr->skr_seg_size);
	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);

	/* mark memory as non-volatile just to be consistent */
	if (!(skr->skr_mode & SKR_MODE_MEMTAG)) {
		err = IOSKMemoryReclaim(sg->sg_md);
		if (err != kIOReturnSuccess) {
			panic("Failed to reclaim md %p, err %d", sg->sg_md, err);
		}
	}

	/*
	 * If persistent, wire down its memory now. But do not wire memtag
	 * regions, as they come from zalloc.
	 */
	if ((skr->skr_mode & SKR_MODE_PERSISTENT) &&
	    !(skr->skr_mode & SKR_MODE_MEMTAG)) {
		err = IOSKMemoryWire(sg->sg_md);
		if (err != kIOReturnSuccess) {
			panic("Failed to wire md %p, err %d", sg->sg_md, err);
		}
	}

	err = IOSKRegionSetBuffer(skr->skr_reg, sg->sg_index, sg->sg_md);
	if (err != kIOReturnSuccess) {
		panic("Failed to set md %p, err %d", sg->sg_md, err);
	}

	/*
	 * Let the client wire it and insert it into the IOMMU, if
	 * applicable.  Try to find out if it's wired and set the
	 * right state.
	 */
	if (skr->skr_seg_ctor != NULL) {
		ret = skr->skr_seg_ctor(sg, sg->sg_md, skr->skr_private);
		/* handle segment creation failure from driver power down */
		if (__improbable(ret != 0)) {
			SK_ERR("segment constructor for sg %p failed, err %d",
			    sg, ret);
			if (ret == ENOMEM) {
				/*
				 * Undo IOSKMemoryBufferCreate,
				 * IOSKMemoryReclaim, IOSKMemoryWire and
				 * IOSKRegionSetBuffer, then return the
				 * segment to the freelist.
				 */
				IOSKMemoryBufferRef __single md;
				IOSKRegionClearBufferDebug(skr->skr_reg,
				    sg->sg_index, &md);
				VERIFY(sg->sg_md == md);
				if ((skr->skr_mode & SKR_MODE_PERSISTENT) &&
				    !(skr->skr_mode & SKR_MODE_MEMTAG)) {
					err = IOSKMemoryUnwire(md);
					if (err != kIOReturnSuccess) {
						panic("Failed to unwire md %p, "
						    "err %d", md, err);
					}
				}
				/* mark memory as empty/discarded */
				if (!(skr->skr_mode & SKR_MODE_MEMTAG)) {
					err = IOSKMemoryDiscard(md);
					if (err != kIOReturnSuccess) {
						panic("Failed to discard md %p, "
						    "err %d", md, err);
					}
				}
				IOSKMemoryDestroy(md);
				sg->sg_md = NULL;
				sg->sg_start = sg->sg_end = 0;
				sg->sg_state = SKSEG_STATE_DETACHED;
				/* reinsert this segment into the freelist */
				TAILQ_INSERT_HEAD(&skr->skr_seg_free, sg,
				    sg_link);
				RB_INSERT(segtfreehead, &skr->skr_seg_tfree,
				    sg);
				++skr->skr_seg_free_cnt;
				return NULL;
			}
		}
	}

	sg->sg_state = IOSKBufferIsWired(sg->sg_md) ?
	    SKSEG_STATE_MAPPED_WIRED : SKSEG_STATE_MAPPED;

	ASSERT(sg->sg_md != NULL);
	ASSERT(sg->sg_start != 0 && sg->sg_end != 0);

	sg->sg_type = SKSEG_TYPE_ALLOC;
	return sg;
}

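/*
 * Sketch of a client-supplied segment constructor as invoked above. The
 * driver names are hypothetical; the contract implied by this file is
 * that the constructor wires the buffer and/or maps it into the device
 * IOMMU, returning 0 on success or ENOMEM when backing resources are
 * unavailable (e.g. the device is powering down):
 *
 *	static int
 *	mydrv_seg_ctor(struct sksegment *sg, IOSKMemoryBufferRef md,
 *	    void *arg)
 *	{
 *		struct mydrv_softc *sc = arg;
 *
 *		if (!mydrv_power_is_on(sc)) {
 *			return ENOMEM;
 *		}
 *		return mydrv_iommu_map(sc, md);
 *	}
 */
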
/*
 * Find the first available index and allocate a segment at that index.
 */
static struct sksegment *
sksegment_freelist_grow(struct skmem_region *skr)
{
	struct sksegment *sg = NULL;
	uint32_t i, j, idx;

	SKR_LOCK_ASSERT_HELD(skr);

	ASSERT(!(skr->skr_mode & SKR_MODE_PSEUDO));
	ASSERT(skr->skr_seg_bmap_len != 0);
	ASSERT(skr->skr_seg_max_cnt != 0);

	for (i = 0; i < skr->skr_seg_bmap_len; i++) {
		bitmap_t *bmap, mask;
		uint32_t end = (BMAPSZ - 1);

		if (i == (skr->skr_seg_bmap_len - 1)) {
			end = (skr->skr_seg_max_cnt - 1) % BMAPSZ;
		}

		bmap = &skr->skr_seg_bmap[i];
		mask = BMASK64(0, end);

		j = ffsll((*bmap) & mask);
		if (j == 0) {
			continue;
		}

		--j;
		idx = (i * BMAPSZ) + j;

		sg = sksegment_alloc_with_idx(skr, idx);

		/* we're done */
		break;
	}

	ASSERT((sg != NULL) || (skr->skr_seginuse == skr->skr_seg_max_cnt));
	return sg;
}

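/*
 * Worked example for the scan above (assuming 64-bit bitmap words, i.e.
 * BMAPSZ == 64): if word i == 2 holds 0x30 (bits 4 and 5 set, meaning
 * indices 132 and 133 are vacant), then ffsll(0x30) returns 5, j becomes
 * 4 after the decrement, and idx = (2 * 64) + 4 = 132 -- the lowest
 * vacant index in the region.
 */
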
/*
 * Create a single segment at a specific index and add it to the freelist.
 */
static struct sksegment *
sksegment_alloc_with_idx(struct skmem_region *skr, uint32_t idx)
{
	struct sksegment *sg;

	SKR_LOCK_ASSERT_HELD(skr);

	if (!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ)) {
		panic("%s: '%s' (%p) idx %u (out of %u) is already allocated",
		    __func__, skr->skr_name, (void *)skr, idx,
		    (skr->skr_seg_max_cnt - 1));
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* must not fail, blocking alloc */
	sg = sksegment_create(skr, idx);
	VERIFY(sg != NULL);
	VERIFY(!bit_test(skr->skr_seg_bmap[idx / BMAPSZ], idx % BMAPSZ));

	/* populate the freelist */
	sksegment_freelist_insert(skr, sg, TRUE);
	ASSERT(sg == TAILQ_LAST(&skr->skr_seg_free, segfreehead));
#if (DEVELOPMENT || DEBUG)
	struct sksegment sg_key = { .sg_index = sg->sg_index };
	ASSERT(sg == RB_FIND(segtfreehead, &skr->skr_seg_tfree, &sg_key));
#endif /* (DEVELOPMENT || DEBUG) */

	SK_DF(SK_VERB_MEM_REGION, "sg %u/%u", (idx + 1), skr->skr_seg_max_cnt);

	return sg;
}

/*
 * Rescale the region's allocated-address hash table.
 */
static void
skmem_region_hash_rescale(struct skmem_region *skr)
{
	struct sksegment_bkt *__indexable old_table, *new_table;
	size_t old_size, new_size;
	uint32_t i, moved = 0;

	if (skr->skr_mode & SKR_MODE_PSEUDO) {
		ASSERT(skr->skr_hash_table == NULL);
		/* this is a no-op for a pseudo region */
		return;
	}

	ASSERT(skr->skr_hash_table != NULL);
	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_region_update_protected());

	/*
	 * To get a small average lookup time (lookup depth near 1.0), the
	 * hash table size should be roughly the same (not necessarily
	 * equivalent) as the region size.
	 */
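	/*
	 * Worked example: with skr_seginuse == 100, 3 * 100 + 4 == 304
	 * and flsll(304) == 9 (2^8 <= 304 < 2^9), so the computed size is
	 * 1 << (9 - 2) == 128 buckets -- roughly one bucket per allocated
	 * segment, subject to the initial/limit clamps below.
	 */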
	new_size = MAX(skr->skr_hash_initial,
	    (1 << (flsll(3 * skr->skr_seginuse + 4) - 2)));
	new_size = MIN(skr->skr_hash_limit, new_size);
	old_size = (skr->skr_hash_mask + 1);

	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
		return;
	}

	new_table = sk_alloc_type_array(struct sksegment_bkt, new_size,
	    Z_NOWAIT, skmem_tag_segment_hash);
	if (__improbable(new_table == NULL)) {
		return;
	}

	for (i = 0; i < new_size; i++) {
		TAILQ_INIT(&new_table[i].sgb_head);
	}

	SKR_LOCK(skr);

	old_size = (skr->skr_hash_mask + 1);
	old_table = skr->skr_hash_table;

	skr->skr_hash_mask = (uint32_t)(new_size - 1);
	skr->skr_hash_table = new_table;
	skr->skr_hash_size = new_size;
	skr->skr_rescale++;

	for (i = 0; i < old_size; i++) {
		struct sksegment_bkt *sgb = &old_table[i];
		struct sksegment_bkt *new_sgb;
		struct sksegment *sg;

		while ((sg = TAILQ_FIRST(&sgb->sgb_head)) != NULL) {
			TAILQ_REMOVE(&sgb->sgb_head, sg, sg_link);
			ASSERT(sg->sg_start != 0 && sg->sg_end != 0);
			new_sgb = SKMEM_REGION_HASH(skr, sg->sg_start);
			TAILQ_INSERT_TAIL(&new_sgb->sgb_head, sg, sg_link);
			++moved;
		}
		ASSERT(TAILQ_EMPTY(&sgb->sgb_head));
	}

	SK_DF(SK_VERB_MEM_REGION,
	    "skr %p old_size %zu new_size %zu [%u moved]", SK_KVA(skr),
	    old_size, new_size, moved);

	SKR_UNLOCK(skr);

	sk_free_type_array(struct sksegment_bkt, old_size, old_table);
}

/*
 * Apply a function to operate on all regions.
 */
static void
skmem_region_applyall(void (*func)(struct skmem_region *))
{
	struct skmem_region *skr;

	net_update_uptime();

	SKMEM_REGION_LOCK();
	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
		func(skr);
	}
	SKMEM_REGION_UNLOCK();
}

static void
skmem_region_update(struct skmem_region *skr)
{
	SKMEM_REGION_LOCK_ASSERT_HELD();

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_region_update_protected());

	SKR_LOCK(skr);
	/*
	 * If there are threads blocked waiting for an available
	 * segment, wake them up periodically so they can issue
	 * another skmem_cache_reap() to reclaim resources cached
	 * by skmem_cache.
	 */
	if (skr->skr_seg_waiters != 0) {
		SK_DF(SK_VERB_MEM_REGION,
		    "waking up %u waiters to reclaim", skr->skr_seg_waiters);
		skr->skr_seg_waiters = 0;
		wakeup(&skr->skr_seg_free);
	}
	SKR_UNLOCK(skr);

	/*
	 * Rescale the hash table if needed.
	 */
	skmem_region_hash_rescale(skr);
}

/*
 * Thread call callback for update.
 */
static void
skmem_region_update_func(thread_call_param_t dummy, thread_call_param_t arg)
{
#pragma unused(dummy, arg)
	sk_protect_t protect;

	protect = sk_region_update_protect();
	skmem_region_applyall(skmem_region_update);
	sk_region_update_unprotect(protect);

	skmem_dispatch(skmem_region_update_tc, NULL,
	    (skmem_region_update_interval * NSEC_PER_SEC));
}

boolean_t
skmem_region_for_pp(skmem_region_id_t id)
{
	int i;

	for (i = 0; i < SKMEM_PP_REGIONS; i++) {
		if (id == skmem_pp_region_ids[i]) {
			return TRUE;
		}
	}
	return FALSE;
}

void
skmem_region_get_stats(struct skmem_region *skr, struct sk_stats_region *sreg)
{
	bzero(sreg, sizeof(*sreg));

	(void) snprintf(sreg->sreg_name, sizeof(sreg->sreg_name),
	    "%s", skr->skr_name);
	uuid_copy(sreg->sreg_uuid, skr->skr_uuid);
	sreg->sreg_id = (sk_stats_region_id_t)skr->skr_id;
	sreg->sreg_mode = skr->skr_mode;

	sreg->sreg_r_seg_size = skr->skr_params.srp_r_seg_size;
	sreg->sreg_c_seg_size = skr->skr_seg_size;
	sreg->sreg_seg_cnt = skr->skr_seg_max_cnt;
	sreg->sreg_seg_objs = skr->skr_seg_objs;
	sreg->sreg_r_obj_size = skr->skr_r_obj_size;
	sreg->sreg_r_obj_cnt = skr->skr_r_obj_cnt;
	sreg->sreg_c_obj_size = skr->skr_c_obj_size;
	sreg->sreg_c_obj_cnt = skr->skr_c_obj_cnt;
	sreg->sreg_align = skr->skr_align;
	sreg->sreg_max_frags = skr->skr_max_frags;

	sreg->sreg_meminuse = skr->skr_meminuse;
	sreg->sreg_w_meminuse = skr->skr_w_meminuse;
	sreg->sreg_memtotal = skr->skr_memtotal;
	sreg->sreg_seginuse = skr->skr_seginuse;
	sreg->sreg_rescale = skr->skr_rescale;
	sreg->sreg_hash_size = (skr->skr_hash_mask + 1);
	sreg->sreg_alloc = skr->skr_alloc;
	sreg->sreg_free = skr->skr_free;
}

static size_t
skmem_region_mib_get_stats(struct skmem_region *skr, void *__sized_by(len) out,
    size_t len)
{
	size_t actual_space = sizeof(struct sk_stats_region);
	struct sk_stats_region *__single sreg;

	if (out == NULL || len < actual_space) {
		goto done;
	}
	sreg = out;

	skmem_region_get_stats(skr, sreg);

done:
	return actual_space;
}

static int
skmem_region_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	struct skmem_region *skr;
	size_t actual_space;
	size_t buffer_space;
	size_t allocated_space = 0;
	caddr_t __sized_by(allocated_space) buffer = NULL;
	caddr_t scan;
	int error = 0;

	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		caddr_t temp;
		temp = sk_alloc_data(buffer_space, Z_WAITOK, skmem_tag_region_mib);
		if (__improbable(temp == NULL)) {
			return ENOBUFS;
		}
		buffer = temp;
		allocated_space = buffer_space;
	} else if (req->oldptr == USER_ADDR_NULL) {
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;

	SKMEM_REGION_LOCK();
	TAILQ_FOREACH(skr, &skmem_region_head, skr_link) {
		size_t size = skmem_region_mib_get_stats(skr, scan, buffer_space);
		if (scan != NULL) {
			if (buffer_space < size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += size;
			buffer_space -= size;
		}
		actual_space += size;
	}
	SKMEM_REGION_UNLOCK();

	if (actual_space != 0) {
		int out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data_sized_by(buffer, allocated_space);
	}

	return error;
}

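/*
 * The handler above follows the usual two-pass sysctl convention: a read
 * with a NULL old pointer returns only the required size. A userland
 * caller would therefore look like the sketch below (the MIB name is an
 * assumption; the actual OID registration lives elsewhere):
 *
 *	size_t len = 0;
 *	sysctlbyname("kern.skywalk.stats.region", NULL, &len, NULL, 0);
 *	void *buf = malloc(len);
 *	sysctlbyname("kern.skywalk.stats.region", buf, &len, NULL, 0);
 *	// buf now holds len / sizeof(struct sk_stats_region) entries
 */
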
#if SK_LOG
const char *
skmem_region_id2name(skmem_region_id_t id)
{
	const char *name;
	switch (id) {
	case SKMEM_REGION_SCHEMA:
		name = "SCHEMA";
		break;

	case SKMEM_REGION_RING:
		name = "RING";
		break;

	case SKMEM_REGION_BUF_DEF:
		name = "BUF_DEF";
		break;

	case SKMEM_REGION_BUF_LARGE:
		name = "BUF_LARGE";
		break;

	case SKMEM_REGION_RXBUF_DEF:
		name = "RXBUF_DEF";
		break;

	case SKMEM_REGION_RXBUF_LARGE:
		name = "RXBUF_LARGE";
		break;

	case SKMEM_REGION_TXBUF_DEF:
		name = "TXBUF_DEF";
		break;

	case SKMEM_REGION_TXBUF_LARGE:
		name = "TXBUF_LARGE";
		break;

	case SKMEM_REGION_UMD:
		name = "UMD";
		break;

	case SKMEM_REGION_TXAUSD:
		name = "TXAUSD";
		break;

	case SKMEM_REGION_RXFUSD:
		name = "RXFUSD";
		break;

	case SKMEM_REGION_USTATS:
		name = "USTATS";
		break;

	case SKMEM_REGION_FLOWADV:
		name = "FLOWADV";
		break;

	case SKMEM_REGION_NEXUSADV:
		name = "NEXUSADV";
		break;

	case SKMEM_REGION_SYSCTLS:
		name = "SYSCTLS";
		break;

	case SKMEM_REGION_GUARD_HEAD:
		name = "HEADGUARD";
		break;

	case SKMEM_REGION_GUARD_TAIL:
		name = "TAILGUARD";
		break;

	case SKMEM_REGION_KMD:
		name = "KMD";
		break;

	case SKMEM_REGION_RXKMD:
		name = "RXKMD";
		break;

	case SKMEM_REGION_TXKMD:
		name = "TXKMD";
		break;

	case SKMEM_REGION_TXAKSD:
		name = "TXAKSD";
		break;

	case SKMEM_REGION_RXFKSD:
		name = "RXFKSD";
		break;

	case SKMEM_REGION_KSTATS:
		name = "KSTATS";
		break;

	case SKMEM_REGION_KBFT:
		name = "KBFT";
		break;

	case SKMEM_REGION_UBFT:
		name = "UBFT";
		break;

	case SKMEM_REGION_RXKBFT:
		name = "RXKBFT";
		break;

	case SKMEM_REGION_TXKBFT:
		name = "TXKBFT";
		break;

	case SKMEM_REGION_INTRINSIC:
		name = "INTRINSIC";
		break;

	default:
		name = "UNKNOWN";
		break;
	}

	const char *__null_terminated s = __unsafe_null_terminated_from_indexable(name);

	return s;
}
#endif /* SK_LOG */

#if (DEVELOPMENT || DEBUG)
uint64_t
skmem_region_get_mtbf(void)
{
	return skmem_region_mtbf;
}

void
skmem_region_set_mtbf(uint64_t newval)
{
	if (newval < SKMEM_REGION_MTBF_MIN) {
		if (newval != 0) {
			newval = SKMEM_REGION_MTBF_MIN;
		}
	} else if (newval > SKMEM_REGION_MTBF_MAX) {
		newval = SKMEM_REGION_MTBF_MAX;
	}

	if (skmem_region_mtbf != newval) {
		os_atomic_store(&skmem_region_mtbf, newval, release);
		SK_ERR("MTBF set to %llu msec", skmem_region_mtbf);
	}
}

static int
skmem_region_mtbf_sysctl(struct sysctl_oid *oidp, void *arg1, int arg2,
    struct sysctl_req *req)
{
#pragma unused(oidp, arg1, arg2)
	int changed, error;
	uint64_t newval;

	static_assert(sizeof(skmem_region_mtbf) == sizeof(uint64_t));
	if ((error = sysctl_io_number(req, skmem_region_mtbf,
	    sizeof(uint64_t), &newval, &changed)) == 0) {
		if (changed) {
			skmem_region_set_mtbf(newval);
		}
	}
	return error;
}
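
/*
 * Example (sketch; the sysctl node name below is an assumption -- the
 * OID registration lives elsewhere): writing 1000 through this handler
 * makes segment allocations fail whenever net_uptime_ms() happens to be
 * a multiple of 1000, which exercises callers' allocation-failure paths
 * on DEVELOPMENT/DEBUG kernels:
 *
 *	sysctl kern.skywalk.region_mtbf=1000
 */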
#endif /* (DEVELOPMENT || DEBUG) */