1 /*
2 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h> /* for OSBacktrace */
33 #include <kern/sched_prim.h> /* for assert_wait */
34
35 /*
36 * Memory allocator with per-CPU caching (magazines), derived from the kmem
37 * magazine concept and implementation as described in the following paper:
38 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
39 *
40 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
41 * reserved. Use is subject to license terms.
42 *
43 * This derivative differs from the original kmem slab allocator, in that:
44 *
45 * a) There is always a discrete bufctl per object, even for small sizes.
46 * This increases the overhead, but is necessary as Skywalk objects
47 * coming from the slab may be shared (RO or RW) with userland; therefore
48 * embedding the KVA pointer linkage in freed objects is a non-starter.
49 *
50 * b) Writing patterns to the slab at slab creation or destruction time
51 * (when debugging is enabled) is not implemented, as the object may
52 * be shared (RW) with userland and thus we cannot panic upon pattern
53 * mismatch episodes. This can be relaxed so that we conditionally
54 * verify the pattern for kernel-only memory.
55 *
56 * This derivative also differs from Darwin's mcache allocator (which itself
57 * is a derivative of the original kmem slab allocator), in that:
58 *
59 * 1) The slab layer is internal to skmem_cache, unlike mcache's external
60 * slab layer required to support mbufs. skmem_cache also supports
61 * constructing and deconstructing objects, while mcache does not.
62 * This brings skmem_cache's model closer to that of the original
63 * kmem slab allocator.
64 *
65 * 2) mcache allows for batch allocation and free by way of chaining the
66 * objects together using a linked list. This requires using a part
67 * of the object to act as the linkage, which is against Skywalk's
68 * requirements of not exposing any KVA pointer to userland. Although
69 * this is supported by skmem_cache, chaining is only possible if the
70 * region is not mapped to userland. That implies that kernel-only
71 * objects can be chained provided the cache is created with batching
72 * mode enabled, and that the object is large enough to contain the
73 * skmem_obj structure.
74 *
75 * In other words, skmem_cache is a hybrid of a hybrid custom allocator that
76 * implements features that are required by Skywalk. In addition to being
 * aware of userland access on the buffers, it also supports mirrored backend
78 * memory regions. This allows a cache to manage two independent memory
79 * regions, such that allocating/freeing an object from/to one results in
80 * allocating/freeing a shadow object in another, thus guaranteeing that both
81 * objects share the same lifetime.
82 */
83
84 static uint32_t ncpu; /* total # of initialized CPUs */
85
86 static LCK_MTX_DECLARE_ATTR(skmem_cache_lock, &skmem_lock_grp, &skmem_lock_attr);
87 static struct thread *skmem_lock_owner = THREAD_NULL;
88
89 static LCK_GRP_DECLARE(skmem_sl_lock_grp, "skmem_slab");
90 static LCK_GRP_DECLARE(skmem_dp_lock_grp, "skmem_depot");
91 static LCK_GRP_DECLARE(skmem_cpu_lock_grp, "skmem_cpu_cache");
92
93 #define SKMEM_CACHE_LOCK() do { \
94 lck_mtx_lock(&skmem_cache_lock); \
95 skmem_lock_owner = current_thread(); \
96 } while (0)
97 #define SKMEM_CACHE_UNLOCK() do { \
98 skmem_lock_owner = THREAD_NULL; \
99 lck_mtx_unlock(&skmem_cache_lock); \
100 } while (0)
101 #define SKMEM_CACHE_LOCK_ASSERT_HELD() \
102 LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_OWNED)
103 #define SKMEM_CACHE_LOCK_ASSERT_NOTHELD() \
104 LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_NOTOWNED)
105
106 #define SKM_SLAB_LOCK(_skm) \
107 lck_mtx_lock(&(_skm)->skm_sl_lock)
108 #define SKM_SLAB_LOCK_ASSERT_HELD(_skm) \
109 LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_OWNED)
110 #define SKM_SLAB_LOCK_ASSERT_NOTHELD(_skm) \
111 LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_NOTOWNED)
112 #define SKM_SLAB_UNLOCK(_skm) \
113 lck_mtx_unlock(&(_skm)->skm_sl_lock)
114
115 #define SKM_DEPOT_LOCK(_skm) \
116 lck_mtx_lock(&(_skm)->skm_dp_lock)
117 #define SKM_DEPOT_LOCK_SPIN(_skm) \
118 lck_mtx_lock_spin(&(_skm)->skm_dp_lock)
119 #define SKM_DEPOT_CONVERT_LOCK(_skm) \
120 lck_mtx_convert_spin(&(_skm)->skm_dp_lock)
121 #define SKM_DEPOT_LOCK_TRY(_skm) \
122 lck_mtx_try_lock(&(_skm)->skm_dp_lock)
123 #define SKM_DEPOT_LOCK_ASSERT_HELD(_skm) \
124 LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_OWNED)
125 #define SKM_DEPOT_LOCK_ASSERT_NOTHELD(_skm) \
126 LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_NOTOWNED)
127 #define SKM_DEPOT_UNLOCK(_skm) \
128 lck_mtx_unlock(&(_skm)->skm_dp_lock)
129
130 #define SKM_RESIZE_LOCK(_skm) \
131 lck_mtx_lock(&(_skm)->skm_rs_lock)
132 #define SKM_RESIZE_LOCK_ASSERT_HELD(_skm) \
133 LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_OWNED)
134 #define SKM_RESIZE_LOCK_ASSERT_NOTHELD(_skm) \
135 LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_NOTOWNED)
136 #define SKM_RESIZE_UNLOCK(_skm) \
137 lck_mtx_unlock(&(_skm)->skm_rs_lock)
138
139 #define SKM_CPU_LOCK(_cp) \
140 lck_mtx_lock(&(_cp)->cp_lock)
141 #define SKM_CPU_LOCK_SPIN(_cp) \
142 lck_mtx_lock_spin(&(_cp)->cp_lock)
143 #define SKM_CPU_CONVERT_LOCK(_cp) \
144 lck_mtx_convert_spin(&(_cp)->cp_lock)
145 #define SKM_CPU_LOCK_ASSERT_HELD(_cp) \
146 LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_OWNED)
147 #define SKM_CPU_LOCK_ASSERT_NOTHELD(_cp) \
148 LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_NOTOWNED)
149 #define SKM_CPU_UNLOCK(_cp) \
150 lck_mtx_unlock(&(_cp)->cp_lock)
151
152 #define SKM_ZONE_MAX 256
153
154 static struct zone *skm_zone; /* zone for skmem_cache */
155
156 static struct skmem_cache *skmem_slab_cache; /* cache for skmem_slab */
157 static struct skmem_cache *skmem_bufctl_cache; /* cache for skmem_bufctl */
158 static unsigned int bc_size; /* size of bufctl */
159
160 /*
161 * Magazine types (one per row.)
162 *
163 * The first column defines the number of objects that the magazine can hold.
164 * Using that number, we derive the effective number: the aggregate count of
165 * object pointers, plus 2 pointers (skmem_mag linkage + magazine type).
166 * This would result in an object size that is aligned on the CPU cache
167 * size boundary; the exception to this is the KASAN mode where the size
168 * would be larger due to the redzone regions.
169 *
170 * The second column defines the alignment of the magazine. Because each
171 * magazine is used at the CPU-layer cache, we need to ensure there is no
172 * false sharing across the CPUs, and align the magazines to the maximum
173 * cache alignment size, for simplicity. The value of 0 may be used to
174 * indicate natural pointer size alignment.
175 *
176 * The third column defines the starting magazine type for a given cache,
177 * determined at the cache's creation time based on its chunk size.
178 *
179 * The fourth column defines the magazine type limit for a given cache.
180 * Magazine resizing will only occur if the chunk size is less than this.
181 */
182 static struct skmem_magtype skmem_magtype[] = {
183 #if defined(__LP64__)
184 { .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 128, .mt_maxbuf = 512,
185 .mt_cache = NULL, .mt_cname = "" },
186 { .mt_magsize = 30, .mt_align = 0, .mt_minbuf = 96, .mt_maxbuf = 256,
187 .mt_cache = NULL, .mt_cname = "" },
188 { .mt_magsize = 46, .mt_align = 0, .mt_minbuf = 64, .mt_maxbuf = 128,
189 .mt_cache = NULL, .mt_cname = "" },
190 { .mt_magsize = 62, .mt_align = 0, .mt_minbuf = 32, .mt_maxbuf = 64,
191 .mt_cache = NULL, .mt_cname = "" },
192 { .mt_magsize = 94, .mt_align = 0, .mt_minbuf = 16, .mt_maxbuf = 32,
193 .mt_cache = NULL, .mt_cname = "" },
194 { .mt_magsize = 126, .mt_align = 0, .mt_minbuf = 8, .mt_maxbuf = 16,
195 .mt_cache = NULL, .mt_cname = "" },
196 { .mt_magsize = 142, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 8,
197 .mt_cache = NULL, .mt_cname = "" },
198 { .mt_magsize = 158, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
199 .mt_cache = NULL, .mt_cname = "" },
200 #else /* !__LP64__ */
201 { .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
202 .mt_cache = NULL, .mt_cname = "" },
203 #endif /* !__LP64__ */
204 };
205
206 /*
207 * Hash table bounds. Start with the initial value, and rescale up to
208 * the specified limit. Ideally we don't need a limit, but in practice
209 * this helps guard against runaways. These values should be revisited
210 * in future and be adjusted as needed.
211 */
212 #define SKMEM_CACHE_HASH_INITIAL 64 /* initial hash table size */
213 #define SKMEM_CACHE_HASH_LIMIT 8192 /* hash table size limit */
214
215 #define SKMEM_CACHE_HASH_INDEX(_a, _s, _m) (((_a) >> (_s)) & (_m))
216 #define SKMEM_CACHE_HASH(_skm, _buf) \
217 (&(_skm)->skm_hash_table[SKMEM_CACHE_HASH_INDEX((uintptr_t)_buf, \
218 (_skm)->skm_hash_shift, (_skm)->skm_hash_mask)])
219
220 /*
221 * The last magazine type.
222 */
223 static struct skmem_magtype *skmem_cache_magsize_last;
224
225 static TAILQ_HEAD(, skmem_cache) skmem_cache_head;
226 static boolean_t skmem_cache_ready;
227
228 static int skmem_slab_alloc_locked(struct skmem_cache *,
229 struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
230 static void skmem_slab_free_locked(struct skmem_cache *, void *);
231 static int skmem_slab_alloc_pseudo_locked(struct skmem_cache *,
232 struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
233 static void skmem_slab_free_pseudo_locked(struct skmem_cache *, void *);
234 static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t);
235 static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *);
236 static int skmem_magazine_ctor(struct skmem_obj_info *,
237 struct skmem_obj_info *, void *, uint32_t);
238 static void skmem_magazine_destroy(struct skmem_cache *, struct skmem_mag *,
239 int);
240 static uint32_t skmem_depot_batch_alloc(struct skmem_cache *,
241 struct skmem_maglist *, uint32_t *, struct skmem_mag **, uint32_t);
242 static void skmem_depot_batch_free(struct skmem_cache *, struct skmem_maglist *,
243 uint32_t *, struct skmem_mag *);
244 static void skmem_depot_ws_update(struct skmem_cache *);
245 static void skmem_depot_ws_zero(struct skmem_cache *);
246 static void skmem_depot_ws_reap(struct skmem_cache *);
247 static void skmem_cache_magazine_purge(struct skmem_cache *);
248 static void skmem_cache_magazine_enable(struct skmem_cache *, uint32_t);
249 static void skmem_cache_magazine_resize(struct skmem_cache *);
250 static void skmem_cache_hash_rescale(struct skmem_cache *);
251 static void skmem_cpu_reload(struct skmem_cpu_cache *, struct skmem_mag *, int);
252 static void skmem_cpu_batch_reload(struct skmem_cpu_cache *,
253 struct skmem_mag *, int);
254 static void skmem_cache_applyall(void (*)(struct skmem_cache *, uint32_t),
255 uint32_t);
256 static void skmem_cache_reclaim(struct skmem_cache *, uint32_t);
257 static void skmem_cache_reap_start(void);
258 static void skmem_cache_reap_done(void);
259 static void skmem_cache_reap_func(thread_call_param_t, thread_call_param_t);
260 static void skmem_cache_update_func(thread_call_param_t, thread_call_param_t);
261 static int skmem_cache_resize_enter(struct skmem_cache *, boolean_t);
262 static void skmem_cache_resize_exit(struct skmem_cache *);
263 static void skmem_audit_bufctl(struct skmem_bufctl *);
264 static void skmem_audit_buf(struct skmem_cache *, struct skmem_obj *);
265 static int skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS;
266
267 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, cache,
268 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
269 0, 0, skmem_cache_mib_get_sysctl, "S,sk_stats_cache",
270 "Skywalk cache statistics");
271
272 static volatile uint32_t skmem_cache_reaping;
273 static thread_call_t skmem_cache_reap_tc;
274 static thread_call_t skmem_cache_update_tc;
275
276 extern kern_return_t thread_terminate(thread_t);
277 extern unsigned int ml_wait_max_cpus(void);
278
279 #define SKMEM_DEBUG_NOMAGAZINES 0x1 /* disable magazines layer */
280 #define SKMEM_DEBUG_AUDIT 0x2 /* audit transactions */
281 #define SKMEM_DEBUG_MASK (SKMEM_DEBUG_NOMAGAZINES|SKMEM_DEBUG_AUDIT)
282
283 #if DEBUG
284 static uint32_t skmem_debug = SKMEM_DEBUG_AUDIT;
285 #else /* !DEBUG */
286 static uint32_t skmem_debug = 0;
287 #endif /* !DEBUG */
288
289 static uint32_t skmem_clear_min = 0; /* clear on free threshold */
290
291 #define SKMEM_CACHE_UPDATE_INTERVAL 11 /* 11 seconds */
292 static uint32_t skmem_cache_update_interval = SKMEM_CACHE_UPDATE_INTERVAL;
293
294 #define SKMEM_DEPOT_CONTENTION 3 /* max failed trylock per interval */
295 static int skmem_cache_depot_contention = SKMEM_DEPOT_CONTENTION;
296
297 /*
298 * Too big a value will cause overflow and thus trip the assertion; the
299 * idea here is to set an upper limit for the time that a particular
300 * thread is allowed to perform retries before we give up and panic.
301 */
#define SKMEM_SLAB_MAX_BACKOFF  (20 * USEC_PER_SEC)     /* 20 seconds, in usec */
303
304 /*
305 * Threshold (in msec) after which we reset the exponential backoff value
306 * back to its (random) initial value. Note that we allow the actual delay
307 * to be at most twice this value.
308 */
309 #define SKMEM_SLAB_BACKOFF_THRES 1024 /* up to ~2 sec (2048 msec) */
310
311 /*
312 * To reduce the likelihood of global synchronization between threads,
313 * we use some random value to start the exponential backoff.
314 */
315 #define SKMEM_SLAB_BACKOFF_RANDOM 4 /* range is [1,4] msec */
316
317 #if (DEVELOPMENT || DEBUG)
318 SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, cache_update_interval,
319 CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_update_interval,
320 SKMEM_CACHE_UPDATE_INTERVAL, "Cache update interval");
321 SYSCTL_INT(_kern_skywalk_mem, OID_AUTO, cache_depot_contention,
322 CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_depot_contention,
323 SKMEM_DEPOT_CONTENTION, "Depot contention");
324
325 static uint32_t skmem_cache_update_interval_saved = SKMEM_CACHE_UPDATE_INTERVAL;
326
327 /*
328 * Called by skmem_test_start() to set the update interval.
329 */
330 void
skmem_cache_test_start(uint32_t i)331 skmem_cache_test_start(uint32_t i)
332 {
333 skmem_cache_update_interval_saved = skmem_cache_update_interval;
334 skmem_cache_update_interval = i;
335 }
336
337 /*
338 * Called by skmem_test_stop() to restore the update interval.
339 */
340 void
skmem_cache_test_stop(void)341 skmem_cache_test_stop(void)
342 {
343 skmem_cache_update_interval = skmem_cache_update_interval_saved;
344 }
345 #endif /* (DEVELOPMENT || DEBUG) */
346
347 #define SKMEM_TAG_BUFCTL_HASH "com.apple.skywalk.bufctl.hash"
348 static SKMEM_TAG_DEFINE(skmem_tag_bufctl_hash, SKMEM_TAG_BUFCTL_HASH);
349
350 #define SKMEM_TAG_CACHE_MIB "com.apple.skywalk.cache.mib"
351 static SKMEM_TAG_DEFINE(skmem_tag_cache_mib, SKMEM_TAG_CACHE_MIB);
352
353 static int __skmem_cache_pre_inited = 0;
354 static int __skmem_cache_inited = 0;
355
356 /*
357 * Called before skmem_region_init().
358 */
void
skmem_cache_pre_init(void)
{
	vm_size_t skm_size;

	ASSERT(!__skmem_cache_pre_inited);

	/*
	 * Cache the CPU count once; it sizes the per-CPU cache array
	 * embedded in each skmem_cache (via SKMEM_CACHE_SIZE below).
	 */
	ncpu = ml_wait_max_cpus();

	/* allocate extra in case we need to manually align the pointer */
	if (skm_zone == NULL) {
		/* element size includes one skmem_cpu_cache per CPU */
		skm_size = SKMEM_CACHE_SIZE(ncpu);
#if KASAN
		/*
		 * When KASAN is enabled, the zone allocator adjusts the
		 * element size to include the redzone regions, in which
		 * case we assume that the elements won't start on the
		 * alignment boundary and thus need to do some fix-ups.
		 * These include increasing the effective object size
		 * which adds at least 136 bytes to the original size,
		 * as computed by skmem_region_params_config() above.
		 */
		skm_size += (sizeof(void *) + CHANNEL_CACHE_ALIGN_MAX);
#endif /* KASAN */
		/* round up so elements land on max cache-line boundaries */
		skm_size = P2ROUNDUP(skm_size, CHANNEL_CACHE_ALIGN_MAX);
		skm_zone = zone_create(SKMEM_ZONE_PREFIX ".skm", skm_size,
		    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
	}

	/* global list of all caches; walked by applyall/reap/update */
	TAILQ_INIT(&skmem_cache_head);

	__skmem_cache_pre_inited = 1;
}
392
393 /*
394 * Called after skmem_region_init().
395 */
void
skmem_cache_init(void)
{
	uint32_t cpu_cache_line_size = skmem_cpu_cache_line_size();
	struct skmem_magtype *mtp;
	uint32_t i;

	_CASSERT(SKMEM_CACHE_HASH_LIMIT >= SKMEM_CACHE_HASH_INITIAL);

	/*
	 * The internal SKM_MODE_* flags are exported verbatim as the
	 * SCA_MODE_* values; keep the two namespaces in lockstep.
	 */
	_CASSERT(SKM_MODE_NOMAGAZINES == SCA_MODE_NOMAGAZINES);
	_CASSERT(SKM_MODE_AUDIT == SCA_MODE_AUDIT);
	_CASSERT(SKM_MODE_NOREDIRECT == SCA_MODE_NOREDIRECT);
	_CASSERT(SKM_MODE_BATCH == SCA_MODE_BATCH);
	_CASSERT(SKM_MODE_DYNAMIC == SCA_MODE_DYNAMIC);
	_CASSERT(SKM_MODE_CLEARONFREE == SCA_MODE_CLEARONFREE);
	_CASSERT(SKM_MODE_PSEUDO == SCA_MODE_PSEUDO);

	ASSERT(__skmem_cache_pre_inited);
	ASSERT(!__skmem_cache_inited);

	/* allow boot-args override of debug flags; mask to known bits */
	PE_parse_boot_argn("skmem_debug", &skmem_debug, sizeof(skmem_debug));
	skmem_debug &= SKMEM_DEBUG_MASK;

#if (DEVELOPMENT || DEBUG)
	PE_parse_boot_argn("skmem_clear_min", &skmem_clear_min,
	    sizeof(skmem_clear_min));
#endif /* (DEVELOPMENT || DEBUG) */
	if (skmem_clear_min == 0) {
		/* zeroing 2 CPU cache lines practically comes for free */
		skmem_clear_min = 2 * cpu_cache_line_size;
	} else {
		/* round it up to CPU cache line size */
		skmem_clear_min = (uint32_t)P2ROUNDUP(skmem_clear_min,
		    cpu_cache_line_size);
	}

	/* create a cache for buffer control structures */
	if (skmem_debug & SKMEM_DEBUG_AUDIT) {
		/* audit mode uses the larger bufctl that records callers */
		bc_size = sizeof(struct skmem_bufctl_audit);
		skmem_bufctl_cache = skmem_cache_create("bufctl.audit",
		    bc_size, sizeof(uint64_t), NULL, NULL,
		    NULL, NULL, NULL, 0);
	} else {
		bc_size = sizeof(struct skmem_bufctl);
		skmem_bufctl_cache = skmem_cache_create("bufctl",
		    bc_size, sizeof(uint64_t), NULL, NULL,
		    NULL, NULL, NULL, 0);
	}

	/* create a cache for slab structures */
	skmem_slab_cache = skmem_cache_create("slab",
	    sizeof(struct skmem_slab), sizeof(uint64_t), NULL, NULL, NULL,
	    NULL, NULL, 0);

	/*
	 * Go thru the magazine type table and create a cache for each.
	 */
	for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
		mtp = &skmem_magtype[i];

		/* alignment must be 0 (natural) or a cacheline-multiple power of 2 */
		if (mtp->mt_align != 0 &&
		    ((mtp->mt_align & (mtp->mt_align - 1)) != 0 ||
		    mtp->mt_align < (int)cpu_cache_line_size)) {
			panic("%s: bad alignment %d", __func__, mtp->mt_align);
			/* NOTREACHED */
			__builtin_unreachable();
		}
		(void) snprintf(mtp->mt_cname, sizeof(mtp->mt_cname),
		    "mg.%d", mtp->mt_magsize);

		/* create a cache for this magazine type */
		mtp->mt_cache = skmem_cache_create(mtp->mt_cname,
		    SKMEM_MAG_SIZE(mtp->mt_magsize), mtp->mt_align,
		    skmem_magazine_ctor, NULL, NULL, mtp, NULL, 0);

		/* remember the last magazine type */
		skmem_cache_magsize_last = mtp;
	}

	VERIFY(skmem_cache_magsize_last != NULL);
	/* the table must end with the unbounded (catch-all) magazine type */
	VERIFY(skmem_cache_magsize_last->mt_minbuf == 0);
	VERIFY(skmem_cache_magsize_last->mt_maxbuf == 0);

	/*
	 * Allocate thread calls for cache reap and update operations.
	 */
	skmem_cache_reap_tc =
	    thread_call_allocate_with_options(skmem_cache_reap_func,
	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	skmem_cache_update_tc =
	    thread_call_allocate_with_options(skmem_cache_update_func,
	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (skmem_cache_reap_tc == NULL || skmem_cache_update_tc == NULL) {
		panic("%s: thread_call_allocate failed", __func__);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * We're ready; go through existing skmem_cache entries
	 * (if any) and enable the magazines layer for each.
	 */
	skmem_cache_applyall(skmem_cache_magazine_enable, 0);
	skmem_cache_ready = TRUE;

	/* and start the periodic cache update machinery */
	skmem_dispatch(skmem_cache_update_tc, NULL,
	    (skmem_cache_update_interval * NSEC_PER_SEC));

	__skmem_cache_inited = 1;
}
507
void
skmem_cache_fini(void)
{
	struct skmem_magtype *mtp;
	uint32_t i;

	if (__skmem_cache_inited) {
		/* all client caches must already have been destroyed */
		ASSERT(TAILQ_EMPTY(&skmem_cache_head));

		/* tear down the internal caches created by skmem_cache_init() */
		for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
			mtp = &skmem_magtype[i];
			skmem_cache_destroy(mtp->mt_cache);
			mtp->mt_cache = NULL;
		}
		skmem_cache_destroy(skmem_slab_cache);
		skmem_slab_cache = NULL;
		skmem_cache_destroy(skmem_bufctl_cache);
		skmem_bufctl_cache = NULL;

		/* quiesce (wait for) any in-flight reap/update before freeing */
		if (skmem_cache_reap_tc != NULL) {
			(void) thread_call_cancel_wait(skmem_cache_reap_tc);
			(void) thread_call_free(skmem_cache_reap_tc);
			skmem_cache_reap_tc = NULL;
		}
		if (skmem_cache_update_tc != NULL) {
			(void) thread_call_cancel_wait(skmem_cache_update_tc);
			(void) thread_call_free(skmem_cache_update_tc);
			skmem_cache_update_tc = NULL;
		}

		__skmem_cache_inited = 0;
	}

	if (__skmem_cache_pre_inited) {
		/* undo skmem_cache_pre_init(): drop the skmem_cache zone */
		if (skm_zone != NULL) {
			zdestroy(skm_zone);
			skm_zone = NULL;
		}

		__skmem_cache_pre_inited = 0;
	}
}
550
551 /*
552 * Create a cache.
553 */
struct skmem_cache *
skmem_cache_create(const char *name, size_t bufsize, size_t bufalign,
    skmem_ctor_fn_t ctor, skmem_dtor_fn_t dtor, skmem_reclaim_fn_t reclaim,
    void *private, struct skmem_region *region, uint32_t cflags)
{
	/* a NULL region means we fabricate a private (pseudo) region below */
	boolean_t pseudo = (region == NULL);
	struct skmem_magtype *mtp;
	struct skmem_cache *skm;
	void *buf;
	size_t segsize;
	size_t chunksize;
	size_t objsize;
	size_t objalign;
	uint32_t i, cpuid;

	/* enforce 64-bit minimum alignment for buffers */
	if (bufalign == 0) {
		bufalign = SKMEM_CACHE_ALIGN;
	}
	bufalign = P2ROUNDUP(bufalign, SKMEM_CACHE_ALIGN);

	/* enforce alignment to be a power of 2 */
	VERIFY(powerof2(bufalign));

	if (region == NULL) {
		struct skmem_region_params srp;

		/* batching is currently not supported on pseudo regions */
		VERIFY(!(cflags & SKMEM_CR_BATCH));

		/* start from the default intrinsic (pseudo) region params */
		srp = *skmem_get_default(SKMEM_REGION_INTRINSIC);
		ASSERT(srp.srp_cflags == SKMEM_REGION_CR_PSEUDO);

		/* objalign is always equal to bufalign */
		srp.srp_align = objalign = bufalign;
		srp.srp_r_obj_cnt = 1;
		srp.srp_r_obj_size = (uint32_t)bufsize;
		skmem_region_params_config(&srp);

		/* allocate region for intrinsics */
		region = skmem_region_create(name, &srp, NULL, NULL, NULL);
		VERIFY(region->skr_c_obj_size >= P2ROUNDUP(bufsize, bufalign));
		VERIFY(objalign == region->skr_align);
#if KASAN
		/*
		 * When KASAN is enabled, the zone allocator adjusts the
		 * element size to include the redzone regions, in which
		 * case we assume that the elements won't start on the
		 * alignment boundary and thus need to do some fix-ups.
		 * These include increasing the effective object size
		 * which adds at least 16 bytes to the original size,
		 * as computed by skmem_region_params_config() above.
		 */
		VERIFY(region->skr_c_obj_size >=
		    (bufsize + sizeof(uint64_t) + bufalign));
#endif /* KASAN */
		/* enable magazine resizing by default */
		cflags |= SKMEM_CR_DYNAMIC;

		/*
		 * For consistency with ZC_ZFREE_CLEARMEM on skr->zreg,
		 * even though it's a no-op since the work is done
		 * at the zone layer instead.
		 */
		cflags |= SKMEM_CR_CLEARONFREE;
	} else {
		/* caller-supplied region dictates the object alignment */
		objalign = region->skr_align;
	}

	ASSERT(region != NULL);
	/* mirrored regions are managed by their master cache, not here */
	ASSERT(!(region->skr_mode & SKR_MODE_MIRRORED));
	segsize = region->skr_seg_size;
	ASSERT(bufalign <= segsize);

	/* element size/alignment was set up in skmem_cache_pre_init() */
	buf = zalloc_flags(skm_zone, Z_WAITOK | Z_ZERO);
#if KASAN
	/*
	 * In case we didn't get a cache-aligned memory, round it up
	 * accordingly. This is needed in order to get the rest of
	 * structure members aligned properly. It also means that
	 * the memory span gets shifted due to the round up, but it
	 * is okay since we've allocated extra space for this.
	 */
	skm = (struct skmem_cache *)
	    P2ROUNDUP((intptr_t)buf + sizeof(void *), CHANNEL_CACHE_ALIGN_MAX);
	/* stash the original zone element address just below 'skm' for free */
	void **pbuf = (void **)((intptr_t)skm - sizeof(void *));
	*pbuf = buf;
#else /* !KASAN */
	/*
	 * We expect that the zone allocator would allocate elements
	 * rounded up to the requested alignment based on the object
	 * size computed in skmem_cache_pre_init() earlier, and
	 * 'skm' is therefore the element address itself.
	 */
	skm = buf;
#endif /* !KASAN */
	VERIFY(IS_P2ALIGNED(skm, CHANNEL_CACHE_ALIGN_MAX));

	if ((skmem_debug & SKMEM_DEBUG_NOMAGAZINES) ||
	    (cflags & SKMEM_CR_NOMAGAZINES)) {
		/*
		 * Either the caller insists that this cache should not
		 * utilize magazines layer, or that the system override
		 * to disable magazines layer on all caches has been set.
		 */
		skm->skm_mode |= SKM_MODE_NOMAGAZINES;
	} else {
		/*
		 * Region must be configured with enough objects
		 * to take into account objects at the CPU layer.
		 */
		ASSERT(!(region->skr_mode & SKR_MODE_NOMAGAZINES));
	}

	if (cflags & SKMEM_CR_DYNAMIC) {
		/*
		 * Enable per-CPU cache magazine resizing.
		 */
		skm->skm_mode |= SKM_MODE_DYNAMIC;
	}

	/* region stays around after defunct? */
	if (region->skr_mode & SKR_MODE_NOREDIRECT) {
		skm->skm_mode |= SKM_MODE_NOREDIRECT;
	}

	if (cflags & SKMEM_CR_BATCH) {
		/*
		 * Batch alloc/free involves storing the next object
		 * pointer at the beginning of each object; this is
		 * okay for kernel-only regions, but not those that
		 * are mappable to user space (we can't leak kernel
		 * addresses).
		 */
		_CASSERT(offsetof(struct skmem_obj, mo_next) == 0);
		VERIFY(!(region->skr_mode & SKR_MODE_MMAPOK));

		/* batching is currently not supported on pseudo regions */
		VERIFY(!(region->skr_mode & SKR_MODE_PSEUDO));

		/* validate object size */
		VERIFY(region->skr_c_obj_size >= sizeof(struct skmem_obj));

		skm->skm_mode |= SKM_MODE_BATCH;
	}

	/* identity and caller-supplied callbacks/parameters */
	uuid_generate_random(skm->skm_uuid);
	(void) snprintf(skm->skm_name, sizeof(skm->skm_name),
	    "%s.%s", SKMEM_CACHE_PREFIX, name);
	skm->skm_bufsize = bufsize;
	skm->skm_bufalign = bufalign;
	skm->skm_objalign = objalign;
	skm->skm_ctor = ctor;
	skm->skm_dtor = dtor;
	skm->skm_reclaim = reclaim;
	skm->skm_private = private;
	skm->skm_slabsize = segsize;

	skm->skm_region = region;
	/* callee holds reference */
	skmem_region_slab_config(region, skm, true);
	objsize = region->skr_c_obj_size;
	skm->skm_objsize = objsize;

	if (pseudo) {
		/*
		 * Release reference from skmem_region_create()
		 * since skm->skm_region holds one now.
		 */
		ASSERT(region->skr_mode & SKR_MODE_PSEUDO);
		skmem_region_release(region);

		skm->skm_mode |= SKM_MODE_PSEUDO;

		skm->skm_slab_alloc = skmem_slab_alloc_pseudo_locked;
		skm->skm_slab_free = skmem_slab_free_pseudo_locked;
	} else {
		skm->skm_slab_alloc = skmem_slab_alloc_locked;
		skm->skm_slab_free = skmem_slab_free_locked;

		/* auditing was requested? (normal regions only) */
		if (skmem_debug & SKMEM_DEBUG_AUDIT) {
			ASSERT(bc_size == sizeof(struct skmem_bufctl_audit));
			skm->skm_mode |= SKM_MODE_AUDIT;
		}
	}

	/*
	 * Clear upon free (to slab layer) as long as the region is
	 * not marked as read-only for kernel, and if the chunk size
	 * is within the threshold or if the caller had requested it.
	 */
	if (!(region->skr_mode & SKR_MODE_KREADONLY)) {
		if (skm->skm_objsize <= skmem_clear_min ||
		    (cflags & SKMEM_CR_CLEARONFREE)) {
			skm->skm_mode |= SKM_MODE_CLEARONFREE;
		}
	}

	/* effective per-object footprint: bufsize rounded up to alignment */
	chunksize = bufsize;
	if (bufalign >= SKMEM_CACHE_ALIGN) {
		chunksize = P2ROUNDUP(chunksize, SKMEM_CACHE_ALIGN);
	}

	chunksize = P2ROUNDUP(chunksize, bufalign);
	if (chunksize > objsize) {
		/* region objects are too small to hold one chunk; fatal */
		panic("%s: (bufsize %lu, chunksize %lu) > objsize %lu",
		    __func__, bufsize, chunksize, objsize);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(chunksize != 0);
	skm->skm_chunksize = chunksize;

	lck_mtx_init(&skm->skm_sl_lock, &skmem_sl_lock_grp, &skmem_lock_attr);
	TAILQ_INIT(&skm->skm_sl_partial_list);
	TAILQ_INIT(&skm->skm_sl_empty_list);

	/* allocated-address hash table */
	skm->skm_hash_initial = SKMEM_CACHE_HASH_INITIAL;
	skm->skm_hash_limit = SKMEM_CACHE_HASH_LIMIT;
	skm->skm_hash_table = sk_alloc_type_array(struct skmem_bufctl_bkt,
	    skm->skm_hash_initial, Z_WAITOK | Z_NOFAIL, skmem_tag_bufctl_hash);

	skm->skm_hash_mask = (skm->skm_hash_initial - 1);
	/* shift out the address bits that are constant within a chunk */
	skm->skm_hash_shift = flsll(chunksize) - 1;

	for (i = 0; i < (skm->skm_hash_mask + 1); i++) {
		SLIST_INIT(&skm->skm_hash_table[i].bcb_head);
	}

	lck_mtx_init(&skm->skm_dp_lock, &skmem_dp_lock_grp, &skmem_lock_attr);

	/* find a suitable magazine type for this chunk size */
	for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
		continue;
	}

	skm->skm_magtype = mtp;
	if (!(skm->skm_mode & SKM_MODE_NOMAGAZINES)) {
		skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
	}

	/*
	 * Initialize the CPU layer. Each per-CPU structure is aligned
	 * on the CPU cache line boundary to prevent false sharing.
	 */
	lck_mtx_init(&skm->skm_rs_lock, &skmem_cpu_lock_grp, &skmem_lock_attr);
	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];

		VERIFY(IS_P2ALIGNED(ccp, CHANNEL_CACHE_ALIGN_MAX));
		lck_mtx_init(&ccp->cp_lock, &skmem_cpu_lock_grp,
		    &skmem_lock_attr);
		/* -1 rounds means no magazine is loaded in this slot yet */
		ccp->cp_rounds = -1;
		ccp->cp_prounds = -1;
	}

	/* publish the cache on the global list */
	SKMEM_CACHE_LOCK();
	TAILQ_INSERT_TAIL(&skmem_cache_head, skm, skm_link);
	SKMEM_CACHE_UNLOCK();

	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx mode 0x%b",
	    skm->skm_name, SK_KVA(skm), skm->skm_mode, SKM_MODE_BITS);
	SK_DF(SK_VERB_MEM_CACHE,
	    " bufsz %u bufalign %u chunksz %u objsz %u slabsz %u",
	    (uint32_t)skm->skm_bufsize, (uint32_t)skm->skm_bufalign,
	    (uint32_t)skm->skm_chunksize, (uint32_t)skm->skm_objsize,
	    (uint32_t)skm->skm_slabsize);

	/* if the magazines layer is already up, enable it for this cache */
	if (skmem_cache_ready) {
		skmem_cache_magazine_enable(skm, 0);
	}

	return skm;
}
830
831 /*
832 * Destroy a cache.
833 */
void
skmem_cache_destroy(struct skmem_cache *skm)
{
	uint32_t cpuid;

	/* unlink this cache from the global list of caches */
	SKMEM_CACHE_LOCK();
	TAILQ_REMOVE(&skmem_cache_head, skm, skm_link);
	SKMEM_CACHE_UNLOCK();

	/*
	 * No resize activity or resize waiters may be outstanding
	 * (rs presumably = resize — matches skm_rs_lock below).
	 */
	ASSERT(skm->skm_rs_busy == 0);
	ASSERT(skm->skm_rs_want == 0);

	/* purge all cached objects for this cache */
	skmem_cache_magazine_purge(skm);


	/*
	 * Panic if we detect there are unfreed objects; the caller
	 * destroying this cache is responsible for ensuring that all
	 * allocated objects have been freed prior to getting here.
	 */
	SKM_SLAB_LOCK(skm);
	if (skm->skm_sl_bufinuse != 0) {
		panic("%s: '%s' (%p) not empty (%llu unfreed)", __func__,
		    skm->skm_name, (void *)skm, skm->skm_sl_bufinuse);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	/* with nothing in use, both slab lists must be drained */
	ASSERT(TAILQ_EMPTY(&skm->skm_sl_partial_list));
	ASSERT(skm->skm_sl_partial == 0);
	ASSERT(TAILQ_EMPTY(&skm->skm_sl_empty_list));
	ASSERT(skm->skm_sl_empty == 0);
	/* clear the callbacks so none can fire past this point */
	skm->skm_reclaim = NULL;
	skm->skm_ctor = NULL;
	skm->skm_dtor = NULL;
	SKM_SLAB_UNLOCK(skm);

	if (skm->skm_hash_table != NULL) {
#if (DEBUG || DEVELOPMENT)
		/* every allocated-address hash bucket must be empty by now */
		for (uint32_t i = 0; i < (skm->skm_hash_mask + 1); i++) {
			ASSERT(SLIST_EMPTY(&skm->skm_hash_table[i].bcb_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		/* bucket count is mask + 1 (table size is a power of 2) */
		sk_free_type_array(struct skmem_bufctl_bkt,
		    skm->skm_hash_mask + 1, skm->skm_hash_table);
		skm->skm_hash_table = NULL;
	}

	/* tear down the per-CPU and per-layer locks */
	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		lck_mtx_destroy(&skm->skm_cpu_cache[cpuid].cp_lock,
		    &skmem_cpu_lock_grp);
	}
	lck_mtx_destroy(&skm->skm_rs_lock, &skmem_cpu_lock_grp);
	lck_mtx_destroy(&skm->skm_dp_lock, &skmem_dp_lock_grp);
	lck_mtx_destroy(&skm->skm_sl_lock, &skmem_sl_lock_grp);

	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx",
	    skm->skm_name, SK_KVA(skm));

	/* callee releases reference */
	skmem_region_slab_config(skm->skm_region, skm, false);
	skm->skm_region = NULL;

#if KASAN
	/* get the original address since we're about to free it */
	void **pbuf = (void **)((intptr_t)skm - sizeof(void *));
	skm = *pbuf;
#endif /* KASAN */

	zfree(skm_zone, skm);
}
906
907 /*
908 * Create a slab.
909 */
static struct skmem_slab *
skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
{
	struct skmem_region *skr = skm->skm_region;
	uint32_t objsize, chunks;
	size_t slabsize = skm->skm_slabsize;
	struct skmem_slab *sl;
	struct sksegment *sg, *sgm;
	char *buf, *bufm, *slab, *slabm;

	/*
	 * Allocate a segment (a slab at our layer) from the region.
	 * sgm/slabm are the mirrored counterparts, valid only for
	 * mirrored regions (may come back NULL otherwise).
	 */
	slab = skmem_region_alloc(skr, (void **)&slabm, &sg, &sgm, skmflag);
	if (slab == NULL) {
		goto rg_alloc_failure;
	}

	/* slab metadata itself comes from its own cache */
	if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
		goto slab_alloc_failure;
	}

	ASSERT(sg != NULL);
	/* a mirror segment, if present, must pair with the same index */
	ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index);

	bzero(sl, sizeof(*sl));
	sl->sl_cache = skm;
	sl->sl_base = buf = slab;
	sl->sl_basem = bufm = slabm;
	ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
	objsize = (uint32_t)skr->skr_c_obj_size;
	ASSERT(skm->skm_objsize == objsize);
	ASSERT((slabsize / objsize) <= UINT32_MAX);
	/* number of objects (chunks) carved out of this slab */
	sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
	sl->sl_seg = sg;
	sl->sl_segm = sgm;

	/*
	 * Create one or more buffer control structures for the slab,
	 * each one tracking a chunk of raw object from the segment,
	 * and insert these into the slab's list of buffer controls.
	 */
	ASSERT(chunks > 0);
	while (chunks != 0) {
		struct skmem_bufctl *bc;

		bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
		if (bc == NULL) {
			goto bufctl_alloc_failure;
		}

		/* bc_size: bufctl size, declared elsewhere in this file */
		bzero(bc, bc_size);
		bc->bc_addr = buf;
		bc->bc_addrm = bufm;
		bc->bc_slab = sl;
		/* chunk index within the slab, in creation order */
		bc->bc_idx = (sl->sl_chunks - chunks);
		if (skr->skr_mode & SKR_MODE_SHAREOK) {
			bc->bc_flags |= SKMEM_BUFCTL_SHAREOK;
		}
		SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
		bc->bc_lim = objsize;
		/* advance both primary and mirror cursors by one object */
		buf += objsize;
		if (bufm != NULL) {
			bufm += objsize;
		}
		--chunks;
	}

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
	    SK_KVA(skm), SK_KVA(sl));
	/* NOTE(review): range logged covers only the first object, not the slab */
	SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
	    SK_KVA(slab), SK_KVA(slab + objsize));

	return sl;

	/* unwind in reverse order of acquisition */
bufctl_alloc_failure:
	skmem_slab_destroy(skm, sl);

slab_alloc_failure:
	skmem_region_free(skr, slab, slabm);

rg_alloc_failure:
	atomic_add_64(&skm->skm_sl_alloc_fail, 1);

	return NULL;
}
996
997 /*
998 * Destroy a slab.
999 */
1000 static void
skmem_slab_destroy(struct skmem_cache * skm,struct skmem_slab * sl)1001 skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl)
1002 {
1003 struct skmem_bufctl *bc, *tbc;
1004 void *slab = sl->sl_base;
1005 void *slabm = sl->sl_basem;
1006
1007 ASSERT(sl->sl_refcnt == 0);
1008
1009 SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
1010 SK_KVA(skm), SK_KVA(sl));
1011 SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
1012 SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));
1013
1014 /*
1015 * Go through the slab's list of buffer controls and free
1016 * them, and then free the slab itself back to its cache.
1017 */
1018 SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
1019 SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
1020 skmem_cache_free(skmem_bufctl_cache, bc);
1021 }
1022 skmem_cache_free(skmem_slab_cache, sl);
1023
1024 /* and finally free the segment back to the backing region */
1025 skmem_region_free(skm->skm_region, slab, slabm);
1026 }
1027
1028 /*
1029 * Allocate a raw object from the (locked) slab layer. Normal region variant.
1030 */
static int
skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi,
    struct skmem_obj_info *oim, uint32_t skmflag)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;
	uint32_t retries = 0;
	uint64_t boff_total = 0;	/* in usec */
	uint64_t boff = 0;		/* in msec */
	boolean_t new_slab;
	void *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	/*
	 * A slab is either in a partially-allocated list (at least it has
	 * a free object available), or is in the empty list (everything
	 * has been allocated.)  If we can't find a partially-allocated
	 * slab, then we need to allocate a slab (segment) from the region.
	 */
again:
	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
	if (sl == NULL) {
		uint32_t flags = skmflag;
		boolean_t retry;

		ASSERT(skm->skm_sl_partial == 0);
		/* drop the lock across the (possibly blocking) slab create */
		SKM_SLAB_UNLOCK(skm);
		if (!(flags & SKMEM_NOSLEEP)) {
			/*
			 * Pick up a random value to start the exponential
			 * backoff, if this is the first round, or if the
			 * current value is over the threshold.  Otherwise,
			 * double the backoff value.
			 */
			if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) {
				read_frandom(&boff, sizeof(boff));
				boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1;
				ASSERT(boff > 0);
			} else if (os_mul_overflow(boff, 2, &boff)) {
				panic_plain("\"%s\": boff counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/*
			 * add this value (in msec) to the total (in usec);
			 * NOTE(review): NSEC_PER_USEC (1000) is used here as
			 * the msec-to-usec factor — numerically correct, but
			 * the constant's name is misleading.
			 */
			if (os_add_overflow(boff_total,
			    (boff * NSEC_PER_USEC), &boff_total)) {
				panic_plain("\"%s\": boff_total counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
		}
		/*
		 * In the event of a race between multiple threads trying
		 * to create the last remaining (or the only) slab, let the
		 * loser(s) attempt to retry after waiting a bit.  The winner
		 * would have inserted the newly-created slab into the list.
		 */
		if (!(flags & SKMEM_NOSLEEP) &&
		    boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
			retry = TRUE;
			++retries;
			/* allow skmem_slab_create() to fail without panic */
			flags |= SKMEM_FAILOK;
		} else {
			/* a sleeping caller past max backoff is fatal */
			if (!(flags & SKMEM_NOSLEEP)) {
				panic_plain("\"%s\": failed to allocate "
				    "slab (sleeping mode) after %llu "
				    "msec, %u retries\n\n%s", skm->skm_name,
				    (boff_total / NSEC_PER_USEC), retries,
				    skmem_dump(skm->skm_region));
				/* NOTREACHED */
				__builtin_unreachable();
			}
			retry = FALSE;
		}

		/*
		 * Create a new slab.
		 */
		if ((sl = skmem_slab_create(skm, flags)) == NULL) {
			if (retry) {
				SK_ERR("\"%s\": failed to allocate "
				    "slab (%ssleeping mode): waiting for %llu "
				    "msec, total %llu msec, %u retries",
				    skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "",
				    boff, (boff_total / NSEC_PER_USEC), retries);
				VERIFY(boff > 0 && ((uint32_t)boff <=
				    (SKMEM_SLAB_BACKOFF_THRES * 2)));
				/* delay() takes usec; boff is msec */
				delay((uint32_t)boff * NSEC_PER_USEC);
				SKM_SLAB_LOCK(skm);
				goto again;
			} else {
				SK_RDERR(4, "\"%s\": failed to allocate slab "
				    "(%ssleeping mode)", skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "");
				SKM_SLAB_LOCK(skm);
			}
			return ENOMEM;
		}

		SKM_SLAB_LOCK(skm);
		skm->skm_sl_create++;
		/* account all of the new slab's chunks and track high water */
		if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
		    skm->skm_sl_bufmax) {
			skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
		}
	}
	skm->skm_sl_alloc++;

	/* refcnt 0 means we created it above and it's not on any list yet */
	new_slab = (sl->sl_refcnt == 0);
	ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl));

	sl->sl_refcnt++;
	ASSERT(sl->sl_refcnt <= sl->sl_chunks);

	/*
	 * We either have a new slab, or a partially-allocated one.
	 * Remove a buffer control from the slab, and insert it to
	 * the allocated-address hash chain.
	 */
	bc = SLIST_FIRST(&sl->sl_head);
	ASSERT(bc != NULL);
	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);

	/* sanity check */
	VERIFY(bc->bc_usecnt == 0);

	/*
	 * Also store the master object's region info for the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf = bc->bc_addr;
	SKMEM_OBJ_BUFCTL(oi) = bc;	/* master only; NULL for slave */
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
	/* region-wide index: slabs are uniform, so segments stride evenly */
	SKMEM_OBJ_IDX_REG(oi) =
	    ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
	/*
	 * And for slave object.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			/* mirror shares size and indices with the master */
			SKMEM_OBJ_ADDR(oim) = bc->bc_addrm;
			SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
			SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
			SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
		}
	}

	/* batching mode stores linkage inside the object itself */
	if (skm->skm_mode & SKM_MODE_BATCH) {
		((struct skmem_obj *)buf)->mo_next = NULL;
	}

	/* insert to allocated-address hash chain */
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);

	if (SLIST_EMPTY(&sl->sl_head)) {
		/*
		 * If that was the last buffer control from this slab,
		 * insert the slab into the empty list.  If it was in
		 * the partially-allocated list, then remove the slab
		 * from there as well.
		 */
		ASSERT(sl->sl_refcnt == sl->sl_chunks);
		if (new_slab) {
			/* a new slab drained by one alloc holds one chunk */
			ASSERT(sl->sl_chunks == 1);
		} else {
			ASSERT(sl->sl_chunks > 1);
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		skm->skm_sl_empty++;
		ASSERT(skm->skm_sl_empty != 0);
		TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
	} else {
		/*
		 * The slab is not empty; if it was newly allocated
		 * above, then it's not in the partially-allocated
		 * list and so we insert it there.
		 */
		ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
		if (new_slab) {
			skm->skm_sl_partial++;
			ASSERT(skm->skm_sl_partial != 0);
			TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
			    sl, sl_link);
		}
	}

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	return 0;
}
1237
1238 /*
1239 * Allocate a raw object from the (locked) slab layer. Pseudo region variant.
1240 */
static int
skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag)
{
	zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
	struct skmem_region *skr = skm->skm_region;
	void *obj, *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	/* pseudo regions are zone-backed, not segment-backed */
	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
	/* mirrored region is not applicable */
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	/* batching is not yet supported */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));

	if ((obj = zalloc_flags(skr->skr_zreg, zflags | Z_ZERO)) == NULL) {
		atomic_add_64(&skm->skm_sl_alloc_fail, 1);
		return ENOMEM;
	}

#if KASAN
	/*
	 * Perform some fix-ups since the zone element isn't guaranteed
	 * to be on the aligned boundary.  The effective object size
	 * has been adjusted accordingly by skmem_region_create() earlier
	 * at cache creation time.
	 *
	 * 'buf' gets the aligned address for this object.
	 */
	buf = (void *)P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t),
	    skm->skm_bufalign);

	/*
	 * Wind back a pointer size from the aligned address and
	 * save the original address so we can free it later.
	 */
	void **pbuf = (void **)((intptr_t)buf - sizeof(void *));
	*pbuf = obj;

	/* aligned buffer plus payload must fit in the zone element */
	VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
	    ((intptr_t)obj + skm->skm_objsize));
#else /* !KASAN */
	/*
	 * We expect that the zone allocator would allocate elements
	 * rounded up to the requested alignment based on the effective
	 * object size computed in skmem_region_create() earlier, and
	 * 'buf' is therefore the element address itself.
	 */
	buf = obj;
#endif /* !KASAN */

	/* make sure the object is aligned */
	VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));

	/*
	 * Return the object's info to the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf;
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
	/* no mirrored (slave) object in the pseudo variant */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
	}

	/* update counters and track high-water mark */
	skm->skm_sl_alloc++;
	skm->skm_sl_bufinuse++;
	if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
		skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
	}

	return 0;
}
1318
1319 /*
1320 * Allocate a raw object from the slab layer.
1321 */
1322 static int
skmem_slab_alloc(struct skmem_cache * skm,struct skmem_obj_info * oi,struct skmem_obj_info * oim,uint32_t skmflag)1323 skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi,
1324 struct skmem_obj_info *oim, uint32_t skmflag)
1325 {
1326 int err;
1327
1328 SKM_SLAB_LOCK(skm);
1329 err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
1330 SKM_SLAB_UNLOCK(skm);
1331
1332 return err;
1333 }
1334
1335 /*
1336 * Allocate raw object(s) from the slab layer.
1337 */
1338 static uint32_t
skmem_slab_batch_alloc(struct skmem_cache * skm,struct skmem_obj ** list,uint32_t num,uint32_t skmflag)1339 skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
1340 uint32_t num, uint32_t skmflag)
1341 {
1342 uint32_t need = num;
1343
1344 ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
1345 *list = NULL;
1346
1347 SKM_SLAB_LOCK(skm);
1348 for (;;) {
1349 struct skmem_obj_info oi, oim;
1350
1351 /*
1352 * Get a single raw object from the slab layer.
1353 */
1354 if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
1355 break;
1356 }
1357
1358 *list = SKMEM_OBJ_ADDR(&oi);
1359 ASSERT((*list)->mo_next == NULL);
1360 /* store these inside the object itself */
1361 (*list)->mo_info = oi;
1362 (*list)->mo_minfo = oim;
1363 list = &(*list)->mo_next;
1364
1365 ASSERT(need != 0);
1366 if (--need == 0) {
1367 break;
1368 }
1369 }
1370 SKM_SLAB_UNLOCK(skm);
1371
1372 return num - need;
1373 }
1374
1375 /*
1376 * Free a raw object to the (locked) slab layer. Normal region variant.
1377 */
static void
skmem_slab_free_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_bufctl *bc, *tbc;
	struct skmem_bufctl_bkt *bcb;
	struct skmem_slab *sl = NULL;

	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	ASSERT(buf != NULL);
	/* caller is expected to clear mo_next */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
	    ((struct skmem_obj *)buf)->mo_next == NULL);

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address.  If found, remove the buffer control from
	 * the hash chain and insert it into the freelist.  Otherwise, we
	 * panic since the caller has given us a bogus address.
	 */
	skm->skm_sl_free++;
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (bc->bc_addr == buf) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}

	/* not in the hash: either a bad pointer or a double free */
	if (bc == NULL) {
		panic("%s: attempt to free invalid or already-freed obj %p "
		    "on skm %p", __func__, buf, skm);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(sl != NULL && sl->sl_cache == skm);
	VERIFY(SKMEM_SLAB_MEMBER(sl, buf));

	/* make sure this object is not currently in use by another object */
	VERIFY(bc->bc_usecnt == 0);

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	/* if clear on free is requested, zero out the object */
	if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
		bzero(buf, skm->skm_objsize);
	}

	/* insert the buffer control to the slab's freelist */
	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);

	ASSERT(sl->sl_refcnt >= 1);
	if (--sl->sl_refcnt == 0) {
		/*
		 * If this was the last outstanding object for the slab,
		 * remove the slab from the partially-allocated or empty
		 * list, and destroy the slab (segment) back to the region.
		 */
		if (sl->sl_chunks == 1) {
			/* single-chunk slab lives on the empty list */
			ASSERT(skm->skm_sl_empty > 0);
			skm->skm_sl_empty--;
			TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		} else {
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0);
		skm->skm_sl_bufinuse -= sl->sl_chunks;
		skm->skm_sl_destroy++;
		/* drop the lock across destroy; slab is off all lists */
		SKM_SLAB_UNLOCK(skm);
		skmem_slab_destroy(skm, sl);
		SKM_SLAB_LOCK(skm);
		return;
	}

	ASSERT(bc == SLIST_FIRST(&sl->sl_head));
	if (SLIST_NEXT(bc, bc_link) == NULL) {
		/*
		 * If this is the first (potentially amongst many) object
		 * that's returned to the slab, remove the slab from the
		 * empty list and insert to end of the partially-allocated
		 * list.  This should help avoid thrashing the partial slab
		 * since we avoid disturbing what's already at the front.
		 */
		ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1));
		ASSERT(sl->sl_chunks > 1);
		ASSERT(skm->skm_sl_empty > 0);
		skm->skm_sl_empty--;
		TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		skm->skm_sl_partial++;
		ASSERT(skm->skm_sl_partial != 0);
		TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
	}
}
1476
1477 /*
1478 * Free a raw object to the (locked) slab layer. Pseudo region variant.
1479 */
static void
skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_region *skr = skm->skm_region;
	void *obj = buf;

	/* pseudo regions are zone-backed, not segment-backed */
	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));

#if KASAN
	/*
	 * Since we stuffed the original zone element address before
	 * the buffer address in KASAN mode, get it back since we're
	 * about to free it.
	 */
	void **pbuf = (void **)((intptr_t)obj - sizeof(void *));

	/* sanity: the aligned buffer must lie within the zone element */
	VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
	    ((intptr_t)*pbuf + skm->skm_objsize));

	obj = *pbuf;
#endif /* KASAN */

	/* free it to zone */
	zfree(skr->skr_zreg, obj);

	skm->skm_sl_free++;
	ASSERT(skm->skm_sl_bufinuse > 0);
	skm->skm_sl_bufinuse--;
}
1513
1514 /*
1515 * Free a raw object to the slab layer.
1516 */
1517 static void
skmem_slab_free(struct skmem_cache * skm,void * buf)1518 skmem_slab_free(struct skmem_cache *skm, void *buf)
1519 {
1520 if (skm->skm_mode & SKM_MODE_BATCH) {
1521 ((struct skmem_obj *)buf)->mo_next = NULL;
1522 }
1523
1524 SKM_SLAB_LOCK(skm);
1525 skm->skm_slab_free(skm, buf);
1526 SKM_SLAB_UNLOCK(skm);
1527 }
1528
1529 /*
1530 * Free raw object(s) to the slab layer.
1531 */
1532 static void
skmem_slab_batch_free(struct skmem_cache * skm,struct skmem_obj * list)1533 skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
1534 {
1535 struct skmem_obj *listn;
1536
1537 ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
1538
1539 SKM_SLAB_LOCK(skm);
1540 for (;;) {
1541 listn = list->mo_next;
1542 list->mo_next = NULL;
1543
1544 /*
1545 * Free a single object to the slab layer.
1546 */
1547 skm->skm_slab_free(skm, (void *)list);
1548
1549 /* if no more objects to free, we're done */
1550 if ((list = listn) == NULL) {
1551 break;
1552 }
1553 }
1554 SKM_SLAB_UNLOCK(skm);
1555 }
1556
1557 /*
1558 * Return the object's region info.
1559 */
1560 void
skmem_cache_get_obj_info(struct skmem_cache * skm,void * buf,struct skmem_obj_info * oi,struct skmem_obj_info * oim)1561 skmem_cache_get_obj_info(struct skmem_cache *skm, void *buf,
1562 struct skmem_obj_info *oi, struct skmem_obj_info *oim)
1563 {
1564 struct skmem_bufctl_bkt *bcb;
1565 struct skmem_bufctl *bc;
1566 struct skmem_slab *sl;
1567
1568 /*
1569 * Search the hash chain to find a matching buffer control for the
1570 * given object address. If not found, panic since the caller has
1571 * given us a bogus address.
1572 */
1573 SKM_SLAB_LOCK(skm);
1574 bcb = SKMEM_CACHE_HASH(skm, buf);
1575 SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
1576 if (bc->bc_addr == buf) {
1577 break;
1578 }
1579 }
1580
1581 if (__improbable(bc == NULL)) {
1582 panic("%s: %s failed to get object info for %p",
1583 __func__, skm->skm_name, buf);
1584 /* NOTREACHED */
1585 __builtin_unreachable();
1586 }
1587
1588 /*
1589 * Return the master object's info to the caller.
1590 */
1591 sl = bc->bc_slab;
1592 SKMEM_OBJ_ADDR(oi) = bc->bc_addr;
1593 SKMEM_OBJ_BUFCTL(oi) = bc; /* master only; NULL for slave */
1594 ASSERT(skm->skm_objsize <= UINT32_MAX);
1595 SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
1596 SKMEM_OBJ_IDX_REG(oi) =
1597 (sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx;
1598 SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
1599 /*
1600 * And for slave object.
1601 */
1602 if (oim != NULL) {
1603 bzero(oim, sizeof(*oim));
1604 if (bc->bc_addrm != NULL) {
1605 SKMEM_OBJ_ADDR(oim) = bc->bc_addrm;
1606 SKMEM_OBJ_SIZE(oim) = oi->oi_size;
1607 SKMEM_OBJ_IDX_REG(oim) = oi->oi_idx_reg;
1608 SKMEM_OBJ_IDX_SEG(oim) = oi->oi_idx_seg;
1609 }
1610 }
1611 SKM_SLAB_UNLOCK(skm);
1612 }
1613
1614 /*
1615 * Magazine constructor.
1616 */
1617 static int
skmem_magazine_ctor(struct skmem_obj_info * oi,struct skmem_obj_info * oim,void * arg,uint32_t skmflag)1618 skmem_magazine_ctor(struct skmem_obj_info *oi, struct skmem_obj_info *oim,
1619 void *arg, uint32_t skmflag)
1620 {
1621 #pragma unused(oim, skmflag)
1622 struct skmem_mag *mg = SKMEM_OBJ_ADDR(oi);
1623
1624 ASSERT(oim == NULL);
1625 ASSERT(arg != NULL);
1626
1627 /*
1628 * Store it in the magazine object since we'll
1629 * need to refer to it during magazine destroy;
1630 * we can't safely refer to skm_magtype as the
1631 * depot lock may not be acquired then.
1632 */
1633 mg->mg_magtype = arg;
1634
1635 return 0;
1636 }
1637
1638 /*
1639 * Destroy a magazine (free each object to the slab layer).
1640 */
static void
skmem_magazine_destroy(struct skmem_cache *skm, struct skmem_mag *mg,
    int nrounds)
{
	int round;

	for (round = 0; round < nrounds; round++) {
		void *buf = mg->mg_round[round];
		struct skmem_obj *next;

		if (skm->skm_mode & SKM_MODE_BATCH) {
			/* save and sever linkage before the dtor runs */
			next = ((struct skmem_obj *)buf)->mo_next;
			((struct skmem_obj *)buf)->mo_next = NULL;
		}

		/* deconstruct the object */
		if (skm->skm_dtor != NULL) {
			skm->skm_dtor(buf, skm->skm_private);
		}

		/*
		 * In non-batching mode, each object in the magazine has
		 * no linkage to its neighbor, so free individual object
		 * to the slab layer now.
		 */
		if (!(skm->skm_mode & SKM_MODE_BATCH)) {
			skmem_slab_free(skm, buf);
		} else {
			/* restore linkage ('next' is only set in this mode) */
			((struct skmem_obj *)buf)->mo_next = next;
		}
	}

	/*
	 * In batching mode, each object is linked to its neighbor at free
	 * time, and so take the bottom-most object and free it to the slab
	 * layer.  Because of the way the list is reversed during free, this
	 * will bring along the rest of objects above it.
	 */
	if (nrounds > 0 && (skm->skm_mode & SKM_MODE_BATCH)) {
		skmem_slab_batch_free(skm, mg->mg_round[nrounds - 1]);
	}

	/* free the magazine itself back to cache */
	skmem_cache_free(mg->mg_magtype->mt_cache, mg);
}
1686
1687 /*
1688 * Get one or more magazines from the depot.
1689 */
static uint32_t
skmem_depot_batch_alloc(struct skmem_cache *skm, struct skmem_maglist *ml,
    uint32_t *count, struct skmem_mag **list, uint32_t num)
{
	SLIST_HEAD(, skmem_mag) mg_list = SLIST_HEAD_INITIALIZER(mg_list);
	struct skmem_mag *mg;
	uint32_t need = num, c = 0;

	ASSERT(list != NULL && need > 0);

	if (!SKM_DEPOT_LOCK_TRY(skm)) {
		/*
		 * Track the amount of lock contention here; if the contention
		 * level is high (more than skmem_cache_depot_contention per a
		 * given skmem_cache_update_interval interval), then we treat
		 * it as a sign that the per-CPU layer is not using the right
		 * magazine type, and that we'd need to resize it.
		 */
		SKM_DEPOT_LOCK(skm);
		if (skm->skm_mode & SKM_MODE_DYNAMIC) {
			skm->skm_depot_contention++;
		}
	}

	/* pop up to 'num' magazines off the depot maglist */
	while ((mg = SLIST_FIRST(&ml->ml_list)) != NULL) {
		SLIST_REMOVE_HEAD(&ml->ml_list, mg_link);
		SLIST_INSERT_HEAD(&mg_list, mg, mg_link);
		ASSERT(ml->ml_total != 0);
		/* maintain the interval's low-water mark for the reaper */
		if (--ml->ml_total < ml->ml_min) {
			ml->ml_min = ml->ml_total;
		}
		c++;
		ml->ml_alloc++;
		if (--need == 0) {
			break;
		}
	}
	/* debit the caller-supplied depot counter by what we took */
	*count -= c;

	SKM_DEPOT_UNLOCK(skm);

	/* hand back the (reversed) chain of magazines */
	*list = SLIST_FIRST(&mg_list);

	return num - need;
}
1735
1736 /*
1737 * Return one or more magazines to the depot.
1738 */
1739 static void
skmem_depot_batch_free(struct skmem_cache * skm,struct skmem_maglist * ml,uint32_t * count,struct skmem_mag * mg)1740 skmem_depot_batch_free(struct skmem_cache *skm, struct skmem_maglist *ml,
1741 uint32_t *count, struct skmem_mag *mg)
1742 {
1743 struct skmem_mag *nmg;
1744 uint32_t c = 0;
1745
1746 SKM_DEPOT_LOCK(skm);
1747 while (mg != NULL) {
1748 nmg = SLIST_NEXT(mg, mg_link);
1749 SLIST_INSERT_HEAD(&ml->ml_list, mg, mg_link);
1750 ml->ml_total++;
1751 c++;
1752 mg = nmg;
1753 }
1754 *count += c;
1755 SKM_DEPOT_UNLOCK(skm);
1756 }
1757
1758 /*
1759 * Update the depot's working state statistics.
1760 */
1761 static void
skmem_depot_ws_update(struct skmem_cache * skm)1762 skmem_depot_ws_update(struct skmem_cache *skm)
1763 {
1764 SKM_DEPOT_LOCK_SPIN(skm);
1765 skm->skm_full.ml_reaplimit = skm->skm_full.ml_min;
1766 skm->skm_full.ml_min = skm->skm_full.ml_total;
1767 skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_min;
1768 skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1769 SKM_DEPOT_UNLOCK(skm);
1770 }
1771
1772 /*
1773 * Empty the depot's working state statistics (everything's reapable.)
1774 */
1775 static void
skmem_depot_ws_zero(struct skmem_cache * skm)1776 skmem_depot_ws_zero(struct skmem_cache *skm)
1777 {
1778 SKM_DEPOT_LOCK_SPIN(skm);
1779 if (skm->skm_full.ml_reaplimit != skm->skm_full.ml_total ||
1780 skm->skm_full.ml_min != skm->skm_full.ml_total ||
1781 skm->skm_empty.ml_reaplimit != skm->skm_empty.ml_total ||
1782 skm->skm_empty.ml_min != skm->skm_empty.ml_total) {
1783 skm->skm_full.ml_reaplimit = skm->skm_full.ml_total;
1784 skm->skm_full.ml_min = skm->skm_full.ml_total;
1785 skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_total;
1786 skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1787 skm->skm_depot_ws_zero++;
1788 }
1789 SKM_DEPOT_UNLOCK(skm);
1790 }
1791
1792 /*
1793 * Reap magazines that's outside of the working set.
1794 */
1795 static void
skmem_depot_ws_reap(struct skmem_cache * skm)1796 skmem_depot_ws_reap(struct skmem_cache *skm)
1797 {
1798 struct skmem_mag *mg, *nmg;
1799 uint32_t f, e, reap;
1800
1801 reap = f = MIN(skm->skm_full.ml_reaplimit, skm->skm_full.ml_min);
1802 if (reap != 0) {
1803 (void) skmem_depot_batch_alloc(skm, &skm->skm_full,
1804 &skm->skm_depot_full, &mg, reap);
1805 while (mg != NULL) {
1806 nmg = SLIST_NEXT(mg, mg_link);
1807 SLIST_NEXT(mg, mg_link) = NULL;
1808 skmem_magazine_destroy(skm, mg,
1809 mg->mg_magtype->mt_magsize);
1810 mg = nmg;
1811 }
1812 }
1813
1814 reap = e = MIN(skm->skm_empty.ml_reaplimit, skm->skm_empty.ml_min);
1815 if (reap != 0) {
1816 (void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
1817 &skm->skm_depot_empty, &mg, reap);
1818 while (mg != NULL) {
1819 nmg = SLIST_NEXT(mg, mg_link);
1820 SLIST_NEXT(mg, mg_link) = NULL;
1821 skmem_magazine_destroy(skm, mg, 0);
1822 mg = nmg;
1823 }
1824 }
1825
1826 if (f != 0 || e != 0) {
1827 atomic_add_32(&skm->skm_cpu_mag_reap, 1);
1828 }
1829 }
1830
1831 /*
1832 * Performs periodic maintenance on a cache. This is serialized
1833 * through the update thread call, and so we guarantee there's at
1834 * most one update episode in the system at any given time.
1835 */
static void
skmem_cache_update(struct skmem_cache *skm, uint32_t arg)
{
#pragma unused(arg)
	boolean_t resize_mag = FALSE;
	boolean_t rescale_hash = FALSE;

	SKMEM_CACHE_LOCK_ASSERT_HELD();

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());

	/*
	 * If the cache has become much larger or smaller than the
	 * allocated-address hash table, rescale the hash table.
	 * Grow when in-use objects exceed 2x the bucket count (and
	 * we're under the limit); shrink when below half the bucket
	 * count (and above the initial size).
	 */
	SKM_SLAB_LOCK(skm);
	if ((skm->skm_sl_bufinuse > (skm->skm_hash_mask << 1) &&
	    (skm->skm_hash_mask + 1) < skm->skm_hash_limit) ||
	    (skm->skm_sl_bufinuse < (skm->skm_hash_mask >> 1) &&
	    skm->skm_hash_mask > skm->skm_hash_initial)) {
		rescale_hash = TRUE;
	}
	SKM_SLAB_UNLOCK(skm);

	/*
	 * Update the working set.
	 */
	skmem_depot_ws_update(skm);

	/*
	 * If the contention count is greater than the threshold during
	 * the update interval, and if we are not already at the maximum
	 * magazine size, increase it.
	 * NOTE(review): the size guard compares skm_chunksize against
	 * mt_maxbuf — confirm this is the intended "not at max" check.
	 */
	SKM_DEPOT_LOCK_SPIN(skm);
	if (skm->skm_chunksize < skm->skm_magtype->mt_maxbuf &&
	    (int)(skm->skm_depot_contention - skm->skm_depot_contention_prev) >
	    skmem_cache_depot_contention) {
		ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);
		resize_mag = TRUE;
	}
	/* remember this interval's count for the next delta */
	skm->skm_depot_contention_prev = skm->skm_depot_contention;
	SKM_DEPOT_UNLOCK(skm);

	/* perform the expensive work outside the spin-held depot lock */
	if (rescale_hash) {
		skmem_cache_hash_rescale(skm);
	}

	if (resize_mag) {
		skmem_cache_magazine_resize(skm);
	}
}
1889
1890 /*
1891 * Reload the CPU's magazines with mg and its follower (if any).
1892 */
1893 static void
skmem_cpu_batch_reload(struct skmem_cpu_cache * cp,struct skmem_mag * mg,int rounds)1894 skmem_cpu_batch_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg,
1895 int rounds)
1896 {
1897 ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
1898 (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1899 ASSERT(cp->cp_magsize > 0);
1900
1901 cp->cp_loaded = mg;
1902 cp->cp_rounds = rounds;
1903 if (__probable(SLIST_NEXT(mg, mg_link) != NULL)) {
1904 cp->cp_ploaded = SLIST_NEXT(mg, mg_link);
1905 cp->cp_prounds = rounds;
1906 SLIST_NEXT(mg, mg_link) = NULL;
1907 } else {
1908 ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
1909 cp->cp_ploaded = NULL;
1910 cp->cp_prounds = -1;
1911 }
1912 }
1913
1914 /*
1915 * Reload the CPU's magazine with mg and save the previous one.
1916 */
1917 static void
skmem_cpu_reload(struct skmem_cpu_cache * cp,struct skmem_mag * mg,int rounds)1918 skmem_cpu_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg, int rounds)
1919 {
1920 ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
1921 (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1922 ASSERT(cp->cp_magsize > 0);
1923
1924 cp->cp_ploaded = cp->cp_loaded;
1925 cp->cp_prounds = cp->cp_rounds;
1926 cp->cp_loaded = mg;
1927 cp->cp_rounds = rounds;
1928 }
1929
1930 /*
1931 * Allocate a constructed object from the cache.
1932 */
1933 void *
skmem_cache_alloc(struct skmem_cache * skm,uint32_t skmflag)1934 skmem_cache_alloc(struct skmem_cache *skm, uint32_t skmflag)
1935 {
1936 struct skmem_obj *buf;
1937
1938 (void) skmem_cache_batch_alloc(skm, &buf, 1, skmflag);
1939 return buf;
1940 }
1941
1942 /*
1943 * Allocate constructed object(s) from the cache.
1944 */
uint32_t
skmem_cache_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
    uint32_t num, uint32_t skmflag)
{
	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
	struct skmem_obj **top = &(*list);	/* head of chain for caller */
	struct skmem_mag *mg;
	uint32_t need = num;			/* objects still outstanding */

	ASSERT(list != NULL);
	*list = NULL;

	if (need == 0) {
		return 0;
	}
	/* multi-object requests are only valid for batch-capable caches */
	ASSERT(need == 1 || (skm->skm_mode & SKM_MODE_BATCH));

	SKM_CPU_LOCK(cp);
	for (;;) {
		/*
		 * If we have an object in the current CPU's loaded
		 * magazine, return it and we're done.
		 */
		if (cp->cp_rounds > 0) {
			int objs = MIN((unsigned int)cp->cp_rounds, need);
			/*
			 * In the SKM_MODE_BATCH case, objects in are already
			 * linked together with the most recently freed object
			 * at the head of the list; grab as many objects as we
			 * can.  Otherwise we'll just grab 1 object at most.
			 */
			*list = cp->cp_loaded->mg_round[cp->cp_rounds - 1];
			cp->cp_rounds -= objs;
			cp->cp_alloc += objs;

			if (skm->skm_mode & SKM_MODE_BATCH) {
				/* terminate the chain at the last taken obj */
				struct skmem_obj *tail =
				    cp->cp_loaded->mg_round[cp->cp_rounds];
				list = &tail->mo_next;
				*list = NULL;
			}

			/* if we got them all, return to caller */
			if ((need -= objs) == 0) {
				SKM_CPU_UNLOCK(cp);
				goto done;
			}
		}

		/*
		 * The CPU's loaded magazine is empty.  If the previously
		 * loaded magazine was full, exchange and try again.
		 */
		if (cp->cp_prounds > 0) {
			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
			continue;
		}

		/*
		 * If the magazine layer is disabled, allocate from slab.
		 * This can happen either because SKM_MODE_NOMAGAZINES is
		 * set, or because we are resizing the magazine now.
		 */
		if (cp->cp_magsize == 0) {
			break;
		}

		/*
		 * Both of the CPU's magazines are empty; try to get
		 * full magazine(s) from the depot layer.  Upon success,
		 * reload and try again.  To prevent potential thrashing,
		 * replace both empty magazines only if the requested
		 * count exceeds a magazine's worth of objects.
		 */
		(void) skmem_depot_batch_alloc(skm, &skm->skm_full,
		    &skm->skm_depot_full, &mg, (need <= cp->cp_magsize) ? 1 : 2);
		if (mg != NULL) {
			SLIST_HEAD(, skmem_mag) mg_list =
			    SLIST_HEAD_INITIALIZER(mg_list);

			/* collect displaced empty magazines for the depot */
			if (cp->cp_ploaded != NULL) {
				SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
				    mg_link);
			}
			if (SLIST_NEXT(mg, mg_link) == NULL) {
				/*
				 * Depot allocation returns only 1 magazine;
				 * retain current empty magazine.
				 */
				skmem_cpu_reload(cp, mg, cp->cp_magsize);
			} else {
				/*
				 * We got 2 full magazines from depot;
				 * release the current empty magazine
				 * back to the depot layer.
				 */
				if (cp->cp_loaded != NULL) {
					SLIST_INSERT_HEAD(&mg_list,
					    cp->cp_loaded, mg_link);
				}
				skmem_cpu_batch_reload(cp, mg, cp->cp_magsize);
			}
			skmem_depot_batch_free(skm, &skm->skm_empty,
			    &skm->skm_depot_empty, SLIST_FIRST(&mg_list));
			continue;
		}

		/*
		 * The depot layer doesn't have any full magazines;
		 * allocate directly from the slab layer.
		 */
		break;
	}
	SKM_CPU_UNLOCK(cp);

	if (__probable(num > 1 && (skm->skm_mode & SKM_MODE_BATCH) != 0)) {
		struct skmem_obj *rtop, *rlist, *rlistp = NULL;
		uint32_t rlistc, c = 0;

		/*
		 * Get a list of raw objects from the slab layer.
		 */
		rlistc = skmem_slab_batch_alloc(skm, &rlist, need, skmflag);
		ASSERT(rlistc == 0 || rlist != NULL);
		rtop = rlist;

		/*
		 * Construct each object in the raw list.  Upon failure,
		 * free any remaining objects in the list back to the slab
		 * layer, and keep the ones that were successfully constructed.
		 * Here, "oi" and "oim" in each skmem_obj refer to the objects
		 * coming from the master and slave regions (on mirrored
		 * regions), respectively.  They are stored inside the object
		 * temporarily so that we can pass them to the constructor.
		 */
		while (skm->skm_ctor != NULL && rlist != NULL) {
			struct skmem_obj_info *oi = &rlist->mo_info;
			struct skmem_obj_info *oim = &rlist->mo_minfo;
			struct skmem_obj *rlistn = rlist->mo_next;

			/*
			 * Note that the constructor guarantees at least
			 * the size of a pointer at the top of the object
			 * and no more than that.  That means we must not
			 * refer to "oi" and "oim" any longer after the
			 * object goes thru the constructor.
			 */
			if (skm->skm_ctor(oi, ((SKMEM_OBJ_ADDR(oim) != NULL) ?
			    oim : NULL), skm->skm_private, skmflag) != 0) {
				VERIFY(rlist->mo_next == rlistn);
				/* charge all not-yet-constructed objects */
				atomic_add_64(&skm->skm_sl_alloc_fail,
				    rlistc - c);
				/* detach constructed prefix from the rest */
				if (rlistp != NULL) {
					rlistp->mo_next = NULL;
				}
				if (rlist == rtop) {
					rtop = NULL;
					ASSERT(c == 0);
				}
				/* return unconstructed objects to the slab */
				skmem_slab_batch_free(skm, rlist);
				rlist = NULL;
				rlistc = c;
				break;
			}
			VERIFY(rlist->mo_next == rlistn);

			++c;	/* # of constructed objs */
			rlistp = rlist;
			if ((rlist = rlist->mo_next) == NULL) {
				ASSERT(rlistc == c);
				break;
			}
		}

		/*
		 * At this point "top" points to the head of the chain we're
		 * going to return to caller; "list" points to the tail of that
		 * chain.  The second chain begins at "rtop", and we append
		 * that after "list" to form a single chain.  "rlistc" is the
		 * number of objects in "rtop" originated from the slab layer
		 * that have been successfully constructed (if applicable).
		 */
		ASSERT(c == 0 || rtop != NULL);
		need -= rlistc;
		*list = rtop;
	} else {
		struct skmem_obj_info oi, oim;
		void *buf;

		ASSERT(*top == NULL && num == 1 && need == 1);

		/*
		 * Get a single raw object from the slab layer.
		 */
		if (skmem_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
			goto done;
		}

		buf = SKMEM_OBJ_ADDR(&oi);
		ASSERT(buf != NULL);

		/*
		 * Construct the raw object.  Here, "oi" and "oim" refer to
		 * the objects coming from the master and slave regions (on
		 * mirrored regions), respectively.
		 */
		if (skm->skm_ctor != NULL &&
		    skm->skm_ctor(&oi, ((SKMEM_OBJ_ADDR(&oim) != NULL) ?
		    &oim : NULL), skm->skm_private, skmflag) != 0) {
			atomic_add_64(&skm->skm_sl_alloc_fail, 1);
			skmem_slab_free(skm, buf);
			goto done;
		}

		need = 0;
		*list = buf;
		ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
		    (*list)->mo_next == NULL);
	}

done:
	/* if auditing is enabled, record this transaction */
	if (__improbable(*top != NULL &&
	    (skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_buf(skm, *top);
	}

	/* number of objects actually allocated */
	return num - need;
}
2174
2175 /*
2176 * Free a constructed object to the cache.
2177 */
2178 void
skmem_cache_free(struct skmem_cache * skm,void * buf)2179 skmem_cache_free(struct skmem_cache *skm, void *buf)
2180 {
2181 if (skm->skm_mode & SKM_MODE_BATCH) {
2182 ((struct skmem_obj *)buf)->mo_next = NULL;
2183 }
2184 skmem_cache_batch_free(skm, (struct skmem_obj *)buf);
2185 }
2186
/*
 * Free constructed object(s) to the cache, preferring the per-CPU
 * magazine layer and falling back to the slab layer when magazines
 * are unavailable or disabled.
 */
void
skmem_cache_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
{
	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
	struct skmem_magtype *mtp;
	struct skmem_mag *mg;
	struct skmem_obj *listn;

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_buf(skm, list);
	}

	SKM_CPU_LOCK(cp);
	for (;;) {
		/*
		 * If there's an available space in the current CPU's
		 * loaded magazine, place it there and we're done.
		 * (unsigned compare also rejects cp_rounds == -1.)
		 */
		if ((unsigned int)cp->cp_rounds <
		    (unsigned int)cp->cp_magsize) {
			/*
			 * In the SKM_MODE_BATCH case, reverse the list
			 * while we place each object into the magazine;
			 * this effectively causes the most recently
			 * freed object to be reused during allocation.
			 */
			if (skm->skm_mode & SKM_MODE_BATCH) {
				listn = list->mo_next;
				list->mo_next = (cp->cp_rounds == 0) ? NULL :
				    cp->cp_loaded->mg_round[cp->cp_rounds - 1];
			} else {
				listn = NULL;
			}

			cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
			cp->cp_free++;

			/* more objects to place; go around again */
			if ((list = listn) != NULL) {
				continue;
			}

			SKM_CPU_UNLOCK(cp);
			return;
		}

		/*
		 * The loaded magazine is full.  If the previously
		 * loaded magazine was empty, exchange and try again.
		 */
		if (cp->cp_prounds == 0) {
			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
			continue;
		}

		/*
		 * If the magazine layer is disabled, free to slab.
		 * This can happen either because SKM_MODE_NOMAGAZINES
		 * is set, or because we are resizing the magazine now.
		 */
		if (cp->cp_magsize == 0) {
			break;
		}

		/*
		 * Both magazines for the CPU are full; try to get
		 * empty magazine(s) from the depot.  If we get one,
		 * exchange a full magazine with it and place the
		 * object in there.
		 *
		 * TODO: Because the caller currently doesn't indicate
		 * the number of objects in the list, we choose the more
		 * conservative approach of allocating only 1 empty
		 * magazine (to prevent potential thrashing).  Once we
		 * have the object count, we can replace 1 with similar
		 * logic as used in skmem_cache_batch_alloc().
		 */
		(void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
		    &skm->skm_depot_empty, &mg, 1);
		if (mg != NULL) {
			SLIST_HEAD(, skmem_mag) mg_list =
			    SLIST_HEAD_INITIALIZER(mg_list);

			/* collect displaced full magazines for the depot */
			if (cp->cp_ploaded != NULL) {
				SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
				    mg_link);
			}
			if (SLIST_NEXT(mg, mg_link) == NULL) {
				/*
				 * Depot allocation returns only 1 magazine;
				 * retain current full magazine.
				 */
				skmem_cpu_reload(cp, mg, 0);
			} else {
				/*
				 * We got 2 empty magazines from depot;
				 * release the current full magazine back
				 * to the depot layer.
				 */
				if (cp->cp_loaded != NULL) {
					SLIST_INSERT_HEAD(&mg_list,
					    cp->cp_loaded, mg_link);
				}
				skmem_cpu_batch_reload(cp, mg, 0);
			}
			skmem_depot_batch_free(skm, &skm->skm_full,
			    &skm->skm_depot_full, SLIST_FIRST(&mg_list));
			continue;
		}

		/*
		 * We can't get any empty magazine from the depot, and
		 * so we need to allocate one.  If the allocation fails,
		 * just fall through, deconstruct and free the object
		 * to the slab layer.
		 */
		mtp = skm->skm_magtype;
		/* drop the CPU lock across the blocking-free allocation */
		SKM_CPU_UNLOCK(cp);
		mg = skmem_cache_alloc(mtp->mt_cache, SKMEM_NOSLEEP);
		SKM_CPU_LOCK(cp);

		if (mg != NULL) {
			/*
			 * We allocated an empty magazine, but since we
			 * dropped the CPU lock above the magazine size
			 * may have changed.  If that's the case free
			 * the magazine and try again.
			 */
			if (cp->cp_magsize != mtp->mt_magsize) {
				SKM_CPU_UNLOCK(cp);
				skmem_cache_free(mtp->mt_cache, mg);
				SKM_CPU_LOCK(cp);
				continue;
			}

			/*
			 * We have a magazine with the right size;
			 * add it to the depot and try again.
			 */
			ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
			skmem_depot_batch_free(skm, &skm->skm_empty,
			    &skm->skm_depot_empty, mg);
			continue;
		}

		/*
		 * We can't get an empty magazine, so free to slab.
		 */
		break;
	}
	SKM_CPU_UNLOCK(cp);

	/*
	 * We weren't able to free the constructed object(s) to the
	 * magazine layer, so deconstruct them and free to the slab.
	 */
	if (__probable((skm->skm_mode & SKM_MODE_BATCH) &&
	    list->mo_next != NULL)) {
		/* whatever is left from original list */
		struct skmem_obj *top = list;

		while (list != NULL && skm->skm_dtor != NULL) {
			listn = list->mo_next;
			list->mo_next = NULL;

			/* deconstruct the object */
			/* NOTE(review): loop condition already guarantees
			 * skm_dtor != NULL here; inner check is redundant */
			if (skm->skm_dtor != NULL) {
				skm->skm_dtor((void *)list, skm->skm_private);
			}

			list->mo_next = listn;
			list = listn;
		}

		skmem_slab_batch_free(skm, top);
	} else {
		/* deconstruct the object */
		if (skm->skm_dtor != NULL) {
			skm->skm_dtor((void *)list, skm->skm_private);
		}

		skmem_slab_free(skm, (void *)list);
	}
}
2371
2372 /*
2373 * Return the maximum number of objects cached at the magazine layer
2374 * based on the chunk size. This takes into account the starting
2375 * magazine type as well as the final magazine type used in resizing.
2376 */
2377 uint32_t
skmem_cache_magazine_max(uint32_t chunksize)2378 skmem_cache_magazine_max(uint32_t chunksize)
2379 {
2380 struct skmem_magtype *mtp;
2381 uint32_t magsize_max;
2382
2383 VERIFY(ncpu != 0);
2384 VERIFY(chunksize > 0);
2385
2386 /* find a suitable magazine type for this chunk size */
2387 for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
2388 continue;
2389 }
2390
2391 /* and find the last magazine type */
2392 for (;;) {
2393 magsize_max = mtp->mt_magsize;
2394 if (mtp == skmem_cache_magsize_last ||
2395 chunksize >= mtp->mt_maxbuf) {
2396 break;
2397 }
2398 ++mtp;
2399 VERIFY(mtp <= skmem_cache_magsize_last);
2400 }
2401
2402 return ncpu * magsize_max * 2; /* two magazines per CPU */
2403 }
2404
2405 /*
2406 * Return true if SKMEM_DEBUG_NOMAGAZINES is not set on skmem_debug.
2407 */
2408 boolean_t
skmem_allow_magazines(void)2409 skmem_allow_magazines(void)
2410 {
2411 return !(skmem_debug & SKMEM_DEBUG_NOMAGAZINES);
2412 }
2413
2414 /*
2415 * Purge all magazines from a cache and disable its per-CPU magazines layer.
2416 */
static void
skmem_cache_magazine_purge(struct skmem_cache *skm)
{
	struct skmem_cpu_cache *cp;
	struct skmem_mag *mg, *pmg;
	int rounds, prounds;
	uint32_t cpuid, mg_cnt = 0, pmg_cnt = 0;

	SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx", SK_KVA(skm));

	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		cp = &skm->skm_cpu_cache[cpuid];

		/*
		 * Detach both magazines and disable the magazine layer
		 * (cp_magsize = 0) under the per-CPU lock; the actual
		 * destruction happens after the lock is dropped.
		 */
		SKM_CPU_LOCK_SPIN(cp);
		mg = cp->cp_loaded;
		pmg = cp->cp_ploaded;
		rounds = cp->cp_rounds;
		prounds = cp->cp_prounds;
		cp->cp_loaded = NULL;
		cp->cp_ploaded = NULL;
		cp->cp_rounds = -1;
		cp->cp_prounds = -1;
		cp->cp_magsize = 0;
		SKM_CPU_UNLOCK(cp);

		if (mg != NULL) {
			skmem_magazine_destroy(skm, mg, rounds);
			++mg_cnt;
		}
		if (pmg != NULL) {
			skmem_magazine_destroy(skm, pmg, prounds);
			++pmg_cnt;
		}
	}

	/* account for this purge if any magazine was destroyed */
	if (mg_cnt != 0 || pmg_cnt != 0) {
		atomic_add_32(&skm->skm_cpu_mag_purge, 1);
	}

	/* zero the depot working set, then reap all depot magazines */
	skmem_depot_ws_zero(skm);
	skmem_depot_ws_reap(skm);
}
2461
2462 /*
2463 * Enable magazines on a cache. Must only be called on a cache with
2464 * its per-CPU magazines layer disabled (e.g. due to purge).
2465 */
2466 static void
skmem_cache_magazine_enable(struct skmem_cache * skm,uint32_t arg)2467 skmem_cache_magazine_enable(struct skmem_cache *skm, uint32_t arg)
2468 {
2469 #pragma unused(arg)
2470 struct skmem_cpu_cache *cp;
2471 uint32_t cpuid;
2472
2473 if (skm->skm_mode & SKM_MODE_NOMAGAZINES) {
2474 return;
2475 }
2476
2477 for (cpuid = 0; cpuid < ncpu; cpuid++) {
2478 cp = &skm->skm_cpu_cache[cpuid];
2479 SKM_CPU_LOCK_SPIN(cp);
2480 /* the magazines layer must be disabled at this point */
2481 ASSERT(cp->cp_loaded == NULL);
2482 ASSERT(cp->cp_ploaded == NULL);
2483 ASSERT(cp->cp_rounds == -1);
2484 ASSERT(cp->cp_prounds == -1);
2485 ASSERT(cp->cp_magsize == 0);
2486 cp->cp_magsize = skm->skm_magtype->mt_magsize;
2487 SKM_CPU_UNLOCK(cp);
2488 }
2489
2490 SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx chunksize %u magsize %d",
2491 SK_KVA(skm), (uint32_t)skm->skm_chunksize,
2492 SKMEM_CPU_CACHE(skm)->cp_magsize);
2493 }
2494
2495 /*
2496 * Enter the cache resize perimeter. Upon success, claim exclusivity
2497 * on the perimeter and return 0, else EBUSY. Caller may indicate
2498 * whether or not they're willing to wait.
2499 */
static int
skmem_cache_resize_enter(struct skmem_cache *skm, boolean_t can_sleep)
{
	SKM_RESIZE_LOCK(skm);
	if (skm->skm_rs_owner == current_thread()) {
		/* recursive entry by the owner; skm_rs_busy is the count */
		ASSERT(skm->skm_rs_busy != 0);
		skm->skm_rs_busy++;
		goto done;
	}
	if (!can_sleep) {
		/* non-blocking caller: fail immediately if occupied */
		if (skm->skm_rs_busy != 0) {
			SKM_RESIZE_UNLOCK(skm);
			return EBUSY;
		}
	} else {
		/* block until the current owner leaves the perimeter */
		while (skm->skm_rs_busy != 0) {
			skm->skm_rs_want++;
			(void) assert_wait(&skm->skm_rs_busy, THREAD_UNINT);
			SKM_RESIZE_UNLOCK(skm);
			(void) thread_block(THREAD_CONTINUE_NULL);
			SK_DF(SK_VERB_MEM_CACHE, "waited for skm \"%s\" "
			    "(0x%llx) busy=%u", skm->skm_name,
			    SK_KVA(skm), skm->skm_rs_busy);
			/* re-acquire and re-check; wakeups can race */
			SKM_RESIZE_LOCK(skm);
		}
	}
	SKM_RESIZE_LOCK_ASSERT_HELD(skm);
	ASSERT(skm->skm_rs_busy == 0);
	/* claim ownership of the perimeter */
	skm->skm_rs_busy++;
	skm->skm_rs_owner = current_thread();
done:
	SKM_RESIZE_UNLOCK(skm);
	return 0;
}
2534
2535 /*
2536 * Exit the cache resize perimeter and unblock any waiters.
2537 */
2538 static void
skmem_cache_resize_exit(struct skmem_cache * skm)2539 skmem_cache_resize_exit(struct skmem_cache *skm)
2540 {
2541 uint32_t want;
2542
2543 SKM_RESIZE_LOCK(skm);
2544 ASSERT(skm->skm_rs_busy != 0);
2545 ASSERT(skm->skm_rs_owner == current_thread());
2546 if (--skm->skm_rs_busy == 0) {
2547 skm->skm_rs_owner = NULL;
2548 /*
2549 * We're done; notify anyone that has lost the race.
2550 */
2551 if ((want = skm->skm_rs_want) != 0) {
2552 skm->skm_rs_want = 0;
2553 wakeup((void *)&skm->skm_rs_busy);
2554 SKM_RESIZE_UNLOCK(skm);
2555 } else {
2556 SKM_RESIZE_UNLOCK(skm);
2557 }
2558 } else {
2559 SKM_RESIZE_UNLOCK(skm);
2560 }
2561 }
2562
2563 /*
2564 * Recompute a cache's magazine size. This is an expensive operation
2565 * and should not be done frequently; larger magazines provide for a
2566 * higher transfer rate with the depot while smaller magazines reduce
2567 * the memory consumption.
2568 */
static void
skmem_cache_magazine_resize(struct skmem_cache *skm)
{
	struct skmem_magtype *mtp = skm->skm_magtype;

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());
	ASSERT(!(skm->skm_mode & SKM_MODE_NOMAGAZINES));
	/* depot contention only applies to dynamic mode */
	ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);

	/*
	 * Although we're executing in the context of the update thread
	 * call, we need to protect the per-CPU states during resizing
	 * against other synchronous cache purge/reenable requests that
	 * could take place in parallel.
	 */
	if (skm->skm_chunksize < mtp->mt_maxbuf) {
		(void) skmem_cache_resize_enter(skm, TRUE);
		/* drain the per-CPU layer before switching magazine types */
		skmem_cache_magazine_purge(skm);

		/*
		 * Upgrade to the next magazine type with larger size.
		 */
		SKM_DEPOT_LOCK_SPIN(skm);
		skm->skm_cpu_mag_resize++;
		skm->skm_magtype = ++mtp;
		skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
		/*
		 * Push the contention snapshot far ahead so that the
		 * signed delta computed in skmem_cache_update() stays
		 * negative for a while, preventing an immediate
		 * back-to-back resize.
		 */
		skm->skm_depot_contention_prev =
		    skm->skm_depot_contention + INT_MAX;
		SKM_DEPOT_UNLOCK(skm);

		skmem_cache_magazine_enable(skm, 0);
		skmem_cache_resize_exit(skm);
	}
}
2605
2606 /*
2607 * Rescale the cache's allocated-address hash table.
2608 */
static void
skmem_cache_hash_rescale(struct skmem_cache *skm)
{
	struct skmem_bufctl_bkt *old_table, *new_table;
	size_t old_size, new_size;
	uint32_t i, moved = 0;

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());

	/*
	 * To get small average lookup time (lookup depth near 1.0), the hash
	 * table size should be roughly the same (not necessarily equivalent)
	 * as the cache size.
	 */
	new_size = MAX(skm->skm_hash_initial,
	    (1 << (flsll(3 * skm->skm_sl_bufinuse + 4) - 2)));
	new_size = MIN(skm->skm_hash_limit, new_size);
	old_size = (skm->skm_hash_mask + 1);

	/* skip the rescale unless the size changes by more than 2x */
	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
		return;
	}

	/* allocate without blocking; just skip this attempt on failure */
	new_table = sk_alloc_type_array(struct skmem_bufctl_bkt, new_size,
	    Z_NOWAIT, skmem_tag_bufctl_hash);
	if (__improbable(new_table == NULL)) {
		return;
	}

	for (i = 0; i < new_size; i++) {
		SLIST_INIT(&new_table[i].bcb_head);
	}

	SKM_SLAB_LOCK(skm);

	/* re-read the old size under the slab lock before swapping */
	old_size = (skm->skm_hash_mask + 1);
	old_table = skm->skm_hash_table;

	skm->skm_hash_mask = (new_size - 1);
	skm->skm_hash_table = new_table;
	skm->skm_sl_rescale++;

	/* rehash every bufctl from the old table into the new one */
	for (i = 0; i < old_size; i++) {
		struct skmem_bufctl_bkt *bcb = &old_table[i];
		struct skmem_bufctl_bkt *new_bcb;
		struct skmem_bufctl *bc;

		while ((bc = SLIST_FIRST(&bcb->bcb_head)) != NULL) {
			SLIST_REMOVE_HEAD(&bcb->bcb_head, bc_link);
			new_bcb = SKMEM_CACHE_HASH(skm, bc->bc_addr);
			/*
			 * Ideally we want to insert tail here, but simple
			 * list doesn't give us that.  The fact that we are
			 * essentially reversing the order is not a big deal
			 * here vis-a-vis the new table size.
			 */
			SLIST_INSERT_HEAD(&new_bcb->bcb_head, bc, bc_link);
			++moved;
		}
		ASSERT(SLIST_EMPTY(&bcb->bcb_head));
	}

	SK_DF(SK_VERB_MEM_CACHE,
	    "skm 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skm),
	    (uint32_t)old_size, (uint32_t)new_size, moved);

	SKM_SLAB_UNLOCK(skm);

	sk_free_type_array(struct skmem_bufctl_bkt, old_size, old_table);
}
2680
2681 /*
2682 * Apply a function to operate on all caches.
2683 */
2684 static void
skmem_cache_applyall(void (* func)(struct skmem_cache *,uint32_t),uint32_t arg)2685 skmem_cache_applyall(void (*func)(struct skmem_cache *, uint32_t), uint32_t arg)
2686 {
2687 struct skmem_cache *skm;
2688
2689 net_update_uptime();
2690
2691 SKMEM_CACHE_LOCK();
2692 TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
2693 func(skm, arg);
2694 }
2695 SKMEM_CACHE_UNLOCK();
2696 }
2697
2698 /*
2699 * Reclaim unused memory from a cache.
2700 */
2701 static void
skmem_cache_reclaim(struct skmem_cache * skm,uint32_t lowmem)2702 skmem_cache_reclaim(struct skmem_cache *skm, uint32_t lowmem)
2703 {
2704 /*
2705 * Inform the owner to free memory if possible; the reclaim
2706 * policy is left to the owner. This is just an advisory.
2707 */
2708 if (skm->skm_reclaim != NULL) {
2709 skm->skm_reclaim(skm->skm_private);
2710 }
2711
2712 if (lowmem) {
2713 /*
2714 * If another thread is in the process of purging or
2715 * resizing, bail out and let the currently-ongoing
2716 * purging take its natural course.
2717 */
2718 if (skmem_cache_resize_enter(skm, FALSE) == 0) {
2719 skmem_cache_magazine_purge(skm);
2720 skmem_cache_magazine_enable(skm, 0);
2721 skmem_cache_resize_exit(skm);
2722 }
2723 } else {
2724 skmem_depot_ws_reap(skm);
2725 }
2726 }
2727
2728 /*
2729 * Thread call callback for reap.
2730 */
2731 static void
skmem_cache_reap_func(thread_call_param_t dummy,thread_call_param_t arg)2732 skmem_cache_reap_func(thread_call_param_t dummy, thread_call_param_t arg)
2733 {
2734 #pragma unused(dummy)
2735 void (*func)(void) = arg;
2736
2737 ASSERT(func == skmem_cache_reap_start || func == skmem_cache_reap_done);
2738 func();
2739 }
2740
2741 /*
2742 * Start reaping all caches; this is serialized via thread call.
2743 */
static void
skmem_cache_reap_start(void)
{
	SK_DF(SK_VERB_MEM_CACHE, "now running");
	/* reclaim each cache; tell it whether the system is low on memory */
	skmem_cache_applyall(skmem_cache_reclaim, skmem_lowmem_check());
	/* schedule the "done" callback to clear the reaping flag later */
	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_done,
	    (skmem_cache_update_interval * NSEC_PER_SEC));
}
2752
2753 /*
2754 * Stop reaping; this would allow another reap request to occur.
2755 */
static void
skmem_cache_reap_done(void)
{
	volatile uint32_t *flag = &skmem_cache_reaping;

	/* clear the reaping flag and publish the store to other CPUs */
	*flag = 0;
	membar_sync();
}
2764
2765 /*
2766 * Immediately reap all unused memory of a cache. If purging,
2767 * also purge the cached objects at the CPU layer.
2768 */
2769 void
skmem_cache_reap_now(struct skmem_cache * skm,boolean_t purge)2770 skmem_cache_reap_now(struct skmem_cache *skm, boolean_t purge)
2771 {
2772 if (purge) {
2773 /*
2774 * If another thread is in the process of purging or
2775 * resizing, bail out and let the currently-ongoing
2776 * purging take its natural course.
2777 */
2778 if (skmem_cache_resize_enter(skm, FALSE) == 0) {
2779 skmem_cache_magazine_purge(skm);
2780 skmem_cache_magazine_enable(skm, 0);
2781 skmem_cache_resize_exit(skm);
2782 }
2783 } else {
2784 skmem_depot_ws_zero(skm);
2785 skmem_depot_ws_reap(skm);
2786 }
2787 }
2788
2789 /*
2790 * Request a global reap operation to be dispatched.
2791 */
void
skmem_cache_reap(void)
{
	/*
	 * Only one reaping episode is allowed at a time; the atomic
	 * 0->1 transition of skmem_cache_reaping is the gate.  Also
	 * skip if this thread holds the global skmem lock.
	 */
	if (skmem_lock_owner == current_thread() ||
	    !atomic_test_set_32(&skmem_cache_reaping, 0, 1)) {
		return;
	}

	/* hand off to the thread call, which runs skmem_cache_reap_start */
	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_start, 0);
}
2803
2804 /*
2805 * Reap internal caches.
2806 */
void
skmem_reap_caches(boolean_t purge)
{
	/* internal slab and bufctl metadata caches */
	skmem_cache_reap_now(skmem_slab_cache, purge);
	skmem_cache_reap_now(skmem_bufctl_cache, purge);

	/* packet buffer pool objects */
	pp_reap_caches(purge);

	/* also handle the region cache(s) */
	skmem_region_reap_caches(purge);
}
2819
2820 /*
2821 * Thread call callback for update.
2822 */
static void
skmem_cache_update_func(thread_call_param_t dummy, thread_call_param_t arg)
{
#pragma unused(dummy, arg)
	sk_protect_t protect;

	/* mark this thread as the cache-update context for the walk */
	protect = sk_cache_update_protect();
	skmem_cache_applyall(skmem_cache_update, 0);
	sk_cache_update_unprotect(protect);

	/* re-arm the periodic update thread call */
	skmem_dispatch(skmem_cache_update_tc, NULL,
	    (skmem_cache_update_interval * NSEC_PER_SEC));
}
2836
2837 /*
2838 * Given a buffer control, record the current transaction.
2839 */
2840 __attribute__((noinline, cold, not_tail_called))
2841 static inline void
skmem_audit_bufctl(struct skmem_bufctl * bc)2842 skmem_audit_bufctl(struct skmem_bufctl *bc)
2843 {
2844 struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc;
2845 struct timeval tv;
2846
2847 microuptime(&tv);
2848 bca->bc_thread = current_thread();
2849 bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000));
2850 bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH);
2851 }
2852
2853 /*
2854 * Given an object, find its buffer control and record the transaction.
2855 */
__attribute__((noinline, cold, not_tail_called))
static inline void
skmem_audit_buf(struct skmem_cache *skm, struct skmem_obj *list)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;

	/* pseudo caches have no bufctl hash table to search */
	ASSERT(!(skm->skm_mode & SKM_MODE_PSEUDO));

	SKM_SLAB_LOCK(skm);
	while (list != NULL) {
		void *buf = list;

		/* look up the object's bufctl via the address hash */
		bcb = SKMEM_CACHE_HASH(skm, buf);
		SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
			if (bc->bc_addr == buf) {
				break;
			}
		}

		/* every outstanding object must have a bufctl */
		if (__improbable(bc == NULL)) {
			panic("%s: %s failed to get bufctl for %p",
			    __func__, skm->skm_name, buf);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		skmem_audit_bufctl(bc);

		/* only batch-capable caches carry a chain; else stop */
		if (!(skm->skm_mode & SKM_MODE_BATCH)) {
			break;
		}

		list = list->mo_next;
	}
	SKM_SLAB_UNLOCK(skm);
}
2893
/*
 * Fill out a sk_stats_cache snapshot for one cache.  Always returns
 * the space required; the output is only written when the provided
 * buffer is large enough.
 */
static size_t
skmem_cache_mib_get_stats(struct skmem_cache *skm, void *out, size_t len)
{
	size_t actual_space = sizeof(struct sk_stats_cache);
	struct sk_stats_cache *sca = out;
	int contention;

	if (out == NULL || len < actual_space) {
		goto done;
	}

	bzero(sca, sizeof(*sca));
	(void) snprintf(sca->sca_name, sizeof(sca->sca_name), "%s",
	    skm->skm_name);
	uuid_copy(sca->sca_uuid, skm->skm_uuid);
	uuid_copy(sca->sca_ruuid, skm->skm_region->skr_uuid);
	sca->sca_mode = skm->skm_mode;
	sca->sca_bufsize = (uint64_t)skm->skm_bufsize;
	sca->sca_objsize = (uint64_t)skm->skm_objsize;
	sca->sca_chunksize = (uint64_t)skm->skm_chunksize;
	sca->sca_slabsize = (uint64_t)skm->skm_slabsize;
	sca->sca_bufalign = (uint64_t)skm->skm_bufalign;
	sca->sca_objalign = (uint64_t)skm->skm_objalign;

	sca->sca_cpu_mag_size = skm->skm_cpu_mag_size;
	sca->sca_cpu_mag_resize = skm->skm_cpu_mag_resize;
	sca->sca_cpu_mag_purge = skm->skm_cpu_mag_purge;
	sca->sca_cpu_mag_reap = skm->skm_cpu_mag_reap;
	sca->sca_depot_full = skm->skm_depot_full;
	sca->sca_depot_empty = skm->skm_depot_empty;
	sca->sca_depot_ws_zero = skm->skm_depot_ws_zero;
	/* in case of a race this might be a negative value, turn it into 0 */
	if ((contention = (int)(skm->skm_depot_contention -
	    skm->skm_depot_contention_prev)) < 0) {
		contention = 0;
	}
	sca->sca_depot_contention_factor = contention;

	sca->sca_sl_create = skm->skm_sl_create;
	sca->sca_sl_destroy = skm->skm_sl_destroy;
	sca->sca_sl_alloc = skm->skm_sl_alloc;
	sca->sca_sl_free = skm->skm_sl_free;
	sca->sca_sl_alloc_fail = skm->skm_sl_alloc_fail;
	sca->sca_sl_partial = skm->skm_sl_partial;
	sca->sca_sl_empty = skm->skm_sl_empty;
	sca->sca_sl_bufinuse = skm->skm_sl_bufinuse;
	sca->sca_sl_rescale = skm->skm_sl_rescale;
	sca->sca_sl_hash_size = (skm->skm_hash_mask + 1);

done:
	return actual_space;
}
2946
/*
 * sysctl handler: copy out one sk_stats_cache record per cache.
 * Implements the standard two-pass sysctl sizing protocol: a request
 * with a NULL oldptr only computes the total space required, which
 * SYSCTL_OUT then reports back to userspace.
 */
static int
skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	struct skmem_cache *skm;
	size_t actual_space;
	size_t buffer_space;
	size_t allocated_space;
	caddr_t buffer = NULL;
	caddr_t scan;
	int error = 0;

	/* restricted to superuser; stats expose kernel layout details */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		/* clamp to a sane upper bound on the intermediate buffer */
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		allocated_space = buffer_space;
		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_cache_mib);
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
		/* sizing pass: no data copied, only actual_space accumulated */
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;

	/* hold the cache list lock across the whole walk so the set of
	 * caches (and hence the computed size) is consistent */
	SKMEM_CACHE_LOCK();
	TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
		/* with scan == NULL this only returns the record size */
		size_t size = skmem_cache_mib_get_stats(skm, scan, buffer_space);
		if (scan != NULL) {
			if (buffer_space < size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += size;
			buffer_space -= size;
		}
		actual_space += size;
	}
	SKMEM_CACHE_UNLOCK();

	/* copy out what was gathered; with buffer == NULL this reports
	 * the required size to userspace (standard sysctl idiom) */
	if (actual_space != 0) {
		int out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		/* allocated_space is always set when buffer != NULL */
		sk_free_data(buffer, allocated_space);
	}

	return error;
}
3008