1 /*
2 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h> /* for OSBacktrace */
33 #include <kern/sched_prim.h> /* for assert_wait */
34
35 /*
36 * Memory allocator with per-CPU caching (magazines), derived from the kmem
37 * magazine concept and implementation as described in the following paper:
38 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
39 *
40 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
41 * reserved. Use is subject to license terms.
42 *
43 * This derivative differs from the original kmem slab allocator, in that:
44 *
45 * a) There is always a discrete bufctl per object, even for small sizes.
46 * This increases the overhead, but is necessary as Skywalk objects
47 * coming from the slab may be shared (RO or RW) with userland; therefore
48 * embedding the KVA pointer linkage in freed objects is a non-starter.
49 *
50 * b) Writing patterns to the slab at slab creation or destruction time
51 * (when debugging is enabled) is not implemented, as the object may
52 * be shared (RW) with userland and thus we cannot panic upon pattern
53 * mismatch episodes. This can be relaxed so that we conditionally
54 * verify the pattern for kernel-only memory.
55 *
56 * This derivative also differs from Darwin's mcache allocator (which itself
57 * is a derivative of the original kmem slab allocator), in that:
58 *
59 * 1) The slab layer is internal to skmem_cache, unlike mcache's external
60 * slab layer required to support mbufs. skmem_cache also supports
61 * constructing and deconstructing objects, while mcache does not.
62 * This brings skmem_cache's model closer to that of the original
63 * kmem slab allocator.
64 *
65 * 2) mcache allows for batch allocation and free by way of chaining the
66 * objects together using a linked list. This requires using a part
67 * of the object to act as the linkage, which is against Skywalk's
68 * requirements of not exposing any KVA pointer to userland. Although
69 * this is supported by skmem_cache, chaining is only possible if the
70 * region is not mapped to userland. That implies that kernel-only
71 * objects can be chained provided the cache is created with batching
72 * mode enabled, and that the object is large enough to contain the
73 * skmem_obj structure.
74 *
75 * In other words, skmem_cache is a hybrid of a hybrid custom allocator that
76 * implements features that are required by Skywalk. In addition to being
 * aware of userland access on the buffers, it also supports mirrored backend
78 * memory regions. This allows a cache to manage two independent memory
79 * regions, such that allocating/freeing an object from/to one results in
80 * allocating/freeing a shadow object in another, thus guaranteeing that both
81 * objects share the same lifetime.
82 */
83
84 static uint32_t ncpu; /* total # of initialized CPUs */
85
86 static LCK_MTX_DECLARE_ATTR(skmem_cache_lock, &skmem_lock_grp, &skmem_lock_attr);
87 static struct thread *skmem_lock_owner = THREAD_NULL;
88
89 static LCK_GRP_DECLARE(skmem_sl_lock_grp, "skmem_slab");
90 static LCK_GRP_DECLARE(skmem_dp_lock_grp, "skmem_depot");
91 static LCK_GRP_DECLARE(skmem_cpu_lock_grp, "skmem_cpu_cache");
92
93 #define SKMEM_CACHE_LOCK() do { \
94 lck_mtx_lock(&skmem_cache_lock); \
95 skmem_lock_owner = current_thread(); \
96 } while (0)
97 #define SKMEM_CACHE_UNLOCK() do { \
98 skmem_lock_owner = THREAD_NULL; \
99 lck_mtx_unlock(&skmem_cache_lock); \
100 } while (0)
101 #define SKMEM_CACHE_LOCK_ASSERT_HELD() \
102 LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_OWNED)
103 #define SKMEM_CACHE_LOCK_ASSERT_NOTHELD() \
104 LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_NOTOWNED)
105
106 #define SKM_SLAB_LOCK(_skm) \
107 lck_mtx_lock(&(_skm)->skm_sl_lock)
108 #define SKM_SLAB_LOCK_ASSERT_HELD(_skm) \
109 LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_OWNED)
110 #define SKM_SLAB_LOCK_ASSERT_NOTHELD(_skm) \
111 LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_NOTOWNED)
112 #define SKM_SLAB_UNLOCK(_skm) \
113 lck_mtx_unlock(&(_skm)->skm_sl_lock)
114
115 #define SKM_DEPOT_LOCK(_skm) \
116 lck_mtx_lock(&(_skm)->skm_dp_lock)
117 #define SKM_DEPOT_LOCK_SPIN(_skm) \
118 lck_mtx_lock_spin(&(_skm)->skm_dp_lock)
119 #define SKM_DEPOT_CONVERT_LOCK(_skm) \
120 lck_mtx_convert_spin(&(_skm)->skm_dp_lock)
121 #define SKM_DEPOT_LOCK_TRY(_skm) \
122 lck_mtx_try_lock(&(_skm)->skm_dp_lock)
123 #define SKM_DEPOT_LOCK_ASSERT_HELD(_skm) \
124 LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_OWNED)
125 #define SKM_DEPOT_LOCK_ASSERT_NOTHELD(_skm) \
126 LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_NOTOWNED)
127 #define SKM_DEPOT_UNLOCK(_skm) \
128 lck_mtx_unlock(&(_skm)->skm_dp_lock)
129
130 #define SKM_RESIZE_LOCK(_skm) \
131 lck_mtx_lock(&(_skm)->skm_rs_lock)
132 #define SKM_RESIZE_LOCK_ASSERT_HELD(_skm) \
133 LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_OWNED)
134 #define SKM_RESIZE_LOCK_ASSERT_NOTHELD(_skm) \
135 LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_NOTOWNED)
136 #define SKM_RESIZE_UNLOCK(_skm) \
137 lck_mtx_unlock(&(_skm)->skm_rs_lock)
138
139 #define SKM_CPU_LOCK(_cp) \
140 lck_mtx_lock(&(_cp)->cp_lock)
141 #define SKM_CPU_LOCK_SPIN(_cp) \
142 lck_mtx_lock_spin(&(_cp)->cp_lock)
143 #define SKM_CPU_CONVERT_LOCK(_cp) \
144 lck_mtx_convert_spin(&(_cp)->cp_lock)
145 #define SKM_CPU_LOCK_ASSERT_HELD(_cp) \
146 LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_OWNED)
147 #define SKM_CPU_LOCK_ASSERT_NOTHELD(_cp) \
148 LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_NOTOWNED)
149 #define SKM_CPU_UNLOCK(_cp) \
150 lck_mtx_unlock(&(_cp)->cp_lock)
151
152 #define SKM_ZONE_MAX 256
153
154 static struct zone *skm_zone; /* zone for skmem_cache */
155
156 static struct skmem_cache *skmem_slab_cache; /* cache for skmem_slab */
157 static struct skmem_cache *skmem_bufctl_cache; /* cache for skmem_bufctl */
158 static unsigned int bc_size; /* size of bufctl */
159
160 /*
161 * Magazine types (one per row.)
162 *
163 * The first column defines the number of objects that the magazine can hold.
164 * Using that number, we derive the effective number: the aggregate count of
165 * object pointers, plus 2 pointers (skmem_mag linkage + magazine type).
166 * This would result in an object size that is aligned on the CPU cache
167 * size boundary; the exception to this is the KASAN mode where the size
168 * would be larger due to the redzone regions.
169 *
170 * The second column defines the alignment of the magazine. Because each
171 * magazine is used at the CPU-layer cache, we need to ensure there is no
172 * false sharing across the CPUs, and align the magazines to the maximum
173 * cache alignment size, for simplicity. The value of 0 may be used to
174 * indicate natural pointer size alignment.
175 *
176 * The third column defines the starting magazine type for a given cache,
177 * determined at the cache's creation time based on its chunk size.
178 *
179 * The fourth column defines the magazine type limit for a given cache.
180 * Magazine resizing will only occur if the chunk size is less than this.
181 */
182 static struct skmem_magtype skmem_magtype[] = {
183 #if defined(__LP64__)
184 { .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 128, .mt_maxbuf = 512,
185 .mt_cache = NULL, .mt_cname = "" },
186 { .mt_magsize = 30, .mt_align = 0, .mt_minbuf = 96, .mt_maxbuf = 256,
187 .mt_cache = NULL, .mt_cname = "" },
188 { .mt_magsize = 46, .mt_align = 0, .mt_minbuf = 64, .mt_maxbuf = 128,
189 .mt_cache = NULL, .mt_cname = "" },
190 { .mt_magsize = 62, .mt_align = 0, .mt_minbuf = 32, .mt_maxbuf = 64,
191 .mt_cache = NULL, .mt_cname = "" },
192 { .mt_magsize = 94, .mt_align = 0, .mt_minbuf = 16, .mt_maxbuf = 32,
193 .mt_cache = NULL, .mt_cname = "" },
194 { .mt_magsize = 126, .mt_align = 0, .mt_minbuf = 8, .mt_maxbuf = 16,
195 .mt_cache = NULL, .mt_cname = "" },
196 { .mt_magsize = 142, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 8,
197 .mt_cache = NULL, .mt_cname = "" },
198 { .mt_magsize = 158, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
199 .mt_cache = NULL, .mt_cname = "" },
200 #else /* !__LP64__ */
201 { .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
202 .mt_cache = NULL, .mt_cname = "" },
203 #endif /* !__LP64__ */
204 };
205
206 /*
207 * Hash table bounds. Start with the initial value, and rescale up to
208 * the specified limit. Ideally we don't need a limit, but in practice
209 * this helps guard against runaways. These values should be revisited
210 * in future and be adjusted as needed.
211 */
212 #define SKMEM_CACHE_HASH_INITIAL 64 /* initial hash table size */
213 #define SKMEM_CACHE_HASH_LIMIT 8192 /* hash table size limit */
214
215 #define SKMEM_CACHE_HASH_INDEX(_a, _s, _m) (((_a) >> (_s)) & (_m))
216 #define SKMEM_CACHE_HASH(_skm, _buf) \
217 (&(_skm)->skm_hash_table[SKMEM_CACHE_HASH_INDEX((uintptr_t)_buf, \
218 (_skm)->skm_hash_shift, (_skm)->skm_hash_mask)])
219
220 /*
221 * The last magazine type.
222 */
223 static struct skmem_magtype *skmem_cache_magsize_last;
224
225 static TAILQ_HEAD(, skmem_cache) skmem_cache_head;
226 static boolean_t skmem_cache_ready;
227
228 static int skmem_slab_alloc_locked(struct skmem_cache *,
229 struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
230 static void skmem_slab_free_locked(struct skmem_cache *, void *);
231 static int skmem_slab_alloc_pseudo_locked(struct skmem_cache *,
232 struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
233 static void skmem_slab_free_pseudo_locked(struct skmem_cache *, void *);
234 static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t);
235 static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *);
236 static int skmem_magazine_ctor(struct skmem_obj_info *,
237 struct skmem_obj_info *, void *, uint32_t);
238 static void skmem_magazine_destroy(struct skmem_cache *, struct skmem_mag *,
239 int);
240 static uint32_t skmem_depot_batch_alloc(struct skmem_cache *,
241 struct skmem_maglist *, uint32_t *, struct skmem_mag **, uint32_t);
242 static void skmem_depot_batch_free(struct skmem_cache *, struct skmem_maglist *,
243 uint32_t *, struct skmem_mag *);
244 static void skmem_depot_ws_update(struct skmem_cache *);
245 static void skmem_depot_ws_zero(struct skmem_cache *);
246 static void skmem_depot_ws_reap(struct skmem_cache *);
247 static void skmem_cache_magazine_purge(struct skmem_cache *);
248 static void skmem_cache_magazine_enable(struct skmem_cache *, uint32_t);
249 static void skmem_cache_magazine_resize(struct skmem_cache *);
250 static void skmem_cache_hash_rescale(struct skmem_cache *);
251 static void skmem_cpu_reload(struct skmem_cpu_cache *, struct skmem_mag *, int);
252 static void skmem_cpu_batch_reload(struct skmem_cpu_cache *,
253 struct skmem_mag *, int);
254 static void skmem_cache_applyall(void (*)(struct skmem_cache *, uint32_t),
255 uint32_t);
256 static void skmem_cache_reclaim(struct skmem_cache *, uint32_t);
257 static void skmem_cache_reap_start(void);
258 static void skmem_cache_reap_done(void);
259 static void skmem_cache_reap_func(thread_call_param_t, thread_call_param_t);
260 static void skmem_cache_update_func(thread_call_param_t, thread_call_param_t);
261 static int skmem_cache_resize_enter(struct skmem_cache *, boolean_t);
262 static void skmem_cache_resize_exit(struct skmem_cache *);
263 static void skmem_audit_bufctl(struct skmem_bufctl *);
264 static void skmem_audit_buf(struct skmem_cache *, struct skmem_obj *);
265 static int skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS;
266
267 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, cache,
268 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
269 0, 0, skmem_cache_mib_get_sysctl, "S,sk_stats_cache",
270 "Skywalk cache statistics");
271
272 static volatile uint32_t skmem_cache_reaping;
273 static thread_call_t skmem_cache_reap_tc;
274 static thread_call_t skmem_cache_update_tc;
275
276 extern kern_return_t thread_terminate(thread_t);
277 extern unsigned int ml_wait_max_cpus(void);
278
279 #define SKMEM_DEBUG_NOMAGAZINES 0x1 /* disable magazines layer */
280 #define SKMEM_DEBUG_AUDIT 0x2 /* audit transactions */
281 #define SKMEM_DEBUG_MASK (SKMEM_DEBUG_NOMAGAZINES|SKMEM_DEBUG_AUDIT)
282
283 #if DEBUG
284 static uint32_t skmem_debug = SKMEM_DEBUG_AUDIT;
285 #else /* !DEBUG */
286 static uint32_t skmem_debug = 0;
287 #endif /* !DEBUG */
288
289 static uint32_t skmem_clear_min = 0; /* clear on free threshold */
290
291 #define SKMEM_CACHE_UPDATE_INTERVAL 11 /* 11 seconds */
292 static uint32_t skmem_cache_update_interval = SKMEM_CACHE_UPDATE_INTERVAL;
293
294 #define SKMEM_DEPOT_CONTENTION 3 /* max failed trylock per interval */
295 static int skmem_cache_depot_contention = SKMEM_DEPOT_CONTENTION;
296
297 /*
298 * Too big a value will cause overflow and thus trip the assertion; the
299 * idea here is to set an upper limit for the time that a particular
300 * thread is allowed to perform retries before we give up and panic.
301 */
#define SKMEM_SLAB_MAX_BACKOFF  (20 * USEC_PER_SEC)     /* 20 seconds, in usec */
303
304 /*
305 * Threshold (in msec) after which we reset the exponential backoff value
306 * back to its (random) initial value. Note that we allow the actual delay
307 * to be at most twice this value.
308 */
309 #define SKMEM_SLAB_BACKOFF_THRES 1024 /* up to ~2 sec (2048 msec) */
310
311 /*
312 * To reduce the likelihood of global synchronization between threads,
313 * we use some random value to start the exponential backoff.
314 */
315 #define SKMEM_SLAB_BACKOFF_RANDOM 4 /* range is [1,4] msec */
316
317 #if (DEVELOPMENT || DEBUG)
318 SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, cache_update_interval,
319 CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_update_interval,
320 SKMEM_CACHE_UPDATE_INTERVAL, "Cache update interval");
321 SYSCTL_INT(_kern_skywalk_mem, OID_AUTO, cache_depot_contention,
322 CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_depot_contention,
323 SKMEM_DEPOT_CONTENTION, "Depot contention");
324
325 static uint32_t skmem_cache_update_interval_saved = SKMEM_CACHE_UPDATE_INTERVAL;
326
327 /*
328 * Called by skmem_test_start() to set the update interval.
329 */
330 void
skmem_cache_test_start(uint32_t i)331 skmem_cache_test_start(uint32_t i)
332 {
333 skmem_cache_update_interval_saved = skmem_cache_update_interval;
334 skmem_cache_update_interval = i;
335 }
336
337 /*
338 * Called by skmem_test_stop() to restore the update interval.
339 */
340 void
skmem_cache_test_stop(void)341 skmem_cache_test_stop(void)
342 {
343 skmem_cache_update_interval = skmem_cache_update_interval_saved;
344 }
345 #endif /* (DEVELOPMENT || DEBUG) */
346
347 #define SKMEM_TAG_BUFCTL_HASH "com.apple.skywalk.bufctl.hash"
348 static kern_allocation_name_t skmem_tag_bufctl_hash;
349
350 #define SKMEM_TAG_CACHE_MIB "com.apple.skywalk.cache.mib"
351 static kern_allocation_name_t skmem_tag_cache_mib;
352
353 static int __skmem_cache_pre_inited = 0;
354 static int __skmem_cache_inited = 0;
355
356 /*
357 * Called before skmem_region_init().
358 */
359 void
skmem_cache_pre_init(void)360 skmem_cache_pre_init(void)
361 {
362 vm_size_t skm_size;
363
364 ASSERT(!__skmem_cache_pre_inited);
365
366 ncpu = ml_wait_max_cpus();
367
368 /* allocate extra in case we need to manually align the pointer */
369 if (skm_zone == NULL) {
370 skm_size = SKMEM_CACHE_SIZE(ncpu);
371 #if KASAN
372 /*
373 * When KASAN is enabled, the zone allocator adjusts the
374 * element size to include the redzone regions, in which
375 * case we assume that the elements won't start on the
376 * alignment boundary and thus need to do some fix-ups.
377 * These include increasing the effective object size
378 * which adds at least 136 bytes to the original size,
379 * as computed by skmem_region_params_config() above.
380 */
381 skm_size += (sizeof(void *) + CHANNEL_CACHE_ALIGN_MAX);
382 #endif /* KASAN */
383 skm_size = P2ROUNDUP(skm_size, CHANNEL_CACHE_ALIGN_MAX);
384 skm_zone = zone_create(SKMEM_ZONE_PREFIX ".skm", skm_size,
385 ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
386 }
387
388 TAILQ_INIT(&skmem_cache_head);
389
390 __skmem_cache_pre_inited = 1;
391 }
392
393 /*
394 * Called after skmem_region_init().
395 */
396 void
skmem_cache_init(void)397 skmem_cache_init(void)
398 {
399 uint32_t cpu_cache_line_size = skmem_cpu_cache_line_size();
400 struct skmem_magtype *mtp;
401 uint32_t i;
402
403 _CASSERT(SKMEM_CACHE_HASH_LIMIT >= SKMEM_CACHE_HASH_INITIAL);
404
405 _CASSERT(SKM_MODE_NOMAGAZINES == SCA_MODE_NOMAGAZINES);
406 _CASSERT(SKM_MODE_AUDIT == SCA_MODE_AUDIT);
407 _CASSERT(SKM_MODE_NOREDIRECT == SCA_MODE_NOREDIRECT);
408 _CASSERT(SKM_MODE_BATCH == SCA_MODE_BATCH);
409 _CASSERT(SKM_MODE_DYNAMIC == SCA_MODE_DYNAMIC);
410 _CASSERT(SKM_MODE_CLEARONFREE == SCA_MODE_CLEARONFREE);
411 _CASSERT(SKM_MODE_PSEUDO == SCA_MODE_PSEUDO);
412
413 ASSERT(__skmem_cache_pre_inited);
414 ASSERT(!__skmem_cache_inited);
415
416 PE_parse_boot_argn("skmem_debug", &skmem_debug, sizeof(skmem_debug));
417 skmem_debug &= SKMEM_DEBUG_MASK;
418
419 #if (DEVELOPMENT || DEBUG)
420 PE_parse_boot_argn("skmem_clear_min", &skmem_clear_min,
421 sizeof(skmem_clear_min));
422 #endif /* (DEVELOPMENT || DEBUG) */
423 if (skmem_clear_min == 0) {
424 /* zeroing 2 CPU cache lines practically comes for free */
425 skmem_clear_min = 2 * cpu_cache_line_size;
426 } else {
427 /* round it up to CPU cache line size */
428 skmem_clear_min = (uint32_t)P2ROUNDUP(skmem_clear_min,
429 cpu_cache_line_size);
430 }
431
432 /* create a cache for buffer control structures */
433 if (skmem_debug & SKMEM_DEBUG_AUDIT) {
434 bc_size = sizeof(struct skmem_bufctl_audit);
435 skmem_bufctl_cache = skmem_cache_create("bufctl.audit",
436 bc_size, sizeof(uint64_t), NULL, NULL,
437 NULL, NULL, NULL, 0);
438 } else {
439 bc_size = sizeof(struct skmem_bufctl);
440 skmem_bufctl_cache = skmem_cache_create("bufctl",
441 bc_size, sizeof(uint64_t), NULL, NULL,
442 NULL, NULL, NULL, 0);
443 }
444
445 /* create a cache for slab structures */
446 skmem_slab_cache = skmem_cache_create("slab",
447 sizeof(struct skmem_slab), sizeof(uint64_t), NULL, NULL, NULL,
448 NULL, NULL, 0);
449
450 /*
451 * Go thru the magazine type table and create an cache for each.
452 */
453 for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
454 mtp = &skmem_magtype[i];
455
456 if (mtp->mt_align != 0 &&
457 ((mtp->mt_align & (mtp->mt_align - 1)) != 0 ||
458 mtp->mt_align < (int)cpu_cache_line_size)) {
459 panic("%s: bad alignment %d", __func__, mtp->mt_align);
460 /* NOTREACHED */
461 __builtin_unreachable();
462 }
463 (void) snprintf(mtp->mt_cname, sizeof(mtp->mt_cname),
464 "mg.%d", mtp->mt_magsize);
465
466 /* create an cache for this magazine type */
467 mtp->mt_cache = skmem_cache_create(mtp->mt_cname,
468 SKMEM_MAG_SIZE(mtp->mt_magsize), mtp->mt_align,
469 skmem_magazine_ctor, NULL, NULL, mtp, NULL, 0);
470
471 /* remember the last magazine type */
472 skmem_cache_magsize_last = mtp;
473 }
474
475 VERIFY(skmem_cache_magsize_last != NULL);
476 VERIFY(skmem_cache_magsize_last->mt_minbuf == 0);
477 VERIFY(skmem_cache_magsize_last->mt_maxbuf == 0);
478
479 /*
480 * Allocate thread calls for cache reap and update operations.
481 */
482 skmem_cache_reap_tc =
483 thread_call_allocate_with_options(skmem_cache_reap_func,
484 NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
485 skmem_cache_update_tc =
486 thread_call_allocate_with_options(skmem_cache_update_func,
487 NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
488 if (skmem_cache_reap_tc == NULL || skmem_cache_update_tc == NULL) {
489 panic("%s: thread_call_allocate failed", __func__);
490 /* NOTREACHED */
491 __builtin_unreachable();
492 }
493
494 /*
495 * We're ready; go through existing skmem_cache entries
496 * (if any) and enable the magazines layer for each.
497 */
498 skmem_cache_applyall(skmem_cache_magazine_enable, 0);
499 skmem_cache_ready = TRUE;
500
501 /* and start the periodic cache update machinery */
502 skmem_dispatch(skmem_cache_update_tc, NULL,
503 (skmem_cache_update_interval * NSEC_PER_SEC));
504
505 ASSERT(skmem_tag_bufctl_hash == NULL);
506 skmem_tag_bufctl_hash =
507 kern_allocation_name_allocate(SKMEM_TAG_BUFCTL_HASH, 0);
508 ASSERT(skmem_tag_bufctl_hash != NULL);
509
510 ASSERT(skmem_tag_cache_mib == NULL);
511 skmem_tag_cache_mib =
512 kern_allocation_name_allocate(SKMEM_TAG_CACHE_MIB, 0);
513 ASSERT(skmem_tag_cache_mib != NULL);
514
515 __skmem_cache_inited = 1;
516 }
517
518 void
skmem_cache_fini(void)519 skmem_cache_fini(void)
520 {
521 struct skmem_magtype *mtp;
522 uint32_t i;
523
524 if (__skmem_cache_inited) {
525 ASSERT(TAILQ_EMPTY(&skmem_cache_head));
526
527 for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
528 mtp = &skmem_magtype[i];
529 skmem_cache_destroy(mtp->mt_cache);
530 mtp->mt_cache = NULL;
531 }
532 skmem_cache_destroy(skmem_slab_cache);
533 skmem_slab_cache = NULL;
534 skmem_cache_destroy(skmem_bufctl_cache);
535 skmem_bufctl_cache = NULL;
536
537 if (skmem_cache_reap_tc != NULL) {
538 (void) thread_call_cancel_wait(skmem_cache_reap_tc);
539 (void) thread_call_free(skmem_cache_reap_tc);
540 skmem_cache_reap_tc = NULL;
541 }
542 if (skmem_cache_update_tc != NULL) {
543 (void) thread_call_cancel_wait(skmem_cache_update_tc);
544 (void) thread_call_free(skmem_cache_update_tc);
545 skmem_cache_update_tc = NULL;
546 }
547 if (skmem_tag_bufctl_hash != NULL) {
548 kern_allocation_name_release(skmem_tag_bufctl_hash);
549 skmem_tag_bufctl_hash = NULL;
550 }
551 if (skmem_tag_cache_mib != NULL) {
552 kern_allocation_name_release(skmem_tag_cache_mib);
553 skmem_tag_cache_mib = NULL;
554 }
555
556 __skmem_cache_inited = 0;
557 }
558
559 if (__skmem_cache_pre_inited) {
560 if (skm_zone != NULL) {
561 zdestroy(skm_zone);
562 skm_zone = NULL;
563 }
564
565 __skmem_cache_pre_inited = 0;
566 }
567 }
568
569 /*
570 * Create a cache.
571 */
572 struct skmem_cache *
skmem_cache_create(const char * name,size_t bufsize,size_t bufalign,skmem_ctor_fn_t ctor,skmem_dtor_fn_t dtor,skmem_reclaim_fn_t reclaim,void * private,struct skmem_region * region,uint32_t cflags)573 skmem_cache_create(const char *name, size_t bufsize, size_t bufalign,
574 skmem_ctor_fn_t ctor, skmem_dtor_fn_t dtor, skmem_reclaim_fn_t reclaim,
575 void *private, struct skmem_region *region, uint32_t cflags)
576 {
577 boolean_t pseudo = (region == NULL);
578 struct skmem_magtype *mtp;
579 struct skmem_cache *skm;
580 void *buf;
581 size_t segsize;
582 size_t chunksize;
583 size_t objsize;
584 size_t objalign;
585 uint32_t i, cpuid;
586
587 /* enforce 64-bit minimum alignment for buffers */
588 if (bufalign == 0) {
589 bufalign = SKMEM_CACHE_ALIGN;
590 }
591 bufalign = P2ROUNDUP(bufalign, SKMEM_CACHE_ALIGN);
592
593 /* enforce alignment to be a power of 2 */
594 VERIFY(powerof2(bufalign));
595
596 if (region == NULL) {
597 struct skmem_region_params srp;
598
599 /* batching is currently not supported on pseudo regions */
600 VERIFY(!(cflags & SKMEM_CR_BATCH));
601
602 srp = *skmem_get_default(SKMEM_REGION_INTRINSIC);
603 ASSERT(srp.srp_cflags == SKMEM_REGION_CR_PSEUDO);
604
605 /* objalign is always equal to bufalign */
606 srp.srp_align = objalign = bufalign;
607 srp.srp_r_obj_cnt = 1;
608 srp.srp_r_obj_size = (uint32_t)bufsize;
609 skmem_region_params_config(&srp);
610
611 /* allocate region for intrinsics */
612 region = skmem_region_create(name, &srp, NULL, NULL, NULL);
613 VERIFY(region->skr_c_obj_size >= P2ROUNDUP(bufsize, bufalign));
614 VERIFY(objalign == region->skr_align);
615 #if KASAN
616 /*
617 * When KASAN is enabled, the zone allocator adjusts the
618 * element size to include the redzone regions, in which
619 * case we assume that the elements won't start on the
620 * alignment boundary and thus need to do some fix-ups.
621 * These include increasing the effective object size
622 * which adds at least 16 bytes to the original size,
623 * as computed by skmem_region_params_config() above.
624 */
625 VERIFY(region->skr_c_obj_size >=
626 (bufsize + sizeof(uint64_t) + bufalign));
627 #endif /* KASAN */
628 /* enable magazine resizing by default */
629 cflags |= SKMEM_CR_DYNAMIC;
630
631 /*
632 * For consistency with ZC_ZFREE_CLEARMEM on skr->zreg,
633 * even though it's a no-op since the work is done
634 * at the zone layer instead.
635 */
636 cflags |= SKMEM_CR_CLEARONFREE;
637 } else {
638 objalign = region->skr_align;
639 }
640
641 ASSERT(region != NULL);
642 ASSERT(!(region->skr_mode & SKR_MODE_MIRRORED));
643 segsize = region->skr_seg_size;
644 ASSERT(bufalign <= segsize);
645
646 buf = zalloc_flags(skm_zone, Z_WAITOK | Z_ZERO);
647 #if KASAN
648 /*
649 * In case we didn't get a cache-aligned memory, round it up
650 * accordingly. This is needed in order to get the rest of
651 * structure members aligned properly. It also means that
652 * the memory span gets shifted due to the round up, but it
653 * is okay since we've allocated extra space for this.
654 */
655 skm = (struct skmem_cache *)
656 P2ROUNDUP((intptr_t)buf + sizeof(void *), CHANNEL_CACHE_ALIGN_MAX);
657 void **pbuf = (void **)((intptr_t)skm - sizeof(void *));
658 *pbuf = buf;
659 #else /* !KASAN */
660 /*
661 * We expect that the zone allocator would allocate elements
662 * rounded up to the requested alignment based on the object
663 * size computed in skmem_cache_pre_init() earlier, and
664 * 'skm' is therefore the element address itself.
665 */
666 skm = buf;
667 #endif /* !KASAN */
668 VERIFY(IS_P2ALIGNED(skm, CHANNEL_CACHE_ALIGN_MAX));
669
670 if ((skmem_debug & SKMEM_DEBUG_NOMAGAZINES) ||
671 (cflags & SKMEM_CR_NOMAGAZINES)) {
672 /*
673 * Either the caller insists that this cache should not
674 * utilize magazines layer, or that the system override
675 * to disable magazines layer on all caches has been set.
676 */
677 skm->skm_mode |= SKM_MODE_NOMAGAZINES;
678 } else {
679 /*
680 * Region must be configured with enough objects
681 * to take into account objects at the CPU layer.
682 */
683 ASSERT(!(region->skr_mode & SKR_MODE_NOMAGAZINES));
684 }
685
686 if (cflags & SKMEM_CR_DYNAMIC) {
687 /*
688 * Enable per-CPU cache magazine resizing.
689 */
690 skm->skm_mode |= SKM_MODE_DYNAMIC;
691 }
692
693 /* region stays around after defunct? */
694 if (region->skr_mode & SKR_MODE_NOREDIRECT) {
695 skm->skm_mode |= SKM_MODE_NOREDIRECT;
696 }
697
698 if (cflags & SKMEM_CR_BATCH) {
699 /*
700 * Batch alloc/free involves storing the next object
701 * pointer at the beginning of each object; this is
702 * okay for kernel-only regions, but not those that
703 * are mappable to user space (we can't leak kernel
704 * addresses).
705 */
706 _CASSERT(offsetof(struct skmem_obj, mo_next) == 0);
707 VERIFY(!(region->skr_mode & SKR_MODE_MMAPOK));
708
709 /* batching is currently not supported on pseudo regions */
710 VERIFY(!(region->skr_mode & SKR_MODE_PSEUDO));
711
712 /* validate object size */
713 VERIFY(region->skr_c_obj_size >= sizeof(struct skmem_obj));
714
715 skm->skm_mode |= SKM_MODE_BATCH;
716 }
717
718 uuid_generate_random(skm->skm_uuid);
719 (void) snprintf(skm->skm_name, sizeof(skm->skm_name),
720 "%s.%s", SKMEM_CACHE_PREFIX, name);
721 skm->skm_bufsize = bufsize;
722 skm->skm_bufalign = bufalign;
723 skm->skm_objalign = objalign;
724 skm->skm_ctor = ctor;
725 skm->skm_dtor = dtor;
726 skm->skm_reclaim = reclaim;
727 skm->skm_private = private;
728 skm->skm_slabsize = segsize;
729
730 skm->skm_region = region;
731 /* callee holds reference */
732 skmem_region_slab_config(region, skm);
733 objsize = region->skr_c_obj_size;
734 skm->skm_objsize = objsize;
735
736 if (pseudo) {
737 /*
738 * Release reference from skmem_region_create()
739 * since skm->skm_region holds one now.
740 */
741 ASSERT(region->skr_mode & SKR_MODE_PSEUDO);
742 skmem_region_release(region);
743
744 skm->skm_mode |= SKM_MODE_PSEUDO;
745
746 skm->skm_slab_alloc = skmem_slab_alloc_pseudo_locked;
747 skm->skm_slab_free = skmem_slab_free_pseudo_locked;
748 } else {
749 skm->skm_slab_alloc = skmem_slab_alloc_locked;
750 skm->skm_slab_free = skmem_slab_free_locked;
751
752 /* auditing was requested? (normal regions only) */
753 if (skmem_debug & SKMEM_DEBUG_AUDIT) {
754 ASSERT(bc_size == sizeof(struct skmem_bufctl_audit));
755 skm->skm_mode |= SKM_MODE_AUDIT;
756 }
757 }
758
759 /*
760 * Clear upon free (to slab layer) as long as the region is
761 * not marked as read-only for kernel, and if the chunk size
762 * is within the threshold or if the caller had requested it.
763 */
764 if (!(region->skr_mode & SKR_MODE_KREADONLY)) {
765 if (skm->skm_objsize <= skmem_clear_min ||
766 (cflags & SKMEM_CR_CLEARONFREE)) {
767 skm->skm_mode |= SKM_MODE_CLEARONFREE;
768 }
769 }
770
771 chunksize = bufsize;
772 if (bufalign >= SKMEM_CACHE_ALIGN) {
773 chunksize = P2ROUNDUP(chunksize, SKMEM_CACHE_ALIGN);
774 }
775
776 chunksize = P2ROUNDUP(chunksize, bufalign);
777 if (chunksize > objsize) {
778 panic("%s: (bufsize %lu, chunksize %lu) > objsize %lu",
779 __func__, bufsize, chunksize, objsize);
780 /* NOTREACHED */
781 __builtin_unreachable();
782 }
783 ASSERT(chunksize != 0);
784 skm->skm_chunksize = chunksize;
785
786 lck_mtx_init(&skm->skm_sl_lock, &skmem_sl_lock_grp, &skmem_lock_attr);
787 TAILQ_INIT(&skm->skm_sl_partial_list);
788 TAILQ_INIT(&skm->skm_sl_empty_list);
789
790 /* allocated-address hash table */
791 skm->skm_hash_initial = SKMEM_CACHE_HASH_INITIAL;
792 skm->skm_hash_limit = SKMEM_CACHE_HASH_LIMIT;
793 skm->skm_hash_table = sk_alloc_type_array(struct skmem_bufctl_bkt,
794 skm->skm_hash_initial, Z_WAITOK | Z_NOFAIL, skmem_tag_bufctl_hash);
795
796 skm->skm_hash_mask = (skm->skm_hash_initial - 1);
797 skm->skm_hash_shift = flsll(chunksize) - 1;
798
799 for (i = 0; i < (skm->skm_hash_mask + 1); i++) {
800 SLIST_INIT(&skm->skm_hash_table[i].bcb_head);
801 }
802
803 lck_mtx_init(&skm->skm_dp_lock, &skmem_dp_lock_grp, &skmem_lock_attr);
804
805 /* find a suitable magazine type for this chunk size */
806 for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
807 continue;
808 }
809
810 skm->skm_magtype = mtp;
811 if (!(skm->skm_mode & SKM_MODE_NOMAGAZINES)) {
812 skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
813 }
814
815 /*
816 * Initialize the CPU layer. Each per-CPU structure is aligned
817 * on the CPU cache line boundary to prevent false sharing.
818 */
819 lck_mtx_init(&skm->skm_rs_lock, &skmem_cpu_lock_grp, &skmem_lock_attr);
820 for (cpuid = 0; cpuid < ncpu; cpuid++) {
821 struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];
822
823 VERIFY(IS_P2ALIGNED(ccp, CHANNEL_CACHE_ALIGN_MAX));
824 lck_mtx_init(&ccp->cp_lock, &skmem_cpu_lock_grp,
825 &skmem_lock_attr);
826 ccp->cp_rounds = -1;
827 ccp->cp_prounds = -1;
828 }
829
830 SKMEM_CACHE_LOCK();
831 TAILQ_INSERT_TAIL(&skmem_cache_head, skm, skm_link);
832 SKMEM_CACHE_UNLOCK();
833
834 SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx mode 0x%b",
835 skm->skm_name, SK_KVA(skm), skm->skm_mode, SKM_MODE_BITS);
836 SK_DF(SK_VERB_MEM_CACHE,
837 " bufsz %u bufalign %u chunksz %u objsz %u slabsz %u",
838 (uint32_t)skm->skm_bufsize, (uint32_t)skm->skm_bufalign,
839 (uint32_t)skm->skm_chunksize, (uint32_t)skm->skm_objsize,
840 (uint32_t)skm->skm_slabsize);
841
842 if (skmem_cache_ready) {
843 skmem_cache_magazine_enable(skm, 0);
844 }
845
846 return skm;
847 }
848
/*
 * Destroy a cache.
 *
 * The caller is responsible for ensuring that every object allocated
 * from this cache has been freed back to it; we panic below if any
 * object is still outstanding.  The cache must also be quiesced with
 * respect to resizing (skm_rs_busy/skm_rs_want both zero).
 */
void
skmem_cache_destroy(struct skmem_cache *skm)
{
	uint32_t cpuid;

	/* unlink from the global cache list (seen by the update thread) */
	SKMEM_CACHE_LOCK();
	TAILQ_REMOVE(&skmem_cache_head, skm, skm_link);
	SKMEM_CACHE_UNLOCK();

	/* resizer must be idle; no one is waiting on it either */
	ASSERT(skm->skm_rs_busy == 0);
	ASSERT(skm->skm_rs_want == 0);

	/* purge all cached objects for this cache */
	skmem_cache_magazine_purge(skm);

	/*
	 * Panic if we detect there are unfreed objects; the caller
	 * destroying this cache is responsible for ensuring that all
	 * allocated objects have been freed prior to getting here.
	 */
	SKM_SLAB_LOCK(skm);
	if (skm->skm_sl_bufinuse != 0) {
		panic("%s: '%s' (%p) not empty (%llu unfreed)", __func__,
		    skm->skm_name, (void *)skm, skm->skm_sl_bufinuse);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(TAILQ_EMPTY(&skm->skm_sl_partial_list));
	ASSERT(skm->skm_sl_partial == 0);
	ASSERT(TAILQ_EMPTY(&skm->skm_sl_empty_list));
	ASSERT(skm->skm_sl_empty == 0);
	skm->skm_reclaim = NULL;
	skm->skm_ctor = NULL;
	skm->skm_dtor = NULL;
	SKM_SLAB_UNLOCK(skm);

	if (skm->skm_hash_table != NULL) {
#if (DEBUG || DEVELOPMENT)
		/* with nothing in use, every hash bucket must be empty */
		for (uint32_t i = 0; i < (skm->skm_hash_mask + 1); i++) {
			ASSERT(SLIST_EMPTY(&skm->skm_hash_table[i].bcb_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		sk_free_type_array(struct skmem_bufctl_bkt,
		    skm->skm_hash_mask + 1, skm->skm_hash_table);
		skm->skm_hash_table = NULL;
	}

	/* tear down per-CPU, resize, depot and slab-layer locks */
	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		lck_mtx_destroy(&skm->skm_cpu_cache[cpuid].cp_lock,
		    &skmem_cpu_lock_grp);
	}
	lck_mtx_destroy(&skm->skm_rs_lock, &skmem_cpu_lock_grp);
	lck_mtx_destroy(&skm->skm_dp_lock, &skmem_dp_lock_grp);
	lck_mtx_destroy(&skm->skm_sl_lock, &skmem_sl_lock_grp);

	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx",
	    skm->skm_name, SK_KVA(skm));

	/* callee releases reference */
	skmem_region_slab_config(skm->skm_region, NULL);
	skm->skm_region = NULL;

#if KASAN
	/*
	 * Get the original address since we're about to free it; the
	 * cache structure was placed at an aligned offset past the
	 * raw zone element, with the raw pointer stashed just before.
	 */
	void **pbuf = (void **)((intptr_t)skm - sizeof(void *));
	skm = *pbuf;
#endif /* KASAN */

	zfree(skm_zone, skm);
}
923
924 /*
925 * Create a slab.
926 */
927 static struct skmem_slab *
skmem_slab_create(struct skmem_cache * skm,uint32_t skmflag)928 skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
929 {
930 struct skmem_region *skr = skm->skm_region;
931 uint32_t objsize, chunks;
932 size_t slabsize = skm->skm_slabsize;
933 struct skmem_slab *sl;
934 struct sksegment *sg, *sgm;
935 char *buf, *bufm, *slab, *slabm;
936
937 /*
938 * Allocate a segment (a slab at our layer) from the region.
939 */
940 slab = skmem_region_alloc(skr, (void **)&slabm, &sg, &sgm, skmflag);
941 if (slab == NULL) {
942 goto rg_alloc_failure;
943 }
944
945 if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
946 goto slab_alloc_failure;
947 }
948
949 ASSERT(sg != NULL);
950 ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index);
951
952 bzero(sl, sizeof(*sl));
953 sl->sl_cache = skm;
954 sl->sl_base = buf = slab;
955 sl->sl_basem = bufm = slabm;
956 ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
957 objsize = (uint32_t)skr->skr_c_obj_size;
958 ASSERT(skm->skm_objsize == objsize);
959 ASSERT((slabsize / objsize) <= UINT32_MAX);
960 sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
961 sl->sl_seg = sg;
962 sl->sl_segm = sgm;
963
964 /*
965 * Create one or more buffer control structures for the slab,
966 * each one tracking a chunk of raw object from the segment,
967 * and insert these into the slab's list of buffer controls.
968 */
969 ASSERT(chunks > 0);
970 while (chunks != 0) {
971 struct skmem_bufctl *bc;
972
973 bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
974 if (bc == NULL) {
975 goto bufctl_alloc_failure;
976 }
977
978 bzero(bc, bc_size);
979 bc->bc_addr = buf;
980 bc->bc_addrm = bufm;
981 bc->bc_slab = sl;
982 bc->bc_idx = (sl->sl_chunks - chunks);
983 if (skr->skr_mode & SKR_MODE_SHAREOK) {
984 bc->bc_flags |= SKMEM_BUFCTL_SHAREOK;
985 }
986 SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
987 bc->bc_lim = objsize;
988 buf += objsize;
989 if (bufm != NULL) {
990 bufm += objsize;
991 }
992 --chunks;
993 }
994
995 SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
996 SK_KVA(skm), SK_KVA(sl));
997 SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
998 SK_KVA(slab), SK_KVA(slab + objsize));
999
1000 return sl;
1001
1002 bufctl_alloc_failure:
1003 skmem_slab_destroy(skm, sl);
1004
1005 slab_alloc_failure:
1006 skmem_region_free(skr, slab, slabm);
1007
1008 rg_alloc_failure:
1009 atomic_add_64(&skm->skm_sl_alloc_fail, 1);
1010
1011 return NULL;
1012 }
1013
1014 /*
1015 * Destroy a slab.
1016 */
1017 static void
skmem_slab_destroy(struct skmem_cache * skm,struct skmem_slab * sl)1018 skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl)
1019 {
1020 struct skmem_bufctl *bc, *tbc;
1021 void *slab = sl->sl_base;
1022 void *slabm = sl->sl_basem;
1023
1024 ASSERT(sl->sl_refcnt == 0);
1025
1026 SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
1027 SK_KVA(skm), SK_KVA(sl));
1028 SK_DF(SK_VERB_MEM_CACHE, " [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
1029 SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));
1030
1031 /*
1032 * Go through the slab's list of buffer controls and free
1033 * them, and then free the slab itself back to its cache.
1034 */
1035 SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
1036 SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
1037 skmem_cache_free(skmem_bufctl_cache, bc);
1038 }
1039 skmem_cache_free(skmem_slab_cache, sl);
1040
1041 /* and finally free the segment back to the backing region */
1042 skmem_region_free(skm->skm_region, slab, slabm);
1043 }
1044
/*
 * Allocate a raw object from the (locked) slab layer.  Normal region variant.
 *
 * Returns 0 with the master object's info filled into *oi (and the
 * slave's into *oim, if non-NULL), or ENOMEM in non-sleeping mode when
 * no slab can be created.  In sleeping mode this either succeeds or
 * panics once the backoff budget is exhausted.  Called with the slab
 * lock held; the lock is dropped and reacquired around slab creation.
 */
static int
skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi,
    struct skmem_obj_info *oim, uint32_t skmflag)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;
	uint32_t retries = 0;
	uint64_t boff_total = 0; /* in usec */
	uint64_t boff = 0; /* in msec */
	boolean_t new_slab;
	void *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	/*
	 * A slab is either in a partially-allocated list (at least it has
	 * a free object available), or is in the empty list (everything
	 * has been allocated.)  If we can't find a partially-allocated
	 * slab, then we need to allocate a slab (segment) from the region.
	 */
again:
	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
	if (sl == NULL) {
		uint32_t flags = skmflag;
		boolean_t retry;

		ASSERT(skm->skm_sl_partial == 0);
		SKM_SLAB_UNLOCK(skm);
		if (!(flags & SKMEM_NOSLEEP)) {
			/*
			 * Pick up a random value to start the exponential
			 * backoff, if this is the first round, or if the
			 * current value is over the threshold.  Otherwise,
			 * double the backoff value.
			 */
			if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) {
				read_frandom(&boff, sizeof(boff));
				boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1;
				ASSERT(boff > 0);
			} else if (os_mul_overflow(boff, 2, &boff)) {
				panic_plain("\"%s\": boff counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/*
			 * Add this value (in msec) to the total (in usec).
			 * NOTE(review): NSEC_PER_USEC (1000) is being used
			 * here as the msec-to-usec conversion factor; the
			 * constant's name is misleading but its value fits.
			 */
			if (os_add_overflow(boff_total,
			    (boff * NSEC_PER_USEC), &boff_total)) {
				panic_plain("\"%s\": boff_total counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
		}
		/*
		 * In the event of a race between multiple threads trying
		 * to create the last remaining (or the only) slab, let the
		 * loser(s) attempt to retry after waiting a bit.  The winner
		 * would have inserted the newly-created slab into the list.
		 */
		if (!(flags & SKMEM_NOSLEEP) &&
		    boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
			retry = TRUE;
			++retries;
			flags |= SKMEM_FAILOK;
		} else {
			if (!(flags & SKMEM_NOSLEEP)) {
				/* sleeping mode ran out of backoff budget */
				panic_plain("\"%s\": failed to allocate "
				    "slab (sleeping mode) after %llu "
				    "msec, %u retries\n\n%s", skm->skm_name,
				    (boff_total / NSEC_PER_USEC), retries,
				    skmem_dump(skm->skm_region));
				/* NOTREACHED */
				__builtin_unreachable();
			}
			retry = FALSE;
		}

		/*
		 * Create a new slab.
		 */
		if ((sl = skmem_slab_create(skm, flags)) == NULL) {
			if (retry) {
				SK_ERR("\"%s\": failed to allocate "
				    "slab (%ssleeping mode): waiting for %llu "
				    "msec, total %llu msec, %u retries",
				    skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "",
				    boff, (boff_total / NSEC_PER_USEC), retries);
				VERIFY(boff > 0 && ((uint32_t)boff <=
				    (SKMEM_SLAB_BACKOFF_THRES * 2)));
				/* wait boff msec (expressed in usec) */
				delay((uint32_t)boff * NSEC_PER_USEC);
				SKM_SLAB_LOCK(skm);
				goto again;
			} else {
				SK_RDERR(4, "\"%s\": failed to allocate slab "
				    "(%ssleeping mode)", skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "");
				SKM_SLAB_LOCK(skm);
			}
			return ENOMEM;
		}

		SKM_SLAB_LOCK(skm);
		skm->skm_sl_create++;
		/* track the high watermark of objects in use */
		if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
		    skm->skm_sl_bufmax) {
			skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
		}
	}
	skm->skm_sl_alloc++;

	new_slab = (sl->sl_refcnt == 0);
	ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl));

	sl->sl_refcnt++;
	ASSERT(sl->sl_refcnt <= sl->sl_chunks);

	/*
	 * We either have a new slab, or a partially-allocated one.
	 * Remove a buffer control from the slab, and insert it to
	 * the allocated-address hash chain.
	 */
	bc = SLIST_FIRST(&sl->sl_head);
	ASSERT(bc != NULL);
	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);

	/* sanity check */
	VERIFY(bc->bc_usecnt == 0);

	/*
	 * Also store the master object's region info for the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf = bc->bc_addr;
	SKMEM_OBJ_BUFCTL(oi) = bc;      /* master only; NULL for slave */
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
	SKMEM_OBJ_IDX_REG(oi) =
	    ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
	/*
	 * And for slave object.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			SKMEM_OBJ_ADDR(oim) = bc->bc_addrm;
			SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
			SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
			SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
		}
	}

	/* in batching mode the object doubles as a list node; clear it */
	if (skm->skm_mode & SKM_MODE_BATCH) {
		((struct skmem_obj *)buf)->mo_next = NULL;
	}

	/* insert to allocated-address hash chain */
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);

	if (SLIST_EMPTY(&sl->sl_head)) {
		/*
		 * If that was the last buffer control from this slab,
		 * insert the slab into the empty list.  If it was in
		 * the partially-allocated list, then remove the slab
		 * from there as well.
		 */
		ASSERT(sl->sl_refcnt == sl->sl_chunks);
		if (new_slab) {
			/* a fresh single-chunk slab was never on any list */
			ASSERT(sl->sl_chunks == 1);
		} else {
			ASSERT(sl->sl_chunks > 1);
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		skm->skm_sl_empty++;
		ASSERT(skm->skm_sl_empty != 0);
		TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
	} else {
		/*
		 * The slab is not empty; if it was newly allocated
		 * above, then it's not in the partially-allocated
		 * list and so we insert it there.
		 */
		ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
		if (new_slab) {
			skm->skm_sl_partial++;
			ASSERT(skm->skm_sl_partial != 0);
			TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
			    sl, sl_link);
		}
	}

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	return 0;
}
1254
/*
 * Allocate a raw object from the (locked) slab layer.  Pseudo region variant.
 *
 * The pseudo region is backed by a zone (skr_zreg) rather than by
 * segments, so allocation is a straight zalloc.  Returns 0 with the
 * object's info filled into *oi (*oim is zeroed if supplied, since
 * mirrored regions don't apply here), or ENOMEM on zone exhaustion in
 * non-sleeping mode.
 */
static int
skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag)
{
	zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
	struct skmem_region *skr = skm->skm_region;
	void *obj, *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
	/* mirrored region is not applicable */
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	/* batching is not yet supported */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));

	if ((obj = zalloc_flags(skr->skr_zreg, zflags | Z_ZERO)) == NULL) {
		atomic_add_64(&skm->skm_sl_alloc_fail, 1);
		return ENOMEM;
	}

#if KASAN
	/*
	 * Perform some fix-ups since the zone element isn't guaranteed
	 * to be on the aligned boundary.  The effective object size
	 * has been adjusted accordingly by skmem_region_create() earlier
	 * at cache creation time.
	 *
	 * 'buf' gets the aligned address for this object.
	 */
	buf = (void *)P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t),
	    skm->skm_bufalign);

	/*
	 * Wind back a pointer size from the aligned address and
	 * save the original address so we can free it later.
	 */
	void **pbuf = (void **)((intptr_t)buf - sizeof(void *));
	*pbuf = obj;

	/* the aligned object must fit within the raw zone element */
	VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
	    ((intptr_t)obj + skm->skm_objsize));
#else /* !KASAN */
	/*
	 * We expect that the zone allocator would allocate elements
	 * rounded up to the requested alignment based on the effective
	 * object size computed in skmem_region_create() earlier, and
	 * 'buf' is therefore the element address itself.
	 */
	buf = obj;
#endif /* !KASAN */

	/* make sure the object is aligned */
	VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));

	/*
	 * Return the object's info to the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf;
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
	}

	/* update slab-layer statistics and the in-use high watermark */
	skm->skm_sl_alloc++;
	skm->skm_sl_bufinuse++;
	if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
		skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
	}

	return 0;
}
1335
1336 /*
1337 * Allocate a raw object from the slab layer.
1338 */
1339 static int
skmem_slab_alloc(struct skmem_cache * skm,struct skmem_obj_info * oi,struct skmem_obj_info * oim,uint32_t skmflag)1340 skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi,
1341 struct skmem_obj_info *oim, uint32_t skmflag)
1342 {
1343 int err;
1344
1345 SKM_SLAB_LOCK(skm);
1346 err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
1347 SKM_SLAB_UNLOCK(skm);
1348
1349 return err;
1350 }
1351
1352 /*
1353 * Allocate raw object(s) from the slab layer.
1354 */
1355 static uint32_t
skmem_slab_batch_alloc(struct skmem_cache * skm,struct skmem_obj ** list,uint32_t num,uint32_t skmflag)1356 skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
1357 uint32_t num, uint32_t skmflag)
1358 {
1359 uint32_t need = num;
1360
1361 ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
1362 *list = NULL;
1363
1364 SKM_SLAB_LOCK(skm);
1365 for (;;) {
1366 struct skmem_obj_info oi, oim;
1367
1368 /*
1369 * Get a single raw object from the slab layer.
1370 */
1371 if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
1372 break;
1373 }
1374
1375 *list = SKMEM_OBJ_ADDR(&oi);
1376 ASSERT((*list)->mo_next == NULL);
1377 /* store these inside the object itself */
1378 (*list)->mo_info = oi;
1379 (*list)->mo_minfo = oim;
1380 list = &(*list)->mo_next;
1381
1382 ASSERT(need != 0);
1383 if (--need == 0) {
1384 break;
1385 }
1386 }
1387 SKM_SLAB_UNLOCK(skm);
1388
1389 return num - need;
1390 }
1391
/*
 * Free a raw object to the (locked) slab layer.  Normal region variant.
 *
 * Looks up the object's buffer control in the allocated-address hash
 * (panicking on a bogus or double-freed address), returns the bufctl to
 * its slab, and migrates the slab between the empty/partial lists or
 * destroys it outright when the last object comes back.  Called with
 * the slab lock held; the lock is dropped temporarily around
 * skmem_slab_destroy().
 */
static void
skmem_slab_free_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_bufctl *bc, *tbc;
	struct skmem_bufctl_bkt *bcb;
	struct skmem_slab *sl = NULL;

	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	ASSERT(buf != NULL);
	/* caller is expected to clear mo_next */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
	    ((struct skmem_obj *)buf)->mo_next == NULL);

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address.  If found, remove the buffer control from
	 * the hash chain and insert it into the freelist.  Otherwise, we
	 * panic since the caller has given us a bogus address.
	 */
	skm->skm_sl_free++;
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (bc->bc_addr == buf) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}

	if (bc == NULL) {
		panic("%s: attempt to free invalid or already-freed obj %p "
		    "on skm %p", __func__, buf, skm);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(sl != NULL && sl->sl_cache == skm);
	VERIFY(SKMEM_SLAB_MEMBER(sl, buf));

	/* make sure this object is not currently in use by another object */
	VERIFY(bc->bc_usecnt == 0);

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	/* if clear on free is requested, zero out the object */
	if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
		bzero(buf, skm->skm_objsize);
	}

	/* insert the buffer control to the slab's freelist */
	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);

	ASSERT(sl->sl_refcnt >= 1);
	if (--sl->sl_refcnt == 0) {
		/*
		 * If this was the last outstanding object for the slab,
		 * remove the slab from the partially-allocated or empty
		 * list, and destroy the slab (segment) back to the region.
		 * (A single-chunk slab lives on the empty list while its
		 * one object is outstanding; multi-chunk slabs with a
		 * free object are on the partial list.)
		 */
		if (sl->sl_chunks == 1) {
			ASSERT(skm->skm_sl_empty > 0);
			skm->skm_sl_empty--;
			TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		} else {
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0);
		skm->skm_sl_bufinuse -= sl->sl_chunks;
		skm->skm_sl_destroy++;
		/* drop the lock across the (potentially long) teardown */
		SKM_SLAB_UNLOCK(skm);
		skmem_slab_destroy(skm, sl);
		SKM_SLAB_LOCK(skm);
		return;
	}

	ASSERT(bc == SLIST_FIRST(&sl->sl_head));
	if (SLIST_NEXT(bc, bc_link) == NULL) {
		/*
		 * If this is the first (potentially amongst many) object
		 * that's returned to the slab, remove the slab from the
		 * empty list and insert to end of the partially-allocated
		 * list.  This should help avoid thrashing the partial slab
		 * since we avoid disturbing what's already at the front.
		 */
		ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1));
		ASSERT(sl->sl_chunks > 1);
		ASSERT(skm->skm_sl_empty > 0);
		skm->skm_sl_empty--;
		TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		skm->skm_sl_partial++;
		ASSERT(skm->skm_sl_partial != 0);
		TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
	}
}
1493
/*
 * Free a raw object to the (locked) slab layer.  Pseudo region variant.
 *
 * The pseudo region is zone-backed, so this is a straight zfree plus
 * statistics bookkeeping.  Under KASAN the original (unaligned) zone
 * element address is recovered from just before the buffer.
 */
static void
skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_region *skr = skm->skm_region;
	void *obj = buf;

	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));

#if KASAN
	/*
	 * Since we stuffed the original zone element address before
	 * the buffer address in KASAN mode, get it back since we're
	 * about to free it.
	 */
	void **pbuf = (void **)((intptr_t)obj - sizeof(void *));

	/* sanity: the aligned object fits within the raw element */
	VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
	    ((intptr_t)*pbuf + skm->skm_objsize));

	obj = *pbuf;
#endif /* KASAN */

	/* free it to zone */
	zfree(skr->skr_zreg, obj);

	skm->skm_sl_free++;
	ASSERT(skm->skm_sl_bufinuse > 0);
	skm->skm_sl_bufinuse--;
}
1530
1531 /*
1532 * Free a raw object to the slab layer.
1533 */
1534 static void
skmem_slab_free(struct skmem_cache * skm,void * buf)1535 skmem_slab_free(struct skmem_cache *skm, void *buf)
1536 {
1537 if (skm->skm_mode & SKM_MODE_BATCH) {
1538 ((struct skmem_obj *)buf)->mo_next = NULL;
1539 }
1540
1541 SKM_SLAB_LOCK(skm);
1542 skm->skm_slab_free(skm, buf);
1543 SKM_SLAB_UNLOCK(skm);
1544 }
1545
1546 /*
1547 * Free raw object(s) to the slab layer.
1548 */
1549 static void
skmem_slab_batch_free(struct skmem_cache * skm,struct skmem_obj * list)1550 skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
1551 {
1552 struct skmem_obj *listn;
1553
1554 ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
1555
1556 SKM_SLAB_LOCK(skm);
1557 for (;;) {
1558 listn = list->mo_next;
1559 list->mo_next = NULL;
1560
1561 /*
1562 * Free a single object to the slab layer.
1563 */
1564 skm->skm_slab_free(skm, (void *)list);
1565
1566 /* if no more objects to free, we're done */
1567 if ((list = listn) == NULL) {
1568 break;
1569 }
1570 }
1571 SKM_SLAB_UNLOCK(skm);
1572 }
1573
1574 /*
1575 * Return the object's region info.
1576 */
1577 void
skmem_cache_get_obj_info(struct skmem_cache * skm,void * buf,struct skmem_obj_info * oi,struct skmem_obj_info * oim)1578 skmem_cache_get_obj_info(struct skmem_cache *skm, void *buf,
1579 struct skmem_obj_info *oi, struct skmem_obj_info *oim)
1580 {
1581 struct skmem_bufctl_bkt *bcb;
1582 struct skmem_bufctl *bc;
1583 struct skmem_slab *sl;
1584
1585 /*
1586 * Search the hash chain to find a matching buffer control for the
1587 * given object address. If not found, panic since the caller has
1588 * given us a bogus address.
1589 */
1590 SKM_SLAB_LOCK(skm);
1591 bcb = SKMEM_CACHE_HASH(skm, buf);
1592 SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
1593 if (bc->bc_addr == buf) {
1594 break;
1595 }
1596 }
1597
1598 if (__improbable(bc == NULL)) {
1599 panic("%s: %s failed to get object info for %p",
1600 __func__, skm->skm_name, buf);
1601 /* NOTREACHED */
1602 __builtin_unreachable();
1603 }
1604
1605 /*
1606 * Return the master object's info to the caller.
1607 */
1608 sl = bc->bc_slab;
1609 SKMEM_OBJ_ADDR(oi) = bc->bc_addr;
1610 SKMEM_OBJ_BUFCTL(oi) = bc; /* master only; NULL for slave */
1611 ASSERT(skm->skm_objsize <= UINT32_MAX);
1612 SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
1613 SKMEM_OBJ_IDX_REG(oi) =
1614 (sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx;
1615 SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
1616 /*
1617 * And for slave object.
1618 */
1619 if (oim != NULL) {
1620 bzero(oim, sizeof(*oim));
1621 if (bc->bc_addrm != NULL) {
1622 SKMEM_OBJ_ADDR(oim) = bc->bc_addrm;
1623 SKMEM_OBJ_SIZE(oim) = oi->oi_size;
1624 SKMEM_OBJ_IDX_REG(oim) = oi->oi_idx_reg;
1625 SKMEM_OBJ_IDX_SEG(oim) = oi->oi_idx_seg;
1626 }
1627 }
1628 SKM_SLAB_UNLOCK(skm);
1629 }
1630
1631 /*
1632 * Magazine constructor.
1633 */
1634 static int
skmem_magazine_ctor(struct skmem_obj_info * oi,struct skmem_obj_info * oim,void * arg,uint32_t skmflag)1635 skmem_magazine_ctor(struct skmem_obj_info *oi, struct skmem_obj_info *oim,
1636 void *arg, uint32_t skmflag)
1637 {
1638 #pragma unused(oim, skmflag)
1639 struct skmem_mag *mg = SKMEM_OBJ_ADDR(oi);
1640
1641 ASSERT(oim == NULL);
1642 ASSERT(arg != NULL);
1643
1644 /*
1645 * Store it in the magazine object since we'll
1646 * need to refer to it during magazine destroy;
1647 * we can't safely refer to skm_magtype as the
1648 * depot lock may not be acquired then.
1649 */
1650 mg->mg_magtype = arg;
1651
1652 return 0;
1653 }
1654
/*
 * Destroy a magazine (free each object to the slab layer).
 *
 * 'nrounds' is the number of objects held in the magazine (the
 * magazine type's full size for full magazines, 0 for empty ones).
 * Each object is run through the cache's destructor first; the raw
 * objects are then returned to the slab layer, and finally the
 * magazine itself goes back to its magazine-type cache.
 */
static void
skmem_magazine_destroy(struct skmem_cache *skm, struct skmem_mag *mg,
    int nrounds)
{
	int round;

	for (round = 0; round < nrounds; round++) {
		void *buf = mg->mg_round[round];
		/* only set/used when SKM_MODE_BATCH is on */
		struct skmem_obj *next;

		if (skm->skm_mode & SKM_MODE_BATCH) {
			/* unlink so the dtor sees a detached object */
			next = ((struct skmem_obj *)buf)->mo_next;
			((struct skmem_obj *)buf)->mo_next = NULL;
		}

		/* deconstruct the object */
		if (skm->skm_dtor != NULL) {
			skm->skm_dtor(buf, skm->skm_private);
		}

		/*
		 * In non-batching mode, each object in the magazine has
		 * no linkage to its neighbor, so free individual object
		 * to the slab layer now.  In batching mode, restore the
		 * linkage instead; the whole chain is freed below.
		 */
		if (!(skm->skm_mode & SKM_MODE_BATCH)) {
			skmem_slab_free(skm, buf);
		} else {
			((struct skmem_obj *)buf)->mo_next = next;
		}
	}

	/*
	 * In batching mode, each object is linked to its neighbor at free
	 * time, and so take the bottom-most object and free it to the slab
	 * layer.  Because of the way the list is reversed during free, this
	 * will bring along the rest of objects above it.
	 */
	if (nrounds > 0 && (skm->skm_mode & SKM_MODE_BATCH)) {
		skmem_slab_batch_free(skm, mg->mg_round[nrounds - 1]);
	}

	/* free the magazine itself back to cache */
	skmem_cache_free(mg->mg_magtype->mt_cache, mg);
}
1703
1704 /*
1705 * Get one or more magazines from the depot.
1706 */
1707 static uint32_t
skmem_depot_batch_alloc(struct skmem_cache * skm,struct skmem_maglist * ml,uint32_t * count,struct skmem_mag ** list,uint32_t num)1708 skmem_depot_batch_alloc(struct skmem_cache *skm, struct skmem_maglist *ml,
1709 uint32_t *count, struct skmem_mag **list, uint32_t num)
1710 {
1711 SLIST_HEAD(, skmem_mag) mg_list = SLIST_HEAD_INITIALIZER(mg_list);
1712 struct skmem_mag *mg;
1713 uint32_t need = num, c = 0;
1714
1715 ASSERT(list != NULL && need > 0);
1716
1717 if (!SKM_DEPOT_LOCK_TRY(skm)) {
1718 /*
1719 * Track the amount of lock contention here; if the contention
1720 * level is high (more than skmem_cache_depot_contention per a
1721 * given skmem_cache_update_interval interval), then we treat
1722 * it as a sign that the per-CPU layer is not using the right
1723 * magazine type, and that we'd need to resize it.
1724 */
1725 SKM_DEPOT_LOCK(skm);
1726 if (skm->skm_mode & SKM_MODE_DYNAMIC) {
1727 skm->skm_depot_contention++;
1728 }
1729 }
1730
1731 while ((mg = SLIST_FIRST(&ml->ml_list)) != NULL) {
1732 SLIST_REMOVE_HEAD(&ml->ml_list, mg_link);
1733 SLIST_INSERT_HEAD(&mg_list, mg, mg_link);
1734 ASSERT(ml->ml_total != 0);
1735 if (--ml->ml_total < ml->ml_min) {
1736 ml->ml_min = ml->ml_total;
1737 }
1738 c++;
1739 ml->ml_alloc++;
1740 if (--need == 0) {
1741 break;
1742 }
1743 }
1744 *count -= c;
1745
1746 SKM_DEPOT_UNLOCK(skm);
1747
1748 *list = SLIST_FIRST(&mg_list);
1749
1750 return num - need;
1751 }
1752
1753 /*
1754 * Return one or more magazines to the depot.
1755 */
1756 static void
skmem_depot_batch_free(struct skmem_cache * skm,struct skmem_maglist * ml,uint32_t * count,struct skmem_mag * mg)1757 skmem_depot_batch_free(struct skmem_cache *skm, struct skmem_maglist *ml,
1758 uint32_t *count, struct skmem_mag *mg)
1759 {
1760 struct skmem_mag *nmg;
1761 uint32_t c = 0;
1762
1763 SKM_DEPOT_LOCK(skm);
1764 while (mg != NULL) {
1765 nmg = SLIST_NEXT(mg, mg_link);
1766 SLIST_INSERT_HEAD(&ml->ml_list, mg, mg_link);
1767 ml->ml_total++;
1768 c++;
1769 mg = nmg;
1770 }
1771 *count += c;
1772 SKM_DEPOT_UNLOCK(skm);
1773 }
1774
1775 /*
1776 * Update the depot's working state statistics.
1777 */
1778 static void
skmem_depot_ws_update(struct skmem_cache * skm)1779 skmem_depot_ws_update(struct skmem_cache *skm)
1780 {
1781 SKM_DEPOT_LOCK_SPIN(skm);
1782 skm->skm_full.ml_reaplimit = skm->skm_full.ml_min;
1783 skm->skm_full.ml_min = skm->skm_full.ml_total;
1784 skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_min;
1785 skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1786 SKM_DEPOT_UNLOCK(skm);
1787 }
1788
1789 /*
1790 * Empty the depot's working state statistics (everything's reapable.)
1791 */
1792 static void
skmem_depot_ws_zero(struct skmem_cache * skm)1793 skmem_depot_ws_zero(struct skmem_cache *skm)
1794 {
1795 SKM_DEPOT_LOCK_SPIN(skm);
1796 if (skm->skm_full.ml_reaplimit != skm->skm_full.ml_total ||
1797 skm->skm_full.ml_min != skm->skm_full.ml_total ||
1798 skm->skm_empty.ml_reaplimit != skm->skm_empty.ml_total ||
1799 skm->skm_empty.ml_min != skm->skm_empty.ml_total) {
1800 skm->skm_full.ml_reaplimit = skm->skm_full.ml_total;
1801 skm->skm_full.ml_min = skm->skm_full.ml_total;
1802 skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_total;
1803 skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1804 skm->skm_depot_ws_zero++;
1805 }
1806 SKM_DEPOT_UNLOCK(skm);
1807 }
1808
1809 /*
1810 * Reap magazines that's outside of the working set.
1811 */
1812 static void
skmem_depot_ws_reap(struct skmem_cache * skm)1813 skmem_depot_ws_reap(struct skmem_cache *skm)
1814 {
1815 struct skmem_mag *mg, *nmg;
1816 uint32_t f, e, reap;
1817
1818 reap = f = MIN(skm->skm_full.ml_reaplimit, skm->skm_full.ml_min);
1819 if (reap != 0) {
1820 (void) skmem_depot_batch_alloc(skm, &skm->skm_full,
1821 &skm->skm_depot_full, &mg, reap);
1822 while (mg != NULL) {
1823 nmg = SLIST_NEXT(mg, mg_link);
1824 SLIST_NEXT(mg, mg_link) = NULL;
1825 skmem_magazine_destroy(skm, mg,
1826 mg->mg_magtype->mt_magsize);
1827 mg = nmg;
1828 }
1829 }
1830
1831 reap = e = MIN(skm->skm_empty.ml_reaplimit, skm->skm_empty.ml_min);
1832 if (reap != 0) {
1833 (void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
1834 &skm->skm_depot_empty, &mg, reap);
1835 while (mg != NULL) {
1836 nmg = SLIST_NEXT(mg, mg_link);
1837 SLIST_NEXT(mg, mg_link) = NULL;
1838 skmem_magazine_destroy(skm, mg, 0);
1839 mg = nmg;
1840 }
1841 }
1842
1843 if (f != 0 || e != 0) {
1844 atomic_add_32(&skm->skm_cpu_mag_reap, 1);
1845 }
1846 }
1847
/*
 * Performs periodic maintenance on a cache.  This is serialized
 * through the update thread call, and so we guarantee there's at
 * most one update episode in the system at any given time.
 *
 * Two maintenance actions are decided here and performed at the end,
 * after all locks are dropped: rescaling the allocated-address hash
 * table when the object population has drifted far from the table
 * size, and growing the per-CPU magazine size when depot lock
 * contention over the last interval exceeded the threshold.
 */
static void
skmem_cache_update(struct skmem_cache *skm, uint32_t arg)
{
#pragma unused(arg)
	boolean_t resize_mag = FALSE;
	boolean_t rescale_hash = FALSE;

	SKMEM_CACHE_LOCK_ASSERT_HELD();

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());

	/*
	 * If the cache has become much larger or smaller than the
	 * allocated-address hash table, rescale the hash table.
	 * (More than 2x objects per bucket, or fewer than 1 object
	 * per 2 buckets, within the [initial, limit] bounds.)
	 */
	SKM_SLAB_LOCK(skm);
	if ((skm->skm_sl_bufinuse > (skm->skm_hash_mask << 1) &&
	    (skm->skm_hash_mask + 1) < skm->skm_hash_limit) ||
	    (skm->skm_sl_bufinuse < (skm->skm_hash_mask >> 1) &&
	    skm->skm_hash_mask > skm->skm_hash_initial)) {
		rescale_hash = TRUE;
	}
	SKM_SLAB_UNLOCK(skm);

	/*
	 * Update the working set.
	 */
	skmem_depot_ws_update(skm);

	/*
	 * If the contention count is greater than the threshold during
	 * the update interval, and if we are not already at the maximum
	 * magazine size, increase it.
	 */
	SKM_DEPOT_LOCK_SPIN(skm);
	if (skm->skm_chunksize < skm->skm_magtype->mt_maxbuf &&
	    (int)(skm->skm_depot_contention - skm->skm_depot_contention_prev) >
	    skmem_cache_depot_contention) {
		/* only dynamic-mode caches track contention */
		ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);
		resize_mag = TRUE;
	}
	/* snapshot for the next interval's delta */
	skm->skm_depot_contention_prev = skm->skm_depot_contention;
	SKM_DEPOT_UNLOCK(skm);

	if (rescale_hash) {
		skmem_cache_hash_rescale(skm);
	}

	if (resize_mag) {
		skmem_cache_magazine_resize(skm);
	}
}
1906
1907 /*
1908 * Reload the CPU's magazines with mg and its follower (if any).
1909 */
1910 static void
skmem_cpu_batch_reload(struct skmem_cpu_cache * cp,struct skmem_mag * mg,int rounds)1911 skmem_cpu_batch_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg,
1912 int rounds)
1913 {
1914 ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
1915 (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1916 ASSERT(cp->cp_magsize > 0);
1917
1918 cp->cp_loaded = mg;
1919 cp->cp_rounds = rounds;
1920 if (__probable(SLIST_NEXT(mg, mg_link) != NULL)) {
1921 cp->cp_ploaded = SLIST_NEXT(mg, mg_link);
1922 cp->cp_prounds = rounds;
1923 SLIST_NEXT(mg, mg_link) = NULL;
1924 } else {
1925 ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
1926 cp->cp_ploaded = NULL;
1927 cp->cp_prounds = -1;
1928 }
1929 }
1930
1931 /*
1932 * Reload the CPU's magazine with mg and save the previous one.
1933 */
1934 static void
skmem_cpu_reload(struct skmem_cpu_cache * cp,struct skmem_mag * mg,int rounds)1935 skmem_cpu_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg, int rounds)
1936 {
1937 ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
1938 (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1939 ASSERT(cp->cp_magsize > 0);
1940
1941 cp->cp_ploaded = cp->cp_loaded;
1942 cp->cp_prounds = cp->cp_rounds;
1943 cp->cp_loaded = mg;
1944 cp->cp_rounds = rounds;
1945 }
1946
1947 /*
1948 * Allocate a constructed object from the cache.
1949 */
1950 void *
skmem_cache_alloc(struct skmem_cache * skm,uint32_t skmflag)1951 skmem_cache_alloc(struct skmem_cache *skm, uint32_t skmflag)
1952 {
1953 struct skmem_obj *buf;
1954
1955 (void) skmem_cache_batch_alloc(skm, &buf, 1, skmflag);
1956 return buf;
1957 }
1958
1959 /*
1960 * Allocate constructed object(s) from the cache.
1961 */
/*
 * Allocate constructed object(s) from the cache.
 *
 * Returns the number of objects actually allocated, which may be less
 * than "num" on allocation failure; the objects are stored in *list
 * (chained via mo_next when SKM_MODE_BATCH is set).  Allocation is
 * attempted in order from the CPU layer (loaded, then previously
 * loaded magazine), then the depot layer, and finally the slab layer.
 */
uint32_t
skmem_cache_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
    uint32_t num, uint32_t skmflag)
{
	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
	struct skmem_obj **top = &(*list);
	struct skmem_mag *mg;
	uint32_t need = num;

	ASSERT(list != NULL);
	*list = NULL;

	if (need == 0) {
		return 0;
	}
	/* batching (num > 1) requires SKM_MODE_BATCH */
	ASSERT(need == 1 || (skm->skm_mode & SKM_MODE_BATCH));

	SKM_CPU_LOCK(cp);
	for (;;) {
		/*
		 * If we have an object in the current CPU's loaded
		 * magazine, return it and we're done.
		 */
		if (cp->cp_rounds > 0) {
			int objs = MIN((unsigned int)cp->cp_rounds, need);
			/*
			 * In the SKM_MODE_BATCH case, objects in are already
			 * linked together with the most recently freed object
			 * at the head of the list; grab as many objects as we
			 * can.  Otherwise we'll just grab 1 object at most.
			 */
			*list = cp->cp_loaded->mg_round[cp->cp_rounds - 1];
			cp->cp_rounds -= objs;
			cp->cp_alloc += objs;

			if (skm->skm_mode & SKM_MODE_BATCH) {
				/* terminate the chain after the last taken obj */
				struct skmem_obj *tail =
				    cp->cp_loaded->mg_round[cp->cp_rounds];
				list = &tail->mo_next;
				*list = NULL;
			}

			/* if we got them all, return to caller */
			if ((need -= objs) == 0) {
				SKM_CPU_UNLOCK(cp);
				goto done;
			}
		}

		/*
		 * The CPU's loaded magazine is empty.  If the previously
		 * loaded magazine was full, exchange and try again.
		 */
		if (cp->cp_prounds > 0) {
			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
			continue;
		}

		/*
		 * If the magazine layer is disabled, allocate from slab.
		 * This can happen either because SKM_MODE_NOMAGAZINES is
		 * set, or because we are resizing the magazine now.
		 */
		if (cp->cp_magsize == 0) {
			break;
		}

		/*
		 * Both of the CPU's magazines are empty; try to get
		 * full magazine(s) from the depot layer.  Upon success,
		 * reload and try again.  To prevent potential thrashing,
		 * replace both empty magazines only if the requested
		 * count exceeds a magazine's worth of objects.
		 */
		(void) skmem_depot_batch_alloc(skm, &skm->skm_full,
		    &skm->skm_depot_full, &mg, (need <= cp->cp_magsize) ? 1 : 2);
		if (mg != NULL) {
			/* empty magazines displaced by the reload(s) */
			SLIST_HEAD(, skmem_mag) mg_list =
			    SLIST_HEAD_INITIALIZER(mg_list);

			if (cp->cp_ploaded != NULL) {
				SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
				    mg_link);
			}
			if (SLIST_NEXT(mg, mg_link) == NULL) {
				/*
				 * Depot allocation returns only 1 magazine;
				 * retain current empty magazine.
				 */
				skmem_cpu_reload(cp, mg, cp->cp_magsize);
			} else {
				/*
				 * We got 2 full magazines from depot;
				 * release the current empty magazine
				 * back to the depot layer.
				 */
				if (cp->cp_loaded != NULL) {
					SLIST_INSERT_HEAD(&mg_list,
					    cp->cp_loaded, mg_link);
				}
				skmem_cpu_batch_reload(cp, mg, cp->cp_magsize);
			}
			/* return the displaced empties to the depot */
			skmem_depot_batch_free(skm, &skm->skm_empty,
			    &skm->skm_depot_empty, SLIST_FIRST(&mg_list));
			continue;
		}

		/*
		 * The depot layer doesn't have any full magazines;
		 * allocate directly from the slab layer.
		 */
		break;
	}
	SKM_CPU_UNLOCK(cp);

	if (__probable(num > 1 && (skm->skm_mode & SKM_MODE_BATCH) != 0)) {
		struct skmem_obj *rtop, *rlist, *rlistp = NULL;
		uint32_t rlistc, c = 0;

		/*
		 * Get a list of raw objects from the slab layer.
		 */
		rlistc = skmem_slab_batch_alloc(skm, &rlist, need, skmflag);
		ASSERT(rlistc == 0 || rlist != NULL);
		rtop = rlist;

		/*
		 * Construct each object in the raw list.  Upon failure,
		 * free any remaining objects in the list back to the slab
		 * layer, and keep the ones that were successfully constructed.
		 * Here, "oi" and "oim" in each skmem_obj refer to the objects
		 * coming from the master and slave regions (on mirrored
		 * regions), respectively.  They are stored inside the object
		 * temporarily so that we can pass them to the constructor.
		 */
		while (skm->skm_ctor != NULL && rlist != NULL) {
			struct skmem_obj_info *oi = &rlist->mo_info;
			struct skmem_obj_info *oim = &rlist->mo_minfo;
			struct skmem_obj *rlistn = rlist->mo_next;

			/*
			 * Note that the constructor guarantees at least
			 * the size of a pointer at the top of the object
			 * and no more than that.  That means we must not
			 * refer to "oi" and "oim" any longer after the
			 * object goes thru the constructor.
			 */
			if (skm->skm_ctor(oi, ((SKMEM_OBJ_ADDR(oim) != NULL) ?
			    oim : NULL), skm->skm_private, skmflag) != 0) {
				/* ctor failed: return the unconstructed tail */
				VERIFY(rlist->mo_next == rlistn);
				atomic_add_64(&skm->skm_sl_alloc_fail,
				    rlistc - c);
				if (rlistp != NULL) {
					rlistp->mo_next = NULL;
				}
				if (rlist == rtop) {
					/* nothing was constructed at all */
					rtop = NULL;
					ASSERT(c == 0);
				}
				skmem_slab_batch_free(skm, rlist);
				rlist = NULL;
				rlistc = c;
				break;
			}
			VERIFY(rlist->mo_next == rlistn);

			++c;    /* # of constructed objs */
			rlistp = rlist;
			if ((rlist = rlist->mo_next) == NULL) {
				ASSERT(rlistc == c);
				break;
			}
		}

		/*
		 * At this point "top" points to the head of the chain we're
		 * going to return to caller; "list" points to the tail of that
		 * chain.  The second chain begins at "rtop", and we append
		 * that after "list" to form a single chain.  "rlistc" is the
		 * number of objects in "rtop" originated from the slab layer
		 * that have been successfully constructed (if applicable).
		 */
		ASSERT(c == 0 || rtop != NULL);
		need -= rlistc;
		*list = rtop;
	} else {
		struct skmem_obj_info oi, oim;
		void *buf;

		ASSERT(*top == NULL && num == 1 && need == 1);

		/*
		 * Get a single raw object from the slab layer.
		 */
		if (skmem_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
			goto done;
		}

		buf = SKMEM_OBJ_ADDR(&oi);
		ASSERT(buf != NULL);

		/*
		 * Construct the raw object.  Here, "oi" and "oim" refer to
		 * the objects coming from the master and slave regions (on
		 * mirrored regions), respectively.
		 */
		if (skm->skm_ctor != NULL &&
		    skm->skm_ctor(&oi, ((SKMEM_OBJ_ADDR(&oim) != NULL) ?
		    &oim : NULL), skm->skm_private, skmflag) != 0) {
			atomic_add_64(&skm->skm_sl_alloc_fail, 1);
			skmem_slab_free(skm, buf);
			goto done;
		}

		need = 0;
		*list = buf;
		ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
		    (*list)->mo_next == NULL);
	}

done:
	/* if auditing is enabled, record this transaction */
	if (__improbable(*top != NULL &&
	    (skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_buf(skm, *top);
	}

	return num - need;
}
2191
2192 /*
2193 * Free a constructed object to the cache.
2194 */
2195 void
skmem_cache_free(struct skmem_cache * skm,void * buf)2196 skmem_cache_free(struct skmem_cache *skm, void *buf)
2197 {
2198 if (skm->skm_mode & SKM_MODE_BATCH) {
2199 ((struct skmem_obj *)buf)->mo_next = NULL;
2200 }
2201 skmem_cache_batch_free(skm, (struct skmem_obj *)buf);
2202 }
2203
/*
 * Free constructed object(s) to the cache.  When SKM_MODE_BATCH is
 * set, "list" may be a chain of objects linked via mo_next.  Objects
 * go to the CPU layer (loaded, then previously loaded magazine), then
 * the depot layer; if no magazine space can be obtained, they are
 * deconstructed and returned to the slab layer.
 */
void
skmem_cache_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
{
	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
	struct skmem_magtype *mtp;
	struct skmem_mag *mg;
	struct skmem_obj *listn;

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_buf(skm, list);
	}

	SKM_CPU_LOCK(cp);
	for (;;) {
		/*
		 * If there's an available space in the current CPU's
		 * loaded magazine, place it there and we're done.
		 */
		if ((unsigned int)cp->cp_rounds <
		    (unsigned int)cp->cp_magsize) {
			/*
			 * In the SKM_MODE_BATCH case, reverse the list
			 * while we place each object into the magazine;
			 * this effectively causes the most recently
			 * freed object to be reused during allocation.
			 */
			if (skm->skm_mode & SKM_MODE_BATCH) {
				listn = list->mo_next;
				list->mo_next = (cp->cp_rounds == 0) ? NULL :
				    cp->cp_loaded->mg_round[cp->cp_rounds - 1];
			} else {
				listn = NULL;
			}

			cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
			cp->cp_free++;

			/* more objects in the chain; keep filling */
			if ((list = listn) != NULL) {
				continue;
			}

			SKM_CPU_UNLOCK(cp);
			return;
		}

		/*
		 * The loaded magazine is full.  If the previously
		 * loaded magazine was empty, exchange and try again.
		 */
		if (cp->cp_prounds == 0) {
			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
			continue;
		}

		/*
		 * If the magazine layer is disabled, free to slab.
		 * This can happen either because SKM_MODE_NOMAGAZINES
		 * is set, or because we are resizing the magazine now.
		 */
		if (cp->cp_magsize == 0) {
			break;
		}

		/*
		 * Both magazines for the CPU are full; try to get
		 * empty magazine(s) from the depot.  If we get one,
		 * exchange a full magazine with it and place the
		 * object in there.
		 *
		 * TODO: Because the caller currently doesn't indicate
		 * the number of objects in the list, we choose the more
		 * conservative approach of allocating only 1 empty
		 * magazine (to prevent potential thrashing).  Once we
		 * have the object count, we can replace 1 with similar
		 * logic as used in skmem_cache_batch_alloc().
		 */
		(void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
		    &skm->skm_depot_empty, &mg, 1);
		if (mg != NULL) {
			/* full magazines displaced by the reload(s) */
			SLIST_HEAD(, skmem_mag) mg_list =
			    SLIST_HEAD_INITIALIZER(mg_list);

			if (cp->cp_ploaded != NULL) {
				SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
				    mg_link);
			}
			if (SLIST_NEXT(mg, mg_link) == NULL) {
				/*
				 * Depot allocation returns only 1 magazine;
				 * retain current full magazine.
				 */
				skmem_cpu_reload(cp, mg, 0);
			} else {
				/*
				 * We got 2 empty magazines from depot;
				 * release the current full magazine back
				 * to the depot layer.
				 */
				if (cp->cp_loaded != NULL) {
					SLIST_INSERT_HEAD(&mg_list,
					    cp->cp_loaded, mg_link);
				}
				skmem_cpu_batch_reload(cp, mg, 0);
			}
			/* return the displaced full magazines to the depot */
			skmem_depot_batch_free(skm, &skm->skm_full,
			    &skm->skm_depot_full, SLIST_FIRST(&mg_list));
			continue;
		}

		/*
		 * We can't get any empty magazine from the depot, and
		 * so we need to allocate one.  If the allocation fails,
		 * just fall through, deconstruct and free the object
		 * to the slab layer.
		 */
		mtp = skm->skm_magtype;
		/* drop the CPU lock across the (possibly blocking) alloc */
		SKM_CPU_UNLOCK(cp);
		mg = skmem_cache_alloc(mtp->mt_cache, SKMEM_NOSLEEP);
		SKM_CPU_LOCK(cp);

		if (mg != NULL) {
			/*
			 * We allocated an empty magazine, but since we
			 * dropped the CPU lock above the magazine size
			 * may have changed.  If that's the case free
			 * the magazine and try again.
			 */
			if (cp->cp_magsize != mtp->mt_magsize) {
				SKM_CPU_UNLOCK(cp);
				skmem_cache_free(mtp->mt_cache, mg);
				SKM_CPU_LOCK(cp);
				continue;
			}

			/*
			 * We have a magazine with the right size;
			 * add it to the depot and try again.
			 */
			ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
			skmem_depot_batch_free(skm, &skm->skm_empty,
			    &skm->skm_depot_empty, mg);
			continue;
		}

		/*
		 * We can't get an empty magazine, so free to slab.
		 */
		break;
	}
	SKM_CPU_UNLOCK(cp);

	/*
	 * We weren't able to free the constructed object(s) to the
	 * magazine layer, so deconstruct them and free to the slab.
	 */
	if (__probable((skm->skm_mode & SKM_MODE_BATCH) &&
	    list->mo_next != NULL)) {
		/* whatever is left from original list */
		struct skmem_obj *top = list;

		/* NB: loop body runs only when a destructor exists */
		while (list != NULL && skm->skm_dtor != NULL) {
			listn = list->mo_next;
			list->mo_next = NULL;

			/* deconstruct the object */
			if (skm->skm_dtor != NULL) {
				skm->skm_dtor((void *)list, skm->skm_private);
			}

			/* restore the link so the batch free sees the chain */
			list->mo_next = listn;
			list = listn;
		}

		skmem_slab_batch_free(skm, top);
	} else {
		/* deconstruct the object */
		if (skm->skm_dtor != NULL) {
			skm->skm_dtor((void *)list, skm->skm_private);
		}

		skmem_slab_free(skm, (void *)list);
	}
}
2388
2389 /*
2390 * Return the maximum number of objects cached at the magazine layer
2391 * based on the chunk size. This takes into account the starting
2392 * magazine type as well as the final magazine type used in resizing.
2393 */
2394 uint32_t
skmem_cache_magazine_max(uint32_t chunksize)2395 skmem_cache_magazine_max(uint32_t chunksize)
2396 {
2397 struct skmem_magtype *mtp;
2398 uint32_t magsize_max;
2399
2400 VERIFY(ncpu != 0);
2401 VERIFY(chunksize > 0);
2402
2403 /* find a suitable magazine type for this chunk size */
2404 for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
2405 continue;
2406 }
2407
2408 /* and find the last magazine type */
2409 for (;;) {
2410 magsize_max = mtp->mt_magsize;
2411 if (mtp == skmem_cache_magsize_last ||
2412 chunksize >= mtp->mt_maxbuf) {
2413 break;
2414 }
2415 ++mtp;
2416 VERIFY(mtp <= skmem_cache_magsize_last);
2417 }
2418
2419 return ncpu * magsize_max * 2; /* two magazines per CPU */
2420 }
2421
2422 /*
2423 * Return true if SKMEM_DEBUG_NOMAGAZINES is not set on skmem_debug.
2424 */
2425 boolean_t
skmem_allow_magazines(void)2426 skmem_allow_magazines(void)
2427 {
2428 return !(skmem_debug & SKMEM_DEBUG_NOMAGAZINES);
2429 }
2430
2431 /*
2432 * Purge all magazines from a cache and disable its per-CPU magazines layer.
2433 */
/*
 * Purge all magazines from a cache and disable its per-CPU magazines layer.
 */
static void
skmem_cache_magazine_purge(struct skmem_cache *skm)
{
	struct skmem_cpu_cache *cp;
	struct skmem_mag *mg, *pmg;
	int rounds, prounds;
	uint32_t cpuid, mg_cnt = 0, pmg_cnt = 0;

	SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx", SK_KVA(skm));

	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		cp = &skm->skm_cpu_cache[cpuid];

		/*
		 * Detach both magazines under the CPU lock; setting
		 * cp_magsize to 0 disables the magazine layer for this
		 * CPU so alloc/free fall through to the slab layer.
		 * The magazines are destroyed after dropping the lock.
		 */
		SKM_CPU_LOCK_SPIN(cp);
		mg = cp->cp_loaded;
		pmg = cp->cp_ploaded;
		rounds = cp->cp_rounds;
		prounds = cp->cp_prounds;
		cp->cp_loaded = NULL;
		cp->cp_ploaded = NULL;
		cp->cp_rounds = -1;
		cp->cp_prounds = -1;
		cp->cp_magsize = 0;
		SKM_CPU_UNLOCK(cp);

		if (mg != NULL) {
			skmem_magazine_destroy(skm, mg, rounds);
			++mg_cnt;
		}
		if (pmg != NULL) {
			skmem_magazine_destroy(skm, pmg, prounds);
			++pmg_cnt;
		}
	}

	/* count this as one purge event if any magazine was destroyed */
	if (mg_cnt != 0 || pmg_cnt != 0) {
		atomic_add_32(&skm->skm_cpu_mag_purge, 1);
	}

	/* zero the working set and reap everything in the depot */
	skmem_depot_ws_zero(skm);
	skmem_depot_ws_reap(skm);
}
2478
2479 /*
2480 * Enable magazines on a cache. Must only be called on a cache with
2481 * its per-CPU magazines layer disabled (e.g. due to purge).
2482 */
/*
 * Enable magazines on a cache.  Must only be called on a cache with
 * its per-CPU magazines layer disabled (e.g. due to purge).
 *
 * The "arg" parameter is unused; it exists so this function matches
 * the callback signature used by skmem_cache_applyall().
 */
static void
skmem_cache_magazine_enable(struct skmem_cache *skm, uint32_t arg)
{
#pragma unused(arg)
	struct skmem_cpu_cache *cp;
	uint32_t cpuid;

	/* caches created without magazines stay that way */
	if (skm->skm_mode & SKM_MODE_NOMAGAZINES) {
		return;
	}

	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		cp = &skm->skm_cpu_cache[cpuid];
		SKM_CPU_LOCK_SPIN(cp);
		/* the magazines layer must be disabled at this point */
		ASSERT(cp->cp_loaded == NULL);
		ASSERT(cp->cp_ploaded == NULL);
		ASSERT(cp->cp_rounds == -1);
		ASSERT(cp->cp_prounds == -1);
		ASSERT(cp->cp_magsize == 0);
		/* a non-zero cp_magsize re-enables the magazine layer */
		cp->cp_magsize = skm->skm_magtype->mt_magsize;
		SKM_CPU_UNLOCK(cp);
	}

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx chunksize %u magsize %d",
	    SK_KVA(skm), (uint32_t)skm->skm_chunksize,
	    SKMEM_CPU_CACHE(skm)->cp_magsize);
}
2511
2512 /*
2513 * Enter the cache resize perimeter. Upon success, claim exclusivity
2514 * on the perimeter and return 0, else EBUSY. Caller may indicate
2515 * whether or not they're willing to wait.
2516 */
/*
 * Enter the cache resize perimeter.  Upon success, claim exclusivity
 * on the perimeter and return 0, else EBUSY.  Caller may indicate
 * whether or not they're willing to wait.
 *
 * The perimeter is recursive: a thread that already owns it simply
 * bumps the busy count and succeeds.
 */
static int
skmem_cache_resize_enter(struct skmem_cache *skm, boolean_t can_sleep)
{
	SKM_RESIZE_LOCK(skm);
	if (skm->skm_rs_owner == current_thread()) {
		/* recursive entry by the current owner */
		ASSERT(skm->skm_rs_busy != 0);
		skm->skm_rs_busy++;
		goto done;
	}
	if (!can_sleep) {
		/* non-blocking: fail immediately if someone holds it */
		if (skm->skm_rs_busy != 0) {
			SKM_RESIZE_UNLOCK(skm);
			return EBUSY;
		}
	} else {
		/* block until the perimeter becomes free */
		while (skm->skm_rs_busy != 0) {
			skm->skm_rs_want++;
			(void) assert_wait(&skm->skm_rs_busy, THREAD_UNINT);
			SKM_RESIZE_UNLOCK(skm);
			(void) thread_block(THREAD_CONTINUE_NULL);
			SK_DF(SK_VERB_MEM_CACHE, "waited for skm \"%s\" "
			    "(0x%llx) busy=%u", skm->skm_name,
			    SK_KVA(skm), skm->skm_rs_busy);
			/* re-acquire and re-check; wakeups can race */
			SKM_RESIZE_LOCK(skm);
		}
	}
	SKM_RESIZE_LOCK_ASSERT_HELD(skm);
	ASSERT(skm->skm_rs_busy == 0);
	/* claim ownership of the perimeter */
	skm->skm_rs_busy++;
	skm->skm_rs_owner = current_thread();
done:
	SKM_RESIZE_UNLOCK(skm);
	return 0;
}
2551
2552 /*
2553 * Exit the cache resize perimeter and unblock any waiters.
2554 */
2555 static void
skmem_cache_resize_exit(struct skmem_cache * skm)2556 skmem_cache_resize_exit(struct skmem_cache *skm)
2557 {
2558 uint32_t want;
2559
2560 SKM_RESIZE_LOCK(skm);
2561 ASSERT(skm->skm_rs_busy != 0);
2562 ASSERT(skm->skm_rs_owner == current_thread());
2563 if (--skm->skm_rs_busy == 0) {
2564 skm->skm_rs_owner = NULL;
2565 /*
2566 * We're done; notify anyone that has lost the race.
2567 */
2568 if ((want = skm->skm_rs_want) != 0) {
2569 skm->skm_rs_want = 0;
2570 wakeup((void *)&skm->skm_rs_busy);
2571 SKM_RESIZE_UNLOCK(skm);
2572 } else {
2573 SKM_RESIZE_UNLOCK(skm);
2574 }
2575 } else {
2576 SKM_RESIZE_UNLOCK(skm);
2577 }
2578 }
2579
2580 /*
2581 * Recompute a cache's magazine size. This is an expensive operation
2582 * and should not be done frequently; larger magazines provide for a
2583 * higher transfer rate with the depot while smaller magazines reduce
2584 * the memory consumption.
2585 */
/*
 * Recompute a cache's magazine size.  This is an expensive operation
 * and should not be done frequently; larger magazines provide for a
 * higher transfer rate with the depot while smaller magazines reduce
 * the memory consumption.
 */
static void
skmem_cache_magazine_resize(struct skmem_cache *skm)
{
	struct skmem_magtype *mtp = skm->skm_magtype;

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());
	ASSERT(!(skm->skm_mode & SKM_MODE_NOMAGAZINES));
	/* depot contention only applies to dynamic mode */
	ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);

	/*
	 * Although we're executing in the context of the update thread
	 * call, we need to protect the per-CPU states during resizing
	 * against other synchronous cache purge/reenable requests that
	 * could take place in parallel.
	 */
	if (skm->skm_chunksize < mtp->mt_maxbuf) {
		(void) skmem_cache_resize_enter(skm, TRUE);
		/* drain magazines so none of the old size survive */
		skmem_cache_magazine_purge(skm);

		/*
		 * Upgrade to the next magazine type with larger size.
		 */
		SKM_DEPOT_LOCK_SPIN(skm);
		skm->skm_cpu_mag_resize++;
		skm->skm_magtype = ++mtp;
		skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
		/*
		 * Bias the previous contention snapshot far ahead so the
		 * next update interval won't immediately resize again.
		 */
		skm->skm_depot_contention_prev =
		    skm->skm_depot_contention + INT_MAX;
		SKM_DEPOT_UNLOCK(skm);

		skmem_cache_magazine_enable(skm, 0);
		skmem_cache_resize_exit(skm);
	}
}
2622
2623 /*
2624 * Rescale the cache's allocated-address hash table.
2625 */
/*
 * Rescale the cache's allocated-address hash table.
 */
static void
skmem_cache_hash_rescale(struct skmem_cache *skm)
{
	struct skmem_bufctl_bkt *old_table, *new_table;
	size_t old_size, new_size;
	uint32_t i, moved = 0;

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());

	/*
	 * To get small average lookup time (lookup depth near 1.0), the hash
	 * table size should be roughly the same (not necessarily equivalent)
	 * as the cache size.
	 */
	new_size = MAX(skm->skm_hash_initial,
	    (1 << (flsll(3 * skm->skm_sl_bufinuse + 4) - 2)));
	new_size = MIN(skm->skm_hash_limit, new_size);
	old_size = (skm->skm_hash_mask + 1);

	/* skip rescaling if the change would be less than a factor of two */
	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
		return;
	}

	/* allocate the new table before taking the slab lock */
	new_table = sk_alloc_type_array(struct skmem_bufctl_bkt, new_size,
	    Z_NOWAIT, skmem_tag_bufctl_hash);
	if (__improbable(new_table == NULL)) {
		return;
	}

	for (i = 0; i < new_size; i++) {
		SLIST_INIT(&new_table[i].bcb_head);
	}

	SKM_SLAB_LOCK(skm);

	/* re-read old_size under the lock; it may have changed above */
	old_size = (skm->skm_hash_mask + 1);
	old_table = skm->skm_hash_table;

	skm->skm_hash_mask = (new_size - 1);
	skm->skm_hash_table = new_table;
	skm->skm_sl_rescale++;

	/* rehash every bufctl from the old table into the new one */
	for (i = 0; i < old_size; i++) {
		struct skmem_bufctl_bkt *bcb = &old_table[i];
		struct skmem_bufctl_bkt *new_bcb;
		struct skmem_bufctl *bc;

		while ((bc = SLIST_FIRST(&bcb->bcb_head)) != NULL) {
			SLIST_REMOVE_HEAD(&bcb->bcb_head, bc_link);
			new_bcb = SKMEM_CACHE_HASH(skm, bc->bc_addr);
			/*
			 * Ideally we want to insert tail here, but simple
			 * list doesn't give us that.  The fact that we are
			 * essentially reversing the order is not a big deal
			 * here vis-a-vis the new table size.
			 */
			SLIST_INSERT_HEAD(&new_bcb->bcb_head, bc, bc_link);
			++moved;
		}
		ASSERT(SLIST_EMPTY(&bcb->bcb_head));
	}

	SK_DF(SK_VERB_MEM_CACHE,
	    "skm 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skm),
	    (uint32_t)old_size, (uint32_t)new_size, moved);

	SKM_SLAB_UNLOCK(skm);

	/* free the old (now empty) table outside the lock */
	sk_free_type_array(struct skmem_bufctl_bkt, old_size, old_table);
}
2697
2698 /*
2699 * Apply a function to operate on all caches.
2700 */
/*
 * Apply a function to operate on all caches.  The callback receives
 * each registered cache along with the caller-supplied "arg"; it is
 * invoked with the global cache lock held.
 */
static void
skmem_cache_applyall(void (*func)(struct skmem_cache *, uint32_t), uint32_t arg)
{
	struct skmem_cache *skm;

	/* refresh cached uptime before walking the caches */
	net_update_uptime();

	SKMEM_CACHE_LOCK();
	TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
		func(skm, arg);
	}
	SKMEM_CACHE_UNLOCK();
}
2714
2715 /*
2716 * Reclaim unused memory from a cache.
2717 */
2718 static void
skmem_cache_reclaim(struct skmem_cache * skm,uint32_t lowmem)2719 skmem_cache_reclaim(struct skmem_cache *skm, uint32_t lowmem)
2720 {
2721 /*
2722 * Inform the owner to free memory if possible; the reclaim
2723 * policy is left to the owner. This is just an advisory.
2724 */
2725 if (skm->skm_reclaim != NULL) {
2726 skm->skm_reclaim(skm->skm_private);
2727 }
2728
2729 if (lowmem) {
2730 /*
2731 * If another thread is in the process of purging or
2732 * resizing, bail out and let the currently-ongoing
2733 * purging take its natural course.
2734 */
2735 if (skmem_cache_resize_enter(skm, FALSE) == 0) {
2736 skmem_cache_magazine_purge(skm);
2737 skmem_cache_magazine_enable(skm, 0);
2738 skmem_cache_resize_exit(skm);
2739 }
2740 } else {
2741 skmem_depot_ws_reap(skm);
2742 }
2743 }
2744
2745 /*
2746 * Thread call callback for reap.
2747 */
/*
 * Thread call callback for reap.  The "arg" parameter carries the
 * actual function to run (reap start or reap done).
 */
static void
skmem_cache_reap_func(thread_call_param_t dummy, thread_call_param_t arg)
{
#pragma unused(dummy)
	void (*func)(void) = arg;

	/* only the two reap phase callbacks are ever dispatched here */
	ASSERT(func == skmem_cache_reap_start || func == skmem_cache_reap_done);
	func();
}
2757
2758 /*
2759 * Start reaping all caches; this is serialized via thread call.
2760 */
/*
 * Start reaping all caches; this is serialized via thread call.
 * After reclaiming, schedule the "done" phase one update interval
 * later so another reap request cannot start before then.
 */
static void
skmem_cache_reap_start(void)
{
	SK_DF(SK_VERB_MEM_CACHE, "now running");
	skmem_cache_applyall(skmem_cache_reclaim, skmem_lowmem_check());
	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_done,
	    (skmem_cache_update_interval * NSEC_PER_SEC));
}
2769
2770 /*
2771 * Stop reaping; this would allow another reap request to occur.
2772 */
/*
 * Stop reaping; this would allow another reap request to occur.
 * Clears the skmem_cache_reaping flag that skmem_cache_reap() tests,
 * with a memory barrier so the store is visible to other CPUs.
 */
static void
skmem_cache_reap_done(void)
{
	/* volatile access forces an actual store to the flag */
	volatile uint32_t *flag = &skmem_cache_reaping;

	*flag = 0;
	membar_sync();
}
2781
2782 /*
2783 * Immediately reap all unused memory of a cache. If purging,
2784 * also purge the cached objects at the CPU layer.
2785 */
2786 void
skmem_cache_reap_now(struct skmem_cache * skm,boolean_t purge)2787 skmem_cache_reap_now(struct skmem_cache *skm, boolean_t purge)
2788 {
2789 if (purge) {
2790 /*
2791 * If another thread is in the process of purging or
2792 * resizing, bail out and let the currently-ongoing
2793 * purging take its natural course.
2794 */
2795 if (skmem_cache_resize_enter(skm, FALSE) == 0) {
2796 skmem_cache_magazine_purge(skm);
2797 skmem_cache_magazine_enable(skm, 0);
2798 skmem_cache_resize_exit(skm);
2799 }
2800 } else {
2801 skmem_depot_ws_zero(skm);
2802 skmem_depot_ws_reap(skm);
2803 }
2804 }
2805
2806 /*
2807 * Request a global reap operation to be dispatched.
2808 */
/*
 * Request a global reap operation to be dispatched.
 */
void
skmem_cache_reap(void)
{
	/*
	 * Only one reaping episode is allowed at a time; the flag is
	 * cleared by skmem_cache_reap_done().  Also skip if the caller
	 * already holds the global cache lock, since the reap callback
	 * takes that lock via skmem_cache_applyall().
	 */
	if (skmem_lock_owner == current_thread() ||
	    !atomic_test_set_32(&skmem_cache_reaping, 0, 1)) {
		return;
	}

	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_start, 0);
}
2820
2821 /*
2822 * Reap internal caches.
2823 */
/*
 * Reap internal caches.  When "purge" is set, the per-CPU magazine
 * layers are purged as well (see skmem_cache_reap_now()).
 */
void
skmem_reap_caches(boolean_t purge)
{
	/* slab and bufctl metadata caches */
	skmem_cache_reap_now(skmem_slab_cache, purge);
	skmem_cache_reap_now(skmem_bufctl_cache, purge);

	/* packet buffer pool objects */
	pp_reap_caches(purge);

	/* also handle the region cache(s) */
	skmem_region_reap_caches(purge);
}
2836
2837 /*
2838 * Thread call callback for update.
2839 */
/*
 * Thread call callback for update.  Runs the periodic maintenance
 * pass over all caches, then re-arms itself for the next interval.
 */
static void
skmem_cache_update_func(thread_call_param_t dummy, thread_call_param_t arg)
{
#pragma unused(dummy, arg)
	sk_protect_t protect;

	/* mark this thread as the cache-update context (checked by ASSERTs) */
	protect = sk_cache_update_protect();
	skmem_cache_applyall(skmem_cache_update, 0);
	sk_cache_update_unprotect(protect);

	/* reschedule the next periodic update */
	skmem_dispatch(skmem_cache_update_tc, NULL,
	    (skmem_cache_update_interval * NSEC_PER_SEC));
}
2853
2854 /*
2855 * Given a buffer control, record the current transaction.
2856 */
/*
 * Given a buffer control, record the current transaction: the calling
 * thread, a millisecond timestamp, and a stack backtrace.
 */
__attribute__((noinline, cold, not_tail_called))
static inline void
skmem_audit_bufctl(struct skmem_bufctl *bc)
{
	/* bc is assumed to have been allocated with audit space — the
	 * cast is only valid when SKM_MODE_AUDIT was set at create time */
	struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc;
	struct timeval tv;

	microuptime(&tv);
	bca->bc_thread = current_thread();
	/* uptime in milliseconds, truncated to 32 bits (wraps ~49.7 days) */
	bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000));
	bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH);
}
2869
2870 /*
2871 * Given an object, find its buffer control and record the transaction.
2872 */
/*
 * Given an object, find its buffer control and record the transaction.
 * When SKM_MODE_BATCH is set, every object in the mo_next chain is
 * audited; otherwise only the first one.
 */
__attribute__((noinline, cold, not_tail_called))
static inline void
skmem_audit_buf(struct skmem_cache *skm, struct skmem_obj *list)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;

	/* pseudo caches have no bufctl hash table to look up */
	ASSERT(!(skm->skm_mode & SKM_MODE_PSEUDO));

	SKM_SLAB_LOCK(skm);
	while (list != NULL) {
		void *buf = list;

		/* look up the bufctl for this object's address */
		bcb = SKMEM_CACHE_HASH(skm, buf);
		SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
			if (bc->bc_addr == buf) {
				break;
			}
		}

		/* every allocated object must have a bufctl */
		if (__improbable(bc == NULL)) {
			panic("%s: %s failed to get bufctl for %p",
			    __func__, skm->skm_name, buf);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		skmem_audit_bufctl(bc);

		/* non-batch objects carry no valid mo_next link */
		if (!(skm->skm_mode & SKM_MODE_BATCH)) {
			break;
		}

		list = list->mo_next;
	}
	SKM_SLAB_UNLOCK(skm);
}
2910
/*
 * Fill "out" with a sk_stats_cache snapshot for the given cache.
 *
 * Returns the space required for the stats record regardless of
 * whether it was written; nothing is written when "out" is NULL or
 * "len" is too small, allowing callers to size their buffer first.
 * The counters are read without locks, so values may be slightly
 * inconsistent with each other.
 */
static size_t
skmem_cache_mib_get_stats(struct skmem_cache *skm, void *out, size_t len)
{
	size_t actual_space = sizeof(struct sk_stats_cache);
	struct sk_stats_cache *sca = out;
	int contention;

	if (out == NULL || len < actual_space) {
		goto done;
	}

	/* identity and geometry */
	bzero(sca, sizeof(*sca));
	(void) snprintf(sca->sca_name, sizeof(sca->sca_name), "%s",
	    skm->skm_name);
	uuid_copy(sca->sca_uuid, skm->skm_uuid);
	uuid_copy(sca->sca_ruuid, skm->skm_region->skr_uuid);
	sca->sca_mode = skm->skm_mode;
	sca->sca_bufsize = (uint64_t)skm->skm_bufsize;
	sca->sca_objsize = (uint64_t)skm->skm_objsize;
	sca->sca_chunksize = (uint64_t)skm->skm_chunksize;
	sca->sca_slabsize = (uint64_t)skm->skm_slabsize;
	sca->sca_bufalign = (uint64_t)skm->skm_bufalign;
	sca->sca_objalign = (uint64_t)skm->skm_objalign;

	/* magazine/depot layer counters */
	sca->sca_cpu_mag_size = skm->skm_cpu_mag_size;
	sca->sca_cpu_mag_resize = skm->skm_cpu_mag_resize;
	sca->sca_cpu_mag_purge = skm->skm_cpu_mag_purge;
	sca->sca_cpu_mag_reap = skm->skm_cpu_mag_reap;
	sca->sca_depot_full = skm->skm_depot_full;
	sca->sca_depot_empty = skm->skm_depot_empty;
	sca->sca_depot_ws_zero = skm->skm_depot_ws_zero;
	/* in case of a race this might be a negative value, turn it into 0 */
	if ((contention = (int)(skm->skm_depot_contention -
	    skm->skm_depot_contention_prev)) < 0) {
		contention = 0;
	}
	sca->sca_depot_contention_factor = contention;

	/* slab layer counters */
	sca->sca_sl_create = skm->skm_sl_create;
	sca->sca_sl_destroy = skm->skm_sl_destroy;
	sca->sca_sl_alloc = skm->skm_sl_alloc;
	sca->sca_sl_free = skm->skm_sl_free;
	sca->sca_sl_alloc_fail = skm->skm_sl_alloc_fail;
	sca->sca_sl_partial = skm->skm_sl_partial;
	sca->sca_sl_empty = skm->skm_sl_empty;
	sca->sca_sl_bufinuse = skm->skm_sl_bufinuse;
	sca->sca_sl_rescale = skm->skm_sl_rescale;
	sca->sca_sl_hash_size = (skm->skm_hash_mask + 1);

done:
	return actual_space;
}
2963
2964 static int
2965 skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS
2966 {
2967 #pragma unused(arg1, arg2, oidp)
2968 struct skmem_cache *skm;
2969 size_t actual_space;
2970 size_t buffer_space;
2971 size_t allocated_space;
2972 caddr_t buffer = NULL;
2973 caddr_t scan;
2974 int error = 0;
2975
2976 if (!kauth_cred_issuser(kauth_cred_get())) {
2977 return EPERM;
2978 }
2979
2980 net_update_uptime();
2981 buffer_space = req->oldlen;
2982 if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
2983 if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
2984 buffer_space = SK_SYSCTL_ALLOC_MAX;
2985 }
2986 allocated_space = buffer_space;
2987 buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_cache_mib);
2988 if (__improbable(buffer == NULL)) {
2989 return ENOBUFS;
2990 }
2991 } else if (req->oldptr == USER_ADDR_NULL) {
2992 buffer_space = 0;
2993 }
2994 actual_space = 0;
2995 scan = buffer;
2996
2997 SKMEM_CACHE_LOCK();
2998 TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
2999 size_t size = skmem_cache_mib_get_stats(skm, scan, buffer_space);
3000 if (scan != NULL) {
3001 if (buffer_space < size) {
3002 /* supplied buffer too small, stop copying */
3003 error = ENOMEM;
3004 break;
3005 }
3006 scan += size;
3007 buffer_space -= size;
3008 }
3009 actual_space += size;
3010 }
3011 SKMEM_CACHE_UNLOCK();
3012
3013 if (actual_space != 0) {
3014 int out_error = SYSCTL_OUT(req, buffer, actual_space);
3015 if (out_error != 0) {
3016 error = out_error;
3017 }
3018 }
3019 if (buffer != NULL) {
3020 sk_free_data(buffer, allocated_space);
3021 }
3022
3023 return error;
3024 }
3025