xref: /xnu-8019.80.24/bsd/skywalk/mem/skmem_cache.c (revision a325d9c4a84054e40bbe985afedcb50ab80993ea)
1 /*
2  * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h>    /* for OSBacktrace */
33 #include <kern/sched_prim.h>    /* for assert_wait */
34 
35 /*
36  * Memory allocator with per-CPU caching (magazines), derived from the kmem
37  * magazine concept and implementation as described in the following paper:
38  * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
39  *
40  * That implementation is Copyright 2006 Sun Microsystems, Inc.  All rights
41  * reserved.  Use is subject to license terms.
42  *
43  * This derivative differs from the original kmem slab allocator, in that:
44  *
45  *   a) There is always a discrete bufctl per object, even for small sizes.
46  *      This increases the overhead, but is necessary as Skywalk objects
47  *      coming from the slab may be shared (RO or RW) with userland; therefore
48  *      embedding the KVA pointer linkage in freed objects is a non-starter.
49  *
50  *   b) Writing patterns to the slab at slab creation or destruction time
51  *      (when debugging is enabled) is not implemented, as the object may
52  *      be shared (RW) with userland and thus we cannot panic upon pattern
53  *      mismatch episodes.  This can be relaxed so that we conditionally
54  *      verify the pattern for kernel-only memory.
55  *
56  * This derivative also differs from Darwin's mcache allocator (which itself
57  * is a derivative of the original kmem slab allocator), in that:
58  *
59  *   1) The slab layer is internal to skmem_cache, unlike mcache's external
60  *      slab layer required to support mbufs.  skmem_cache also supports
61  *      constructing and deconstructing objects, while mcache does not.
62  *      This brings skmem_cache's model closer to that of the original
63  *      kmem slab allocator.
64  *
65  *   2) mcache allows for batch allocation and free by way of chaining the
66  *      objects together using a linked list.  This requires using a part
67  *      of the object to act as the linkage, which is against Skywalk's
68  *      requirements of not exposing any KVA pointer to userland.  Although
69  *      this is supported by skmem_cache, chaining is only possible if the
70  *      region is not mapped to userland.  That implies that kernel-only
71  *      objects can be chained provided the cache is created with batching
72  *      mode enabled, and that the object is large enough to contain the
73  *      skmem_obj structure.
74  *
75  * In other words, skmem_cache is a hybrid of a hybrid custom allocator that
76  * implements features that are required by Skywalk.  In addition to being
77  * aware of userland access on the buffers, it also supports mirrored backend
78  * memory regions.  This allows a cache to manage two independent memory
79  * regions, such that allocating/freeing an object from/to one results in
80  * allocating/freeing a shadow object in another, thus guaranteeing that both
81  * objects share the same lifetime.
82  */
83 
84 static uint32_t ncpu;                   /* total # of initialized CPUs */
85 
86 static LCK_MTX_DECLARE_ATTR(skmem_cache_lock, &skmem_lock_grp, &skmem_lock_attr);
87 static struct thread *skmem_lock_owner = THREAD_NULL;
88 
89 static LCK_GRP_DECLARE(skmem_sl_lock_grp, "skmem_slab");
90 static LCK_GRP_DECLARE(skmem_dp_lock_grp, "skmem_depot");
91 static LCK_GRP_DECLARE(skmem_cpu_lock_grp, "skmem_cpu_cache");
92 
93 #define SKMEM_CACHE_LOCK() do {                 \
94 	lck_mtx_lock(&skmem_cache_lock);        \
95 	skmem_lock_owner = current_thread();    \
96 } while (0)
97 #define SKMEM_CACHE_UNLOCK() do {               \
98 	skmem_lock_owner = THREAD_NULL;         \
99 	lck_mtx_unlock(&skmem_cache_lock);      \
100 } while (0)
101 #define SKMEM_CACHE_LOCK_ASSERT_HELD()          \
102 	LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_OWNED)
103 #define SKMEM_CACHE_LOCK_ASSERT_NOTHELD()       \
104 	LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_NOTOWNED)
105 
106 #define SKM_SLAB_LOCK(_skm)                     \
107 	lck_mtx_lock(&(_skm)->skm_sl_lock)
108 #define SKM_SLAB_LOCK_ASSERT_HELD(_skm)         \
109 	LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_OWNED)
110 #define SKM_SLAB_LOCK_ASSERT_NOTHELD(_skm)      \
111 	LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_NOTOWNED)
112 #define SKM_SLAB_UNLOCK(_skm)                   \
113 	lck_mtx_unlock(&(_skm)->skm_sl_lock)
114 
115 #define SKM_DEPOT_LOCK(_skm)                    \
116 	lck_mtx_lock(&(_skm)->skm_dp_lock)
117 #define SKM_DEPOT_LOCK_SPIN(_skm)               \
118 	lck_mtx_lock_spin(&(_skm)->skm_dp_lock)
119 #define SKM_DEPOT_CONVERT_LOCK(_skm)            \
120 	lck_mtx_convert_spin(&(_skm)->skm_dp_lock)
121 #define SKM_DEPOT_LOCK_TRY(_skm)                \
122 	lck_mtx_try_lock(&(_skm)->skm_dp_lock)
123 #define SKM_DEPOT_LOCK_ASSERT_HELD(_skm)        \
124 	LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_OWNED)
125 #define SKM_DEPOT_LOCK_ASSERT_NOTHELD(_skm)     \
126 	LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_NOTOWNED)
127 #define SKM_DEPOT_UNLOCK(_skm)                  \
128 	lck_mtx_unlock(&(_skm)->skm_dp_lock)
129 
130 #define SKM_RESIZE_LOCK(_skm)                   \
131 	lck_mtx_lock(&(_skm)->skm_rs_lock)
132 #define SKM_RESIZE_LOCK_ASSERT_HELD(_skm)       \
133 	LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_OWNED)
134 #define SKM_RESIZE_LOCK_ASSERT_NOTHELD(_skm)    \
135 	LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_NOTOWNED)
136 #define SKM_RESIZE_UNLOCK(_skm)                 \
137 	lck_mtx_unlock(&(_skm)->skm_rs_lock)
138 
139 #define SKM_CPU_LOCK(_cp)                       \
140 	lck_mtx_lock(&(_cp)->cp_lock)
141 #define SKM_CPU_LOCK_SPIN(_cp)                  \
142 	lck_mtx_lock_spin(&(_cp)->cp_lock)
143 #define SKM_CPU_CONVERT_LOCK(_cp)               \
144 	lck_mtx_convert_spin(&(_cp)->cp_lock)
145 #define SKM_CPU_LOCK_ASSERT_HELD(_cp)           \
146 	LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_OWNED)
147 #define SKM_CPU_LOCK_ASSERT_NOTHELD(_cp)        \
148 	LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_NOTOWNED)
149 #define SKM_CPU_UNLOCK(_cp)                     \
150 	lck_mtx_unlock(&(_cp)->cp_lock)
151 
152 #define SKM_ZONE_MAX    256
153 
154 static struct zone *skm_zone;                   /* zone for skmem_cache */
155 
156 static struct skmem_cache *skmem_slab_cache;    /* cache for skmem_slab */
157 static struct skmem_cache *skmem_bufctl_cache;  /* cache for skmem_bufctl */
158 static unsigned int bc_size;                    /* size of bufctl */
159 
160 /*
161  * Magazine types (one per row.)
162  *
163  * The first column defines the number of objects that the magazine can hold.
164  * Using that number, we derive the effective number: the aggregate count of
165  * object pointers, plus 2 pointers (skmem_mag linkage + magazine type).
166  * This would result in an object size that is aligned on the CPU cache
167  * size boundary; the exception to this is the KASAN mode where the size
168  * would be larger due to the redzone regions.
169  *
170  * The second column defines the alignment of the magazine.  Because each
171  * magazine is used at the CPU-layer cache, we need to ensure there is no
172  * false sharing across the CPUs, and align the magazines to the maximum
173  * cache alignment size, for simplicity.  The value of 0 may be used to
174  * indicate natural pointer size alignment.
175  *
176  * The third column defines the starting magazine type for a given cache,
177  * determined at the cache's creation time based on its chunk size.
178  *
179  * The fourth column defines the magazine type limit for a given cache.
180  * Magazine resizing will only occur if the chunk size is less than this.
181  */
182 static struct skmem_magtype skmem_magtype[] = {
183 #if defined(__LP64__)
184 	{ .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 128, .mt_maxbuf = 512,
185 	  .mt_cache = NULL, .mt_cname = "" },
186 	{ .mt_magsize = 30, .mt_align = 0, .mt_minbuf = 96, .mt_maxbuf = 256,
187 	  .mt_cache = NULL, .mt_cname = "" },
188 	{ .mt_magsize = 46, .mt_align = 0, .mt_minbuf = 64, .mt_maxbuf = 128,
189 	  .mt_cache = NULL, .mt_cname = "" },
190 	{ .mt_magsize = 62, .mt_align = 0, .mt_minbuf = 32, .mt_maxbuf = 64,
191 	  .mt_cache = NULL, .mt_cname = "" },
192 	{ .mt_magsize = 94, .mt_align = 0, .mt_minbuf = 16, .mt_maxbuf = 32,
193 	  .mt_cache = NULL, .mt_cname = "" },
194 	{ .mt_magsize = 126, .mt_align = 0, .mt_minbuf = 8, .mt_maxbuf = 16,
195 	  .mt_cache = NULL, .mt_cname = "" },
196 	{ .mt_magsize = 142, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 8,
197 	  .mt_cache = NULL, .mt_cname = "" },
198 	{ .mt_magsize = 158, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
199 	  .mt_cache = NULL, .mt_cname = "" },
200 #else /* !__LP64__ */
201 	{ .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
202 	  .mt_cache = NULL, .mt_cname = "" },
203 #endif /* !__LP64__ */
204 };
205 
206 /*
207  * Hash table bounds.  Start with the initial value, and rescale up to
208  * the specified limit.  Ideally we don't need a limit, but in practice
209  * this helps guard against runaways.  These values should be revisited
210  * in future and be adjusted as needed.
211  */
212 #define SKMEM_CACHE_HASH_INITIAL        64      /* initial hash table size */
213 #define SKMEM_CACHE_HASH_LIMIT          8192    /* hash table size limit */
214 
215 #define SKMEM_CACHE_HASH_INDEX(_a, _s, _m)      (((_a) >> (_s)) & (_m))
216 #define SKMEM_CACHE_HASH(_skm, _buf)                                     \
217 	(&(_skm)->skm_hash_table[SKMEM_CACHE_HASH_INDEX((uintptr_t)_buf, \
218 	(_skm)->skm_hash_shift, (_skm)->skm_hash_mask)])
219 
220 /*
221  * The last magazine type.
222  */
223 static struct skmem_magtype *skmem_cache_magsize_last;
224 
225 static TAILQ_HEAD(, skmem_cache) skmem_cache_head;
226 static boolean_t skmem_cache_ready;
227 
228 static int skmem_slab_alloc_locked(struct skmem_cache *,
229     struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
230 static void skmem_slab_free_locked(struct skmem_cache *, void *);
231 static int skmem_slab_alloc_pseudo_locked(struct skmem_cache *,
232     struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
233 static void skmem_slab_free_pseudo_locked(struct skmem_cache *, void *);
234 static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t);
235 static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *);
236 static int skmem_magazine_ctor(struct skmem_obj_info *,
237     struct skmem_obj_info *, void *, uint32_t);
238 static void skmem_magazine_destroy(struct skmem_cache *, struct skmem_mag *,
239     int);
240 static uint32_t skmem_depot_batch_alloc(struct skmem_cache *,
241     struct skmem_maglist *, uint32_t *, struct skmem_mag **, uint32_t);
242 static void skmem_depot_batch_free(struct skmem_cache *, struct skmem_maglist *,
243     uint32_t *, struct skmem_mag *);
244 static void skmem_depot_ws_update(struct skmem_cache *);
245 static void skmem_depot_ws_zero(struct skmem_cache *);
246 static void skmem_depot_ws_reap(struct skmem_cache *);
247 static void skmem_cache_magazine_purge(struct skmem_cache *);
248 static void skmem_cache_magazine_enable(struct skmem_cache *, uint32_t);
249 static void skmem_cache_magazine_resize(struct skmem_cache *);
250 static void skmem_cache_hash_rescale(struct skmem_cache *);
251 static void skmem_cpu_reload(struct skmem_cpu_cache *, struct skmem_mag *, int);
252 static void skmem_cpu_batch_reload(struct skmem_cpu_cache *,
253     struct skmem_mag *, int);
254 static void skmem_cache_applyall(void (*)(struct skmem_cache *, uint32_t),
255     uint32_t);
256 static void skmem_cache_reclaim(struct skmem_cache *, uint32_t);
257 static void skmem_cache_reap_start(void);
258 static void skmem_cache_reap_done(void);
259 static void skmem_cache_reap_func(thread_call_param_t, thread_call_param_t);
260 static void skmem_cache_update_func(thread_call_param_t, thread_call_param_t);
261 static int skmem_cache_resize_enter(struct skmem_cache *, boolean_t);
262 static void skmem_cache_resize_exit(struct skmem_cache *);
263 static void skmem_audit_bufctl(struct skmem_bufctl *);
264 static void skmem_audit_buf(struct skmem_cache *, struct skmem_obj *);
265 static int skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS;
266 
267 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, cache,
268     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
269     0, 0, skmem_cache_mib_get_sysctl, "S,sk_stats_cache",
270     "Skywalk cache statistics");
271 
272 static volatile uint32_t skmem_cache_reaping;
273 static thread_call_t skmem_cache_reap_tc;
274 static thread_call_t skmem_cache_update_tc;
275 
276 extern kern_return_t thread_terminate(thread_t);
277 extern unsigned int ml_wait_max_cpus(void);
278 
279 #define SKMEM_DEBUG_NOMAGAZINES 0x1     /* disable magazines layer */
280 #define SKMEM_DEBUG_AUDIT       0x2     /* audit transactions */
281 #define SKMEM_DEBUG_MASK        (SKMEM_DEBUG_NOMAGAZINES|SKMEM_DEBUG_AUDIT)
282 
283 #if DEBUG
284 static uint32_t skmem_debug = SKMEM_DEBUG_AUDIT;
285 #else /* !DEBUG */
286 static uint32_t skmem_debug = 0;
287 #endif /* !DEBUG */
288 
289 static uint32_t skmem_clear_min = 0;    /* clear on free threshold */
290 
291 #define SKMEM_CACHE_UPDATE_INTERVAL     11      /* 11 seconds */
292 static uint32_t skmem_cache_update_interval = SKMEM_CACHE_UPDATE_INTERVAL;
293 
294 #define SKMEM_DEPOT_CONTENTION  3       /* max failed trylock per interval */
295 static int skmem_cache_depot_contention = SKMEM_DEPOT_CONTENTION;
296 
297 /*
298  * Too big a value will cause overflow and thus trip the assertion; the
299  * idea here is to set an upper limit for the time that a particular
300  * thread is allowed to perform retries before we give up and panic.
301  */
302 #define SKMEM_SLAB_MAX_BACKOFF          (20 * USEC_PER_SEC) /* 20 sec, in usec */
303 
304 /*
305  * Threshold (in msec) after which we reset the exponential backoff value
306  * back to its (random) initial value.  Note that we allow the actual delay
307  * to be at most twice this value.
308  */
309 #define SKMEM_SLAB_BACKOFF_THRES        1024    /* up to ~2 sec (2048 msec) */
310 
311 /*
312  * To reduce the likelihood of global synchronization between threads,
313  * we use some random value to start the exponential backoff.
314  */
315 #define SKMEM_SLAB_BACKOFF_RANDOM       4       /* range is [1,4] msec */
316 
317 #if (DEVELOPMENT || DEBUG)
318 SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, cache_update_interval,
319     CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_update_interval,
320     SKMEM_CACHE_UPDATE_INTERVAL, "Cache update interval");
321 SYSCTL_INT(_kern_skywalk_mem, OID_AUTO, cache_depot_contention,
322     CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_depot_contention,
323     SKMEM_DEPOT_CONTENTION, "Depot contention");
324 
325 static uint32_t skmem_cache_update_interval_saved = SKMEM_CACHE_UPDATE_INTERVAL;
326 
/*
 * Called by skmem_test_start() to temporarily override the periodic
 * cache update interval (in seconds).  The value currently in effect
 * is stashed first so that skmem_cache_test_stop() can restore it.
 */
void
skmem_cache_test_start(uint32_t i)
{
	/* save the current interval before overriding it */
	skmem_cache_update_interval_saved = skmem_cache_update_interval;
	skmem_cache_update_interval = i;
}
336 
/*
 * Called by skmem_test_stop() to restore the cache update interval
 * that was in effect before skmem_cache_test_start() overrode it.
 */
void
skmem_cache_test_stop(void)
{
	skmem_cache_update_interval = skmem_cache_update_interval_saved;
}
345 #endif /* (DEVELOPMENT || DEBUG) */
346 
347 #define SKMEM_TAG_BUFCTL_HASH   "com.apple.skywalk.bufctl.hash"
348 static kern_allocation_name_t skmem_tag_bufctl_hash;
349 
350 #define SKMEM_TAG_CACHE_MIB     "com.apple.skywalk.cache.mib"
351 static kern_allocation_name_t skmem_tag_cache_mib;
352 
353 static int __skmem_cache_pre_inited = 0;
354 static int __skmem_cache_inited = 0;
355 
356 /*
357  * Called before skmem_region_init().
358  */
359 void
skmem_cache_pre_init(void)360 skmem_cache_pre_init(void)
361 {
362 	vm_size_t skm_size;
363 
364 	ASSERT(!__skmem_cache_pre_inited);
365 
366 	ncpu = ml_wait_max_cpus();
367 
368 	/* allocate extra in case we need to manually align the pointer */
369 	if (skm_zone == NULL) {
370 		skm_size = SKMEM_CACHE_SIZE(ncpu);
371 #if KASAN
372 		/*
373 		 * When KASAN is enabled, the zone allocator adjusts the
374 		 * element size to include the redzone regions, in which
375 		 * case we assume that the elements won't start on the
376 		 * alignment boundary and thus need to do some fix-ups.
377 		 * These include increasing the effective object size
378 		 * which adds at least 136 bytes to the original size,
379 		 * as computed by skmem_region_params_config() above.
380 		 */
381 		skm_size += (sizeof(void *) + CHANNEL_CACHE_ALIGN_MAX);
382 #endif /* KASAN */
383 		skm_size = P2ROUNDUP(skm_size, CHANNEL_CACHE_ALIGN_MAX);
384 		skm_zone = zone_create(SKMEM_ZONE_PREFIX ".skm", skm_size,
385 		    ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
386 	}
387 
388 	TAILQ_INIT(&skmem_cache_head);
389 
390 	__skmem_cache_pre_inited = 1;
391 }
392 
/*
 * Called after skmem_region_init().
 *
 * Completes cache-layer bring-up: applies boot-arg overrides, creates
 * the metadata caches (bufctl, slab, per-type magazines), allocates
 * the reap and update thread calls, enables the magazines layer on
 * any caches created earlier, and starts the periodic update cycle.
 */
void
skmem_cache_init(void)
{
	uint32_t cpu_cache_line_size = skmem_cpu_cache_line_size();
	struct skmem_magtype *mtp;
	uint32_t i;

	_CASSERT(SKMEM_CACHE_HASH_LIMIT >= SKMEM_CACHE_HASH_INITIAL);

	/*
	 * The SKM_MODE_* cache mode bits must match their SCA_MODE_*
	 * counterparts bit-for-bit.
	 */
	_CASSERT(SKM_MODE_NOMAGAZINES == SCA_MODE_NOMAGAZINES);
	_CASSERT(SKM_MODE_AUDIT == SCA_MODE_AUDIT);
	_CASSERT(SKM_MODE_NOREDIRECT == SCA_MODE_NOREDIRECT);
	_CASSERT(SKM_MODE_BATCH == SCA_MODE_BATCH);
	_CASSERT(SKM_MODE_DYNAMIC == SCA_MODE_DYNAMIC);
	_CASSERT(SKM_MODE_CLEARONFREE == SCA_MODE_CLEARONFREE);
	_CASSERT(SKM_MODE_PSEUDO == SCA_MODE_PSEUDO);

	ASSERT(__skmem_cache_pre_inited);
	ASSERT(!__skmem_cache_inited);

	/* allow boot-args to override debugging flags (known bits only) */
	PE_parse_boot_argn("skmem_debug", &skmem_debug, sizeof(skmem_debug));
	skmem_debug &= SKMEM_DEBUG_MASK;

#if (DEVELOPMENT || DEBUG)
	PE_parse_boot_argn("skmem_clear_min", &skmem_clear_min,
	    sizeof(skmem_clear_min));
#endif /* (DEVELOPMENT || DEBUG) */
	if (skmem_clear_min == 0) {
		/* zeroing 2 CPU cache lines practically comes for free */
		skmem_clear_min = 2 * cpu_cache_line_size;
	} else {
		/* round the override up to CPU cache line size */
		skmem_clear_min = (uint32_t)P2ROUNDUP(skmem_clear_min,
		    cpu_cache_line_size);
	}

	/*
	 * Create a cache for buffer control structures; use the larger
	 * audit variant when transaction auditing has been requested.
	 */
	if (skmem_debug & SKMEM_DEBUG_AUDIT) {
		bc_size = sizeof(struct skmem_bufctl_audit);
		skmem_bufctl_cache = skmem_cache_create("bufctl.audit",
		    bc_size, sizeof(uint64_t), NULL, NULL,
		    NULL, NULL, NULL, 0);
	} else {
		bc_size = sizeof(struct skmem_bufctl);
		skmem_bufctl_cache = skmem_cache_create("bufctl",
		    bc_size, sizeof(uint64_t), NULL, NULL,
		    NULL, NULL, NULL, 0);
	}

	/* create a cache for slab structures */
	skmem_slab_cache = skmem_cache_create("slab",
	    sizeof(struct skmem_slab), sizeof(uint64_t), NULL, NULL, NULL,
	    NULL, NULL, 0);

	/*
	 * Go through the magazine type table and create a cache for each.
	 */
	for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
		mtp = &skmem_magtype[i];

		/*
		 * A non-zero alignment must be a power of 2 and no
		 * smaller than the CPU cache line size.
		 */
		if (mtp->mt_align != 0 &&
		    ((mtp->mt_align & (mtp->mt_align - 1)) != 0 ||
		    mtp->mt_align < (int)cpu_cache_line_size)) {
			panic("%s: bad alignment %d", __func__, mtp->mt_align);
			/* NOTREACHED */
			__builtin_unreachable();
		}
		(void) snprintf(mtp->mt_cname, sizeof(mtp->mt_cname),
		    "mg.%d", mtp->mt_magsize);

		/* create a cache for this magazine type */
		mtp->mt_cache = skmem_cache_create(mtp->mt_cname,
		    SKMEM_MAG_SIZE(mtp->mt_magsize), mtp->mt_align,
		    skmem_magazine_ctor, NULL, NULL, mtp, NULL, 0);

		/* remember the last magazine type */
		skmem_cache_magsize_last = mtp;
	}

	VERIFY(skmem_cache_magsize_last != NULL);
	/* the last table entry must be the catch-all (no min/max bounds) */
	VERIFY(skmem_cache_magsize_last->mt_minbuf == 0);
	VERIFY(skmem_cache_magsize_last->mt_maxbuf == 0);

	/*
	 * Allocate thread calls for cache reap and update operations.
	 */
	skmem_cache_reap_tc =
	    thread_call_allocate_with_options(skmem_cache_reap_func,
	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	skmem_cache_update_tc =
	    thread_call_allocate_with_options(skmem_cache_update_func,
	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	if (skmem_cache_reap_tc == NULL || skmem_cache_update_tc == NULL) {
		panic("%s: thread_call_allocate failed", __func__);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * We're ready; go through existing skmem_cache entries
	 * (if any) and enable the magazines layer for each.
	 */
	skmem_cache_applyall(skmem_cache_magazine_enable, 0);
	skmem_cache_ready = TRUE;

	/* and start the periodic cache update machinery (interval in sec) */
	skmem_dispatch(skmem_cache_update_tc, NULL,
	    (skmem_cache_update_interval * NSEC_PER_SEC));

	/* allocation tag for the allocated-address (bufctl) hash tables */
	ASSERT(skmem_tag_bufctl_hash == NULL);
	skmem_tag_bufctl_hash =
	    kern_allocation_name_allocate(SKMEM_TAG_BUFCTL_HASH, 0);
	ASSERT(skmem_tag_bufctl_hash != NULL);

	/* allocation tag for the cache MIB (sysctl statistics) buffers */
	ASSERT(skmem_tag_cache_mib == NULL);
	skmem_tag_cache_mib =
	    kern_allocation_name_allocate(SKMEM_TAG_CACHE_MIB, 0);
	ASSERT(skmem_tag_cache_mib != NULL);

	__skmem_cache_inited = 1;
}
517 
518 void
skmem_cache_fini(void)519 skmem_cache_fini(void)
520 {
521 	struct skmem_magtype *mtp;
522 	uint32_t i;
523 
524 	if (__skmem_cache_inited) {
525 		ASSERT(TAILQ_EMPTY(&skmem_cache_head));
526 
527 		for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
528 			mtp = &skmem_magtype[i];
529 			skmem_cache_destroy(mtp->mt_cache);
530 			mtp->mt_cache = NULL;
531 		}
532 		skmem_cache_destroy(skmem_slab_cache);
533 		skmem_slab_cache = NULL;
534 		skmem_cache_destroy(skmem_bufctl_cache);
535 		skmem_bufctl_cache = NULL;
536 
537 		if (skmem_cache_reap_tc != NULL) {
538 			(void) thread_call_cancel_wait(skmem_cache_reap_tc);
539 			(void) thread_call_free(skmem_cache_reap_tc);
540 			skmem_cache_reap_tc = NULL;
541 		}
542 		if (skmem_cache_update_tc != NULL) {
543 			(void) thread_call_cancel_wait(skmem_cache_update_tc);
544 			(void) thread_call_free(skmem_cache_update_tc);
545 			skmem_cache_update_tc = NULL;
546 		}
547 		if (skmem_tag_bufctl_hash != NULL) {
548 			kern_allocation_name_release(skmem_tag_bufctl_hash);
549 			skmem_tag_bufctl_hash = NULL;
550 		}
551 		if (skmem_tag_cache_mib != NULL) {
552 			kern_allocation_name_release(skmem_tag_cache_mib);
553 			skmem_tag_cache_mib = NULL;
554 		}
555 
556 		__skmem_cache_inited = 0;
557 	}
558 
559 	if (__skmem_cache_pre_inited) {
560 		if (skm_zone != NULL) {
561 			zdestroy(skm_zone);
562 			skm_zone = NULL;
563 		}
564 
565 		__skmem_cache_pre_inited = 0;
566 	}
567 }
568 
569 /*
570  * Create a cache.
571  */
572 struct skmem_cache *
skmem_cache_create(const char * name,size_t bufsize,size_t bufalign,skmem_ctor_fn_t ctor,skmem_dtor_fn_t dtor,skmem_reclaim_fn_t reclaim,void * private,struct skmem_region * region,uint32_t cflags)573 skmem_cache_create(const char *name, size_t bufsize, size_t bufalign,
574     skmem_ctor_fn_t ctor, skmem_dtor_fn_t dtor, skmem_reclaim_fn_t reclaim,
575     void *private, struct skmem_region *region, uint32_t cflags)
576 {
577 	boolean_t pseudo = (region == NULL);
578 	struct skmem_magtype *mtp;
579 	struct skmem_cache *skm;
580 	void *buf;
581 	size_t segsize;
582 	size_t chunksize;
583 	size_t objsize;
584 	size_t objalign;
585 	uint32_t i, cpuid;
586 
587 	/* enforce 64-bit minimum alignment for buffers */
588 	if (bufalign == 0) {
589 		bufalign = SKMEM_CACHE_ALIGN;
590 	}
591 	bufalign = P2ROUNDUP(bufalign, SKMEM_CACHE_ALIGN);
592 
593 	/* enforce alignment to be a power of 2 */
594 	VERIFY(powerof2(bufalign));
595 
596 	if (region == NULL) {
597 		struct skmem_region_params srp;
598 
599 		/* batching is currently not supported on pseudo regions */
600 		VERIFY(!(cflags & SKMEM_CR_BATCH));
601 
602 		srp = *skmem_get_default(SKMEM_REGION_INTRINSIC);
603 		ASSERT(srp.srp_cflags == SKMEM_REGION_CR_PSEUDO);
604 
605 		/* objalign is always equal to bufalign */
606 		srp.srp_align = objalign = bufalign;
607 		srp.srp_r_obj_cnt = 1;
608 		srp.srp_r_obj_size = (uint32_t)bufsize;
609 		skmem_region_params_config(&srp);
610 
611 		/* allocate region for intrinsics */
612 		region = skmem_region_create(name, &srp, NULL, NULL, NULL);
613 		VERIFY(region->skr_c_obj_size >= P2ROUNDUP(bufsize, bufalign));
614 		VERIFY(objalign == region->skr_align);
615 #if KASAN
616 		/*
617 		 * When KASAN is enabled, the zone allocator adjusts the
618 		 * element size to include the redzone regions, in which
619 		 * case we assume that the elements won't start on the
620 		 * alignment boundary and thus need to do some fix-ups.
621 		 * These include increasing the effective object size
622 		 * which adds at least 16 bytes to the original size,
623 		 * as computed by skmem_region_params_config() above.
624 		 */
625 		VERIFY(region->skr_c_obj_size >=
626 		    (bufsize + sizeof(uint64_t) + bufalign));
627 #endif /* KASAN */
628 		/* enable magazine resizing by default */
629 		cflags |= SKMEM_CR_DYNAMIC;
630 
631 		/*
632 		 * For consistency with ZC_ZFREE_CLEARMEM on skr->zreg,
633 		 * even though it's a no-op since the work is done
634 		 * at the zone layer instead.
635 		 */
636 		cflags |= SKMEM_CR_CLEARONFREE;
637 	} else {
638 		objalign = region->skr_align;
639 	}
640 
641 	ASSERT(region != NULL);
642 	ASSERT(!(region->skr_mode & SKR_MODE_MIRRORED));
643 	segsize = region->skr_seg_size;
644 	ASSERT(bufalign <= segsize);
645 
646 	buf = zalloc_flags(skm_zone, Z_WAITOK | Z_ZERO);
647 #if KASAN
648 	/*
649 	 * In case we didn't get a cache-aligned memory, round it up
650 	 * accordingly.  This is needed in order to get the rest of
651 	 * structure members aligned properly.  It also means that
652 	 * the memory span gets shifted due to the round up, but it
653 	 * is okay since we've allocated extra space for this.
654 	 */
655 	skm = (struct skmem_cache *)
656 	    P2ROUNDUP((intptr_t)buf + sizeof(void *), CHANNEL_CACHE_ALIGN_MAX);
657 	void **pbuf = (void **)((intptr_t)skm - sizeof(void *));
658 	*pbuf = buf;
659 #else /* !KASAN */
660 	/*
661 	 * We expect that the zone allocator would allocate elements
662 	 * rounded up to the requested alignment based on the object
663 	 * size computed in skmem_cache_pre_init() earlier, and
664 	 * 'skm' is therefore the element address itself.
665 	 */
666 	skm = buf;
667 #endif /* !KASAN */
668 	VERIFY(IS_P2ALIGNED(skm, CHANNEL_CACHE_ALIGN_MAX));
669 
670 	if ((skmem_debug & SKMEM_DEBUG_NOMAGAZINES) ||
671 	    (cflags & SKMEM_CR_NOMAGAZINES)) {
672 		/*
673 		 * Either the caller insists that this cache should not
674 		 * utilize magazines layer, or that the system override
675 		 * to disable magazines layer on all caches has been set.
676 		 */
677 		skm->skm_mode |= SKM_MODE_NOMAGAZINES;
678 	} else {
679 		/*
680 		 * Region must be configured with enough objects
681 		 * to take into account objects at the CPU layer.
682 		 */
683 		ASSERT(!(region->skr_mode & SKR_MODE_NOMAGAZINES));
684 	}
685 
686 	if (cflags & SKMEM_CR_DYNAMIC) {
687 		/*
688 		 * Enable per-CPU cache magazine resizing.
689 		 */
690 		skm->skm_mode |= SKM_MODE_DYNAMIC;
691 	}
692 
693 	/* region stays around after defunct? */
694 	if (region->skr_mode & SKR_MODE_NOREDIRECT) {
695 		skm->skm_mode |= SKM_MODE_NOREDIRECT;
696 	}
697 
698 	if (cflags & SKMEM_CR_BATCH) {
699 		/*
700 		 * Batch alloc/free involves storing the next object
701 		 * pointer at the beginning of each object; this is
702 		 * okay for kernel-only regions, but not those that
703 		 * are mappable to user space (we can't leak kernel
704 		 * addresses).
705 		 */
706 		_CASSERT(offsetof(struct skmem_obj, mo_next) == 0);
707 		VERIFY(!(region->skr_mode & SKR_MODE_MMAPOK));
708 
709 		/* batching is currently not supported on pseudo regions */
710 		VERIFY(!(region->skr_mode & SKR_MODE_PSEUDO));
711 
712 		/* validate object size */
713 		VERIFY(region->skr_c_obj_size >= sizeof(struct skmem_obj));
714 
715 		skm->skm_mode |= SKM_MODE_BATCH;
716 	}
717 
718 	uuid_generate_random(skm->skm_uuid);
719 	(void) snprintf(skm->skm_name, sizeof(skm->skm_name),
720 	    "%s.%s", SKMEM_CACHE_PREFIX, name);
721 	skm->skm_bufsize = bufsize;
722 	skm->skm_bufalign = bufalign;
723 	skm->skm_objalign = objalign;
724 	skm->skm_ctor = ctor;
725 	skm->skm_dtor = dtor;
726 	skm->skm_reclaim = reclaim;
727 	skm->skm_private = private;
728 	skm->skm_slabsize = segsize;
729 
730 	skm->skm_region = region;
731 	/* callee holds reference */
732 	skmem_region_slab_config(region, skm);
733 	objsize = region->skr_c_obj_size;
734 	skm->skm_objsize = objsize;
735 
736 	if (pseudo) {
737 		/*
738 		 * Release reference from skmem_region_create()
739 		 * since skm->skm_region holds one now.
740 		 */
741 		ASSERT(region->skr_mode & SKR_MODE_PSEUDO);
742 		skmem_region_release(region);
743 
744 		skm->skm_mode |= SKM_MODE_PSEUDO;
745 
746 		skm->skm_slab_alloc = skmem_slab_alloc_pseudo_locked;
747 		skm->skm_slab_free = skmem_slab_free_pseudo_locked;
748 	} else {
749 		skm->skm_slab_alloc = skmem_slab_alloc_locked;
750 		skm->skm_slab_free = skmem_slab_free_locked;
751 
752 		/* auditing was requested? (normal regions only) */
753 		if (skmem_debug & SKMEM_DEBUG_AUDIT) {
754 			ASSERT(bc_size == sizeof(struct skmem_bufctl_audit));
755 			skm->skm_mode |= SKM_MODE_AUDIT;
756 		}
757 	}
758 
759 	/*
760 	 * Clear upon free (to slab layer) as long as the region is
761 	 * not marked as read-only for kernel, and if the chunk size
762 	 * is within the threshold or if the caller had requested it.
763 	 */
764 	if (!(region->skr_mode & SKR_MODE_KREADONLY)) {
765 		if (skm->skm_objsize <= skmem_clear_min ||
766 		    (cflags & SKMEM_CR_CLEARONFREE)) {
767 			skm->skm_mode |= SKM_MODE_CLEARONFREE;
768 		}
769 	}
770 
771 	chunksize = bufsize;
772 	if (bufalign >= SKMEM_CACHE_ALIGN) {
773 		chunksize = P2ROUNDUP(chunksize, SKMEM_CACHE_ALIGN);
774 	}
775 
776 	chunksize = P2ROUNDUP(chunksize, bufalign);
777 	if (chunksize > objsize) {
778 		panic("%s: (bufsize %lu, chunksize %lu) > objsize %lu",
779 		    __func__, bufsize, chunksize, objsize);
780 		/* NOTREACHED */
781 		__builtin_unreachable();
782 	}
783 	ASSERT(chunksize != 0);
784 	skm->skm_chunksize = chunksize;
785 
786 	lck_mtx_init(&skm->skm_sl_lock, &skmem_sl_lock_grp, &skmem_lock_attr);
787 	TAILQ_INIT(&skm->skm_sl_partial_list);
788 	TAILQ_INIT(&skm->skm_sl_empty_list);
789 
790 	/* allocated-address hash table */
791 	skm->skm_hash_initial = SKMEM_CACHE_HASH_INITIAL;
792 	skm->skm_hash_limit = SKMEM_CACHE_HASH_LIMIT;
793 	skm->skm_hash_table = sk_alloc_type_array(struct skmem_bufctl_bkt,
794 	    skm->skm_hash_initial, Z_WAITOK | Z_NOFAIL, skmem_tag_bufctl_hash);
795 
796 	skm->skm_hash_mask = (skm->skm_hash_initial - 1);
797 	skm->skm_hash_shift = flsll(chunksize) - 1;
798 
799 	for (i = 0; i < (skm->skm_hash_mask + 1); i++) {
800 		SLIST_INIT(&skm->skm_hash_table[i].bcb_head);
801 	}
802 
803 	lck_mtx_init(&skm->skm_dp_lock, &skmem_dp_lock_grp, &skmem_lock_attr);
804 
805 	/* find a suitable magazine type for this chunk size */
806 	for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
807 		continue;
808 	}
809 
810 	skm->skm_magtype = mtp;
811 	if (!(skm->skm_mode & SKM_MODE_NOMAGAZINES)) {
812 		skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
813 	}
814 
815 	/*
816 	 * Initialize the CPU layer.  Each per-CPU structure is aligned
817 	 * on the CPU cache line boundary to prevent false sharing.
818 	 */
819 	lck_mtx_init(&skm->skm_rs_lock, &skmem_cpu_lock_grp, &skmem_lock_attr);
820 	for (cpuid = 0; cpuid < ncpu; cpuid++) {
821 		struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];
822 
823 		VERIFY(IS_P2ALIGNED(ccp, CHANNEL_CACHE_ALIGN_MAX));
824 		lck_mtx_init(&ccp->cp_lock, &skmem_cpu_lock_grp,
825 		    &skmem_lock_attr);
826 		ccp->cp_rounds = -1;
827 		ccp->cp_prounds = -1;
828 	}
829 
830 	SKMEM_CACHE_LOCK();
831 	TAILQ_INSERT_TAIL(&skmem_cache_head, skm, skm_link);
832 	SKMEM_CACHE_UNLOCK();
833 
834 	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx mode 0x%b",
835 	    skm->skm_name, SK_KVA(skm), skm->skm_mode, SKM_MODE_BITS);
836 	SK_DF(SK_VERB_MEM_CACHE,
837 	    "  bufsz %u bufalign %u chunksz %u objsz %u slabsz %u",
838 	    (uint32_t)skm->skm_bufsize, (uint32_t)skm->skm_bufalign,
839 	    (uint32_t)skm->skm_chunksize, (uint32_t)skm->skm_objsize,
840 	    (uint32_t)skm->skm_slabsize);
841 
842 	if (skmem_cache_ready) {
843 		skmem_cache_magazine_enable(skm, 0);
844 	}
845 
846 	return skm;
847 }
848 
/*
 * Destroy a cache.
 *
 * The caller must guarantee that every object allocated from this cache
 * has already been freed back to it; any outstanding object triggers the
 * panic below.  Unlinks the cache from the global list, purges the
 * per-CPU/depot magazine layers, verifies the slab layer is empty,
 * releases the backing region reference, and frees the descriptor.
 */
void
skmem_cache_destroy(struct skmem_cache *skm)
{
	uint32_t cpuid;

	/* unlink from the global list of caches */
	SKMEM_CACHE_LOCK();
	TAILQ_REMOVE(&skmem_cache_head, skm, skm_link);
	SKMEM_CACHE_UNLOCK();

	/* the resize (rs) state must be quiescent: nobody busy or waiting */
	ASSERT(skm->skm_rs_busy == 0);
	ASSERT(skm->skm_rs_want == 0);

	/* purge all cached objects for this cache */
	skmem_cache_magazine_purge(skm);

	/*
	 * Panic if we detect there are unfreed objects; the caller
	 * destroying this cache is responsible for ensuring that all
	 * allocated objects have been freed prior to getting here.
	 */
	SKM_SLAB_LOCK(skm);
	if (skm->skm_sl_bufinuse != 0) {
		panic("%s: '%s' (%p) not empty (%llu unfreed)", __func__,
		    skm->skm_name, (void *)skm, skm->skm_sl_bufinuse);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(TAILQ_EMPTY(&skm->skm_sl_partial_list));
	ASSERT(skm->skm_sl_partial == 0);
	ASSERT(TAILQ_EMPTY(&skm->skm_sl_empty_list));
	ASSERT(skm->skm_sl_empty == 0);
	/* clear the callbacks so nothing can be invoked past this point */
	skm->skm_reclaim = NULL;
	skm->skm_ctor = NULL;
	skm->skm_dtor = NULL;
	SKM_SLAB_UNLOCK(skm);

	if (skm->skm_hash_table != NULL) {
#if (DEBUG || DEVELOPMENT)
		/* every allocated-address hash bucket must be empty by now */
		for (uint32_t i = 0; i < (skm->skm_hash_mask + 1); i++) {
			ASSERT(SLIST_EMPTY(&skm->skm_hash_table[i].bcb_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		sk_free_type_array(struct skmem_bufctl_bkt,
		    skm->skm_hash_mask + 1, skm->skm_hash_table);
		skm->skm_hash_table = NULL;
	}

	/* tear down all locks initialized at cache creation time */
	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		lck_mtx_destroy(&skm->skm_cpu_cache[cpuid].cp_lock,
		    &skmem_cpu_lock_grp);
	}
	lck_mtx_destroy(&skm->skm_rs_lock, &skmem_cpu_lock_grp);
	lck_mtx_destroy(&skm->skm_dp_lock, &skmem_dp_lock_grp);
	lck_mtx_destroy(&skm->skm_sl_lock, &skmem_sl_lock_grp);

	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx",
	    skm->skm_name, SK_KVA(skm));

	/* callee releases reference */
	skmem_region_slab_config(skm->skm_region, NULL);
	skm->skm_region = NULL;

#if KASAN
	/* get the original address since we're about to free it */
	void **pbuf = (void **)((intptr_t)skm - sizeof(void *));
	skm = *pbuf;
#endif /* KASAN */

	zfree(skm_zone, skm);
}
923 
924 /*
925  * Create a slab.
926  */
927 static struct skmem_slab *
skmem_slab_create(struct skmem_cache * skm,uint32_t skmflag)928 skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
929 {
930 	struct skmem_region *skr = skm->skm_region;
931 	uint32_t objsize, chunks;
932 	size_t slabsize = skm->skm_slabsize;
933 	struct skmem_slab *sl;
934 	struct sksegment *sg, *sgm;
935 	char *buf, *bufm, *slab, *slabm;
936 
937 	/*
938 	 * Allocate a segment (a slab at our layer) from the region.
939 	 */
940 	slab = skmem_region_alloc(skr, (void **)&slabm, &sg, &sgm, skmflag);
941 	if (slab == NULL) {
942 		goto rg_alloc_failure;
943 	}
944 
945 	if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
946 		goto slab_alloc_failure;
947 	}
948 
949 	ASSERT(sg != NULL);
950 	ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index);
951 
952 	bzero(sl, sizeof(*sl));
953 	sl->sl_cache = skm;
954 	sl->sl_base = buf = slab;
955 	sl->sl_basem = bufm = slabm;
956 	ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
957 	objsize = (uint32_t)skr->skr_c_obj_size;
958 	ASSERT(skm->skm_objsize == objsize);
959 	ASSERT((slabsize / objsize) <= UINT32_MAX);
960 	sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
961 	sl->sl_seg = sg;
962 	sl->sl_segm = sgm;
963 
964 	/*
965 	 * Create one or more buffer control structures for the slab,
966 	 * each one tracking a chunk of raw object from the segment,
967 	 * and insert these into the slab's list of buffer controls.
968 	 */
969 	ASSERT(chunks > 0);
970 	while (chunks != 0) {
971 		struct skmem_bufctl *bc;
972 
973 		bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
974 		if (bc == NULL) {
975 			goto bufctl_alloc_failure;
976 		}
977 
978 		bzero(bc, bc_size);
979 		bc->bc_addr = buf;
980 		bc->bc_addrm = bufm;
981 		bc->bc_slab = sl;
982 		bc->bc_idx = (sl->sl_chunks - chunks);
983 		if (skr->skr_mode & SKR_MODE_SHAREOK) {
984 			bc->bc_flags |= SKMEM_BUFCTL_SHAREOK;
985 		}
986 		SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
987 		bc->bc_lim = objsize;
988 		buf += objsize;
989 		if (bufm != NULL) {
990 			bufm += objsize;
991 		}
992 		--chunks;
993 	}
994 
995 	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
996 	    SK_KVA(skm), SK_KVA(sl));
997 	SK_DF(SK_VERB_MEM_CACHE, "  [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
998 	    SK_KVA(slab), SK_KVA(slab + objsize));
999 
1000 	return sl;
1001 
1002 bufctl_alloc_failure:
1003 	skmem_slab_destroy(skm, sl);
1004 
1005 slab_alloc_failure:
1006 	skmem_region_free(skr, slab, slabm);
1007 
1008 rg_alloc_failure:
1009 	atomic_add_64(&skm->skm_sl_alloc_fail, 1);
1010 
1011 	return NULL;
1012 }
1013 
/*
 * Destroy a slab.
 *
 * Called with sl_refcnt at 0 (no outstanding objects) and without the
 * slab-layer lock held (skmem_slab_free_locked() drops it first).
 * Frees every bufctl on the slab's freelist, the slab structure itself,
 * and finally the backing segment to the region.
 */
static void
skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl)
{
	struct skmem_bufctl *bc, *tbc;
	void *slab = sl->sl_base;
	void *slabm = sl->sl_basem;

	ASSERT(sl->sl_refcnt == 0);

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
	    SK_KVA(skm), SK_KVA(sl));
	SK_DF(SK_VERB_MEM_CACHE, "  [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
	    SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));

	/*
	 * Go through the slab's list of buffer controls and free
	 * them, and then free the slab itself back to its cache.
	 */
	SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
		SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
		skmem_cache_free(skmem_bufctl_cache, bc);
	}
	skmem_cache_free(skmem_slab_cache, sl);

	/* and finally free the segment back to the backing region */
	skmem_region_free(skm->skm_region, slab, slabm);
}
1044 
/*
 * Allocate a raw object from the (locked) slab layer.  Normal region variant.
 *
 * Entered and exited with the slab-layer lock held, although it is
 * dropped and reacquired internally while creating a new slab.  On
 * success fills in oi (and oim for the mirrored/slave object, when one
 * exists) and returns 0.  Returns ENOMEM only for non-sleeping callers;
 * sleeping callers retry with exponential backoff and eventually panic
 * instead of failing.
 */
static int
skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi,
    struct skmem_obj_info *oim, uint32_t skmflag)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;
	uint32_t retries = 0;
	uint64_t boff_total = 0;                /* in usec */
	uint64_t boff = 0;                      /* in msec */
	boolean_t new_slab;
	void *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	/*
	 * A slab is either in a partially-allocated list (at least it has
	 * a free object available), or is in the empty list (everything
	 * has been allocated.)  If we can't find a partially-allocated
	 * slab, then we need to allocate a slab (segment) from the region.
	 */
again:
	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
	if (sl == NULL) {
		uint32_t flags = skmflag;
		boolean_t retry;

		ASSERT(skm->skm_sl_partial == 0);
		SKM_SLAB_UNLOCK(skm);
		if (!(flags & SKMEM_NOSLEEP)) {
			/*
			 * Pick up a random value to start the exponential
			 * backoff, if this is the first round, or if the
			 * current value is over the threshold.  Otherwise,
			 * double the backoff value.
			 */
			if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) {
				read_frandom(&boff, sizeof(boff));
				boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1;
				ASSERT(boff > 0);
			} else if (os_mul_overflow(boff, 2, &boff)) {
				panic_plain("\"%s\": boff counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
			/*
			 * Add this value (in msec) to the total (in usec);
			 * NSEC_PER_USEC (1000) doubles as the msec-to-usec
			 * conversion factor here.
			 */
			if (os_add_overflow(boff_total,
			    (boff * NSEC_PER_USEC), &boff_total)) {
				panic_plain("\"%s\": boff_total counter "
				    "overflows\n", skm->skm_name);
				/* NOTREACHED */
				__builtin_unreachable();
			}
		}
		/*
		 * In the event of a race between multiple threads trying
		 * to create the last remaining (or the only) slab, let the
		 * loser(s) attempt to retry after waiting a bit.  The winner
		 * would have inserted the newly-created slab into the list.
		 */
		if (!(flags & SKMEM_NOSLEEP) &&
		    boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
			retry = TRUE;
			++retries;
			/* allow skmem_slab_create() to fail so we can retry */
			flags |= SKMEM_FAILOK;
		} else {
			/* sleeping mode with exhausted backoff budget: fatal */
			if (!(flags & SKMEM_NOSLEEP)) {
				panic_plain("\"%s\": failed to allocate "
				    "slab (sleeping mode) after %llu "
				    "msec, %u retries\n\n%s", skm->skm_name,
				    (boff_total / NSEC_PER_USEC), retries,
				    skmem_dump(skm->skm_region));
				/* NOTREACHED */
				__builtin_unreachable();
			}
			retry = FALSE;
		}

		/*
		 * Create a new slab.
		 */
		if ((sl = skmem_slab_create(skm, flags)) == NULL) {
			if (retry) {
				SK_ERR("\"%s\": failed to allocate "
				    "slab (%ssleeping mode): waiting for %llu "
				    "msec, total %llu msec, %u retries",
				    skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "",
				    boff, (boff_total / NSEC_PER_USEC), retries);
				VERIFY(boff > 0 && ((uint32_t)boff <=
				    (SKMEM_SLAB_BACKOFF_THRES * 2)));
				/* delay() takes usec; boff is in msec */
				delay((uint32_t)boff * NSEC_PER_USEC);
				SKM_SLAB_LOCK(skm);
				goto again;
			} else {
				SK_RDERR(4, "\"%s\": failed to allocate slab "
				    "(%ssleeping mode)", skm->skm_name,
				    (flags & SKMEM_NOSLEEP) ? "non-" : "");
				SKM_SLAB_LOCK(skm);
			}
			return ENOMEM;
		}

		SKM_SLAB_LOCK(skm);
		skm->skm_sl_create++;
		/* account for the new chunks and track the high watermark */
		if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
		    skm->skm_sl_bufmax) {
			skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
		}
	}
	skm->skm_sl_alloc++;

	/* refcnt 0 means the slab was just created above (all chunks free) */
	new_slab = (sl->sl_refcnt == 0);
	ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl));

	sl->sl_refcnt++;
	ASSERT(sl->sl_refcnt <= sl->sl_chunks);

	/*
	 * We either have a new slab, or a partially-allocated one.
	 * Remove a buffer control from the slab, and insert it to
	 * the allocated-address hash chain.
	 */
	bc = SLIST_FIRST(&sl->sl_head);
	ASSERT(bc != NULL);
	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);

	/* sanity check */
	VERIFY(bc->bc_usecnt == 0);

	/*
	 * Also store the master object's region info for the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf = bc->bc_addr;
	SKMEM_OBJ_BUFCTL(oi) = bc;      /* master only; NULL for slave */
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
	/* region-wide index: segment index scaled by chunks per slab */
	SKMEM_OBJ_IDX_REG(oi) =
	    ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
	/*
	 * And for slave object.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			SKMEM_OBJ_ADDR(oim) = bc->bc_addrm;
			SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
			SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
			SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
		}
	}

	/* batched caches expect a clean linkage field in the object */
	if (skm->skm_mode & SKM_MODE_BATCH) {
		((struct skmem_obj *)buf)->mo_next = NULL;
	}

	/* insert to allocated-address hash chain */
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);

	if (SLIST_EMPTY(&sl->sl_head)) {
		/*
		 * If that was the last buffer control from this slab,
		 * insert the slab into the empty list.  If it was in
		 * the partially-allocated list, then remove the slab
		 * from there as well.
		 */
		ASSERT(sl->sl_refcnt == sl->sl_chunks);
		if (new_slab) {
			/* single-chunk slab goes straight to empty */
			ASSERT(sl->sl_chunks == 1);
		} else {
			ASSERT(sl->sl_chunks > 1);
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		skm->skm_sl_empty++;
		ASSERT(skm->skm_sl_empty != 0);
		TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
	} else {
		/*
		 * The slab is not empty; if it was newly allocated
		 * above, then it's not in the partially-allocated
		 * list and so we insert it there.
		 */
		ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
		if (new_slab) {
			skm->skm_sl_partial++;
			ASSERT(skm->skm_sl_partial != 0);
			TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
			    sl, sl_link);
		}
	}

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	return 0;
}
1254 
/*
 * Allocate a raw object from the (locked) slab layer.  Pseudo region variant.
 *
 * Pseudo regions are backed directly by a zalloc zone (skr_zreg) rather
 * than by segments, so there is no bufctl/hash bookkeeping here.  Fills
 * in oi on success (oim is always zeroed; mirroring does not apply) and
 * returns 0, or ENOMEM if the zone allocation fails.
 */
static int
skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag)
{
	zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
	struct skmem_region *skr = skm->skm_region;
	void *obj, *buf;

	/* this flag is not for the caller to set */
	VERIFY(!(skmflag & SKMEM_FAILOK));

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	/* pseudo regions are zone-backed, never VM-map backed */
	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
	/* mirrored region is not applicable */
	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
	/* batching is not yet supported */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));

	if ((obj = zalloc_flags(skr->skr_zreg, zflags | Z_ZERO)) == NULL) {
		atomic_add_64(&skm->skm_sl_alloc_fail, 1);
		return ENOMEM;
	}

#if KASAN
	/*
	 * Perform some fix-ups since the zone element isn't guaranteed
	 * to be on the aligned boundary.  The effective object size
	 * has been adjusted accordingly by skmem_region_create() earlier
	 * at cache creation time.
	 *
	 * 'buf' becomes the aligned address for this object.
	 */
	buf = (void *)P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t),
	    skm->skm_bufalign);

	/*
	 * Wind back a pointer size from the aligned address and
	 * save the original address so we can free it later.
	 */
	void **pbuf = (void **)((intptr_t)buf - sizeof(void *));
	*pbuf = obj;

	/* the aligned buffer must still fit within the zone element */
	VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
	    ((intptr_t)obj + skm->skm_objsize));
#else /* !KASAN */
	/*
	 * We expect that the zone allocator would allocate elements
	 * rounded up to the requested alignment based on the effective
	 * object size computed in skmem_region_create() earlier, and
	 * 'buf' is therefore the element address itself.
	 */
	buf = obj;
#endif /* !KASAN */

	/* make sure the object is aligned */
	VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));

	/*
	 * Return the object's info to the caller.
	 */
	bzero(oi, sizeof(*oi));
	SKMEM_OBJ_ADDR(oi) = buf;
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
	}

	/* update slab-layer stats and the in-use high watermark */
	skm->skm_sl_alloc++;
	skm->skm_sl_bufinuse++;
	if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
		skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
	}

	return 0;
}
1335 
1336 /*
1337  * Allocate a raw object from the slab layer.
1338  */
1339 static int
skmem_slab_alloc(struct skmem_cache * skm,struct skmem_obj_info * oi,struct skmem_obj_info * oim,uint32_t skmflag)1340 skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi,
1341     struct skmem_obj_info *oim, uint32_t skmflag)
1342 {
1343 	int err;
1344 
1345 	SKM_SLAB_LOCK(skm);
1346 	err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
1347 	SKM_SLAB_UNLOCK(skm);
1348 
1349 	return err;
1350 }
1351 
/*
 * Allocate raw object(s) from the slab layer.
 *
 * Allocates up to 'num' objects under a single slab-lock hold and
 * chains them through mo_next (batching mode only), storing each
 * object's region info inside the object itself.  Returns the number
 * actually allocated, which may be less than 'num' if the slab layer
 * runs dry.  On return *list heads the chain (NULL if none).
 */
static uint32_t
skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
    uint32_t num, uint32_t skmflag)
{
	uint32_t need = num;

	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
	*list = NULL;

	SKM_SLAB_LOCK(skm);
	for (;;) {
		struct skmem_obj_info oi, oim;

		/*
		 * Get a single raw object from the slab layer.
		 */
		if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
			break;
		}

		/* append to the chain; 'list' tracks the tail's mo_next */
		*list = SKMEM_OBJ_ADDR(&oi);
		ASSERT((*list)->mo_next == NULL);
		/* store these inside the object itself */
		(*list)->mo_info = oi;
		(*list)->mo_minfo = oim;
		list = &(*list)->mo_next;

		ASSERT(need != 0);
		if (--need == 0) {
			break;
		}
	}
	SKM_SLAB_UNLOCK(skm);

	/* number of objects actually obtained */
	return num - need;
}
1391 
/*
 * Free a raw object to the (locked) slab layer.  Normal region variant.
 *
 * Looks up the object's bufctl in the allocated-address hash (panicking
 * on a bogus or double free), returns the bufctl to its slab's freelist,
 * and maintains the empty/partial slab lists.  When the slab's last
 * outstanding object comes back, the whole slab (segment) is destroyed
 * and returned to the region.  The slab lock is held on entry and exit,
 * but is dropped briefly around skmem_slab_destroy().
 */
static void
skmem_slab_free_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_bufctl *bc, *tbc;
	struct skmem_bufctl_bkt *bcb;
	struct skmem_slab *sl = NULL;

	SKM_SLAB_LOCK_ASSERT_HELD(skm);
	ASSERT(buf != NULL);
	/* caller is expected to clear mo_next */
	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
	    ((struct skmem_obj *)buf)->mo_next == NULL);

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address.  If found, remove the buffer control from
	 * the hash chain and insert it into the freelist.  Otherwise, we
	 * panic since the caller has given us a bogus address.
	 */
	skm->skm_sl_free++;
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
		if (bc->bc_addr == buf) {
			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
			sl = bc->bc_slab;
			break;
		}
	}

	if (bc == NULL) {
		panic("%s: attempt to free invalid or already-freed obj %p "
		    "on skm %p", __func__, buf, skm);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	ASSERT(sl != NULL && sl->sl_cache == skm);
	VERIFY(SKMEM_SLAB_MEMBER(sl, buf));

	/* make sure this object is not currently in use by another object */
	VERIFY(bc->bc_usecnt == 0);

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_bufctl(bc);
	}

	/* if clear on free is requested, zero out the object */
	if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
		bzero(buf, skm->skm_objsize);
	}

	/* insert the buffer control to the slab's freelist */
	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);

	ASSERT(sl->sl_refcnt >= 1);
	if (--sl->sl_refcnt == 0) {
		/*
		 * If this was the last outstanding object for the slab,
		 * remove the slab from the partially-allocated or empty
		 * list, and destroy the slab (segment) back to the region.
		 */
		if (sl->sl_chunks == 1) {
			/* single-chunk slabs live on the empty list */
			ASSERT(skm->skm_sl_empty > 0);
			skm->skm_sl_empty--;
			TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		} else {
			ASSERT(skm->skm_sl_partial > 0);
			skm->skm_sl_partial--;
			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
		}
		ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0);
		skm->skm_sl_bufinuse -= sl->sl_chunks;
		skm->skm_sl_destroy++;
		/* drop the lock across the (potentially slow) teardown */
		SKM_SLAB_UNLOCK(skm);
		skmem_slab_destroy(skm, sl);
		SKM_SLAB_LOCK(skm);
		return;
	}

	ASSERT(bc == SLIST_FIRST(&sl->sl_head));
	if (SLIST_NEXT(bc, bc_link) == NULL) {
		/*
		 * If this is the first (potentially amongst many) object
		 * that's returned to the slab, remove the slab from the
		 * empty list and insert to end of the partially-allocated
		 * list. This should help avoid thrashing the partial slab
		 * since we avoid disturbing what's already at the front.
		 */
		ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1));
		ASSERT(sl->sl_chunks > 1);
		ASSERT(skm->skm_sl_empty > 0);
		skm->skm_sl_empty--;
		TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
		skm->skm_sl_partial++;
		ASSERT(skm->skm_sl_partial != 0);
		TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
	}
}
1493 
/*
 * Free a raw object to the (locked) slab layer.  Pseudo region variant.
 *
 * Pseudo regions are zone-backed, so the object is simply handed back
 * to zalloc.  In KASAN builds the original (unaligned) zone element
 * address stashed by skmem_slab_alloc_pseudo_locked() is recovered
 * first, since that is what must be passed to zfree().
 */
static void
skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf)
{
	struct skmem_region *skr = skm->skm_region;
	void *obj = buf;

	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);

	SKM_SLAB_LOCK_ASSERT_HELD(skm);

	VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));

#if KASAN
	/*
	 * Since we stuffed the original zone element address before
	 * the buffer address in KASAN mode, get it back since we're
	 * about to free it.
	 */
	void **pbuf = (void **)((intptr_t)obj - sizeof(void *));

	/* sanity: the aligned buffer fits within the original element */
	VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
	    ((intptr_t)*pbuf + skm->skm_objsize));

	obj = *pbuf;
#endif /* KASAN */

	/* free it to zone */
	zfree(skr->skr_zreg, obj);

	/* update slab-layer stats */
	skm->skm_sl_free++;
	ASSERT(skm->skm_sl_bufinuse > 0);
	skm->skm_sl_bufinuse--;
}
1530 
1531 /*
1532  * Free a raw object to the slab layer.
1533  */
1534 static void
skmem_slab_free(struct skmem_cache * skm,void * buf)1535 skmem_slab_free(struct skmem_cache *skm, void *buf)
1536 {
1537 	if (skm->skm_mode & SKM_MODE_BATCH) {
1538 		((struct skmem_obj *)buf)->mo_next = NULL;
1539 	}
1540 
1541 	SKM_SLAB_LOCK(skm);
1542 	skm->skm_slab_free(skm, buf);
1543 	SKM_SLAB_UNLOCK(skm);
1544 }
1545 
1546 /*
1547  * Free raw object(s) to the slab layer.
1548  */
1549 static void
skmem_slab_batch_free(struct skmem_cache * skm,struct skmem_obj * list)1550 skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
1551 {
1552 	struct skmem_obj *listn;
1553 
1554 	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
1555 
1556 	SKM_SLAB_LOCK(skm);
1557 	for (;;) {
1558 		listn = list->mo_next;
1559 		list->mo_next = NULL;
1560 
1561 		/*
1562 		 * Free a single object to the slab layer.
1563 		 */
1564 		skm->skm_slab_free(skm, (void *)list);
1565 
1566 		/* if no more objects to free, we're done */
1567 		if ((list = listn) == NULL) {
1568 			break;
1569 		}
1570 	}
1571 	SKM_SLAB_UNLOCK(skm);
1572 }
1573 
/*
 * Return the object's region info.
 *
 * Looks up the bufctl for 'buf' in the allocated-address hash (the
 * object must be currently allocated; otherwise this panics) and fills
 * in oi with the master object's address, bufctl, size and indices.
 * If oim is non-NULL it is zeroed and, when a mirrored (slave) buffer
 * exists, filled in with the slave's info.
 */
void
skmem_cache_get_obj_info(struct skmem_cache *skm, void *buf,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address.  If not found, panic since the caller has
	 * given us a bogus address.
	 */
	SKM_SLAB_LOCK(skm);
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
		if (bc->bc_addr == buf) {
			break;
		}
	}

	if (__improbable(bc == NULL)) {
		panic("%s: %s failed to get object info for %p",
		    __func__, skm->skm_name, buf);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * Return the master object's info to the caller.
	 */
	sl = bc->bc_slab;
	SKMEM_OBJ_ADDR(oi) = bc->bc_addr;
	SKMEM_OBJ_BUFCTL(oi) = bc;      /* master only; NULL for slave */
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
	/* region-wide index: segment index scaled by chunks per slab */
	SKMEM_OBJ_IDX_REG(oi) =
	    (sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx;
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
	/*
	 * And for slave object.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			SKMEM_OBJ_ADDR(oim) = bc->bc_addrm;
			SKMEM_OBJ_SIZE(oim) = oi->oi_size;
			SKMEM_OBJ_IDX_REG(oim) = oi->oi_idx_reg;
			SKMEM_OBJ_IDX_SEG(oim) = oi->oi_idx_seg;
		}
	}
	SKM_SLAB_UNLOCK(skm);
}
1630 
1631 /*
1632  * Magazine constructor.
1633  */
1634 static int
skmem_magazine_ctor(struct skmem_obj_info * oi,struct skmem_obj_info * oim,void * arg,uint32_t skmflag)1635 skmem_magazine_ctor(struct skmem_obj_info *oi, struct skmem_obj_info *oim,
1636     void *arg, uint32_t skmflag)
1637 {
1638 #pragma unused(oim, skmflag)
1639 	struct skmem_mag *mg = SKMEM_OBJ_ADDR(oi);
1640 
1641 	ASSERT(oim == NULL);
1642 	ASSERT(arg != NULL);
1643 
1644 	/*
1645 	 * Store it in the magazine object since we'll
1646 	 * need to refer to it during magazine destroy;
1647 	 * we can't safely refer to skm_magtype as the
1648 	 * depot lock may not be acquired then.
1649 	 */
1650 	mg->mg_magtype = arg;
1651 
1652 	return 0;
1653 }
1654 
/*
 * Destroy a magazine (free each object to the slab layer).
 *
 * Runs the cache destructor on each of the 'nrounds' objects held in
 * the magazine, frees them back to the slab layer (individually in
 * non-batching mode, or as one linked chain in batching mode), then
 * returns the magazine itself to its magtype's cache.
 */
static void
skmem_magazine_destroy(struct skmem_cache *skm, struct skmem_mag *mg,
    int nrounds)
{
	int round;

	for (round = 0; round < nrounds; round++) {
		void *buf = mg->mg_round[round];
		struct skmem_obj *next;

		/* detach batch linkage before running the destructor */
		if (skm->skm_mode & SKM_MODE_BATCH) {
			next = ((struct skmem_obj *)buf)->mo_next;
			((struct skmem_obj *)buf)->mo_next = NULL;
		}

		/* deconstruct the object */
		if (skm->skm_dtor != NULL) {
			skm->skm_dtor(buf, skm->skm_private);
		}

		/*
		 * In non-batching mode, each object in the magazine has
		 * no linkage to its neighbor, so free individual object
		 * to the slab layer now.
		 */
		if (!(skm->skm_mode & SKM_MODE_BATCH)) {
			skmem_slab_free(skm, buf);
		} else {
			/* restore the linkage for the batch free below */
			((struct skmem_obj *)buf)->mo_next = next;
		}
	}

	/*
	 * In batching mode, each object is linked to its neighbor at free
	 * time, and so take the bottom-most object and free it to the slab
	 * layer.  Because of the way the list is reversed during free, this
	 * will bring along the rest of objects above it.
	 */
	if (nrounds > 0 && (skm->skm_mode & SKM_MODE_BATCH)) {
		skmem_slab_batch_free(skm, mg->mg_round[nrounds - 1]);
	}

	/* free the magazine itself back to cache */
	skmem_cache_free(mg->mg_magtype->mt_cache, mg);
}
1703 
/*
 * Get one or more magazines from the depot.
 *
 * Moves up to 'num' magazines from the maglist 'ml' onto *list (a
 * singly-linked chain through mg_link), decrementing *count by the
 * number taken and updating the maglist's working-set minimum.
 * Returns how many magazines were actually obtained.
 */
static uint32_t
skmem_depot_batch_alloc(struct skmem_cache *skm, struct skmem_maglist *ml,
    uint32_t *count, struct skmem_mag **list, uint32_t num)
{
	SLIST_HEAD(, skmem_mag) mg_list = SLIST_HEAD_INITIALIZER(mg_list);
	struct skmem_mag *mg;
	uint32_t need = num, c = 0;

	ASSERT(list != NULL && need > 0);

	if (!SKM_DEPOT_LOCK_TRY(skm)) {
		/*
		 * Track the amount of lock contention here; if the contention
		 * level is high (more than skmem_cache_depot_contention per a
		 * given skmem_cache_update_interval interval), then we treat
		 * it as a sign that the per-CPU layer is not using the right
		 * magazine type, and that we'd need to resize it.
		 */
		SKM_DEPOT_LOCK(skm);
		if (skm->skm_mode & SKM_MODE_DYNAMIC) {
			skm->skm_depot_contention++;
		}
	}

	while ((mg = SLIST_FIRST(&ml->ml_list)) != NULL) {
		SLIST_REMOVE_HEAD(&ml->ml_list, mg_link);
		SLIST_INSERT_HEAD(&mg_list, mg, mg_link);
		ASSERT(ml->ml_total != 0);
		/* ml_min tracks the working-set low watermark for reaping */
		if (--ml->ml_total < ml->ml_min) {
			ml->ml_min = ml->ml_total;
		}
		c++;
		ml->ml_alloc++;
		if (--need == 0) {
			break;
		}
	}
	*count -= c;

	SKM_DEPOT_UNLOCK(skm);

	/* hand the chain of magazines (possibly empty) to the caller */
	*list = SLIST_FIRST(&mg_list);

	return num - need;
}
1752 
1753 /*
1754  * Return one or more magazines to the depot.
1755  */
1756 static void
skmem_depot_batch_free(struct skmem_cache * skm,struct skmem_maglist * ml,uint32_t * count,struct skmem_mag * mg)1757 skmem_depot_batch_free(struct skmem_cache *skm, struct skmem_maglist *ml,
1758     uint32_t *count, struct skmem_mag *mg)
1759 {
1760 	struct skmem_mag *nmg;
1761 	uint32_t c = 0;
1762 
1763 	SKM_DEPOT_LOCK(skm);
1764 	while (mg != NULL) {
1765 		nmg = SLIST_NEXT(mg, mg_link);
1766 		SLIST_INSERT_HEAD(&ml->ml_list, mg, mg_link);
1767 		ml->ml_total++;
1768 		c++;
1769 		mg = nmg;
1770 	}
1771 	*count += c;
1772 	SKM_DEPOT_UNLOCK(skm);
1773 }
1774 
1775 /*
1776  * Update the depot's working state statistics.
1777  */
1778 static void
skmem_depot_ws_update(struct skmem_cache * skm)1779 skmem_depot_ws_update(struct skmem_cache *skm)
1780 {
1781 	SKM_DEPOT_LOCK_SPIN(skm);
1782 	skm->skm_full.ml_reaplimit = skm->skm_full.ml_min;
1783 	skm->skm_full.ml_min = skm->skm_full.ml_total;
1784 	skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_min;
1785 	skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1786 	SKM_DEPOT_UNLOCK(skm);
1787 }
1788 
1789 /*
1790  * Empty the depot's working state statistics (everything's reapable.)
1791  */
1792 static void
skmem_depot_ws_zero(struct skmem_cache * skm)1793 skmem_depot_ws_zero(struct skmem_cache *skm)
1794 {
1795 	SKM_DEPOT_LOCK_SPIN(skm);
1796 	if (skm->skm_full.ml_reaplimit != skm->skm_full.ml_total ||
1797 	    skm->skm_full.ml_min != skm->skm_full.ml_total ||
1798 	    skm->skm_empty.ml_reaplimit != skm->skm_empty.ml_total ||
1799 	    skm->skm_empty.ml_min != skm->skm_empty.ml_total) {
1800 		skm->skm_full.ml_reaplimit = skm->skm_full.ml_total;
1801 		skm->skm_full.ml_min = skm->skm_full.ml_total;
1802 		skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_total;
1803 		skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1804 		skm->skm_depot_ws_zero++;
1805 	}
1806 	SKM_DEPOT_UNLOCK(skm);
1807 }
1808 
1809 /*
1810  * Reap magazines that's outside of the working set.
1811  */
1812 static void
skmem_depot_ws_reap(struct skmem_cache * skm)1813 skmem_depot_ws_reap(struct skmem_cache *skm)
1814 {
1815 	struct skmem_mag *mg, *nmg;
1816 	uint32_t f, e, reap;
1817 
1818 	reap = f = MIN(skm->skm_full.ml_reaplimit, skm->skm_full.ml_min);
1819 	if (reap != 0) {
1820 		(void) skmem_depot_batch_alloc(skm, &skm->skm_full,
1821 		    &skm->skm_depot_full, &mg, reap);
1822 		while (mg != NULL) {
1823 			nmg = SLIST_NEXT(mg, mg_link);
1824 			SLIST_NEXT(mg, mg_link) = NULL;
1825 			skmem_magazine_destroy(skm, mg,
1826 			    mg->mg_magtype->mt_magsize);
1827 			mg = nmg;
1828 		}
1829 	}
1830 
1831 	reap = e = MIN(skm->skm_empty.ml_reaplimit, skm->skm_empty.ml_min);
1832 	if (reap != 0) {
1833 		(void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
1834 		    &skm->skm_depot_empty, &mg, reap);
1835 		while (mg != NULL) {
1836 			nmg = SLIST_NEXT(mg, mg_link);
1837 			SLIST_NEXT(mg, mg_link) = NULL;
1838 			skmem_magazine_destroy(skm, mg, 0);
1839 			mg = nmg;
1840 		}
1841 	}
1842 
1843 	if (f != 0 || e != 0) {
1844 		atomic_add_32(&skm->skm_cpu_mag_reap, 1);
1845 	}
1846 }
1847 
1848 /*
1849  * Performs periodic maintenance on a cache.  This is serialized
1850  * through the update thread call, and so we guarantee there's at
1851  * most one update episode in the system at any given time.
1852  */
1853 static void
skmem_cache_update(struct skmem_cache * skm,uint32_t arg)1854 skmem_cache_update(struct skmem_cache *skm, uint32_t arg)
1855 {
1856 #pragma unused(arg)
1857 	boolean_t resize_mag = FALSE;
1858 	boolean_t rescale_hash = FALSE;
1859 
1860 	SKMEM_CACHE_LOCK_ASSERT_HELD();
1861 
1862 	/* insist that we are executing in the update thread call context */
1863 	ASSERT(sk_is_cache_update_protected());
1864 
1865 	/*
1866 	 * If the cache has become much larger or smaller than the
1867 	 * allocated-address hash table, rescale the hash table.
1868 	 */
1869 	SKM_SLAB_LOCK(skm);
1870 	if ((skm->skm_sl_bufinuse > (skm->skm_hash_mask << 1) &&
1871 	    (skm->skm_hash_mask + 1) < skm->skm_hash_limit) ||
1872 	    (skm->skm_sl_bufinuse < (skm->skm_hash_mask >> 1) &&
1873 	    skm->skm_hash_mask > skm->skm_hash_initial)) {
1874 		rescale_hash = TRUE;
1875 	}
1876 	SKM_SLAB_UNLOCK(skm);
1877 
1878 	/*
1879 	 * Update the working set.
1880 	 */
1881 	skmem_depot_ws_update(skm);
1882 
1883 	/*
1884 	 * If the contention count is greater than the threshold during
1885 	 * the update interval, and if we are not already at the maximum
1886 	 * magazine size, increase it.
1887 	 */
1888 	SKM_DEPOT_LOCK_SPIN(skm);
1889 	if (skm->skm_chunksize < skm->skm_magtype->mt_maxbuf &&
1890 	    (int)(skm->skm_depot_contention - skm->skm_depot_contention_prev) >
1891 	    skmem_cache_depot_contention) {
1892 		ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);
1893 		resize_mag = TRUE;
1894 	}
1895 	skm->skm_depot_contention_prev = skm->skm_depot_contention;
1896 	SKM_DEPOT_UNLOCK(skm);
1897 
1898 	if (rescale_hash) {
1899 		skmem_cache_hash_rescale(skm);
1900 	}
1901 
1902 	if (resize_mag) {
1903 		skmem_cache_magazine_resize(skm);
1904 	}
1905 }
1906 
1907 /*
1908  * Reload the CPU's magazines with mg and its follower (if any).
1909  */
1910 static void
skmem_cpu_batch_reload(struct skmem_cpu_cache * cp,struct skmem_mag * mg,int rounds)1911 skmem_cpu_batch_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg,
1912     int rounds)
1913 {
1914 	ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
1915 	    (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1916 	ASSERT(cp->cp_magsize > 0);
1917 
1918 	cp->cp_loaded = mg;
1919 	cp->cp_rounds = rounds;
1920 	if (__probable(SLIST_NEXT(mg, mg_link) != NULL)) {
1921 		cp->cp_ploaded = SLIST_NEXT(mg, mg_link);
1922 		cp->cp_prounds = rounds;
1923 		SLIST_NEXT(mg, mg_link) = NULL;
1924 	} else {
1925 		ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
1926 		cp->cp_ploaded = NULL;
1927 		cp->cp_prounds = -1;
1928 	}
1929 }
1930 
1931 /*
1932  * Reload the CPU's magazine with mg and save the previous one.
1933  */
1934 static void
skmem_cpu_reload(struct skmem_cpu_cache * cp,struct skmem_mag * mg,int rounds)1935 skmem_cpu_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg, int rounds)
1936 {
1937 	ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
1938 	    (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1939 	ASSERT(cp->cp_magsize > 0);
1940 
1941 	cp->cp_ploaded = cp->cp_loaded;
1942 	cp->cp_prounds = cp->cp_rounds;
1943 	cp->cp_loaded = mg;
1944 	cp->cp_rounds = rounds;
1945 }
1946 
1947 /*
1948  * Allocate a constructed object from the cache.
1949  */
1950 void *
skmem_cache_alloc(struct skmem_cache * skm,uint32_t skmflag)1951 skmem_cache_alloc(struct skmem_cache *skm, uint32_t skmflag)
1952 {
1953 	struct skmem_obj *buf;
1954 
1955 	(void) skmem_cache_batch_alloc(skm, &buf, 1, skmflag);
1956 	return buf;
1957 }
1958 
/*
 * Allocate constructed object(s) from the cache.
 *
 * Stores up to "num" constructed objects in *list (chained together via
 * mo_next when SKM_MODE_BATCH is set) and returns the number actually
 * allocated, which may be less than "num" if the slab layer cannot
 * satisfy the remainder.  Objects are satisfied from the per-CPU
 * magazine layer first, then from full magazines in the depot, and
 * finally constructed fresh from the slab layer.
 */
uint32_t
skmem_cache_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
    uint32_t num, uint32_t skmflag)
{
	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
	struct skmem_obj **top = &(*list);
	struct skmem_mag *mg;
	uint32_t need = num;

	ASSERT(list != NULL);
	*list = NULL;

	if (need == 0) {
		return 0;
	}
	ASSERT(need == 1 || (skm->skm_mode & SKM_MODE_BATCH));

	SKM_CPU_LOCK(cp);
	for (;;) {
		/*
		 * If we have an object in the current CPU's loaded
		 * magazine, return it and we're done.
		 */
		if (cp->cp_rounds > 0) {
			int objs = MIN((unsigned int)cp->cp_rounds, need);
			/*
			 * In the SKM_MODE_BATCH case, objects are already
			 * linked together with the most recently freed object
			 * at the head of the list; grab as many objects as we
			 * can.  Otherwise we'll just grab 1 object at most.
			 */
			*list = cp->cp_loaded->mg_round[cp->cp_rounds - 1];
			cp->cp_rounds -= objs;
			cp->cp_alloc += objs;

			if (skm->skm_mode & SKM_MODE_BATCH) {
				/* terminate the chain at the last object taken */
				struct skmem_obj *tail =
				    cp->cp_loaded->mg_round[cp->cp_rounds];
				list = &tail->mo_next;
				*list = NULL;
			}

			/* if we got them all, return to caller */
			if ((need -= objs) == 0) {
				SKM_CPU_UNLOCK(cp);
				goto done;
			}
		}

		/*
		 * The CPU's loaded magazine is empty.  If the previously
		 * loaded magazine was full, exchange and try again.
		 */
		if (cp->cp_prounds > 0) {
			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
			continue;
		}

		/*
		 * If the magazine layer is disabled, allocate from slab.
		 * This can happen either because SKM_MODE_NOMAGAZINES is
		 * set, or because we are resizing the magazine now.
		 */
		if (cp->cp_magsize == 0) {
			break;
		}

		/*
		 * Both of the CPU's magazines are empty; try to get
		 * full magazine(s) from the depot layer.  Upon success,
		 * reload and try again.  To prevent potential thrashing,
		 * replace both empty magazines only if the requested
		 * count exceeds a magazine's worth of objects.
		 */
		(void) skmem_depot_batch_alloc(skm, &skm->skm_full,
		    &skm->skm_depot_full, &mg, (need <= cp->cp_magsize) ? 1 : 2);
		if (mg != NULL) {
			SLIST_HEAD(, skmem_mag) mg_list =
			    SLIST_HEAD_INITIALIZER(mg_list);

			/* collect displaced empty magazines for the depot */
			if (cp->cp_ploaded != NULL) {
				SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
				    mg_link);
			}
			if (SLIST_NEXT(mg, mg_link) == NULL) {
				/*
				 * Depot allocation returns only 1 magazine;
				 * retain current empty magazine.
				 */
				skmem_cpu_reload(cp, mg, cp->cp_magsize);
			} else {
				/*
				 * We got 2 full magazines from depot;
				 * release the current empty magazine
				 * back to the depot layer.
				 */
				if (cp->cp_loaded != NULL) {
					SLIST_INSERT_HEAD(&mg_list,
					    cp->cp_loaded, mg_link);
				}
				skmem_cpu_batch_reload(cp, mg, cp->cp_magsize);
			}
			skmem_depot_batch_free(skm, &skm->skm_empty,
			    &skm->skm_depot_empty, SLIST_FIRST(&mg_list));
			continue;
		}

		/*
		 * The depot layer doesn't have any full magazines;
		 * allocate directly from the slab layer.
		 */
		break;
	}
	SKM_CPU_UNLOCK(cp);

	if (__probable(num > 1 && (skm->skm_mode & SKM_MODE_BATCH) != 0)) {
		struct skmem_obj *rtop, *rlist, *rlistp = NULL;
		uint32_t rlistc, c = 0;

		/*
		 * Get a list of raw objects from the slab layer.
		 */
		rlistc = skmem_slab_batch_alloc(skm, &rlist, need, skmflag);
		ASSERT(rlistc == 0 || rlist != NULL);
		rtop = rlist;

		/*
		 * Construct each object in the raw list.  Upon failure,
		 * free any remaining objects in the list back to the slab
		 * layer, and keep the ones that were successfully constructed.
		 * Here, "oi" and "oim" in each skmem_obj refer to the objects
		 * coming from the master and slave regions (on mirrored
		 * regions), respectively.  They are stored inside the object
		 * temporarily so that we can pass them to the constructor.
		 */
		while (skm->skm_ctor != NULL && rlist != NULL) {
			struct skmem_obj_info *oi = &rlist->mo_info;
			struct skmem_obj_info *oim = &rlist->mo_minfo;
			struct skmem_obj *rlistn = rlist->mo_next;

			/*
			 * Note that the constructor guarantees at least
			 * the size of a pointer at the top of the object
			 * and no more than that.  That means we must not
			 * refer to "oi" and "oim" any longer after the
			 * object goes thru the constructor.
			 */
			if (skm->skm_ctor(oi, ((SKMEM_OBJ_ADDR(oim) != NULL) ?
			    oim : NULL), skm->skm_private, skmflag) != 0) {
				VERIFY(rlist->mo_next == rlistn);
				atomic_add_64(&skm->skm_sl_alloc_fail,
				    rlistc - c);
				/* detach constructed prefix from failed tail */
				if (rlistp != NULL) {
					rlistp->mo_next = NULL;
				}
				if (rlist == rtop) {
					rtop = NULL;
					ASSERT(c == 0);
				}
				skmem_slab_batch_free(skm, rlist);
				rlist = NULL;
				rlistc = c;
				break;
			}
			VERIFY(rlist->mo_next == rlistn);

			++c;                    /* # of constructed objs */
			rlistp = rlist;
			if ((rlist = rlist->mo_next) == NULL) {
				ASSERT(rlistc == c);
				break;
			}
		}

		/*
		 * At this point "top" points to the head of the chain we're
		 * going to return to caller; "list" points to the tail of that
		 * chain.  The second chain begins at "rtop", and we append
		 * that after "list" to form a single chain.  "rlistc" is the
		 * number of objects in "rtop" originated from the slab layer
		 * that have been successfully constructed (if applicable).
		 */
		ASSERT(c == 0 || rtop != NULL);
		need -= rlistc;
		*list = rtop;
	} else {
		struct skmem_obj_info oi, oim;
		void *buf;

		ASSERT(*top == NULL && num == 1 && need == 1);

		/*
		 * Get a single raw object from the slab layer.
		 */
		if (skmem_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
			goto done;
		}

		buf = SKMEM_OBJ_ADDR(&oi);
		ASSERT(buf != NULL);

		/*
		 * Construct the raw object.  Here, "oi" and "oim" refer to
		 * the objects coming from the master and slave regions (on
		 * mirrored regions), respectively.
		 */
		if (skm->skm_ctor != NULL &&
		    skm->skm_ctor(&oi, ((SKMEM_OBJ_ADDR(&oim) != NULL) ?
		    &oim : NULL), skm->skm_private, skmflag) != 0) {
			atomic_add_64(&skm->skm_sl_alloc_fail, 1);
			skmem_slab_free(skm, buf);
			goto done;
		}

		need = 0;
		*list = buf;
		ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
		    (*list)->mo_next == NULL);
	}

done:
	/* if auditing is enabled, record this transaction */
	if (__improbable(*top != NULL &&
	    (skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_buf(skm, *top);
	}

	return num - need;
}
2191 
2192 /*
2193  * Free a constructed object to the cache.
2194  */
2195 void
skmem_cache_free(struct skmem_cache * skm,void * buf)2196 skmem_cache_free(struct skmem_cache *skm, void *buf)
2197 {
2198 	if (skm->skm_mode & SKM_MODE_BATCH) {
2199 		((struct skmem_obj *)buf)->mo_next = NULL;
2200 	}
2201 	skmem_cache_batch_free(skm, (struct skmem_obj *)buf);
2202 }
2203 
/*
 * Free constructed object(s) to the cache.
 *
 * "list" is a single object, or (in SKM_MODE_BATCH mode) a chain of
 * objects linked via mo_next.  Objects are placed into the per-CPU
 * magazine layer when possible; when both CPU magazines are full we
 * obtain (or allocate) empty magazines from/for the depot.  Objects
 * that cannot be cached are deconstructed and freed to the slab layer.
 */
void
skmem_cache_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
{
	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
	struct skmem_magtype *mtp;
	struct skmem_mag *mg;
	struct skmem_obj *listn;

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_buf(skm, list);
	}

	SKM_CPU_LOCK(cp);
	for (;;) {
		/*
		 * If there's an available space in the current CPU's
		 * loaded magazine, place it there and we're done.
		 */
		if ((unsigned int)cp->cp_rounds <
		    (unsigned int)cp->cp_magsize) {
			/*
			 * In the SKM_MODE_BATCH case, reverse the list
			 * while we place each object into the magazine;
			 * this effectively causes the most recently
			 * freed object to be reused during allocation.
			 */
			if (skm->skm_mode & SKM_MODE_BATCH) {
				listn = list->mo_next;
				list->mo_next = (cp->cp_rounds == 0) ? NULL :
				    cp->cp_loaded->mg_round[cp->cp_rounds - 1];
			} else {
				listn = NULL;
			}

			cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
			cp->cp_free++;

			if ((list = listn) != NULL) {
				continue;
			}

			SKM_CPU_UNLOCK(cp);
			return;
		}

		/*
		 * The loaded magazine is full.  If the previously
		 * loaded magazine was empty, exchange and try again.
		 */
		if (cp->cp_prounds == 0) {
			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
			continue;
		}

		/*
		 * If the magazine layer is disabled, free to slab.
		 * This can happen either because SKM_MODE_NOMAGAZINES
		 * is set, or because we are resizing the magazine now.
		 */
		if (cp->cp_magsize == 0) {
			break;
		}

		/*
		 * Both magazines for the CPU are full; try to get
		 * empty magazine(s) from the depot.  If we get one,
		 * exchange a full magazine with it and place the
		 * object in there.
		 *
		 * TODO: Because the caller currently doesn't indicate
		 * the number of objects in the list, we choose the more
		 * conservative approach of allocating only 1 empty
		 * magazine (to prevent potential thrashing).  Once we
		 * have the object count, we can replace 1 with similar
		 * logic as used in skmem_cache_batch_alloc().
		 */
		(void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
		    &skm->skm_depot_empty, &mg, 1);
		if (mg != NULL) {
			SLIST_HEAD(, skmem_mag) mg_list =
			    SLIST_HEAD_INITIALIZER(mg_list);

			/* collect displaced full magazines for the depot */
			if (cp->cp_ploaded != NULL) {
				SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
				    mg_link);
			}
			if (SLIST_NEXT(mg, mg_link) == NULL) {
				/*
				 * Depot allocation returns only 1 magazine;
				 * retain current full magazine.
				 */
				skmem_cpu_reload(cp, mg, 0);
			} else {
				/*
				 * We got 2 empty magazines from depot;
				 * release the current full magazine back
				 * to the depot layer.
				 */
				if (cp->cp_loaded != NULL) {
					SLIST_INSERT_HEAD(&mg_list,
					    cp->cp_loaded, mg_link);
				}
				skmem_cpu_batch_reload(cp, mg, 0);
			}
			skmem_depot_batch_free(skm, &skm->skm_full,
			    &skm->skm_depot_full, SLIST_FIRST(&mg_list));
			continue;
		}

		/*
		 * We can't get any empty magazine from the depot, and
		 * so we need to allocate one.  If the allocation fails,
		 * just fall through, deconstruct and free the object
		 * to the slab layer.
		 */
		mtp = skm->skm_magtype;
		SKM_CPU_UNLOCK(cp);
		mg = skmem_cache_alloc(mtp->mt_cache, SKMEM_NOSLEEP);
		SKM_CPU_LOCK(cp);

		if (mg != NULL) {
			/*
			 * We allocated an empty magazine, but since we
			 * dropped the CPU lock above the magazine size
			 * may have changed.  If that's the case free
			 * the magazine and try again.
			 */
			if (cp->cp_magsize != mtp->mt_magsize) {
				SKM_CPU_UNLOCK(cp);
				skmem_cache_free(mtp->mt_cache, mg);
				SKM_CPU_LOCK(cp);
				continue;
			}

			/*
			 * We have a magazine with the right size;
			 * add it to the depot and try again.
			 */
			ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
			skmem_depot_batch_free(skm, &skm->skm_empty,
			    &skm->skm_depot_empty, mg);
			continue;
		}

		/*
		 * We can't get an empty magazine, so free to slab.
		 */
		break;
	}
	SKM_CPU_UNLOCK(cp);

	/*
	 * We weren't able to free the constructed object(s) to the
	 * magazine layer, so deconstruct them and free to the slab.
	 */
	if (__probable((skm->skm_mode & SKM_MODE_BATCH) &&
	    list->mo_next != NULL)) {
		/* whatever is left from original list */
		struct skmem_obj *top = list;

		/*
		 * NOTE(review): the inner skm_dtor NULL check below is
		 * redundant given the loop condition; and when skm_dtor
		 * is NULL the loop is skipped entirely and the whole
		 * chain is freed undeconstructed, which is intended.
		 */
		while (list != NULL && skm->skm_dtor != NULL) {
			listn = list->mo_next;
			list->mo_next = NULL;

			/* deconstruct the object */
			if (skm->skm_dtor != NULL) {
				skm->skm_dtor((void *)list, skm->skm_private);
			}

			list->mo_next = listn;
			list = listn;
		}

		skmem_slab_batch_free(skm, top);
	} else {
		/* deconstruct the object */
		if (skm->skm_dtor != NULL) {
			skm->skm_dtor((void *)list, skm->skm_private);
		}

		skmem_slab_free(skm, (void *)list);
	}
}
2388 
2389 /*
2390  * Return the maximum number of objects cached at the magazine layer
2391  * based on the chunk size.  This takes into account the starting
2392  * magazine type as well as the final magazine type used in resizing.
2393  */
2394 uint32_t
skmem_cache_magazine_max(uint32_t chunksize)2395 skmem_cache_magazine_max(uint32_t chunksize)
2396 {
2397 	struct skmem_magtype *mtp;
2398 	uint32_t magsize_max;
2399 
2400 	VERIFY(ncpu != 0);
2401 	VERIFY(chunksize > 0);
2402 
2403 	/* find a suitable magazine type for this chunk size */
2404 	for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
2405 		continue;
2406 	}
2407 
2408 	/* and find the last magazine type  */
2409 	for (;;) {
2410 		magsize_max = mtp->mt_magsize;
2411 		if (mtp == skmem_cache_magsize_last ||
2412 		    chunksize >= mtp->mt_maxbuf) {
2413 			break;
2414 		}
2415 		++mtp;
2416 		VERIFY(mtp <= skmem_cache_magsize_last);
2417 	}
2418 
2419 	return ncpu * magsize_max * 2; /* two magazines per CPU */
2420 }
2421 
2422 /*
2423  * Return true if SKMEM_DEBUG_NOMAGAZINES is not set on skmem_debug.
2424  */
2425 boolean_t
skmem_allow_magazines(void)2426 skmem_allow_magazines(void)
2427 {
2428 	return !(skmem_debug & SKMEM_DEBUG_NOMAGAZINES);
2429 }
2430 
2431 /*
2432  * Purge all magazines from a cache and disable its per-CPU magazines layer.
2433  */
2434 static void
skmem_cache_magazine_purge(struct skmem_cache * skm)2435 skmem_cache_magazine_purge(struct skmem_cache *skm)
2436 {
2437 	struct skmem_cpu_cache *cp;
2438 	struct skmem_mag *mg, *pmg;
2439 	int rounds, prounds;
2440 	uint32_t cpuid, mg_cnt = 0, pmg_cnt = 0;
2441 
2442 	SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);
2443 
2444 	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx", SK_KVA(skm));
2445 
2446 	for (cpuid = 0; cpuid < ncpu; cpuid++) {
2447 		cp = &skm->skm_cpu_cache[cpuid];
2448 
2449 		SKM_CPU_LOCK_SPIN(cp);
2450 		mg = cp->cp_loaded;
2451 		pmg = cp->cp_ploaded;
2452 		rounds = cp->cp_rounds;
2453 		prounds = cp->cp_prounds;
2454 		cp->cp_loaded = NULL;
2455 		cp->cp_ploaded = NULL;
2456 		cp->cp_rounds = -1;
2457 		cp->cp_prounds = -1;
2458 		cp->cp_magsize = 0;
2459 		SKM_CPU_UNLOCK(cp);
2460 
2461 		if (mg != NULL) {
2462 			skmem_magazine_destroy(skm, mg, rounds);
2463 			++mg_cnt;
2464 		}
2465 		if (pmg != NULL) {
2466 			skmem_magazine_destroy(skm, pmg, prounds);
2467 			++pmg_cnt;
2468 		}
2469 	}
2470 
2471 	if (mg_cnt != 0 || pmg_cnt != 0) {
2472 		atomic_add_32(&skm->skm_cpu_mag_purge, 1);
2473 	}
2474 
2475 	skmem_depot_ws_zero(skm);
2476 	skmem_depot_ws_reap(skm);
2477 }
2478 
2479 /*
2480  * Enable magazines on a cache.  Must only be called on a cache with
2481  * its per-CPU magazines layer disabled (e.g. due to purge).
2482  */
2483 static void
skmem_cache_magazine_enable(struct skmem_cache * skm,uint32_t arg)2484 skmem_cache_magazine_enable(struct skmem_cache *skm, uint32_t arg)
2485 {
2486 #pragma unused(arg)
2487 	struct skmem_cpu_cache *cp;
2488 	uint32_t cpuid;
2489 
2490 	if (skm->skm_mode & SKM_MODE_NOMAGAZINES) {
2491 		return;
2492 	}
2493 
2494 	for (cpuid = 0; cpuid < ncpu; cpuid++) {
2495 		cp = &skm->skm_cpu_cache[cpuid];
2496 		SKM_CPU_LOCK_SPIN(cp);
2497 		/* the magazines layer must be disabled at this point */
2498 		ASSERT(cp->cp_loaded == NULL);
2499 		ASSERT(cp->cp_ploaded == NULL);
2500 		ASSERT(cp->cp_rounds == -1);
2501 		ASSERT(cp->cp_prounds == -1);
2502 		ASSERT(cp->cp_magsize == 0);
2503 		cp->cp_magsize = skm->skm_magtype->mt_magsize;
2504 		SKM_CPU_UNLOCK(cp);
2505 	}
2506 
2507 	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx chunksize %u magsize %d",
2508 	    SK_KVA(skm), (uint32_t)skm->skm_chunksize,
2509 	    SKMEM_CPU_CACHE(skm)->cp_magsize);
2510 }
2511 
/*
 * Enter the cache resize perimeter.  Upon success, claim exclusivity
 * on the perimeter and return 0, else EBUSY.  Caller may indicate
 * whether or not they're willing to wait.
 *
 * The perimeter is recursive for its owner: a thread that already
 * holds it simply bumps the busy count.  With can_sleep, the caller
 * blocks (uninterruptibly) on skm_rs_busy until the perimeter frees
 * up; without it, EBUSY is returned immediately when contended.
 */
static int
skmem_cache_resize_enter(struct skmem_cache *skm, boolean_t can_sleep)
{
	SKM_RESIZE_LOCK(skm);
	/* recursive entry by the current owner */
	if (skm->skm_rs_owner == current_thread()) {
		ASSERT(skm->skm_rs_busy != 0);
		skm->skm_rs_busy++;
		goto done;
	}
	if (!can_sleep) {
		if (skm->skm_rs_busy != 0) {
			SKM_RESIZE_UNLOCK(skm);
			return EBUSY;
		}
	} else {
		while (skm->skm_rs_busy != 0) {
			/* record a waiter; resize_exit issues the wakeup */
			skm->skm_rs_want++;
			(void) assert_wait(&skm->skm_rs_busy, THREAD_UNINT);
			SKM_RESIZE_UNLOCK(skm);
			(void) thread_block(THREAD_CONTINUE_NULL);
			SK_DF(SK_VERB_MEM_CACHE, "waited for skm \"%s\" "
			    "(0x%llx) busy=%u", skm->skm_name,
			    SK_KVA(skm), skm->skm_rs_busy);
			SKM_RESIZE_LOCK(skm);
		}
	}
	SKM_RESIZE_LOCK_ASSERT_HELD(skm);
	ASSERT(skm->skm_rs_busy == 0);
	skm->skm_rs_busy++;
	skm->skm_rs_owner = current_thread();
done:
	SKM_RESIZE_UNLOCK(skm);
	return 0;
}
2551 
2552 /*
2553  * Exit the cache resize perimeter and unblock any waiters.
2554  */
2555 static void
skmem_cache_resize_exit(struct skmem_cache * skm)2556 skmem_cache_resize_exit(struct skmem_cache *skm)
2557 {
2558 	uint32_t want;
2559 
2560 	SKM_RESIZE_LOCK(skm);
2561 	ASSERT(skm->skm_rs_busy != 0);
2562 	ASSERT(skm->skm_rs_owner == current_thread());
2563 	if (--skm->skm_rs_busy == 0) {
2564 		skm->skm_rs_owner = NULL;
2565 		/*
2566 		 * We're done; notify anyone that has lost the race.
2567 		 */
2568 		if ((want = skm->skm_rs_want) != 0) {
2569 			skm->skm_rs_want = 0;
2570 			wakeup((void *)&skm->skm_rs_busy);
2571 			SKM_RESIZE_UNLOCK(skm);
2572 		} else {
2573 			SKM_RESIZE_UNLOCK(skm);
2574 		}
2575 	} else {
2576 		SKM_RESIZE_UNLOCK(skm);
2577 	}
2578 }
2579 
2580 /*
2581  * Recompute a cache's magazine size.  This is an expensive operation
2582  * and should not be done frequently; larger magazines provide for a
2583  * higher transfer rate with the depot while smaller magazines reduce
2584  * the memory consumption.
2585  */
2586 static void
skmem_cache_magazine_resize(struct skmem_cache * skm)2587 skmem_cache_magazine_resize(struct skmem_cache *skm)
2588 {
2589 	struct skmem_magtype *mtp = skm->skm_magtype;
2590 
2591 	/* insist that we are executing in the update thread call context */
2592 	ASSERT(sk_is_cache_update_protected());
2593 	ASSERT(!(skm->skm_mode & SKM_MODE_NOMAGAZINES));
2594 	/* depot contention only applies to dynamic mode */
2595 	ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);
2596 
2597 	/*
2598 	 * Although we're executing in the context of the update thread
2599 	 * call, we need to protect the per-CPU states during resizing
2600 	 * against other synchronous cache purge/reenable requests that
2601 	 * could take place in parallel.
2602 	 */
2603 	if (skm->skm_chunksize < mtp->mt_maxbuf) {
2604 		(void) skmem_cache_resize_enter(skm, TRUE);
2605 		skmem_cache_magazine_purge(skm);
2606 
2607 		/*
2608 		 * Upgrade to the next magazine type with larger size.
2609 		 */
2610 		SKM_DEPOT_LOCK_SPIN(skm);
2611 		skm->skm_cpu_mag_resize++;
2612 		skm->skm_magtype = ++mtp;
2613 		skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
2614 		skm->skm_depot_contention_prev =
2615 		    skm->skm_depot_contention + INT_MAX;
2616 		SKM_DEPOT_UNLOCK(skm);
2617 
2618 		skmem_cache_magazine_enable(skm, 0);
2619 		skmem_cache_resize_exit(skm);
2620 	}
2621 }
2622 
/*
 * Rescale the cache's allocated-address hash table.
 *
 * Sizes a new bufctl hash table proportionally to the number of
 * buffers in use, allocates and publishes it under the slab lock,
 * migrates every bufctl from the old table, and frees the old table.
 * A no-op when the preferred size is within 2x of the current one or
 * when the new table cannot be allocated.
 */
static void
skmem_cache_hash_rescale(struct skmem_cache *skm)
{
	struct skmem_bufctl_bkt *old_table, *new_table;
	size_t old_size, new_size;
	uint32_t i, moved = 0;

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());

	/*
	 * To get small average lookup time (lookup depth near 1.0), the hash
	 * table size should be roughly the same (not necessarily equivalent)
	 * as the cache size.
	 */
	new_size = MAX(skm->skm_hash_initial,
	    (1 << (flsll(3 * skm->skm_sl_bufinuse + 4) - 2)));
	new_size = MIN(skm->skm_hash_limit, new_size);
	old_size = (skm->skm_hash_mask + 1);

	/* skip if the size wouldn't change by more than a factor of two */
	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
		return;
	}

	/* allocated without the slab lock held; may fail, which is fine */
	new_table = sk_alloc_type_array(struct skmem_bufctl_bkt, new_size,
	    Z_NOWAIT, skmem_tag_bufctl_hash);
	if (__improbable(new_table == NULL)) {
		return;
	}

	for (i = 0; i < new_size; i++) {
		SLIST_INIT(&new_table[i].bcb_head);
	}

	SKM_SLAB_LOCK(skm);

	/* re-read under the lock; publish the new table and mask */
	old_size = (skm->skm_hash_mask + 1);
	old_table = skm->skm_hash_table;

	skm->skm_hash_mask = (new_size - 1);
	skm->skm_hash_table = new_table;
	skm->skm_sl_rescale++;

	for (i = 0; i < old_size; i++) {
		struct skmem_bufctl_bkt *bcb = &old_table[i];
		struct skmem_bufctl_bkt *new_bcb;
		struct skmem_bufctl *bc;

		while ((bc = SLIST_FIRST(&bcb->bcb_head)) != NULL) {
			SLIST_REMOVE_HEAD(&bcb->bcb_head, bc_link);
			new_bcb = SKMEM_CACHE_HASH(skm, bc->bc_addr);
			/*
			 * Ideally we want to insert tail here, but simple
			 * list doesn't give us that.  The fact that we are
			 * essentially reversing the order is not a big deal
			 * here vis-a-vis the new table size.
			 */
			SLIST_INSERT_HEAD(&new_bcb->bcb_head, bc, bc_link);
			++moved;
		}
		ASSERT(SLIST_EMPTY(&bcb->bcb_head));
	}

	SK_DF(SK_VERB_MEM_CACHE,
	    "skm 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skm),
	    (uint32_t)old_size, (uint32_t)new_size, moved);

	SKM_SLAB_UNLOCK(skm);

	sk_free_type_array(struct skmem_bufctl_bkt, old_size, old_table);
}
2697 
2698 /*
2699  * Apply a function to operate on all caches.
2700  */
2701 static void
skmem_cache_applyall(void (* func)(struct skmem_cache *,uint32_t),uint32_t arg)2702 skmem_cache_applyall(void (*func)(struct skmem_cache *, uint32_t), uint32_t arg)
2703 {
2704 	struct skmem_cache *skm;
2705 
2706 	net_update_uptime();
2707 
2708 	SKMEM_CACHE_LOCK();
2709 	TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
2710 		func(skm, arg);
2711 	}
2712 	SKMEM_CACHE_UNLOCK();
2713 }
2714 
2715 /*
2716  * Reclaim unused memory from a cache.
2717  */
2718 static void
skmem_cache_reclaim(struct skmem_cache * skm,uint32_t lowmem)2719 skmem_cache_reclaim(struct skmem_cache *skm, uint32_t lowmem)
2720 {
2721 	/*
2722 	 * Inform the owner to free memory if possible; the reclaim
2723 	 * policy is left to the owner.  This is just an advisory.
2724 	 */
2725 	if (skm->skm_reclaim != NULL) {
2726 		skm->skm_reclaim(skm->skm_private);
2727 	}
2728 
2729 	if (lowmem) {
2730 		/*
2731 		 * If another thread is in the process of purging or
2732 		 * resizing, bail out and let the currently-ongoing
2733 		 * purging take its natural course.
2734 		 */
2735 		if (skmem_cache_resize_enter(skm, FALSE) == 0) {
2736 			skmem_cache_magazine_purge(skm);
2737 			skmem_cache_magazine_enable(skm, 0);
2738 			skmem_cache_resize_exit(skm);
2739 		}
2740 	} else {
2741 		skmem_depot_ws_reap(skm);
2742 	}
2743 }
2744 
2745 /*
2746  * Thread call callback for reap.
2747  */
2748 static void
skmem_cache_reap_func(thread_call_param_t dummy,thread_call_param_t arg)2749 skmem_cache_reap_func(thread_call_param_t dummy, thread_call_param_t arg)
2750 {
2751 #pragma unused(dummy)
2752 	void (*func)(void) = arg;
2753 
2754 	ASSERT(func == skmem_cache_reap_start || func == skmem_cache_reap_done);
2755 	func();
2756 }
2757 
/*
 * Start reaping all caches; this is serialized via thread call.
 */
static void
skmem_cache_reap_start(void)
{
	SK_DF(SK_VERB_MEM_CACHE, "now running");
	/* reclaim every cache, passing along the low-memory indication */
	skmem_cache_applyall(skmem_cache_reclaim, skmem_lowmem_check());
	/*
	 * Schedule skmem_cache_reap_done() one update interval from now;
	 * it clears the reaping flag so another reap can be requested.
	 */
	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_done,
	    (skmem_cache_update_interval * NSEC_PER_SEC));
}
2769 
/*
 * Stop reaping; this would allow another reap request to occur.
 */
static void
skmem_cache_reap_done(void)
{
	/* clear the flag set by skmem_cache_reap() via atomic_test_set_32 */
	volatile uint32_t *flag = &skmem_cache_reaping;

	*flag = 0;
	/* full barrier so the cleared flag is visible to other CPUs */
	membar_sync();
}
2781 
2782 /*
2783  * Immediately reap all unused memory of a cache.  If purging,
2784  * also purge the cached objects at the CPU layer.
2785  */
2786 void
skmem_cache_reap_now(struct skmem_cache * skm,boolean_t purge)2787 skmem_cache_reap_now(struct skmem_cache *skm, boolean_t purge)
2788 {
2789 	if (purge) {
2790 		/*
2791 		 * If another thread is in the process of purging or
2792 		 * resizing, bail out and let the currently-ongoing
2793 		 * purging take its natural course.
2794 		 */
2795 		if (skmem_cache_resize_enter(skm, FALSE) == 0) {
2796 			skmem_cache_magazine_purge(skm);
2797 			skmem_cache_magazine_enable(skm, 0);
2798 			skmem_cache_resize_exit(skm);
2799 		}
2800 	} else {
2801 		skmem_depot_ws_zero(skm);
2802 		skmem_depot_ws_reap(skm);
2803 	}
2804 }
2805 
/*
 * Request a global reap operation to be dispatched.
 */
void
skmem_cache_reap(void)
{
	/*
	 * Only one reaping episode is allowed at a time: the atomic
	 * test-and-set of skmem_cache_reaping gates entry until
	 * skmem_cache_reap_done() clears it.  We also decline when the
	 * caller holds the global cache lock — presumably because the
	 * reap path itself takes SKMEM_CACHE_LOCK via
	 * skmem_cache_applyall(); TODO(review): confirm skmem_lock_owner
	 * tracks that lock's owner.
	 */
	if (skmem_lock_owner == current_thread() ||
	    !atomic_test_set_32(&skmem_cache_reaping, 0, 1)) {
		return;
	}

	/* kick off the reap cycle asynchronously on the thread call */
	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_start, 0);
}
2820 
/*
 * Reap internal caches.
 */
void
skmem_reap_caches(boolean_t purge)
{
	/* slab and bufctl metadata caches */
	skmem_cache_reap_now(skmem_slab_cache, purge);
	skmem_cache_reap_now(skmem_bufctl_cache, purge);

	/* packet buffer pool objects */
	pp_reap_caches(purge);

	/* also handle the region cache(s) */
	skmem_region_reap_caches(purge);
}
2836 
/*
 * Thread call callback for update.
 */
static void
skmem_cache_update_func(thread_call_param_t dummy, thread_call_param_t arg)
{
#pragma unused(dummy, arg)
	sk_protect_t protect;

	/* run the periodic update on every cache, under update protection */
	protect = sk_cache_update_protect();
	skmem_cache_applyall(skmem_cache_update, 0);
	sk_cache_update_unprotect(protect);

	/* re-arm ourselves to run again after one update interval */
	skmem_dispatch(skmem_cache_update_tc, NULL,
	    (skmem_cache_update_interval * NSEC_PER_SEC));
}
2853 
/*
 * Given a buffer control, record the current transaction.
 *
 * NOTE(review): the cast assumes "bc" was allocated with the audit
 * layout (struct skmem_bufctl_audit); presumably callers only reach
 * here when the cache runs in audit mode — confirm at call sites.
 */
__attribute__((noinline, cold, not_tail_called))
static inline void
skmem_audit_bufctl(struct skmem_bufctl *bc)
{
	struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc;
	struct timeval tv;

	microuptime(&tv);
	/* record the transacting thread, uptime in msec, and a backtrace */
	bca->bc_thread = current_thread();
	bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000));
	bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH);
}
2869 
2870 /*
2871  * Given an object, find its buffer control and record the transaction.
2872  */
2873 __attribute__((noinline, cold, not_tail_called))
2874 static inline void
skmem_audit_buf(struct skmem_cache * skm,struct skmem_obj * list)2875 skmem_audit_buf(struct skmem_cache *skm, struct skmem_obj *list)
2876 {
2877 	struct skmem_bufctl_bkt *bcb;
2878 	struct skmem_bufctl *bc;
2879 
2880 	ASSERT(!(skm->skm_mode & SKM_MODE_PSEUDO));
2881 
2882 	SKM_SLAB_LOCK(skm);
2883 	while (list != NULL) {
2884 		void *buf = list;
2885 
2886 		bcb = SKMEM_CACHE_HASH(skm, buf);
2887 		SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
2888 			if (bc->bc_addr == buf) {
2889 				break;
2890 			}
2891 		}
2892 
2893 		if (__improbable(bc == NULL)) {
2894 			panic("%s: %s failed to get bufctl for %p",
2895 			    __func__, skm->skm_name, buf);
2896 			/* NOTREACHED */
2897 			__builtin_unreachable();
2898 		}
2899 
2900 		skmem_audit_bufctl(bc);
2901 
2902 		if (!(skm->skm_mode & SKM_MODE_BATCH)) {
2903 			break;
2904 		}
2905 
2906 		list = list->mo_next;
2907 	}
2908 	SKM_SLAB_UNLOCK(skm);
2909 }
2910 
/*
 * Export one cache's statistics into a sk_stats_cache record at "out".
 *
 * Always returns the size of one record.  If "out" is NULL or "len"
 * is too small, nothing is written and only the required size is
 * returned, letting the caller probe for buffer space.
 */
static size_t
skmem_cache_mib_get_stats(struct skmem_cache *skm, void *out, size_t len)
{
	size_t actual_space = sizeof(struct sk_stats_cache);
	struct sk_stats_cache *sca = out;
	int contention;

	if (out == NULL || len < actual_space) {
		goto done;
	}

	bzero(sca, sizeof(*sca));
	(void) snprintf(sca->sca_name, sizeof(sca->sca_name), "%s",
	    skm->skm_name);
	uuid_copy(sca->sca_uuid, skm->skm_uuid);
	uuid_copy(sca->sca_ruuid, skm->skm_region->skr_uuid);
	sca->sca_mode = skm->skm_mode;
	/* object geometry */
	sca->sca_bufsize = (uint64_t)skm->skm_bufsize;
	sca->sca_objsize = (uint64_t)skm->skm_objsize;
	sca->sca_chunksize = (uint64_t)skm->skm_chunksize;
	sca->sca_slabsize = (uint64_t)skm->skm_slabsize;
	sca->sca_bufalign = (uint64_t)skm->skm_bufalign;
	sca->sca_objalign = (uint64_t)skm->skm_objalign;

	/* CPU-layer (magazine) and depot counters */
	sca->sca_cpu_mag_size = skm->skm_cpu_mag_size;
	sca->sca_cpu_mag_resize = skm->skm_cpu_mag_resize;
	sca->sca_cpu_mag_purge = skm->skm_cpu_mag_purge;
	sca->sca_cpu_mag_reap = skm->skm_cpu_mag_reap;
	sca->sca_depot_full = skm->skm_depot_full;
	sca->sca_depot_empty = skm->skm_depot_empty;
	sca->sca_depot_ws_zero = skm->skm_depot_ws_zero;
	/* in case of a race this might be a negative value, turn it into 0 */
	if ((contention = (int)(skm->skm_depot_contention -
	    skm->skm_depot_contention_prev)) < 0) {
		contention = 0;
	}
	sca->sca_depot_contention_factor = contention;

	/* slab-layer counters */
	sca->sca_sl_create = skm->skm_sl_create;
	sca->sca_sl_destroy = skm->skm_sl_destroy;
	sca->sca_sl_alloc = skm->skm_sl_alloc;
	sca->sca_sl_free = skm->skm_sl_free;
	sca->sca_sl_alloc_fail = skm->skm_sl_alloc_fail;
	sca->sca_sl_partial = skm->skm_sl_partial;
	sca->sca_sl_empty = skm->skm_sl_empty;
	sca->sca_sl_bufinuse = skm->skm_sl_bufinuse;
	sca->sca_sl_rescale = skm->skm_sl_rescale;
	sca->sca_sl_hash_size = (skm->skm_hash_mask + 1);

done:
	return actual_space;
}
2963 
/*
 * Sysctl handler that exports a sk_stats_cache record for every cache.
 *
 * Follows the usual sysctl two-pass convention: a request with a NULL
 * oldptr only computes the total size needed, while a sized request
 * copies out as many whole records as fit and returns ENOMEM if the
 * supplied buffer was too small.
 */
static int
skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	struct skmem_cache *skm;
	size_t actual_space;
	size_t buffer_space;
	size_t allocated_space;
	caddr_t buffer = NULL;
	caddr_t scan;
	int error = 0;

	/* privileged callers only */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		/* cap the intermediate kernel allocation */
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		allocated_space = buffer_space;
		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_cache_mib);
		if (__improbable(buffer == NULL)) {
			return ENOBUFS;
		}
	} else if (req->oldptr == USER_ADDR_NULL) {
		/* size probe: gather sizes only, no staging buffer */
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;

	SKMEM_CACHE_LOCK();
	TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
		/* with scan == NULL this only reports the record size */
		size_t size = skmem_cache_mib_get_stats(skm, scan, buffer_space);
		if (scan != NULL) {
			if (buffer_space < size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += size;
			buffer_space -= size;
		}
		actual_space += size;
	}
	SKMEM_CACHE_UNLOCK();

	/* copy out whatever was staged; a size probe copies nothing */
	if (actual_space != 0) {
		int out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data(buffer, allocated_space);
	}

	return error;
}
3025