xref: /xnu-10002.81.5/bsd/skywalk/mem/skmem_cache.c (revision 5e3eaea39dcf651e66cb99ba7d70e32cc4a99587)
1 /*
2  * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h>    /* for OSBacktrace */
33 #include <kern/sched_prim.h>    /* for assert_wait */
34 #include <vm/vm_memtag.h>
35 
36 /*
37  * Memory allocator with per-CPU caching (magazines), derived from the kmem
38  * magazine concept and implementation as described in the following paper:
39  * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
40  *
41  * That implementation is Copyright 2006 Sun Microsystems, Inc.  All rights
42  * reserved.  Use is subject to license terms.
43  *
44  * This derivative differs from the original kmem slab allocator, in that:
45  *
46  *   a) There is always a discrete bufctl per object, even for small sizes.
47  *      This increases the overhead, but is necessary as Skywalk objects
48  *      coming from the slab may be shared (RO or RW) with userland; therefore
49  *      embedding the KVA pointer linkage in freed objects is a non-starter.
50  *
51  *   b) Writing patterns to the slab at slab creation or destruction time
52  *      (when debugging is enabled) is not implemented, as the object may
53  *      be shared (RW) with userland and thus we cannot panic upon pattern
54  *      mismatch episodes.  This can be relaxed so that we conditionally
55  *      verify the pattern for kernel-only memory.
56  *
57  * This derivative also differs from Darwin's mcache allocator (which itself
58  * is a derivative of the original kmem slab allocator), in that:
59  *
60  *   1) The slab layer is internal to skmem_cache, unlike mcache's external
61  *      slab layer required to support mbufs.  skmem_cache also supports
62  *      constructing and deconstructing objects, while mcache does not.
63  *      This brings skmem_cache's model closer to that of the original
64  *      kmem slab allocator.
65  *
66  *   2) mcache allows for batch allocation and free by way of chaining the
67  *      objects together using a linked list.  This requires using a part
68  *      of the object to act as the linkage, which is against Skywalk's
69  *      requirements of not exposing any KVA pointer to userland.  Although
70  *      this is supported by skmem_cache, chaining is only possible if the
71  *      region is not mapped to userland.  That implies that kernel-only
72  *      objects can be chained provided the cache is created with batching
73  *      mode enabled, and that the object is large enough to contain the
74  *      skmem_obj structure.
75  *
76  * In other words, skmem_cache is a hybrid of a hybrid custom allocator that
77  * implements features that are required by Skywalk.  In addition to being
78  * aware of userland access on the buffers, it also supports mirrored backend
79  * memory regions.  This allows a cache to manage two independent memory
80  * regions, such that allocating/freeing an object from/to one results in
81  * allocating/freeing a shadow object in another, thus guaranteeing that both
82  * objects share the same lifetime.
83  */
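/*
 * Illustrative usage sketch: a typical client creates a cache for
 * fixed-size objects, allocates and frees through it, and destroys it
 * when done.  Passing a NULL region creates a pseudo (zone-backed)
 * cache; "struct foo" below is a placeholder type for this sketch only.
 *
 *	struct skmem_cache *skm;
 *	struct foo *f;
 *
 *	skm = skmem_cache_create("example", sizeof(struct foo),
 *	    sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);
 *	f = skmem_cache_alloc(skm, SKMEM_SLEEP);
 *	...
 *	skmem_cache_free(skm, f);
 *	skmem_cache_destroy(skm);
 */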
84 
85 static uint32_t ncpu;                   /* total # of initialized CPUs */
86 
87 static LCK_MTX_DECLARE_ATTR(skmem_cache_lock, &skmem_lock_grp, &skmem_lock_attr);
88 static struct thread *skmem_lock_owner = THREAD_NULL;
89 
90 static LCK_GRP_DECLARE(skmem_sl_lock_grp, "skmem_slab");
91 static LCK_GRP_DECLARE(skmem_dp_lock_grp, "skmem_depot");
92 static LCK_GRP_DECLARE(skmem_cpu_lock_grp, "skmem_cpu_cache");
93 
94 #define SKMEM_CACHE_LOCK() do {                 \
95 	lck_mtx_lock(&skmem_cache_lock);        \
96 	skmem_lock_owner = current_thread();    \
97 } while (0)
98 #define SKMEM_CACHE_UNLOCK() do {               \
99 	skmem_lock_owner = THREAD_NULL;         \
100 	lck_mtx_unlock(&skmem_cache_lock);      \
101 } while (0)
102 #define SKMEM_CACHE_LOCK_ASSERT_HELD()          \
103 	LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_OWNED)
104 #define SKMEM_CACHE_LOCK_ASSERT_NOTHELD()       \
105 	LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_NOTOWNED)
106 
107 #define SKM_SLAB_LOCK(_skm)                     \
108 	lck_mtx_lock(&(_skm)->skm_sl_lock)
109 #define SKM_SLAB_LOCK_ASSERT_HELD(_skm)         \
110 	LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_OWNED)
111 #define SKM_SLAB_LOCK_ASSERT_NOTHELD(_skm)      \
112 	LCK_MTX_ASSERT(&(_skm)->skm_sl_lock, LCK_MTX_ASSERT_NOTOWNED)
113 #define SKM_SLAB_UNLOCK(_skm)                   \
114 	lck_mtx_unlock(&(_skm)->skm_sl_lock)
115 
116 #define SKM_DEPOT_LOCK(_skm)                    \
117 	lck_mtx_lock(&(_skm)->skm_dp_lock)
118 #define SKM_DEPOT_LOCK_SPIN(_skm)               \
119 	lck_mtx_lock_spin(&(_skm)->skm_dp_lock)
120 #define SKM_DEPOT_CONVERT_LOCK(_skm)            \
121 	lck_mtx_convert_spin(&(_skm)->skm_dp_lock)
122 #define SKM_DEPOT_LOCK_TRY(_skm)                \
123 	lck_mtx_try_lock(&(_skm)->skm_dp_lock)
124 #define SKM_DEPOT_LOCK_ASSERT_HELD(_skm)        \
125 	LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_OWNED)
126 #define SKM_DEPOT_LOCK_ASSERT_NOTHELD(_skm)     \
127 	LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_NOTOWNED)
128 #define SKM_DEPOT_UNLOCK(_skm)                  \
129 	lck_mtx_unlock(&(_skm)->skm_dp_lock)
130 
131 #define SKM_RESIZE_LOCK(_skm)                   \
132 	lck_mtx_lock(&(_skm)->skm_rs_lock)
133 #define SKM_RESIZE_LOCK_ASSERT_HELD(_skm)       \
134 	LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_OWNED)
135 #define SKM_RESIZE_LOCK_ASSERT_NOTHELD(_skm)    \
136 	LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_NOTOWNED)
137 #define SKM_RESIZE_UNLOCK(_skm)                 \
138 	lck_mtx_unlock(&(_skm)->skm_rs_lock)
139 
140 #define SKM_CPU_LOCK(_cp)                       \
141 	lck_mtx_lock(&(_cp)->cp_lock)
142 #define SKM_CPU_LOCK_SPIN(_cp)                  \
143 	lck_mtx_lock_spin(&(_cp)->cp_lock)
144 #define SKM_CPU_CONVERT_LOCK(_cp)               \
145 	lck_mtx_convert_spin(&(_cp)->cp_lock)
146 #define SKM_CPU_LOCK_ASSERT_HELD(_cp)           \
147 	LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_OWNED)
148 #define SKM_CPU_LOCK_ASSERT_NOTHELD(_cp)        \
149 	LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_NOTOWNED)
150 #define SKM_CPU_UNLOCK(_cp)                     \
151 	lck_mtx_unlock(&(_cp)->cp_lock)
152 
153 #define SKM_ZONE_MAX    256
154 
155 static struct zone *skm_zone;                   /* zone for skmem_cache */
156 
157 static struct skmem_cache *skmem_slab_cache;    /* cache for skmem_slab */
158 static struct skmem_cache *skmem_bufctl_cache;  /* cache for skmem_bufctl */
159 static unsigned int bc_size;                    /* size of bufctl */
160 
161 /*
162  * Magazine types (one per row.)
163  *
164  * The first column defines the number of objects that the magazine can hold.
165  * Using that number, we derive the effective number: the aggregate count of
166  * object pointers, plus 2 pointers (skmem_mag linkage + magazine type).
167  * This would result in an object size that is aligned on the CPU cache
168  * size boundary; the exception to this is the KASAN mode where the size
169  * would be larger due to the redzone regions.
170  *
171  * The second column defines the alignment of the magazine.  Because each
172  * magazine is used at the CPU-layer cache, we need to ensure there is no
173  * false sharing across the CPUs, and align the magazines to the maximum
174  * cache alignment size, for simplicity.  The value of 0 may be used to
175  * indicate natural pointer size alignment.
176  *
177  * The third column defines the starting magazine type for a given cache,
178  * determined at the cache's creation time based on its chunk size.
179  *
180  * The fourth column defines the magazine type limit for a given cache.
181  * Magazine resizing will only occur if the chunk size is less than this.
182  */
183 static struct skmem_magtype skmem_magtype[] = {
184 #if defined(__LP64__)
185 	{ .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 128, .mt_maxbuf = 512,
186 	  .mt_cache = NULL, .mt_cname = "" },
187 	{ .mt_magsize = 30, .mt_align = 0, .mt_minbuf = 96, .mt_maxbuf = 256,
188 	  .mt_cache = NULL, .mt_cname = "" },
189 	{ .mt_magsize = 46, .mt_align = 0, .mt_minbuf = 64, .mt_maxbuf = 128,
190 	  .mt_cache = NULL, .mt_cname = "" },
191 	{ .mt_magsize = 62, .mt_align = 0, .mt_minbuf = 32, .mt_maxbuf = 64,
192 	  .mt_cache = NULL, .mt_cname = "" },
193 	{ .mt_magsize = 94, .mt_align = 0, .mt_minbuf = 16, .mt_maxbuf = 32,
194 	  .mt_cache = NULL, .mt_cname = "" },
195 	{ .mt_magsize = 126, .mt_align = 0, .mt_minbuf = 8, .mt_maxbuf = 16,
196 	  .mt_cache = NULL, .mt_cname = "" },
197 	{ .mt_magsize = 142, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 8,
198 	  .mt_cache = NULL, .mt_cname = "" },
199 	{ .mt_magsize = 158, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
200 	  .mt_cache = NULL, .mt_cname = "" },
201 #else /* !__LP64__ */
202 	{ .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
203 	  .mt_cache = NULL, .mt_cname = "" },
204 #endif /* !__LP64__ */
205 };
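/*
 * Worked example (illustrative numbers): on LP64, a cache whose chunk
 * size is 80 bytes walks the table above using the "chunksize <= mt_minbuf"
 * test in skmem_cache_create(); 80 <= 128 and 80 <= 96 both advance,
 * 80 <= 64 does not, so the cache starts with the 46-object magazine type.
 * Because 80 is below that type's mt_maxbuf (128), dynamic magazine
 * resizing remains possible for such a cache.
 */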
206 
207 /*
208  * Hash table bounds.  Start with the initial value, and rescale up to
209  * the specified limit.  Ideally we don't need a limit, but in practice
210  * this helps guard against runaways.  These values should be revisited
211  * in the future and adjusted as needed.
212  */
213 #define SKMEM_CACHE_HASH_INITIAL        64      /* initial hash table size */
214 #define SKMEM_CACHE_HASH_LIMIT          8192    /* hash table size limit */
215 
216 #define SKMEM_CACHE_HASH_INDEX(_a, _s, _m)      (((_a) >> (_s)) & (_m))
217 #define SKMEM_CACHE_HASH(_skm, _buf)                                     \
218 	(&(_skm)->skm_hash_table[SKMEM_CACHE_HASH_INDEX((uintptr_t)_buf, \
219 	(_skm)->skm_hash_shift, (_skm)->skm_hash_mask)])
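/*
 * Worked example (illustrative values): for a cache with a 2048-byte
 * chunk size, skm_hash_shift is flsll(2048) - 1 = 11, and with the
 * initial 64-bucket table skm_hash_mask is 63; an object address "addr"
 * therefore lands in bucket ((uintptr_t)addr >> 11) & 63.
 */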
220 
221 /*
222  * The last magazine type.
223  */
224 static struct skmem_magtype *skmem_cache_magsize_last;
225 
226 static TAILQ_HEAD(, skmem_cache) skmem_cache_head;
227 static boolean_t skmem_cache_ready;
228 
229 static int skmem_slab_alloc_locked(struct skmem_cache *,
230     struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
231 static void skmem_slab_free_locked(struct skmem_cache *, void *);
232 static int skmem_slab_alloc_pseudo_locked(struct skmem_cache *,
233     struct skmem_obj_info *, struct skmem_obj_info *, uint32_t);
234 static void skmem_slab_free_pseudo_locked(struct skmem_cache *, void *);
235 static struct skmem_slab *skmem_slab_create(struct skmem_cache *, uint32_t);
236 static void skmem_slab_destroy(struct skmem_cache *, struct skmem_slab *);
237 static int skmem_magazine_ctor(struct skmem_obj_info *,
238     struct skmem_obj_info *, void *, uint32_t);
239 static void skmem_magazine_destroy(struct skmem_cache *, struct skmem_mag *,
240     int);
241 static uint32_t skmem_depot_batch_alloc(struct skmem_cache *,
242     struct skmem_maglist *, uint32_t *, struct skmem_mag **, uint32_t);
243 static void skmem_depot_batch_free(struct skmem_cache *, struct skmem_maglist *,
244     uint32_t *, struct skmem_mag *);
245 static void skmem_depot_ws_update(struct skmem_cache *);
246 static void skmem_depot_ws_zero(struct skmem_cache *);
247 static void skmem_depot_ws_reap(struct skmem_cache *);
248 static void skmem_cache_magazine_purge(struct skmem_cache *);
249 static void skmem_cache_magazine_enable(struct skmem_cache *, uint32_t);
250 static void skmem_cache_magazine_resize(struct skmem_cache *);
251 static void skmem_cache_hash_rescale(struct skmem_cache *);
252 static void skmem_cpu_reload(struct skmem_cpu_cache *, struct skmem_mag *, int);
253 static void skmem_cpu_batch_reload(struct skmem_cpu_cache *,
254     struct skmem_mag *, int);
255 static void skmem_cache_applyall(void (*)(struct skmem_cache *, uint32_t),
256     uint32_t);
257 static void skmem_cache_reclaim(struct skmem_cache *, uint32_t);
258 static void skmem_cache_reap_start(void);
259 static void skmem_cache_reap_done(void);
260 static void skmem_cache_reap_func(thread_call_param_t, thread_call_param_t);
261 static void skmem_cache_update_func(thread_call_param_t, thread_call_param_t);
262 static int skmem_cache_resize_enter(struct skmem_cache *, boolean_t);
263 static void skmem_cache_resize_exit(struct skmem_cache *);
264 static void skmem_audit_bufctl(struct skmem_bufctl *);
265 static void skmem_audit_buf(struct skmem_cache *, struct skmem_obj *);
266 static int skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS;
267 
268 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, cache,
269     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
270     0, 0, skmem_cache_mib_get_sysctl, "S,sk_stats_cache",
271     "Skywalk cache statistics");
272 
273 static volatile uint32_t skmem_cache_reaping;
274 static thread_call_t skmem_cache_reap_tc;
275 static thread_call_t skmem_cache_update_tc;
276 
277 extern kern_return_t thread_terminate(thread_t);
278 extern unsigned int ml_wait_max_cpus(void);
279 
280 #define SKMEM_DEBUG_NOMAGAZINES 0x1     /* disable magazines layer */
281 #define SKMEM_DEBUG_AUDIT       0x2     /* audit transactions */
282 #define SKMEM_DEBUG_MASK        (SKMEM_DEBUG_NOMAGAZINES|SKMEM_DEBUG_AUDIT)
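/*
 * For example, booting with skmem_debug=0x2 enables transaction auditing
 * on all caches, while skmem_debug=0x1 disables the magazines layer
 * entirely; bits outside SKMEM_DEBUG_MASK are discarded below.
 */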
283 
284 #if DEBUG
285 static uint32_t skmem_debug = SKMEM_DEBUG_AUDIT;
286 #else /* !DEBUG */
287 static uint32_t skmem_debug = 0;
288 #endif /* !DEBUG */
289 
290 static uint32_t skmem_clear_min = 0;    /* clear on free threshold */
291 
292 #define SKMEM_CACHE_UPDATE_INTERVAL     11      /* 11 seconds */
293 static uint32_t skmem_cache_update_interval = SKMEM_CACHE_UPDATE_INTERVAL;
294 
295 #define SKMEM_DEPOT_CONTENTION  3       /* max failed trylock per interval */
296 static int skmem_cache_depot_contention = SKMEM_DEPOT_CONTENTION;
297 
298 /*
299  * Too big a value will cause overflow and thus trip the assertion; the
300  * idea here is to set an upper limit for the time that a particular
301  * thread is allowed to perform retries before we give up and panic.
302  */
303 #define SKMEM_SLAB_MAX_BACKOFF          (20 * USEC_PER_SEC) /* 20 seconds (in usec) */
304 
305 /*
306  * Threshold (in msec) after which we reset the exponential backoff value
307  * back to its (random) initial value.  Note that we allow the actual delay
308  * to be at most twice this value.
309  */
310 #define SKMEM_SLAB_BACKOFF_THRES        1024    /* up to ~2 sec (2048 msec) */
311 
312 /*
313  * To reduce the likelihood of global synchronization between threads,
314  * we use some random value to start the exponential backoff.
315  */
316 #define SKMEM_SLAB_BACKOFF_RANDOM       4       /* range is [1,4] msec */
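/*
 * Illustrative backoff sequence: a sleeping allocation that keeps losing
 * the race might start at a random 3 msec delay, then wait 6, 12, 24, ...
 * msec on subsequent retries; once the value exceeds
 * SKMEM_SLAB_BACKOFF_THRES it is re-seeded to a random [1,4] msec, and
 * the thread panics once the accumulated delay exceeds
 * SKMEM_SLAB_MAX_BACKOFF.
 */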
317 
318 #if (DEVELOPMENT || DEBUG)
319 SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, cache_update_interval,
320     CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_update_interval,
321     SKMEM_CACHE_UPDATE_INTERVAL, "Cache update interval");
322 SYSCTL_INT(_kern_skywalk_mem, OID_AUTO, cache_depot_contention,
323     CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_depot_contention,
324     SKMEM_DEPOT_CONTENTION, "Depot contention");
325 
326 static uint32_t skmem_cache_update_interval_saved = SKMEM_CACHE_UPDATE_INTERVAL;
327 
328 /*
329  * Called by skmem_test_start() to set the update interval.
330  */
331 void
332 skmem_cache_test_start(uint32_t i)
333 {
334 	skmem_cache_update_interval_saved = skmem_cache_update_interval;
335 	skmem_cache_update_interval = i;
336 }
337 
338 /*
339  * Called by skmem_test_stop() to restore the update interval.
340  */
341 void
342 skmem_cache_test_stop(void)
343 {
344 	skmem_cache_update_interval = skmem_cache_update_interval_saved;
345 }
346 #endif /* (DEVELOPMENT || DEBUG) */
347 
348 #define SKMEM_TAG_BUFCTL_HASH   "com.apple.skywalk.bufctl.hash"
349 static SKMEM_TAG_DEFINE(skmem_tag_bufctl_hash, SKMEM_TAG_BUFCTL_HASH);
350 
351 #define SKMEM_TAG_CACHE_MIB     "com.apple.skywalk.cache.mib"
352 static SKMEM_TAG_DEFINE(skmem_tag_cache_mib, SKMEM_TAG_CACHE_MIB);
353 
354 static int __skmem_cache_pre_inited = 0;
355 static int __skmem_cache_inited = 0;
356 
357 /*
358  * Called before skmem_region_init().
359  */
360 void
361 skmem_cache_pre_init(void)
362 {
363 	vm_size_t skm_size;
364 
365 	ASSERT(!__skmem_cache_pre_inited);
366 
367 	ncpu = ml_wait_max_cpus();
368 
369 	/* allocate extra in case we need to manually align the pointer */
370 	if (skm_zone == NULL) {
371 		skm_size = SKMEM_CACHE_SIZE(ncpu);
372 #if KASAN
373 		/*
374 		 * When KASAN is enabled, the zone allocator adjusts the
375 		 * element size to include the redzone regions, in which
376 		 * case we assume that the elements won't start on the
377 		 * alignment boundary and thus need to do some fix-ups.
378 		 * These include increasing the effective object size
379 		 * which adds at least 136 bytes to the original size,
380 		 * as computed by skmem_region_params_config() above.
381 		 */
382 		skm_size += (sizeof(void *) + CHANNEL_CACHE_ALIGN_MAX);
383 #endif /* KASAN */
384 		skm_size = P2ROUNDUP(skm_size, CHANNEL_CACHE_ALIGN_MAX);
385 		skm_zone = zone_create(SKMEM_ZONE_PREFIX ".skm", skm_size,
386 		    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
387 	}
388 
389 	TAILQ_INIT(&skmem_cache_head);
390 
391 	__skmem_cache_pre_inited = 1;
392 }
393 
394 /*
395  * Called after skmem_region_init().
396  */
397 void
398 skmem_cache_init(void)
399 {
400 	uint32_t cpu_cache_line_size = skmem_cpu_cache_line_size();
401 	struct skmem_magtype *mtp;
402 	uint32_t i;
403 
404 	_CASSERT(SKMEM_CACHE_HASH_LIMIT >= SKMEM_CACHE_HASH_INITIAL);
405 
406 	_CASSERT(SKM_MODE_NOMAGAZINES == SCA_MODE_NOMAGAZINES);
407 	_CASSERT(SKM_MODE_AUDIT == SCA_MODE_AUDIT);
408 	_CASSERT(SKM_MODE_NOREDIRECT == SCA_MODE_NOREDIRECT);
409 	_CASSERT(SKM_MODE_BATCH == SCA_MODE_BATCH);
410 	_CASSERT(SKM_MODE_DYNAMIC == SCA_MODE_DYNAMIC);
411 	_CASSERT(SKM_MODE_CLEARONFREE == SCA_MODE_CLEARONFREE);
412 	_CASSERT(SKM_MODE_PSEUDO == SCA_MODE_PSEUDO);
413 
414 	ASSERT(__skmem_cache_pre_inited);
415 	ASSERT(!__skmem_cache_inited);
416 
417 	PE_parse_boot_argn("skmem_debug", &skmem_debug, sizeof(skmem_debug));
418 	skmem_debug &= SKMEM_DEBUG_MASK;
419 
420 #if (DEVELOPMENT || DEBUG)
421 	PE_parse_boot_argn("skmem_clear_min", &skmem_clear_min,
422 	    sizeof(skmem_clear_min));
423 #endif /* (DEVELOPMENT || DEBUG) */
424 	if (skmem_clear_min == 0) {
425 		/* zeroing 2 CPU cache lines practically comes for free */
426 		skmem_clear_min = 2 * cpu_cache_line_size;
427 	} else {
428 		/* round it up to CPU cache line size */
429 		skmem_clear_min = (uint32_t)P2ROUNDUP(skmem_clear_min,
430 		    cpu_cache_line_size);
431 	}
432 
433 	/* create a cache for buffer control structures */
434 	if (skmem_debug & SKMEM_DEBUG_AUDIT) {
435 		bc_size = sizeof(struct skmem_bufctl_audit);
436 		skmem_bufctl_cache = skmem_cache_create("bufctl.audit",
437 		    bc_size, sizeof(uint64_t), NULL, NULL,
438 		    NULL, NULL, NULL, 0);
439 	} else {
440 		bc_size = sizeof(struct skmem_bufctl);
441 		skmem_bufctl_cache = skmem_cache_create("bufctl",
442 		    bc_size, sizeof(uint64_t), NULL, NULL,
443 		    NULL, NULL, NULL, 0);
444 	}
445 
446 	/* create a cache for slab structures */
447 	skmem_slab_cache = skmem_cache_create("slab",
448 	    sizeof(struct skmem_slab), sizeof(uint64_t), NULL, NULL, NULL,
449 	    NULL, NULL, 0);
450 
451 	/*
452 	 * Go through the magazine type table and create a cache for each.
453 	 */
454 	for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
455 		mtp = &skmem_magtype[i];
456 
457 		if (mtp->mt_align != 0 &&
458 		    ((mtp->mt_align & (mtp->mt_align - 1)) != 0 ||
459 		    mtp->mt_align < (int)cpu_cache_line_size)) {
460 			panic("%s: bad alignment %d", __func__, mtp->mt_align);
461 			/* NOTREACHED */
462 			__builtin_unreachable();
463 		}
464 		(void) snprintf(mtp->mt_cname, sizeof(mtp->mt_cname),
465 		    "mg.%d", mtp->mt_magsize);
466 
467 		/* create a cache for this magazine type */
468 		mtp->mt_cache = skmem_cache_create(mtp->mt_cname,
469 		    SKMEM_MAG_SIZE(mtp->mt_magsize), mtp->mt_align,
470 		    skmem_magazine_ctor, NULL, NULL, mtp, NULL, 0);
471 
472 		/* remember the last magazine type */
473 		skmem_cache_magsize_last = mtp;
474 	}
475 
476 	VERIFY(skmem_cache_magsize_last != NULL);
477 	VERIFY(skmem_cache_magsize_last->mt_minbuf == 0);
478 	VERIFY(skmem_cache_magsize_last->mt_maxbuf == 0);
479 
480 	/*
481 	 * Allocate thread calls for cache reap and update operations.
482 	 */
483 	skmem_cache_reap_tc =
484 	    thread_call_allocate_with_options(skmem_cache_reap_func,
485 	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
486 	skmem_cache_update_tc =
487 	    thread_call_allocate_with_options(skmem_cache_update_func,
488 	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
489 	if (skmem_cache_reap_tc == NULL || skmem_cache_update_tc == NULL) {
490 		panic("%s: thread_call_allocate failed", __func__);
491 		/* NOTREACHED */
492 		__builtin_unreachable();
493 	}
494 
495 	/*
496 	 * We're ready; go through existing skmem_cache entries
497 	 * (if any) and enable the magazines layer for each.
498 	 */
499 	skmem_cache_applyall(skmem_cache_magazine_enable, 0);
500 	skmem_cache_ready = TRUE;
501 
502 	/* and start the periodic cache update machinery */
503 	skmem_dispatch(skmem_cache_update_tc, NULL,
504 	    (skmem_cache_update_interval * NSEC_PER_SEC));
505 
506 	__skmem_cache_inited = 1;
507 }
508 
509 void
510 skmem_cache_fini(void)
511 {
512 	struct skmem_magtype *mtp;
513 	uint32_t i;
514 
515 	if (__skmem_cache_inited) {
516 		ASSERT(TAILQ_EMPTY(&skmem_cache_head));
517 
518 		for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
519 			mtp = &skmem_magtype[i];
520 			skmem_cache_destroy(mtp->mt_cache);
521 			mtp->mt_cache = NULL;
522 		}
523 		skmem_cache_destroy(skmem_slab_cache);
524 		skmem_slab_cache = NULL;
525 		skmem_cache_destroy(skmem_bufctl_cache);
526 		skmem_bufctl_cache = NULL;
527 
528 		if (skmem_cache_reap_tc != NULL) {
529 			(void) thread_call_cancel_wait(skmem_cache_reap_tc);
530 			(void) thread_call_free(skmem_cache_reap_tc);
531 			skmem_cache_reap_tc = NULL;
532 		}
533 		if (skmem_cache_update_tc != NULL) {
534 			(void) thread_call_cancel_wait(skmem_cache_update_tc);
535 			(void) thread_call_free(skmem_cache_update_tc);
536 			skmem_cache_update_tc = NULL;
537 		}
538 
539 		__skmem_cache_inited = 0;
540 	}
541 
542 	if (__skmem_cache_pre_inited) {
543 		if (skm_zone != NULL) {
544 			zdestroy(skm_zone);
545 			skm_zone = NULL;
546 		}
547 
548 		__skmem_cache_pre_inited = 0;
549 	}
550 }
551 
552 /*
553  * Create a cache.
554  */
555 struct skmem_cache *
556 skmem_cache_create(const char *name, size_t bufsize, size_t bufalign,
557     skmem_ctor_fn_t ctor, skmem_dtor_fn_t dtor, skmem_reclaim_fn_t reclaim,
558     void *private, struct skmem_region *region, uint32_t cflags)
559 {
560 	boolean_t pseudo = (region == NULL);
561 	struct skmem_magtype *mtp;
562 	struct skmem_cache *skm;
563 	void *buf;
564 	size_t segsize;
565 	size_t chunksize;
566 	size_t objsize;
567 	size_t objalign;
568 	uint32_t i, cpuid;
569 
570 	/* enforce 64-bit minimum alignment for buffers */
571 	if (bufalign == 0) {
572 		bufalign = SKMEM_CACHE_ALIGN;
573 	}
574 	bufalign = P2ROUNDUP(bufalign, SKMEM_CACHE_ALIGN);
575 
576 	/* enforce alignment to be a power of 2 */
577 	VERIFY(powerof2(bufalign));
578 
579 	if (region == NULL) {
580 		struct skmem_region_params srp;
581 
582 		/* batching is currently not supported on pseudo regions */
583 		VERIFY(!(cflags & SKMEM_CR_BATCH));
584 
585 		srp = *skmem_get_default(SKMEM_REGION_INTRINSIC);
586 		ASSERT(srp.srp_cflags == SKMEM_REGION_CR_PSEUDO);
587 
588 		/* objalign is always equal to bufalign */
589 		srp.srp_align = objalign = bufalign;
590 		srp.srp_r_obj_cnt = 1;
591 		srp.srp_r_obj_size = (uint32_t)bufsize;
592 		skmem_region_params_config(&srp);
593 
594 		/* allocate region for intrinsics */
595 		region = skmem_region_create(name, &srp, NULL, NULL, NULL);
596 		VERIFY(region->skr_c_obj_size >= P2ROUNDUP(bufsize, bufalign));
597 		VERIFY(objalign == region->skr_align);
598 #if KASAN
599 		/*
600 		 * When KASAN is enabled, the zone allocator adjusts the
601 		 * element size to include the redzone regions, in which
602 		 * case we assume that the elements won't start on the
603 		 * alignment boundary and thus need to do some fix-ups.
604 		 * These include increasing the effective object size
605 		 * which adds at least 16 bytes to the original size,
606 		 * as computed by skmem_region_params_config() above.
607 		 */
608 		VERIFY(region->skr_c_obj_size >=
609 		    (bufsize + sizeof(uint64_t) + bufalign));
610 #endif /* KASAN */
611 		/* enable magazine resizing by default */
612 		cflags |= SKMEM_CR_DYNAMIC;
613 
614 		/*
615 		 * For consistency with ZC_ZFREE_CLEARMEM on skr->zreg,
616 		 * even though it's a no-op since the work is done
617 		 * at the zone layer instead.
618 		 */
619 		cflags |= SKMEM_CR_CLEARONFREE;
620 	} else {
621 		objalign = region->skr_align;
622 	}
623 
624 	ASSERT(region != NULL);
625 	ASSERT(!(region->skr_mode & SKR_MODE_MIRRORED));
626 	segsize = region->skr_seg_size;
627 	ASSERT(bufalign <= segsize);
628 
629 	buf = zalloc_flags(skm_zone, Z_WAITOK | Z_ZERO);
630 #if KASAN
631 	/*
632 	 * In case we didn't get cache-aligned memory, round it up
633 	 * accordingly.  This is needed in order to get the rest of
634 	 * structure members aligned properly.  It also means that
635 	 * the memory span gets shifted due to the round up, but it
636 	 * is okay since we've allocated extra space for this.
637 	 */
638 	skm = (struct skmem_cache *)
639 	    P2ROUNDUP((intptr_t)buf + sizeof(void *), CHANNEL_CACHE_ALIGN_MAX);
640 	void **pbuf = (void **)((intptr_t)skm - sizeof(void *));
641 	*pbuf = buf;
642 #else /* !KASAN */
643 	/*
644 	 * We expect that the zone allocator would allocate elements
645 	 * rounded up to the requested alignment based on the object
646 	 * size computed in skmem_cache_pre_init() earlier, and
647 	 * 'skm' is therefore the element address itself.
648 	 */
649 	skm = buf;
650 #endif /* !KASAN */
651 	VERIFY(IS_P2ALIGNED(skm, CHANNEL_CACHE_ALIGN_MAX));
652 
653 	if ((skmem_debug & SKMEM_DEBUG_NOMAGAZINES) ||
654 	    (cflags & SKMEM_CR_NOMAGAZINES)) {
655 		/*
656 		 * Either the caller insists that this cache should not
657 		 * utilize magazines layer, or that the system override
658 		 * to disable magazines layer on all caches has been set.
659 		 */
660 		skm->skm_mode |= SKM_MODE_NOMAGAZINES;
661 	} else {
662 		/*
663 		 * Region must be configured with enough objects
664 		 * to take into account objects at the CPU layer.
665 		 */
666 		ASSERT(!(region->skr_mode & SKR_MODE_NOMAGAZINES));
667 	}
668 
669 	if (cflags & SKMEM_CR_DYNAMIC) {
670 		/*
671 		 * Enable per-CPU cache magazine resizing.
672 		 */
673 		skm->skm_mode |= SKM_MODE_DYNAMIC;
674 	}
675 
676 	/* region stays around after defunct? */
677 	if (region->skr_mode & SKR_MODE_NOREDIRECT) {
678 		skm->skm_mode |= SKM_MODE_NOREDIRECT;
679 	}
680 
681 	if (cflags & SKMEM_CR_BATCH) {
682 		/*
683 		 * Batch alloc/free involves storing the next object
684 		 * pointer at the beginning of each object; this is
685 		 * okay for kernel-only regions, but not those that
686 		 * are mappable to user space (we can't leak kernel
687 		 * addresses).
688 		 */
689 		_CASSERT(offsetof(struct skmem_obj, mo_next) == 0);
690 		VERIFY(!(region->skr_mode & SKR_MODE_MMAPOK));
691 
692 		/* batching is currently not supported on pseudo regions */
693 		VERIFY(!(region->skr_mode & SKR_MODE_PSEUDO));
694 
695 		/* validate object size */
696 		VERIFY(region->skr_c_obj_size >= sizeof(struct skmem_obj));
697 
698 		skm->skm_mode |= SKM_MODE_BATCH;
699 	}
700 
701 	uuid_generate_random(skm->skm_uuid);
702 	(void) snprintf(skm->skm_name, sizeof(skm->skm_name),
703 	    "%s.%s", SKMEM_CACHE_PREFIX, name);
704 	skm->skm_bufsize = bufsize;
705 	skm->skm_bufalign = bufalign;
706 	skm->skm_objalign = objalign;
707 	skm->skm_ctor = ctor;
708 	skm->skm_dtor = dtor;
709 	skm->skm_reclaim = reclaim;
710 	skm->skm_private = private;
711 	skm->skm_slabsize = segsize;
712 
713 	skm->skm_region = region;
714 	/* callee holds reference */
715 	skmem_region_slab_config(region, skm, true);
716 	objsize = region->skr_c_obj_size;
717 	skm->skm_objsize = objsize;
718 
719 	if (pseudo) {
720 		/*
721 		 * Release reference from skmem_region_create()
722 		 * since skm->skm_region holds one now.
723 		 */
724 		ASSERT(region->skr_mode & SKR_MODE_PSEUDO);
725 		skmem_region_release(region);
726 
727 		skm->skm_mode |= SKM_MODE_PSEUDO;
728 
729 		skm->skm_slab_alloc = skmem_slab_alloc_pseudo_locked;
730 		skm->skm_slab_free = skmem_slab_free_pseudo_locked;
731 	} else {
732 		skm->skm_slab_alloc = skmem_slab_alloc_locked;
733 		skm->skm_slab_free = skmem_slab_free_locked;
734 
735 		/* auditing was requested? (normal regions only) */
736 		if (skmem_debug & SKMEM_DEBUG_AUDIT) {
737 			ASSERT(bc_size == sizeof(struct skmem_bufctl_audit));
738 			skm->skm_mode |= SKM_MODE_AUDIT;
739 		}
740 	}
741 
742 	/*
743 	 * Clear upon free (to slab layer) as long as the region is
744 	 * not marked as read-only for kernel, and if the chunk size
745 	 * is within the threshold or if the caller had requested it.
746 	 */
747 	if (!(region->skr_mode & SKR_MODE_KREADONLY)) {
748 		if (skm->skm_objsize <= skmem_clear_min ||
749 		    (cflags & SKMEM_CR_CLEARONFREE)) {
750 			skm->skm_mode |= SKM_MODE_CLEARONFREE;
751 		}
752 	}
753 
754 	chunksize = bufsize;
755 	if (bufalign >= SKMEM_CACHE_ALIGN) {
756 		chunksize = P2ROUNDUP(chunksize, SKMEM_CACHE_ALIGN);
757 	}
758 
759 	chunksize = P2ROUNDUP(chunksize, bufalign);
760 	if (chunksize > objsize) {
761 		panic("%s: (bufsize %lu, chunksize %lu) > objsize %lu",
762 		    __func__, bufsize, chunksize, objsize);
763 		/* NOTREACHED */
764 		__builtin_unreachable();
765 	}
766 	ASSERT(chunksize != 0);
767 	skm->skm_chunksize = chunksize;
768 
769 	lck_mtx_init(&skm->skm_sl_lock, &skmem_sl_lock_grp, &skmem_lock_attr);
770 	TAILQ_INIT(&skm->skm_sl_partial_list);
771 	TAILQ_INIT(&skm->skm_sl_empty_list);
772 
773 	/* allocated-address hash table */
774 	skm->skm_hash_initial = SKMEM_CACHE_HASH_INITIAL;
775 	skm->skm_hash_limit = SKMEM_CACHE_HASH_LIMIT;
776 	skm->skm_hash_table = sk_alloc_type_array(struct skmem_bufctl_bkt,
777 	    skm->skm_hash_initial, Z_WAITOK | Z_NOFAIL, skmem_tag_bufctl_hash);
778 
779 	skm->skm_hash_mask = (skm->skm_hash_initial - 1);
780 	skm->skm_hash_shift = flsll(chunksize) - 1;
781 
782 	for (i = 0; i < (skm->skm_hash_mask + 1); i++) {
783 		SLIST_INIT(&skm->skm_hash_table[i].bcb_head);
784 	}
785 
786 	lck_mtx_init(&skm->skm_dp_lock, &skmem_dp_lock_grp, &skmem_lock_attr);
787 
788 	/* find a suitable magazine type for this chunk size */
789 	for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
790 		continue;
791 	}
792 
793 	skm->skm_magtype = mtp;
794 	if (!(skm->skm_mode & SKM_MODE_NOMAGAZINES)) {
795 		skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
796 	}
797 
798 	/*
799 	 * Initialize the CPU layer.  Each per-CPU structure is aligned
800 	 * on the CPU cache line boundary to prevent false sharing.
801 	 */
802 	lck_mtx_init(&skm->skm_rs_lock, &skmem_cpu_lock_grp, &skmem_lock_attr);
803 	for (cpuid = 0; cpuid < ncpu; cpuid++) {
804 		struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];
805 
806 		VERIFY(IS_P2ALIGNED(ccp, CHANNEL_CACHE_ALIGN_MAX));
807 		lck_mtx_init(&ccp->cp_lock, &skmem_cpu_lock_grp,
808 		    &skmem_lock_attr);
809 		ccp->cp_rounds = -1;
810 		ccp->cp_prounds = -1;
811 	}
812 
813 	SKMEM_CACHE_LOCK();
814 	TAILQ_INSERT_TAIL(&skmem_cache_head, skm, skm_link);
815 	SKMEM_CACHE_UNLOCK();
816 
817 	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx mode 0x%b",
818 	    skm->skm_name, SK_KVA(skm), skm->skm_mode, SKM_MODE_BITS);
819 	SK_DF(SK_VERB_MEM_CACHE,
820 	    "  bufsz %u bufalign %u chunksz %u objsz %u slabsz %u",
821 	    (uint32_t)skm->skm_bufsize, (uint32_t)skm->skm_bufalign,
822 	    (uint32_t)skm->skm_chunksize, (uint32_t)skm->skm_objsize,
823 	    (uint32_t)skm->skm_slabsize);
824 
825 	if (skmem_cache_ready) {
826 		skmem_cache_magazine_enable(skm, 0);
827 	}
828 
829 	if (cflags & SKMEM_CR_RECLAIM) {
830 		skm->skm_mode |= SKM_MODE_RECLAIM;
831 	}
832 
833 	return skm;
834 }
835 
836 /*
837  * Destroy a cache.
838  */
839 void
840 skmem_cache_destroy(struct skmem_cache *skm)
841 {
842 	uint32_t cpuid;
843 
844 	SKMEM_CACHE_LOCK();
845 	TAILQ_REMOVE(&skmem_cache_head, skm, skm_link);
846 	SKMEM_CACHE_UNLOCK();
847 
848 	ASSERT(skm->skm_rs_busy == 0);
849 	ASSERT(skm->skm_rs_want == 0);
850 
851 	/* purge all cached objects for this cache */
852 	skmem_cache_magazine_purge(skm);
853 
854 	/*
855 	 * Panic if we detect there are unfreed objects; the caller
856 	 * destroying this cache is responsible for ensuring that all
857 	 * allocated objects have been freed prior to getting here.
858 	 */
859 	SKM_SLAB_LOCK(skm);
860 	if (skm->skm_sl_bufinuse != 0) {
861 		panic("%s: '%s' (%p) not empty (%llu unfreed)", __func__,
862 		    skm->skm_name, (void *)skm, skm->skm_sl_bufinuse);
863 		/* NOTREACHED */
864 		__builtin_unreachable();
865 	}
866 	ASSERT(TAILQ_EMPTY(&skm->skm_sl_partial_list));
867 	ASSERT(skm->skm_sl_partial == 0);
868 	ASSERT(TAILQ_EMPTY(&skm->skm_sl_empty_list));
869 	ASSERT(skm->skm_sl_empty == 0);
870 	skm->skm_reclaim = NULL;
871 	skm->skm_ctor = NULL;
872 	skm->skm_dtor = NULL;
873 	SKM_SLAB_UNLOCK(skm);
874 
875 	if (skm->skm_hash_table != NULL) {
876 #if (DEBUG || DEVELOPMENT)
877 		for (uint32_t i = 0; i < (skm->skm_hash_mask + 1); i++) {
878 			ASSERT(SLIST_EMPTY(&skm->skm_hash_table[i].bcb_head));
879 		}
880 #endif /* DEBUG || DEVELOPMENT */
881 
882 		sk_free_type_array(struct skmem_bufctl_bkt,
883 		    skm->skm_hash_mask + 1, skm->skm_hash_table);
884 		skm->skm_hash_table = NULL;
885 	}
886 
887 	for (cpuid = 0; cpuid < ncpu; cpuid++) {
888 		lck_mtx_destroy(&skm->skm_cpu_cache[cpuid].cp_lock,
889 		    &skmem_cpu_lock_grp);
890 	}
891 	lck_mtx_destroy(&skm->skm_rs_lock, &skmem_cpu_lock_grp);
892 	lck_mtx_destroy(&skm->skm_dp_lock, &skmem_dp_lock_grp);
893 	lck_mtx_destroy(&skm->skm_sl_lock, &skmem_sl_lock_grp);
894 
895 	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx",
896 	    skm->skm_name, SK_KVA(skm));
897 
898 	/* callee releases reference */
899 	skmem_region_slab_config(skm->skm_region, skm, false);
900 	skm->skm_region = NULL;
901 
902 #if KASAN
903 	/* get the original address since we're about to free it */
904 	void **pbuf = (void **)((intptr_t)skm - sizeof(void *));
905 	skm = *pbuf;
906 #endif /* KASAN */
907 
908 	zfree(skm_zone, skm);
909 }
910 
911 /*
912  * Create a slab.
913  */
914 static struct skmem_slab *
915 skmem_slab_create(struct skmem_cache *skm, uint32_t skmflag)
916 {
917 	struct skmem_region *skr = skm->skm_region;
918 	uint32_t objsize, chunks;
919 	size_t slabsize = skm->skm_slabsize;
920 	struct skmem_slab *sl;
921 	struct sksegment *sg, *sgm;
922 	char *buf, *bufm, *slab, *slabm;
923 
924 	/*
925 	 * Allocate a segment (a slab at our layer) from the region.
926 	 */
927 	slab = skmem_region_alloc(skr, (void **)&slabm, &sg, &sgm, skmflag);
928 	if (slab == NULL) {
929 		goto rg_alloc_failure;
930 	}
931 
932 	if ((sl = skmem_cache_alloc(skmem_slab_cache, SKMEM_SLEEP)) == NULL) {
933 		goto slab_alloc_failure;
934 	}
935 
936 	ASSERT(sg != NULL);
937 	ASSERT(sgm == NULL || sgm->sg_index == sg->sg_index);
938 
939 	bzero(sl, sizeof(*sl));
940 	sl->sl_cache = skm;
941 	sl->sl_base = buf = slab;
942 	sl->sl_basem = bufm = slabm;
943 	ASSERT(skr->skr_c_obj_size <= UINT32_MAX);
944 	objsize = (uint32_t)skr->skr_c_obj_size;
945 	ASSERT(skm->skm_objsize == objsize);
946 	ASSERT((slabsize / objsize) <= UINT32_MAX);
947 	sl->sl_chunks = chunks = (uint32_t)(slabsize / objsize);
948 	sl->sl_seg = sg;
949 	sl->sl_segm = sgm;
950 
951 	/*
952 	 * Create one or more buffer control structures for the slab,
953 	 * each one tracking a chunk of raw object from the segment,
954 	 * and insert these into the slab's list of buffer controls.
955 	 */
956 	ASSERT(chunks > 0);
957 	while (chunks != 0) {
958 		struct skmem_bufctl *bc;
959 
960 		bc = skmem_cache_alloc(skmem_bufctl_cache, SKMEM_SLEEP);
961 		if (bc == NULL) {
962 			goto bufctl_alloc_failure;
963 		}
964 
965 		bzero(bc, bc_size);
966 		bc->bc_addr = buf;
967 		bc->bc_addrm = bufm;
968 		bc->bc_slab = sl;
969 		bc->bc_idx = (sl->sl_chunks - chunks);
970 		if (skr->skr_mode & SKR_MODE_SHAREOK) {
971 			bc->bc_flags |= SKMEM_BUFCTL_SHAREOK;
972 		}
973 		SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
974 		bc->bc_lim = objsize;
975 		buf += objsize;
976 		if (bufm != NULL) {
977 			bufm += objsize;
978 		}
979 		--chunks;
980 	}
981 
982 	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
983 	    SK_KVA(skm), SK_KVA(sl));
984 	SK_DF(SK_VERB_MEM_CACHE, "  [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
985 	    SK_KVA(slab), SK_KVA(slab + objsize));
986 
987 	return sl;
988 
989 bufctl_alloc_failure:
990 	skmem_slab_destroy(skm, sl);
991 
992 slab_alloc_failure:
993 	skmem_region_free(skr, slab, slabm);
994 
995 rg_alloc_failure:
996 	os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
997 
998 	return NULL;
999 }
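/*
 * Worked example (illustrative numbers): a 32 KB segment backing a cache
 * with 2 KB objects yields sl_chunks = 16; the i-th buffer control then
 * tracks the chunk at sl_base + (i * objsize) with bc_idx = i.
 */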
1000 
1001 /*
1002  * Destroy a slab.
1003  */
1004 static void
1005 skmem_slab_destroy(struct skmem_cache *skm, struct skmem_slab *sl)
1006 {
1007 	struct skmem_bufctl *bc, *tbc;
1008 	void *slab = sl->sl_base;
1009 	void *slabm = sl->sl_basem;
1010 
1011 	ASSERT(sl->sl_refcnt == 0);
1012 
1013 	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx sl 0x%llx",
1014 	    SK_KVA(skm), SK_KVA(sl));
1015 	SK_DF(SK_VERB_MEM_CACHE, "  [%u] [0x%llx-0x%llx)", sl->sl_seg->sg_index,
1016 	    SK_KVA(slab), SK_KVA((uintptr_t)slab + skm->skm_objsize));
1017 
1018 	/*
1019 	 * Go through the slab's list of buffer controls and free
1020 	 * them, and then free the slab itself back to its cache.
1021 	 */
1022 	SLIST_FOREACH_SAFE(bc, &sl->sl_head, bc_link, tbc) {
1023 		SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
1024 		skmem_cache_free(skmem_bufctl_cache, bc);
1025 	}
1026 	skmem_cache_free(skmem_slab_cache, sl);
1027 
1028 	/* and finally free the segment back to the backing region */
1029 	skmem_region_free(skm->skm_region, slab, slabm);
1030 }
1031 
1032 /*
1033  * Allocate a raw object from the (locked) slab layer.  Normal region variant.
1034  */
1035 static int
1036 skmem_slab_alloc_locked(struct skmem_cache *skm, struct skmem_obj_info *oi,
1037     struct skmem_obj_info *oim, uint32_t skmflag)
1038 {
1039 	struct skmem_bufctl_bkt *bcb;
1040 	struct skmem_bufctl *bc;
1041 	struct skmem_slab *sl;
1042 	uint32_t retries = 0;
1043 	uint64_t boff_total = 0;                /* in usec */
1044 	uint64_t boff = 0;                      /* in msec */
1045 	boolean_t new_slab;
1046 	void *buf;
1047 #if CONFIG_KERNEL_TAGGING
1048 	vm_offset_t tagged_address;             /* address tagging */
1049 	struct skmem_region *region;            /* region source for this slab */
1050 #endif /* CONFIG_KERNEL_TAGGING */
1051 
1052 	/* this flag is not for the caller to set */
1053 	VERIFY(!(skmflag & SKMEM_FAILOK));
1054 
1055 	/*
1056 	 * A slab is either in a partially-allocated list (at least it has
1057 	 * a free object available), or is in the empty list (everything
1058 	 * has been allocated.)  If we can't find a partially-allocated
1059 	 * slab, then we need to allocate a slab (segment) from the region.
1060 	 */
1061 again:
1062 	SKM_SLAB_LOCK_ASSERT_HELD(skm);
1063 	sl = TAILQ_FIRST(&skm->skm_sl_partial_list);
1064 	if (sl == NULL) {
1065 		uint32_t flags = skmflag;
1066 		boolean_t retry;
1067 
1068 		ASSERT(skm->skm_sl_partial == 0);
1069 		SKM_SLAB_UNLOCK(skm);
1070 		if (!(flags & SKMEM_NOSLEEP)) {
1071 			/*
1072 			 * Pick up a random value to start the exponential
1073 			 * backoff, if this is the first round, or if the
1074 			 * current value is over the threshold.  Otherwise,
1075 			 * double the backoff value.
1076 			 */
1077 			if (boff == 0 || boff > SKMEM_SLAB_BACKOFF_THRES) {
1078 				read_frandom(&boff, sizeof(boff));
1079 				boff = (boff % SKMEM_SLAB_BACKOFF_RANDOM) + 1;
1080 				ASSERT(boff > 0);
1081 			} else if (os_mul_overflow(boff, 2, &boff)) {
1082 				panic_plain("\"%s\": boff counter "
1083 				    "overflows\n", skm->skm_name);
1084 				/* NOTREACHED */
1085 				__builtin_unreachable();
1086 			}
1087 			/* add this value (in msec) to the total (in usec) */
1088 			if (os_add_overflow(boff_total,
1089 			    (boff * NSEC_PER_USEC), &boff_total)) {
1090 				panic_plain("\"%s\": boff_total counter "
1091 				    "overflows\n", skm->skm_name);
1092 				/* NOTREACHED */
1093 				__builtin_unreachable();
1094 			}
1095 		}
1096 		/*
1097 		 * In the event of a race between multiple threads trying
1098 		 * to create the last remaining (or the only) slab, let the
1099 		 * loser(s) attempt to retry after waiting a bit.  The winner
1100 		 * would have inserted the newly-created slab into the list.
1101 		 */
1102 		if (!(flags & SKMEM_NOSLEEP) &&
1103 		    boff_total <= SKMEM_SLAB_MAX_BACKOFF) {
1104 			retry = TRUE;
1105 			++retries;
1106 			flags |= SKMEM_FAILOK;
1107 		} else {
1108 			if (!(flags & SKMEM_NOSLEEP)) {
1109 				panic_plain("\"%s\": failed to allocate "
1110 				    "slab (sleeping mode) after %llu "
1111 				    "msec, %u retries\n\n%s", skm->skm_name,
1112 				    (boff_total / NSEC_PER_USEC), retries,
1113 				    skmem_dump(skm->skm_region));
1114 				/* NOTREACHED */
1115 				__builtin_unreachable();
1116 			}
1117 			retry = FALSE;
1118 		}
1119 
1120 		/*
1121 		 * Create a new slab.
1122 		 */
1123 		if ((sl = skmem_slab_create(skm, flags)) == NULL) {
1124 			if (retry) {
1125 				SK_ERR("\"%s\": failed to allocate "
1126 				    "slab (%ssleeping mode): waiting for %llu "
1127 				    "msec, total %llu msec, %u retries",
1128 				    skm->skm_name,
1129 				    (flags & SKMEM_NOSLEEP) ? "non-" : "",
1130 				    boff, (boff_total / NSEC_PER_USEC), retries);
1131 				VERIFY(boff > 0 && ((uint32_t)boff <=
1132 				    (SKMEM_SLAB_BACKOFF_THRES * 2)));
1133 				delay((uint32_t)boff * NSEC_PER_USEC);
1134 				SKM_SLAB_LOCK(skm);
1135 				goto again;
1136 			} else {
1137 				SK_RDERR(4, "\"%s\": failed to allocate slab "
1138 				    "(%ssleeping mode)", skm->skm_name,
1139 				    (flags & SKMEM_NOSLEEP) ? "non-" : "");
1140 				SKM_SLAB_LOCK(skm);
1141 			}
1142 			return ENOMEM;
1143 		}
1144 
1145 		SKM_SLAB_LOCK(skm);
1146 		skm->skm_sl_create++;
1147 		if ((skm->skm_sl_bufinuse += sl->sl_chunks) >
1148 		    skm->skm_sl_bufmax) {
1149 			skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
1150 		}
1151 	}
1152 	skm->skm_sl_alloc++;
1153 
1154 	new_slab = (sl->sl_refcnt == 0);
1155 	ASSERT(new_slab || SKMEM_SLAB_IS_PARTIAL(sl));
1156 
1157 	sl->sl_refcnt++;
1158 	ASSERT(sl->sl_refcnt <= sl->sl_chunks);
1159 
1160 	/*
1161 	 * We either have a new slab, or a partially-allocated one.
1162 	 * Remove a buffer control from the slab, and insert it to
1163 	 * the allocated-address hash chain.
1164 	 */
1165 	bc = SLIST_FIRST(&sl->sl_head);
1166 	ASSERT(bc != NULL);
1167 	SLIST_REMOVE(&sl->sl_head, bc, skmem_bufctl, bc_link);
1168 
1169 	/* sanity check */
1170 	VERIFY(bc->bc_usecnt == 0);
1171 
1172 	/*
1173 	 * Also store the master object's region info for the caller.
1174 	 */
1175 	bzero(oi, sizeof(*oi));
1176 #if CONFIG_KERNEL_TAGGING
1177 	region = sl->sl_cache->skm_region;
1178 	if (region->skr_mode & SKR_MODE_MEMTAG) {
1179 		/*
1180 		 * If this region is configured to be tagged, we generate a
1181 		 * unique tag for the object address, and return this tagged
1182 		 * address to the caller. vm_memtag_assign_tag generates a
1183 		 * unique tag for the given address and size, and
1184 		 * vm_memtag_set_tag commits the tag to the backing memory
1185 		 * metadata. This tagged address is returned back to the client,
1186 		 * and when the client frees the address, we "re-tag" the
1187 		 * address to prevent against use-after-free attacks (more on
1188 		 * this in skmem_cache_batch_free).
1189 		 */
1190 		tagged_address = vm_memtag_assign_tag((vm_offset_t)bc->bc_addr,
1191 		    skm->skm_objsize);
1192 		vm_memtag_set_tag(tagged_address, skm->skm_objsize);
1193 		buf = (void *)tagged_address;
1194 	} else {
1195 		buf = bc->bc_addr;
1196 	}
1197 #else /* !CONFIG_KERNEL_TAGGING */
1198 	buf = bc->bc_addr;
1199 #endif /* CONFIG_KERNEL_TAGGING */
1200 	SKMEM_OBJ_ADDR(oi) = buf;
1201 	SKMEM_OBJ_BUFCTL(oi) = bc;      /* master only; NULL for slave */
1202 	ASSERT(skm->skm_objsize <= UINT32_MAX);
1203 	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
1204 	SKMEM_OBJ_IDX_REG(oi) =
1205 	    ((sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx);
1206 	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
1207 	/*
1208 	 * And for slave object.
1209 	 */
1210 	if (oim != NULL) {
1211 		bzero(oim, sizeof(*oim));
1212 		if (bc->bc_addrm != NULL) {
1213 			SKMEM_OBJ_ADDR(oim) = bc->bc_addrm;
1214 			SKMEM_OBJ_SIZE(oim) = SKMEM_OBJ_SIZE(oi);
1215 			SKMEM_OBJ_IDX_REG(oim) = SKMEM_OBJ_IDX_REG(oi);
1216 			SKMEM_OBJ_IDX_SEG(oim) = SKMEM_OBJ_IDX_SEG(oi);
1217 		}
1218 	}
1219 
1220 	if (skm->skm_mode & SKM_MODE_BATCH) {
1221 		((struct skmem_obj *)buf)->mo_next = NULL;
1222 	}
1223 
1224 	/* insert to allocated-address hash chain */
1225 	bcb = SKMEM_CACHE_HASH(skm, buf);
1226 	SLIST_INSERT_HEAD(&bcb->bcb_head, bc, bc_link);
1227 
1228 	if (SLIST_EMPTY(&sl->sl_head)) {
1229 		/*
1230 		 * If that was the last buffer control from this slab,
1231 		 * insert the slab into the empty list.  If it was in
1232 		 * the partially-allocated list, then remove the slab
1233 		 * from there as well.
1234 		 */
1235 		ASSERT(sl->sl_refcnt == sl->sl_chunks);
1236 		if (new_slab) {
1237 			ASSERT(sl->sl_chunks == 1);
1238 		} else {
1239 			ASSERT(sl->sl_chunks > 1);
1240 			ASSERT(skm->skm_sl_partial > 0);
1241 			skm->skm_sl_partial--;
1242 			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
1243 		}
1244 		skm->skm_sl_empty++;
1245 		ASSERT(skm->skm_sl_empty != 0);
1246 		TAILQ_INSERT_HEAD(&skm->skm_sl_empty_list, sl, sl_link);
1247 	} else {
1248 		/*
1249 		 * The slab is not empty; if it was newly allocated
1250 		 * above, then it's not in the partially-allocated
1251 		 * list and so we insert it there.
1252 		 */
1253 		ASSERT(SKMEM_SLAB_IS_PARTIAL(sl));
1254 		if (new_slab) {
1255 			skm->skm_sl_partial++;
1256 			ASSERT(skm->skm_sl_partial != 0);
1257 			TAILQ_INSERT_HEAD(&skm->skm_sl_partial_list,
1258 			    sl, sl_link);
1259 		}
1260 	}
1261 
1262 	/* if auditing is enabled, record this transaction */
1263 	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
1264 		skmem_audit_bufctl(bc);
1265 	}
1266 
1267 	return 0;
1268 }
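/*
 * Worked example of the index arithmetic above (illustrative values): a
 * buffer control with bc_idx 2, coming from segment index 3 of a slab
 * holding 4 chunks, yields SKMEM_OBJ_IDX_REG = (3 * 4) + 2 = 14, while
 * SKMEM_OBJ_IDX_SEG stays 2 (the object's position within its own slab).
 */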
1269 
1270 /*
1271  * Allocate a raw object from the (locked) slab layer.  Pseudo region variant.
1272  */
1273 static int
1274 skmem_slab_alloc_pseudo_locked(struct skmem_cache *skm,
1275     struct skmem_obj_info *oi, struct skmem_obj_info *oim, uint32_t skmflag)
1276 {
1277 	zalloc_flags_t zflags = (skmflag & SKMEM_NOSLEEP) ? Z_NOWAIT : Z_WAITOK;
1278 	struct skmem_region *skr = skm->skm_region;
1279 	void *obj, *buf;
1280 
1281 	/* this flag is not for the caller to set */
1282 	VERIFY(!(skmflag & SKMEM_FAILOK));
1283 
1284 	SKM_SLAB_LOCK_ASSERT_HELD(skm);
1285 
1286 	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
1287 	/* mirrored region is not applicable */
1288 	ASSERT(!(skr->skr_mode & SKR_MODE_MIRRORED));
1289 	/* batching is not yet supported */
1290 	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH));
1291 
1292 	if ((obj = zalloc_flags(skr->skr_zreg, zflags | Z_ZERO)) == NULL) {
1293 		os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
1294 		return ENOMEM;
1295 	}
1296 
1297 #if KASAN
1298 	/*
1299 	 * Perform some fix-ups since the zone element isn't guaranteed
1300 	 * to be on the aligned boundary.  The effective object size
1301 	 * has been adjusted accordingly by skmem_region_create() earlier
1302 	 * at cache creation time.
1303 	 *
1304 	 * 'buf' gets the aligned address for this object.
1305 	 */
1306 	buf = (void *)P2ROUNDUP((intptr_t)obj + sizeof(u_int64_t),
1307 	    skm->skm_bufalign);
1308 
1309 	/*
1310 	 * Wind back a pointer size from the aligned address and
1311 	 * save the original address so we can free it later.
1312 	 */
1313 	void **pbuf = (void **)((intptr_t)buf - sizeof(void *));
1314 	*pbuf = obj;
1315 
1316 	VERIFY(((intptr_t)buf + skm->skm_bufsize) <=
1317 	    ((intptr_t)obj + skm->skm_objsize));
1318 #else /* !KASAN */
1319 	/*
1320 	 * We expect that the zone allocator would allocate elements
1321 	 * rounded up to the requested alignment based on the effective
1322 	 * object size computed in skmem_region_create() earlier, and
1323 	 * 'buf' is therefore the element address itself.
1324 	 */
1325 	buf = obj;
1326 #endif /* !KASAN */
1327 
1328 	/* make sure the object is aligned */
1329 	VERIFY(IS_P2ALIGNED(buf, skm->skm_bufalign));
1330 
1331 	/*
1332 	 * Return the object's info to the caller.
1333 	 */
1334 	bzero(oi, sizeof(*oi));
1335 	SKMEM_OBJ_ADDR(oi) = buf;
1336 	ASSERT(skm->skm_objsize <= UINT32_MAX);
1337 	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
1338 	if (oim != NULL) {
1339 		bzero(oim, sizeof(*oim));
1340 	}
1341 
1342 	skm->skm_sl_alloc++;
1343 	skm->skm_sl_bufinuse++;
1344 	if (skm->skm_sl_bufinuse > skm->skm_sl_bufmax) {
1345 		skm->skm_sl_bufmax = skm->skm_sl_bufinuse;
1346 	}
1347 
1348 	return 0;
1349 }
1350 
1351 /*
1352  * Allocate a raw object from the slab layer.
1353  */
1354 static int
1355 skmem_slab_alloc(struct skmem_cache *skm, struct skmem_obj_info *oi,
1356     struct skmem_obj_info *oim, uint32_t skmflag)
1357 {
1358 	int err;
1359 
1360 	SKM_SLAB_LOCK(skm);
1361 	err = skm->skm_slab_alloc(skm, oi, oim, skmflag);
1362 	SKM_SLAB_UNLOCK(skm);
1363 
1364 	return err;
1365 }
1366 
1367 /*
1368  * Allocate raw object(s) from the slab layer.
1369  */
1370 static uint32_t
1371 skmem_slab_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
1372     uint32_t num, uint32_t skmflag)
1373 {
1374 	uint32_t need = num;
1375 
1376 	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
1377 	*list = NULL;
1378 
1379 	SKM_SLAB_LOCK(skm);
1380 	for (;;) {
1381 		struct skmem_obj_info oi, oim;
1382 
1383 		/*
1384 		 * Get a single raw object from the slab layer.
1385 		 */
1386 		if (skm->skm_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
1387 			break;
1388 		}
1389 
1390 		*list = SKMEM_OBJ_ADDR(&oi);
1391 		ASSERT((*list)->mo_next == NULL);
1392 		/* store these inside the object itself */
1393 		(*list)->mo_info = oi;
1394 		(*list)->mo_minfo = oim;
1395 		list = &(*list)->mo_next;
1396 
1397 		ASSERT(need != 0);
1398 		if (--need == 0) {
1399 			break;
1400 		}
1401 	}
1402 	SKM_SLAB_UNLOCK(skm);
1403 
1404 	return num - need;
1405 }
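/*
 * Illustrative sketch of how an in-kernel caller would consume the chained
 * result (variable names below are placeholders); each returned object
 * carries its skmem_obj_info in mo_info:
 *
 *	struct skmem_obj *head, *obj;
 *	uint32_t got;
 *
 *	got = skmem_slab_batch_alloc(skm, &head, 16, SKMEM_NOSLEEP);
 *	for (obj = head; obj != NULL; obj = obj->mo_next)
 *		process(obj);
 */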
1406 
1407 /*
1408  * Free a raw object to the (locked) slab layer.  Normal region variant.
1409  */
1410 static void
1411 skmem_slab_free_locked(struct skmem_cache *skm, void *buf)
1412 {
1413 	struct skmem_bufctl *bc, *tbc;
1414 	struct skmem_bufctl_bkt *bcb;
1415 	struct skmem_slab *sl = NULL;
1416 #if CONFIG_KERNEL_TAGGING
1417 	struct skmem_region *region;
1418 	vm_offset_t tagged_addr;
1419 	/*
1420 	 * If buf is tagged, then addr would have the canonicalized address.
1421 	 * If buf is untagged, then addr is the same as buf.
1422 	 */
1423 	void *addr = (void *)vm_memtag_canonicalize_address((vm_offset_t)buf);
1424 #endif /* CONFIG_KERNEL_TAGGING */
1425 
1426 	SKM_SLAB_LOCK_ASSERT_HELD(skm);
1427 	ASSERT(buf != NULL);
1428 	/* caller is expected to clear mo_next */
1429 	ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
1430 	    ((struct skmem_obj *)buf)->mo_next == NULL);
1431 
1432 	/*
1433 	 * Search the hash chain to find a matching buffer control for the
1434 	 * given object address.  If found, remove the buffer control from
1435 	 * the hash chain and insert it into the freelist.  Otherwise, we
1436 	 * panic since the caller has given us a bogus address.
1437 	 */
1438 	skm->skm_sl_free++;
1439 	bcb = SKMEM_CACHE_HASH(skm, buf);
1440 
1441 #if CONFIG_KERNEL_TAGGING
1442 	/*
1443 	 * If this region is configured to tag memory addresses, then buf is a
1444 	 * tagged address. When we search for the buffer control from the hash
1445 	 * table, we need to use the untagged address, because buffer control
1446 	 * maintains untagged address (bc_addr). vm_memtag_canonicalize_address
1447 	 * returns the untagged address.
1448 	 */
1449 	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
1450 		if (bc->bc_addr == addr) {
1451 			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
1452 			sl = bc->bc_slab;
1453 			break;
1454 		}
1455 	}
1456 #else /* !CONFIG_KERNEL_TAGGING */
1457 	SLIST_FOREACH_SAFE(bc, &bcb->bcb_head, bc_link, tbc) {
1458 		if (bc->bc_addr == buf) {
1459 			SLIST_REMOVE(&bcb->bcb_head, bc, skmem_bufctl, bc_link);
1460 			sl = bc->bc_slab;
1461 			break;
1462 		}
1463 	}
1464 #endif /* CONFIG_KERNEL_TAGGING */
1465 
1466 	if (bc == NULL) {
1467 		panic("%s: attempt to free invalid or already-freed obj %p "
1468 		    "on skm %p", __func__, buf, skm);
1469 		/* NOTREACHED */
1470 		__builtin_unreachable();
1471 	}
1472 	ASSERT(sl != NULL && sl->sl_cache == skm);
1473 
1474 #if CONFIG_KERNEL_TAGGING
1475 	/*
1476 	 * We use untagged address here, because SKMEM_SLAB_MEMBER compares the
1477 	 * address against sl_base, which is untagged.
1478 	 */
1479 	VERIFY(SKMEM_SLAB_MEMBER(sl, addr));
1480 #else /* !CONFIG_KERNEL_TAGGING */
1481 	VERIFY(SKMEM_SLAB_MEMBER(sl, buf));
1482 #endif /* CONFIG_KERNEL_TAGGING */
1483 
1484 	/* make sure this object is not currently in use by another object */
1485 	VERIFY(bc->bc_usecnt == 0);
1486 
1487 	/* if auditing is enabled, record this transaction */
1488 	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
1489 		skmem_audit_bufctl(bc);
1490 	}
1491 
1492 	/* if clear on free is requested, zero out the object */
1493 	if (skm->skm_mode & SKM_MODE_CLEARONFREE) {
1494 		bzero(buf, skm->skm_objsize);
1495 	}
1496 
1497 #if CONFIG_KERNEL_TAGGING
1498 	/*
1499 	 * If this region is configured to tag memory addresses, we re-tag this
1500 	 * address as the object is freed. We do the re-tagging in the magazine
1501 	 * layer too, but in case we need to free raw objects to the slab layer
1502 	 * (either because SKM_MODE_NOMAGAZINES is set, or the magazine layer
1503 	 * was not able to allocate empty magazines), we re-tag the addresses
1504 	 * here in the slab layer. Freeing to the slab layer is symmetrical to
1505 	 * allocating from the slab layer - when we allocate from it, we
1506 	 * tag the address, and then construct the object; when we free to the
1507 	 * slab layer, we destruct the object, and retag the address.
1508 	 * We do the re-tagging here, because this is right after the last usage
1509 	 * of the buf variable (which is tagged).
1510 	 */
1511 	region = skm->skm_region;
1512 	if (region->skr_mode & SKR_MODE_MEMTAG) {
1513 		tagged_addr = vm_memtag_assign_tag((vm_offset_t)buf,
1514 		    skm->skm_objsize);
1515 		vm_memtag_set_tag(tagged_addr, skm->skm_objsize);
1516 	}
1517 #endif /* CONFIG_KERNEL_TAGGING */
1518 
1519 	/* insert the buffer control to the slab's freelist */
1520 	SLIST_INSERT_HEAD(&sl->sl_head, bc, bc_link);
1521 
1522 	ASSERT(sl->sl_refcnt >= 1);
1523 	if (--sl->sl_refcnt == 0) {
1524 		/*
1525 		 * If this was the last outstanding object for the slab,
1526 		 * remove the slab from the partially-allocated or empty
1527 		 * list, and destroy the slab (segment) back to the region.
1528 		 */
1529 		if (sl->sl_chunks == 1) {
1530 			ASSERT(skm->skm_sl_empty > 0);
1531 			skm->skm_sl_empty--;
1532 			TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
1533 		} else {
1534 			ASSERT(skm->skm_sl_partial > 0);
1535 			skm->skm_sl_partial--;
1536 			TAILQ_REMOVE(&skm->skm_sl_partial_list, sl, sl_link);
1537 		}
1538 		ASSERT((int64_t)(skm->skm_sl_bufinuse - sl->sl_chunks) >= 0);
1539 		skm->skm_sl_bufinuse -= sl->sl_chunks;
1540 		skm->skm_sl_destroy++;
1541 		SKM_SLAB_UNLOCK(skm);
1542 		skmem_slab_destroy(skm, sl);
1543 		SKM_SLAB_LOCK(skm);
1544 		return;
1545 	}
1546 
1547 	ASSERT(bc == SLIST_FIRST(&sl->sl_head));
1548 	if (SLIST_NEXT(bc, bc_link) == NULL) {
1549 		/*
1550 		 * If this is the first (potentially amongst many) object
1551 		 * that's returned to the slab, remove the slab from the
1552 		 * empty list and insert it at the end of the partially-allocated
1553 		 * list. This should help avoid thrashing the partial slab
1554 		 * since we avoid disturbing what's already at the front.
1555 		 */
1556 		ASSERT(sl->sl_refcnt == (sl->sl_chunks - 1));
1557 		ASSERT(sl->sl_chunks > 1);
1558 		ASSERT(skm->skm_sl_empty > 0);
1559 		skm->skm_sl_empty--;
1560 		TAILQ_REMOVE(&skm->skm_sl_empty_list, sl, sl_link);
1561 		skm->skm_sl_partial++;
1562 		ASSERT(skm->skm_sl_partial != 0);
1563 		TAILQ_INSERT_TAIL(&skm->skm_sl_partial_list, sl, sl_link);
1564 	}
1565 }
1566 
1567 /*
1568  * Free a raw object to the (locked) slab layer.  Pseudo region variant.
1569  */
1570 static void
1571 skmem_slab_free_pseudo_locked(struct skmem_cache *skm, void *buf)
1572 {
1573 	struct skmem_region *skr = skm->skm_region;
1574 	void *obj = buf;
1575 
1576 	ASSERT(skr->skr_reg == NULL && skr->skr_zreg != NULL);
1577 
1578 	SKM_SLAB_LOCK_ASSERT_HELD(skm);
1579 
1580 	VERIFY(IS_P2ALIGNED(obj, skm->skm_bufalign));
1581 
1582 #if KASAN
1583 	/*
1584 	 * Since we stuffed the original zone element address before
1585 	 * the buffer address in KASAN mode, get it back since we're
1586 	 * about to free it.
1587 	 */
1588 	void **pbuf = (void **)((intptr_t)obj - sizeof(void *));
1589 
1590 	VERIFY(((intptr_t)obj + skm->skm_bufsize) <=
1591 	    ((intptr_t)*pbuf + skm->skm_objsize));
1592 
1593 	obj = *pbuf;
1594 #endif /* KASAN */
1595 
1596 	/* free it to zone */
1597 	zfree(skr->skr_zreg, obj);
1598 
1599 	skm->skm_sl_free++;
1600 	ASSERT(skm->skm_sl_bufinuse > 0);
1601 	skm->skm_sl_bufinuse--;
1602 }
1603 
1604 /*
1605  * Free a raw object to the slab layer.
1606  */
1607 static void
1608 skmem_slab_free(struct skmem_cache *skm, void *buf)
1609 {
1610 	if (skm->skm_mode & SKM_MODE_BATCH) {
1611 		((struct skmem_obj *)buf)->mo_next = NULL;
1612 	}
1613 
1614 	SKM_SLAB_LOCK(skm);
1615 	skm->skm_slab_free(skm, buf);
1616 	SKM_SLAB_UNLOCK(skm);
1617 }
1618 
1619 /*
1620  * Free raw object(s) to the slab layer.
1621  */
1622 static void
1623 skmem_slab_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
1624 {
1625 	struct skmem_obj *listn;
1626 
1627 	ASSERT(list != NULL && (skm->skm_mode & SKM_MODE_BATCH));
1628 
1629 	SKM_SLAB_LOCK(skm);
1630 	for (;;) {
1631 		listn = list->mo_next;
1632 		list->mo_next = NULL;
1633 
1634 		/*
1635 		 * Free a single object to the slab layer.
1636 		 */
1637 		skm->skm_slab_free(skm, (void *)list);
1638 
1639 		/* if no more objects to free, we're done */
1640 		if ((list = listn) == NULL) {
1641 			break;
1642 		}
1643 	}
1644 	SKM_SLAB_UNLOCK(skm);
1645 }
1646 
1647 /*
1648  * Return the object's region info.
1649  */
1650 void
1651 skmem_cache_get_obj_info(struct skmem_cache *skm, void *buf,
1652     struct skmem_obj_info *oi, struct skmem_obj_info *oim)
1653 {
1654 	struct skmem_bufctl_bkt *bcb;
1655 	struct skmem_bufctl *bc;
1656 	struct skmem_slab *sl;
1657 
1658 	/*
1659 	 * Search the hash chain to find a matching buffer control for the
1660 	 * given object address.  If not found, panic since the caller has
1661 	 * given us a bogus address.
1662 	 */
1663 	SKM_SLAB_LOCK(skm);
1664 	bcb = SKMEM_CACHE_HASH(skm, buf);
1665 	SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
1666 		if (bc->bc_addr == buf) {
1667 			break;
1668 		}
1669 	}
1670 
1671 	if (__improbable(bc == NULL)) {
1672 		panic("%s: %s failed to get object info for %p",
1673 		    __func__, skm->skm_name, buf);
1674 		/* NOTREACHED */
1675 		__builtin_unreachable();
1676 	}
1677 
1678 	/*
1679 	 * Return the master object's info to the caller.
1680 	 */
1681 	sl = bc->bc_slab;
1682 	SKMEM_OBJ_ADDR(oi) = bc->bc_addr;
1683 	SKMEM_OBJ_BUFCTL(oi) = bc;      /* master only; NULL for slave */
1684 	ASSERT(skm->skm_objsize <= UINT32_MAX);
1685 	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
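	/*
	 * The region-wide index is the segment's index scaled by the number
	 * of objects per slab, plus the object's index within its slab.
	 */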
1686 	SKMEM_OBJ_IDX_REG(oi) =
1687 	    (sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx;
1688 	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
1689 	/*
1690 	 * And for slave object.
1691 	 */
1692 	if (oim != NULL) {
1693 		bzero(oim, sizeof(*oim));
1694 		if (bc->bc_addrm != NULL) {
1695 			SKMEM_OBJ_ADDR(oim) = bc->bc_addrm;
1696 			SKMEM_OBJ_SIZE(oim) = oi->oi_size;
1697 			SKMEM_OBJ_IDX_REG(oim) = oi->oi_idx_reg;
1698 			SKMEM_OBJ_IDX_SEG(oim) = oi->oi_idx_seg;
1699 		}
1700 	}
1701 	SKM_SLAB_UNLOCK(skm);
1702 }
1703 
1704 /*
1705  * Magazine constructor.
1706  */
1707 static int
1708 skmem_magazine_ctor(struct skmem_obj_info *oi, struct skmem_obj_info *oim,
1709     void *arg, uint32_t skmflag)
1710 {
1711 #pragma unused(oim, skmflag)
1712 	struct skmem_mag *mg = SKMEM_OBJ_ADDR(oi);
1713 
1714 	ASSERT(oim == NULL);
1715 	ASSERT(arg != NULL);
1716 
1717 	/*
1718 	 * Store it in the magazine object since we'll
1719 	 * need to refer to it during magazine destroy;
1720 	 * we can't safely refer to skm_magtype as the
1721 	 * depot lock may not be acquired then.
1722 	 */
1723 	mg->mg_magtype = arg;
1724 
1725 	return 0;
1726 }
1727 
1728 /*
1729  * Destroy a magazine (free each object to the slab layer).
1730  */
1731 static void
1732 skmem_magazine_destroy(struct skmem_cache *skm, struct skmem_mag *mg,
1733     int nrounds)
1734 {
1735 	int round;
1736 
1737 	for (round = 0; round < nrounds; round++) {
1738 		void *buf = mg->mg_round[round];
1739 		struct skmem_obj *next;
1740 
1741 		if (skm->skm_mode & SKM_MODE_BATCH) {
1742 			next = ((struct skmem_obj *)buf)->mo_next;
1743 			((struct skmem_obj *)buf)->mo_next = NULL;
1744 		}
1745 
1746 		/* deconstruct the object */
1747 		if (skm->skm_dtor != NULL) {
1748 			skm->skm_dtor(buf, skm->skm_private);
1749 		}
1750 
1751 		/*
1752 		 * In non-batching mode, each object in the magazine has
1753 		 * no linkage to its neighbor, so free individual object
1754 		 * to the slab layer now.
1755 		 */
1756 		if (!(skm->skm_mode & SKM_MODE_BATCH)) {
1757 			skmem_slab_free(skm, buf);
1758 		} else {
1759 			((struct skmem_obj *)buf)->mo_next = next;
1760 		}
1761 	}
1762 
1763 	/*
1764 	 * In batching mode, each object is linked to its neighbor at free
1765 	 * time, and so take the bottom-most object and free it to the slab
1766 	 * layer.  Because of the way the list is reversed during free, this
1767 	 * will bring along the rest of objects above it.
1768 	 */
1769 	if (nrounds > 0 && (skm->skm_mode & SKM_MODE_BATCH)) {
1770 		skmem_slab_batch_free(skm, mg->mg_round[nrounds - 1]);
1771 	}
1772 
1773 	/* free the magazine itself back to cache */
1774 	skmem_cache_free(mg->mg_magtype->mt_cache, mg);
1775 }
1776 
1777 /*
1778  * Get one or more magazines from the depot.
1779  */
1780 static uint32_t
1781 skmem_depot_batch_alloc(struct skmem_cache *skm, struct skmem_maglist *ml,
1782     uint32_t *count, struct skmem_mag **list, uint32_t num)
1783 {
1784 	SLIST_HEAD(, skmem_mag) mg_list = SLIST_HEAD_INITIALIZER(mg_list);
1785 	struct skmem_mag *mg;
1786 	uint32_t need = num, c = 0;
1787 
1788 	ASSERT(list != NULL && need > 0);
1789 
1790 	if (!SKM_DEPOT_LOCK_TRY(skm)) {
1791 		/*
1792 		 * Track the amount of lock contention here; if the contention
1793 		 * level is high (more than skmem_cache_depot_contention within a
1794 		 * given skmem_cache_update_interval), then we treat
1795 		 * it as a sign that the per-CPU layer is not using the right
1796 		 * magazine type, and that we'd need to resize it.
1797 		 */
1798 		SKM_DEPOT_LOCK(skm);
1799 		if (skm->skm_mode & SKM_MODE_DYNAMIC) {
1800 			skm->skm_depot_contention++;
1801 		}
1802 	}
1803 
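	/*
	 * Transfer up to "num" magazines off the depot list.  ml_min tracks
	 * the list's low-water mark for the current interval; it is later
	 * consumed by skmem_depot_ws_update() and skmem_depot_ws_reap() to
	 * size the reapable working set.
	 */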
1804 	while ((mg = SLIST_FIRST(&ml->ml_list)) != NULL) {
1805 		SLIST_REMOVE_HEAD(&ml->ml_list, mg_link);
1806 		SLIST_INSERT_HEAD(&mg_list, mg, mg_link);
1807 		ASSERT(ml->ml_total != 0);
1808 		if (--ml->ml_total < ml->ml_min) {
1809 			ml->ml_min = ml->ml_total;
1810 		}
1811 		c++;
1812 		ml->ml_alloc++;
1813 		if (--need == 0) {
1814 			break;
1815 		}
1816 	}
1817 	*count -= c;
1818 
1819 	SKM_DEPOT_UNLOCK(skm);
1820 
1821 	*list = SLIST_FIRST(&mg_list);
1822 
1823 	return num - need;
1824 }
1825 
1826 /*
1827  * Return one or more magazines to the depot.
1828  */
1829 static void
1830 skmem_depot_batch_free(struct skmem_cache *skm, struct skmem_maglist *ml,
1831     uint32_t *count, struct skmem_mag *mg)
1832 {
1833 	struct skmem_mag *nmg;
1834 	uint32_t c = 0;
1835 
1836 	SKM_DEPOT_LOCK(skm);
1837 	while (mg != NULL) {
1838 		nmg = SLIST_NEXT(mg, mg_link);
1839 		SLIST_INSERT_HEAD(&ml->ml_list, mg, mg_link);
1840 		ml->ml_total++;
1841 		c++;
1842 		mg = nmg;
1843 	}
1844 	*count += c;
1845 	SKM_DEPOT_UNLOCK(skm);
1846 }
1847 
1848 /*
1849  * Update the depot's working state statistics.
1850  */
1851 static void
1852 skmem_depot_ws_update(struct skmem_cache *skm)
1853 {
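	/*
	 * Slide the working-set window: the reap limit becomes the low-water
	 * mark observed during the interval that just ended, and the low-water
	 * mark is reset to the current total.  Effectively, only magazines
	 * that sat idle in the depot for a full interval become reapable.
	 */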
1854 	SKM_DEPOT_LOCK_SPIN(skm);
1855 	skm->skm_full.ml_reaplimit = skm->skm_full.ml_min;
1856 	skm->skm_full.ml_min = skm->skm_full.ml_total;
1857 	skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_min;
1858 	skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1859 	SKM_DEPOT_UNLOCK(skm);
1860 }
1861 
1862 /*
1863  * Empty the depot's working state statistics (everything becomes reapable).
1864  */
1865 static void
1866 skmem_depot_ws_zero(struct skmem_cache *skm)
1867 {
1868 	SKM_DEPOT_LOCK_SPIN(skm);
1869 	if (skm->skm_full.ml_reaplimit != skm->skm_full.ml_total ||
1870 	    skm->skm_full.ml_min != skm->skm_full.ml_total ||
1871 	    skm->skm_empty.ml_reaplimit != skm->skm_empty.ml_total ||
1872 	    skm->skm_empty.ml_min != skm->skm_empty.ml_total) {
1873 		skm->skm_full.ml_reaplimit = skm->skm_full.ml_total;
1874 		skm->skm_full.ml_min = skm->skm_full.ml_total;
1875 		skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_total;
1876 		skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1877 		skm->skm_depot_ws_zero++;
1878 	}
1879 	SKM_DEPOT_UNLOCK(skm);
1880 }
1881 
1882 /*
1883  * Reap magazines that are outside of the working set.
1884  */
1885 static void
1886 skmem_depot_ws_reap(struct skmem_cache *skm)
1887 {
1888 	struct skmem_mag *mg, *nmg;
1889 	uint32_t f, e, reap;
1890 
1891 	reap = f = MIN(skm->skm_full.ml_reaplimit, skm->skm_full.ml_min);
1892 	if (reap != 0) {
1893 		(void) skmem_depot_batch_alloc(skm, &skm->skm_full,
1894 		    &skm->skm_depot_full, &mg, reap);
1895 		while (mg != NULL) {
1896 			nmg = SLIST_NEXT(mg, mg_link);
1897 			SLIST_NEXT(mg, mg_link) = NULL;
1898 			skmem_magazine_destroy(skm, mg,
1899 			    mg->mg_magtype->mt_magsize);
1900 			mg = nmg;
1901 		}
1902 	}
1903 
1904 	reap = e = MIN(skm->skm_empty.ml_reaplimit, skm->skm_empty.ml_min);
1905 	if (reap != 0) {
1906 		(void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
1907 		    &skm->skm_depot_empty, &mg, reap);
1908 		while (mg != NULL) {
1909 			nmg = SLIST_NEXT(mg, mg_link);
1910 			SLIST_NEXT(mg, mg_link) = NULL;
1911 			skmem_magazine_destroy(skm, mg, 0);
1912 			mg = nmg;
1913 		}
1914 	}
1915 
1916 	if (f != 0 || e != 0) {
1917 		os_atomic_inc(&skm->skm_cpu_mag_reap, relaxed);
1918 	}
1919 }
1920 
1921 /*
1922  * Performs periodic maintenance on a cache.  This is serialized
1923  * through the update thread call, and so we guarantee there's at
1924  * most one update episode in the system at any given time.
1925  */
1926 static void
1927 skmem_cache_update(struct skmem_cache *skm, uint32_t arg)
1928 {
1929 #pragma unused(arg)
1930 	boolean_t resize_mag = FALSE;
1931 	boolean_t rescale_hash = FALSE;
1932 
1933 	SKMEM_CACHE_LOCK_ASSERT_HELD();
1934 
1935 	/* insist that we are executing in the update thread call context */
1936 	ASSERT(sk_is_cache_update_protected());
1937 
1938 	/*
1939 	 * If the cache has become much larger or smaller than the
1940 	 * allocated-address hash table, rescale the hash table.
1941 	 */
1942 	SKM_SLAB_LOCK(skm);
1943 	if ((skm->skm_sl_bufinuse > (skm->skm_hash_mask << 1) &&
1944 	    (skm->skm_hash_mask + 1) < skm->skm_hash_limit) ||
1945 	    (skm->skm_sl_bufinuse < (skm->skm_hash_mask >> 1) &&
1946 	    skm->skm_hash_mask > skm->skm_hash_initial)) {
1947 		rescale_hash = TRUE;
1948 	}
1949 	SKM_SLAB_UNLOCK(skm);
1950 
1951 	/*
1952 	 * Update the working set.
1953 	 */
1954 	skmem_depot_ws_update(skm);
1955 
1956 	/*
1957 	 * If the contention count is greater than the threshold during
1958 	 * the update interval, and if we are not already at the maximum
1959 	 * magazine size, increase it.
1960 	 */
1961 	SKM_DEPOT_LOCK_SPIN(skm);
1962 	if (skm->skm_chunksize < skm->skm_magtype->mt_maxbuf &&
1963 	    (int)(skm->skm_depot_contention - skm->skm_depot_contention_prev) >
1964 	    skmem_cache_depot_contention) {
1965 		ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);
1966 		resize_mag = TRUE;
1967 	}
1968 	skm->skm_depot_contention_prev = skm->skm_depot_contention;
1969 	SKM_DEPOT_UNLOCK(skm);
1970 
1971 	if (rescale_hash) {
1972 		skmem_cache_hash_rescale(skm);
1973 	}
1974 
1975 	if (resize_mag) {
1976 		skmem_cache_magazine_resize(skm);
1977 	}
1978 }
1979 
1980 /*
1981  * Reload the CPU's magazines with mg and its follower (if any).
1982  */
1983 static void
1984 skmem_cpu_batch_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg,
1985     int rounds)
1986 {
1987 	ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
1988 	    (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1989 	ASSERT(cp->cp_magsize > 0);
1990 
1991 	cp->cp_loaded = mg;
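	/*
	 * Load the first magazine; if the depot chained a second one, make
	 * it the previously-loaded magazine and sever the link between them.
	 */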
1992 	cp->cp_rounds = rounds;
1993 	if (__probable(SLIST_NEXT(mg, mg_link) != NULL)) {
1994 		cp->cp_ploaded = SLIST_NEXT(mg, mg_link);
1995 		cp->cp_prounds = rounds;
1996 		SLIST_NEXT(mg, mg_link) = NULL;
1997 	} else {
1998 		ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
1999 		cp->cp_ploaded = NULL;
2000 		cp->cp_prounds = -1;
2001 	}
2002 }
2003 
2004 /*
2005  * Reload the CPU's magazine with mg and save the previous one.
2006  */
2007 static void
2008 skmem_cpu_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg, int rounds)
2009 {
2010 	ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
2011 	    (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
2012 	ASSERT(cp->cp_magsize > 0);
2013 
2014 	cp->cp_ploaded = cp->cp_loaded;
2015 	cp->cp_prounds = cp->cp_rounds;
2016 	cp->cp_loaded = mg;
2017 	cp->cp_rounds = rounds;
2018 }
2019 
2020 /*
2021  * Allocate a constructed object from the cache.
2022  */
2023 void *
2024 skmem_cache_alloc(struct skmem_cache *skm, uint32_t skmflag)
2025 {
2026 	struct skmem_obj *buf;
2027 
2028 	(void) skmem_cache_batch_alloc(skm, &buf, 1, skmflag);
2029 	return buf;
2030 }
2031 
2032 /*
2033  * Allocate constructed object(s) from the cache.
2034  */
2035 uint32_t
2036 skmem_cache_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
2037     uint32_t num, uint32_t skmflag)
2038 {
2039 	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
2040 	struct skmem_obj **top = &(*list);
2041 	struct skmem_mag *mg;
2042 	uint32_t need = num;
2043 
2044 	ASSERT(list != NULL);
2045 	*list = NULL;
2046 
2047 	if (need == 0) {
2048 		return 0;
2049 	}
2050 	ASSERT(need == 1 || (skm->skm_mode & SKM_MODE_BATCH));
2051 
2052 	SKM_CPU_LOCK(cp);
2053 	for (;;) {
2054 		/*
2055 		 * If we have an object in the current CPU's loaded
2056 		 * magazine, return it and we're done.
2057 		 */
2058 		if (cp->cp_rounds > 0) {
2059 			int objs = MIN((unsigned int)cp->cp_rounds, need);
2060 			/*
2061 			 * In the SKM_MODE_BATCH case, objects in the magazine are already
2062 			 * linked together with the most recently freed object
2063 			 * at the head of the list; grab as many objects as we
2064 			 * can.  Otherwise we'll just grab 1 object at most.
2065 			 */
2066 			*list = cp->cp_loaded->mg_round[cp->cp_rounds - 1];
2067 			cp->cp_rounds -= objs;
2068 			cp->cp_alloc += objs;
2069 
2070 			if (skm->skm_mode & SKM_MODE_BATCH) {
2071 				struct skmem_obj *tail =
2072 				    cp->cp_loaded->mg_round[cp->cp_rounds];
2073 				list = &tail->mo_next;
2074 				*list = NULL;
2075 			}
2076 
2077 			/* if we got them all, return to caller */
2078 			if ((need -= objs) == 0) {
2079 				SKM_CPU_UNLOCK(cp);
2080 				goto done;
2081 			}
2082 		}
2083 
2084 		/*
2085 		 * The CPU's loaded magazine is empty.  If the previously
2086 		 * loaded magazine was full, exchange and try again.
2087 		 */
2088 		if (cp->cp_prounds > 0) {
2089 			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
2090 			continue;
2091 		}
2092 
2093 		/*
2094 		 * If the magazine layer is disabled, allocate from slab.
2095 		 * This can happen either because SKM_MODE_NOMAGAZINES is
2096 		 * set, or because we are resizing the magazine now.
2097 		 */
2098 		if (cp->cp_magsize == 0) {
2099 			break;
2100 		}
2101 
2102 		/*
2103 		 * Both of the CPU's magazines are empty; try to get
2104 		 * full magazine(s) from the depot layer.  Upon success,
2105 		 * reload and try again.  To prevent potential thrashing,
2106 		 * replace both empty magazines only if the requested
2107 		 * count exceeds a magazine's worth of objects.
2108 		 */
2109 		(void) skmem_depot_batch_alloc(skm, &skm->skm_full,
2110 		    &skm->skm_depot_full, &mg, (need <= cp->cp_magsize) ? 1 : 2);
2111 		if (mg != NULL) {
2112 			SLIST_HEAD(, skmem_mag) mg_list =
2113 			    SLIST_HEAD_INITIALIZER(mg_list);
2114 
2115 			if (cp->cp_ploaded != NULL) {
2116 				SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
2117 				    mg_link);
2118 			}
2119 			if (SLIST_NEXT(mg, mg_link) == NULL) {
2120 				/*
2121 				 * Depot allocation returns only 1 magazine;
2122 				 * retain current empty magazine.
2123 				 */
2124 				skmem_cpu_reload(cp, mg, cp->cp_magsize);
2125 			} else {
2126 				/*
2127 				 * We got 2 full magazines from depot;
2128 				 * release the current empty magazine
2129 				 * back to the depot layer.
2130 				 */
2131 				if (cp->cp_loaded != NULL) {
2132 					SLIST_INSERT_HEAD(&mg_list,
2133 					    cp->cp_loaded, mg_link);
2134 				}
2135 				skmem_cpu_batch_reload(cp, mg, cp->cp_magsize);
2136 			}
2137 			skmem_depot_batch_free(skm, &skm->skm_empty,
2138 			    &skm->skm_depot_empty, SLIST_FIRST(&mg_list));
2139 			continue;
2140 		}
2141 
2142 		/*
2143 		 * The depot layer doesn't have any full magazines;
2144 		 * allocate directly from the slab layer.
2145 		 */
2146 		break;
2147 	}
2148 	SKM_CPU_UNLOCK(cp);
2149 
2150 	if (__probable(num > 1 && (skm->skm_mode & SKM_MODE_BATCH) != 0)) {
2151 		struct skmem_obj *rtop, *rlist, *rlistp = NULL;
2152 		uint32_t rlistc, c = 0;
2153 
2154 		/*
2155 		 * Get a list of raw objects from the slab layer.
2156 		 */
2157 		rlistc = skmem_slab_batch_alloc(skm, &rlist, need, skmflag);
2158 		ASSERT(rlistc == 0 || rlist != NULL);
2159 		rtop = rlist;
2160 
2161 		/*
2162 		 * Construct each object in the raw list.  Upon failure,
2163 		 * free any remaining objects in the list back to the slab
2164 		 * layer, and keep the ones that were successfully constructed.
2165 		 * Here, "oi" and "oim" in each skmem_obj refer to the objects
2166 		 * coming from the master and slave regions (on mirrored
2167 		 * regions), respectively.  They are stored inside the object
2168 		 * temporarily so that we can pass them to the constructor.
2169 		 */
2170 		while (skm->skm_ctor != NULL && rlist != NULL) {
2171 			struct skmem_obj_info *oi = &rlist->mo_info;
2172 			struct skmem_obj_info *oim = &rlist->mo_minfo;
2173 			struct skmem_obj *rlistn = rlist->mo_next;
2174 
2175 			/*
2176 			 * Note that the constructor guarantees at least
2177 			 * the size of a pointer at the top of the object
2178 			 * and no more than that.  That means we must not
2179 			 * refer to "oi" and "oim" any longer after the
2180 			 * object goes thru the constructor.
2181 			 */
2182 			if (skm->skm_ctor(oi, ((SKMEM_OBJ_ADDR(oim) != NULL) ?
2183 			    oim : NULL), skm->skm_private, skmflag) != 0) {
2184 				VERIFY(rlist->mo_next == rlistn);
2185 				os_atomic_add(&skm->skm_sl_alloc_fail,
2186 				    rlistc - c, relaxed);
2187 				if (rlistp != NULL) {
2188 					rlistp->mo_next = NULL;
2189 				}
2190 				if (rlist == rtop) {
2191 					rtop = NULL;
2192 					ASSERT(c == 0);
2193 				}
2194 				skmem_slab_batch_free(skm, rlist);
2195 				rlist = NULL;
2196 				rlistc = c;
2197 				break;
2198 			}
2199 			VERIFY(rlist->mo_next == rlistn);
2200 
2201 			++c;                    /* # of constructed objs */
2202 			rlistp = rlist;
2203 			if ((rlist = rlist->mo_next) == NULL) {
2204 				ASSERT(rlistc == c);
2205 				break;
2206 			}
2207 		}
2208 
2209 		/*
2210 		 * At this point "top" points to the head of the chain we're
2211 		 * going to return to caller; "list" points to the tail of that
2212 		 * chain.  The second chain begins at "rtop", and we append
2213 		 * that after "list" to form a single chain.  "rlistc" is the
2214 		 * number of objects in "rtop" originated from the slab layer
2215 		 * that have been successfully constructed (if applicable).
2216 		 */
2217 		ASSERT(c == 0 || rtop != NULL);
2218 		need -= rlistc;
2219 		*list = rtop;
2220 	} else {
2221 		struct skmem_obj_info oi, oim;
2222 		void *buf;
2223 
2224 		ASSERT(*top == NULL && num == 1 && need == 1);
2225 
2226 		/*
2227 		 * Get a single raw object from the slab layer.
2228 		 */
2229 		if (skmem_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
2230 			goto done;
2231 		}
2232 
2233 		buf = SKMEM_OBJ_ADDR(&oi);
2234 		ASSERT(buf != NULL);
2235 
2236 		/*
2237 		 * Construct the raw object.  Here, "oi" and "oim" refer to
2238 		 * the objects coming from the master and slave regions (on
2239 		 * mirrored regions), respectively.
2240 		 */
2241 		if (skm->skm_ctor != NULL &&
2242 		    skm->skm_ctor(&oi, ((SKMEM_OBJ_ADDR(&oim) != NULL) ?
2243 		    &oim : NULL), skm->skm_private, skmflag) != 0) {
2244 			os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
2245 			skmem_slab_free(skm, buf);
2246 			goto done;
2247 		}
2248 
2249 		need = 0;
2250 		*list = buf;
2251 		ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
2252 		    (*list)->mo_next == NULL);
2253 	}
2254 
2255 done:
2256 	/* if auditing is enabled, record this transaction */
2257 	if (__improbable(*top != NULL &&
2258 	    (skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
2259 		skmem_audit_buf(skm, *top);
2260 	}
2261 
2262 	return num - need;
2263 }
2264 
2265 /*
2266  * Free a constructed object to the cache.
2267  */
2268 void
2269 skmem_cache_free(struct skmem_cache *skm, void *buf)
2270 {
2271 	if (skm->skm_mode & SKM_MODE_BATCH) {
2272 		((struct skmem_obj *)buf)->mo_next = NULL;
2273 	}
2274 	skmem_cache_batch_free(skm, (struct skmem_obj *)buf);
2275 }
2276 
2277 void
2278 skmem_cache_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
2279 {
2280 	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
2281 	struct skmem_magtype *mtp;
2282 	struct skmem_mag *mg;
2283 	struct skmem_obj *listn;
2284 #if CONFIG_KERNEL_TAGGING
2285 	vm_offset_t tagged_address;           /* address tagging */
2286 	struct skmem_region *region;          /* region source for this cache */
2287 #endif /* CONFIG_KERNEL_TAGGING */
2288 
2289 	/* if auditing is enabled, record this transaction */
2290 	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
2291 		skmem_audit_buf(skm, list);
2292 	}
2293 
2294 	SKM_CPU_LOCK(cp);
2295 	for (;;) {
2296 		/*
2297 		 * If there's available space in the current CPU's
2298 		 * loaded magazine, place it there and we're done.
2299 		 */
2300 		if ((unsigned int)cp->cp_rounds <
2301 		    (unsigned int)cp->cp_magsize) {
2302 			/*
2303 			 * In the SKM_MODE_BATCH case, reverse the list
2304 			 * while we place each object into the magazine;
2305 			 * this effectively causes the most recently
2306 			 * freed object to be reused during allocation.
2307 			 */
2308 			if (skm->skm_mode & SKM_MODE_BATCH) {
2309 				listn = list->mo_next;
2310 				list->mo_next = (cp->cp_rounds == 0) ? NULL :
2311 				    cp->cp_loaded->mg_round[cp->cp_rounds - 1];
2312 			} else {
2313 				listn = NULL;
2314 			}
2315 #if CONFIG_KERNEL_TAGGING
2316 			/*
2317 			 * If this region is configured to be tagged, we re-tag
2318 			 * the address that's being freed, to protect against
2319 			 * use-after-free bugs. This "re-tagged" address will
2320 			 * reside in the CPU's loaded magazine, and when cache
2321 			 * alloc is called, it is returned to client as is. At
2322 			 * this point, we know that this object will be freed to
2323 			 * the CPU's loaded magazine and not down to the slab
2324 			 * layer, so we won't be double tagging the same address
2325 			 * in the magazine layer and slab layer.
2326 			 */
2327 			region = skm->skm_region;
2328 			if (region->skr_mode & SKR_MODE_MEMTAG) {
2329 				tagged_address = vm_memtag_assign_tag(
2330 					(vm_offset_t)list, skm->skm_objsize);
2331 				vm_memtag_set_tag(tagged_address,
2332 				    skm->skm_objsize);
2333 				cp->cp_loaded->mg_round[cp->cp_rounds++] =
2334 				    (void *)tagged_address;
2335 			} else {
2336 				cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
2337 			}
2338 #else /* !CONFIG_KERNEL_TAGGING */
2339 			cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
2340 #endif /* CONFIG_KERNEL_TAGGING */
2341 			cp->cp_free++;
2342 
2343 			if ((list = listn) != NULL) {
2344 				continue;
2345 			}
2346 
2347 			SKM_CPU_UNLOCK(cp);
2348 			return;
2349 		}
2350 
2351 		/*
2352 		 * The loaded magazine is full.  If the previously
2353 		 * loaded magazine was empty, exchange and try again.
2354 		 */
2355 		if (cp->cp_prounds == 0) {
2356 			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
2357 			continue;
2358 		}
2359 
2360 		/*
2361 		 * If the magazine layer is disabled, free to slab.
2362 		 * This can happen either because SKM_MODE_NOMAGAZINES
2363 		 * is set, or because we are resizing the magazine now.
2364 		 */
2365 		if (cp->cp_magsize == 0) {
2366 			break;
2367 		}
2368 
2369 		/*
2370 		 * Both magazines for the CPU are full; try to get
2371 		 * empty magazine(s) from the depot.  If we get one,
2372 		 * exchange a full magazine with it and place the
2373 		 * object in there.
2374 		 *
2375 		 * TODO: Because the caller currently doesn't indicate
2376 		 * the number of objects in the list, we choose the more
2377 		 * conservative approach of allocating only 1 empty
2378 		 * magazine (to prevent potential thrashing).  Once we
2379 		 * have the object count, we can replace 1 with similar
2380 		 * logic as used in skmem_cache_batch_alloc().
2381 		 */
2382 		(void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
2383 		    &skm->skm_depot_empty, &mg, 1);
2384 		if (mg != NULL) {
2385 			SLIST_HEAD(, skmem_mag) mg_list =
2386 			    SLIST_HEAD_INITIALIZER(mg_list);
2387 
2388 			if (cp->cp_ploaded != NULL) {
2389 				SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
2390 				    mg_link);
2391 			}
2392 			if (SLIST_NEXT(mg, mg_link) == NULL) {
2393 				/*
2394 				 * Depot allocation returns only 1 magazine;
2395 				 * retain current full magazine.
2396 				 */
2397 				skmem_cpu_reload(cp, mg, 0);
2398 			} else {
2399 				/*
2400 				 * We got 2 empty magazines from depot;
2401 				 * release the current full magazine back
2402 				 * to the depot layer.
2403 				 */
2404 				if (cp->cp_loaded != NULL) {
2405 					SLIST_INSERT_HEAD(&mg_list,
2406 					    cp->cp_loaded, mg_link);
2407 				}
2408 				skmem_cpu_batch_reload(cp, mg, 0);
2409 			}
2410 			skmem_depot_batch_free(skm, &skm->skm_full,
2411 			    &skm->skm_depot_full, SLIST_FIRST(&mg_list));
2412 			continue;
2413 		}
2414 
2415 		/*
2416 		 * We can't get any empty magazine from the depot, and
2417 		 * so we need to allocate one.  If the allocation fails,
2418 		 * just fall through, deconstruct and free the object
2419 		 * to the slab layer.
2420 		 */
2421 		mtp = skm->skm_magtype;
2422 		SKM_CPU_UNLOCK(cp);
2423 		mg = skmem_cache_alloc(mtp->mt_cache, SKMEM_NOSLEEP);
2424 		SKM_CPU_LOCK(cp);
2425 
2426 		if (mg != NULL) {
2427 			/*
2428 			 * We allocated an empty magazine, but since we
2429 			 * dropped the CPU lock above, the magazine size
2430 			 * may have changed.  If that's the case, free
2431 			 * the magazine and try again.
2432 			 */
2433 			if (cp->cp_magsize != mtp->mt_magsize) {
2434 				SKM_CPU_UNLOCK(cp);
2435 				skmem_cache_free(mtp->mt_cache, mg);
2436 				SKM_CPU_LOCK(cp);
2437 				continue;
2438 			}
2439 
2440 			/*
2441 			 * We have a magazine with the right size;
2442 			 * add it to the depot and try again.
2443 			 */
2444 			ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
2445 			skmem_depot_batch_free(skm, &skm->skm_empty,
2446 			    &skm->skm_depot_empty, mg);
2447 			continue;
2448 		}
2449 
2450 		/*
2451 		 * We can't get an empty magazine, so free to slab.
2452 		 */
2453 		break;
2454 	}
2455 	SKM_CPU_UNLOCK(cp);
2456 
2457 	/*
2458 	 * We weren't able to free the constructed object(s) to the
2459 	 * magazine layer, so deconstruct them and free to the slab.
2460 	 */
2461 	if (__probable((skm->skm_mode & SKM_MODE_BATCH) &&
2462 	    list->mo_next != NULL)) {
2463 		/* whatever is left from original list */
2464 		struct skmem_obj *top = list;
2465 
2466 		while (list != NULL && skm->skm_dtor != NULL) {
2467 			listn = list->mo_next;
2468 			list->mo_next = NULL;
2469 
2470 			/* deconstruct the object */
2471 			if (skm->skm_dtor != NULL) {
2472 				skm->skm_dtor((void *)list, skm->skm_private);
2473 			}
2474 
2475 			list->mo_next = listn;
2476 			list = listn;
2477 		}
2478 
2479 		skmem_slab_batch_free(skm, top);
2480 	} else {
2481 		/* deconstruct the object */
2482 		if (skm->skm_dtor != NULL) {
2483 			skm->skm_dtor((void *)list, skm->skm_private);
2484 		}
2485 
2486 		skmem_slab_free(skm, (void *)list);
2487 	}
2488 }
2489 
2490 /*
2491  * Return the maximum number of objects cached at the magazine layer
2492  * based on the chunk size.  This takes into account the starting
2493  * magazine type as well as the final magazine type used in resizing.
2494  */
2495 uint32_t
2496 skmem_cache_magazine_max(uint32_t chunksize)
2497 {
2498 	struct skmem_magtype *mtp;
2499 	uint32_t magsize_max;
2500 
2501 	VERIFY(ncpu != 0);
2502 	VERIFY(chunksize > 0);
2503 
2504 	/* find a suitable magazine type for this chunk size */
2505 	for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
2506 		continue;
2507 	}
2508 
2509 	/* and find the last magazine type  */
2510 	for (;;) {
2511 		magsize_max = mtp->mt_magsize;
2512 		if (mtp == skmem_cache_magsize_last ||
2513 		    chunksize >= mtp->mt_maxbuf) {
2514 			break;
2515 		}
2516 		++mtp;
2517 		VERIFY(mtp <= skmem_cache_magsize_last);
2518 	}
2519 
2520 	return ncpu * magsize_max * 2; /* two magazines per CPU */
2521 }
2522 
2523 /*
2524  * Return true if SKMEM_DEBUG_NOMAGAZINES is not set on skmem_debug.
2525  */
2526 boolean_t
2527 skmem_allow_magazines(void)
2528 {
2529 	return !(skmem_debug & SKMEM_DEBUG_NOMAGAZINES);
2530 }
2531 
2532 /*
2533  * Purge all magazines from a cache and disable its per-CPU magazines layer.
2534  */
2535 static void
2536 skmem_cache_magazine_purge(struct skmem_cache *skm)
2537 {
2538 	struct skmem_cpu_cache *cp;
2539 	struct skmem_mag *mg, *pmg;
2540 	int rounds, prounds;
2541 	uint32_t cpuid, mg_cnt = 0, pmg_cnt = 0;
2542 
2543 	SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);
2544 
2545 	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx", SK_KVA(skm));
2546 
2547 	for (cpuid = 0; cpuid < ncpu; cpuid++) {
2548 		cp = &skm->skm_cpu_cache[cpuid];
2549 
2550 		SKM_CPU_LOCK_SPIN(cp);
2551 		mg = cp->cp_loaded;
2552 		pmg = cp->cp_ploaded;
2553 		rounds = cp->cp_rounds;
2554 		prounds = cp->cp_prounds;
2555 		cp->cp_loaded = NULL;
2556 		cp->cp_ploaded = NULL;
2557 		cp->cp_rounds = -1;
2558 		cp->cp_prounds = -1;
2559 		cp->cp_magsize = 0;
2560 		SKM_CPU_UNLOCK(cp);
2561 
2562 		if (mg != NULL) {
2563 			skmem_magazine_destroy(skm, mg, rounds);
2564 			++mg_cnt;
2565 		}
2566 		if (pmg != NULL) {
2567 			skmem_magazine_destroy(skm, pmg, prounds);
2568 			++pmg_cnt;
2569 		}
2570 	}
2571 
2572 	if (mg_cnt != 0 || pmg_cnt != 0) {
2573 		os_atomic_inc(&skm->skm_cpu_mag_purge, relaxed);
2574 	}
2575 
2576 	skmem_depot_ws_zero(skm);
2577 	skmem_depot_ws_reap(skm);
2578 }
2579 
2580 /*
2581  * Enable magazines on a cache.  Must only be called on a cache with
2582  * its per-CPU magazines layer disabled (e.g. due to purge).
2583  */
2584 static void
2585 skmem_cache_magazine_enable(struct skmem_cache *skm, uint32_t arg)
2586 {
2587 #pragma unused(arg)
2588 	struct skmem_cpu_cache *cp;
2589 	uint32_t cpuid;
2590 
2591 	if (skm->skm_mode & SKM_MODE_NOMAGAZINES) {
2592 		return;
2593 	}
2594 
2595 	for (cpuid = 0; cpuid < ncpu; cpuid++) {
2596 		cp = &skm->skm_cpu_cache[cpuid];
2597 		SKM_CPU_LOCK_SPIN(cp);
2598 		/* the magazines layer must be disabled at this point */
2599 		ASSERT(cp->cp_loaded == NULL);
2600 		ASSERT(cp->cp_ploaded == NULL);
2601 		ASSERT(cp->cp_rounds == -1);
2602 		ASSERT(cp->cp_prounds == -1);
2603 		ASSERT(cp->cp_magsize == 0);
2604 		cp->cp_magsize = skm->skm_magtype->mt_magsize;
2605 		SKM_CPU_UNLOCK(cp);
2606 	}
2607 
2608 	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx chunksize %u magsize %d",
2609 	    SK_KVA(skm), (uint32_t)skm->skm_chunksize,
2610 	    SKMEM_CPU_CACHE(skm)->cp_magsize);
2611 }
2612 
2613 /*
2614  * Enter the cache resize perimeter.  Upon success, claim exclusivity
2615  * on the perimeter and return 0, else EBUSY.  Caller may indicate
2616  * whether or not they're willing to wait.
2617  */
2618 static int
2619 skmem_cache_resize_enter(struct skmem_cache *skm, boolean_t can_sleep)
2620 {
2621 	SKM_RESIZE_LOCK(skm);
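	/*
	 * The perimeter is re-entrant for its current owner; any other
	 * thread either bails out with EBUSY or waits for the busy count
	 * to drain, depending on can_sleep.
	 */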
2622 	if (skm->skm_rs_owner == current_thread()) {
2623 		ASSERT(skm->skm_rs_busy != 0);
2624 		skm->skm_rs_busy++;
2625 		goto done;
2626 	}
2627 	if (!can_sleep) {
2628 		if (skm->skm_rs_busy != 0) {
2629 			SKM_RESIZE_UNLOCK(skm);
2630 			return EBUSY;
2631 		}
2632 	} else {
2633 		while (skm->skm_rs_busy != 0) {
2634 			skm->skm_rs_want++;
2635 			(void) assert_wait(&skm->skm_rs_busy, THREAD_UNINT);
2636 			SKM_RESIZE_UNLOCK(skm);
2637 			(void) thread_block(THREAD_CONTINUE_NULL);
2638 			SK_DF(SK_VERB_MEM_CACHE, "waited for skm \"%s\" "
2639 			    "(0x%llx) busy=%u", skm->skm_name,
2640 			    SK_KVA(skm), skm->skm_rs_busy);
2641 			SKM_RESIZE_LOCK(skm);
2642 		}
2643 	}
2644 	SKM_RESIZE_LOCK_ASSERT_HELD(skm);
2645 	ASSERT(skm->skm_rs_busy == 0);
2646 	skm->skm_rs_busy++;
2647 	skm->skm_rs_owner = current_thread();
2648 done:
2649 	SKM_RESIZE_UNLOCK(skm);
2650 	return 0;
2651 }
2652 
2653 /*
2654  * Exit the cache resize perimeter and unblock any waiters.
2655  */
2656 static void
2657 skmem_cache_resize_exit(struct skmem_cache *skm)
2658 {
2659 	uint32_t want;
2660 
2661 	SKM_RESIZE_LOCK(skm);
2662 	ASSERT(skm->skm_rs_busy != 0);
2663 	ASSERT(skm->skm_rs_owner == current_thread());
2664 	if (--skm->skm_rs_busy == 0) {
2665 		skm->skm_rs_owner = NULL;
2666 		/*
2667 		 * We're done; notify anyone that has lost the race.
2668 		 */
2669 		if ((want = skm->skm_rs_want) != 0) {
2670 			skm->skm_rs_want = 0;
2671 			wakeup((void *)&skm->skm_rs_busy);
2672 			SKM_RESIZE_UNLOCK(skm);
2673 		} else {
2674 			SKM_RESIZE_UNLOCK(skm);
2675 		}
2676 	} else {
2677 		SKM_RESIZE_UNLOCK(skm);
2678 	}
2679 }
2680 
2681 /*
2682  * Recompute a cache's magazine size.  This is an expensive operation
2683  * and should not be done frequently; larger magazines provide for a
2684  * higher transfer rate with the depot while smaller magazines reduce
2685  * the memory consumption.
2686  */
2687 static void
2688 skmem_cache_magazine_resize(struct skmem_cache *skm)
2689 {
2690 	struct skmem_magtype *mtp = skm->skm_magtype;
2691 
2692 	/* insist that we are executing in the update thread call context */
2693 	ASSERT(sk_is_cache_update_protected());
2694 	ASSERT(!(skm->skm_mode & SKM_MODE_NOMAGAZINES));
2695 	/* depot contention only applies to dynamic mode */
2696 	ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);
2697 
2698 	/*
2699 	 * Although we're executing in the context of the update thread
2700 	 * call, we need to protect the per-CPU states during resizing
2701 	 * against other synchronous cache purge/reenable requests that
2702 	 * could take place in parallel.
2703 	 */
2704 	if (skm->skm_chunksize < mtp->mt_maxbuf) {
2705 		(void) skmem_cache_resize_enter(skm, TRUE);
2706 		skmem_cache_magazine_purge(skm);
2707 
2708 		/*
2709 		 * Upgrade to the next magazine type with larger size.
2710 		 */
2711 		SKM_DEPOT_LOCK_SPIN(skm);
2712 		skm->skm_cpu_mag_resize++;
2713 		skm->skm_magtype = ++mtp;
2714 		skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
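		/*
		 * Bias the previous contention snapshot far above the current
		 * counter so that the delta computed by skmem_cache_update()
		 * stays negative for a while, preventing an immediate
		 * back-to-back resize.
		 */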
2715 		skm->skm_depot_contention_prev =
2716 		    skm->skm_depot_contention + INT_MAX;
2717 		SKM_DEPOT_UNLOCK(skm);
2718 
2719 		skmem_cache_magazine_enable(skm, 0);
2720 		skmem_cache_resize_exit(skm);
2721 	}
2722 }
2723 
2724 /*
2725  * Rescale the cache's allocated-address hash table.
2726  */
2727 static void
2728 skmem_cache_hash_rescale(struct skmem_cache *skm)
2729 {
2730 	struct skmem_bufctl_bkt *old_table, *new_table;
2731 	size_t old_size, new_size;
2732 	uint32_t i, moved = 0;
2733 
2734 	/* insist that we are executing in the update thread call context */
2735 	ASSERT(sk_is_cache_update_protected());
2736 
2737 	/*
2738 	 * To get small average lookup time (lookup depth near 1.0), the hash
2739 	 * table size should be roughly the same (not necessarily equivalent)
2740 	 * as the cache size.
2741 	 */
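	/* i.e. a power of two between ~3/4 and ~3/2 of the in-use count */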
2742 	new_size = MAX(skm->skm_hash_initial,
2743 	    (1 << (flsll(3 * skm->skm_sl_bufinuse + 4) - 2)));
2744 	new_size = MIN(skm->skm_hash_limit, new_size);
2745 	old_size = (skm->skm_hash_mask + 1);
2746 
2747 	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
2748 		return;
2749 	}
2750 
2751 	new_table = sk_alloc_type_array(struct skmem_bufctl_bkt, new_size,
2752 	    Z_NOWAIT, skmem_tag_bufctl_hash);
2753 	if (__improbable(new_table == NULL)) {
2754 		return;
2755 	}
2756 
2757 	for (i = 0; i < new_size; i++) {
2758 		SLIST_INIT(&new_table[i].bcb_head);
2759 	}
2760 
2761 	SKM_SLAB_LOCK(skm);
2762 
2763 	old_size = (skm->skm_hash_mask + 1);
2764 	old_table = skm->skm_hash_table;
2765 
2766 	skm->skm_hash_mask = (new_size - 1);
2767 	skm->skm_hash_table = new_table;
2768 	skm->skm_sl_rescale++;
2769 
2770 	for (i = 0; i < old_size; i++) {
2771 		struct skmem_bufctl_bkt *bcb = &old_table[i];
2772 		struct skmem_bufctl_bkt *new_bcb;
2773 		struct skmem_bufctl *bc;
2774 
2775 		while ((bc = SLIST_FIRST(&bcb->bcb_head)) != NULL) {
2776 			SLIST_REMOVE_HEAD(&bcb->bcb_head, bc_link);
2777 			new_bcb = SKMEM_CACHE_HASH(skm, bc->bc_addr);
2778 			/*
2779 			 * Ideally we want to insert at the tail here, but a simple
2780 			 * list doesn't give us that.  The fact that we are
2781 			 * essentially reversing the order is not a big deal
2782 			 * here vis-a-vis the new table size.
2783 			 */
2784 			SLIST_INSERT_HEAD(&new_bcb->bcb_head, bc, bc_link);
2785 			++moved;
2786 		}
2787 		ASSERT(SLIST_EMPTY(&bcb->bcb_head));
2788 	}
2789 
2790 	SK_DF(SK_VERB_MEM_CACHE,
2791 	    "skm 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skm),
2792 	    (uint32_t)old_size, (uint32_t)new_size, moved);
2793 
2794 	SKM_SLAB_UNLOCK(skm);
2795 
2796 	sk_free_type_array(struct skmem_bufctl_bkt, old_size, old_table);
2797 }
2798 
2799 /*
2800  * Apply a function to operate on all caches.
2801  */
2802 static void
2803 skmem_cache_applyall(void (*func)(struct skmem_cache *, uint32_t), uint32_t arg)
2804 {
2805 	struct skmem_cache *skm;
2806 
2807 	net_update_uptime();
2808 
2809 	SKMEM_CACHE_LOCK();
2810 	TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
2811 		func(skm, arg);
2812 	}
2813 	SKMEM_CACHE_UNLOCK();
2814 }
2815 
2816 /*
2817  * Reclaim unused memory from a cache.
2818  */
2819 static void
2820 skmem_cache_reclaim(struct skmem_cache *skm, uint32_t lowmem)
2821 {
2822 	/*
2823 	 * Inform the owner to free memory if possible; the reclaim
2824 	 * policy is left to the owner.  This is just an advisory.
2825 	 */
2826 	if (skm->skm_reclaim != NULL) {
2827 		skm->skm_reclaim(skm->skm_private);
2828 	}
2829 
2830 	if (lowmem) {
2831 		/*
2832 		 * If another thread is in the process of purging or
2833 		 * resizing, bail out and let the currently-ongoing
2834 		 * purging take its natural course.
2835 		 */
2836 		if (skmem_cache_resize_enter(skm, FALSE) == 0) {
2837 			skmem_cache_magazine_purge(skm);
2838 			skmem_cache_magazine_enable(skm, 0);
2839 			skmem_cache_resize_exit(skm);
2840 		}
2841 	} else {
2842 		skmem_depot_ws_reap(skm);
2843 	}
2844 }
2845 
2846 /*
2847  * Thread call callback for reap.
2848  */
2849 static void
2850 skmem_cache_reap_func(thread_call_param_t dummy, thread_call_param_t arg)
2851 {
2852 #pragma unused(dummy)
2853 	void (*func)(void) = arg;
2854 
2855 	ASSERT(func == skmem_cache_reap_start || func == skmem_cache_reap_done);
2856 	func();
2857 }
2858 
2859 /*
2860  * Start reaping all caches; this is serialized via thread call.
2861  */
2862 static void
2863 skmem_cache_reap_start(void)
2864 {
2865 	SK_DF(SK_VERB_MEM_CACHE, "now running");
2866 	skmem_cache_applyall(skmem_cache_reclaim, skmem_lowmem_check());
2867 	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_done,
2868 	    (skmem_cache_update_interval * NSEC_PER_SEC));
2869 }
2870 
2871 /*
2872  * Stop reaping; this would allow another reap request to occur.
2873  */
2874 static void
2875 skmem_cache_reap_done(void)
2876 {
2877 	volatile uint32_t *flag = &skmem_cache_reaping;
2878 
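	/*
	 * Clear the reaping flag and publish the store, allowing a
	 * subsequent skmem_cache_reap() to win the compare-and-swap on
	 * skmem_cache_reaping and start another reap episode.
	 */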
2879 	*flag = 0;
2880 	os_atomic_thread_fence(seq_cst);
2881 }
2882 
2883 /*
2884  * Immediately reap all unused memory of a cache.  If purging,
2885  * also purge the cached objects at the CPU layer.
2886  */
2887 void
2888 skmem_cache_reap_now(struct skmem_cache *skm, boolean_t purge)
2889 {
2890 	/* if the SKM_MODE_RECLAIM flag is set for this cache, we purge */
2891 	if (purge || (skm->skm_mode & SKM_MODE_RECLAIM)) {
2892 		/*
2893 		 * If another thread is in the process of purging or
2894 		 * resizing, bail out and let the currently-ongoing
2895 		 * purging take its natural course.
2896 		 */
2897 		if (skmem_cache_resize_enter(skm, FALSE) == 0) {
2898 			skmem_cache_magazine_purge(skm);
2899 			skmem_cache_magazine_enable(skm, 0);
2900 			skmem_cache_resize_exit(skm);
2901 		}
2902 	} else {
2903 		skmem_depot_ws_zero(skm);
2904 		skmem_depot_ws_reap(skm);
2905 
2906 		/* clean up cp_ploaded magazines from each CPU */
2907 		SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);
2908 
2909 		struct skmem_cpu_cache *cp;
2910 		struct skmem_mag *pmg;
2911 		int prounds;
2912 		uint32_t cpuid;
2913 
2914 		for (cpuid = 0; cpuid < ncpu; cpuid++) {
2915 			cp = &skm->skm_cpu_cache[cpuid];
2916 
2917 			SKM_CPU_LOCK_SPIN(cp);
2918 			pmg = cp->cp_ploaded;
2919 			prounds = cp->cp_prounds;
2920 
2921 			cp->cp_ploaded = NULL;
2922 			cp->cp_prounds = -1;
2923 			SKM_CPU_UNLOCK(cp);
2924 
2925 			if (pmg != NULL) {
2926 				skmem_magazine_destroy(skm, pmg, prounds);
2927 			}
2928 		}
2929 	}
2930 }
2931 
2932 /*
2933  * Request a global reap operation to be dispatched.
2934  */
2935 void
2936 skmem_cache_reap(void)
2937 {
2938 	/* only one reaping episode is allowed at a time */
2939 	if (skmem_lock_owner == current_thread() ||
2940 	    !os_atomic_cmpxchg(&skmem_cache_reaping, 0, 1, acq_rel)) {
2941 		return;
2942 	}
2943 
2944 	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_start, 0);
2945 }
2946 
2947 /*
2948  * Reap internal caches.
2949  */
2950 void
2951 skmem_reap_caches(boolean_t purge)
2952 {
2953 	skmem_cache_reap_now(skmem_slab_cache, purge);
2954 	skmem_cache_reap_now(skmem_bufctl_cache, purge);
2955 
2956 	/* packet buffer pool objects */
2957 	pp_reap_caches(purge);
2958 
2959 	/* also handle the region cache(s) */
2960 	skmem_region_reap_caches(purge);
2961 }
2962 
2963 /*
2964  * Thread call callback for update.
2965  */
2966 static void
2967 skmem_cache_update_func(thread_call_param_t dummy, thread_call_param_t arg)
2968 {
2969 #pragma unused(dummy, arg)
2970 	sk_protect_t protect;
2971 
2972 	protect = sk_cache_update_protect();
2973 	skmem_cache_applyall(skmem_cache_update, 0);
2974 	sk_cache_update_unprotect(protect);
2975 
2976 	skmem_dispatch(skmem_cache_update_tc, NULL,
2977 	    (skmem_cache_update_interval * NSEC_PER_SEC));
2978 }
2979 
2980 /*
2981  * Given a buffer control, record the current transaction.
2982  */
2983 __attribute__((noinline, cold, not_tail_called))
2984 static inline void
2985 skmem_audit_bufctl(struct skmem_bufctl *bc)
2986 {
2987 	struct skmem_bufctl_audit *bca = (struct skmem_bufctl_audit *)bc;
2988 	struct timeval tv;
2989 
2990 	microuptime(&tv);
2991 	bca->bc_thread = current_thread();
2992 	bca->bc_timestamp = (uint32_t)((tv.tv_sec * 1000) + (tv.tv_usec / 1000));
2993 	bca->bc_depth = OSBacktrace(bca->bc_stack, SKMEM_STACK_DEPTH);
2994 }
2995 
2996 /*
2997  * Given an object, find its buffer control and record the transaction.
2998  */
2999 __attribute__((noinline, cold, not_tail_called))
3000 static inline void
3001 skmem_audit_buf(struct skmem_cache *skm, struct skmem_obj *list)
3002 {
3003 	struct skmem_bufctl_bkt *bcb;
3004 	struct skmem_bufctl *bc;
3005 
3006 	ASSERT(!(skm->skm_mode & SKM_MODE_PSEUDO));
3007 
3008 	SKM_SLAB_LOCK(skm);
3009 	while (list != NULL) {
3010 		void *buf = list;
3011 
3012 		bcb = SKMEM_CACHE_HASH(skm, buf);
3013 		SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
3014 			if (bc->bc_addr == buf) {
3015 				break;
3016 			}
3017 		}
3018 
3019 		if (__improbable(bc == NULL)) {
3020 			panic("%s: %s failed to get bufctl for %p",
3021 			    __func__, skm->skm_name, buf);
3022 			/* NOTREACHED */
3023 			__builtin_unreachable();
3024 		}
3025 
3026 		skmem_audit_bufctl(bc);
3027 
3028 		if (!(skm->skm_mode & SKM_MODE_BATCH)) {
3029 			break;
3030 		}
3031 
3032 		list = list->mo_next;
3033 	}
3034 	SKM_SLAB_UNLOCK(skm);
3035 }
3036 
3037 static size_t
3038 skmem_cache_mib_get_stats(struct skmem_cache *skm, void *out, size_t len)
3039 {
3040 	size_t actual_space = sizeof(struct sk_stats_cache);
3041 	struct sk_stats_cache *sca = out;
3042 	int contention;
3043 
3044 	if (out == NULL || len < actual_space) {
3045 		goto done;
3046 	}
3047 
3048 	bzero(sca, sizeof(*sca));
3049 	(void) snprintf(sca->sca_name, sizeof(sca->sca_name), "%s",
3050 	    skm->skm_name);
3051 	uuid_copy(sca->sca_uuid, skm->skm_uuid);
3052 	uuid_copy(sca->sca_ruuid, skm->skm_region->skr_uuid);
3053 	sca->sca_mode = skm->skm_mode;
3054 	sca->sca_bufsize = (uint64_t)skm->skm_bufsize;
3055 	sca->sca_objsize = (uint64_t)skm->skm_objsize;
3056 	sca->sca_chunksize = (uint64_t)skm->skm_chunksize;
3057 	sca->sca_slabsize = (uint64_t)skm->skm_slabsize;
3058 	sca->sca_bufalign = (uint64_t)skm->skm_bufalign;
3059 	sca->sca_objalign = (uint64_t)skm->skm_objalign;
3060 
3061 	sca->sca_cpu_mag_size = skm->skm_cpu_mag_size;
3062 	sca->sca_cpu_mag_resize = skm->skm_cpu_mag_resize;
3063 	sca->sca_cpu_mag_purge = skm->skm_cpu_mag_purge;
3064 	sca->sca_cpu_mag_reap = skm->skm_cpu_mag_reap;
3065 	sca->sca_depot_full = skm->skm_depot_full;
3066 	sca->sca_depot_empty = skm->skm_depot_empty;
3067 	sca->sca_depot_ws_zero = skm->skm_depot_ws_zero;
3068 	/* in case of a race this might be a negative value, turn it into 0 */
3069 	if ((contention = (int)(skm->skm_depot_contention -
3070 	    skm->skm_depot_contention_prev)) < 0) {
3071 		contention = 0;
3072 	}
3073 	sca->sca_depot_contention_factor = contention;
3074 
3075 	sca->sca_cpu_rounds = 0;
3076 	sca->sca_cpu_prounds = 0;
3077 	for (int cpuid = 0; cpuid < ncpu; cpuid++) {
3078 		struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];
3079 
3080 		SKM_CPU_LOCK(ccp);
3081 		if (ccp->cp_rounds > -1) {
3082 			sca->sca_cpu_rounds += ccp->cp_rounds;
3083 		}
3084 		if (ccp->cp_prounds > -1) {
3085 			sca->sca_cpu_prounds += ccp->cp_prounds;
3086 		}
3087 		SKM_CPU_UNLOCK(ccp);
3088 	}
3089 
3090 	sca->sca_sl_create = skm->skm_sl_create;
3091 	sca->sca_sl_destroy = skm->skm_sl_destroy;
3092 	sca->sca_sl_alloc = skm->skm_sl_alloc;
3093 	sca->sca_sl_free = skm->skm_sl_free;
3094 	sca->sca_sl_alloc_fail = skm->skm_sl_alloc_fail;
3095 	sca->sca_sl_partial = skm->skm_sl_partial;
3096 	sca->sca_sl_empty = skm->skm_sl_empty;
3097 	sca->sca_sl_bufinuse = skm->skm_sl_bufinuse;
3098 	sca->sca_sl_rescale = skm->skm_sl_rescale;
3099 	sca->sca_sl_hash_size = (skm->skm_hash_mask + 1);
3100 
3101 done:
3102 	return actual_space;
3103 }
3104 
3105 static int
3106 skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS
3107 {
3108 #pragma unused(arg1, arg2, oidp)
3109 	struct skmem_cache *skm;
3110 	size_t actual_space;
3111 	size_t buffer_space;
3112 	size_t allocated_space;
3113 	caddr_t buffer = NULL;
3114 	caddr_t scan;
3115 	int error = 0;
3116 
3117 	if (!kauth_cred_issuser(kauth_cred_get())) {
3118 		return EPERM;
3119 	}
3120 
3121 	net_update_uptime();
3122 	buffer_space = req->oldlen;
3123 	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3124 		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3125 			buffer_space = SK_SYSCTL_ALLOC_MAX;
3126 		}
3127 		allocated_space = buffer_space;
3128 		buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_cache_mib);
3129 		if (__improbable(buffer == NULL)) {
3130 			return ENOBUFS;
3131 		}
3132 	} else if (req->oldptr == USER_ADDR_NULL) {
3133 		buffer_space = 0;
3134 	}
3135 	actual_space = 0;
3136 	scan = buffer;
3137 
3138 	SKMEM_CACHE_LOCK();
3139 	TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
3140 		size_t size = skmem_cache_mib_get_stats(skm, scan, buffer_space);
3141 		if (scan != NULL) {
3142 			if (buffer_space < size) {
3143 				/* supplied buffer too small, stop copying */
3144 				error = ENOMEM;
3145 				break;
3146 			}
3147 			scan += size;
3148 			buffer_space -= size;
3149 		}
3150 		actual_space += size;
3151 	}
3152 	SKMEM_CACHE_UNLOCK();
3153 
3154 	if (actual_space != 0) {
3155 		int out_error = SYSCTL_OUT(req, buffer, actual_space);
3156 		if (out_error != 0) {
3157 			error = out_error;
3158 		}
3159 	}
3160 	if (buffer != NULL) {
3161 		sk_free_data(buffer, allocated_space);
3162 	}
3163 
3164 	return error;
3165 }
3166