xref: /xnu-11215.41.3/bsd/skywalk/mem/skmem_cache.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h>    /* for OSBacktrace */
33 #include <kern/sched_prim.h>    /* for assert_wait */
34 #include <vm/vm_memtag.h>
35 
36 /*
37  * Memory allocator with per-CPU caching (magazines), derived from the kmem
38  * magazine concept and implementation as described in the following paper:
39  * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
40  *
41  * That implementation is Copyright 2006 Sun Microsystems, Inc.  All rights
42  * reserved.  Use is subject to license terms.
43  *
44  * This derivative differs from the original kmem slab allocator, in that:
45  *
46  *   a) There is always a discrete bufctl per object, even for small sizes.
47  *      This increases the overhead, but is necessary as Skywalk objects
48  *      coming from the slab may be shared (RO or RW) with userland; therefore
49  *      embedding the KVA pointer linkage in freed objects is a non-starter.
50  *
51  *   b) Writing patterns to the slab at slab creation or destruction time
52  *      (when debugging is enabled) is not implemented, as the object may
53  *      be shared (RW) with userland and thus we cannot panic upon pattern
54  *      mismatch episodes.  This can be relaxed so that we conditionally
55  *      verify the pattern for kernel-only memory.
56  *
57  * This derivative also differs from Darwin's mcache allocator (which itself
58  * is a derivative of the original kmem slab allocator), in that:
59  *
60  *   1) The slab layer is internal to skmem_cache, unlike mcache's external
61  *      slab layer required to support mbufs.  skmem_cache also supports
62  *      constructing and deconstructing objects, while mcache does not.
63  *      This brings skmem_cache's model closer to that of the original
64  *      kmem slab allocator.
65  *
66  *   2) mcache allows for batch allocation and free by way of chaining the
67  *      objects together using a linked list.  This requires using a part
68  *      of the object to act as the linkage, which is against Skywalk's
69  *      requirements of not exposing any KVA pointer to userland.  Although
70  *      this is supported by skmem_cache, chaining is only possible if the
71  *      region is not mapped to userland.  That implies that kernel-only
72  *      objects can be chained provided the cache is created with batching
73  *      mode enabled, and that the object is large enough to contain the
74  *      skmem_obj structure.
75  *
76  * In other words, skmem_cache is a hybrid of a hybrid custom allocator that
77  * implements features that are required by Skywalk.  In addition to being
78  * aware of userland access on the buffers, it also supports mirrored backend
79  * memory regions.  This allows a cache to manage two independent memory
80  * regions, such that allocating/freeing an object from/to one results in
81  * allocating/freeing a shadow object in another, thus guaranteeing that both
82  * objects share the same lifetime.
83  */
84 
85 static uint32_t ncpu;                   /* total # of initialized CPUs */
86 
87 static LCK_MTX_DECLARE_ATTR(skmem_cache_lock, &skmem_lock_grp, &skmem_lock_attr);
88 static struct thread *skmem_lock_owner = THREAD_NULL;
89 
90 static LCK_GRP_DECLARE(skmem_sl_lock_grp, "skmem_slab");
91 static LCK_GRP_DECLARE(skmem_dp_lock_grp, "skmem_depot");
92 static LCK_GRP_DECLARE(skmem_cpu_lock_grp, "skmem_cpu_cache");
93 
94 #define SKMEM_CACHE_LOCK() do {                 \
95 	lck_mtx_lock(&skmem_cache_lock);        \
96 	skmem_lock_owner = current_thread();    \
97 } while (0)
98 #define SKMEM_CACHE_UNLOCK() do {               \
99 	skmem_lock_owner = THREAD_NULL;         \
100 	lck_mtx_unlock(&skmem_cache_lock);      \
101 } while (0)
102 #define SKMEM_CACHE_LOCK_ASSERT_HELD()          \
103 	LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_OWNED)
104 #define SKMEM_CACHE_LOCK_ASSERT_NOTHELD()       \
105 	LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_NOTOWNED)
106 
107 #define SKM_DEPOT_LOCK(_skm)                    \
108 	lck_mtx_lock(&(_skm)->skm_dp_lock)
109 #define SKM_DEPOT_LOCK_SPIN(_skm)               \
110 	lck_mtx_lock_spin(&(_skm)->skm_dp_lock)
111 #define SKM_DEPOT_CONVERT_LOCK(_skm)            \
112 	lck_mtx_convert_spin(&(_skm)->skm_dp_lock)
113 #define SKM_DEPOT_LOCK_TRY(_skm)                \
114 	lck_mtx_try_lock(&(_skm)->skm_dp_lock)
115 #define SKM_DEPOT_LOCK_ASSERT_HELD(_skm)        \
116 	LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_OWNED)
117 #define SKM_DEPOT_LOCK_ASSERT_NOTHELD(_skm)     \
118 	LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_NOTOWNED)
119 #define SKM_DEPOT_UNLOCK(_skm)                  \
120 	lck_mtx_unlock(&(_skm)->skm_dp_lock)
121 
122 #define SKM_RESIZE_LOCK(_skm)                   \
123 	lck_mtx_lock(&(_skm)->skm_rs_lock)
124 #define SKM_RESIZE_LOCK_ASSERT_HELD(_skm)       \
125 	LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_OWNED)
126 #define SKM_RESIZE_LOCK_ASSERT_NOTHELD(_skm)    \
127 	LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_NOTOWNED)
128 #define SKM_RESIZE_UNLOCK(_skm)                 \
129 	lck_mtx_unlock(&(_skm)->skm_rs_lock)
130 
131 #define SKM_CPU_LOCK(_cp)                       \
132 	lck_mtx_lock(&(_cp)->cp_lock)
133 #define SKM_CPU_LOCK_SPIN(_cp)                  \
134 	lck_mtx_lock_spin(&(_cp)->cp_lock)
135 #define SKM_CPU_CONVERT_LOCK(_cp)               \
136 	lck_mtx_convert_spin(&(_cp)->cp_lock)
137 #define SKM_CPU_LOCK_ASSERT_HELD(_cp)           \
138 	LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_OWNED)
139 #define SKM_CPU_LOCK_ASSERT_NOTHELD(_cp)        \
140 	LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_NOTOWNED)
141 #define SKM_CPU_UNLOCK(_cp)                     \
142 	lck_mtx_unlock(&(_cp)->cp_lock)
143 
144 #define SKM_ZONE_MAX    256
145 
146 static struct zone *skm_zone;                   /* zone for skmem_cache */
147 /*
148  * XXX -fbounds-safety: Took out ZC_DESTRUCTIBLE flag because of static assert
149  * in ZONE_DEFINE_TYPE
150  */
151 ZONE_DECLARE(skm_zone, struct zone *);
152 
153 struct skmem_cache *skmem_slab_cache;    /* cache for skmem_slab */
154 struct skmem_cache *skmem_bufctl_cache;  /* cache for skmem_bufctl */
155 
156 unsigned int bc_size;                    /* size of bufctl */
157 
158 /*
159  * XXX: -fbounds-safety: we added objsize to skmem_cache_batch_alloc(), but this
160  * is only used by -fbounds-safety, so we use __unused if -fbounds-safety is
161  * disabled. The utility macro for that is SK_FB_ARG()
162  */
163 #if !__has_ptrcheck
164 #define SK_FB_ARG __unused
165 #else
166 #define SK_FB_ARG
167 #endif
168 
169 /*
170  * Magazine types (one per row.)
171  *
172  * The first column defines the number of objects that the magazine can hold.
173  * Using that number, we derive the effective number: the aggregate count of
174  * object pointers, plus 2 pointers (skmem_mag linkage + magazine type).
175  * This would result in an object size that is aligned on the CPU cache
176  * size boundary; the exception to this is the KASAN mode where the size
177  * would be larger due to the redzone regions.
178  *
179  * The second column defines the alignment of the magazine.  Because each
180  * magazine is used at the CPU-layer cache, we need to ensure there is no
181  * false sharing across the CPUs, and align the magazines to the maximum
182  * cache alignment size, for simplicity.  The value of 0 may be used to
183  * indicate natural pointer size alignment.
184  *
185  * The third column defines the starting magazine type for a given cache,
186  * determined at the cache's creation time based on its chunk size.
187  *
188  * The fourth column defines the magazine type limit for a given cache.
189  * Magazine resizing will only occur if the chunk size is less than this.
190  */
191 static struct skmem_magtype skmem_magtype[] = {
192 #if defined(__LP64__)
193 	{ .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 128, .mt_maxbuf = 512,
194 	  .mt_cache = NULL, .mt_cname = "" },
195 	{ .mt_magsize = 30, .mt_align = 0, .mt_minbuf = 96, .mt_maxbuf = 256,
196 	  .mt_cache = NULL, .mt_cname = "" },
197 	{ .mt_magsize = 46, .mt_align = 0, .mt_minbuf = 64, .mt_maxbuf = 128,
198 	  .mt_cache = NULL, .mt_cname = "" },
199 	{ .mt_magsize = 62, .mt_align = 0, .mt_minbuf = 32, .mt_maxbuf = 64,
200 	  .mt_cache = NULL, .mt_cname = "" },
201 	{ .mt_magsize = 94, .mt_align = 0, .mt_minbuf = 16, .mt_maxbuf = 32,
202 	  .mt_cache = NULL, .mt_cname = "" },
203 	{ .mt_magsize = 126, .mt_align = 0, .mt_minbuf = 8, .mt_maxbuf = 16,
204 	  .mt_cache = NULL, .mt_cname = "" },
205 	{ .mt_magsize = 142, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 8,
206 	  .mt_cache = NULL, .mt_cname = "" },
207 	{ .mt_magsize = 158, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
208 	  .mt_cache = NULL, .mt_cname = "" },
209 #else /* !__LP64__ */
210 	{ .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
211 	  .mt_cache = NULL, .mt_cname = "" },
212 #endif /* !__LP64__ */
213 };
214 
215 /*
216  * Hash table bounds.  Start with the initial value, and rescale up to
217  * the specified limit.  Ideally we don't need a limit, but in practice
218  * this helps guard against runaways.  These values should be revisited
219  * in future and be adjusted as needed.
220  */
221 #define SKMEM_CACHE_HASH_INITIAL        64      /* initial hash table size */
222 #define SKMEM_CACHE_HASH_LIMIT          8192    /* hash table size limit */
223 
224 /*
225  * The last magazine type.
226  */
227 static struct skmem_magtype *skmem_cache_magsize_last;
228 
229 static TAILQ_HEAD(, skmem_cache) skmem_cache_head;
230 static boolean_t skmem_cache_ready;
231 static int skmem_magazine_ctor(struct skmem_obj_info *,
232     struct skmem_obj_info *, void *, uint32_t);
233 static void skmem_magazine_destroy(struct skmem_cache *, struct skmem_mag *,
234     int);
235 static uint32_t skmem_depot_batch_alloc(struct skmem_cache *,
236     struct skmem_maglist *, uint32_t *, struct skmem_mag *__bidi_indexable *, uint32_t);
237 static void skmem_depot_batch_free(struct skmem_cache *, struct skmem_maglist *,
238     uint32_t *, struct skmem_mag *);
239 static void skmem_depot_ws_update(struct skmem_cache *);
240 static void skmem_depot_ws_zero(struct skmem_cache *);
241 static void skmem_depot_ws_reap(struct skmem_cache *);
242 #define SKMEM_CACHE_FREE_NOCACHE    0x1
243 static void skmem_cache_batch_free_common(struct skmem_cache *, struct skmem_obj *, uint32_t);
244 static void skmem_cache_magazine_purge(struct skmem_cache *);
245 static void skmem_cache_magazine_enable(struct skmem_cache *, uint32_t);
246 static void skmem_cache_magazine_resize(struct skmem_cache *);
247 static void skmem_cache_hash_rescale(struct skmem_cache *);
248 static void skmem_cpu_reload(struct skmem_cpu_cache *, struct skmem_mag *, int);
249 static void skmem_cpu_batch_reload(struct skmem_cpu_cache *,
250     struct skmem_mag *, int);
251 static void skmem_cache_applyall(void (*)(struct skmem_cache *, uint32_t),
252     uint32_t);
253 static void skmem_cache_reclaim(struct skmem_cache *, uint32_t);
254 static void skmem_cache_reap_start(void);
255 static void skmem_cache_reap_done(void);
256 static void skmem_cache_reap_func(thread_call_param_t, thread_call_param_t);
257 static void skmem_cache_update_func(thread_call_param_t, thread_call_param_t);
258 static int skmem_cache_resize_enter(struct skmem_cache *, boolean_t);
259 static void skmem_cache_resize_exit(struct skmem_cache *);
260 static void skmem_audit_buf(struct skmem_cache *, struct skmem_obj *);
261 static int skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS;
262 
263 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, cache,
264     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
265     0, 0, skmem_cache_mib_get_sysctl, "S,sk_stats_cache",
266     "Skywalk cache statistics");
267 
268 static volatile uint32_t skmem_cache_reaping;
269 static thread_call_t skmem_cache_reap_tc;
270 static thread_call_t skmem_cache_update_tc;
271 
272 extern kern_return_t thread_terminate(thread_t);
273 extern unsigned int ml_wait_max_cpus(void);
274 
275 #define SKMEM_DEBUG_NOMAGAZINES 0x1     /* disable magazines layer */
276 #define SKMEM_DEBUG_AUDIT       0x2     /* audit transactions */
277 #define SKMEM_DEBUG_MASK        (SKMEM_DEBUG_NOMAGAZINES|SKMEM_DEBUG_AUDIT)
278 
279 #if DEBUG
280 static uint32_t skmem_debug = SKMEM_DEBUG_AUDIT;
281 #else /* !DEBUG */
282 static uint32_t skmem_debug = 0;
283 #endif /* !DEBUG */
284 
285 static uint32_t skmem_clear_min = 0;    /* clear on free threshold */
286 
287 #define SKMEM_CACHE_UPDATE_INTERVAL     11      /* 11 seconds */
288 static uint32_t skmem_cache_update_interval = SKMEM_CACHE_UPDATE_INTERVAL;
289 
290 #define SKMEM_DEPOT_CONTENTION  3       /* max failed trylock per interval */
291 static int skmem_cache_depot_contention = SKMEM_DEPOT_CONTENTION;
292 
293 #if (DEVELOPMENT || DEBUG)
294 SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, cache_update_interval,
295     CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_update_interval,
296     SKMEM_CACHE_UPDATE_INTERVAL, "Cache update interval");
297 SYSCTL_INT(_kern_skywalk_mem, OID_AUTO, cache_depot_contention,
298     CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_depot_contention,
299     SKMEM_DEPOT_CONTENTION, "Depot contention");
300 
301 static uint32_t skmem_cache_update_interval_saved = SKMEM_CACHE_UPDATE_INTERVAL;
302 
303 /*
304  * Called by skmem_test_start() to set the update interval.
305  */
306 void
skmem_cache_test_start(uint32_t i)307 skmem_cache_test_start(uint32_t i)
308 {
309 	skmem_cache_update_interval_saved = skmem_cache_update_interval;
310 	skmem_cache_update_interval = i;
311 }
312 
313 /*
314  * Called by skmem_test_stop() to restore the update interval.
315  */
316 void
skmem_cache_test_stop(void)317 skmem_cache_test_stop(void)
318 {
319 	skmem_cache_update_interval = skmem_cache_update_interval_saved;
320 }
321 #endif /* (DEVELOPMENT || DEBUG) */
322 
323 #define SKMEM_TAG_BUFCTL_HASH   "com.apple.skywalk.bufctl.hash"
324 static SKMEM_TAG_DEFINE(skmem_tag_bufctl_hash, SKMEM_TAG_BUFCTL_HASH);
325 
326 #define SKMEM_TAG_CACHE_MIB     "com.apple.skywalk.cache.mib"
327 static SKMEM_TAG_DEFINE(skmem_tag_cache_mib, SKMEM_TAG_CACHE_MIB);
328 
329 static int __skmem_cache_pre_inited = 0;
330 static int __skmem_cache_inited = 0;
331 
332 /*
333  * Called before skmem_region_init().
334  */
335 void
skmem_cache_pre_init(void)336 skmem_cache_pre_init(void)
337 {
338 	vm_size_t skm_size;
339 
340 	ASSERT(!__skmem_cache_pre_inited);
341 
342 	ncpu = ml_wait_max_cpus();
343 
344 	/* allocate extra in case we need to manually align the pointer */
345 	if (skm_zone == NULL) {
346 		skm_size = SKMEM_CACHE_SIZE(ncpu);
347 #if KASAN
348 		/*
349 		 * When KASAN is enabled, the zone allocator adjusts the
350 		 * element size to include the redzone regions, in which
351 		 * case we assume that the elements won't start on the
352 		 * alignment boundary and thus need to do some fix-ups.
353 		 * These include increasing the effective object size
354 		 * which adds at least 136 bytes to the original size,
355 		 * as computed by skmem_region_params_config() above.
356 		 */
357 		skm_size += (sizeof(void *) + CHANNEL_CACHE_ALIGN_MAX);
358 #endif /* KASAN */
359 		skm_size = P2ROUNDUP(skm_size, CHANNEL_CACHE_ALIGN_MAX);
360 		skm_zone = zone_create(SKMEM_ZONE_PREFIX ".skm", skm_size,
361 		    ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
362 	}
363 
364 	TAILQ_INIT(&skmem_cache_head);
365 
366 	__skmem_cache_pre_inited = 1;
367 }
368 
369 /*
370  * Called after skmem_region_init().
371  */
372 void
skmem_cache_init(void)373 skmem_cache_init(void)
374 {
375 	uint32_t cpu_cache_line_size = skmem_cpu_cache_line_size();
376 	struct skmem_magtype *mtp;
377 	uint32_t i;
378 
379 	_CASSERT(SKMEM_CACHE_HASH_LIMIT >= SKMEM_CACHE_HASH_INITIAL);
380 
381 	_CASSERT(SKM_MODE_NOMAGAZINES == SCA_MODE_NOMAGAZINES);
382 	_CASSERT(SKM_MODE_AUDIT == SCA_MODE_AUDIT);
383 	_CASSERT(SKM_MODE_NOREDIRECT == SCA_MODE_NOREDIRECT);
384 	_CASSERT(SKM_MODE_BATCH == SCA_MODE_BATCH);
385 	_CASSERT(SKM_MODE_DYNAMIC == SCA_MODE_DYNAMIC);
386 	_CASSERT(SKM_MODE_CLEARONFREE == SCA_MODE_CLEARONFREE);
387 	_CASSERT(SKM_MODE_PSEUDO == SCA_MODE_PSEUDO);
388 
389 	ASSERT(__skmem_cache_pre_inited);
390 	ASSERT(!__skmem_cache_inited);
391 
392 	_CASSERT(offsetof(struct skmem_bufctl, bc_addr) == offsetof(struct skmem_bufctl_audit, bc_addr));
393 	_CASSERT(offsetof(struct skmem_bufctl, bc_addrm) == offsetof(struct skmem_bufctl_audit, bc_addrm));
394 	_CASSERT(offsetof(struct skmem_bufctl, bc_slab) == offsetof(struct skmem_bufctl_audit, bc_slab));
395 	_CASSERT(offsetof(struct skmem_bufctl, bc_lim) == offsetof(struct skmem_bufctl_audit, bc_lim));
396 	_CASSERT(offsetof(struct skmem_bufctl, bc_flags) == offsetof(struct skmem_bufctl_audit, bc_flags));
397 	_CASSERT(offsetof(struct skmem_bufctl, bc_idx) == offsetof(struct skmem_bufctl_audit, bc_idx));
398 	_CASSERT(offsetof(struct skmem_bufctl, bc_usecnt) == offsetof(struct skmem_bufctl_audit, bc_usecnt));
399 	_CASSERT(sizeof(struct skmem_bufctl) == offsetof(struct skmem_bufctl_audit, bc_thread));
400 
401 	PE_parse_boot_argn("skmem_debug", &skmem_debug, sizeof(skmem_debug));
402 	skmem_debug &= SKMEM_DEBUG_MASK;
403 
404 #if (DEVELOPMENT || DEBUG)
405 	PE_parse_boot_argn("skmem_clear_min", &skmem_clear_min,
406 	    sizeof(skmem_clear_min));
407 #endif /* (DEVELOPMENT || DEBUG) */
408 	if (skmem_clear_min == 0) {
409 		/* zeroing 2 CPU cache lines practically comes for free */
410 		skmem_clear_min = 2 * cpu_cache_line_size;
411 	} else {
412 		/* round it up to CPU cache line size */
413 		skmem_clear_min = (uint32_t)P2ROUNDUP(skmem_clear_min,
414 		    cpu_cache_line_size);
415 	}
416 
417 	/* create a cache for buffer control structures */
418 	if (skmem_debug & SKMEM_DEBUG_AUDIT) {
419 		bc_size = sizeof(struct skmem_bufctl_audit);
420 		skmem_bufctl_cache = skmem_cache_create("bufctl.audit",
421 		    bc_size, sizeof(uint64_t), NULL, NULL,
422 		    NULL, NULL, NULL, 0);
423 	} else {
424 		bc_size = sizeof(struct skmem_bufctl);
425 		skmem_bufctl_cache = skmem_cache_create("bufctl",
426 		    bc_size, sizeof(uint64_t), NULL, NULL,
427 		    NULL, NULL, NULL, 0);
428 	}
429 
430 	/* create a cache for slab structures */
431 	skmem_slab_cache = skmem_cache_create("slab",
432 	    sizeof(struct skmem_slab), sizeof(uint64_t), NULL, NULL, NULL,
433 	    NULL, NULL, 0);
434 
435 	/*
436 	 * Go thru the magazine type table and create a cache for each.
437 	 */
438 	for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
439 		const char *__null_terminated mt_cname = NULL;
440 		mtp = &skmem_magtype[i];
441 
442 		if (mtp->mt_align != 0 &&
443 		    ((mtp->mt_align & (mtp->mt_align - 1)) != 0 ||
444 		    mtp->mt_align < (int)cpu_cache_line_size)) {
445 			panic("%s: bad alignment %d", __func__, mtp->mt_align);
446 			/* NOTREACHED */
447 			__builtin_unreachable();
448 		}
449 		mt_cname = tsnprintf(mtp->mt_cname, sizeof(mtp->mt_cname),
450 		    "mg.%d", mtp->mt_magsize);
451 
452 		/* create a cache for this magazine type */
453 		mtp->mt_cache = skmem_cache_create(mt_cname,
454 		    SKMEM_MAG_SIZE(mtp->mt_magsize), mtp->mt_align,
455 		    skmem_magazine_ctor, NULL, NULL, mtp, NULL, 0);
456 
457 		/* remember the last magazine type */
458 		skmem_cache_magsize_last = mtp;
459 	}
460 
461 	VERIFY(skmem_cache_magsize_last != NULL);
462 	VERIFY(skmem_cache_magsize_last->mt_minbuf == 0);
463 	VERIFY(skmem_cache_magsize_last->mt_maxbuf == 0);
464 
465 	/*
466 	 * Allocate thread calls for cache reap and update operations.
467 	 */
468 	skmem_cache_reap_tc =
469 	    thread_call_allocate_with_options(skmem_cache_reap_func,
470 	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
471 	skmem_cache_update_tc =
472 	    thread_call_allocate_with_options(skmem_cache_update_func,
473 	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
474 	if (skmem_cache_reap_tc == NULL || skmem_cache_update_tc == NULL) {
475 		panic("%s: thread_call_allocate failed", __func__);
476 		/* NOTREACHED */
477 		__builtin_unreachable();
478 	}
479 
480 	/*
481 	 * We're ready; go through existing skmem_cache entries
482 	 * (if any) and enable the magazines layer for each.
483 	 */
484 	skmem_cache_applyall(skmem_cache_magazine_enable, 0);
485 	skmem_cache_ready = TRUE;
486 
487 	/* and start the periodic cache update machinery */
488 	skmem_dispatch(skmem_cache_update_tc, NULL,
489 	    (skmem_cache_update_interval * NSEC_PER_SEC));
490 
491 	__skmem_cache_inited = 1;
492 }
493 
494 void
skmem_cache_fini(void)495 skmem_cache_fini(void)
496 {
497 	struct skmem_magtype *mtp;
498 	uint32_t i;
499 
500 	if (__skmem_cache_inited) {
501 		ASSERT(TAILQ_EMPTY(&skmem_cache_head));
502 
503 		for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
504 			mtp = &skmem_magtype[i];
505 			skmem_cache_destroy(mtp->mt_cache);
506 			mtp->mt_cache = NULL;
507 		}
508 		skmem_cache_destroy(skmem_slab_cache);
509 		skmem_slab_cache = NULL;
510 		skmem_cache_destroy(skmem_bufctl_cache);
511 		skmem_bufctl_cache = NULL;
512 
513 		if (skmem_cache_reap_tc != NULL) {
514 			(void) thread_call_cancel_wait(skmem_cache_reap_tc);
515 			(void) thread_call_free(skmem_cache_reap_tc);
516 			skmem_cache_reap_tc = NULL;
517 		}
518 		if (skmem_cache_update_tc != NULL) {
519 			(void) thread_call_cancel_wait(skmem_cache_update_tc);
520 			(void) thread_call_free(skmem_cache_update_tc);
521 			skmem_cache_update_tc = NULL;
522 		}
523 
524 		__skmem_cache_inited = 0;
525 	}
526 
527 	if (__skmem_cache_pre_inited) {
528 		if (skm_zone != NULL) {
529 			zdestroy(skm_zone);
530 			skm_zone = NULL;
531 		}
532 
533 		__skmem_cache_pre_inited = 0;
534 	}
535 }
536 
537 /*
538  * Create a cache.
539  */
540 struct skmem_cache *
skmem_cache_create(const char * name,size_t bufsize,size_t bufalign,skmem_ctor_fn_t ctor,skmem_dtor_fn_t dtor,skmem_reclaim_fn_t reclaim,void * private,struct skmem_region * region,uint32_t cflags)541 skmem_cache_create(const char *name, size_t bufsize, size_t bufalign,
542     skmem_ctor_fn_t ctor, skmem_dtor_fn_t dtor, skmem_reclaim_fn_t reclaim,
543     void *private, struct skmem_region *region, uint32_t cflags)
544 {
545 	boolean_t pseudo = (region == NULL);
546 	struct skmem_magtype *mtp;
547 	struct skmem_cache *__single skm;
548 #if KASAN
549 	void *buf;
550 	size_t skm_align_off;
551 #endif
552 	size_t segsize;
553 	size_t chunksize;
554 	size_t objsize;
555 	size_t objalign;
556 	uint32_t i, cpuid;
557 
558 	/* enforce 64-bit minimum alignment for buffers */
559 	if (bufalign == 0) {
560 		bufalign = SKMEM_CACHE_ALIGN;
561 	}
562 	bufalign = P2ROUNDUP(bufalign, SKMEM_CACHE_ALIGN);
563 
564 	/* enforce alignment to be a power of 2 */
565 	VERIFY(powerof2(bufalign));
566 
567 	if (region == NULL) {
568 		struct skmem_region_params srp = {};
569 
570 		/* batching is currently not supported on pseudo regions */
571 		VERIFY(!(cflags & SKMEM_CR_BATCH));
572 
573 		srp = *skmem_get_default(SKMEM_REGION_INTRINSIC);
574 		ASSERT(srp.srp_cflags == SKMEM_REGION_CR_PSEUDO);
575 
576 		/* objalign is always equal to bufalign */
577 		srp.srp_align = objalign = bufalign;
578 		srp.srp_r_obj_cnt = 1;
579 		srp.srp_r_obj_size = (uint32_t)bufsize;
580 		skmem_region_params_config(&srp);
581 
582 		/* allocate region for intrinsics */
583 		region = skmem_region_create(name, &srp, NULL, NULL, NULL);
584 		VERIFY(region->skr_c_obj_size >= P2ROUNDUP(bufsize, bufalign));
585 		VERIFY(objalign == region->skr_align);
586 #if KASAN
587 		/*
588 		 * When KASAN is enabled, the zone allocator adjusts the
589 		 * element size to include the redzone regions, in which
590 		 * case we assume that the elements won't start on the
591 		 * alignment boundary and thus need to do some fix-ups.
592 		 * These include increasing the effective object size
593 		 * which adds at least 16 bytes to the original size,
594 		 * as computed by skmem_region_params_config() above.
595 		 */
596 		VERIFY(region->skr_c_obj_size >=
597 		    (bufsize + sizeof(uint64_t) + bufalign));
598 #endif /* KASAN */
599 		/* enable magazine resizing by default */
600 		cflags |= SKMEM_CR_DYNAMIC;
601 
602 		/*
603 		 * For consistency with ZC_ZFREE_CLEARMEM on skr->zreg,
604 		 * even though it's a no-op since the work is done
605 		 * at the zone layer instead.
606 		 */
607 		cflags |= SKMEM_CR_CLEARONFREE;
608 	} else {
609 		objalign = region->skr_align;
610 	}
611 
612 	ASSERT(region != NULL);
613 	ASSERT(!(region->skr_mode & SKR_MODE_MIRRORED));
614 	segsize = region->skr_seg_size;
615 	ASSERT(bufalign <= segsize);
616 
617 #if KASAN
618 	buf = zalloc_flags_buf(skm_zone, Z_WAITOK | Z_ZERO);
619 	/*
620 	 * We need to align `buf` such that offsetof(struct skmem_cache, skm_align)
621 	 * is aligned to a cache line boundary. In KASAN builds, allocations are
622 	 * preceded by metadata that changes the alignment of the object. The
623 	 * extra required size is accounted for at the time skm_zone is created.
624 	 * We then save the actual start of the allocation to skm_start, as it's
625 	 * the address we need to actually free.
626 	 */
627 	skm_align_off = offsetof(struct skmem_cache, skm_align);
628 	uintptr_t diff = P2ROUNDUP((intptr_t)buf + skm_align_off,
629 	    CHANNEL_CACHE_ALIGN_MAX) - (intptr_t)buf;
630 	skm = (void *)((char *)buf + diff);
631 	skm->skm_start = buf;
632 #else /* !KASAN */
633 	/*
634 	 * We expect that the zone allocator would allocate elements
635 	 * rounded up to the requested alignment based on the object
636 	 * size computed in skmem_cache_pre_init() earlier, and
637 	 * 'skm' is therefore the element address itself.
638 	 */
639 	skm = zalloc_flags_buf(skm_zone, Z_WAITOK | Z_ZERO);
640 #endif /* !KASAN */
641 	skm->skm_cpu_cache_count = ncpu;
642 
643 	VERIFY(IS_P2ALIGNED(skm, CHANNEL_CACHE_ALIGN_MAX));
644 
645 	if ((skmem_debug & SKMEM_DEBUG_NOMAGAZINES) ||
646 	    (cflags & SKMEM_CR_NOMAGAZINES)) {
647 		/*
648 		 * Either the caller insists that this cache should not
649 		 * utilize magazines layer, or that the system override
650 		 * to disable magazines layer on all caches has been set.
651 		 */
652 		skm->skm_mode |= SKM_MODE_NOMAGAZINES;
653 	} else {
654 		/*
655 		 * Region must be configured with enough objects
656 		 * to take into account objects at the CPU layer.
657 		 */
658 		ASSERT(!(region->skr_mode & SKR_MODE_NOMAGAZINES));
659 	}
660 
661 	if (cflags & SKMEM_CR_DYNAMIC) {
662 		/*
663 		 * Enable per-CPU cache magazine resizing.
664 		 */
665 		skm->skm_mode |= SKM_MODE_DYNAMIC;
666 	}
667 
668 	/* region stays around after defunct? */
669 	if (region->skr_mode & SKR_MODE_NOREDIRECT) {
670 		skm->skm_mode |= SKM_MODE_NOREDIRECT;
671 	}
672 
673 	if (cflags & SKMEM_CR_BATCH) {
674 		/*
675 		 * Batch alloc/free involves storing the next object
676 		 * pointer at the beginning of each object; this is
677 		 * okay for kernel-only regions, but not those that
678 		 * are mappable to user space (we can't leak kernel
679 		 * addresses).
680 		 */
681 		_CASSERT(offsetof(struct skmem_obj, mo_next) == 0);
682 		VERIFY(!(region->skr_mode & SKR_MODE_MMAPOK));
683 
684 		/* batching is currently not supported on pseudo regions */
685 		VERIFY(!(region->skr_mode & SKR_MODE_PSEUDO));
686 
687 		/* validate object size */
688 		VERIFY(region->skr_c_obj_size >= sizeof(struct skmem_obj));
689 
690 		skm->skm_mode |= SKM_MODE_BATCH;
691 	}
692 
693 	uuid_generate_random(skm->skm_uuid);
694 	(void) snprintf(skm->skm_name, sizeof(skm->skm_name),
695 	    "%s.%s", SKMEM_CACHE_PREFIX, name);
696 	skm->skm_bufsize = bufsize;
697 	skm->skm_bufalign = bufalign;
698 	skm->skm_objalign = objalign;
699 	skm->skm_ctor = ctor;
700 	skm->skm_dtor = dtor;
701 	skm->skm_reclaim = reclaim;
702 	skm->skm_private = private;
703 	skm->skm_slabsize = segsize;
704 
705 	skm->skm_region = region;
706 	/* callee holds reference */
707 	skmem_region_slab_config(region, skm, true);
708 	objsize = region->skr_c_obj_size;
709 	skm->skm_objsize = objsize;
710 
711 	if (pseudo) {
712 		/*
713 		 * Release reference from skmem_region_create()
714 		 * since skm->skm_region holds one now.
715 		 */
716 		ASSERT(region->skr_mode & SKR_MODE_PSEUDO);
717 		skmem_region_release(region);
718 
719 		skm->skm_mode |= SKM_MODE_PSEUDO;
720 
721 		skm->skm_slab_alloc = skmem_slab_alloc_pseudo_locked;
722 		skm->skm_slab_free = skmem_slab_free_pseudo_locked;
723 	} else {
724 		skm->skm_slab_alloc = skmem_slab_alloc_locked;
725 		skm->skm_slab_free = skmem_slab_free_locked;
726 
727 		/* auditing was requested? (normal regions only) */
728 		if (skmem_debug & SKMEM_DEBUG_AUDIT) {
729 			ASSERT(bc_size == sizeof(struct skmem_bufctl_audit));
730 			skm->skm_mode |= SKM_MODE_AUDIT;
731 		}
732 	}
733 
734 	/*
735 	 * Clear upon free (to slab layer) as long as the region is
736 	 * not marked as read-only for kernel, and if the chunk size
737 	 * is within the threshold or if the caller had requested it.
738 	 */
739 	if (!(region->skr_mode & SKR_MODE_KREADONLY)) {
740 		if (skm->skm_objsize <= skmem_clear_min ||
741 		    (cflags & SKMEM_CR_CLEARONFREE)) {
742 			skm->skm_mode |= SKM_MODE_CLEARONFREE;
743 		}
744 	}
745 
746 	chunksize = bufsize;
747 	if (bufalign >= SKMEM_CACHE_ALIGN) {
748 		chunksize = P2ROUNDUP(chunksize, SKMEM_CACHE_ALIGN);
749 	}
750 
751 	chunksize = P2ROUNDUP(chunksize, bufalign);
752 	if (chunksize > objsize) {
753 		panic("%s: (bufsize %lu, chunksize %lu) > objsize %lu",
754 		    __func__, bufsize, chunksize, objsize);
755 		/* NOTREACHED */
756 		__builtin_unreachable();
757 	}
758 	ASSERT(chunksize != 0);
759 	skm->skm_chunksize = chunksize;
760 
761 	lck_mtx_init(&skm->skm_sl_lock, &skmem_sl_lock_grp, &skmem_lock_attr);
762 	TAILQ_INIT(&skm->skm_sl_partial_list);
763 	TAILQ_INIT(&skm->skm_sl_empty_list);
764 
765 	/* allocated-address hash table */
766 	skm->skm_hash_initial = SKMEM_CACHE_HASH_INITIAL;
767 	skm->skm_hash_limit = SKMEM_CACHE_HASH_LIMIT;
768 	skm->skm_hash_table = sk_alloc_type_array(struct skmem_bufctl_bkt,
769 	    skm->skm_hash_initial, Z_WAITOK | Z_NOFAIL, skmem_tag_bufctl_hash);
770 	skm->skm_hash_size = skm->skm_hash_initial;
771 
772 	skm->skm_hash_mask = (skm->skm_hash_initial - 1);
773 	skm->skm_hash_shift = flsll(chunksize) - 1;
774 
775 	for (i = 0; i < (skm->skm_hash_mask + 1); i++) {
776 		SLIST_INIT(&skm->skm_hash_table[i].bcb_head);
777 	}
778 
779 	lck_mtx_init(&skm->skm_dp_lock, &skmem_dp_lock_grp, &skmem_lock_attr);
780 
781 	/* find a suitable magazine type for this chunk size */
782 	for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
783 		continue;
784 	}
785 
786 	skm->skm_magtype = mtp;
787 	if (!(skm->skm_mode & SKM_MODE_NOMAGAZINES)) {
788 		skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
789 	}
790 
791 	/*
792 	 * Initialize the CPU layer.  Each per-CPU structure is aligned
793 	 * on the CPU cache line boundary to prevent false sharing.
794 	 */
795 	lck_mtx_init(&skm->skm_rs_lock, &skmem_cpu_lock_grp, &skmem_lock_attr);
796 	for (cpuid = 0; cpuid < ncpu; cpuid++) {
797 		struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];
798 
799 		VERIFY(IS_P2ALIGNED(ccp, CHANNEL_CACHE_ALIGN_MAX));
800 		lck_mtx_init(&ccp->cp_lock, &skmem_cpu_lock_grp,
801 		    &skmem_lock_attr);
802 		ccp->cp_rounds = -1;
803 		ccp->cp_prounds = -1;
804 	}
805 
806 	SKMEM_CACHE_LOCK();
807 	TAILQ_INSERT_TAIL(&skmem_cache_head, skm, skm_link);
808 	SKMEM_CACHE_UNLOCK();
809 
810 	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx mode 0x%b",
811 	    skm->skm_name, SK_KVA(skm), skm->skm_mode, SKM_MODE_BITS);
812 	SK_DF(SK_VERB_MEM_CACHE,
813 	    "  bufsz %u bufalign %u chunksz %u objsz %u slabsz %u",
814 	    (uint32_t)skm->skm_bufsize, (uint32_t)skm->skm_bufalign,
815 	    (uint32_t)skm->skm_chunksize, (uint32_t)skm->skm_objsize,
816 	    (uint32_t)skm->skm_slabsize);
817 
818 	if (skmem_cache_ready) {
819 		skmem_cache_magazine_enable(skm, 0);
820 	}
821 
822 	if (cflags & SKMEM_CR_RECLAIM) {
823 		skm->skm_mode |= SKM_MODE_RECLAIM;
824 	}
825 
826 	return skm;
827 }
828 
/*
 * Destroy a cache.
 *
 * The caller must guarantee that every object allocated from this cache
 * has been freed back to it, and that no other thread can reach the
 * cache anymore; any outstanding object triggers a panic below.
 */
void
skmem_cache_destroy(struct skmem_cache *skm)
{
	uint32_t cpuid;

	/* unlink from the global list of caches first */
	SKMEM_CACHE_LOCK();
	TAILQ_REMOVE(&skmem_cache_head, skm, skm_link);
	SKMEM_CACHE_UNLOCK();

	/* expect no outstanding activity or waiters on the resize state */
	ASSERT(skm->skm_rs_busy == 0);
	ASSERT(skm->skm_rs_want == 0);

	/* purge all cached objects for this cache */
	skmem_cache_magazine_purge(skm);

	/*
	 * Panic if we detect there are unfreed objects; the caller
	 * destroying this cache is responsible for ensuring that all
	 * allocated objects have been freed prior to getting here.
	 */
	SKM_SLAB_LOCK(skm);
	if (skm->skm_sl_bufinuse != 0) {
		panic("%s: '%s' (%p) not empty (%llu unfreed)", __func__,
		    skm->skm_name, (void *)skm, skm->skm_sl_bufinuse);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	/* with nothing in use, no partial/empty slabs may remain */
	ASSERT(TAILQ_EMPTY(&skm->skm_sl_partial_list));
	ASSERT(skm->skm_sl_partial == 0);
	ASSERT(TAILQ_EMPTY(&skm->skm_sl_empty_list));
	ASSERT(skm->skm_sl_empty == 0);
	skm->skm_reclaim = NULL;
	skm->skm_ctor = NULL;
	skm->skm_dtor = NULL;
	SKM_SLAB_UNLOCK(skm);

	if (skm->skm_hash_table != NULL) {
#if (DEBUG || DEVELOPMENT)
		/* every allocated-address hash bucket must be empty by now */
		for (uint32_t i = 0; i < (skm->skm_hash_mask + 1); i++) {
			ASSERT(SLIST_EMPTY(&skm->skm_hash_table[i].bcb_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		/* XXX -fbounds-safety: __counted_by pointer (skm_hash_table)
		 * cannot be pointed to by any other variable */
		struct skmem_bufctl_bkt *__indexable htable = skm->skm_hash_table;
		sk_free_type_array(struct skmem_bufctl_bkt,
		    skm->skm_hash_size, htable);
		skm->skm_hash_table = NULL;
		htable = NULL;
		skm->skm_hash_size = 0;
	}

	/* tear down the per-CPU locks, then the cache-wide ones */
	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		lck_mtx_destroy(&skm->skm_cpu_cache[cpuid].cp_lock,
		    &skmem_cpu_lock_grp);
	}
	lck_mtx_destroy(&skm->skm_rs_lock, &skmem_cpu_lock_grp);
	lck_mtx_destroy(&skm->skm_dp_lock, &skmem_dp_lock_grp);
	lck_mtx_destroy(&skm->skm_sl_lock, &skmem_sl_lock_grp);

	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm 0x%llx",
	    skm->skm_name, SK_KVA(skm));

	/* callee releases reference */
	skmem_region_slab_config(skm->skm_region, skm, false);
	skm->skm_region = NULL;

#if KASAN
	/* get the original address since we're about to free it */
	zfree(skm_zone, skm->skm_start);
#else
	zfree(skm_zone, skm);
#endif /* KASAN */
}
907 
/*
 * Return the object's region info.
 *
 * Given an object address "buf" belonging to cache "skm", fill in "oi"
 * with the master object's address, size, buffer control and region/
 * segment indices.  If "oim" is non-NULL, it is zeroed and then filled
 * with the slave (mirrored) object's info when one exists.  Panics if
 * "buf" is not an object of this cache.  Performed under the slab lock.
 */
void
skmem_cache_get_obj_info(struct skmem_cache *skm, void *buf,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address.  If not found, panic since the caller has
	 * given us a bogus address.
	 */
	SKM_SLAB_LOCK(skm);
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
		if (bc->bc_addr == buf) {
			break;
		}
	}

	if (__improbable(bc == NULL)) {
		panic("%s: %s failed to get object info for %p",
		    __func__, skm->skm_name, buf);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * Return the master object's info to the caller.
	 */
	sl = bc->bc_slab;
	SKMEM_OBJ_ADDR(oi) = __unsafe_forge_bidi_indexable(void *, bc->bc_addr,
	    (uint32_t)skm->skm_objsize);
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	SKMEM_OBJ_BUFCTL(oi) = bc;      /* master only; NULL for slave */
	/* region-wide index: objects per slab times segment, plus offset */
	SKMEM_OBJ_IDX_REG(oi) =
	    (sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx;
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
	/*
	 * And for slave object.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			SKMEM_OBJ_ADDR(oim) = __unsafe_forge_bidi_indexable(
				void *, bc->bc_addrm, oi->oi_size);
			SKMEM_OBJ_SIZE(oim) = oi->oi_size;
			SKMEM_OBJ_IDX_REG(oim) = oi->oi_idx_reg;
			SKMEM_OBJ_IDX_SEG(oim) = oi->oi_idx_seg;
		}
	}
	SKM_SLAB_UNLOCK(skm);
}
966 
967 /*
968  * Magazine constructor.
969  */
970 static int
skmem_magazine_ctor(struct skmem_obj_info * oi,struct skmem_obj_info * oim,void * arg,uint32_t skmflag)971 skmem_magazine_ctor(struct skmem_obj_info *oi, struct skmem_obj_info *oim,
972     void *arg, uint32_t skmflag)
973 {
974 #pragma unused(oim, skmflag)
975 	struct skmem_mag *__single mg = SKMEM_OBJ_ADDR(oi);
976 
977 	ASSERT(oim == NULL);
978 	ASSERT(arg != NULL);
979 
980 	/*
981 	 * Store it in the magazine object since we'll
982 	 * need to refer to it during magazine destroy;
983 	 * we can't safely refer to skm_magtype as the
984 	 * depot lock may not be acquired then.
985 	 */
986 	mg->mg_magtype = arg;
987 
988 	return 0;
989 }
990 
/*
 * Destroy a magazine (free each object to the slab layer).
 *
 * "nrounds" is the number of valid objects held in mg->mg_round[].
 * Each object is deconstructed and returned to the slab layer; the
 * magazine itself is then freed back to its magazine-type cache.
 */
static void
skmem_magazine_destroy(struct skmem_cache *skm, struct skmem_mag *mg,
    int nrounds)
{
	int round;

	for (round = 0; round < nrounds; round++) {
		void *__single buf = mg->mg_round[round];
		struct skmem_obj *next;

		/* detach from the chain before the destructor may scribble it */
		if (skm->skm_mode & SKM_MODE_BATCH) {
			next = ((struct skmem_obj *)buf)->mo_next;
			((struct skmem_obj *)buf)->mo_next = NULL;
		}

		/* deconstruct the object */
		if (skm->skm_dtor != NULL) {
			skm->skm_dtor(buf, skm->skm_private);
		}

		/*
		 * In non-batching mode, each object in the magazine has
		 * no linkage to its neighbor, so free individual object
		 * to the slab layer now.
		 */
		if (!(skm->skm_mode & SKM_MODE_BATCH)) {
			skmem_slab_free(skm, buf);
		} else {
			/* restore the linkage for the batch free below */
			((struct skmem_obj *)buf)->mo_next = next;
		}
	}

	/*
	 * In batching mode, each object is linked to its neighbor at free
	 * time, and so take the bottom-most object and free it to the slab
	 * layer.  Because of the way the list is reversed during free, this
	 * will bring along the rest of objects above it.
	 */
	if (nrounds > 0 && (skm->skm_mode & SKM_MODE_BATCH)) {
		skmem_slab_batch_free(skm, mg->mg_round[nrounds - 1]);
	}

	/* free the magazine itself back to cache */
	skmem_cache_free(mg->mg_magtype->mt_cache, mg);
}
1039 
/*
 * Get one or more magazines from the depot.
 *
 * Moves up to "num" magazines off maglist "ml" into *list (chained
 * through mg_link), decrements the depot counter *count by the number
 * taken, and returns that number.  May return fewer than requested
 * (including 0, with *list set to NULL) if the list runs dry.
 */
static uint32_t
skmem_depot_batch_alloc(struct skmem_cache *skm, struct skmem_maglist *ml,
    uint32_t *count, struct skmem_mag *__bidi_indexable *list, uint32_t num)
{
	SLIST_HEAD(, skmem_mag) mg_list = SLIST_HEAD_INITIALIZER(mg_list);
	struct skmem_mag *mg;
	uint32_t need = num, c = 0;

	ASSERT(list != NULL && need > 0);

	if (!SKM_DEPOT_LOCK_TRY(skm)) {
		/*
		 * Track the amount of lock contention here; if the contention
		 * level is high (more than skmem_cache_depot_contention per a
		 * given skmem_cache_update_interval interval), then we treat
		 * it as a sign that the per-CPU layer is not using the right
		 * magazine type, and that we'd need to resize it.
		 */
		SKM_DEPOT_LOCK(skm);
		if (skm->skm_mode & SKM_MODE_DYNAMIC) {
			skm->skm_depot_contention++;
		}
	}

	while ((mg = SLIST_FIRST(&ml->ml_list)) != NULL) {
		SLIST_REMOVE_HEAD(&ml->ml_list, mg_link);
		SLIST_INSERT_HEAD(&mg_list, mg, mg_link);
		ASSERT(ml->ml_total != 0);
		/* maintain the low watermark used by the working-set reaper */
		if (--ml->ml_total < ml->ml_min) {
			ml->ml_min = ml->ml_total;
		}
		c++;
		ml->ml_alloc++;
		if (--need == 0) {
			break;
		}
	}
	*count -= c;

	SKM_DEPOT_UNLOCK(skm);

	*list = SLIST_FIRST(&mg_list);

	return num - need;
}
1088 
1089 /*
1090  * Return one or more magazines to the depot.
1091  */
1092 static void
skmem_depot_batch_free(struct skmem_cache * skm,struct skmem_maglist * ml,uint32_t * count,struct skmem_mag * mg)1093 skmem_depot_batch_free(struct skmem_cache *skm, struct skmem_maglist *ml,
1094     uint32_t *count, struct skmem_mag *mg)
1095 {
1096 	struct skmem_mag *nmg;
1097 	uint32_t c = 0;
1098 
1099 	SKM_DEPOT_LOCK(skm);
1100 	while (mg != NULL) {
1101 		nmg = SLIST_NEXT(mg, mg_link);
1102 		SLIST_INSERT_HEAD(&ml->ml_list, mg, mg_link);
1103 		ml->ml_total++;
1104 		c++;
1105 		mg = nmg;
1106 	}
1107 	*count += c;
1108 	SKM_DEPOT_UNLOCK(skm);
1109 }
1110 
1111 /*
1112  * Update the depot's working state statistics.
1113  */
1114 static void
skmem_depot_ws_update(struct skmem_cache * skm)1115 skmem_depot_ws_update(struct skmem_cache *skm)
1116 {
1117 	SKM_DEPOT_LOCK_SPIN(skm);
1118 	skm->skm_full.ml_reaplimit = skm->skm_full.ml_min;
1119 	skm->skm_full.ml_min = skm->skm_full.ml_total;
1120 	skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_min;
1121 	skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1122 	SKM_DEPOT_UNLOCK(skm);
1123 }
1124 
1125 /*
1126  * Empty the depot's working state statistics (everything's reapable.)
1127  */
1128 static void
skmem_depot_ws_zero(struct skmem_cache * skm)1129 skmem_depot_ws_zero(struct skmem_cache *skm)
1130 {
1131 	SKM_DEPOT_LOCK_SPIN(skm);
1132 	if (skm->skm_full.ml_reaplimit != skm->skm_full.ml_total ||
1133 	    skm->skm_full.ml_min != skm->skm_full.ml_total ||
1134 	    skm->skm_empty.ml_reaplimit != skm->skm_empty.ml_total ||
1135 	    skm->skm_empty.ml_min != skm->skm_empty.ml_total) {
1136 		skm->skm_full.ml_reaplimit = skm->skm_full.ml_total;
1137 		skm->skm_full.ml_min = skm->skm_full.ml_total;
1138 		skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_total;
1139 		skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1140 		skm->skm_depot_ws_zero++;
1141 	}
1142 	SKM_DEPOT_UNLOCK(skm);
1143 }
1144 
/*
 * Reap magazines that's outside of the working set.
 *
 * For each depot list, up to min(ml_reaplimit, ml_min) magazines —
 * i.e. those untouched over the recent update intervals — are pulled
 * off and destroyed.  Full magazines are destroyed with a full round
 * count; empty ones with 0 rounds (nothing to free to the slab).
 */
static void
skmem_depot_ws_reap(struct skmem_cache *skm)
{
	struct skmem_mag *mg, *nmg;
	uint32_t f, e, reap;

	reap = f = MIN(skm->skm_full.ml_reaplimit, skm->skm_full.ml_min);
	if (reap != 0) {
		(void) skmem_depot_batch_alloc(skm, &skm->skm_full,
		    &skm->skm_depot_full, &mg, reap);
		while (mg != NULL) {
			/* unlink each magazine before destroying it */
			nmg = SLIST_NEXT(mg, mg_link);
			SLIST_NEXT(mg, mg_link) = NULL;
			skmem_magazine_destroy(skm, mg,
			    mg->mg_magtype->mt_magsize);
			mg = nmg;
		}
	}

	reap = e = MIN(skm->skm_empty.ml_reaplimit, skm->skm_empty.ml_min);
	if (reap != 0) {
		(void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
		    &skm->skm_depot_empty, &mg, reap);
		while (mg != NULL) {
			nmg = SLIST_NEXT(mg, mg_link);
			SLIST_NEXT(mg, mg_link) = NULL;
			skmem_magazine_destroy(skm, mg, 0);
			mg = nmg;
		}
	}

	/* note that a reap episode took place */
	if (f != 0 || e != 0) {
		os_atomic_inc(&skm->skm_cpu_mag_reap, relaxed);
	}
}
1183 
/*
 * Performs periodic maintenance on a cache.  This is serialized
 * through the update thread call, and so we guarantee there's at
 * most one update episode in the system at any given time.
 *
 * Decides (under the respective locks) whether the allocated-address
 * hash table needs rescaling and whether the magazines need resizing,
 * then performs those actions after the locks are dropped.  "arg" is
 * unused.
 */
static void
skmem_cache_update(struct skmem_cache *skm, uint32_t arg)
{
#pragma unused(arg)
	boolean_t resize_mag = FALSE;
	boolean_t rescale_hash = FALSE;

	SKMEM_CACHE_LOCK_ASSERT_HELD();

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());

	/*
	 * If the cache has become much larger or smaller than the
	 * allocated-address hash table, rescale the hash table.
	 */
	SKM_SLAB_LOCK(skm);
	if ((skm->skm_sl_bufinuse > (skm->skm_hash_mask << 1) &&
	    (skm->skm_hash_mask + 1) < skm->skm_hash_limit) ||
	    (skm->skm_sl_bufinuse < (skm->skm_hash_mask >> 1) &&
	    skm->skm_hash_mask > skm->skm_hash_initial)) {
		rescale_hash = TRUE;
	}
	SKM_SLAB_UNLOCK(skm);

	/*
	 * Update the working set.
	 */
	skmem_depot_ws_update(skm);

	/*
	 * If the contention count is greater than the threshold during
	 * the update interval, and if we are not already at the maximum
	 * magazine size, increase it.
	 */
	SKM_DEPOT_LOCK_SPIN(skm);
	if (skm->skm_chunksize < skm->skm_magtype->mt_maxbuf &&
	    (int)(skm->skm_depot_contention - skm->skm_depot_contention_prev) >
	    skmem_cache_depot_contention) {
		ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);
		resize_mag = TRUE;
	}
	skm->skm_depot_contention_prev = skm->skm_depot_contention;
	SKM_DEPOT_UNLOCK(skm);

	/* do the heavier work outside of the slab/depot locks */
	if (rescale_hash) {
		skmem_cache_hash_rescale(skm);
	}

	if (resize_mag) {
		skmem_cache_magazine_resize(skm);
	}
}
1242 
1243 /*
1244  * Reload the CPU's magazines with mg and its follower (if any).
1245  */
1246 static void
skmem_cpu_batch_reload(struct skmem_cpu_cache * cp,struct skmem_mag * mg,int rounds)1247 skmem_cpu_batch_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg,
1248     int rounds)
1249 {
1250 	ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
1251 	    (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1252 	ASSERT(cp->cp_magsize > 0);
1253 
1254 	cp->cp_loaded = mg;
1255 	cp->cp_rounds = rounds;
1256 	if (__probable(SLIST_NEXT(mg, mg_link) != NULL)) {
1257 		cp->cp_ploaded = SLIST_NEXT(mg, mg_link);
1258 		cp->cp_prounds = rounds;
1259 		SLIST_NEXT(mg, mg_link) = NULL;
1260 	} else {
1261 		ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
1262 		cp->cp_ploaded = NULL;
1263 		cp->cp_prounds = -1;
1264 	}
1265 }
1266 
1267 /*
1268  * Reload the CPU's magazine with mg and save the previous one.
1269  */
1270 static void
skmem_cpu_reload(struct skmem_cpu_cache * cp,struct skmem_mag * mg,int rounds)1271 skmem_cpu_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg, int rounds)
1272 {
1273 	ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
1274 	    (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1275 	ASSERT(cp->cp_magsize > 0);
1276 
1277 	cp->cp_ploaded = cp->cp_loaded;
1278 	cp->cp_prounds = cp->cp_rounds;
1279 	cp->cp_loaded = mg;
1280 	cp->cp_rounds = rounds;
1281 }
1282 
/*
 * Allocate constructed object(s) from the cache.
 *
 * Attempts to allocate "num" objects, trying the per-CPU magazine
 * layer first, then full magazines from the depot, and finally the
 * slab layer.  Returns the number of objects actually allocated,
 * which may be fewer than "num".  The head of the allocation is
 * stored in *list (NULL if nothing was allocated); in batching mode
 * (SKM_MODE_BATCH) the objects are chained through mo_next.  "objsize"
 * is used only to bound the object when recording an audit transaction.
 */
uint32_t
skmem_cache_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
    size_t SK_FB_ARG objsize, uint32_t num, uint32_t skmflag)
{
	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
	struct skmem_obj **top = list;
	struct skmem_mag *mg;
	uint32_t need = num;

	ASSERT(list != NULL);
	*list = NULL;

	if (need == 0) {
		return 0;
	}
	/* multi-object requests are only valid in batching mode */
	ASSERT(need == 1 || (skm->skm_mode & SKM_MODE_BATCH));

	SKM_CPU_LOCK(cp);
	for (;;) {
		/*
		 * If we have an object in the current CPU's loaded
		 * magazine, return it and we're done.
		 */
		if (cp->cp_rounds > 0) {
			int objs = MIN((unsigned int)cp->cp_rounds, need);
			/*
			 * In the SKM_MODE_BATCH case, objects in are already
			 * linked together with the most recently freed object
			 * at the head of the list; grab as many objects as we
			 * can.  Otherwise we'll just grab 1 object at most.
			 */
			*list = cp->cp_loaded->mg_round[cp->cp_rounds - 1];
			cp->cp_rounds -= objs;
			cp->cp_alloc += objs;

			if (skm->skm_mode & SKM_MODE_BATCH) {
				struct skmem_obj *__single tail =
				    cp->cp_loaded->mg_round[cp->cp_rounds];
				list = &tail->mo_next;
				*list = NULL;
			}

			/* if we got them all, return to caller */
			if ((need -= objs) == 0) {
				SKM_CPU_UNLOCK(cp);
				goto done;
			}
		}

		/*
		 * The CPU's loaded magazine is empty.  If the previously
		 * loaded magazine was full, exchange and try again.
		 */
		if (cp->cp_prounds > 0) {
			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
			continue;
		}

		/*
		 * If the magazine layer is disabled, allocate from slab.
		 * This can happen either because SKM_MODE_NOMAGAZINES is
		 * set, or because we are resizing the magazine now.
		 */
		if (cp->cp_magsize == 0) {
			break;
		}

		/*
		 * Both of the CPU's magazines are empty; try to get
		 * full magazine(s) from the depot layer.  Upon success,
		 * reload and try again.  To prevent potential thrashing,
		 * replace both empty magazines only if the requested
		 * count exceeds a magazine's worth of objects.
		 */
		(void) skmem_depot_batch_alloc(skm, &skm->skm_full,
		    &skm->skm_depot_full, &mg, (need <= cp->cp_magsize) ? 1 : 2);
		if (mg != NULL) {
			SLIST_HEAD(, skmem_mag) mg_list =
			    SLIST_HEAD_INITIALIZER(mg_list);

			if (cp->cp_ploaded != NULL) {
				SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
				    mg_link);
			}
			if (SLIST_NEXT(mg, mg_link) == NULL) {
				/*
				 * Depot allocation returns only 1 magazine;
				 * retain current empty magazine.
				 */
				skmem_cpu_reload(cp, mg, cp->cp_magsize);
			} else {
				/*
				 * We got 2 full magazines from depot;
				 * release the current empty magazine
				 * back to the depot layer.
				 */
				if (cp->cp_loaded != NULL) {
					SLIST_INSERT_HEAD(&mg_list,
					    cp->cp_loaded, mg_link);
				}
				skmem_cpu_batch_reload(cp, mg, cp->cp_magsize);
			}
			skmem_depot_batch_free(skm, &skm->skm_empty,
			    &skm->skm_depot_empty, SLIST_FIRST(&mg_list));
			continue;
		}

		/*
		 * The depot layer doesn't have any full magazines;
		 * allocate directly from the slab layer.
		 */
		break;
	}
	SKM_CPU_UNLOCK(cp);

	if (__probable(num > 1 && (skm->skm_mode & SKM_MODE_BATCH) != 0)) {
		struct skmem_obj *rtop, *__single rlist, *rlistp = NULL;
		uint32_t rlistc, c = 0;

		/*
		 * Get a list of raw objects from the slab layer.
		 */
		rlistc = skmem_slab_batch_alloc(skm, &rlist, need, skmflag);
		ASSERT(rlistc == 0 || rlist != NULL);
		rtop = rlist;

		/*
		 * Construct each object in the raw list.  Upon failure,
		 * free any remaining objects in the list back to the slab
		 * layer, and keep the ones that were successfully constructed.
		 * Here, "oi" and "oim" in each skmem_obj refer to the objects
		 * coming from the master and slave regions (on mirrored
		 * regions), respectively.  They are stored inside the object
		 * temporarily so that we can pass them to the constructor.
		 */
		while (skm->skm_ctor != NULL && rlist != NULL) {
			struct skmem_obj_info *oi = &rlist->mo_info;
			struct skmem_obj_info *oim = &rlist->mo_minfo;
			struct skmem_obj *rlistn = rlist->mo_next;

			/*
			 * Note that the constructor guarantees at least
			 * the size of a pointer at the top of the object
			 * and no more than that.  That means we must not
			 * refer to "oi" and "oim" any longer after the
			 * object goes thru the constructor.
			 */
			if (skm->skm_ctor(oi, ((SKMEM_OBJ_ADDR(oim) != NULL) ?
			    oim : NULL), skm->skm_private, skmflag) != 0) {
				VERIFY(rlist->mo_next == rlistn);
				os_atomic_add(&skm->skm_sl_alloc_fail,
				    rlistc - c, relaxed);
				/* detach the constructed sublist from the rest */
				if (rlistp != NULL) {
					rlistp->mo_next = NULL;
				}
				if (rlist == rtop) {
					rtop = NULL;
					ASSERT(c == 0);
				}
				skmem_slab_batch_free(skm, rlist);
				rlist = NULL;
				rlistc = c;
				break;
			}
			VERIFY(rlist->mo_next == rlistn);

			++c;                    /* # of constructed objs */
			rlistp = rlist;
			if ((rlist = rlist->mo_next) == NULL) {
				ASSERT(rlistc == c);
				break;
			}
		}

		/*
		 * At this point "top" points to the head of the chain we're
		 * going to return to caller; "list" points to the tail of that
		 * chain.  The second chain begins at "rtop", and we append
		 * that after "list" to form a single chain.  "rlistc" is the
		 * number of objects in "rtop" originated from the slab layer
		 * that have been successfully constructed (if applicable).
		 */
		ASSERT(c == 0 || rtop != NULL);
		need -= rlistc;
		*list = rtop;
	} else {
		struct skmem_obj_info oi, oim;
		void *buf;

		ASSERT(*top == NULL && num == 1 && need == 1);

		/*
		 * Get a single raw object from the slab layer.
		 */
		if (skmem_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
			goto done;
		}

		buf = SKMEM_OBJ_ADDR(&oi);
		ASSERT(buf != NULL);

		/*
		 * Construct the raw object.  Here, "oi" and "oim" refer to
		 * the objects coming from the master and slave regions (on
		 * mirrored regions), respectively.
		 */
		if (skm->skm_ctor != NULL &&
		    skm->skm_ctor(&oi, ((SKMEM_OBJ_ADDR(&oim) != NULL) ?
		    &oim : NULL), skm->skm_private, skmflag) != 0) {
			os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
			skmem_slab_free(skm, buf);
			goto done;
		}

		need = 0;
		*list = buf;
		ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
		    (*list)->mo_next == NULL);
	}

done:
	/* if auditing is enabled, record this transaction */
	if (__improbable(*top != NULL &&
	    (skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_buf(skm,
		    __unsafe_forge_bidi_indexable(struct skmem_obj *, *top, objsize));
	}

	return num - need;
}
1516 
1517 /*
1518  * Free a constructed object to the cache.
1519  */
1520 void
skmem_cache_free(struct skmem_cache * skm,void * buf)1521 skmem_cache_free(struct skmem_cache *skm, void *buf)
1522 {
1523 	if (skm->skm_mode & SKM_MODE_BATCH) {
1524 		((struct skmem_obj *)buf)->mo_next = NULL;
1525 	}
1526 	skmem_cache_batch_free_common(skm, (struct skmem_obj *)buf, 0);
1527 }
1528 
1529 /*
1530  * Free a constructed object.
1531  */
1532 void
skmem_cache_free_nocache(struct skmem_cache * skm,void * buf)1533 skmem_cache_free_nocache(struct skmem_cache *skm, void *buf)
1534 {
1535 	if (skm->skm_mode & SKM_MODE_BATCH) {
1536 		((struct skmem_obj *)buf)->mo_next = NULL;
1537 	}
1538 	skmem_cache_batch_free_common(skm, (struct skmem_obj *)buf, SKMEM_CACHE_FREE_NOCACHE);
1539 }
1540 
/*
 * Free a chain of constructed objects (linked via mo_next) to the cache.
 */
void
skmem_cache_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
{
	skmem_cache_batch_free_common(skm, list, 0);
}
1546 
1547 void
skmem_cache_batch_free_nocache(struct skmem_cache * skm,struct skmem_obj * list)1548 skmem_cache_batch_free_nocache(struct skmem_cache *skm, struct skmem_obj *list)
1549 {
1550 	skmem_cache_batch_free_common(skm, list, SKMEM_CACHE_FREE_NOCACHE);
1551 }
1552 
1553 static void
skmem_cache_batch_free_common(struct skmem_cache * skm,struct skmem_obj * list,uint32_t flags)1554 skmem_cache_batch_free_common(struct skmem_cache *skm, struct skmem_obj *list, uint32_t flags)
1555 {
1556 	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
1557 	struct skmem_magtype *mtp;
1558 	/*
1559 	 * XXX -fbounds-safety: Don't mark mg as __single, because it's a struct
1560 	 * with a flexible array, and when we allocate it, the alloc function
1561 	 * returns an __indexable to tell us the bounds. But if we mark this as
1562 	 * __single, we lose that information. It might compile fine, but at
1563 	 * runtime, before we actually assign the count value, there will be a
1564 	 * comparison between current count value and the new count value we
1565 	 * assign, where current count is supposed to be greater than the new
1566 	 * count. Unfortunately, this will most likely fail.
1567 	 */
1568 	struct skmem_mag *mg;
1569 	struct skmem_obj *listn;
1570 #if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
1571 	vm_offset_t tagged_address;           /* address tagging */
1572 	struct skmem_region *region;          /* region source for this cache */
1573 #endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
1574 
1575 	/* if auditing is enabled, record this transaction */
1576 	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
1577 		skmem_audit_buf(skm, list);
1578 	}
1579 
1580 	if (flags & SKMEM_CACHE_FREE_NOCACHE) {
1581 		goto nocache;
1582 	}
1583 
1584 	SKM_CPU_LOCK(cp);
1585 	for (;;) {
1586 		/*
1587 		 * If there's an available space in the current CPU's
1588 		 * loaded magazine, place it there and we're done.
1589 		 */
1590 		if ((unsigned int)cp->cp_rounds <
1591 		    (unsigned int)cp->cp_magsize) {
1592 			/*
1593 			 * In the SKM_MODE_BATCH case, reverse the list
1594 			 * while we place each object into the magazine;
1595 			 * this effectively causes the most recently
1596 			 * freed object to be reused during allocation.
1597 			 */
1598 			if (skm->skm_mode & SKM_MODE_BATCH) {
1599 				listn = list->mo_next;
1600 				list->mo_next = (cp->cp_rounds == 0) ? NULL :
1601 				    cp->cp_loaded->mg_round[cp->cp_rounds - 1];
1602 			} else {
1603 				listn = NULL;
1604 			}
1605 #if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
1606 			/*
1607 			 * If this region is configured to be tagged, we re-tag
1608 			 * the address that's being freed, to protect against
1609 			 * use-after-free bugs. This "re-tagged" address will
1610 			 * reside in the CPU's loaded magazine, and when cache
1611 			 * alloc is called, it is returned to client as is. At
1612 			 * this point, we know that this object will be freed to
1613 			 * the CPU's loaded magazine and not down to the slab
1614 			 * layer, so we won't be double tagging the same address
1615 			 * in the magazine layer and slab layer.
1616 			 */
1617 			region = skm->skm_region;
1618 			if (region->skr_mode & SKR_MODE_MEMTAG) {
1619 				tagged_address = vm_memtag_assign_tag(
1620 					(vm_offset_t)list, skm->skm_objsize);
1621 				vm_memtag_set_tag(tagged_address,
1622 				    skm->skm_objsize);
1623 				cp->cp_loaded->mg_round[cp->cp_rounds++] =
1624 				    __unsafe_forge_bidi_indexable(
1625 					struct skmem_obj *, tagged_address,
1626 					skm->skm_objsize);
1627 			} else {
1628 				cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
1629 			}
1630 #else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
1631 			cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
1632 #endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
1633 			cp->cp_free++;
1634 
1635 			if ((list = listn) != NULL) {
1636 				continue;
1637 			}
1638 
1639 			SKM_CPU_UNLOCK(cp);
1640 			return;
1641 		}
1642 
1643 		/*
1644 		 * The loaded magazine is full.  If the previously
1645 		 * loaded magazine was empty, exchange and try again.
1646 		 */
1647 		if (cp->cp_prounds == 0) {
1648 			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
1649 			continue;
1650 		}
1651 
1652 		/*
1653 		 * If the magazine layer is disabled, free to slab.
1654 		 * This can happen either because SKM_MODE_NOMAGAZINES
1655 		 * is set, or because we are resizing the magazine now.
1656 		 */
1657 		if (cp->cp_magsize == 0) {
1658 			break;
1659 		}
1660 
1661 		/*
1662 		 * Both magazines for the CPU are full; try to get
1663 		 * empty magazine(s) from the depot.  If we get one,
1664 		 * exchange a full magazine with it and place the
1665 		 * object in there.
1666 		 *
1667 		 * TODO: Because the caller currently doesn't indicate
1668 		 * the number of objects in the list, we choose the more
1669 		 * conservative approach of allocating only 1 empty
1670 		 * magazine (to prevent potential thrashing).  Once we
1671 		 * have the object count, we can replace 1 with similar
1672 		 * logic as used in skmem_cache_batch_alloc().
1673 		 */
1674 		(void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
1675 		    &skm->skm_depot_empty, &mg, 1);
1676 		if (mg != NULL) {
1677 			SLIST_HEAD(, skmem_mag) mg_list =
1678 			    SLIST_HEAD_INITIALIZER(mg_list);
1679 
1680 			if (cp->cp_ploaded != NULL) {
1681 				SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
1682 				    mg_link);
1683 			}
1684 			if (SLIST_NEXT(mg, mg_link) == NULL) {
1685 				/*
1686 				 * Depot allocation returns only 1 magazine;
1687 				 * retain current full magazine.
1688 				 */
1689 				skmem_cpu_reload(cp, mg, 0);
1690 			} else {
1691 				/*
1692 				 * We got 2 empty magazines from depot;
1693 				 * release the current full magazine back
1694 				 * to the depot layer.
1695 				 */
1696 				if (cp->cp_loaded != NULL) {
1697 					SLIST_INSERT_HEAD(&mg_list,
1698 					    cp->cp_loaded, mg_link);
1699 				}
1700 				skmem_cpu_batch_reload(cp, mg, 0);
1701 			}
1702 			skmem_depot_batch_free(skm, &skm->skm_full,
1703 			    &skm->skm_depot_full, SLIST_FIRST(&mg_list));
1704 			continue;
1705 		}
1706 
1707 		/*
1708 		 * We can't get any empty magazine from the depot, and
1709 		 * so we need to allocate one.  If the allocation fails,
1710 		 * just fall through, deconstruct and free the object
1711 		 * to the slab layer.
1712 		 */
1713 		mtp = skm->skm_magtype;
1714 		SKM_CPU_UNLOCK(cp);
1715 		mg = skmem_cache_alloc(mtp->mt_cache, SKMEM_NOSLEEP);
1716 		SKM_CPU_LOCK(cp);
1717 
1718 		if (mg != NULL) {
1719 			/*
1720 			 * XXX -fbounds-safety requires mg to be set before
1721 			 * setting mg->mg_count. But self-assignment mg = mg was
1722 			 * not allowed. As such, we used the following
1723 			 * workaround
1724 			 */
1725 			void *vmg = mg;
1726 			mg = vmg;
1727 			mg->mg_count = mg->mg_magtype->mt_magsize;
1728 			/*
1729 			 * We allocated an empty magazine, but since we
1730 			 * dropped the CPU lock above the magazine size
1731 			 * may have changed.  If that's the case free
1732 			 * the magazine and try again.
1733 			 */
1734 			if (cp->cp_magsize != mtp->mt_magsize) {
1735 				SKM_CPU_UNLOCK(cp);
1736 				skmem_cache_free(mtp->mt_cache, mg);
1737 				SKM_CPU_LOCK(cp);
1738 				continue;
1739 			}
1740 
1741 			/*
1742 			 * We have a magazine with the right size;
1743 			 * add it to the depot and try again.
1744 			 */
1745 			ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
1746 			skmem_depot_batch_free(skm, &skm->skm_empty,
1747 			    &skm->skm_depot_empty, mg);
1748 			continue;
1749 		}
1750 
1751 		/*
1752 		 * We can't get an empty magazine, so free to slab.
1753 		 */
1754 		break;
1755 	}
1756 	SKM_CPU_UNLOCK(cp);
1757 
1758 nocache:
1759 	/*
1760 	 * We weren't able to free the constructed object(s) to the
1761 	 * magazine layer, so deconstruct them and free to the slab.
1762 	 */
1763 	if (__probable((skm->skm_mode & SKM_MODE_BATCH) &&
1764 	    list->mo_next != NULL)) {
1765 		/* whatever is left from original list */
1766 		struct skmem_obj *top = list;
1767 
1768 		while (list != NULL && skm->skm_dtor != NULL) {
1769 			listn = list->mo_next;
1770 			list->mo_next = NULL;
1771 
1772 			/* deconstruct the object */
1773 			if (skm->skm_dtor != NULL) {
1774 				skm->skm_dtor((void *)list, skm->skm_private);
1775 			}
1776 
1777 			list->mo_next = listn;
1778 			list = listn;
1779 		}
1780 
1781 		skmem_slab_batch_free(skm, top);
1782 	} else {
1783 		/* deconstruct the object */
1784 		if (skm->skm_dtor != NULL) {
1785 			skm->skm_dtor((void *)list, skm->skm_private);
1786 		}
1787 
1788 		skmem_slab_free(skm, (void *)list);
1789 	}
1790 }
1791 
1792 /*
1793  * Return the maximum number of objects cached at the magazine layer
1794  * based on the chunk size.  This takes into account the starting
1795  * magazine type as well as the final magazine type used in resizing.
1796  */
1797 uint32_t
skmem_cache_magazine_max(uint32_t chunksize)1798 skmem_cache_magazine_max(uint32_t chunksize)
1799 {
1800 	struct skmem_magtype *mtp;
1801 	uint32_t magsize_max;
1802 
1803 	VERIFY(ncpu != 0);
1804 	VERIFY(chunksize > 0);
1805 
1806 	/* find a suitable magazine type for this chunk size */
1807 	for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
1808 		continue;
1809 	}
1810 
1811 	/* and find the last magazine type  */
1812 	for (;;) {
1813 		magsize_max = mtp->mt_magsize;
1814 		if (mtp == skmem_cache_magsize_last ||
1815 		    chunksize >= mtp->mt_maxbuf) {
1816 			break;
1817 		}
1818 		++mtp;
1819 		VERIFY(mtp <= skmem_cache_magsize_last);
1820 	}
1821 
1822 	return ncpu * magsize_max * 2; /* two magazines per CPU */
1823 }
1824 
1825 /*
1826  * Return true if SKMEM_DEBUG_NOMAGAZINES is not set on skmem_debug.
1827  */
1828 boolean_t
skmem_allow_magazines(void)1829 skmem_allow_magazines(void)
1830 {
1831 	return !(skmem_debug & SKMEM_DEBUG_NOMAGAZINES);
1832 }
1833 
1834 /*
1835  * Purge all magazines from a cache and disable its per-CPU magazines layer.
1836  */
static void
skmem_cache_magazine_purge(struct skmem_cache *skm)
{
	struct skmem_cpu_cache *cp;
	struct skmem_mag *mg, *pmg;
	int rounds, prounds;
	uint32_t cpuid, mg_cnt = 0, pmg_cnt = 0;

	/* magazine destruction below may free objects down to the slab layer */
	SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx", SK_KVA(skm));

	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		cp = &skm->skm_cpu_cache[cpuid];

		/*
		 * Detach both magazines and disable this CPU's layer
		 * (cp_magsize = 0) while holding the CPU lock; the
		 * magazines themselves are destroyed only after the
		 * lock is dropped.
		 */
		SKM_CPU_LOCK_SPIN(cp);
		mg = cp->cp_loaded;
		pmg = cp->cp_ploaded;
		rounds = cp->cp_rounds;
		prounds = cp->cp_prounds;
		cp->cp_loaded = NULL;
		cp->cp_ploaded = NULL;
		cp->cp_rounds = -1;
		cp->cp_prounds = -1;
		cp->cp_magsize = 0;
		SKM_CPU_UNLOCK(cp);

		if (mg != NULL) {
			skmem_magazine_destroy(skm, mg, rounds);
			++mg_cnt;
		}
		if (pmg != NULL) {
			skmem_magazine_destroy(skm, pmg, prounds);
			++pmg_cnt;
		}
	}

	/* count this as a purge only if any CPU magazine was destroyed */
	if (mg_cnt != 0 || pmg_cnt != 0) {
		os_atomic_inc(&skm->skm_cpu_mag_purge, relaxed);
	}

	/* zero the depot working set and reap everything it held */
	skmem_depot_ws_zero(skm);
	skmem_depot_ws_reap(skm);
}
1881 
1882 /*
1883  * Enable magazines on a cache.  Must only be called on a cache with
1884  * its per-CPU magazines layer disabled (e.g. due to purge).
1885  */
static void
skmem_cache_magazine_enable(struct skmem_cache *skm, uint32_t arg)
{
#pragma unused(arg)
	struct skmem_cpu_cache *cp;
	uint32_t cpuid;

	/* nothing to do if this cache never uses magazines */
	if (skm->skm_mode & SKM_MODE_NOMAGAZINES) {
		return;
	}

	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		cp = &skm->skm_cpu_cache[cpuid];
		SKM_CPU_LOCK_SPIN(cp);
		/* the magazines layer must be disabled at this point */
		ASSERT(cp->cp_loaded == NULL);
		ASSERT(cp->cp_ploaded == NULL);
		ASSERT(cp->cp_rounds == -1);
		ASSERT(cp->cp_prounds == -1);
		ASSERT(cp->cp_magsize == 0);
		/* a non-zero cp_magsize (re)enables this CPU's layer */
		cp->cp_magsize = skm->skm_magtype->mt_magsize;
		SKM_CPU_UNLOCK(cp);
	}

	SK_DF(SK_VERB_MEM_CACHE, "skm 0x%llx chunksize %u magsize %d",
	    SK_KVA(skm), (uint32_t)skm->skm_chunksize,
	    SKMEM_CPU_CACHE(skm)->cp_magsize);
}
1914 
1915 /*
1916  * Enter the cache resize perimeter.  Upon success, claim exclusivity
1917  * on the perimeter and return 0, else EBUSY.  Caller may indicate
1918  * whether or not they're willing to wait.
1919  */
static int
skmem_cache_resize_enter(struct skmem_cache *skm, boolean_t can_sleep)
{
	SKM_RESIZE_LOCK(skm);
	/* the perimeter is recursive for its current owner */
	if (skm->skm_rs_owner == current_thread()) {
		ASSERT(skm->skm_rs_busy != 0);
		skm->skm_rs_busy++;
		goto done;
	}
	if (!can_sleep) {
		/* caller won't wait; fail if another thread is inside */
		if (skm->skm_rs_busy != 0) {
			SKM_RESIZE_UNLOCK(skm);
			return EBUSY;
		}
	} else {
		/* sleep until the current owner exits the perimeter */
		while (skm->skm_rs_busy != 0) {
			skm->skm_rs_want++;
			(void) assert_wait(&skm->skm_rs_busy, THREAD_UNINT);
			SKM_RESIZE_UNLOCK(skm);
			(void) thread_block(THREAD_CONTINUE_NULL);
			SK_DF(SK_VERB_MEM_CACHE, "waited for skm \"%s\" "
			    "(0x%llx) busy=%u", skm->skm_name,
			    SK_KVA(skm), skm->skm_rs_busy);
			SKM_RESIZE_LOCK(skm);
		}
	}
	SKM_RESIZE_LOCK_ASSERT_HELD(skm);
	ASSERT(skm->skm_rs_busy == 0);
	/* claim exclusive ownership of the perimeter */
	skm->skm_rs_busy++;
	skm->skm_rs_owner = current_thread();
done:
	SKM_RESIZE_UNLOCK(skm);
	return 0;
}
1954 
1955 /*
1956  * Exit the cache resize perimeter and unblock any waiters.
1957  */
1958 static void
skmem_cache_resize_exit(struct skmem_cache * skm)1959 skmem_cache_resize_exit(struct skmem_cache *skm)
1960 {
1961 	uint32_t want;
1962 
1963 	SKM_RESIZE_LOCK(skm);
1964 	ASSERT(skm->skm_rs_busy != 0);
1965 	ASSERT(skm->skm_rs_owner == current_thread());
1966 	if (--skm->skm_rs_busy == 0) {
1967 		skm->skm_rs_owner = NULL;
1968 		/*
1969 		 * We're done; notify anyone that has lost the race.
1970 		 */
1971 		if ((want = skm->skm_rs_want) != 0) {
1972 			skm->skm_rs_want = 0;
1973 			wakeup((void *)&skm->skm_rs_busy);
1974 			SKM_RESIZE_UNLOCK(skm);
1975 		} else {
1976 			SKM_RESIZE_UNLOCK(skm);
1977 		}
1978 	} else {
1979 		SKM_RESIZE_UNLOCK(skm);
1980 	}
1981 }
1982 
1983 /*
1984  * Recompute a cache's magazine size.  This is an expensive operation
1985  * and should not be done frequently; larger magazines provide for a
1986  * higher transfer rate with the depot while smaller magazines reduce
1987  * the memory consumption.
1988  */
static void
skmem_cache_magazine_resize(struct skmem_cache *skm)
{
	struct skmem_magtype *mtp = __unsafe_forge_bidi_indexable(
		struct skmem_magtype *, skm->skm_magtype, sizeof(skmem_magtype));

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());
	ASSERT(!(skm->skm_mode & SKM_MODE_NOMAGAZINES));
	/* depot contention only applies to dynamic mode */
	ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);

	/*
	 * Although we're executing in the context of the update thread
	 * call, we need to protect the per-CPU states during resizing
	 * against other synchronous cache purge/reenable requests that
	 * could take place in parallel.
	 */
	if (skm->skm_chunksize < mtp->mt_maxbuf) {
		(void) skmem_cache_resize_enter(skm, TRUE);
		skmem_cache_magazine_purge(skm);

		/*
		 * Upgrade to the next magazine type with larger size.
		 */
		SKM_DEPOT_LOCK_SPIN(skm);
		skm->skm_cpu_mag_resize++;
		skm->skm_magtype = ++mtp;
		skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
		/*
		 * Bias the contention snapshot far upward so another
		 * resize isn't triggered until observed contention
		 * has grown substantially again.
		 */
		skm->skm_depot_contention_prev =
		    skm->skm_depot_contention + INT_MAX;
		SKM_DEPOT_UNLOCK(skm);

		skmem_cache_magazine_enable(skm, 0);
		skmem_cache_resize_exit(skm);
	}
}
2026 
2027 /*
2028  * Rescale the cache's allocated-address hash table.
2029  */
static void
skmem_cache_hash_rescale(struct skmem_cache *skm)
{
	struct skmem_bufctl_bkt *__indexable old_table, *new_table;
	size_t old_size, new_size;
	uint32_t i, moved = 0;

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());

	/*
	 * To get small average lookup time (lookup depth near 1.0), the hash
	 * table size should be roughly the same (not necessarily equivalent)
	 * as the cache size.
	 */
	new_size = MAX(skm->skm_hash_initial,
	    (1 << (flsll(3 * skm->skm_sl_bufinuse + 4) - 2)));
	new_size = MIN(skm->skm_hash_limit, new_size);
	old_size = (skm->skm_hash_mask + 1);

	/* only rescale when the size moves by more than a factor of two */
	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
		return;
	}

	/* allocate the new table outside the slab lock; skip on failure */
	new_table = sk_alloc_type_array(struct skmem_bufctl_bkt, new_size,
	    Z_NOWAIT, skmem_tag_bufctl_hash);
	if (__improbable(new_table == NULL)) {
		return;
	}

	for (i = 0; i < new_size; i++) {
		SLIST_INIT(&new_table[i].bcb_head);
	}

	SKM_SLAB_LOCK(skm);

	/* re-read the old size under the lock in case it changed */
	old_size = (skm->skm_hash_mask + 1);
	old_table = skm->skm_hash_table;

	/* publish the new table before rehashing into it */
	skm->skm_hash_mask = (new_size - 1);
	skm->skm_hash_table = new_table;
	skm->skm_hash_size = new_size;
	skm->skm_sl_rescale++;

	/* migrate every bufctl from the old buckets into the new table */
	for (i = 0; i < old_size; i++) {
		struct skmem_bufctl_bkt *bcb = &old_table[i];
		struct skmem_bufctl_bkt *new_bcb;
		struct skmem_bufctl *bc;

		while ((bc = SLIST_FIRST(&bcb->bcb_head)) != NULL) {
			SLIST_REMOVE_HEAD(&bcb->bcb_head, bc_link);
			new_bcb = SKMEM_CACHE_HASH(skm, bc->bc_addr);
			/*
			 * Ideally we want to insert tail here, but simple
			 * list doesn't give us that.  The fact that we are
			 * essentially reversing the order is not a big deal
			 * here vis-a-vis the new table size.
			 */
			SLIST_INSERT_HEAD(&new_bcb->bcb_head, bc, bc_link);
			++moved;
		}
		ASSERT(SLIST_EMPTY(&bcb->bcb_head));
	}

	SK_DF(SK_VERB_MEM_CACHE,
	    "skm 0x%llx old_size %u new_size %u [%u moved]", SK_KVA(skm),
	    (uint32_t)old_size, (uint32_t)new_size, moved);

	SKM_SLAB_UNLOCK(skm);

	sk_free_type_array(struct skmem_bufctl_bkt, old_size, old_table);
}
2102 
2103 /*
2104  * Apply a function to operate on all caches.
2105  */
2106 static void
skmem_cache_applyall(void (* func)(struct skmem_cache *,uint32_t),uint32_t arg)2107 skmem_cache_applyall(void (*func)(struct skmem_cache *, uint32_t), uint32_t arg)
2108 {
2109 	struct skmem_cache *skm;
2110 
2111 	net_update_uptime();
2112 
2113 	SKMEM_CACHE_LOCK();
2114 	TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
2115 		func(skm, arg);
2116 	}
2117 	SKMEM_CACHE_UNLOCK();
2118 }
2119 
2120 /*
2121  * Reclaim unused memory from a cache.
2122  */
static void
skmem_cache_reclaim(struct skmem_cache *skm, uint32_t lowmem)
{
	/*
	 * Inform the owner to free memory if possible; the reclaim
	 * policy is left to the owner.  This is just an advisory.
	 */
	if (skm->skm_reclaim != NULL) {
		skm->skm_reclaim(skm->skm_private);
	}

	if (lowmem) {
		/*
		 * If another thread is in the process of purging or
		 * resizing, bail out and let the currently-ongoing
		 * purging take its natural course.
		 */
		if (skmem_cache_resize_enter(skm, FALSE) == 0) {
			skmem_cache_magazine_purge(skm);
			skmem_cache_magazine_enable(skm, 0);
			skmem_cache_resize_exit(skm);
		}
	} else {
		/* not under memory pressure: reap only the depot working set */
		skmem_depot_ws_reap(skm);
	}
}
2149 
2150 /*
2151  * Thread call callback for reap.
2152  */
2153 static void
skmem_cache_reap_func(thread_call_param_t dummy,thread_call_param_t arg)2154 skmem_cache_reap_func(thread_call_param_t dummy, thread_call_param_t arg)
2155 {
2156 #pragma unused(dummy)
2157 	void (*func)(void) = arg;
2158 
2159 	ASSERT(func == skmem_cache_reap_start || func == skmem_cache_reap_done);
2160 	func();
2161 }
2162 
2163 /*
2164  * Start reaping all caches; this is serialized via thread call.
2165  */
static void
skmem_cache_reap_start(void)
{
	SK_DF(SK_VERB_MEM_CACHE, "now running");
	/* reclaim from every cache; purge magazines when memory is low */
	skmem_cache_applyall(skmem_cache_reclaim, skmem_lowmem_check());
	/* schedule skmem_cache_reap_done() to clear the reaping flag later */
	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_done,
	    (skmem_cache_update_interval * NSEC_PER_SEC));
}
2174 
2175 /*
2176  * Stop reaping; this would allow another reap request to occur.
2177  */
2178 static void
skmem_cache_reap_done(void)2179 skmem_cache_reap_done(void)
2180 {
2181 	volatile uint32_t *flag = &skmem_cache_reaping;
2182 
2183 	*flag = 0;
2184 	os_atomic_thread_fence(seq_cst);
2185 }
2186 
2187 /*
2188  * Immediately reap all unused memory of a cache.  If purging,
2189  * also purge the cached objects at the CPU layer.
2190  */
void
skmem_cache_reap_now(struct skmem_cache *skm, boolean_t purge)
{
	/* if SKM_MODE_RECLAIM flag is set for this cache, we purge */
	if (purge || (skm->skm_mode & SKM_MODE_RECLAIM)) {
		/*
		 * If another thread is in the process of purging or
		 * resizing, bail out and let the currently-ongoing
		 * purging take its natural course.
		 */
		if (skmem_cache_resize_enter(skm, FALSE) == 0) {
			skmem_cache_magazine_purge(skm);
			skmem_cache_magazine_enable(skm, 0);
			skmem_cache_resize_exit(skm);
		}
	} else {
		/* release the depot working set back to the slab layer */
		skmem_depot_ws_zero(skm);
		skmem_depot_ws_reap(skm);

		/* clean up cp_ploaded magazines from each CPU */
		SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);

		struct skmem_cpu_cache *cp;
		struct skmem_mag *pmg;
		int prounds;
		uint32_t cpuid;

		for (cpuid = 0; cpuid < ncpu; cpuid++) {
			cp = &skm->skm_cpu_cache[cpuid];

			/* detach under the CPU lock; destroy after dropping it */
			SKM_CPU_LOCK_SPIN(cp);
			pmg = cp->cp_ploaded;
			prounds = cp->cp_prounds;

			cp->cp_ploaded = NULL;
			cp->cp_prounds = -1;
			SKM_CPU_UNLOCK(cp);

			if (pmg != NULL) {
				skmem_magazine_destroy(skm, pmg, prounds);
			}
		}
	}
}
2235 
2236 /*
2237  * Request a global reap operation to be dispatched.
2238  */
void
skmem_cache_reap(void)
{
	/*
	 * Only one reaping episode is allowed at a time; the atomic
	 * 0 -> 1 transition of skmem_cache_reaping elects the winner.
	 * Also skip if this thread already owns the global cache lock,
	 * since the reap walk would need to take it.
	 */
	if (skmem_lock_owner == current_thread() ||
	    !os_atomic_cmpxchg(&skmem_cache_reaping, 0, 1, acq_rel)) {
		return;
	}

	/* flag is cleared later by skmem_cache_reap_done() */
	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_start, 0);
}
2250 
2251 /*
2252  * Reap internal caches.
2253  */
void
skmem_reap_caches(boolean_t purge)
{
	/* internal slab and bufctl caches */
	skmem_cache_reap_now(skmem_slab_cache, purge);
	skmem_cache_reap_now(skmem_bufctl_cache, purge);

	/* packet buffer pool objects */
	pp_reap_caches(purge);

	/* also handle the region cache(s) */
	skmem_region_reap_caches(purge);
}
2266 
2267 /*
2268  * Thread call callback for update.
2269  */
static void
skmem_cache_update_func(thread_call_param_t dummy, thread_call_param_t arg)
{
#pragma unused(dummy, arg)
	sk_protect_t protect;

	/* mark the update path protected while walking all caches */
	protect = sk_cache_update_protect();
	skmem_cache_applyall(skmem_cache_update, 0);
	sk_cache_update_unprotect(protect);

	/* self-rearm for the next periodic update */
	skmem_dispatch(skmem_cache_update_tc, NULL,
	    (skmem_cache_update_interval * NSEC_PER_SEC));
}
2283 
2284 /*
2285  * Given an object, find its buffer control and record the transaction.
2286  */
__attribute__((noinline, cold, not_tail_called))
static inline void
skmem_audit_buf(struct skmem_cache *skm, struct skmem_obj *list)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;

	/* pseudo caches have no bufctl hash table to audit against */
	ASSERT(!(skm->skm_mode & SKM_MODE_PSEUDO));

	SKM_SLAB_LOCK(skm);
	while (list != NULL) {
		void *__single buf = list;

		/* look up the bufctl tracking this object's address */
		bcb = SKMEM_CACHE_HASH(skm, buf);
		SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
			if (bc->bc_addr == buf) {
				break;
			}
		}

		if (__improbable(bc == NULL)) {
			panic("%s: %s failed to get bufctl for %p",
			    __func__, skm->skm_name, buf);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		skmem_audit_bufctl(bc);

		/* non-batching caches pass a single object, not a list */
		if (!(skm->skm_mode & SKM_MODE_BATCH)) {
			break;
		}

		list = list->mo_next;
	}
	SKM_SLAB_UNLOCK(skm);
}
2324 
static size_t
skmem_cache_mib_get_stats(struct skmem_cache *skm, void *__sized_by(len) out,
    size_t len)
{
	size_t actual_space = sizeof(struct sk_stats_cache);
	struct sk_stats_cache *__single sca;
	int contention;

	/* size-probe only: report the required size without filling */
	if (out == NULL || len < actual_space) {
		goto done;
	}
	sca = out;

	bzero(sca, sizeof(*sca));
	(void) snprintf(sca->sca_name, sizeof(sca->sca_name), "%s",
	    skm->skm_name);
	uuid_copy(sca->sca_uuid, skm->skm_uuid);
	uuid_copy(sca->sca_ruuid, skm->skm_region->skr_uuid);
	sca->sca_mode = skm->skm_mode;
	sca->sca_bufsize = (uint64_t)skm->skm_bufsize;
	sca->sca_objsize = (uint64_t)skm->skm_objsize;
	sca->sca_chunksize = (uint64_t)skm->skm_chunksize;
	sca->sca_slabsize = (uint64_t)skm->skm_slabsize;
	sca->sca_bufalign = (uint64_t)skm->skm_bufalign;
	sca->sca_objalign = (uint64_t)skm->skm_objalign;

	sca->sca_cpu_mag_size = skm->skm_cpu_mag_size;
	sca->sca_cpu_mag_resize = skm->skm_cpu_mag_resize;
	sca->sca_cpu_mag_purge = skm->skm_cpu_mag_purge;
	sca->sca_cpu_mag_reap = skm->skm_cpu_mag_reap;
	sca->sca_depot_full = skm->skm_depot_full;
	sca->sca_depot_empty = skm->skm_depot_empty;
	sca->sca_depot_ws_zero = skm->skm_depot_ws_zero;
	/* in case of a race this might be a negative value, turn it into 0 */
	if ((contention = (int)(skm->skm_depot_contention -
	    skm->skm_depot_contention_prev)) < 0) {
		contention = 0;
	}
	sca->sca_depot_contention_factor = contention;

	/* sum the rounds cached across all per-CPU magazines */
	sca->sca_cpu_rounds = 0;
	sca->sca_cpu_prounds = 0;
	for (int cpuid = 0; cpuid < ncpu; cpuid++) {
		struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];

		SKM_CPU_LOCK(ccp);
		if (ccp->cp_rounds > -1) {
			sca->sca_cpu_rounds += ccp->cp_rounds;
		}
		if (ccp->cp_prounds > -1) {
			sca->sca_cpu_prounds += ccp->cp_prounds;
		}
		SKM_CPU_UNLOCK(ccp);
	}

	sca->sca_sl_create = skm->skm_sl_create;
	sca->sca_sl_destroy = skm->skm_sl_destroy;
	sca->sca_sl_alloc = skm->skm_sl_alloc;
	sca->sca_sl_free = skm->skm_sl_free;
	sca->sca_sl_alloc_fail = skm->skm_sl_alloc_fail;
	sca->sca_sl_partial = skm->skm_sl_partial;
	sca->sca_sl_empty = skm->skm_sl_empty;
	sca->sca_sl_bufinuse = skm->skm_sl_bufinuse;
	sca->sca_sl_rescale = skm->skm_sl_rescale;
	sca->sca_sl_hash_size = (skm->skm_hash_mask + 1);

done:
	/* always return the space required for one record */
	return actual_space;
}
2394 
static int
skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	struct skmem_cache *skm;
	size_t actual_space;
	size_t buffer_space;
	size_t allocated_space = 0;
	caddr_t __sized_by(allocated_space) buffer = NULL;
	caddr_t scan;
	int error = 0;

	/* cache statistics are restricted to the superuser */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		/* clamp the kernel-side staging buffer */
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		caddr_t temp;
		temp = sk_alloc_data(buffer_space, Z_WAITOK, skmem_tag_cache_mib);
		if (__improbable(temp == NULL)) {
			return ENOBUFS;
		}
		buffer = temp;
		allocated_space = buffer_space;
	} else if (req->oldptr == USER_ADDR_NULL) {
		/* size-probe only; no buffer to fill */
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;

	SKMEM_CACHE_LOCK();
	TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
		size_t size = skmem_cache_mib_get_stats(skm, scan, buffer_space);
		if (scan != NULL) {
			if (buffer_space < size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += size;
			buffer_space -= size;
		}
		/* keep totalling so userland learns the required size */
		actual_space += size;
	}
	SKMEM_CACHE_UNLOCK();

	if (actual_space != 0) {
		int out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data_sized_by(buffer, allocated_space);
	}

	return error;
}
2458