xref: /xnu-12377.81.4/bsd/skywalk/mem/skmem_cache.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <skywalk/os_skywalk_private.h>
30 #define _FN_KPRINTF
31 #include <pexpert/pexpert.h>    /* for PE_parse_boot_argn */
32 #include <libkern/OSDebug.h>    /* for OSBacktrace */
33 #include <kern/sched_prim.h>    /* for assert_wait */
34 #include <kern/uipc_domain.h>
35 #include <vm/vm_memtag.h>
36 
37 /*
38  * Memory allocator with per-CPU caching (magazines), derived from the kmem
39  * magazine concept and implementation as described in the following paper:
40  * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
41  *
42  * That implementation is Copyright 2006 Sun Microsystems, Inc.  All rights
43  * reserved.  Use is subject to license terms.
44  *
45  * This derivative differs from the original kmem slab allocator, in that:
46  *
47  *   a) There is always a discrete bufctl per object, even for small sizes.
48  *      This increases the overhead, but is necessary as Skywalk objects
49  *      coming from the slab may be shared (RO or RW) with userland; therefore
50  *      embedding the KVA pointer linkage in freed objects is a non-starter.
51  *
52  *   b) Writing patterns to the slab at slab creation or destruction time
53  *      (when debugging is enabled) is not implemented, as the object may
54  *      be shared (RW) with userland and thus we cannot panic upon pattern
55  *      mismatch episodes.  This can be relaxed so that we conditionally
56  *      verify the pattern for kernel-only memory.
57  *
58  * This derivative also differs from Darwin's mcache allocator (which itself
59  * is a derivative of the original kmem slab allocator), in that:
60  *
61  *   1) The slab layer is internal to skmem_cache, unlike mcache's external
62  *      slab layer required to support mbufs.  skmem_cache also supports
63  *      constructing and deconstructing objects, while mcache does not.
64  *      This brings skmem_cache's model closer to that of the original
65  *      kmem slab allocator.
66  *
67  *   2) mcache allows for batch allocation and free by way of chaining the
68  *      objects together using a linked list.  This requires using a part
69  *      of the object to act as the linkage, which is against Skywalk's
70  *      requirements of not exposing any KVA pointer to userland.  Although
71  *      this is supported by skmem_cache, chaining is only possible if the
72  *      region is not mapped to userland.  That implies that kernel-only
73  *      objects can be chained provided the cache is created with batching
74  *      mode enabled, and that the object is large enough to contain the
75  *      skmem_obj structure.
76  *
77  * In other words, skmem_cache is a hybrid of a hybrid custom allocator that
78  * implements features that are required by Skywalk.  In addition to being
79  * aware of userland access on the buffers, it also supports mirrored backend
80  * memory regions.  This allows a cache to manage two independent memory
81  * regions, such that allocating/freeing an object from/to one results in
82  * allocating/freeing a shadow object in another, thus guaranteeing that both
83  * objects share the same lifetime.
84  */
85 
86 static uint32_t ncpu;                   /* total # of initialized CPUs */
87 
88 static LCK_MTX_DECLARE_ATTR(skmem_cache_lock, &skmem_lock_grp, &skmem_lock_attr);
89 static struct thread *skmem_lock_owner = THREAD_NULL;
90 
91 static LCK_GRP_DECLARE(skmem_sl_lock_grp, "skmem_slab");
92 static LCK_GRP_DECLARE(skmem_dp_lock_grp, "skmem_depot");
93 static LCK_GRP_DECLARE(skmem_cpu_lock_grp, "skmem_cpu_cache");
94 
95 #define SKMEM_CACHE_LOCK() do {                 \
96 	lck_mtx_lock(&skmem_cache_lock);        \
97 	skmem_lock_owner = current_thread();    \
98 } while (0)
99 #define SKMEM_CACHE_UNLOCK() do {               \
100 	skmem_lock_owner = THREAD_NULL;         \
101 	lck_mtx_unlock(&skmem_cache_lock);      \
102 } while (0)
103 #define SKMEM_CACHE_LOCK_ASSERT_HELD()          \
104 	LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_OWNED)
105 #define SKMEM_CACHE_LOCK_ASSERT_NOTHELD()       \
106 	LCK_MTX_ASSERT(&skmem_cache_lock, LCK_MTX_ASSERT_NOTOWNED)
107 
108 #define SKM_DEPOT_LOCK(_skm)                    \
109 	lck_mtx_lock(&(_skm)->skm_dp_lock)
110 #define SKM_DEPOT_LOCK_SPIN(_skm)               \
111 	lck_mtx_lock_spin(&(_skm)->skm_dp_lock)
112 #define SKM_DEPOT_CONVERT_LOCK(_skm)            \
113 	lck_mtx_convert_spin(&(_skm)->skm_dp_lock)
114 #define SKM_DEPOT_LOCK_TRY(_skm)                \
115 	lck_mtx_try_lock(&(_skm)->skm_dp_lock)
116 #define SKM_DEPOT_LOCK_ASSERT_HELD(_skm)        \
117 	LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_OWNED)
118 #define SKM_DEPOT_LOCK_ASSERT_NOTHELD(_skm)     \
119 	LCK_MTX_ASSERT(&(_skm)->skm_dp_lock, LCK_MTX_ASSERT_NOTOWNED)
120 #define SKM_DEPOT_UNLOCK(_skm)                  \
121 	lck_mtx_unlock(&(_skm)->skm_dp_lock)
122 
123 #define SKM_RESIZE_LOCK(_skm)                   \
124 	lck_mtx_lock(&(_skm)->skm_rs_lock)
125 #define SKM_RESIZE_LOCK_ASSERT_HELD(_skm)       \
126 	LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_OWNED)
127 #define SKM_RESIZE_LOCK_ASSERT_NOTHELD(_skm)    \
128 	LCK_MTX_ASSERT(&(_skm)->skm_rs_lock, LCK_MTX_ASSERT_NOTOWNED)
129 #define SKM_RESIZE_UNLOCK(_skm)                 \
130 	lck_mtx_unlock(&(_skm)->skm_rs_lock)
131 
132 #define SKM_CPU_LOCK(_cp)                       \
133 	lck_mtx_lock(&(_cp)->cp_lock)
134 #define SKM_CPU_LOCK_SPIN(_cp)                  \
135 	lck_mtx_lock_spin(&(_cp)->cp_lock)
136 #define SKM_CPU_CONVERT_LOCK(_cp)               \
137 	lck_mtx_convert_spin(&(_cp)->cp_lock)
138 #define SKM_CPU_LOCK_ASSERT_HELD(_cp)           \
139 	LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_OWNED)
140 #define SKM_CPU_LOCK_ASSERT_NOTHELD(_cp)        \
141 	LCK_MTX_ASSERT(&(_cp)->cp_lock, LCK_MTX_ASSERT_NOTOWNED)
142 #define SKM_CPU_UNLOCK(_cp)                     \
143 	lck_mtx_unlock(&(_cp)->cp_lock)
144 
145 #define SKM_ZONE_MAX    256
146 
147 static struct zone *skm_zone;                   /* zone for skmem_cache */
148 /*
149  * XXX -fbounds-safety: Took out ZC_DESTRUCTIBLE flag because of static assert
150  * in ZONE_DEFINE_TYPE
151  */
152 ZONE_DECLARE(skm_zone, struct zone *);
153 
154 struct skmem_cache *skmem_slab_cache;    /* cache for skmem_slab */
155 struct skmem_cache *skmem_bufctl_cache;  /* cache for skmem_bufctl */
156 
157 unsigned int bc_size;                    /* size of bufctl */
158 
159 /*
160  * XXX: -fbounds-safety: we added objsize to skmem_cache_batch_alloc(), but this
161  * is only used by -fbounds-safety, so we use __unused if -fbounds-safety is
162  * disabled. The utility macro for that is SK_FB_ARG()
163  */
164 #if !__has_ptrcheck
165 #define SK_FB_ARG __unused
166 #else
167 #define SK_FB_ARG
168 #endif
169 
170 /*
171  * Magazine types (one per row.)
172  *
173  * The first column defines the number of objects that the magazine can hold.
174  * Using that number, we derive the effective number: the aggregate count of
175  * object pointers, plus 2 pointers (skmem_mag linkage + magazine type).
176  * This would result in an object size that is aligned on the CPU cache
177  * size boundary; the exception to this is the KASAN mode where the size
178  * would be larger due to the redzone regions.
179  *
180  * The second column defines the alignment of the magazine.  Because each
181  * magazine is used at the CPU-layer cache, we need to ensure there is no
182  * false sharing across the CPUs, and align the magazines to the maximum
183  * cache alignment size, for simplicity.  The value of 0 may be used to
184  * indicate natural pointer size alignment.
185  *
186  * The third column defines the starting magazine type for a given cache,
187  * determined at the cache's creation time based on its chunk size.
188  *
189  * The fourth column defines the magazine type limit for a given cache.
190  * Magazine resizing will only occur if the chunk size is less than this.
191  */
192 static struct skmem_magtype skmem_magtype[] = {
193 #if defined(__LP64__)
194 	{ .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 128, .mt_maxbuf = 512,
195 	  .mt_cache = NULL, .mt_cname = "" },
196 	{ .mt_magsize = 30, .mt_align = 0, .mt_minbuf = 96, .mt_maxbuf = 256,
197 	  .mt_cache = NULL, .mt_cname = "" },
198 	{ .mt_magsize = 46, .mt_align = 0, .mt_minbuf = 64, .mt_maxbuf = 128,
199 	  .mt_cache = NULL, .mt_cname = "" },
200 	{ .mt_magsize = 62, .mt_align = 0, .mt_minbuf = 32, .mt_maxbuf = 64,
201 	  .mt_cache = NULL, .mt_cname = "" },
202 	{ .mt_magsize = 94, .mt_align = 0, .mt_minbuf = 16, .mt_maxbuf = 32,
203 	  .mt_cache = NULL, .mt_cname = "" },
204 	{ .mt_magsize = 126, .mt_align = 0, .mt_minbuf = 8, .mt_maxbuf = 16,
205 	  .mt_cache = NULL, .mt_cname = "" },
206 	{ .mt_magsize = 142, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 8,
207 	  .mt_cache = NULL, .mt_cname = "" },
208 	{ .mt_magsize = 158, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
209 	  .mt_cache = NULL, .mt_cname = "" },
210 #else /* !__LP64__ */
211 	{ .mt_magsize = 14, .mt_align = 0, .mt_minbuf = 0, .mt_maxbuf = 0,
212 	  .mt_cache = NULL, .mt_cname = "" },
213 #endif /* !__LP64__ */
214 };
215 
216 /*
217  * Hash table bounds.  Start with the initial value, and rescale up to
218  * the specified limit.  Ideally we don't need a limit, but in practice
219  * this helps guard against runaways.  These values should be revisited
220  * in future and be adjusted as needed.
221  */
222 #define SKMEM_CACHE_HASH_INITIAL        64      /* initial hash table size */
223 #define SKMEM_CACHE_HASH_LIMIT          8192    /* hash table size limit */
224 
225 /*
226  * The last magazine type.
227  */
228 static struct skmem_magtype *skmem_cache_magsize_last;
229 
230 static TAILQ_HEAD(, skmem_cache) skmem_cache_head;
231 static boolean_t skmem_cache_ready;
232 static int skmem_magazine_ctor(struct skmem_obj_info *,
233     struct skmem_obj_info *, void *, uint32_t);
234 static void skmem_magazine_destroy(struct skmem_cache *, struct skmem_mag *,
235     int);
236 static uint32_t skmem_depot_batch_alloc(struct skmem_cache *,
237     struct skmem_maglist *, uint32_t *, struct skmem_mag *__bidi_indexable *, uint32_t);
238 static void skmem_depot_batch_free(struct skmem_cache *, struct skmem_maglist *,
239     uint32_t *, struct skmem_mag *);
240 static void skmem_depot_ws_update(struct skmem_cache *);
241 static void skmem_depot_ws_zero(struct skmem_cache *);
242 static void skmem_depot_ws_reap(struct skmem_cache *);
243 #define SKMEM_CACHE_FREE_NOCACHE    0x1
244 static void skmem_cache_batch_free_common(struct skmem_cache *, struct skmem_obj *, uint32_t);
245 static void skmem_cache_magazine_purge(struct skmem_cache *);
246 static void skmem_cache_magazine_enable(struct skmem_cache *, uint32_t);
247 static void skmem_cache_magazine_resize(struct skmem_cache *);
248 static void skmem_cache_hash_rescale(struct skmem_cache *);
249 static void skmem_cpu_reload(struct skmem_cpu_cache *, struct skmem_mag *, int);
250 static void skmem_cpu_batch_reload(struct skmem_cpu_cache *,
251     struct skmem_mag *, int);
252 static void skmem_cache_applyall(void (*)(struct skmem_cache *, uint32_t),
253     uint32_t);
254 static void skmem_cache_reclaim(struct skmem_cache *, uint32_t);
255 static void skmem_cache_reap_start(void);
256 static void skmem_cache_reap_done(void);
257 static void skmem_cache_reap_func(thread_call_param_t, thread_call_param_t);
258 static void skmem_cache_update_func(thread_call_param_t, thread_call_param_t);
259 static int skmem_cache_resize_enter(struct skmem_cache *, boolean_t);
260 static void skmem_cache_resize_exit(struct skmem_cache *);
261 static void skmem_audit_buf(struct skmem_cache *, struct skmem_obj *);
262 static int skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS;
263 
264 SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, cache,
265     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
266     0, 0, skmem_cache_mib_get_sysctl, "S,sk_stats_cache",
267     "Skywalk cache statistics");
268 
269 static volatile uint32_t skmem_cache_reaping;
270 static thread_call_t skmem_cache_reap_tc;
271 static thread_call_t skmem_cache_update_tc;
272 
273 extern kern_return_t thread_terminate(thread_t);
274 extern unsigned int ml_wait_max_cpus(void);
275 
276 #define SKMEM_DEBUG_NOMAGAZINES 0x1     /* disable magazines layer */
277 #define SKMEM_DEBUG_AUDIT       0x2     /* audit transactions */
278 #define SKMEM_DEBUG_MASK        (SKMEM_DEBUG_NOMAGAZINES|SKMEM_DEBUG_AUDIT)
279 
280 #if DEBUG
281 static uint32_t skmem_debug = SKMEM_DEBUG_AUDIT;
282 #else /* !DEBUG */
283 static uint32_t skmem_debug = 0;
284 #endif /* !DEBUG */
285 
286 static uint32_t skmem_clear_min = 0;    /* clear on free threshold */
287 
288 #define SKMEM_CACHE_UPDATE_INTERVAL     11      /* 11 seconds */
289 static uint32_t skmem_cache_update_interval = SKMEM_CACHE_UPDATE_INTERVAL;
290 
291 #define SKMEM_DEPOT_CONTENTION  3       /* max failed trylock per interval */
292 static int skmem_cache_depot_contention = SKMEM_DEPOT_CONTENTION;
293 
294 #if (DEVELOPMENT || DEBUG)
295 SYSCTL_UINT(_kern_skywalk_mem, OID_AUTO, cache_update_interval,
296     CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_update_interval,
297     SKMEM_CACHE_UPDATE_INTERVAL, "Cache update interval");
298 SYSCTL_INT(_kern_skywalk_mem, OID_AUTO, cache_depot_contention,
299     CTLFLAG_RW | CTLFLAG_LOCKED, &skmem_cache_depot_contention,
300     SKMEM_DEPOT_CONTENTION, "Depot contention");
301 
302 static uint32_t skmem_cache_update_interval_saved = SKMEM_CACHE_UPDATE_INTERVAL;
303 
304 /*
305  * Called by skmem_test_start() to set the update interval.
306  */
307 void
skmem_cache_test_start(uint32_t i)308 skmem_cache_test_start(uint32_t i)
309 {
310 	skmem_cache_update_interval_saved = skmem_cache_update_interval;
311 	skmem_cache_update_interval = i;
312 }
313 
314 /*
315  * Called by skmem_test_stop() to restore the update interval.
316  */
317 void
skmem_cache_test_stop(void)318 skmem_cache_test_stop(void)
319 {
320 	skmem_cache_update_interval = skmem_cache_update_interval_saved;
321 }
322 #endif /* (DEVELOPMENT || DEBUG) */
323 
324 #define SKMEM_TAG_BUFCTL_HASH   "com.apple.skywalk.bufctl.hash"
325 static SKMEM_TAG_DEFINE(skmem_tag_bufctl_hash, SKMEM_TAG_BUFCTL_HASH);
326 
327 #define SKMEM_TAG_CACHE_MIB     "com.apple.skywalk.cache.mib"
328 static SKMEM_TAG_DEFINE(skmem_tag_cache_mib, SKMEM_TAG_CACHE_MIB);
329 
330 static int __skmem_cache_pre_inited = 0;
331 static int __skmem_cache_inited = 0;
332 
333 /*
334  * Called before skmem_region_init().
335  */
336 void
skmem_cache_pre_init(void)337 skmem_cache_pre_init(void)
338 {
339 	vm_size_t skm_size;
340 
341 	ASSERT(!__skmem_cache_pre_inited);
342 
343 	ncpu = ml_wait_max_cpus();
344 
345 	/* allocate extra in case we need to manually align the pointer */
346 	if (skm_zone == NULL) {
347 		skm_size = SKMEM_CACHE_SIZE(ncpu);
348 #if KASAN
349 		/*
350 		 * When KASAN is enabled, the zone allocator adjusts the
351 		 * element size to include the redzone regions, in which
352 		 * case we assume that the elements won't start on the
353 		 * alignment boundary and thus need to do some fix-ups.
354 		 * These include increasing the effective object size
355 		 * which adds at least 136 bytes to the original size,
356 		 * as computed by skmem_region_params_config() above.
357 		 */
358 		skm_size += (sizeof(void *) + CHANNEL_CACHE_ALIGN_MAX);
359 #endif /* KASAN */
360 		skm_size = P2ROUNDUP(skm_size, CHANNEL_CACHE_ALIGN_MAX);
361 		skm_zone = zone_create(SKMEM_ZONE_PREFIX ".skm", skm_size,
362 		    ZC_ZFREE_CLEARMEM | ZC_DESTRUCTIBLE);
363 	}
364 
365 	TAILQ_INIT(&skmem_cache_head);
366 
367 	__skmem_cache_pre_inited = 1;
368 }
369 
370 /*
371  * Called after skmem_region_init().
372  */
373 void
skmem_cache_init(void)374 skmem_cache_init(void)
375 {
376 	uint32_t cpu_cache_line_size = skmem_cpu_cache_line_size();
377 	struct skmem_magtype *mtp;
378 	uint32_t i;
379 
380 	static_assert(SKMEM_CACHE_HASH_LIMIT >= SKMEM_CACHE_HASH_INITIAL);
381 
382 	static_assert(SKM_MODE_NOMAGAZINES == SCA_MODE_NOMAGAZINES);
383 	static_assert(SKM_MODE_AUDIT == SCA_MODE_AUDIT);
384 	static_assert(SKM_MODE_NOREDIRECT == SCA_MODE_NOREDIRECT);
385 	static_assert(SKM_MODE_BATCH == SCA_MODE_BATCH);
386 	static_assert(SKM_MODE_DYNAMIC == SCA_MODE_DYNAMIC);
387 	static_assert(SKM_MODE_CLEARONFREE == SCA_MODE_CLEARONFREE);
388 	static_assert(SKM_MODE_PSEUDO == SCA_MODE_PSEUDO);
389 
390 	ASSERT(__skmem_cache_pre_inited);
391 	ASSERT(!__skmem_cache_inited);
392 
393 	static_assert(offsetof(struct skmem_bufctl, bc_addr) == offsetof(struct skmem_bufctl_audit, bc_addr));
394 	static_assert(offsetof(struct skmem_bufctl, bc_addrm) == offsetof(struct skmem_bufctl_audit, bc_addrm));
395 	static_assert(offsetof(struct skmem_bufctl, bc_slab) == offsetof(struct skmem_bufctl_audit, bc_slab));
396 	static_assert(offsetof(struct skmem_bufctl, bc_lim) == offsetof(struct skmem_bufctl_audit, bc_lim));
397 	static_assert(offsetof(struct skmem_bufctl, bc_flags) == offsetof(struct skmem_bufctl_audit, bc_flags));
398 	static_assert(offsetof(struct skmem_bufctl, bc_idx) == offsetof(struct skmem_bufctl_audit, bc_idx));
399 	static_assert(offsetof(struct skmem_bufctl, bc_usecnt) == offsetof(struct skmem_bufctl_audit, bc_usecnt));
400 	static_assert(sizeof(struct skmem_bufctl) == offsetof(struct skmem_bufctl_audit, bc_thread));
401 
402 	PE_parse_boot_argn("skmem_debug", &skmem_debug, sizeof(skmem_debug));
403 	skmem_debug &= SKMEM_DEBUG_MASK;
404 
405 #if (DEVELOPMENT || DEBUG)
406 	PE_parse_boot_argn("skmem_clear_min", &skmem_clear_min,
407 	    sizeof(skmem_clear_min));
408 #endif /* (DEVELOPMENT || DEBUG) */
409 	if (skmem_clear_min == 0) {
410 		/* zeroing 2 CPU cache lines practically comes for free */
411 		skmem_clear_min = 2 * cpu_cache_line_size;
412 	} else {
413 		/* round it up to CPU cache line size */
414 		skmem_clear_min = (uint32_t)P2ROUNDUP(skmem_clear_min,
415 		    cpu_cache_line_size);
416 	}
417 
418 	/* create a cache for buffer control structures */
419 	if (skmem_debug & SKMEM_DEBUG_AUDIT) {
420 		bc_size = sizeof(struct skmem_bufctl_audit);
421 		skmem_bufctl_cache = skmem_cache_create("bufctl.audit",
422 		    bc_size, sizeof(uint64_t), NULL, NULL,
423 		    NULL, NULL, NULL, 0);
424 	} else {
425 		bc_size = sizeof(struct skmem_bufctl);
426 		skmem_bufctl_cache = skmem_cache_create("bufctl",
427 		    bc_size, sizeof(uint64_t), NULL, NULL,
428 		    NULL, NULL, NULL, 0);
429 	}
430 
431 	/* create a cache for slab structures */
432 	skmem_slab_cache = skmem_cache_create("slab",
433 	    sizeof(struct skmem_slab), sizeof(uint64_t), NULL, NULL, NULL,
434 	    NULL, NULL, 0);
435 
436 	/*
437 	 * Go thru the magazine type table and create a cache for each.
438 	 */
439 	for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
440 		const char *__null_terminated mt_cname = NULL;
441 		mtp = &skmem_magtype[i];
442 
443 		if (mtp->mt_align != 0 &&
444 		    ((mtp->mt_align & (mtp->mt_align - 1)) != 0 ||
445 		    mtp->mt_align < (int)cpu_cache_line_size)) {
446 			panic("%s: bad alignment %d", __func__, mtp->mt_align);
447 			/* NOTREACHED */
448 			__builtin_unreachable();
449 		}
450 		mt_cname = tsnprintf(mtp->mt_cname, sizeof(mtp->mt_cname),
451 		    "mg.%d", mtp->mt_magsize);
452 
453 		/* create a cache for this magazine type */
454 		mtp->mt_cache = skmem_cache_create(mt_cname,
455 		    SKMEM_MAG_SIZE(mtp->mt_magsize), mtp->mt_align,
456 		    skmem_magazine_ctor, NULL, NULL, mtp, NULL, 0);
457 
458 		/* remember the last magazine type */
459 		skmem_cache_magsize_last = mtp;
460 	}
461 
462 	VERIFY(skmem_cache_magsize_last != NULL);
463 	VERIFY(skmem_cache_magsize_last->mt_minbuf == 0);
464 	VERIFY(skmem_cache_magsize_last->mt_maxbuf == 0);
465 
466 	/*
467 	 * Allocate thread calls for cache reap and update operations.
468 	 */
469 	skmem_cache_reap_tc =
470 	    thread_call_allocate_with_options(skmem_cache_reap_func,
471 	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
472 	skmem_cache_update_tc =
473 	    thread_call_allocate_with_options(skmem_cache_update_func,
474 	    NULL, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
475 	if (skmem_cache_reap_tc == NULL || skmem_cache_update_tc == NULL) {
476 		panic("%s: thread_call_allocate failed", __func__);
477 		/* NOTREACHED */
478 		__builtin_unreachable();
479 	}
480 
481 	/*
482 	 * We're ready; go through existing skmem_cache entries
483 	 * (if any) and enable the magazines layer for each.
484 	 */
485 	skmem_cache_applyall(skmem_cache_magazine_enable, 0);
486 	skmem_cache_ready = TRUE;
487 
488 	/* and start the periodic cache update machinery */
489 	skmem_dispatch(skmem_cache_update_tc, NULL,
490 	    (skmem_cache_update_interval * NSEC_PER_SEC));
491 
492 	__skmem_cache_inited = 1;
493 }
494 
495 void
skmem_cache_fini(void)496 skmem_cache_fini(void)
497 {
498 	struct skmem_magtype *mtp;
499 	uint32_t i;
500 
501 	if (__skmem_cache_inited) {
502 		ASSERT(TAILQ_EMPTY(&skmem_cache_head));
503 
504 		for (i = 0; i < sizeof(skmem_magtype) / sizeof(*mtp); i++) {
505 			mtp = &skmem_magtype[i];
506 			skmem_cache_destroy(mtp->mt_cache);
507 			mtp->mt_cache = NULL;
508 		}
509 		skmem_cache_destroy(skmem_slab_cache);
510 		skmem_slab_cache = NULL;
511 		skmem_cache_destroy(skmem_bufctl_cache);
512 		skmem_bufctl_cache = NULL;
513 
514 		if (skmem_cache_reap_tc != NULL) {
515 			(void) thread_call_cancel_wait(skmem_cache_reap_tc);
516 			(void) thread_call_free(skmem_cache_reap_tc);
517 			skmem_cache_reap_tc = NULL;
518 		}
519 		if (skmem_cache_update_tc != NULL) {
520 			(void) thread_call_cancel_wait(skmem_cache_update_tc);
521 			(void) thread_call_free(skmem_cache_update_tc);
522 			skmem_cache_update_tc = NULL;
523 		}
524 
525 		__skmem_cache_inited = 0;
526 	}
527 
528 	if (__skmem_cache_pre_inited) {
529 		if (skm_zone != NULL) {
530 			zdestroy(skm_zone);
531 			skm_zone = NULL;
532 		}
533 
534 		__skmem_cache_pre_inited = 0;
535 	}
536 }
537 
538 /*
539  * Create a cache.
540  */
541 struct skmem_cache *
skmem_cache_create(const char * name,size_t bufsize,size_t bufalign,skmem_ctor_fn_t ctor,skmem_dtor_fn_t dtor,skmem_reclaim_fn_t reclaim,void * private,struct skmem_region * region,uint32_t cflags)542 skmem_cache_create(const char *name, size_t bufsize, size_t bufalign,
543     skmem_ctor_fn_t ctor, skmem_dtor_fn_t dtor, skmem_reclaim_fn_t reclaim,
544     void *private, struct skmem_region *region, uint32_t cflags)
545 {
546 	boolean_t pseudo = (region == NULL);
547 	struct skmem_magtype *mtp;
548 	struct skmem_cache *__single skm;
549 #if KASAN
550 	void *buf;
551 	size_t skm_align_off;
552 #endif
553 	size_t segsize;
554 	size_t chunksize;
555 	size_t objsize;
556 	size_t objalign;
557 	uint32_t i, cpuid;
558 
559 	/* enforce 64-bit minimum alignment for buffers */
560 	if (bufalign == 0) {
561 		bufalign = SKMEM_CACHE_ALIGN;
562 	}
563 	bufalign = P2ROUNDUP(bufalign, SKMEM_CACHE_ALIGN);
564 
565 	/* enforce alignment to be a power of 2 */
566 	VERIFY(powerof2(bufalign));
567 
568 	if (region == NULL) {
569 		struct skmem_region_params srp = {};
570 
571 		/* batching is currently not supported on pseudo regions */
572 		VERIFY(!(cflags & SKMEM_CR_BATCH));
573 
574 		srp = *skmem_get_default(SKMEM_REGION_INTRINSIC);
575 		ASSERT(srp.srp_cflags == SKMEM_REGION_CR_PSEUDO);
576 
577 		/* objalign is always equal to bufalign */
578 		srp.srp_align = objalign = bufalign;
579 		srp.srp_r_obj_cnt = 1;
580 		srp.srp_r_obj_size = (uint32_t)bufsize;
581 		skmem_region_params_config(&srp);
582 
583 		/* allocate region for intrinsics */
584 		region = skmem_region_create(name, &srp, NULL, NULL, NULL);
585 		VERIFY(region->skr_c_obj_size >= P2ROUNDUP(bufsize, bufalign));
586 		VERIFY(objalign == region->skr_align);
587 #if KASAN
588 		/*
589 		 * When KASAN is enabled, the zone allocator adjusts the
590 		 * element size to include the redzone regions, in which
591 		 * case we assume that the elements won't start on the
592 		 * alignment boundary and thus need to do some fix-ups.
593 		 * These include increasing the effective object size
594 		 * which adds at least 16 bytes to the original size,
595 		 * as computed by skmem_region_params_config() above.
596 		 */
597 		VERIFY(region->skr_c_obj_size >=
598 		    (bufsize + sizeof(uint64_t) + bufalign));
599 #endif /* KASAN */
600 		/* enable magazine resizing by default */
601 		cflags |= SKMEM_CR_DYNAMIC;
602 
603 		/*
604 		 * For consistency with ZC_ZFREE_CLEARMEM on skr->zreg,
605 		 * even though it's a no-op since the work is done
606 		 * at the zone layer instead.
607 		 */
608 		cflags |= SKMEM_CR_CLEARONFREE;
609 	} else {
610 		objalign = region->skr_align;
611 	}
612 
613 	ASSERT(region != NULL);
614 	ASSERT(!(region->skr_mode & SKR_MODE_MIRRORED));
615 	segsize = region->skr_seg_size;
616 	ASSERT(bufalign <= segsize);
617 
618 #if KASAN
619 	buf = zalloc_flags_buf(skm_zone, Z_WAITOK | Z_ZERO);
620 	/*
621 	 * We need to align `buf` such that offsetof(struct skmem_cache, skm_align)
622 	 * is aligned to a cache line boundary. In KASAN builds, allocations are
623 	 * preceded by metadata that changes the alignment of the object. The
624 	 * extra required size is accounted for at the time skm_zone is created.
625 	 * We then save the actual start of the allocation to skm_start, as it's
626 	 * the address we need to actually free.
627 	 */
628 	skm_align_off = offsetof(struct skmem_cache, skm_align);
629 	uintptr_t diff = P2ROUNDUP((intptr_t)buf + skm_align_off,
630 	    CHANNEL_CACHE_ALIGN_MAX) - (intptr_t)buf;
631 	skm = (void *)((char *)buf + diff);
632 	skm->skm_start = buf;
633 #else /* !KASAN */
634 	/*
635 	 * We expect that the zone allocator would allocate elements
636 	 * rounded up to the requested alignment based on the object
637 	 * size computed in skmem_cache_pre_init() earlier, and
638 	 * 'skm' is therefore the element address itself.
639 	 */
640 	skm = zalloc_flags_buf(skm_zone, Z_WAITOK | Z_ZERO);
641 #endif /* !KASAN */
642 	skm->skm_cpu_cache_count = ncpu;
643 
644 	VERIFY(IS_P2ALIGNED(skm, CHANNEL_CACHE_ALIGN_MAX));
645 
646 	if ((skmem_debug & SKMEM_DEBUG_NOMAGAZINES) ||
647 	    (cflags & SKMEM_CR_NOMAGAZINES)) {
648 		/*
649 		 * Either the caller insists that this cache should not
650 		 * utilize magazines layer, or that the system override
651 		 * to disable magazines layer on all caches has been set.
652 		 */
653 		skm->skm_mode |= SKM_MODE_NOMAGAZINES;
654 	} else {
655 		/*
656 		 * Region must be configured with enough objects
657 		 * to take into account objects at the CPU layer.
658 		 */
659 		ASSERT(!(region->skr_mode & SKR_MODE_NOMAGAZINES));
660 	}
661 
662 	if (cflags & SKMEM_CR_DYNAMIC) {
663 		/*
664 		 * Enable per-CPU cache magazine resizing.
665 		 */
666 		skm->skm_mode |= SKM_MODE_DYNAMIC;
667 	}
668 
669 	/* region stays around after defunct? */
670 	if (region->skr_mode & SKR_MODE_NOREDIRECT) {
671 		skm->skm_mode |= SKM_MODE_NOREDIRECT;
672 	}
673 
674 	if (cflags & SKMEM_CR_BATCH) {
675 		/*
676 		 * Batch alloc/free involves storing the next object
677 		 * pointer at the beginning of each object; this is
678 		 * okay for kernel-only regions, but not those that
679 		 * are mappable to user space (we can't leak kernel
680 		 * addresses).
681 		 */
682 		static_assert(offsetof(struct skmem_obj, mo_next) == 0);
683 		VERIFY(!(region->skr_mode & SKR_MODE_MMAPOK));
684 
685 		/* batching is currently not supported on pseudo regions */
686 		VERIFY(!(region->skr_mode & SKR_MODE_PSEUDO));
687 
688 		/* validate object size */
689 		VERIFY(region->skr_c_obj_size >= sizeof(struct skmem_obj));
690 
691 		skm->skm_mode |= SKM_MODE_BATCH;
692 	}
693 
694 	uuid_generate_random(skm->skm_uuid);
695 	(void) snprintf(skm->skm_name, sizeof(skm->skm_name),
696 	    "%s.%s", SKMEM_CACHE_PREFIX, name);
697 	skm->skm_bufsize = bufsize;
698 	skm->skm_bufalign = bufalign;
699 	skm->skm_objalign = objalign;
700 	skm->skm_ctor = ctor;
701 	skm->skm_dtor = dtor;
702 	skm->skm_reclaim = reclaim;
703 	skm->skm_private = private;
704 	skm->skm_slabsize = segsize;
705 
706 	skm->skm_region = region;
707 	/* callee holds reference */
708 	skmem_region_slab_config(region, skm, true);
709 	objsize = region->skr_c_obj_size;
710 	skm->skm_objsize = objsize;
711 
712 	if (pseudo) {
713 		/*
714 		 * Release reference from skmem_region_create()
715 		 * since skm->skm_region holds one now.
716 		 */
717 		ASSERT(region->skr_mode & SKR_MODE_PSEUDO);
718 		skmem_region_release(region);
719 
720 		skm->skm_mode |= SKM_MODE_PSEUDO;
721 
722 		skm->skm_slab_alloc = skmem_slab_alloc_pseudo_locked;
723 		skm->skm_slab_free = skmem_slab_free_pseudo_locked;
724 	} else {
725 		skm->skm_slab_alloc = skmem_slab_alloc_locked;
726 		skm->skm_slab_free = skmem_slab_free_locked;
727 
728 		/* auditing was requested? (normal regions only) */
729 		if (skmem_debug & SKMEM_DEBUG_AUDIT) {
730 			ASSERT(bc_size == sizeof(struct skmem_bufctl_audit));
731 			skm->skm_mode |= SKM_MODE_AUDIT;
732 		}
733 	}
734 
735 	/*
736 	 * Clear upon free (to slab layer) as long as the region is
737 	 * not marked as read-only for kernel, and if the chunk size
738 	 * is within the threshold or if the caller had requested it.
739 	 */
740 	if (!(region->skr_mode & SKR_MODE_KREADONLY)) {
741 		if (skm->skm_objsize <= skmem_clear_min ||
742 		    (cflags & SKMEM_CR_CLEARONFREE)) {
743 			skm->skm_mode |= SKM_MODE_CLEARONFREE;
744 		}
745 	}
746 
747 	chunksize = bufsize;
748 	if (bufalign >= SKMEM_CACHE_ALIGN) {
749 		chunksize = P2ROUNDUP(chunksize, SKMEM_CACHE_ALIGN);
750 	}
751 
752 	chunksize = P2ROUNDUP(chunksize, bufalign);
753 	if (chunksize > objsize) {
754 		panic("%s: (bufsize %lu, chunksize %lu) > objsize %lu",
755 		    __func__, bufsize, chunksize, objsize);
756 		/* NOTREACHED */
757 		__builtin_unreachable();
758 	}
759 	ASSERT(chunksize != 0);
760 	skm->skm_chunksize = chunksize;
761 
762 	lck_mtx_init(&skm->skm_sl_lock, &skmem_sl_lock_grp, &skmem_lock_attr);
763 	TAILQ_INIT(&skm->skm_sl_partial_list);
764 	TAILQ_INIT(&skm->skm_sl_empty_list);
765 
766 	/* allocated-address hash table */
767 	skm->skm_hash_initial = SKMEM_CACHE_HASH_INITIAL;
768 	skm->skm_hash_limit = SKMEM_CACHE_HASH_LIMIT;
769 	skm->skm_hash_table = sk_alloc_type_array(struct skmem_bufctl_bkt,
770 	    skm->skm_hash_initial, Z_WAITOK | Z_NOFAIL, skmem_tag_bufctl_hash);
771 	skm->skm_hash_size = skm->skm_hash_initial;
772 
773 	skm->skm_hash_mask = (skm->skm_hash_initial - 1);
774 	skm->skm_hash_shift = flsll(chunksize) - 1;
775 
776 	for (i = 0; i < (skm->skm_hash_mask + 1); i++) {
777 		SLIST_INIT(&skm->skm_hash_table[i].bcb_head);
778 	}
779 
780 	lck_mtx_init(&skm->skm_dp_lock, &skmem_dp_lock_grp, &skmem_lock_attr);
781 
782 	/* find a suitable magazine type for this chunk size */
783 	for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
784 		continue;
785 	}
786 
787 	skm->skm_magtype = mtp;
788 	if (!(skm->skm_mode & SKM_MODE_NOMAGAZINES)) {
789 		skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
790 	}
791 
792 	/*
793 	 * Initialize the CPU layer.  Each per-CPU structure is aligned
794 	 * on the CPU cache line boundary to prevent false sharing.
795 	 */
796 	lck_mtx_init(&skm->skm_rs_lock, &skmem_cpu_lock_grp, &skmem_lock_attr);
797 	for (cpuid = 0; cpuid < ncpu; cpuid++) {
798 		struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];
799 
800 		VERIFY(IS_P2ALIGNED(ccp, CHANNEL_CACHE_ALIGN_MAX));
801 		lck_mtx_init(&ccp->cp_lock, &skmem_cpu_lock_grp,
802 		    &skmem_lock_attr);
803 		ccp->cp_rounds = -1;
804 		ccp->cp_prounds = -1;
805 	}
806 
807 	SKMEM_CACHE_LOCK();
808 	TAILQ_INSERT_TAIL(&skmem_cache_head, skm, skm_link);
809 	SKMEM_CACHE_UNLOCK();
810 
811 	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm %p mode 0x%x",
812 	    skm->skm_name, SK_KVA(skm), skm->skm_mode);
813 	SK_DF(SK_VERB_MEM_CACHE,
814 	    "  bufsz %u bufalign %u chunksz %u objsz %u slabsz %u",
815 	    (uint32_t)skm->skm_bufsize, (uint32_t)skm->skm_bufalign,
816 	    (uint32_t)skm->skm_chunksize, (uint32_t)skm->skm_objsize,
817 	    (uint32_t)skm->skm_slabsize);
818 
819 	if (skmem_cache_ready) {
820 		skmem_cache_magazine_enable(skm, 0);
821 	}
822 
823 	if (cflags & SKMEM_CR_RECLAIM) {
824 		skm->skm_mode |= SKM_MODE_RECLAIM;
825 	}
826 
827 	return skm;
828 }
829 
830 /*
831  * Destroy a cache.
832  */
void
skmem_cache_destroy(struct skmem_cache *skm)
{
	uint32_t cpuid;

	/* unlink this cache from the global list of caches */
	SKMEM_CACHE_LOCK();
	TAILQ_REMOVE(&skmem_cache_head, skm, skm_link);
	SKMEM_CACHE_UNLOCK();

	/* no resizer activity may be in flight or pending at this point */
	ASSERT(skm->skm_rs_busy == 0);
	ASSERT(skm->skm_rs_want == 0);

	/* purge all cached objects for this cache */
	skmem_cache_magazine_purge(skm);

	/*
	 * Panic if we detect there are unfreed objects; the caller
	 * destroying this cache is responsible for ensuring that all
	 * allocated objects have been freed prior to getting here.
	 */
	SKM_SLAB_LOCK(skm);
	if (skm->skm_sl_bufinuse != 0) {
		panic("%s: '%s' (%p) not empty (%llu unfreed)", __func__,
		    skm->skm_name, (void *)skm, skm->skm_sl_bufinuse);
		/* NOTREACHED */
		__builtin_unreachable();
	}
	/* with no objects outstanding, the slab lists must be drained */
	ASSERT(TAILQ_EMPTY(&skm->skm_sl_partial_list));
	ASSERT(skm->skm_sl_partial == 0);
	ASSERT(TAILQ_EMPTY(&skm->skm_sl_empty_list));
	ASSERT(skm->skm_sl_empty == 0);
	/* clear callbacks so nothing fires during the remaining teardown */
	skm->skm_reclaim = NULL;
	skm->skm_ctor = NULL;
	skm->skm_dtor = NULL;
	SKM_SLAB_UNLOCK(skm);

	/* release the allocated-address hash table, verifying it is empty */
	if (skm->skm_hash_table != NULL) {
#if (DEBUG || DEVELOPMENT)
		for (uint32_t i = 0; i < (skm->skm_hash_mask + 1); i++) {
			ASSERT(SLIST_EMPTY(&skm->skm_hash_table[i].bcb_head));
		}
#endif /* DEBUG || DEVELOPMENT */

		/* XXX -fbounds-safety: __counted_by pointer (skm_hash_table)
		 * cannot be pointed to by any other variable */
		struct skmem_bufctl_bkt *__indexable htable = skm->skm_hash_table;
		sk_free_type_array(struct skmem_bufctl_bkt,
		    skm->skm_hash_size, htable);
		skm->skm_hash_table = NULL;
		htable = NULL;
		skm->skm_hash_size = 0;
	}

	/* tear down all per-CPU locks, then the remaining cache locks */
	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		lck_mtx_destroy(&skm->skm_cpu_cache[cpuid].cp_lock,
		    &skmem_cpu_lock_grp);
	}
	lck_mtx_destroy(&skm->skm_rs_lock, &skmem_cpu_lock_grp);
	lck_mtx_destroy(&skm->skm_dp_lock, &skmem_dp_lock_grp);
	lck_mtx_destroy(&skm->skm_sl_lock, &skmem_sl_lock_grp);

	SK_DF(SK_VERB_MEM_CACHE, "\"%s\": skm %p",
	    skm->skm_name, SK_KVA(skm));

	/* callee releases reference */
	skmem_region_slab_config(skm->skm_region, skm, false);
	skm->skm_region = NULL;

#if KASAN
	/* get the original address since we're about to free it */
	zfree(skm_zone, skm->skm_start);
#else
	zfree(skm_zone, skm);
#endif /* KASAN */
}
908 
909 /*
910  * Return the object's region info.
911  */
/*
 * skm: cache that owns "buf"
 * buf: object address previously handed out by this cache
 * oi:  filled in with the master object's info
 * oim: if non-NULL, filled in with the slave object's info; zeroed
 *      when the buffer has no slave address (bc_addrm is NULL)
 */
void
skmem_cache_get_obj_info(struct skmem_cache *skm, void *buf,
    struct skmem_obj_info *oi, struct skmem_obj_info *oim)
{
	struct skmem_bufctl_bkt *bcb;
	struct skmem_bufctl *bc;
	struct skmem_slab *sl;

	/*
	 * Search the hash chain to find a matching buffer control for the
	 * given object address.  If not found, panic since the caller has
	 * given us a bogus address.
	 */
	SKM_SLAB_LOCK(skm);
	bcb = SKMEM_CACHE_HASH(skm, buf);
	SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
		if (bc->bc_addr == buf) {
			break;
		}
	}

	if (__improbable(bc == NULL)) {
		panic("%s: %s failed to get object info for %p",
		    __func__, skm->skm_name, buf);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * Return the master object's info to the caller.
	 */
	sl = bc->bc_slab;
	SKMEM_OBJ_ADDR(oi) = __unsafe_forge_bidi_indexable(void *, bc->bc_addr,
	    (uint32_t)skm->skm_objsize);
	SKMEM_OBJ_SIZE(oi) = (uint32_t)skm->skm_objsize;
	ASSERT(skm->skm_objsize <= UINT32_MAX);
	SKMEM_OBJ_BUFCTL(oi) = bc;      /* master only; NULL for slave */
	/* region-wide index derived from the slab's segment and chunk slot */
	SKMEM_OBJ_IDX_REG(oi) =
	    (sl->sl_seg->sg_index * sl->sl_chunks) + bc->bc_idx;
	SKMEM_OBJ_IDX_SEG(oi) = bc->bc_idx;
	/*
	 * And for slave object.  It shares the master's size and indices,
	 * differing only in its address.
	 */
	if (oim != NULL) {
		bzero(oim, sizeof(*oim));
		if (bc->bc_addrm != NULL) {
			SKMEM_OBJ_ADDR(oim) = __unsafe_forge_bidi_indexable(
				void *, bc->bc_addrm, oi->oi_size);
			SKMEM_OBJ_SIZE(oim) = oi->oi_size;
			SKMEM_OBJ_IDX_REG(oim) = oi->oi_idx_reg;
			SKMEM_OBJ_IDX_SEG(oim) = oi->oi_idx_seg;
		}
	}
	SKM_SLAB_UNLOCK(skm);
}
967 
968 /*
969  * Magazine constructor.
970  */
971 static int
skmem_magazine_ctor(struct skmem_obj_info * oi,struct skmem_obj_info * oim,void * arg,uint32_t skmflag)972 skmem_magazine_ctor(struct skmem_obj_info *oi, struct skmem_obj_info *oim,
973     void *arg, uint32_t skmflag)
974 {
975 #pragma unused(oim, skmflag)
976 	struct skmem_mag *__single mg = SKMEM_OBJ_ADDR(oi);
977 
978 	ASSERT(oim == NULL);
979 	ASSERT(arg != NULL);
980 
981 	/*
982 	 * Store it in the magazine object since we'll
983 	 * need to refer to it during magazine destroy;
984 	 * we can't safely refer to skm_magtype as the
985 	 * depot lock may not be acquired then.
986 	 */
987 	mg->mg_magtype = arg;
988 
989 	return 0;
990 }
991 
992 /*
993  * Destroy a magazine (free each object to the slab layer).
994  */
/*
 * skm:     cache whose objects are held in the magazine
 * mg:      magazine to destroy
 * nrounds: number of valid rounds in the magazine (0 for an empty one)
 */
static void
skmem_magazine_destroy(struct skmem_cache *skm, struct skmem_mag *mg,
    int nrounds)
{
	int round;

	for (round = 0; round < nrounds; round++) {
		void *__single buf = mg->mg_round[round];
		struct skmem_obj *next;

		/* temporarily unlink so the destructor sees a lone object */
		if (skm->skm_mode & SKM_MODE_BATCH) {
			next = ((struct skmem_obj *)buf)->mo_next;
			((struct skmem_obj *)buf)->mo_next = NULL;
		}

		/* deconstruct the object */
		if (skm->skm_dtor != NULL) {
			skm->skm_dtor(buf, skm->skm_private);
		}

		/*
		 * In non-batching mode, each object in the magazine has
		 * no linkage to its neighbor, so free individual object
		 * to the slab layer now.
		 */
		if (!(skm->skm_mode & SKM_MODE_BATCH)) {
			skmem_slab_free(skm, buf);
		} else {
			/* restore linkage for the batched free below */
			((struct skmem_obj *)buf)->mo_next = next;
		}
	}

	/*
	 * In batching mode, each object is linked to its neighbor at free
	 * time, and so take the bottom-most object and free it to the slab
	 * layer.  Because of the way the list is reversed during free, this
	 * will bring along the rest of objects above it.
	 */
	if (nrounds > 0 && (skm->skm_mode & SKM_MODE_BATCH)) {
		skmem_slab_batch_free(skm, mg->mg_round[nrounds - 1]);
	}

	/* free the magazine itself back to cache */
	skmem_cache_free(mg->mg_magtype->mt_cache, mg);
}
1040 
1041 /*
1042  * Get one or more magazines from the depot.
1043  */
/*
 * ml:    source depot magazine list (full or empty list)
 * count: depot counter backing "ml"; decremented by the number taken
 * list:  out parameter; head of the chain of magazines removed
 * num:   number of magazines wanted (must be nonzero)
 *
 * Returns the number of magazines actually obtained (may be fewer
 * than requested, including zero, in which case *list is NULL).
 */
static uint32_t
skmem_depot_batch_alloc(struct skmem_cache *skm, struct skmem_maglist *ml,
    uint32_t *count, struct skmem_mag *__bidi_indexable *list, uint32_t num)
{
	SLIST_HEAD(, skmem_mag) mg_list = SLIST_HEAD_INITIALIZER(mg_list);
	struct skmem_mag *mg;
	uint32_t need = num, c = 0;

	ASSERT(list != NULL && need > 0);

	if (!SKM_DEPOT_LOCK_TRY(skm)) {
		/*
		 * Track the amount of lock contention here; if the contention
		 * level is high (more than skmem_cache_depot_contention per a
		 * given skmem_cache_update_interval interval), then we treat
		 * it as a sign that the per-CPU layer is not using the right
		 * magazine type, and that we'd need to resize it.
		 */
		SKM_DEPOT_LOCK(skm);
		if (skm->skm_mode & SKM_MODE_DYNAMIC) {
			skm->skm_depot_contention++;
		}
	}

	/* pop magazines off "ml", updating its working-set statistics */
	while ((mg = SLIST_FIRST(&ml->ml_list)) != NULL) {
		SLIST_REMOVE_HEAD(&ml->ml_list, mg_link);
		SLIST_INSERT_HEAD(&mg_list, mg, mg_link);
		ASSERT(ml->ml_total != 0);
		if (--ml->ml_total < ml->ml_min) {
			ml->ml_min = ml->ml_total;
		}
		c++;
		ml->ml_alloc++;
		if (--need == 0) {
			break;
		}
	}
	*count -= c;

	SKM_DEPOT_UNLOCK(skm);

	*list = SLIST_FIRST(&mg_list);

	return num - need;
}
1089 
1090 /*
1091  * Return one or more magazines to the depot.
1092  */
1093 static void
skmem_depot_batch_free(struct skmem_cache * skm,struct skmem_maglist * ml,uint32_t * count,struct skmem_mag * mg)1094 skmem_depot_batch_free(struct skmem_cache *skm, struct skmem_maglist *ml,
1095     uint32_t *count, struct skmem_mag *mg)
1096 {
1097 	struct skmem_mag *nmg;
1098 	uint32_t c = 0;
1099 
1100 	SKM_DEPOT_LOCK(skm);
1101 	while (mg != NULL) {
1102 		nmg = SLIST_NEXT(mg, mg_link);
1103 		SLIST_INSERT_HEAD(&ml->ml_list, mg, mg_link);
1104 		ml->ml_total++;
1105 		c++;
1106 		mg = nmg;
1107 	}
1108 	*count += c;
1109 	SKM_DEPOT_UNLOCK(skm);
1110 }
1111 
1112 /*
1113  * Update the depot's working state statistics.
1114  */
1115 static void
skmem_depot_ws_update(struct skmem_cache * skm)1116 skmem_depot_ws_update(struct skmem_cache *skm)
1117 {
1118 	SKM_DEPOT_LOCK_SPIN(skm);
1119 	skm->skm_full.ml_reaplimit = skm->skm_full.ml_min;
1120 	skm->skm_full.ml_min = skm->skm_full.ml_total;
1121 	skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_min;
1122 	skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1123 	SKM_DEPOT_UNLOCK(skm);
1124 }
1125 
1126 /*
1127  * Empty the depot's working state statistics (everything's reapable.)
1128  */
1129 static void
skmem_depot_ws_zero(struct skmem_cache * skm)1130 skmem_depot_ws_zero(struct skmem_cache *skm)
1131 {
1132 	SKM_DEPOT_LOCK_SPIN(skm);
1133 	if (skm->skm_full.ml_reaplimit != skm->skm_full.ml_total ||
1134 	    skm->skm_full.ml_min != skm->skm_full.ml_total ||
1135 	    skm->skm_empty.ml_reaplimit != skm->skm_empty.ml_total ||
1136 	    skm->skm_empty.ml_min != skm->skm_empty.ml_total) {
1137 		skm->skm_full.ml_reaplimit = skm->skm_full.ml_total;
1138 		skm->skm_full.ml_min = skm->skm_full.ml_total;
1139 		skm->skm_empty.ml_reaplimit = skm->skm_empty.ml_total;
1140 		skm->skm_empty.ml_min = skm->skm_empty.ml_total;
1141 		skm->skm_depot_ws_zero++;
1142 	}
1143 	SKM_DEPOT_UNLOCK(skm);
1144 }
1145 
1146 /*
1147  * Reap magazines that's outside of the working set.
1148  */
static void
skmem_depot_ws_reap(struct skmem_cache *skm)
{
	struct skmem_mag *mg, *nmg;
	uint32_t f, e, reap;

	/* reap full magazines that fell outside the recent working set */
	reap = f = MIN(skm->skm_full.ml_reaplimit, skm->skm_full.ml_min);
	if (reap != 0) {
		(void) skmem_depot_batch_alloc(skm, &skm->skm_full,
		    &skm->skm_depot_full, &mg, reap);
		while (mg != NULL) {
			nmg = SLIST_NEXT(mg, mg_link);
			SLIST_NEXT(mg, mg_link) = NULL;
			/* full magazine: free all mt_magsize rounds */
			skmem_magazine_destroy(skm, mg,
			    mg->mg_magtype->mt_magsize);
			mg = nmg;
		}
	}

	/* likewise for empty magazines (no rounds to free) */
	reap = e = MIN(skm->skm_empty.ml_reaplimit, skm->skm_empty.ml_min);
	if (reap != 0) {
		(void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
		    &skm->skm_depot_empty, &mg, reap);
		while (mg != NULL) {
			nmg = SLIST_NEXT(mg, mg_link);
			SLIST_NEXT(mg, mg_link) = NULL;
			skmem_magazine_destroy(skm, mg, 0);
			mg = nmg;
		}
	}

	/* record that a reap episode took place */
	if (f != 0 || e != 0) {
		os_atomic_inc(&skm->skm_cpu_mag_reap, relaxed);
	}
}
1184 
1185 /*
1186  * Performs periodic maintenance on a cache.  This is serialized
1187  * through the update thread call, and so we guarantee there's at
1188  * most one update episode in the system at any given time.
1189  */
static void
skmem_cache_update(struct skmem_cache *skm, uint32_t arg)
{
#pragma unused(arg)
	boolean_t resize_mag = FALSE;
	boolean_t rescale_hash = FALSE;

	SKMEM_CACHE_LOCK_ASSERT_HELD();

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());

	/*
	 * If the cache has become much larger or smaller than the
	 * allocated-address hash table, rescale the hash table.
	 * Grow when in-use objects exceed twice the bucket count (and
	 * we are below the limit); shrink when they fall below half
	 * (and we are above the initial size).
	 */
	SKM_SLAB_LOCK(skm);
	if ((skm->skm_sl_bufinuse > (skm->skm_hash_mask << 1) &&
	    (skm->skm_hash_mask + 1) < skm->skm_hash_limit) ||
	    (skm->skm_sl_bufinuse < (skm->skm_hash_mask >> 1) &&
	    skm->skm_hash_mask > skm->skm_hash_initial)) {
		rescale_hash = TRUE;
	}
	SKM_SLAB_UNLOCK(skm);

	/*
	 * Update the working set.
	 */
	skmem_depot_ws_update(skm);

	/*
	 * If the contention count is greater than the threshold during
	 * the update interval, and if we are not already at the maximum
	 * magazine size, increase it.
	 */
	SKM_DEPOT_LOCK_SPIN(skm);
	if (skm->skm_chunksize < skm->skm_magtype->mt_maxbuf &&
	    (int)(skm->skm_depot_contention - skm->skm_depot_contention_prev) >
	    skmem_cache_depot_contention) {
		/* only dynamic-mode caches accumulate contention counts */
		ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);
		resize_mag = TRUE;
	}
	/* remember the snapshot for the next interval's delta */
	skm->skm_depot_contention_prev = skm->skm_depot_contention;
	SKM_DEPOT_UNLOCK(skm);

	/* perform the expensive operations outside of the locks above */
	if (rescale_hash) {
		skmem_cache_hash_rescale(skm);
	}

	if (resize_mag) {
		skmem_cache_magazine_resize(skm);
	}
}
1243 
1244 /*
1245  * Reload the CPU's magazines with mg and its follower (if any).
1246  */
1247 static void
skmem_cpu_batch_reload(struct skmem_cpu_cache * cp,struct skmem_mag * mg,int rounds)1248 skmem_cpu_batch_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg,
1249     int rounds)
1250 {
1251 	ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
1252 	    (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1253 	ASSERT(cp->cp_magsize > 0);
1254 
1255 	cp->cp_loaded = mg;
1256 	cp->cp_rounds = rounds;
1257 	if (__probable(SLIST_NEXT(mg, mg_link) != NULL)) {
1258 		cp->cp_ploaded = SLIST_NEXT(mg, mg_link);
1259 		cp->cp_prounds = rounds;
1260 		SLIST_NEXT(mg, mg_link) = NULL;
1261 	} else {
1262 		ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
1263 		cp->cp_ploaded = NULL;
1264 		cp->cp_prounds = -1;
1265 	}
1266 }
1267 
1268 /*
1269  * Reload the CPU's magazine with mg and save the previous one.
1270  */
1271 static void
skmem_cpu_reload(struct skmem_cpu_cache * cp,struct skmem_mag * mg,int rounds)1272 skmem_cpu_reload(struct skmem_cpu_cache *cp, struct skmem_mag *mg, int rounds)
1273 {
1274 	ASSERT((cp->cp_loaded == NULL && cp->cp_rounds == -1) ||
1275 	    (cp->cp_loaded && cp->cp_rounds + rounds == cp->cp_magsize));
1276 	ASSERT(cp->cp_magsize > 0);
1277 
1278 	cp->cp_ploaded = cp->cp_loaded;
1279 	cp->cp_prounds = cp->cp_rounds;
1280 	cp->cp_loaded = mg;
1281 	cp->cp_rounds = rounds;
1282 }
1283 
1284 /*
1285  * Allocate constructed object(s) from the cache.
1286  */
/*
 * skm:     cache to allocate from
 * list:    out parameter; head of the chain of allocated objects
 * objsize: caller-visible object size (used here only to bound the
 *          audit record)
 * num:     number of objects requested
 * skmflag: allocation flags, passed down to the slab layer and the
 *          constructor
 *
 * Returns the number of objects actually allocated, which may be
 * fewer than requested (possibly zero).
 */
uint32_t
skmem_cache_batch_alloc(struct skmem_cache *skm, struct skmem_obj **list,
    size_t SK_FB_ARG objsize, uint32_t num, uint32_t skmflag)
{
	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
	struct skmem_obj **top = list;
	struct skmem_mag *mg;
	uint32_t need = num;

	ASSERT(list != NULL);
	*list = NULL;

	if (need == 0) {
		return 0;
	}
	/* multi-object requests require batching support on the cache */
	ASSERT(need == 1 || (skm->skm_mode & SKM_MODE_BATCH));

	SKM_CPU_LOCK(cp);
	for (;;) {
		/*
		 * If we have an object in the current CPU's loaded
		 * magazine, return it and we're done.
		 */
		if (cp->cp_rounds > 0) {
			int objs = MIN((unsigned int)cp->cp_rounds, need);
			/*
			 * In the SKM_MODE_BATCH case, objects in are already
			 * linked together with the most recently freed object
			 * at the head of the list; grab as many objects as we
			 * can.  Otherwise we'll just grab 1 object at most.
			 */
			*list = cp->cp_loaded->mg_round[cp->cp_rounds - 1];
			cp->cp_rounds -= objs;
			cp->cp_alloc += objs;

			if (skm->skm_mode & SKM_MODE_BATCH) {
				/* advance "list" to the tail of the chain */
				struct skmem_obj *__single tail =
				    cp->cp_loaded->mg_round[cp->cp_rounds];
				list = &tail->mo_next;
				*list = NULL;
			}

			/* if we got them all, return to caller */
			if ((need -= objs) == 0) {
				SKM_CPU_UNLOCK(cp);
				goto done;
			}
		}

		/*
		 * The CPU's loaded magazine is empty.  If the previously
		 * loaded magazine was full, exchange and try again.
		 */
		if (cp->cp_prounds > 0) {
			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
			continue;
		}

		/*
		 * If the magazine layer is disabled, allocate from slab.
		 * This can happen either because SKM_MODE_NOMAGAZINES is
		 * set, or because we are resizing the magazine now.
		 */
		if (cp->cp_magsize == 0) {
			break;
		}

		/*
		 * Both of the CPU's magazines are empty; try to get
		 * full magazine(s) from the depot layer.  Upon success,
		 * reload and try again.  To prevent potential thrashing,
		 * replace both empty magazines only if the requested
		 * count exceeds a magazine's worth of objects.
		 */
		(void) skmem_depot_batch_alloc(skm, &skm->skm_full,
		    &skm->skm_depot_full, &mg, (need <= cp->cp_magsize) ? 1 : 2);
		if (mg != NULL) {
			SLIST_HEAD(, skmem_mag) mg_list =
			    SLIST_HEAD_INITIALIZER(mg_list);

			/* collect displaced empty magazines for the depot */
			if (cp->cp_ploaded != NULL) {
				SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
				    mg_link);
			}
			if (SLIST_NEXT(mg, mg_link) == NULL) {
				/*
				 * Depot allocation returns only 1 magazine;
				 * retain current empty magazine.
				 */
				skmem_cpu_reload(cp, mg, cp->cp_magsize);
			} else {
				/*
				 * We got 2 full magazines from depot;
				 * release the current empty magazine
				 * back to the depot layer.
				 */
				if (cp->cp_loaded != NULL) {
					SLIST_INSERT_HEAD(&mg_list,
					    cp->cp_loaded, mg_link);
				}
				skmem_cpu_batch_reload(cp, mg, cp->cp_magsize);
			}
			skmem_depot_batch_free(skm, &skm->skm_empty,
			    &skm->skm_depot_empty, SLIST_FIRST(&mg_list));
			continue;
		}

		/*
		 * The depot layer doesn't have any full magazines;
		 * allocate directly from the slab layer.
		 */
		break;
	}
	SKM_CPU_UNLOCK(cp);

	if (__probable(num > 1 && (skm->skm_mode & SKM_MODE_BATCH) != 0)) {
		struct skmem_obj *rtop, *__single rlist, *rlistp = NULL;
		uint32_t rlistc, c = 0;

		/*
		 * Get a list of raw objects from the slab layer.
		 */
		rlistc = skmem_slab_batch_alloc(skm, &rlist, need, skmflag);
		ASSERT(rlistc == 0 || rlist != NULL);
		rtop = rlist;

		/*
		 * Construct each object in the raw list.  Upon failure,
		 * free any remaining objects in the list back to the slab
		 * layer, and keep the ones that were successfully constructed.
		 * Here, "oi" and "oim" in each skmem_obj refer to the objects
		 * coming from the master and slave regions (on mirrored
		 * regions), respectively.  They are stored inside the object
		 * temporarily so that we can pass them to the constructor.
		 */
		while (skm->skm_ctor != NULL && rlist != NULL) {
			struct skmem_obj_info *oi = &rlist->mo_info;
			struct skmem_obj_info *oim = &rlist->mo_minfo;
			struct skmem_obj *rlistn = rlist->mo_next;

			/*
			 * Note that the constructor guarantees at least
			 * the size of a pointer at the top of the object
			 * and no more than that.  That means we must not
			 * refer to "oi" and "oim" any longer after the
			 * object goes thru the constructor.
			 */
			if (skm->skm_ctor(oi, ((SKMEM_OBJ_ADDR(oim) != NULL) ?
			    oim : NULL), skm->skm_private, skmflag) != 0) {
				VERIFY(rlist->mo_next == rlistn);
				os_atomic_add(&skm->skm_sl_alloc_fail,
				    rlistc - c, relaxed);
				/* cut the constructed chain just before rlist */
				if (rlistp != NULL) {
					rlistp->mo_next = NULL;
				}
				if (rlist == rtop) {
					rtop = NULL;
					ASSERT(c == 0);
				}
				/* return the unconstructed tail to the slab */
				skmem_slab_batch_free(skm, rlist);
				rlist = NULL;
				rlistc = c;
				break;
			}
			VERIFY(rlist->mo_next == rlistn);

			++c;                    /* # of constructed objs */
			rlistp = rlist;
			if ((rlist = rlist->mo_next) == NULL) {
				ASSERT(rlistc == c);
				break;
			}
		}

		/*
		 * At this point "top" points to the head of the chain we're
		 * going to return to caller; "list" points to the tail of that
		 * chain.  The second chain begins at "rtop", and we append
		 * that after "list" to form a single chain.  "rlistc" is the
		 * number of objects in "rtop" originated from the slab layer
		 * that have been successfully constructed (if applicable).
		 */
		ASSERT(c == 0 || rtop != NULL);
		need -= rlistc;
		*list = rtop;
	} else {
		struct skmem_obj_info oi, oim;
		void *buf;

		ASSERT(*top == NULL && num == 1 && need == 1);

		/*
		 * Get a single raw object from the slab layer.
		 */
		if (skmem_slab_alloc(skm, &oi, &oim, skmflag) != 0) {
			goto done;
		}

		buf = SKMEM_OBJ_ADDR(&oi);
		ASSERT(buf != NULL);

		/*
		 * Construct the raw object.  Here, "oi" and "oim" refer to
		 * the objects coming from the master and slave regions (on
		 * mirrored regions), respectively.
		 */
		if (skm->skm_ctor != NULL &&
		    skm->skm_ctor(&oi, ((SKMEM_OBJ_ADDR(&oim) != NULL) ?
		    &oim : NULL), skm->skm_private, skmflag) != 0) {
			os_atomic_inc(&skm->skm_sl_alloc_fail, relaxed);
			skmem_slab_free(skm, buf);
			goto done;
		}

		need = 0;
		*list = buf;
		ASSERT(!(skm->skm_mode & SKM_MODE_BATCH) ||
		    (*list)->mo_next == NULL);
	}

done:
	/* if auditing is enabled, record this transaction */
	if (__improbable(*top != NULL &&
	    (skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_buf(skm,
		    __unsafe_forge_bidi_indexable(struct skmem_obj *, *top, objsize));
	}

	return num - need;
}
1517 
1518 /*
1519  * Free a constructed object to the cache.
1520  */
1521 void
skmem_cache_free(struct skmem_cache * skm,void * buf)1522 skmem_cache_free(struct skmem_cache *skm, void *buf)
1523 {
1524 	if (skm->skm_mode & SKM_MODE_BATCH) {
1525 		((struct skmem_obj *)buf)->mo_next = NULL;
1526 	}
1527 	skmem_cache_batch_free_common(skm, (struct skmem_obj *)buf, 0);
1528 }
1529 
1530 /*
1531  * Free a constructed object.
1532  */
1533 void
skmem_cache_free_nocache(struct skmem_cache * skm,void * buf)1534 skmem_cache_free_nocache(struct skmem_cache *skm, void *buf)
1535 {
1536 	if (skm->skm_mode & SKM_MODE_BATCH) {
1537 		((struct skmem_obj *)buf)->mo_next = NULL;
1538 	}
1539 	skmem_cache_batch_free_common(skm, (struct skmem_obj *)buf, SKMEM_CACHE_FREE_NOCACHE);
1540 }
1541 
void
skmem_cache_batch_free(struct skmem_cache *skm, struct skmem_obj *list)
{
	/* default path: objects may be cached at the magazine layer */
	skmem_cache_batch_free_common(skm, list, 0);
}
1547 
1548 void
skmem_cache_batch_free_nocache(struct skmem_cache * skm,struct skmem_obj * list)1549 skmem_cache_batch_free_nocache(struct skmem_cache *skm, struct skmem_obj *list)
1550 {
1551 	skmem_cache_batch_free_common(skm, list, SKMEM_CACHE_FREE_NOCACHE);
1552 }
1553 
/*
 * Common path for freeing one or more constructed objects back to the
 * cache.  "list" is a single object, or (in SKM_MODE_BATCH mode) a
 * chain linked through mo_next.  With SKMEM_CACHE_FREE_NOCACHE in
 * "flags", the magazine layer is bypassed and the objects go straight
 * to the slab layer after deconstruction.
 */
static void
skmem_cache_batch_free_common(struct skmem_cache *skm, struct skmem_obj *list, uint32_t flags)
{
	struct skmem_cpu_cache *cp = SKMEM_CPU_CACHE(skm);
	struct skmem_magtype *mtp;
	/*
	 * XXX -fbounds-safety: Don't mark mg as __single, because it's a struct
	 * with a flexible array, and when we allocate it, the alloc function
	 * returns an __indexable to tell us the bounds. But if we mark this as
	 * __single, we lose that information. It might compile fine, but at
	 * runtime, before we actually assign the count value, there will be a
	 * comparison between current count value and the new count value we
	 * assign, where current count is supposed to be greater than the new
	 * count. Unfortunately, this will most likely fail.
	 */
	struct skmem_mag *mg;
	struct skmem_obj *listn;
#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
	vm_map_address_t tagged_address;      /* address tagging */
	struct skmem_region *region;          /* region source for this cache */
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */

	/* if auditing is enabled, record this transaction */
	if (__improbable((skm->skm_mode & SKM_MODE_AUDIT) != 0)) {
		skmem_audit_buf(skm, list);
	}

	if (flags & SKMEM_CACHE_FREE_NOCACHE) {
		goto nocache;
	}

	SKM_CPU_LOCK(cp);
	for (;;) {
		/*
		 * If there's an available space in the current CPU's
		 * loaded magazine, place it there and we're done.
		 */
		if ((unsigned int)cp->cp_rounds <
		    (unsigned int)cp->cp_magsize) {
			/*
			 * In the SKM_MODE_BATCH case, reverse the list
			 * while we place each object into the magazine;
			 * this effectively causes the most recently
			 * freed object to be reused during allocation.
			 */
			if (skm->skm_mode & SKM_MODE_BATCH) {
				listn = list->mo_next;
				list->mo_next = (cp->cp_rounds == 0) ? NULL :
				    cp->cp_loaded->mg_round[cp->cp_rounds - 1];
			} else {
				listn = NULL;
			}
#if CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT)
			/* retag the address on memtag regions before caching it */
			region = skm->skm_region;
			if (region->skr_mode & SKR_MODE_MEMTAG) {
				tagged_address = (vm_map_address_t)vm_memtag_generate_and_store_tag(
					(caddr_t)list, skm->skm_objsize);
				cp->cp_loaded->mg_round[cp->cp_rounds++] =
				    __unsafe_forge_bidi_indexable(
					struct skmem_obj *, tagged_address,
					skm->skm_objsize);
			} else {
				cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
			}
#else /* !CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
			cp->cp_loaded->mg_round[cp->cp_rounds++] = list;
#endif /* CONFIG_KERNEL_TAGGING && !defined(KASAN_LIGHT) */
			cp->cp_free++;

			/* more objects on the chain? place the next one */
			if ((list = listn) != NULL) {
				continue;
			}

			SKM_CPU_UNLOCK(cp);
			return;
		}

		/*
		 * The loaded magazine is full.  If the previously
		 * loaded magazine was empty, exchange and try again.
		 */
		if (cp->cp_prounds == 0) {
			skmem_cpu_reload(cp, cp->cp_ploaded, cp->cp_prounds);
			continue;
		}

		/*
		 * If the magazine layer is disabled, free to slab.
		 * This can happen either because SKM_MODE_NOMAGAZINES
		 * is set, or because we are resizing the magazine now.
		 */
		if (cp->cp_magsize == 0) {
			break;
		}

		/*
		 * Both magazines for the CPU are full; try to get
		 * empty magazine(s) from the depot.  If we get one,
		 * exchange a full magazine with it and place the
		 * object in there.
		 *
		 * TODO: Because the caller currently doesn't indicate
		 * the number of objects in the list, we choose the more
		 * conservative approach of allocating only 1 empty
		 * magazine (to prevent potential thrashing).  Once we
		 * have the object count, we can replace 1 with similar
		 * logic as used in skmem_cache_batch_alloc().
		 */
		(void) skmem_depot_batch_alloc(skm, &skm->skm_empty,
		    &skm->skm_depot_empty, &mg, 1);
		if (mg != NULL) {
			SLIST_HEAD(, skmem_mag) mg_list =
			    SLIST_HEAD_INITIALIZER(mg_list);

			/* collect displaced full magazines for the depot */
			if (cp->cp_ploaded != NULL) {
				SLIST_INSERT_HEAD(&mg_list, cp->cp_ploaded,
				    mg_link);
			}
			if (SLIST_NEXT(mg, mg_link) == NULL) {
				/*
				 * Depot allocation returns only 1 magazine;
				 * retain current full magazine.
				 */
				skmem_cpu_reload(cp, mg, 0);
			} else {
				/*
				 * We got 2 empty magazines from depot;
				 * release the current full magazine back
				 * to the depot layer.
				 */
				if (cp->cp_loaded != NULL) {
					SLIST_INSERT_HEAD(&mg_list,
					    cp->cp_loaded, mg_link);
				}
				skmem_cpu_batch_reload(cp, mg, 0);
			}
			skmem_depot_batch_free(skm, &skm->skm_full,
			    &skm->skm_depot_full, SLIST_FIRST(&mg_list));
			continue;
		}

		/*
		 * We can't get any empty magazine from the depot, and
		 * so we need to allocate one.  If the allocation fails,
		 * just fall through, deconstruct and free the object
		 * to the slab layer.
		 */
		mtp = skm->skm_magtype;
		/* drop the CPU lock across the (possibly blocking) alloc */
		SKM_CPU_UNLOCK(cp);
		mg = skmem_cache_alloc(mtp->mt_cache, SKMEM_NOSLEEP);
		SKM_CPU_LOCK(cp);

		if (mg != NULL) {
			/*
			 * XXX -fbounds-safety requires mg to be set before
			 * setting mg->mg_count. But self-assignment mg = mg was
			 * not allowed. As such, we used the following
			 * workaround
			 */
			void *vmg = mg;
			mg = vmg;
			mg->mg_count = mg->mg_magtype->mt_magsize;
			/*
			 * We allocated an empty magazine, but since we
			 * dropped the CPU lock above the magazine size
			 * may have changed.  If that's the case free
			 * the magazine and try again.
			 */
			if (cp->cp_magsize != mtp->mt_magsize) {
				SKM_CPU_UNLOCK(cp);
				skmem_cache_free(mtp->mt_cache, mg);
				SKM_CPU_LOCK(cp);
				continue;
			}

			/*
			 * We have a magazine with the right size;
			 * add it to the depot and try again.
			 */
			ASSERT(SLIST_NEXT(mg, mg_link) == NULL);
			skmem_depot_batch_free(skm, &skm->skm_empty,
			    &skm->skm_depot_empty, mg);
			continue;
		}

		/*
		 * We can't get an empty magazine, so free to slab.
		 */
		break;
	}
	SKM_CPU_UNLOCK(cp);

nocache:
	/*
	 * We weren't able to free the constructed object(s) to the
	 * magazine layer, so deconstruct them and free to the slab.
	 */
	if (__probable((skm->skm_mode & SKM_MODE_BATCH) &&
	    list->mo_next != NULL)) {
		/* whatever is left from original list */
		struct skmem_obj *top = list;

		while (list != NULL && skm->skm_dtor != NULL) {
			listn = list->mo_next;
			/* unlink so the destructor sees a lone object */
			list->mo_next = NULL;

			/* deconstruct the object */
			if (skm->skm_dtor != NULL) {
				skm->skm_dtor((void *)list, skm->skm_private);
			}

			/* relink for the batched slab free below */
			list->mo_next = listn;
			list = listn;
		}

		skmem_slab_batch_free(skm, top);
	} else {
		/* deconstruct the object */
		if (skm->skm_dtor != NULL) {
			skm->skm_dtor((void *)list, skm->skm_private);
		}

		skmem_slab_free(skm, (void *)list);
	}
}
1779 
1780 /*
1781  * Return the maximum number of objects cached at the magazine layer
1782  * based on the chunk size.  This takes into account the starting
1783  * magazine type as well as the final magazine type used in resizing.
1784  */
1785 uint32_t
skmem_cache_magazine_max(uint32_t chunksize)1786 skmem_cache_magazine_max(uint32_t chunksize)
1787 {
1788 	struct skmem_magtype *mtp;
1789 	uint32_t magsize_max;
1790 
1791 	VERIFY(ncpu != 0);
1792 	VERIFY(chunksize > 0);
1793 
1794 	/* find a suitable magazine type for this chunk size */
1795 	for (mtp = skmem_magtype; chunksize <= mtp->mt_minbuf; mtp++) {
1796 		continue;
1797 	}
1798 
1799 	/* and find the last magazine type  */
1800 	for (;;) {
1801 		magsize_max = mtp->mt_magsize;
1802 		if (mtp == skmem_cache_magsize_last ||
1803 		    chunksize >= mtp->mt_maxbuf) {
1804 			break;
1805 		}
1806 		++mtp;
1807 		VERIFY(mtp <= skmem_cache_magsize_last);
1808 	}
1809 
1810 	return ncpu * magsize_max * 2; /* two magazines per CPU */
1811 }
1812 
1813 /*
1814  * Return true if SKMEM_DEBUG_NOMAGAZINES is not set on skmem_debug.
1815  */
1816 boolean_t
skmem_allow_magazines(void)1817 skmem_allow_magazines(void)
1818 {
1819 	return !(skmem_debug & SKMEM_DEBUG_NOMAGAZINES);
1820 }
1821 
1822 /*
1823  * Purge all magazines from a cache and disable its per-CPU magazines layer.
1824  */
static void
skmem_cache_magazine_purge(struct skmem_cache *skm)
{
	struct skmem_cpu_cache *cp;
	struct skmem_mag *mg, *pmg;
	int rounds, prounds;
	uint32_t cpuid, mg_cnt = 0, pmg_cnt = 0;

	/* we destroy magazines below, which may take the slab lock */
	SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);

	SK_DF(SK_VERB_MEM_CACHE, "skm %p", SK_KVA(skm));

	for (cpuid = 0; cpuid < ncpu; cpuid++) {
		cp = &skm->skm_cpu_cache[cpuid];

		/*
		 * Detach the loaded and previously-loaded magazines (and
		 * their round counts) and disable this CPU's magazine
		 * layer (cp_magsize = 0) while holding the spin lock;
		 * the actual destruction is done after dropping it.
		 */
		SKM_CPU_LOCK_SPIN(cp);
		mg = cp->cp_loaded;
		pmg = cp->cp_ploaded;
		rounds = cp->cp_rounds;
		prounds = cp->cp_prounds;
		cp->cp_loaded = NULL;
		cp->cp_ploaded = NULL;
		cp->cp_rounds = -1;
		cp->cp_prounds = -1;
		cp->cp_magsize = 0;
		SKM_CPU_UNLOCK(cp);

		/* free any rounds still held, then the magazines themselves */
		if (mg != NULL) {
			skmem_magazine_destroy(skm, mg, rounds);
			++mg_cnt;
		}
		if (pmg != NULL) {
			skmem_magazine_destroy(skm, pmg, prounds);
			++pmg_cnt;
		}
	}

	/* count this as one purge event if anything was destroyed */
	if (mg_cnt != 0 || pmg_cnt != 0) {
		os_atomic_inc(&skm->skm_cpu_mag_purge, relaxed);
	}

	/* zero the depot working set and reap everything in it */
	skmem_depot_ws_zero(skm);
	skmem_depot_ws_reap(skm);
}
1869 
1870 /*
1871  * Enable magazines on a cache.  Must only be called on a cache with
1872  * its per-CPU magazines layer disabled (e.g. due to purge).
1873  */
1874 static void
skmem_cache_magazine_enable(struct skmem_cache * skm,uint32_t arg)1875 skmem_cache_magazine_enable(struct skmem_cache *skm, uint32_t arg)
1876 {
1877 #pragma unused(arg)
1878 	struct skmem_cpu_cache *cp;
1879 	uint32_t cpuid;
1880 
1881 	if (skm->skm_mode & SKM_MODE_NOMAGAZINES) {
1882 		return;
1883 	}
1884 
1885 	for (cpuid = 0; cpuid < ncpu; cpuid++) {
1886 		cp = &skm->skm_cpu_cache[cpuid];
1887 		SKM_CPU_LOCK_SPIN(cp);
1888 		/* the magazines layer must be disabled at this point */
1889 		ASSERT(cp->cp_loaded == NULL);
1890 		ASSERT(cp->cp_ploaded == NULL);
1891 		ASSERT(cp->cp_rounds == -1);
1892 		ASSERT(cp->cp_prounds == -1);
1893 		ASSERT(cp->cp_magsize == 0);
1894 		cp->cp_magsize = skm->skm_magtype->mt_magsize;
1895 		SKM_CPU_UNLOCK(cp);
1896 	}
1897 
1898 	SK_DF(SK_VERB_MEM_CACHE, "skm %p chunksize %u magsize %d",
1899 	    SK_KVA(skm), (uint32_t)skm->skm_chunksize,
1900 	    SKMEM_CPU_CACHE(skm)->cp_magsize);
1901 }
1902 
1903 /*
1904  * Enter the cache resize perimeter.  Upon success, claim exclusivity
1905  * on the perimeter and return 0, else EBUSY.  Caller may indicate
1906  * whether or not they're willing to wait.
1907  */
static int
skmem_cache_resize_enter(struct skmem_cache *skm, boolean_t can_sleep)
{
	SKM_RESIZE_LOCK(skm);
	/* recursive entry by the owning thread: just bump the busy count */
	if (skm->skm_rs_owner == current_thread()) {
		ASSERT(skm->skm_rs_busy != 0);
		skm->skm_rs_busy++;
		goto done;
	}
	if (!can_sleep) {
		/* perimeter is held by another thread; refuse to wait */
		if (skm->skm_rs_busy != 0) {
			SKM_RESIZE_UNLOCK(skm);
			return EBUSY;
		}
	} else {
		/*
		 * Sleep until the perimeter is released; skm_rs_want
		 * tells the holder that someone needs a wakeup.  Loop
		 * and recheck, since another waiter may win the race.
		 */
		while (skm->skm_rs_busy != 0) {
			skm->skm_rs_want++;
			(void) assert_wait(&skm->skm_rs_busy, THREAD_UNINT);
			SKM_RESIZE_UNLOCK(skm);
			(void) thread_block(THREAD_CONTINUE_NULL);
			SK_DF(SK_VERB_MEM_CACHE, "waited for skm \"%s\" "
			    "(%p) busy=%u", skm->skm_name,
			    SK_KVA(skm), skm->skm_rs_busy);
			SKM_RESIZE_LOCK(skm);
		}
	}
	SKM_RESIZE_LOCK_ASSERT_HELD(skm);
	ASSERT(skm->skm_rs_busy == 0);
	/* claim the perimeter for this thread */
	skm->skm_rs_busy++;
	skm->skm_rs_owner = current_thread();
done:
	SKM_RESIZE_UNLOCK(skm);
	return 0;
}
1942 
1943 /*
1944  * Exit the cache resize perimeter and unblock any waiters.
1945  */
1946 static void
skmem_cache_resize_exit(struct skmem_cache * skm)1947 skmem_cache_resize_exit(struct skmem_cache *skm)
1948 {
1949 	uint32_t want;
1950 
1951 	SKM_RESIZE_LOCK(skm);
1952 	ASSERT(skm->skm_rs_busy != 0);
1953 	ASSERT(skm->skm_rs_owner == current_thread());
1954 	if (--skm->skm_rs_busy == 0) {
1955 		skm->skm_rs_owner = NULL;
1956 		/*
1957 		 * We're done; notify anyone that has lost the race.
1958 		 */
1959 		if ((want = skm->skm_rs_want) != 0) {
1960 			skm->skm_rs_want = 0;
1961 			wakeup((void *)&skm->skm_rs_busy);
1962 			SKM_RESIZE_UNLOCK(skm);
1963 		} else {
1964 			SKM_RESIZE_UNLOCK(skm);
1965 		}
1966 	} else {
1967 		SKM_RESIZE_UNLOCK(skm);
1968 	}
1969 }
1970 
1971 /*
1972  * Recompute a cache's magazine size.  This is an expensive operation
1973  * and should not be done frequently; larger magazines provide for a
1974  * higher transfer rate with the depot while smaller magazines reduce
1975  * the memory consumption.
1976  */
static void
skmem_cache_magazine_resize(struct skmem_cache *skm)
{
	/*
	 * NOTE(review): the forged bounds use sizeof(skmem_magtype),
	 * presumably the size of the global magazine-type array, so
	 * that -fbounds-safety permits the ++mtp advance below — confirm
	 * against the skmem_magtype definition.
	 */
	struct skmem_magtype *mtp = __unsafe_forge_bidi_indexable(
		struct skmem_magtype *, skm->skm_magtype, sizeof(skmem_magtype));

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());
	ASSERT(!(skm->skm_mode & SKM_MODE_NOMAGAZINES));
	/* depot contention only applies to dynamic mode */
	ASSERT(skm->skm_mode & SKM_MODE_DYNAMIC);

	/*
	 * Although we're executing in the context of the update thread
	 * call, we need to protect the per-CPU states during resizing
	 * against other synchronous cache purge/reenable requests that
	 * could take place in parallel.
	 */
	if (skm->skm_chunksize < mtp->mt_maxbuf) {
		(void) skmem_cache_resize_enter(skm, TRUE);
		skmem_cache_magazine_purge(skm);

		/*
		 * Upgrade to the next magazine type with larger size.
		 */
		SKM_DEPOT_LOCK_SPIN(skm);
		skm->skm_cpu_mag_resize++;
		skm->skm_magtype = ++mtp;
		skm->skm_cpu_mag_size = skm->skm_magtype->mt_magsize;
		/*
		 * Bias the previous contention snapshot by INT_MAX; the
		 * contention delta (see skmem_cache_mib_get_stats) then
		 * starts out negative, presumably deferring a further
		 * resize for a while — confirm against the resize trigger.
		 */
		skm->skm_depot_contention_prev =
		    skm->skm_depot_contention + INT_MAX;
		SKM_DEPOT_UNLOCK(skm);

		skmem_cache_magazine_enable(skm, 0);
		skmem_cache_resize_exit(skm);
	}
}
2014 
2015 /*
2016  * Rescale the cache's allocated-address hash table.
2017  */
static void
skmem_cache_hash_rescale(struct skmem_cache *skm)
{
	struct skmem_bufctl_bkt *__indexable old_table, *new_table;
	size_t old_size, new_size;
	uint32_t i, moved = 0;

	/* insist that we are executing in the update thread call context */
	ASSERT(sk_is_cache_update_protected());

	/*
	 * To get small average lookup time (lookup depth near 1.0), the hash
	 * table size should be roughly the same (not necessarily equivalent)
	 * as the cache size.
	 */
	new_size = MAX(skm->skm_hash_initial,
	    (1 << (flsll(3 * skm->skm_sl_bufinuse + 4) - 2)));
	new_size = MIN(skm->skm_hash_limit, new_size);
	old_size = (skm->skm_hash_mask + 1);

	/* skip the rescale unless the size would change by more than 2x */
	if ((old_size >> 1) <= new_size && new_size <= (old_size << 1)) {
		return;
	}

	/* allocate without blocking; simply skip this round on failure */
	new_table = sk_alloc_type_array(struct skmem_bufctl_bkt, new_size,
	    Z_NOWAIT, skmem_tag_bufctl_hash);
	if (__improbable(new_table == NULL)) {
		return;
	}

	for (i = 0; i < new_size; i++) {
		SLIST_INIT(&new_table[i].bcb_head);
	}

	SKM_SLAB_LOCK(skm);

	/* re-read the old size under the slab lock before swapping tables */
	old_size = (skm->skm_hash_mask + 1);
	old_table = skm->skm_hash_table;

	/* publish the new table; SKMEM_CACHE_HASH below uses the new mask */
	skm->skm_hash_mask = (new_size - 1);
	skm->skm_hash_table = new_table;
	skm->skm_hash_size = new_size;
	skm->skm_sl_rescale++;

	/* migrate every bufctl from the old buckets into the new table */
	for (i = 0; i < old_size; i++) {
		struct skmem_bufctl_bkt *bcb = &old_table[i];
		struct skmem_bufctl_bkt *new_bcb;
		struct skmem_bufctl *bc;

		while ((bc = SLIST_FIRST(&bcb->bcb_head)) != NULL) {
			SLIST_REMOVE_HEAD(&bcb->bcb_head, bc_link);
			new_bcb = SKMEM_CACHE_HASH(skm, bc->bc_addr);
			/*
			 * Ideally we want to insert tail here, but simple
			 * list doesn't give us that.  The fact that we are
			 * essentially reversing the order is not a big deal
			 * here vis-a-vis the new table size.
			 */
			SLIST_INSERT_HEAD(&new_bcb->bcb_head, bc, bc_link);
			++moved;
		}
		ASSERT(SLIST_EMPTY(&bcb->bcb_head));
	}

	SK_DF(SK_VERB_MEM_CACHE,
	    "skm %p old_size %u new_size %u [%u moved]", SK_KVA(skm),
	    (uint32_t)old_size, (uint32_t)new_size, moved);

	SKM_SLAB_UNLOCK(skm);

	/* old table is no longer reachable; free it outside the lock */
	sk_free_type_array(struct skmem_bufctl_bkt, old_size, old_table);
}
2090 
2091 /*
2092  * Apply a function to operate on all caches.
2093  */
2094 static void
skmem_cache_applyall(void (* func)(struct skmem_cache *,uint32_t),uint32_t arg)2095 skmem_cache_applyall(void (*func)(struct skmem_cache *, uint32_t), uint32_t arg)
2096 {
2097 	struct skmem_cache *skm;
2098 
2099 	net_update_uptime();
2100 
2101 	SKMEM_CACHE_LOCK();
2102 	TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
2103 		func(skm, arg);
2104 	}
2105 	SKMEM_CACHE_UNLOCK();
2106 }
2107 
2108 /*
2109  * Reclaim unused memory from a cache.
2110  */
2111 static void
skmem_cache_reclaim(struct skmem_cache * skm,uint32_t lowmem)2112 skmem_cache_reclaim(struct skmem_cache *skm, uint32_t lowmem)
2113 {
2114 	/*
2115 	 * Inform the owner to free memory if possible; the reclaim
2116 	 * policy is left to the owner.  This is just an advisory.
2117 	 */
2118 	if (skm->skm_reclaim != NULL) {
2119 		skm->skm_reclaim(skm->skm_private);
2120 	}
2121 
2122 	if (lowmem) {
2123 		/*
2124 		 * If another thread is in the process of purging or
2125 		 * resizing, bail out and let the currently-ongoing
2126 		 * purging take its natural course.
2127 		 */
2128 		if (skmem_cache_resize_enter(skm, FALSE) == 0) {
2129 			skmem_cache_magazine_purge(skm);
2130 			skmem_cache_magazine_enable(skm, 0);
2131 			skmem_cache_resize_exit(skm);
2132 		}
2133 	} else {
2134 		skmem_depot_ws_reap(skm);
2135 	}
2136 }
2137 
2138 /*
2139  * Thread call callback for reap.
2140  */
2141 static void
skmem_cache_reap_func(thread_call_param_t dummy,thread_call_param_t arg)2142 skmem_cache_reap_func(thread_call_param_t dummy, thread_call_param_t arg)
2143 {
2144 #pragma unused(dummy)
2145 	void (*func)(void) = arg;
2146 
2147 	ASSERT(func == skmem_cache_reap_start || func == skmem_cache_reap_done);
2148 	func();
2149 }
2150 
2151 /*
2152  * Start reaping all caches; this is serialized via thread call.
2153  */
static void
skmem_cache_reap_start(void)
{
	SK_DF(SK_VERB_MEM_CACHE, "now running");
	/* reclaim across all caches; purge magazines when memory is low */
	skmem_cache_applyall(skmem_cache_reclaim, skmem_lowmem_check());
	/* schedule the done callback, which clears the reaping flag */
	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_done,
	    (skmem_cache_update_interval * NSEC_PER_SEC));
}
2162 
2163 /*
2164  * Stop reaping; this would allow another reap request to occur.
2165  */
static void
skmem_cache_reap_done(void)
{
	volatile uint32_t *flag = &skmem_cache_reaping;

	/*
	 * Clear the flag and fence so the store is visible before any
	 * subsequent skmem_cache_reap() attempts its cmpxchg.
	 */
	*flag = 0;
	os_atomic_thread_fence(seq_cst);
}
2174 
2175 /*
2176  * Immediately reap all unused memory of a cache.  If purging,
2177  * also purge the cached objects at the CPU layer.
2178  */
void
skmem_cache_reap_now(struct skmem_cache *skm, boolean_t purge)
{
	/* if SKM_MODE_RECLAIM flag is set for this cache, we purge */
	if (purge || (skm->skm_mode & SKM_MODE_RECLAIM)) {
		/*
		 * If another thread is in the process of purging or
		 * resizing, bail out and let the currently-ongoing
		 * purging take its natural course.
		 */
		if (skmem_cache_resize_enter(skm, FALSE) == 0) {
			skmem_cache_magazine_purge(skm);
			skmem_cache_magazine_enable(skm, 0);
			skmem_cache_resize_exit(skm);
		}
	} else {
		/* non-purge reap: drop the depot working set and reap it */
		skmem_depot_ws_zero(skm);
		skmem_depot_ws_reap(skm);

		/* clean up cp_ploaded magazines from each CPU */
		SKM_SLAB_LOCK_ASSERT_NOTHELD(skm);

		struct skmem_cpu_cache *cp;
		struct skmem_mag *pmg;
		int prounds;
		uint32_t cpuid;

		for (cpuid = 0; cpuid < ncpu; cpuid++) {
			cp = &skm->skm_cpu_cache[cpuid];

			/* detach under the spin lock, destroy after */
			SKM_CPU_LOCK_SPIN(cp);
			pmg = cp->cp_ploaded;
			prounds = cp->cp_prounds;

			cp->cp_ploaded = NULL;
			cp->cp_prounds = -1;
			SKM_CPU_UNLOCK(cp);

			if (pmg != NULL) {
				skmem_magazine_destroy(skm, pmg, prounds);
			}
		}
	}
}
2223 
2224 /*
2225  * Request a global reap operation to be dispatched.
2226  */
void
skmem_cache_reap(void)
{
	/*
	 * Only one reaping episode is allowed at a time.  The owner check
	 * must come first: if this thread already holds the skmem lock we
	 * must not dispatch (applyall would re-take it), and we also avoid
	 * the cmpxchg side effect in that case.
	 */
	if (skmem_lock_owner == current_thread() ||
	    !os_atomic_cmpxchg(&skmem_cache_reaping, 0, 1, acq_rel)) {
		return;
	}

	/* hand the actual work off to the reap thread call */
	skmem_dispatch(skmem_cache_reap_tc, skmem_cache_reap_start, 0);
}
2238 
2239 /*
2240  * Reap internal caches.
2241  */
void
skmem_reap_caches(boolean_t purge)
{
	/* internal slab/bufctl metadata caches */
	skmem_cache_reap_now(skmem_slab_cache, purge);
	skmem_cache_reap_now(skmem_bufctl_cache, purge);

	/* packet buffer pool objects */
	pp_reap_caches(purge);

	/* also handle the region cache(s) */
	skmem_region_reap_caches(purge);
}
2254 
2255 /*
2256  * Thread call callback for update.
2257  */
static void
skmem_cache_update_func(thread_call_param_t dummy, thread_call_param_t arg)
{
#pragma unused(dummy, arg)
	sk_protect_t protect;

	/* run the per-cache update on every cache under update protection */
	protect = sk_cache_update_protect();
	skmem_cache_applyall(skmem_cache_update, 0);
	sk_cache_update_unprotect(protect);

	/* re-arm ourselves for the next periodic update */
	skmem_dispatch(skmem_cache_update_tc, NULL,
	    (skmem_cache_update_interval * NSEC_PER_SEC));
}
2271 
2272 /*
2273  * Given an object, find its buffer control and record the transaction.
2274  */
2275 __attribute__((noinline, cold, not_tail_called))
2276 static inline void
skmem_audit_buf(struct skmem_cache * skm,struct skmem_obj * list)2277 skmem_audit_buf(struct skmem_cache *skm, struct skmem_obj *list)
2278 {
2279 	struct skmem_bufctl_bkt *bcb;
2280 	struct skmem_bufctl *bc;
2281 
2282 	ASSERT(!(skm->skm_mode & SKM_MODE_PSEUDO));
2283 
2284 	SKM_SLAB_LOCK(skm);
2285 	while (list != NULL) {
2286 		void *__single buf = list;
2287 
2288 		bcb = SKMEM_CACHE_HASH(skm, buf);
2289 		SLIST_FOREACH(bc, &bcb->bcb_head, bc_link) {
2290 			if (bc->bc_addr == buf) {
2291 				break;
2292 			}
2293 		}
2294 
2295 		if (__improbable(bc == NULL)) {
2296 			panic("%s: %s failed to get bufctl for %p",
2297 			    __func__, skm->skm_name, buf);
2298 			/* NOTREACHED */
2299 			__builtin_unreachable();
2300 		}
2301 
2302 		skmem_audit_bufctl(bc);
2303 
2304 		if (!(skm->skm_mode & SKM_MODE_BATCH)) {
2305 			break;
2306 		}
2307 
2308 		list = list->mo_next;
2309 	}
2310 	SKM_SLAB_UNLOCK(skm);
2311 }
2312 
/*
 * Fill out one sk_stats_cache record for this cache.  Always returns
 * the space required for one record; copies nothing when out is NULL
 * or len is too small (sizing probe).
 */
static size_t
skmem_cache_mib_get_stats(struct skmem_cache *skm, void *__sized_by(len) out,
    size_t len)
{
	size_t actual_space = sizeof(struct sk_stats_cache);
	struct sk_stats_cache *__single sca;
	int contention;

	/* sizing-only call, or caller's buffer can't hold a record */
	if (out == NULL || len < actual_space) {
		goto done;
	}
	sca = out;

	/* identity and geometry */
	bzero(sca, sizeof(*sca));
	(void) snprintf(sca->sca_name, sizeof(sca->sca_name), "%s",
	    skm->skm_name);
	uuid_copy(sca->sca_uuid, skm->skm_uuid);
	uuid_copy(sca->sca_ruuid, skm->skm_region->skr_uuid);
	sca->sca_mode = skm->skm_mode;
	sca->sca_bufsize = (uint64_t)skm->skm_bufsize;
	sca->sca_objsize = (uint64_t)skm->skm_objsize;
	sca->sca_chunksize = (uint64_t)skm->skm_chunksize;
	sca->sca_slabsize = (uint64_t)skm->skm_slabsize;
	sca->sca_bufalign = (uint64_t)skm->skm_bufalign;
	sca->sca_objalign = (uint64_t)skm->skm_objalign;

	/* magazine/depot layer counters */
	sca->sca_cpu_mag_size = skm->skm_cpu_mag_size;
	sca->sca_cpu_mag_resize = skm->skm_cpu_mag_resize;
	sca->sca_cpu_mag_purge = skm->skm_cpu_mag_purge;
	sca->sca_cpu_mag_reap = skm->skm_cpu_mag_reap;
	sca->sca_depot_full = skm->skm_depot_full;
	sca->sca_depot_empty = skm->skm_depot_empty;
	sca->sca_depot_ws_zero = skm->skm_depot_ws_zero;
	/* in case of a race this might be a negative value, turn it into 0 */
	if ((contention = (int)(skm->skm_depot_contention -
	    skm->skm_depot_contention_prev)) < 0) {
		contention = 0;
	}
	sca->sca_depot_contention_factor = contention;

	/* sum the rounds currently held in each CPU's magazines */
	sca->sca_cpu_rounds = 0;
	sca->sca_cpu_prounds = 0;
	for (int cpuid = 0; cpuid < ncpu; cpuid++) {
		struct skmem_cpu_cache *ccp = &skm->skm_cpu_cache[cpuid];

		SKM_CPU_LOCK(ccp);
		/* -1 means the magazine layer is disabled on this CPU */
		if (ccp->cp_rounds > -1) {
			sca->sca_cpu_rounds += ccp->cp_rounds;
		}
		if (ccp->cp_prounds > -1) {
			sca->sca_cpu_prounds += ccp->cp_prounds;
		}
		SKM_CPU_UNLOCK(ccp);
	}

	/* slab layer counters */
	sca->sca_sl_create = skm->skm_sl_create;
	sca->sca_sl_destroy = skm->skm_sl_destroy;
	sca->sca_sl_alloc = skm->skm_sl_alloc;
	sca->sca_sl_free = skm->skm_sl_free;
	sca->sca_sl_alloc_fail = skm->skm_sl_alloc_fail;
	sca->sca_sl_partial = skm->skm_sl_partial;
	sca->sca_sl_empty = skm->skm_sl_empty;
	sca->sca_sl_bufinuse = skm->skm_sl_bufinuse;
	sca->sca_sl_rescale = skm->skm_sl_rescale;
	sca->sca_sl_hash_size = (skm->skm_hash_mask + 1);

done:
	return actual_space;
}
2382 
/*
 * sysctl handler: copy out one sk_stats_cache record per cache.
 * A NULL oldptr request returns only the required size.
 */
static int
skmem_cache_mib_get_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
	struct skmem_cache *skm;
	size_t actual_space;
	size_t buffer_space;
	size_t allocated_space = 0;
	caddr_t __sized_by(allocated_space) buffer = NULL;
	caddr_t scan;
	int error = 0;

	/* privileged callers only */
	if (!kauth_cred_issuser(kauth_cred_get())) {
		return EPERM;
	}

	net_update_uptime();
	buffer_space = req->oldlen;
	if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
		/* cap the kernel-side staging buffer */
		if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
			buffer_space = SK_SYSCTL_ALLOC_MAX;
		}
		caddr_t temp;
		temp = sk_alloc_data(buffer_space, Z_WAITOK, skmem_tag_cache_mib);
		if (__improbable(temp == NULL)) {
			return ENOBUFS;
		}
		buffer = temp;
		allocated_space = buffer_space;
	} else if (req->oldptr == USER_ADDR_NULL) {
		/* sizing-only request: walk the caches without copying */
		buffer_space = 0;
	}
	actual_space = 0;
	scan = buffer;

	SKMEM_CACHE_LOCK();
	TAILQ_FOREACH(skm, &skmem_cache_head, skm_link) {
		size_t size = skmem_cache_mib_get_stats(skm, scan, buffer_space);
		if (scan != NULL) {
			if (buffer_space < size) {
				/* supplied buffer too small, stop copying */
				error = ENOMEM;
				break;
			}
			scan += size;
			buffer_space -= size;
		}
		/* accumulate the total size even when not copying */
		actual_space += size;
	}
	SKMEM_CACHE_UNLOCK();

	/* copy out whatever was staged (or report the required size) */
	if (actual_space != 0) {
		int out_error = SYSCTL_OUT(req, buffer, actual_space);
		if (out_error != 0) {
			error = out_error;
		}
	}
	if (buffer != NULL) {
		sk_free_data_sized_by(buffer, allocated_space);
	}

	return error;
}
2446