1 /*
2 * Copyright (c) 2006-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
34 * reserved. Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine: this derivative implementation allows for multiple objects to
38 * be allocated and freed from/to the object cache in one call; in addition,
39 * it provides for better flexibility where the user is allowed to define
40 * its own slab allocator (instead of the default zone allocator). Finally,
41 * no object construction/destruction takes place at the moment, although
42 * this could be added in future to improve efficiency.
43 */
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/queue.h>
50 #include <sys/kernel.h>
51 #include <sys/systm.h>
52
53 #include <kern/debug.h>
54 #include <kern/zalloc.h>
55 #include <kern/cpu_number.h>
56 #include <kern/locks.h>
57 #include <kern/thread_call.h>
58
59 #include <libkern/libkern.h>
60 #include <libkern/OSAtomic.h>
61 #include <libkern/OSDebug.h>
62
63 #include <mach/vm_param.h>
64 #include <machine/limits.h>
65 #include <machine/machine_routines.h>
66
67 #include <os/atomic_private.h>
68
69 #include <string.h>
70
71 #include <sys/mcache.h>
72
/*
 * Size in bytes of an mcache_t with (n) per-CPU structures; the
 * mc_cpu[] array is the variable-length tail of mcache_t, so the
 * offset of element (n) is the total footprint.
 */
#define MCACHE_SIZE(n) \
	__builtin_offsetof(mcache_t, mc_cpu[n])

/* Allocate extra in case we need to manually align the pointer */
#define MCACHE_ALLOC_SIZE \
	(sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)

/* Per-CPU structure of cache (c) for the CPU this thread is running on */
#define MCACHE_CPU(c) \
	(mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))

/*
 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
 * to serialize accesses to the global list of caches in the system.
 * They also record the thread currently running in the critical
 * section, so that we can avoid recursive requests to reap the
 * caches when memory runs low.
 */
#define MCACHE_LIST_LOCK() { \
	lck_mtx_lock(&mcache_llock); \
	mcache_llock_owner = current_thread(); \
}

#define MCACHE_LIST_UNLOCK() { \
	mcache_llock_owner = NULL; \
	lck_mtx_unlock(&mcache_llock); \
}

/* Shorthands for the per-CPU / bucket-layer mutexes */
#define MCACHE_LOCK(l) lck_mtx_lock(l)
#define MCACHE_UNLOCK(l) lck_mtx_unlock(l)
#define MCACHE_LOCK_TRY(l) lck_mtx_try_lock(l)
103
static unsigned int ncpu;		/* number of CPUs, set in mcache_init() */
static unsigned int cache_line_size;	/* cached CPU cache line size (bytes) */
static struct thread *mcache_llock_owner; /* holder of mcache_llock, or NULL */
static LCK_GRP_DECLARE(mcache_llock_grp, "mcache.list");
static LCK_MTX_DECLARE(mcache_llock, &mcache_llock_grp);
static struct zone *mcache_zone;	/* zone backing mcache_t allocations */
static const uint32_t mcache_reap_interval = 15;	/* reap period (seconds) */
static const uint32_t mcache_reap_interval_leeway = 2;	/* timer leeway (seconds) */
static UInt32 mcache_reaping;		/* non-zero while a reap is in flight */
static int mcache_ready;		/* set once mcache_init() completes */
static int mcache_updating;		/* periodic cache update has started */

/* Contention threshold used when deciding to resize cache buckets */
static int mcache_bkt_contention = 3;
#if DEBUG
static unsigned int mcache_flags = MCF_DEBUG;
#else
static unsigned int mcache_flags = 0;
#endif

/* Maximum number of transactions recorded per audit entry */
int mca_trn_max = MCA_TRN_MAX;

/*
 * Table of bucket types, ordered from smallest to largest bucket size.
 * Columns: { bt_bktsize, bt_minbuf, <third field>, bt_cache }.  A cache
 * whose chunk size is <= bt_minbuf skips to the next (larger-object,
 * smaller-bucket) row; bt_cache is the backing cache for buckets of
 * this size, created in mcache_init().
 * NOTE(review): the third column is not referenced in this file —
 * presumably a max-buffer bound used elsewhere; confirm against mcache.h.
 */
static mcache_bkttype_t mcache_bkttype[] = {
	{ 1, 4096, 32768, NULL },
	{ 3, 2048, 16384, NULL },
	{ 7, 1024, 12288, NULL },
	{ 15, 256, 8192, NULL },
	{ 31, 64, 4096, NULL },
	{ 47, 0, 2048, NULL },
	{ 63, 0, 1024, NULL },
	{ 95, 0, 512, NULL },
	{ 143, 0, 256, NULL },
	{ 165, 0, 0, NULL },
};

/* Forward declarations for the static helpers defined below */
static mcache_t *mcache_create_common(const char *, size_t, size_t,
    mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
    mcache_notifyfn_t, void *, u_int32_t, int);
static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
static void mcache_cpu_batch_refill(mcache_cpu_t *, mcache_bkt_t *, int);
static uint32_t mcache_bkt_batch_alloc(mcache_t *, mcache_bktlist_t *,
    mcache_bkt_t **, uint32_t);
static void mcache_bkt_batch_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
static void mcache_cache_bkt_enable(mcache_t *);
static void mcache_bkt_purge(mcache_t *);
static void mcache_bkt_destroy(mcache_t *, mcache_bkt_t *, int);
static void mcache_bkt_ws_update(mcache_t *);
static void mcache_bkt_ws_zero(mcache_t *);
static void mcache_bkt_ws_reap(mcache_t *);
static void mcache_dispatch(void (*)(void *), void *);
static void mcache_cache_reap(mcache_t *);
static void mcache_cache_update(mcache_t *);
static void mcache_cache_bkt_resize(void *);
static void mcache_cache_enable(void *);
static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
static void mcache_update_timeout(void *);
static void mcache_applyall(void (*)(mcache_t *));
static void mcache_reap_start(void *);
static void mcache_reap_done(void *);
static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
static void mcache_notify(mcache_t *, u_int32_t);
static void mcache_purge(void *);

/* Global list of all caches, protected by MCACHE_LIST_LOCK() */
static LIST_HEAD(, mcache) mcache_head;
mcache_t *mcache_audit_cache;		/* cache backing mcache_audit_t records */

/* Deferred-work thread calls for periodic reap/update */
static thread_call_t mcache_reap_tcall;
static thread_call_t mcache_update_tcall;
175
176 /*
177 * Initialize the framework; this is currently called as part of BSD init.
178 */
179 __private_extern__ void
mcache_init(void)180 mcache_init(void)
181 {
182 mcache_bkttype_t *btp;
183 unsigned int i;
184 char name[32];
185
186 VERIFY(mca_trn_max >= 2);
187
188 ncpu = ml_wait_max_cpus();
189 (void) mcache_cache_line_size(); /* prime it */
190
191 mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
192 mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
193 if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) {
194 panic("mcache_init: thread_call_allocate failed");
195 /* NOTREACHED */
196 __builtin_unreachable();
197 }
198
199 mcache_zone = zone_create("mcache", MCACHE_ALLOC_SIZE,
200 ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE);
201
202 LIST_INIT(&mcache_head);
203
204 for (i = 0; i < sizeof(mcache_bkttype) / sizeof(*btp); i++) {
205 btp = &mcache_bkttype[i];
206 (void) snprintf(name, sizeof(name), "bkt_%d",
207 btp->bt_bktsize);
208 btp->bt_cache = mcache_create(name,
209 (btp->bt_bktsize + 1) * sizeof(void *), 0, 0, MCR_SLEEP);
210 }
211
212 PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
213 mcache_flags &= MCF_FLAGS_MASK;
214
215 mcache_audit_cache = mcache_create("audit", sizeof(mcache_audit_t),
216 0, 0, MCR_SLEEP);
217
218 mcache_applyall(mcache_cache_bkt_enable);
219 mcache_ready = 1;
220
221 printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
222 ncpu, CPU_CACHE_LINE_SIZE);
223 }
224
225 /*
226 * Return the global mcache flags.
227 */
228 __private_extern__ unsigned int
mcache_getflags(void)229 mcache_getflags(void)
230 {
231 return mcache_flags;
232 }
233
234 /*
235 * Return the CPU cache line size.
236 */
237 __private_extern__ unsigned int
mcache_cache_line_size(void)238 mcache_cache_line_size(void)
239 {
240 if (cache_line_size == 0) {
241 ml_cpu_info_t cpu_info;
242 ml_cpu_get_info(&cpu_info);
243 cache_line_size = (unsigned int)cpu_info.cache_line_size;
244 }
245 return cache_line_size;
246 }
247
248 /*
249 * Create a cache using the zone allocator as the backend slab allocator.
250 * The caller may specify any alignment for the object; if it specifies 0
251 * the default alignment (MCACHE_ALIGN) will be used.
252 */
253 __private_extern__ mcache_t *
mcache_create(const char * name,size_t bufsize,size_t align,u_int32_t flags,int wait __unused)254 mcache_create(const char *name, size_t bufsize, size_t align,
255 u_int32_t flags, int wait __unused)
256 {
257 return mcache_create_common(name, bufsize, align, mcache_slab_alloc,
258 mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1);
259 }
260
261 /*
262 * Create a cache using a custom backend slab allocator. Since the caller
263 * is responsible for allocation, no alignment guarantee will be provided
264 * by this framework.
265 */
266 __private_extern__ mcache_t *
mcache_create_ext(const char * name,size_t bufsize,mcache_allocfn_t allocfn,mcache_freefn_t freefn,mcache_auditfn_t auditfn,mcache_logfn_t logfn,mcache_notifyfn_t notifyfn,void * arg,u_int32_t flags,int wait __unused)267 mcache_create_ext(const char *name, size_t bufsize,
268 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
269 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
270 u_int32_t flags, int wait __unused)
271 {
272 return mcache_create_common(name, bufsize, 0, allocfn,
273 freefn, auditfn, logfn, notifyfn, arg, flags, 0);
274 }
275
/*
 * Common cache creation routine.
 *
 * Allocates the mcache_t (with its per-CPU tail) from mcache_zone,
 * manually cache-line-aligns it, initializes the slab callbacks, the
 * bucket layer locks and the per-CPU layer, then inserts the cache
 * into the global list.  When need_zone is set, a dedicated zone is
 * created to back the slab layer; otherwise the caller's callbacks
 * and argument are used and no alignment is guaranteed.
 */
static mcache_t *
mcache_create_common(const char *name, size_t bufsize, size_t align,
    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
    mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
    u_int32_t flags, int need_zone)
{
	mcache_bkttype_t *btp;
	mcache_t *cp = NULL;
	size_t chunksize;
	void *buf, **pbuf;
	unsigned int c;
	char lck_name[64];

	buf = zalloc_flags(mcache_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/*
	 * In case we didn't get a cache-aligned memory, round it up
	 * accordingly.  This is needed in order to get the rest of
	 * structure members aligned properly.  It also means that
	 * the memory span gets shifted due to the round up, but it
	 * is okay since we've allocated extra space for this.
	 */
	cp = (mcache_t *)
	    P2ROUNDUP((intptr_t)buf + sizeof(void *), CPU_CACHE_LINE_SIZE);
	/* Stash the original (unaligned) address just below cp for zfree */
	pbuf = (void **)((intptr_t)cp - sizeof(void *));
	*pbuf = buf;

	/*
	 * Guaranteed alignment is valid only when we use the internal
	 * slab allocator (currently set to use the zone allocator).
	 */
	if (!need_zone) {
		align = 1;
	} else {
		/* Enforce 64-bit minimum alignment for zone-based buffers */
		if (align == 0) {
			align = MCACHE_ALIGN;
		}
		align = P2ROUNDUP(align, MCACHE_ALIGN);
	}

	/* Alignment must be a power of two */
	if ((align & (align - 1)) != 0) {
		panic("mcache_create: bad alignment %lu", align);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	cp->mc_align = align;
	cp->mc_slab_alloc = allocfn;
	cp->mc_slab_free = freefn;
	cp->mc_slab_audit = auditfn;
	cp->mc_slab_log = logfn;
	cp->mc_slab_notify = notifyfn;
	/* Internal slab layer gets the cache itself as its argument */
	cp->mc_private = need_zone ? cp : arg;
	cp->mc_bufsize = bufsize;
	cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;

	(void) snprintf(cp->mc_name, sizeof(cp->mc_name), "mcache.%s", name);

	(void) snprintf(lck_name, sizeof(lck_name), "%s.cpu", cp->mc_name);
	cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name, LCK_GRP_ATTR_NULL);

	/*
	 * Allocation chunk size is the object's size plus any extra size
	 * needed to satisfy the object's alignment.  It is enforced to be
	 * at least the size of an LP64 pointer to simplify auditing and to
	 * handle multiple-element allocation requests, where the elements
	 * returned are linked together in a list.
	 */
	chunksize = MAX(bufsize, sizeof(u_int64_t));
	if (need_zone) {
		VERIFY(align != 0 && (align % MCACHE_ALIGN) == 0);
		/* Room for the backing-pointer stash plus alignment slack */
		chunksize += sizeof(uint64_t) + align;
		chunksize = P2ROUNDUP(chunksize, align);
		cp->mc_slab_zone = zone_create(cp->mc_name, chunksize,
		    ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE);
	}
	cp->mc_chunksize = chunksize;

	/*
	 * Initialize the bucket layer.
	 */
	(void) snprintf(lck_name, sizeof(lck_name), "%s.bkt", cp->mc_name);
	cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
	    LCK_GRP_ATTR_NULL);
	lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp, LCK_ATTR_NULL);

	(void) snprintf(lck_name, sizeof(lck_name), "%s.sync", cp->mc_name);
	cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
	    LCK_GRP_ATTR_NULL);
	lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp, LCK_ATTR_NULL);

	/* Pick the bucket type for this chunk size (larger chunks -> smaller buckets) */
	for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++) {
		continue;
	}

	cp->cache_bkttype = btp;

	/*
	 * Initialize the CPU layer.  Each per-CPU structure is aligned
	 * on the CPU cache line boundary to prevent false sharing.
	 */
	for (c = 0; c < ncpu; c++) {
		mcache_cpu_t *ccp = &cp->mc_cpu[c];

		VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
		lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp, LCK_ATTR_NULL);
		/* -1 marks the bucket layer as disabled for this CPU */
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
	}

	if (mcache_ready) {
		mcache_cache_bkt_enable(cp);
	}

	/* TODO: dynamically create sysctl for stats */

	MCACHE_LIST_LOCK();
	LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
	MCACHE_LIST_UNLOCK();

	/*
	 * If cache buckets are enabled and this is the first cache
	 * created, start the periodic cache update.
	 */
	if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
		mcache_updating = 1;
		mcache_update_timeout(NULL);
	}
	if (cp->mc_flags & MCF_DEBUG) {
		printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
		    "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
		    arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
	}
	return cp;
}
415
/*
 * Allocate one or more objects from a cache.
 *
 * Objects are taken, in order of preference, from: the current CPU's
 * filled bucket, the CPU's previous bucket, full buckets in the global
 * bucket layer, and finally the slab layer.  The allocated objects are
 * chained through obj_next and stored in *list; the return value is the
 * number of objects actually allocated (<= num).  Blocking requests
 * retry until satisfied; non-blocking requests may return fewer.
 */
__private_extern__ unsigned int
mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
{
	mcache_cpu_t *ccp;
	mcache_obj_t **top = &(*list);
	mcache_bkt_t *bkt;
	unsigned int need = num;
	boolean_t nwretry = FALSE;

	/* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
	VERIFY((wait & (MCR_NOSLEEP | MCR_FAILOK)) != (MCR_NOSLEEP | MCR_FAILOK));

	ASSERT(list != NULL);
	*list = NULL;

	if (num == 0) {
		return 0;
	}

retry_alloc:
	/* We may not always be running in the same CPU in case of retries */
	ccp = MCACHE_CPU(cp);

	MCACHE_LOCK(&ccp->cc_lock);
	for (;;) {
		/*
		 * If we have an object in the current CPU's filled bucket,
		 * chain the object to any previous objects and return if
		 * we've satisfied the number of requested objects.
		 */
		if (ccp->cc_objs > 0) {
			mcache_obj_t *tail;
			int objs;

			/*
			 * Objects in the bucket are already linked together
			 * with the most recently freed object at the head of
			 * the list; grab as many objects as we can.
			 */
			objs = MIN((unsigned int)ccp->cc_objs, need);
			*list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
			ccp->cc_objs -= objs;
			ccp->cc_alloc += objs;

			/* Detach the taken run from what remains in the bucket */
			tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
			list = &tail->obj_next;
			*list = NULL;

			/* If we got them all, return to caller */
			if ((need -= objs) == 0) {
				MCACHE_UNLOCK(&ccp->cc_lock);

				if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
				    cp->mc_slab_log != NULL) {
					(*cp->mc_slab_log)(num, *top, TRUE);
				}

				if (cp->mc_flags & MCF_DEBUG) {
					goto debug_alloc;
				}

				return num;
			}
		}

		/*
		 * The CPU's filled bucket is empty.  If the previous filled
		 * bucket was full, exchange and try again.
		 */
		if (ccp->cc_pobjs > 0) {
			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
			continue;
		}

		/*
		 * If the bucket layer is disabled, allocate from slab.  This
		 * can happen either because MCF_NOCPUCACHE is set, or because
		 * the bucket layer is currently being resized.
		 */
		if (ccp->cc_bktsize == 0) {
			break;
		}

		/*
		 * Both of the CPU's buckets are empty; try to get full
		 * bucket(s) from the bucket layer.  Upon success, refill
		 * this CPU and place any empty bucket into the empty list.
		 * To prevent potential thrashing, replace both empty buckets
		 * only if the requested count exceeds a bucket's worth of
		 * objects.
		 */
		(void) mcache_bkt_batch_alloc(cp, &cp->mc_full,
		    &bkt, (need <= ccp->cc_bktsize) ? 1 : 2);
		if (bkt != NULL) {
			mcache_bkt_t *bkt_list = NULL;

			/* Collect displaced empty buckets for batch return */
			if (ccp->cc_pfilled != NULL) {
				ccp->cc_pfilled->bkt_next = bkt_list;
				bkt_list = ccp->cc_pfilled;
			}
			if (bkt->bkt_next == NULL) {
				/*
				 * Bucket layer allocation returns only 1
				 * magazine; retain current empty magazine.
				 */
				mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
			} else {
				/*
				 * We got 2 full buckets from the bucket
				 * layer; release the current empty bucket
				 * back to the bucket layer.
				 */
				if (ccp->cc_filled != NULL) {
					ccp->cc_filled->bkt_next = bkt_list;
					bkt_list = ccp->cc_filled;
				}
				mcache_cpu_batch_refill(ccp, bkt,
				    ccp->cc_bktsize);
			}
			mcache_bkt_batch_free(cp, &cp->mc_empty, bkt_list);
			continue;
		}

		/*
		 * The bucket layer has no full buckets; allocate the
		 * object(s) directly from the slab layer.
		 */
		break;
	}
	MCACHE_UNLOCK(&ccp->cc_lock);

	need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);

	/*
	 * If this is a blocking allocation, or if it is non-blocking and
	 * the cache's full bucket is non-empty, then retry the allocation.
	 */
	if (need > 0) {
		if (!(wait & MCR_NONBLOCKING)) {
			os_atomic_inc(&cp->mc_wretry_cnt, relaxed);
			goto retry_alloc;
		} else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
		    !mcache_bkt_isempty(cp)) {
			if (!nwretry) {
				nwretry = TRUE;
			}
			os_atomic_inc(&cp->mc_nwretry_cnt, relaxed);
			goto retry_alloc;
		} else if (nwretry) {
			/* Non-blocking retries were attempted but still failed */
			os_atomic_inc(&cp->mc_nwfail_cnt, relaxed);
		}
	}

	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
		(*cp->mc_slab_log)((num - need), *top, TRUE);
	}

	if (!(cp->mc_flags & MCF_DEBUG)) {
		return num - need;
	}

debug_alloc:
	if (cp->mc_flags & MCF_DEBUG) {
		mcache_obj_t **o = top;
		unsigned int n;

		n = 0;
		/*
		 * Verify that the chain of objects have the same count as
		 * what we are about to report to the caller.  Any mismatch
		 * here means that the object list is insanely broken and
		 * therefore we must panic.
		 */
		while (*o != NULL) {
			o = &(*o)->obj_next;
			++n;
		}
		if (n != (num - need)) {
			panic("mcache_alloc_ext: %s cp %p corrupted list "
			    "(got %d actual %d)\n", cp->mc_name,
			    (void *)cp, num - need, n);
			/* NOTREACHED */
			__builtin_unreachable();
		}
	}

	/* Invoke the slab layer audit callback if auditing is enabled */
	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
		(*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
	}

	return num - need;
}
612
613 /*
614 * Allocate a single object from a cache.
615 */
616 __private_extern__ void *
mcache_alloc(mcache_t * cp,int wait)617 mcache_alloc(mcache_t *cp, int wait)
618 {
619 mcache_obj_t *buf;
620
621 (void) mcache_alloc_ext(cp, &buf, 1, wait);
622 return buf;
623 }
624
/* Record one more thread waiting on this cache (see mcache_free_ext notify) */
__private_extern__ void
mcache_waiter_inc(mcache_t *cp)
{
	os_atomic_inc(&cp->mc_waiter_cnt, relaxed);
}
630
/* Record that a waiting thread has stopped waiting on this cache */
__private_extern__ void
mcache_waiter_dec(mcache_t *cp)
{
	os_atomic_dec(&cp->mc_waiter_cnt, relaxed);
}
636
637 __private_extern__ boolean_t
mcache_bkt_isempty(mcache_t * cp)638 mcache_bkt_isempty(mcache_t *cp)
639 {
640 /*
641 * This isn't meant to accurately tell whether there are
642 * any full buckets in the cache; it is simply a way to
643 * obtain "hints" about the state of the cache.
644 */
645 return cp->mc_full.bl_total == 0;
646 }
647
648 /*
649 * Notify the slab layer about an event.
650 */
651 static void
mcache_notify(mcache_t * cp,u_int32_t event)652 mcache_notify(mcache_t *cp, u_int32_t event)
653 {
654 if (cp->mc_slab_notify != NULL) {
655 (*cp->mc_slab_notify)(cp->mc_private, event);
656 }
657 }
658
659 /*
660 * Purge the cache and disable its buckets.
661 */
662 static void
mcache_purge(void * arg)663 mcache_purge(void *arg)
664 {
665 mcache_t *cp = arg;
666
667 mcache_bkt_purge(cp);
668 /*
669 * We cannot simply call mcache_cache_bkt_enable() from here as
670 * a bucket resize may be in flight and we would cause the CPU
671 * layers of the cache to point to different sizes. Therefore,
672 * we simply increment the enable count so that during the next
673 * periodic cache update the buckets can be reenabled.
674 */
675 lck_mtx_lock_spin(&cp->mc_sync_lock);
676 cp->mc_enable_cnt++;
677 lck_mtx_unlock(&cp->mc_sync_lock);
678 }
679
680 __private_extern__ boolean_t
mcache_purge_cache(mcache_t * cp,boolean_t async)681 mcache_purge_cache(mcache_t *cp, boolean_t async)
682 {
683 /*
684 * Purging a cache that has no per-CPU caches or is already
685 * in the process of being purged is rather pointless.
686 */
687 if (cp->mc_flags & MCF_NOCPUCACHE) {
688 return FALSE;
689 }
690
691 lck_mtx_lock_spin(&cp->mc_sync_lock);
692 if (cp->mc_purge_cnt > 0) {
693 lck_mtx_unlock(&cp->mc_sync_lock);
694 return FALSE;
695 }
696 cp->mc_purge_cnt++;
697 lck_mtx_unlock(&cp->mc_sync_lock);
698
699 if (async) {
700 mcache_dispatch(mcache_purge, cp);
701 } else {
702 mcache_purge(cp);
703 }
704
705 return TRUE;
706 }
707
708 /*
709 * Free a single object to a cache.
710 */
711 __private_extern__ void
mcache_free(mcache_t * cp,void * buf)712 mcache_free(mcache_t *cp, void *buf)
713 {
714 ((mcache_obj_t *)buf)->obj_next = NULL;
715 mcache_free_ext(cp, (mcache_obj_t *)buf);
716 }
717
/*
 * Free one or more objects to a cache.
 *
 * Objects (chained through obj_next) are pushed, in order of preference,
 * into: the current CPU's filled bucket, the CPU's previous bucket, empty
 * buckets obtained from the global bucket layer (allocating a fresh empty
 * bucket if needed), and finally released directly to the slab layer.
 */
__private_extern__ void
mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
{
	mcache_cpu_t *ccp = MCACHE_CPU(cp);
	mcache_bkttype_t *btp;
	mcache_obj_t *nlist;
	mcache_bkt_t *bkt;

	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
		(*cp->mc_slab_log)(0, list, FALSE);
	}

	/* Invoke the slab layer audit callback if auditing is enabled */
	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
		(*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
	}

	MCACHE_LOCK(&ccp->cc_lock);
	for (;;) {
		/*
		 * If there is space in the current CPU's filled bucket, put
		 * the object there and return once all objects are freed.
		 * Note the cast to unsigned integer takes care of the case
		 * where the bucket layer is disabled (when cc_objs is -1).
		 */
		if ((unsigned int)ccp->cc_objs <
		    (unsigned int)ccp->cc_bktsize) {
			/*
			 * Reverse the list while we place the object into the
			 * bucket; this effectively causes the most recently
			 * freed object(s) to be reused during allocation.
			 */
			nlist = list->obj_next;
			list->obj_next = (ccp->cc_objs == 0) ? NULL :
			    ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
			ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
			ccp->cc_free++;

			if ((list = nlist) != NULL) {
				continue;
			}

			/* We are done; return to caller */
			MCACHE_UNLOCK(&ccp->cc_lock);

			/* If there is a waiter below, notify it */
			if (cp->mc_waiter_cnt > 0) {
				mcache_notify(cp, MCN_RETRYALLOC);
			}
			return;
		}

		/*
		 * The CPU's filled bucket is full.  If the previous filled
		 * bucket was empty, exchange and try again.
		 */
		if (ccp->cc_pobjs == 0) {
			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
			continue;
		}

		/*
		 * If the bucket layer is disabled, free to slab.  This can
		 * happen either because MCF_NOCPUCACHE is set, or because
		 * the bucket layer is currently being resized.
		 */
		if (ccp->cc_bktsize == 0) {
			break;
		}

		/*
		 * Both of the CPU's buckets are full; try to get empty
		 * buckets from the bucket layer.  Upon success, empty this
		 * CPU and place any full bucket into the full list.
		 *
		 * TODO: Because the caller currently doesn't indicate
		 * the number of objects in the list, we choose the more
		 * conservative approach of allocating only 1 empty
		 * bucket (to prevent potential thrashing).  Once we
		 * have the object count, we can replace 1 with similar
		 * logic as used in mcache_alloc_ext().
		 */
		(void) mcache_bkt_batch_alloc(cp, &cp->mc_empty, &bkt, 1);
		if (bkt != NULL) {
			mcache_bkt_t *bkt_list = NULL;

			/* Collect displaced full buckets for batch return */
			if (ccp->cc_pfilled != NULL) {
				ccp->cc_pfilled->bkt_next = bkt_list;
				bkt_list = ccp->cc_pfilled;
			}
			if (bkt->bkt_next == NULL) {
				/*
				 * Bucket layer allocation returns only 1
				 * bucket; retain current full bucket.
				 */
				mcache_cpu_refill(ccp, bkt, 0);
			} else {
				/*
				 * We got 2 empty buckets from the bucket
				 * layer; release the current full bucket
				 * back to the bucket layer.
				 */
				if (ccp->cc_filled != NULL) {
					ccp->cc_filled->bkt_next = bkt_list;
					bkt_list = ccp->cc_filled;
				}
				mcache_cpu_batch_refill(ccp, bkt, 0);
			}
			mcache_bkt_batch_free(cp, &cp->mc_full, bkt_list);
			continue;
		}
		btp = cp->cache_bkttype;

		/*
		 * We need an empty bucket to put our freed objects into
		 * but couldn't get an empty bucket from the bucket layer;
		 * attempt to allocate one.  We do not want to block for
		 * allocation here, and if the bucket allocation fails
		 * we will simply fall through to the slab layer.
		 */
		MCACHE_UNLOCK(&ccp->cc_lock);
		bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
		MCACHE_LOCK(&ccp->cc_lock);

		if (bkt != NULL) {
			/*
			 * We have an empty bucket, but since we drop the
			 * CPU lock above, the cache's bucket size may have
			 * changed.  If so, free the bucket and try again.
			 */
			if (ccp->cc_bktsize != btp->bt_bktsize) {
				MCACHE_UNLOCK(&ccp->cc_lock);
				mcache_free(btp->bt_cache, bkt);
				MCACHE_LOCK(&ccp->cc_lock);
				continue;
			}

			/*
			 * Store it in the bucket object since we'll
			 * need to refer to it during bucket destroy;
			 * we can't safely refer to cache_bkttype as
			 * the bucket lock may not be acquired then.
			 */
			bkt->bkt_type = btp;

			/*
			 * We have an empty bucket of the right size;
			 * add it to the bucket layer and try again.
			 */
			ASSERT(bkt->bkt_next == NULL);
			mcache_bkt_batch_free(cp, &cp->mc_empty, bkt);
			continue;
		}

		/*
		 * The bucket layer has no empty buckets; free the
		 * object(s) directly to the slab layer.
		 */
		break;
	}
	MCACHE_UNLOCK(&ccp->cc_lock);

	/* If there is a waiter below, notify it */
	if (cp->mc_waiter_cnt > 0) {
		mcache_notify(cp, MCN_RETRYALLOC);
	}

	/* Advise the slab layer to purge the object(s) */
	(*cp->mc_slab_free)(cp->mc_private, list,
	    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
}
892
893 /*
894 * Cache destruction routine.
895 */
896 __private_extern__ void
mcache_destroy(mcache_t * cp)897 mcache_destroy(mcache_t *cp)
898 {
899 void **pbuf;
900
901 MCACHE_LIST_LOCK();
902 LIST_REMOVE(cp, mc_list);
903 MCACHE_LIST_UNLOCK();
904
905 mcache_bkt_purge(cp);
906
907 /*
908 * This cache is dead; there should be no further transaction.
909 * If it's still invoked, make sure that it induces a fault.
910 */
911 cp->mc_slab_alloc = NULL;
912 cp->mc_slab_free = NULL;
913 cp->mc_slab_audit = NULL;
914
915 lck_grp_free(cp->mc_bkt_lock_grp);
916 lck_grp_free(cp->mc_cpu_lock_grp);
917 lck_grp_free(cp->mc_sync_lock_grp);
918
919 /*
920 * TODO: We need to destroy the zone here, but cannot do it
921 * because there is no such way to achieve that. Until then
922 * the memory allocated for the zone structure is leaked.
923 * Once it is achievable, uncomment these lines:
924 *
925 * if (cp->mc_slab_zone != NULL) {
926 * zdestroy(cp->mc_slab_zone);
927 * cp->mc_slab_zone = NULL;
928 * }
929 */
930
931 /* Get the original address since we're about to free it */
932 pbuf = (void **)((intptr_t)cp - sizeof(void *));
933
934 zfree(mcache_zone, *pbuf);
935 }
936
/*
 * Internal slab allocator used as a backend for simple caches.  The current
 * implementation uses the zone allocator for simplicity reasons.
 *
 * Allocates num objects from the cache's zone, aligning each object and
 * stashing the unaligned zone address one pointer below the aligned base
 * (recovered later by mcache_slab_free).  Objects are linked into *plist
 * through obj_next; returns the number of objects allocated.
 */
static unsigned int
mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
    int wait)
{
#pragma unused(wait)
	mcache_t *cp = arg;
	unsigned int need = num;
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
	u_int32_t flags = cp->mc_flags;
	void *buf, *base, **pbuf;
	mcache_obj_t **list = *plist;

	*list = NULL;

	for (;;) {
		/* Z_NOFAIL: this never returns NULL */
		buf = zalloc_flags(cp->mc_slab_zone, Z_WAITOK | Z_NOFAIL);

		/* Get the aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    cp->mc_align);

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;

		VERIFY(((intptr_t)base + cp->mc_bufsize) <=
		    ((intptr_t)buf + cp->mc_chunksize));

		/*
		 * If auditing is enabled, patternize the contents of
		 * the buffer starting from the 64-bit aligned base to
		 * the end of the buffer; the length is rounded up to
		 * the nearest 64-bit multiply; this is because we use
		 * 64-bit memory access to set/check the pattern.
		 */
		if (flags & MCF_DEBUG) {
			VERIFY(((intptr_t)base + rsize) <=
			    ((intptr_t)buf + cp->mc_chunksize));
			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
		}

		VERIFY(IS_P2ALIGNED(base, cp->mc_align));
		*list = (mcache_obj_t *)base;

		/* Append to the chain and advance the tail pointer */
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		/* If we got them all, return to mcache */
		if (--need == 0) {
			break;
		}
	}

	return num - need;
}
999
1000 /*
1001 * Internal slab deallocator used as a backend for simple caches.
1002 */
1003 static void
mcache_slab_free(void * arg,mcache_obj_t * list,__unused boolean_t purged)1004 mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
1005 {
1006 mcache_t *cp = arg;
1007 mcache_obj_t *nlist;
1008 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1009 u_int32_t flags = cp->mc_flags;
1010 void *base;
1011 void **pbuf;
1012
1013 for (;;) {
1014 nlist = list->obj_next;
1015 list->obj_next = NULL;
1016
1017 base = list;
1018 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1019
1020 /* Get the original address since we're about to free it */
1021 pbuf = (void **)((intptr_t)base - sizeof(void *));
1022
1023 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
1024 ((intptr_t)*pbuf + cp->mc_chunksize));
1025
1026 if (flags & MCF_DEBUG) {
1027 VERIFY(((intptr_t)base + rsize) <=
1028 ((intptr_t)*pbuf + cp->mc_chunksize));
1029 mcache_audit_free_verify(NULL, base, 0, rsize);
1030 }
1031
1032 /* Free it to zone */
1033 zfree(cp->mc_slab_zone, *pbuf);
1034
1035 /* No more objects to free; return to mcache */
1036 if ((list = nlist) == NULL) {
1037 break;
1038 }
1039 }
1040 }
1041
1042 /*
1043 * Internal slab auditor for simple caches.
1044 */
1045 static void
mcache_slab_audit(void * arg,mcache_obj_t * list,boolean_t alloc)1046 mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1047 {
1048 mcache_t *cp = arg;
1049 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1050 void *base, **pbuf;
1051
1052 while (list != NULL) {
1053 mcache_obj_t *next = list->obj_next;
1054
1055 base = list;
1056 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1057
1058 /* Get the original address */
1059 pbuf = (void **)((intptr_t)base - sizeof(void *));
1060
1061 VERIFY(((intptr_t)base + rsize) <=
1062 ((intptr_t)*pbuf + cp->mc_chunksize));
1063
1064 if (!alloc) {
1065 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1066 } else {
1067 mcache_audit_free_verify_set(NULL, base, 0, rsize);
1068 }
1069
1070 list = list->obj_next = next;
1071 }
1072 }
1073
1074 /*
1075 * Refill the CPU's buckets with bkt and its follower (if any).
1076 */
1077 static void
mcache_cpu_batch_refill(mcache_cpu_t * ccp,mcache_bkt_t * bkt,int objs)1078 mcache_cpu_batch_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1079 {
1080 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1081 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1082 ASSERT(ccp->cc_bktsize > 0);
1083
1084 ccp->cc_filled = bkt;
1085 ccp->cc_objs = objs;
1086 if (__probable(bkt->bkt_next != NULL)) {
1087 ccp->cc_pfilled = bkt->bkt_next;
1088 ccp->cc_pobjs = objs;
1089 bkt->bkt_next = NULL;
1090 } else {
1091 ASSERT(bkt->bkt_next == NULL);
1092 ccp->cc_pfilled = NULL;
1093 ccp->cc_pobjs = -1;
1094 }
1095 }
1096
1097 /*
1098 * Refill the CPU's filled bucket with bkt and save the previous one.
1099 */
1100 static void
mcache_cpu_refill(mcache_cpu_t * ccp,mcache_bkt_t * bkt,int objs)1101 mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1102 {
1103 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1104 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1105 ASSERT(ccp->cc_bktsize > 0);
1106
1107 ccp->cc_pfilled = ccp->cc_filled;
1108 ccp->cc_pobjs = ccp->cc_objs;
1109 ccp->cc_filled = bkt;
1110 ccp->cc_objs = objs;
1111 }
1112
1113 /*
1114 * Get one or more buckets from the bucket layer.
1115 */
static uint32_t
mcache_bkt_batch_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t **list,
    uint32_t num)
{
	mcache_bkt_t *bkt_list = NULL;	/* chain of buckets handed back */
	mcache_bkt_t *bkt;
	uint32_t need = num;

	ASSERT(list != NULL && need > 0);

	if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
		/*
		 * The bucket layer lock is held by another CPU; increase
		 * the contention count so that we can later resize the
		 * bucket size accordingly.
		 */
		MCACHE_LOCK(&cp->mc_bkt_lock);
		cp->mc_bkt_contention++;
	}

	/* Pop up to "num" buckets off blp and chain them onto bkt_list */
	while ((bkt = blp->bl_list) != NULL) {
		blp->bl_list = bkt->bkt_next;
		bkt->bkt_next = bkt_list;
		bkt_list = bkt;
		/* Track the low-water mark used by the working-set logic */
		if (--blp->bl_total < blp->bl_min) {
			blp->bl_min = blp->bl_total;
		}
		blp->bl_alloc++;
		if (--need == 0) {
			break;
		}
	}

	MCACHE_UNLOCK(&cp->mc_bkt_lock);

	*list = bkt_list;

	/* Number of buckets actually obtained; may be fewer than "num" */
	return num - need;
}
1155
1156 /*
1157 * Return one or more buckets to the bucket layer.
1158 */
1159 static void
mcache_bkt_batch_free(mcache_t * cp,mcache_bktlist_t * blp,mcache_bkt_t * bkt)1160 mcache_bkt_batch_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1161 {
1162 mcache_bkt_t *nbkt;
1163
1164 MCACHE_LOCK(&cp->mc_bkt_lock);
1165 while (bkt != NULL) {
1166 nbkt = bkt->bkt_next;
1167 bkt->bkt_next = blp->bl_list;
1168 blp->bl_list = bkt;
1169 blp->bl_total++;
1170 bkt = nbkt;
1171 }
1172 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1173 }
1174
1175 /*
1176 * Enable the bucket layer of a cache.
1177 */
1178 static void
mcache_cache_bkt_enable(mcache_t * cp)1179 mcache_cache_bkt_enable(mcache_t *cp)
1180 {
1181 mcache_cpu_t *ccp;
1182 unsigned int cpu;
1183
1184 if (cp->mc_flags & MCF_NOCPUCACHE) {
1185 return;
1186 }
1187
1188 for (cpu = 0; cpu < ncpu; cpu++) {
1189 ccp = &cp->mc_cpu[cpu];
1190 MCACHE_LOCK(&ccp->cc_lock);
1191 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1192 MCACHE_UNLOCK(&ccp->cc_lock);
1193 }
1194 }
1195
1196 /*
1197 * Purge all buckets from a cache and disable its bucket layer.
1198 */
static void
mcache_bkt_purge(mcache_t *cp)
{
	mcache_cpu_t *ccp;
	mcache_bkt_t *bp, *pbp;
	int objs, pobjs;
	unsigned int cpu;

	for (cpu = 0; cpu < ncpu; cpu++) {
		ccp = &cp->mc_cpu[cpu];

		MCACHE_LOCK(&ccp->cc_lock);

		/*
		 * Snapshot and detach both the filled and previously
		 * filled buckets while holding the CPU lock; cc_bktsize
		 * is reset to 0 until the layer is reenabled.
		 */
		bp = ccp->cc_filled;
		pbp = ccp->cc_pfilled;
		objs = ccp->cc_objs;
		pobjs = ccp->cc_pobjs;
		ccp->cc_filled = NULL;
		ccp->cc_pfilled = NULL;
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
		ccp->cc_bktsize = 0;

		MCACHE_UNLOCK(&ccp->cc_lock);

		/* Destroy the detached buckets outside the CPU lock */
		if (bp != NULL) {
			mcache_bkt_destroy(cp, bp, objs);
		}
		if (pbp != NULL) {
			mcache_bkt_destroy(cp, pbp, pobjs);
		}
	}

	/* Make everything in the bucket layer reapable, then reap it */
	mcache_bkt_ws_zero(cp);
	mcache_bkt_ws_reap(cp);
}
1235
1236 /*
1237 * Free one or more objects in the bucket to the slab layer,
1238 * and also free the bucket itself.
1239 */
1240 static void
mcache_bkt_destroy(mcache_t * cp,mcache_bkt_t * bkt,int nobjs)1241 mcache_bkt_destroy(mcache_t *cp, mcache_bkt_t *bkt, int nobjs)
1242 {
1243 if (nobjs > 0) {
1244 mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1245
1246 if (cp->mc_flags & MCF_DEBUG) {
1247 mcache_obj_t *o = top;
1248 int cnt = 0;
1249
1250 /*
1251 * Verify that the chain of objects in the bucket is
1252 * valid. Any mismatch here means a mistake when the
1253 * object(s) were freed to the CPU layer, so we panic.
1254 */
1255 while (o != NULL) {
1256 o = o->obj_next;
1257 ++cnt;
1258 }
1259 if (cnt != nobjs) {
1260 panic("mcache_bkt_destroy: %s cp %p corrupted "
1261 "list in bkt %p (nobjs %d actual %d)\n",
1262 cp->mc_name, (void *)cp, (void *)bkt,
1263 nobjs, cnt);
1264 /* NOTREACHED */
1265 __builtin_unreachable();
1266 }
1267 }
1268
1269 /* Advise the slab layer to purge the object(s) */
1270 (*cp->mc_slab_free)(cp->mc_private, top,
1271 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1272 }
1273 mcache_free(bkt->bkt_type->bt_cache, bkt);
1274 }
1275
1276 /*
1277 * Update the bucket layer working set statistics.
1278 */
1279 static void
mcache_bkt_ws_update(mcache_t * cp)1280 mcache_bkt_ws_update(mcache_t *cp)
1281 {
1282 MCACHE_LOCK(&cp->mc_bkt_lock);
1283
1284 cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1285 cp->mc_full.bl_min = cp->mc_full.bl_total;
1286 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1287 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1288
1289 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1290 }
1291
1292 /*
1293 * Mark everything as eligible for reaping (working set is zero).
1294 */
1295 static void
mcache_bkt_ws_zero(mcache_t * cp)1296 mcache_bkt_ws_zero(mcache_t *cp)
1297 {
1298 MCACHE_LOCK(&cp->mc_bkt_lock);
1299
1300 cp->mc_full.bl_reaplimit = cp->mc_full.bl_total;
1301 cp->mc_full.bl_min = cp->mc_full.bl_total;
1302 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_total;
1303 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1304
1305 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1306 }
1307
1308 /*
1309 * Reap all buckets that are beyond the working set.
1310 */
1311 static void
mcache_bkt_ws_reap(mcache_t * cp)1312 mcache_bkt_ws_reap(mcache_t *cp)
1313 {
1314 mcache_bkt_t *bkt, *nbkt;
1315 uint32_t reap;
1316
1317 reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1318 if (reap != 0) {
1319 (void) mcache_bkt_batch_alloc(cp, &cp->mc_full, &bkt, reap);
1320 while (bkt != NULL) {
1321 nbkt = bkt->bkt_next;
1322 bkt->bkt_next = NULL;
1323 mcache_bkt_destroy(cp, bkt, bkt->bkt_type->bt_bktsize);
1324 bkt = nbkt;
1325 }
1326 }
1327
1328 reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1329 if (reap != 0) {
1330 (void) mcache_bkt_batch_alloc(cp, &cp->mc_empty, &bkt, reap);
1331 while (bkt != NULL) {
1332 nbkt = bkt->bkt_next;
1333 bkt->bkt_next = NULL;
1334 mcache_bkt_destroy(cp, bkt, 0);
1335 bkt = nbkt;
1336 }
1337 }
1338 }
1339
1340 static void
mcache_reap_timeout(thread_call_param_t dummy __unused,thread_call_param_t arg)1341 mcache_reap_timeout(thread_call_param_t dummy __unused,
1342 thread_call_param_t arg)
1343 {
1344 volatile UInt32 *flag = arg;
1345
1346 ASSERT(flag == &mcache_reaping);
1347
1348 *flag = 0;
1349 }
1350
1351 static void
mcache_reap_done(void * flag)1352 mcache_reap_done(void *flag)
1353 {
1354 uint64_t deadline, leeway;
1355
1356 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1357 &deadline);
1358 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1359 NSEC_PER_SEC, &leeway);
1360 thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1361 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1362 }
1363
1364 static void
mcache_reap_start(void * arg)1365 mcache_reap_start(void *arg)
1366 {
1367 UInt32 *flag = arg;
1368
1369 ASSERT(flag == &mcache_reaping);
1370
1371 mcache_applyall(mcache_cache_reap);
1372 mcache_dispatch(mcache_reap_done, flag);
1373 }
1374
1375 __private_extern__ void
mcache_reap(void)1376 mcache_reap(void)
1377 {
1378 UInt32 *flag = &mcache_reaping;
1379
1380 if (mcache_llock_owner == current_thread() ||
1381 !OSCompareAndSwap(0, 1, flag)) {
1382 return;
1383 }
1384
1385 mcache_dispatch(mcache_reap_start, flag);
1386 }
1387
1388 __private_extern__ void
mcache_reap_now(mcache_t * cp,boolean_t purge)1389 mcache_reap_now(mcache_t *cp, boolean_t purge)
1390 {
1391 if (purge) {
1392 mcache_bkt_purge(cp);
1393 mcache_cache_bkt_enable(cp);
1394 } else {
1395 mcache_bkt_ws_zero(cp);
1396 mcache_bkt_ws_reap(cp);
1397 }
1398 }
1399
1400 static void
mcache_cache_reap(mcache_t * cp)1401 mcache_cache_reap(mcache_t *cp)
1402 {
1403 mcache_bkt_ws_reap(cp);
1404 }
1405
/*
 * Performs periodic maintenance on a cache.
 */
static void
mcache_cache_update(mcache_t *cp)
{
	int need_bkt_resize = 0;
	int need_bkt_reenable = 0;

	lck_mtx_assert(&mcache_llock, LCK_MTX_ASSERT_OWNED);

	/* Advance the working-set window for both bucket lists */
	mcache_bkt_ws_update(cp);

	/*
	 * Cache resize and post-purge reenable are mutually exclusive.
	 * If the cache was previously purged, there is no point of
	 * increasing the bucket size as there was an indication of
	 * memory pressure on the system.
	 */
	lck_mtx_lock_spin(&cp->mc_sync_lock);
	if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt) {
		need_bkt_reenable = 1;
	}
	lck_mtx_unlock(&cp->mc_sync_lock);

	MCACHE_LOCK(&cp->mc_bkt_lock);
	/*
	 * If the contention count is greater than the threshold, and if
	 * we are not already at the maximum bucket size, increase it.
	 * Otherwise, if this cache was previously purged by the user
	 * then we simply reenable it.
	 */
	if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
	    (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
	    mcache_bkt_contention && !need_bkt_reenable) {
		need_bkt_resize = 1;
	}

	/* Snapshot the contention counter for the next interval */
	cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
	MCACHE_UNLOCK(&cp->mc_bkt_lock);

	/* Dispatch the expensive work outside the locks */
	if (need_bkt_resize) {
		mcache_dispatch(mcache_cache_bkt_resize, cp);
	} else if (need_bkt_reenable) {
		mcache_dispatch(mcache_cache_enable, cp);
	}
}
1453
1454 /*
1455 * Recompute a cache's bucket size. This is an expensive operation
1456 * and should not be done frequently; larger buckets provide for a
1457 * higher transfer rate with the bucket while smaller buckets reduce
1458 * the memory consumption.
1459 */
static void
mcache_cache_bkt_resize(void *arg)
{
	mcache_t *cp = arg;
	mcache_bkttype_t *btp = cp->cache_bkttype;

	if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
		/* Drain all buckets of the old size first */
		mcache_bkt_purge(cp);

		/*
		 * Upgrade to the next bucket type with larger bucket size;
		 * temporarily set the previous contention snapshot to a
		 * negative number to prevent unnecessary resize request.
		 */
		MCACHE_LOCK(&cp->mc_bkt_lock);
		/* ++btp advances to the next entry of the bucket-type table */
		cp->cache_bkttype = ++btp;
		cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
		MCACHE_UNLOCK(&cp->mc_bkt_lock);

		/* Reenable the CPU layer with the new bucket size */
		mcache_cache_enable(cp);
	}
}
1482
1483 /*
1484 * Reenable a previously disabled cache due to purge.
1485 */
1486 static void
mcache_cache_enable(void * arg)1487 mcache_cache_enable(void *arg)
1488 {
1489 mcache_t *cp = arg;
1490
1491 lck_mtx_lock_spin(&cp->mc_sync_lock);
1492 cp->mc_purge_cnt = 0;
1493 cp->mc_enable_cnt = 0;
1494 lck_mtx_unlock(&cp->mc_sync_lock);
1495
1496 mcache_cache_bkt_enable(cp);
1497 }
1498
1499 static void
mcache_update_timeout(__unused void * arg)1500 mcache_update_timeout(__unused void *arg)
1501 {
1502 uint64_t deadline, leeway;
1503
1504 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1505 &deadline);
1506 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1507 NSEC_PER_SEC, &leeway);
1508 thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1509 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1510 }
1511
1512 static void
mcache_update(thread_call_param_t arg __unused,thread_call_param_t dummy __unused)1513 mcache_update(thread_call_param_t arg __unused,
1514 thread_call_param_t dummy __unused)
1515 {
1516 mcache_applyall(mcache_cache_update);
1517 mcache_update_timeout(NULL);
1518 }
1519
1520 static void
mcache_applyall(void (* func)(mcache_t *))1521 mcache_applyall(void (*func)(mcache_t *))
1522 {
1523 mcache_t *cp;
1524
1525 MCACHE_LIST_LOCK();
1526 LIST_FOREACH(cp, &mcache_head, mc_list) {
1527 func(cp);
1528 }
1529 MCACHE_LIST_UNLOCK();
1530 }
1531
1532 static void
mcache_dispatch(void (* func)(void *),void * arg)1533 mcache_dispatch(void (*func)(void *), void *arg)
1534 {
1535 ASSERT(func != NULL);
1536 timeout(func, arg, hz / 1000);
1537 }
1538
__private_extern__ void
mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
    struct timeval *base_ts)
{
	struct timeval now, base = { .tv_sec = 0, .tv_usec = 0 };
	void *stack[MCACHE_STACK_DEPTH + 1];
	struct mca_trn *transaction;

	/* Record into the next slot of the transaction ring */
	transaction = &mca->mca_trns[mca->mca_next_trn];

	mca->mca_addr = addr;
	mca->mca_cache = cp;

	transaction->mca_thread = current_thread();

	bzero(stack, sizeof(stack));
	/*
	 * Capture one extra frame and drop stack[0] below, so that
	 * this function itself is excluded from the saved backtrace.
	 */
	transaction->mca_depth = (uint16_t)OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], transaction->mca_stack,
	    sizeof(transaction->mca_stack));

	microuptime(&now);
	if (base_ts != NULL) {
		base = *base_ts;
	}
	/* tstamp is in ms relative to base_ts */
	transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
	if ((now.tv_sec - base.tv_sec) > 0) {
		transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
	}

	/* Advance the ring index, wrapping at mca_trn_max */
	mca->mca_next_trn =
	    (mca->mca_next_trn + 1) % mca_trn_max;
}
1572
1573 /*
1574 * N.B.: mcache_set_pattern(), mcache_verify_pattern() and
1575 * mcache_verify_set_pattern() are marked as noinline to prevent the
1576 * compiler from aliasing pointers when they are inlined inside the callers
1577 * (e.g. mcache_audit_free_verify_set()) which would be undefined behavior.
1578 */
1579 __private_extern__ OS_NOINLINE void
mcache_set_pattern(u_int64_t pattern,void * buf_arg,size_t size)1580 mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1581 {
1582 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1583 u_int64_t *buf = (u_int64_t *)buf_arg;
1584
1585 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1586 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1587
1588 while (buf < buf_end) {
1589 *buf++ = pattern;
1590 }
1591 }
1592
1593 __private_extern__ OS_NOINLINE void *
mcache_verify_pattern(u_int64_t pattern,void * buf_arg,size_t size)1594 mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1595 {
1596 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1597 u_int64_t *buf;
1598
1599 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1600 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1601
1602 for (buf = buf_arg; buf < buf_end; buf++) {
1603 if (*buf != pattern) {
1604 return buf;
1605 }
1606 }
1607 return NULL;
1608 }
1609
1610 OS_NOINLINE static void *
mcache_verify_set_pattern(u_int64_t old,u_int64_t new,void * buf_arg,size_t size)1611 mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1612 size_t size)
1613 {
1614 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1615 u_int64_t *buf;
1616
1617 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1618 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1619
1620 for (buf = buf_arg; buf < buf_end; buf++) {
1621 if (*buf != old) {
1622 mcache_set_pattern(old, buf_arg,
1623 (uintptr_t)buf - (uintptr_t)buf_arg);
1624 return buf;
1625 }
1626 *buf = new;
1627 }
1628 return NULL;
1629 }
1630
__private_extern__ void
mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	/* Save the linkage pointer; it legitimately differs from the pattern */
	next = ((mcache_obj_t *)addr)->obj_next;

	/* For the "obj_next" pointer in the buffer */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	/* Panic if any word deviates from the free pattern */
	if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
	    (caddr_t)base, size)) != NULL) {
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	/* Restore the linkage pointer overwritten above */
	((mcache_obj_t *)addr)->obj_next = next;
}
1654
__private_extern__ void
mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	/* Save the linkage pointer; it legitimately differs from the pattern */
	next = ((mcache_obj_t *)addr)->obj_next;

	/* For the "obj_next" pointer in the buffer */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	/* Verify the free pattern while rewriting to the uninitialized one */
	if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
	    MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	/* Restore the linkage pointer overwritten above */
	((mcache_obj_t *)addr)->obj_next = next;
}
1678
1679 #undef panic
1680
1681 #define DUMP_TRN_FMT() \
1682 "%s transaction thread %p saved PC stack (%d deep):\n" \
1683 "\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
1684 "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1685
1686 #define DUMP_TRN_FIELDS(s, x) \
1687 s, \
1688 mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
1689 mca->mca_trns[x].mca_stack[0], mca->mca_trns[x].mca_stack[1], \
1690 mca->mca_trns[x].mca_stack[2], mca->mca_trns[x].mca_stack[3], \
1691 mca->mca_trns[x].mca_stack[4], mca->mca_trns[x].mca_stack[5], \
1692 mca->mca_trns[x].mca_stack[6], mca->mca_trns[x].mca_stack[7], \
1693 mca->mca_trns[x].mca_stack[8], mca->mca_trns[x].mca_stack[9], \
1694 mca->mca_trns[x].mca_stack[10], mca->mca_trns[x].mca_stack[11], \
1695 mca->mca_trns[x].mca_stack[12], mca->mca_trns[x].mca_stack[13], \
1696 mca->mca_trns[x].mca_stack[14], mca->mca_trns[x].mca_stack[15]
1697
1698 #define MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
1699 #define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
1700
__private_extern__ char *
mcache_dump_mca(char buf[static DUMP_MCA_BUF_SIZE], mcache_audit_t *mca)
{
	/*
	 * Format the audit record plus its two most recent transactions
	 * ("last" and "previous" ring slots) into the caller's buffer;
	 * returns buf for convenient use inside panic strings.
	 */
	snprintf(buf, DUMP_MCA_BUF_SIZE,
	    "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
	    DUMP_TRN_FMT()
	    DUMP_TRN_FMT(),

	    mca, mca->mca_addr, mca->mca_cache,
	    mca->mca_cache ? mca->mca_cache->mc_name : "?",
	    mca->mca_next_trn,

	    DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
	    DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));

	return buf;
}
1718
1719 __private_extern__ void
mcache_audit_panic(mcache_audit_t * mca,void * addr,size_t offset,int64_t expected,int64_t got)1720 mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
1721 int64_t expected, int64_t got)
1722 {
1723 char buf[DUMP_MCA_BUF_SIZE];
1724
1725 if (mca == NULL) {
1726 panic("mcache_audit: buffer %p modified after free at "
1727 "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
1728 offset, got, expected);
1729 /* NOTREACHED */
1730 __builtin_unreachable();
1731 }
1732
1733 panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
1734 "(0x%llx instead of 0x%llx)\n%s\n",
1735 addr, offset, got, expected, mcache_dump_mca(buf, mca));
1736 /* NOTREACHED */
1737 __builtin_unreachable();
1738 }
1739