1 /*
2 * Copyright (c) 2006-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
34 * reserved. Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine implementation: this derivative allows multiple objects to be
38 * allocated and freed from/to the object cache in one call; in addition,
39 * it provides greater flexibility by letting the user supply a custom
40 * slab allocator (instead of the default zone allocator). Finally, no
41 * object construction/destruction takes place at the moment, although
42 * this could be added in the future to improve efficiency.
43 */
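/*
 * Illustrative usage sketch (not compiled; names such as "foo" are
 * hypothetical, while the calls are the ones defined in this file).
 * A typical client creates a cache for its fixed-size objects,
 * allocates and frees through it, and destroys it when done:
 *
 *	mcache_t *foo_cache;
 *	struct foo *fp;
 *
 *	foo_cache = mcache_create("foo", sizeof (struct foo), 0, 0, MCR_SLEEP);
 *	fp = mcache_alloc(foo_cache, MCR_SLEEP);
 *	...use fp...
 *	mcache_free(foo_cache, fp);
 *	mcache_destroy(foo_cache);
 *
 * mcache_alloc()/mcache_free() go through the per-CPU and bucket layers
 * first and fall back to the slab (zone) layer only when those are empty
 * or full, respectively.
 */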
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/queue.h>
50 #include <sys/kernel.h>
51 #include <sys/systm.h>
52
53 #include <kern/debug.h>
54 #include <kern/zalloc.h>
55 #include <kern/cpu_number.h>
56 #include <kern/locks.h>
57 #include <kern/thread_call.h>
58
59 #include <libkern/libkern.h>
60 #include <libkern/OSAtomic.h>
61 #include <libkern/OSDebug.h>
62
63 #include <mach/vm_param.h>
64 #include <machine/limits.h>
65 #include <machine/machine_routines.h>
66
67 #include <string.h>
68
69 #include <sys/mcache.h>
70
71 #define MCACHE_SIZE(n) \
72 __builtin_offsetof(mcache_t, mc_cpu[n])
73
74 /* Allocate extra in case we need to manually align the pointer */
75 #define MCACHE_ALLOC_SIZE \
76 (sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)
77
78 #define MCACHE_CPU(c) \
79 (mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))
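/*
 * A sketch of the layout assumed above (see also the alignment fix-up in
 * mcache_create_common()): the cache structure is placed at the first CPU
 * cache line boundary at least one pointer past the start of the zalloc'd
 * buffer, and the word immediately preceding it stashes the original
 * buffer address so that mcache_destroy() can hand it back to zfree().
 *
 *	buf (from mcache_zone)
 *	|
 *	v
 *	+---------+--------------------+--------------------------------+
 *	| padding | saved original buf | mcache_t + per-CPU structures  |
 *	+---------+--------------------+--------------------------------+
 *	                               ^
 *	                               cp, aligned to CPU_CACHE_LINE_SIZE
 */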
80
81 /*
82 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
83 * to serialize accesses to the global list of caches in the system.
84 * They also record the thread currently running in the critical
85 * section, so that we can avoid recursive requests to reap the
86 * caches when memory runs low.
87 */
88 #define MCACHE_LIST_LOCK() { \
89 lck_mtx_lock(&mcache_llock); \
90 mcache_llock_owner = current_thread(); \
91 }
92
93 #define MCACHE_LIST_UNLOCK() { \
94 mcache_llock_owner = NULL; \
95 lck_mtx_unlock(&mcache_llock); \
96 }
97
98 #define MCACHE_LOCK(l) lck_mtx_lock(l)
99 #define MCACHE_UNLOCK(l) lck_mtx_unlock(l)
100 #define MCACHE_LOCK_TRY(l) lck_mtx_try_lock(l)
101
102 static unsigned int ncpu;
103 static unsigned int cache_line_size;
104 static struct thread *mcache_llock_owner;
105 static LCK_GRP_DECLARE(mcache_llock_grp, "mcache.list");
106 static LCK_MTX_DECLARE(mcache_llock, &mcache_llock_grp);
107 static struct zone *mcache_zone;
108 static const uint32_t mcache_reap_interval = 15;
109 static const uint32_t mcache_reap_interval_leeway = 2;
110 static UInt32 mcache_reaping;
111 static int mcache_ready;
112 static int mcache_updating;
113
114 static int mcache_bkt_contention = 3;
115 #if DEBUG
116 static unsigned int mcache_flags = MCF_DEBUG;
117 #else
118 static unsigned int mcache_flags = 0;
119 #endif
120
121 int mca_trn_max = MCA_TRN_MAX;
122
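/*
 * Bucket types, ordered by increasing bucket size.  Each entry is
 * { bt_bktsize, bt_minbuf, bt_maxbuf, bt_cache }: the selection loop in
 * mcache_create_common() picks the first entry whose bt_minbuf is below
 * the cache's chunk size, so smaller objects get larger buckets, and
 * mcache_cache_bkt_resize() may later move the cache to the next entry
 * as long as the chunk size is below bt_maxbuf.  bt_cache points to the
 * cache backing the bucket headers themselves, created in mcache_init().
 */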
123 static mcache_bkttype_t mcache_bkttype[] = {
124 { 1, 4096, 32768, NULL },
125 { 3, 2048, 16384, NULL },
126 { 7, 1024, 12288, NULL },
127 { 15, 256, 8192, NULL },
128 { 31, 64, 4096, NULL },
129 { 47, 0, 2048, NULL },
130 { 63, 0, 1024, NULL },
131 { 95, 0, 512, NULL },
132 { 143, 0, 256, NULL },
133 { 165, 0, 0, NULL },
134 };
135
136 static mcache_t *mcache_create_common(const char *, size_t, size_t,
137 mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
138 mcache_notifyfn_t, void *, u_int32_t, int);
139 static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
140 unsigned int, int);
141 static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
142 static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
143 static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
144 static void mcache_cpu_batch_refill(mcache_cpu_t *, mcache_bkt_t *, int);
145 static uint32_t mcache_bkt_batch_alloc(mcache_t *, mcache_bktlist_t *,
146 mcache_bkt_t **, uint32_t);
147 static void mcache_bkt_batch_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
148 static void mcache_cache_bkt_enable(mcache_t *);
149 static void mcache_bkt_purge(mcache_t *);
150 static void mcache_bkt_destroy(mcache_t *, mcache_bkt_t *, int);
151 static void mcache_bkt_ws_update(mcache_t *);
152 static void mcache_bkt_ws_zero(mcache_t *);
153 static void mcache_bkt_ws_reap(mcache_t *);
154 static void mcache_dispatch(void (*)(void *), void *);
155 static void mcache_cache_reap(mcache_t *);
156 static void mcache_cache_update(mcache_t *);
157 static void mcache_cache_bkt_resize(void *);
158 static void mcache_cache_enable(void *);
159 static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
160 static void mcache_update_timeout(void *);
161 static void mcache_applyall(void (*)(mcache_t *));
162 static void mcache_reap_start(void *);
163 static void mcache_reap_done(void *);
164 static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
165 static void mcache_notify(mcache_t *, u_int32_t);
166 static void mcache_purge(void *);
167
168 static LIST_HEAD(, mcache) mcache_head;
169 mcache_t *mcache_audit_cache;
170
171 static thread_call_t mcache_reap_tcall;
172 static thread_call_t mcache_update_tcall;
173
174 /*
175 * Initialize the framework; this is currently called as part of BSD init.
176 */
177 __private_extern__ void
178 mcache_init(void)
179 {
180 mcache_bkttype_t *btp;
181 unsigned int i;
182 char name[32];
183
184 VERIFY(mca_trn_max >= 2);
185
186 ncpu = ml_wait_max_cpus();
187 (void) mcache_cache_line_size(); /* prime it */
188
189 mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
190 mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
191 if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) {
192 panic("mcache_init: thread_call_allocate failed");
193 /* NOTREACHED */
194 __builtin_unreachable();
195 }
196
197 mcache_zone = zone_create("mcache", MCACHE_ALLOC_SIZE, ZC_DESTRUCTIBLE);
198
199 LIST_INIT(&mcache_head);
200
201 for (i = 0; i < sizeof(mcache_bkttype) / sizeof(*btp); i++) {
202 btp = &mcache_bkttype[i];
203 (void) snprintf(name, sizeof(name), "bkt_%d",
204 btp->bt_bktsize);
205 btp->bt_cache = mcache_create(name,
206 (btp->bt_bktsize + 1) * sizeof(void *), 0, 0, MCR_SLEEP);
207 }
208
209 PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
210 mcache_flags &= MCF_FLAGS_MASK;
211
212 mcache_audit_cache = mcache_create("audit", sizeof(mcache_audit_t),
213 0, 0, MCR_SLEEP);
214
215 mcache_applyall(mcache_cache_bkt_enable);
216 mcache_ready = 1;
217
218 printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
219 ncpu, CPU_CACHE_LINE_SIZE);
220 }
221
222 /*
223 * Return the global mcache flags.
224 */
225 __private_extern__ unsigned int
226 mcache_getflags(void)
227 {
228 return mcache_flags;
229 }
230
231 /*
232 * Return the CPU cache line size.
233 */
234 __private_extern__ unsigned int
235 mcache_cache_line_size(void)
236 {
237 if (cache_line_size == 0) {
238 ml_cpu_info_t cpu_info;
239 ml_cpu_get_info(&cpu_info);
240 cache_line_size = (unsigned int)cpu_info.cache_line_size;
241 }
242 return cache_line_size;
243 }
244
245 /*
246 * Create a cache using the zone allocator as the backend slab allocator.
247 * The caller may specify any alignment for the object; if it specifies 0
248 * the default alignment (MCACHE_ALIGN) will be used.
249 */
250 __private_extern__ mcache_t *
251 mcache_create(const char *name, size_t bufsize, size_t align,
252 u_int32_t flags, int wait __unused)
253 {
254 return mcache_create_common(name, bufsize, align, mcache_slab_alloc,
255 mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1);
256 }
257
258 /*
259 * Create a cache using a custom backend slab allocator. Since the caller
260 * is responsible for allocation, no alignment guarantee will be provided
261 * by this framework.
262 */
263 __private_extern__ mcache_t *
264 mcache_create_ext(const char *name, size_t bufsize,
265 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
266 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
267 u_int32_t flags, int wait __unused)
268 {
269 return mcache_create_common(name, bufsize, 0, allocfn,
270 freefn, auditfn, logfn, notifyfn, arg, flags, 0);
271 }
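/*
 * A hedged sketch of a custom backend for mcache_create_ext(); the
 * "my_pool" names are hypothetical, while the callback shapes mirror the
 * mcache_allocfn_t/mcache_freefn_t prototypes implemented by
 * mcache_slab_alloc() and mcache_slab_free() below.  The alloc callback
 * appends objects to the caller-provided list via *plist and returns how
 * many it actually produced; the free callback walks obj_next and returns
 * each object to its backing store.
 *
 *	static unsigned int
 *	my_pool_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
 *	{
 *		unsigned int allocated = 0;
 *
 *		while (allocated < num) {
 *			mcache_obj_t *obj = my_pool_get(arg, wait);
 *			if (obj == NULL)
 *				break;
 *			obj->obj_next = NULL;
 *			**plist = obj;
 *			*plist = &obj->obj_next;
 *			allocated++;
 *		}
 *		return (allocated);
 *	}
 *
 *	static void
 *	my_pool_free(void *arg, mcache_obj_t *list, boolean_t purged)
 *	{
 *		...walk list via obj_next and return objects to the pool...
 *	}
 *
 *	cp = mcache_create_ext("my_pool", objsize, my_pool_alloc, my_pool_free,
 *	    NULL, NULL, NULL, pool_arg, 0, MCR_SLEEP);
 */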
272
273 /*
274 * Common cache creation routine.
275 */
276 static mcache_t *
277 mcache_create_common(const char *name, size_t bufsize, size_t align,
278 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
279 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
280 u_int32_t flags, int need_zone)
281 {
282 mcache_bkttype_t *btp;
283 mcache_t *cp = NULL;
284 size_t chunksize;
285 void *buf, **pbuf;
286 unsigned int c;
287 char lck_name[64];
288
289 buf = zalloc_flags(mcache_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
290
291 /*
292 * In case we didn't get cache-aligned memory, round it up
293 * accordingly. This is needed in order to get the rest of
294 * structure members aligned properly. It also means that
295 * the memory span gets shifted due to the round up, but it
296 * is okay since we've allocated extra space for this.
297 */
298 cp = (mcache_t *)
299 P2ROUNDUP((intptr_t)buf + sizeof(void *), CPU_CACHE_LINE_SIZE);
300 pbuf = (void **)((intptr_t)cp - sizeof(void *));
301 *pbuf = buf;
302
303 /*
304 * Guaranteed alignment is valid only when we use the internal
305 * slab allocator (currently set to use the zone allocator).
306 */
307 if (!need_zone) {
308 align = 1;
309 } else {
310 /* Enforce 64-bit minimum alignment for zone-based buffers */
311 if (align == 0) {
312 align = MCACHE_ALIGN;
313 }
314 align = P2ROUNDUP(align, MCACHE_ALIGN);
315 }
316
317 if ((align & (align - 1)) != 0) {
318 panic("mcache_create: bad alignment %lu", align);
319 /* NOTREACHED */
320 __builtin_unreachable();
321 }
322
323 cp->mc_align = align;
324 cp->mc_slab_alloc = allocfn;
325 cp->mc_slab_free = freefn;
326 cp->mc_slab_audit = auditfn;
327 cp->mc_slab_log = logfn;
328 cp->mc_slab_notify = notifyfn;
329 cp->mc_private = need_zone ? cp : arg;
330 cp->mc_bufsize = bufsize;
331 cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;
332
333 (void) snprintf(cp->mc_name, sizeof(cp->mc_name), "mcache.%s", name);
334
335 (void) snprintf(lck_name, sizeof(lck_name), "%s.cpu", cp->mc_name);
336 cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name, LCK_GRP_ATTR_NULL);
337
338 /*
339 * Allocation chunk size is the object's size plus any extra size
340 * needed to satisfy the object's alignment. It is enforced to be
341 * at least the size of an LP64 pointer to simplify auditing and to
342 * handle multiple-element allocation requests, where the elements
343 * returned are linked together in a list.
344 */
345 chunksize = MAX(bufsize, sizeof(u_int64_t));
346 if (need_zone) {
347 VERIFY(align != 0 && (align % MCACHE_ALIGN) == 0);
348 chunksize += sizeof(uint64_t) + align;
349 chunksize = P2ROUNDUP(chunksize, align);
350 cp->mc_slab_zone = zone_create(cp->mc_name, chunksize, ZC_DESTRUCTIBLE);
351 }
352 cp->mc_chunksize = chunksize;
353
354 /*
355 * Initialize the bucket layer.
356 */
357 (void) snprintf(lck_name, sizeof(lck_name), "%s.bkt", cp->mc_name);
358 cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
359 LCK_GRP_ATTR_NULL);
360 lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp, LCK_ATTR_NULL);
361
362 (void) snprintf(lck_name, sizeof(lck_name), "%s.sync", cp->mc_name);
363 cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
364 LCK_GRP_ATTR_NULL);
365 lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp, LCK_ATTR_NULL);
366
367 for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++) {
368 continue;
369 }
370
371 cp->cache_bkttype = btp;
372
373 /*
374 * Initialize the CPU layer. Each per-CPU structure is aligned
375 * on the CPU cache line boundary to prevent false sharing.
376 */
377 for (c = 0; c < ncpu; c++) {
378 mcache_cpu_t *ccp = &cp->mc_cpu[c];
379
380 VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
381 lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp, LCK_ATTR_NULL);
382 ccp->cc_objs = -1;
383 ccp->cc_pobjs = -1;
384 }
385
386 if (mcache_ready) {
387 mcache_cache_bkt_enable(cp);
388 }
389
390 /* TODO: dynamically create sysctl for stats */
391
392 MCACHE_LIST_LOCK();
393 LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
394 MCACHE_LIST_UNLOCK();
395
396 /*
397 * If cache buckets are enabled and this is the first cache
398 * created, start the periodic cache update.
399 */
400 if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
401 mcache_updating = 1;
402 mcache_update_timeout(NULL);
403 }
404 if (cp->mc_flags & MCF_DEBUG) {
405 printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
406 "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
407 arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
408 }
409 return cp;
410 }
411
412 /*
413 * Allocate one or more objects from a cache.
414 */
415 __private_extern__ unsigned int
416 mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
417 {
418 mcache_cpu_t *ccp;
419 mcache_obj_t **top = &(*list);
420 mcache_bkt_t *bkt;
421 unsigned int need = num;
422 boolean_t nwretry = FALSE;
423
424 /* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
425 VERIFY((wait & (MCR_NOSLEEP | MCR_FAILOK)) != (MCR_NOSLEEP | MCR_FAILOK));
426
427 ASSERT(list != NULL);
428 *list = NULL;
429
430 if (num == 0) {
431 return 0;
432 }
433
434 retry_alloc:
435 /* We may not always be running in the same CPU in case of retries */
436 ccp = MCACHE_CPU(cp);
437
438 MCACHE_LOCK(&ccp->cc_lock);
439 for (;;) {
440 /*
441 * If we have an object in the current CPU's filled bucket,
442 * chain the object to any previous objects and return if
443 * we've satisfied the number of requested objects.
444 */
445 if (ccp->cc_objs > 0) {
446 mcache_obj_t *tail;
447 int objs;
448
449 /*
450 * Objects in the bucket are already linked together
451 * with the most recently freed object at the head of
452 * the list; grab as many objects as we can.
453 */
454 objs = MIN((unsigned int)ccp->cc_objs, need);
455 *list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
456 ccp->cc_objs -= objs;
457 ccp->cc_alloc += objs;
458
459 tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
460 list = &tail->obj_next;
461 *list = NULL;
462
463 /* If we got them all, return to caller */
464 if ((need -= objs) == 0) {
465 MCACHE_UNLOCK(&ccp->cc_lock);
466
467 if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
468 cp->mc_slab_log != NULL) {
469 (*cp->mc_slab_log)(num, *top, TRUE);
470 }
471
472 if (cp->mc_flags & MCF_DEBUG) {
473 goto debug_alloc;
474 }
475
476 return num;
477 }
478 }
479
480 /*
481 * The CPU's filled bucket is empty. If the previous filled
482 * bucket was full, exchange and try again.
483 */
484 if (ccp->cc_pobjs > 0) {
485 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
486 continue;
487 }
488
489 /*
490 * If the bucket layer is disabled, allocate from slab. This
491 * can happen either because MCF_NOCPUCACHE is set, or because
492 * the bucket layer is currently being resized.
493 */
494 if (ccp->cc_bktsize == 0) {
495 break;
496 }
497
498 /*
499 * Both of the CPU's buckets are empty; try to get full
500 * bucket(s) from the bucket layer. Upon success, refill
501 * this CPU and place any empty bucket into the empty list.
502 * To prevent potential thrashing, replace both empty buckets
503 * only if the requested count exceeds a bucket's worth of
504 * objects.
505 */
506 (void) mcache_bkt_batch_alloc(cp, &cp->mc_full,
507 &bkt, (need <= ccp->cc_bktsize) ? 1 : 2);
508 if (bkt != NULL) {
509 mcache_bkt_t *bkt_list = NULL;
510
511 if (ccp->cc_pfilled != NULL) {
512 ccp->cc_pfilled->bkt_next = bkt_list;
513 bkt_list = ccp->cc_pfilled;
514 }
515 if (bkt->bkt_next == NULL) {
516 /*
517 * Bucket layer allocation returns only 1
518 * magazine; retain current empty magazine.
519 */
520 mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
521 } else {
522 /*
523 * We got 2 full buckets from the bucket
524 * layer; release the current empty bucket
525 * back to the bucket layer.
526 */
527 if (ccp->cc_filled != NULL) {
528 ccp->cc_filled->bkt_next = bkt_list;
529 bkt_list = ccp->cc_filled;
530 }
531 mcache_cpu_batch_refill(ccp, bkt,
532 ccp->cc_bktsize);
533 }
534 mcache_bkt_batch_free(cp, &cp->mc_empty, bkt_list);
535 continue;
536 }
537
538 /*
539 * The bucket layer has no full buckets; allocate the
540 * object(s) directly from the slab layer.
541 */
542 break;
543 }
544 MCACHE_UNLOCK(&ccp->cc_lock);
545
546 need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);
547
548 /*
549 * If this is a blocking allocation, or if it is non-blocking and
550 * the cache's full bucket is non-empty, then retry the allocation.
551 */
552 if (need > 0) {
553 if (!(wait & MCR_NONBLOCKING)) {
554 atomic_add_32(&cp->mc_wretry_cnt, 1);
555 goto retry_alloc;
556 } else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
557 !mcache_bkt_isempty(cp)) {
558 if (!nwretry) {
559 nwretry = TRUE;
560 }
561 atomic_add_32(&cp->mc_nwretry_cnt, 1);
562 goto retry_alloc;
563 } else if (nwretry) {
564 atomic_add_32(&cp->mc_nwfail_cnt, 1);
565 }
566 }
567
568 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
569 (*cp->mc_slab_log)((num - need), *top, TRUE);
570 }
571
572 if (!(cp->mc_flags & MCF_DEBUG)) {
573 return num - need;
574 }
575
576 debug_alloc:
577 if (cp->mc_flags & MCF_DEBUG) {
578 mcache_obj_t **o = top;
579 unsigned int n;
580
581 n = 0;
582 /*
583 * Verify that the chain of objects has the same count as
584 * what we are about to report to the caller. Any mismatch
585 * here means that the object list is insanely broken and
586 * therefore we must panic.
587 */
588 while (*o != NULL) {
589 o = &(*o)->obj_next;
590 ++n;
591 }
592 if (n != (num - need)) {
593 panic("mcache_alloc_ext: %s cp %p corrupted list "
594 "(got %d actual %d)\n", cp->mc_name,
595 (void *)cp, num - need, n);
596 /* NOTREACHED */
597 __builtin_unreachable();
598 }
599 }
600
601 /* Invoke the slab layer audit callback if auditing is enabled */
602 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
603 (*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
604 }
605
606 return num - need;
607 }
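/*
 * Illustrative batch allocation sketch (names are hypothetical).  For a
 * non-blocking request mcache_alloc_ext() may return fewer objects than
 * asked for; the objects it does return are chained through obj_next and
 * the whole chain can be handed back with one mcache_free_ext() call.
 *
 *	mcache_obj_t *list, *obj;
 *	unsigned int i, n;
 *
 *	n = mcache_alloc_ext(foo_cache, &list, 16, MCR_NOSLEEP);
 *	for (obj = list, i = 0; i < n; i++, obj = obj->obj_next) {
 *		...initialize the object...
 *	}
 *	...
 *	mcache_free_ext(foo_cache, list);
 */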
608
609 /*
610 * Allocate a single object from a cache.
611 */
612 __private_extern__ void *
613 mcache_alloc(mcache_t *cp, int wait)
614 {
615 mcache_obj_t *buf;
616
617 (void) mcache_alloc_ext(cp, &buf, 1, wait);
618 return buf;
619 }
620
621 __private_extern__ void
622 mcache_waiter_inc(mcache_t *cp)
623 {
624 atomic_add_32(&cp->mc_waiter_cnt, 1);
625 }
626
627 __private_extern__ void
628 mcache_waiter_dec(mcache_t *cp)
629 {
630 atomic_add_32(&cp->mc_waiter_cnt, -1);
631 }
632
633 __private_extern__ boolean_t
634 mcache_bkt_isempty(mcache_t *cp)
635 {
636 /*
637 * This isn't meant to accurately tell whether there are
638 * any full buckets in the cache; it is simply a way to
639 * obtain "hints" about the state of the cache.
640 */
641 return cp->mc_full.bl_total == 0;
642 }
643
644 /*
645 * Notify the slab layer about an event.
646 */
647 static void
648 mcache_notify(mcache_t *cp, u_int32_t event)
649 {
650 if (cp->mc_slab_notify != NULL) {
651 (*cp->mc_slab_notify)(cp->mc_private, event);
652 }
653 }
654
655 /*
656 * Purge the cache and disable its buckets.
657 */
658 static void
659 mcache_purge(void *arg)
660 {
661 mcache_t *cp = arg;
662
663 mcache_bkt_purge(cp);
664 /*
665 * We cannot simply call mcache_cache_bkt_enable() from here as
666 * a bucket resize may be in flight and we would cause the CPU
667 * layers of the cache to point to different sizes. Therefore,
668 * we simply increment the enable count so that during the next
669 * periodic cache update the buckets can be reenabled.
670 */
671 lck_mtx_lock_spin(&cp->mc_sync_lock);
672 cp->mc_enable_cnt++;
673 lck_mtx_unlock(&cp->mc_sync_lock);
674 }
675
676 __private_extern__ boolean_t
677 mcache_purge_cache(mcache_t *cp, boolean_t async)
678 {
679 /*
680 * Purging a cache that has no per-CPU caches or is already
681 * in the process of being purged is rather pointless.
682 */
683 if (cp->mc_flags & MCF_NOCPUCACHE) {
684 return FALSE;
685 }
686
687 lck_mtx_lock_spin(&cp->mc_sync_lock);
688 if (cp->mc_purge_cnt > 0) {
689 lck_mtx_unlock(&cp->mc_sync_lock);
690 return FALSE;
691 }
692 cp->mc_purge_cnt++;
693 lck_mtx_unlock(&cp->mc_sync_lock);
694
695 if (async) {
696 mcache_dispatch(mcache_purge, cp);
697 } else {
698 mcache_purge(cp);
699 }
700
701 return TRUE;
702 }
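/*
 * For example, a client reacting to memory pressure could call
 * mcache_purge_cache(cp, TRUE) to drain cp's CPU and bucket layers
 * asynchronously, or mcache_reap() (below) to kick off a reap across
 * all caches; how the pressure event is detected is up to the caller.
 */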
703
704 /*
705 * Free a single object to a cache.
706 */
707 __private_extern__ void
708 mcache_free(mcache_t *cp, void *buf)
709 {
710 ((mcache_obj_t *)buf)->obj_next = NULL;
711 mcache_free_ext(cp, (mcache_obj_t *)buf);
712 }
713
714 /*
715 * Free one or more objects to a cache.
716 */
717 __private_extern__ void
718 mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
719 {
720 mcache_cpu_t *ccp = MCACHE_CPU(cp);
721 mcache_bkttype_t *btp;
722 mcache_obj_t *nlist;
723 mcache_bkt_t *bkt;
724
725 if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
726 (*cp->mc_slab_log)(0, list, FALSE);
727 }
728
729 /* Invoke the slab layer audit callback if auditing is enabled */
730 if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
731 (*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
732 }
733
734 MCACHE_LOCK(&ccp->cc_lock);
735 for (;;) {
736 /*
737 * If there is space in the current CPU's filled bucket, put
738 * the object there and return once all objects are freed.
739 * Note the cast to unsigned integer takes care of the case
740 * where the bucket layer is disabled (when cc_objs is -1).
741 */
742 if ((unsigned int)ccp->cc_objs <
743 (unsigned int)ccp->cc_bktsize) {
744 /*
745 * Reverse the list while we place the object into the
746 * bucket; this effectively causes the most recently
747 * freed object(s) to be reused during allocation.
748 */
749 nlist = list->obj_next;
750 list->obj_next = (ccp->cc_objs == 0) ? NULL :
751 ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
752 ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
753 ccp->cc_free++;
754
755 if ((list = nlist) != NULL) {
756 continue;
757 }
758
759 /* We are done; return to caller */
760 MCACHE_UNLOCK(&ccp->cc_lock);
761
762 /* If there is a waiter below, notify it */
763 if (cp->mc_waiter_cnt > 0) {
764 mcache_notify(cp, MCN_RETRYALLOC);
765 }
766 return;
767 }
768
769 /*
770 * The CPU's filled bucket is full. If the previous filled
771 * bucket was empty, exchange and try again.
772 */
773 if (ccp->cc_pobjs == 0) {
774 mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
775 continue;
776 }
777
778 /*
779 * If the bucket layer is disabled, free to slab. This can
780 * happen either because MCF_NOCPUCACHE is set, or because
781 * the bucket layer is currently being resized.
782 */
783 if (ccp->cc_bktsize == 0) {
784 break;
785 }
786
787 /*
788 * Both of the CPU's buckets are full; try to get empty
789 * buckets from the bucket layer. Upon success, empty this
790 * CPU and place any full bucket into the full list.
791 *
792 * TODO: Because the caller currently doesn't indicate
793 * the number of objects in the list, we choose the more
794 * conservative approach of allocating only 1 empty
795 * bucket (to prevent potential thrashing). Once we
796 * have the object count, we can replace 1 with logic
797 * similar to that used in mcache_alloc_ext().
798 */
799 (void) mcache_bkt_batch_alloc(cp, &cp->mc_empty, &bkt, 1);
800 if (bkt != NULL) {
801 mcache_bkt_t *bkt_list = NULL;
802
803 if (ccp->cc_pfilled != NULL) {
804 ccp->cc_pfilled->bkt_next = bkt_list;
805 bkt_list = ccp->cc_pfilled;
806 }
807 if (bkt->bkt_next == NULL) {
808 /*
809 * Bucket layer allocation returns only 1
810 * bucket; retain current full bucket.
811 */
812 mcache_cpu_refill(ccp, bkt, 0);
813 } else {
814 /*
815 * We got 2 empty buckets from the bucket
816 * layer; release the current full bucket
817 * back to the bucket layer.
818 */
819 if (ccp->cc_filled != NULL) {
820 ccp->cc_filled->bkt_next = bkt_list;
821 bkt_list = ccp->cc_filled;
822 }
823 mcache_cpu_batch_refill(ccp, bkt, 0);
824 }
825 mcache_bkt_batch_free(cp, &cp->mc_full, bkt_list);
826 continue;
827 }
828 btp = cp->cache_bkttype;
829
830 /*
831 * We need an empty bucket to put our freed objects into
832 * but couldn't get an empty bucket from the bucket layer;
833 * attempt to allocate one. We do not want to block for
834 * allocation here, and if the bucket allocation fails
835 * we will simply fall through to the slab layer.
836 */
837 MCACHE_UNLOCK(&ccp->cc_lock);
838 bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
839 MCACHE_LOCK(&ccp->cc_lock);
840
841 if (bkt != NULL) {
842 /*
843 * We have an empty bucket, but since we drop the
844 * CPU lock above, the cache's bucket size may have
845 * changed. If so, free the bucket and try again.
846 */
847 if (ccp->cc_bktsize != btp->bt_bktsize) {
848 MCACHE_UNLOCK(&ccp->cc_lock);
849 mcache_free(btp->bt_cache, bkt);
850 MCACHE_LOCK(&ccp->cc_lock);
851 continue;
852 }
853
854 /*
855 * Store it in the bucket object since we'll
856 * need to refer to it during bucket destroy;
857 * we can't safely refer to cache_bkttype as
858 * the bucket lock may not be acquired then.
859 */
860 bkt->bkt_type = btp;
861
862 /*
863 * We have an empty bucket of the right size;
864 * add it to the bucket layer and try again.
865 */
866 ASSERT(bkt->bkt_next == NULL);
867 mcache_bkt_batch_free(cp, &cp->mc_empty, bkt);
868 continue;
869 }
870
871 /*
872 * The bucket layer has no empty buckets; free the
873 * object(s) directly to the slab layer.
874 */
875 break;
876 }
877 MCACHE_UNLOCK(&ccp->cc_lock);
878
879 /* If there is a waiter below, notify it */
880 if (cp->mc_waiter_cnt > 0) {
881 mcache_notify(cp, MCN_RETRYALLOC);
882 }
883
884 /* Advise the slab layer to purge the object(s) */
885 (*cp->mc_slab_free)(cp->mc_private, list,
886 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
887 }
888
889 /*
890 * Cache destruction routine.
891 */
892 __private_extern__ void
893 mcache_destroy(mcache_t *cp)
894 {
895 void **pbuf;
896
897 MCACHE_LIST_LOCK();
898 LIST_REMOVE(cp, mc_list);
899 MCACHE_LIST_UNLOCK();
900
901 mcache_bkt_purge(cp);
902
903 /*
904 * This cache is dead; there should be no further transactions.
905 * If it's still invoked, make sure that it induces a fault.
906 */
907 cp->mc_slab_alloc = NULL;
908 cp->mc_slab_free = NULL;
909 cp->mc_slab_audit = NULL;
910
911 lck_grp_free(cp->mc_bkt_lock_grp);
912 lck_grp_free(cp->mc_cpu_lock_grp);
913 lck_grp_free(cp->mc_sync_lock_grp);
914
915 /*
916 * TODO: We need to destroy the zone here, but cannot do it
917 * because there is currently no way to achieve that. Until then
918 * the memory allocated for the zone structure is leaked. Once
919 * it becomes achievable, uncomment these lines:
920 *
921 * if (cp->mc_slab_zone != NULL) {
922 * zdestroy(cp->mc_slab_zone);
923 * cp->mc_slab_zone = NULL;
924 * }
925 */
926
927 /* Get the original address since we're about to free it */
928 pbuf = (void **)((intptr_t)cp - sizeof(void *));
929
930 zfree(mcache_zone, *pbuf);
931 }
932
933 /*
934 * Internal slab allocator used as a backend for simple caches. The current
935 * implementation uses the zone allocator for simplicity reasons.
936 */
937 static unsigned int
938 mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
939 int wait)
940 {
941 #pragma unused(wait)
942 mcache_t *cp = arg;
943 unsigned int need = num;
944 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
945 u_int32_t flags = cp->mc_flags;
946 void *buf, *base, **pbuf;
947 mcache_obj_t **list = *plist;
948
949 *list = NULL;
950
951 for (;;) {
952 buf = zalloc_flags(cp->mc_slab_zone, Z_WAITOK | Z_NOFAIL);
953
954 /* Get the aligned base address for this object */
955 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
956 cp->mc_align);
957
958 /*
959 * Wind back a pointer size from the aligned base and
960 * save the original address so we can free it later.
961 */
962 pbuf = (void **)((intptr_t)base - sizeof(void *));
963 *pbuf = buf;
964
965 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
966 ((intptr_t)buf + cp->mc_chunksize));
967
968 /*
969 * If auditing is enabled, patternize the contents of
970 * the buffer starting from the 64-bit aligned base to
971 * the end of the buffer; the length is rounded up to
972 * the nearest 64-bit multiple; this is because we use
973 * 64-bit memory accesses to set/check the pattern.
974 */
975 if (flags & MCF_DEBUG) {
976 VERIFY(((intptr_t)base + rsize) <=
977 ((intptr_t)buf + cp->mc_chunksize));
978 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
979 }
980
981 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
982 *list = (mcache_obj_t *)base;
983
984 (*list)->obj_next = NULL;
985 list = *plist = &(*list)->obj_next;
986
987 /* If we got them all, return to mcache */
988 if (--need == 0) {
989 break;
990 }
991 }
992
993 return num - need;
994 }
995
996 /*
997 * Internal slab deallocator used as a backend for simple caches.
998 */
999 static void
1000 mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
1001 {
1002 mcache_t *cp = arg;
1003 mcache_obj_t *nlist;
1004 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1005 u_int32_t flags = cp->mc_flags;
1006 void *base;
1007 void **pbuf;
1008
1009 for (;;) {
1010 nlist = list->obj_next;
1011 list->obj_next = NULL;
1012
1013 base = list;
1014 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1015
1016 /* Get the original address since we're about to free it */
1017 pbuf = (void **)((intptr_t)base - sizeof(void *));
1018
1019 VERIFY(((intptr_t)base + cp->mc_bufsize) <=
1020 ((intptr_t)*pbuf + cp->mc_chunksize));
1021
1022 if (flags & MCF_DEBUG) {
1023 VERIFY(((intptr_t)base + rsize) <=
1024 ((intptr_t)*pbuf + cp->mc_chunksize));
1025 mcache_audit_free_verify(NULL, base, 0, rsize);
1026 }
1027
1028 /* Free it to zone */
1029 zfree(cp->mc_slab_zone, *pbuf);
1030
1031 /* No more objects to free; return to mcache */
1032 if ((list = nlist) == NULL) {
1033 break;
1034 }
1035 }
1036 }
1037
1038 /*
1039 * Internal slab auditor for simple caches.
1040 */
1041 static void
1042 mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
1043 {
1044 mcache_t *cp = arg;
1045 size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
1046 void *base, **pbuf;
1047
1048 while (list != NULL) {
1049 mcache_obj_t *next = list->obj_next;
1050
1051 base = list;
1052 VERIFY(IS_P2ALIGNED(base, cp->mc_align));
1053
1054 /* Get the original address */
1055 pbuf = (void **)((intptr_t)base - sizeof(void *));
1056
1057 VERIFY(((intptr_t)base + rsize) <=
1058 ((intptr_t)*pbuf + cp->mc_chunksize));
1059
1060 if (!alloc) {
1061 mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
1062 } else {
1063 mcache_audit_free_verify_set(NULL, base, 0, rsize);
1064 }
1065
1066 list = list->obj_next = next;
1067 }
1068 }
1069
1070 /*
1071 * Refill the CPU's buckets with bkt and its follower (if any).
1072 */
1073 static void
1074 mcache_cpu_batch_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1075 {
1076 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1077 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1078 ASSERT(ccp->cc_bktsize > 0);
1079
1080 ccp->cc_filled = bkt;
1081 ccp->cc_objs = objs;
1082 if (__probable(bkt->bkt_next != NULL)) {
1083 ccp->cc_pfilled = bkt->bkt_next;
1084 ccp->cc_pobjs = objs;
1085 bkt->bkt_next = NULL;
1086 } else {
1087 ASSERT(bkt->bkt_next == NULL);
1088 ccp->cc_pfilled = NULL;
1089 ccp->cc_pobjs = -1;
1090 }
1091 }
1092
1093 /*
1094 * Refill the CPU's filled bucket with bkt and save the previous one.
1095 */
1096 static void
1097 mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1098 {
1099 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1100 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1101 ASSERT(ccp->cc_bktsize > 0);
1102
1103 ccp->cc_pfilled = ccp->cc_filled;
1104 ccp->cc_pobjs = ccp->cc_objs;
1105 ccp->cc_filled = bkt;
1106 ccp->cc_objs = objs;
1107 }
1108
1109 /*
1110 * Get one or more buckets from the bucket layer.
1111 */
1112 static uint32_t
1113 mcache_bkt_batch_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t **list,
1114 uint32_t num)
1115 {
1116 mcache_bkt_t *bkt_list = NULL;
1117 mcache_bkt_t *bkt;
1118 uint32_t need = num;
1119
1120 ASSERT(list != NULL && need > 0);
1121
1122 if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1123 /*
1124 * The bucket layer lock is held by another CPU; increase
1125 * the contention count so that the bucket size can later
1126 * be resized accordingly.
1127 */
1128 MCACHE_LOCK(&cp->mc_bkt_lock);
1129 cp->mc_bkt_contention++;
1130 }
1131
1132 while ((bkt = blp->bl_list) != NULL) {
1133 blp->bl_list = bkt->bkt_next;
1134 bkt->bkt_next = bkt_list;
1135 bkt_list = bkt;
1136 if (--blp->bl_total < blp->bl_min) {
1137 blp->bl_min = blp->bl_total;
1138 }
1139 blp->bl_alloc++;
1140 if (--need == 0) {
1141 break;
1142 }
1143 }
1144
1145 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1146
1147 *list = bkt_list;
1148
1149 return num - need;
1150 }
1151
1152 /*
1153 * Return one or more buckets to the bucket layer.
1154 */
1155 static void
1156 mcache_bkt_batch_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1157 {
1158 mcache_bkt_t *nbkt;
1159
1160 MCACHE_LOCK(&cp->mc_bkt_lock);
1161 while (bkt != NULL) {
1162 nbkt = bkt->bkt_next;
1163 bkt->bkt_next = blp->bl_list;
1164 blp->bl_list = bkt;
1165 blp->bl_total++;
1166 bkt = nbkt;
1167 }
1168 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1169 }
1170
1171 /*
1172 * Enable the bucket layer of a cache.
1173 */
1174 static void
1175 mcache_cache_bkt_enable(mcache_t *cp)
1176 {
1177 mcache_cpu_t *ccp;
1178 unsigned int cpu;
1179
1180 if (cp->mc_flags & MCF_NOCPUCACHE) {
1181 return;
1182 }
1183
1184 for (cpu = 0; cpu < ncpu; cpu++) {
1185 ccp = &cp->mc_cpu[cpu];
1186 MCACHE_LOCK(&ccp->cc_lock);
1187 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1188 MCACHE_UNLOCK(&ccp->cc_lock);
1189 }
1190 }
1191
1192 /*
1193 * Purge all buckets from a cache and disable its bucket layer.
1194 */
1195 static void
1196 mcache_bkt_purge(mcache_t *cp)
1197 {
1198 mcache_cpu_t *ccp;
1199 mcache_bkt_t *bp, *pbp;
1200 int objs, pobjs;
1201 unsigned int cpu;
1202
1203 for (cpu = 0; cpu < ncpu; cpu++) {
1204 ccp = &cp->mc_cpu[cpu];
1205
1206 MCACHE_LOCK(&ccp->cc_lock);
1207
1208 bp = ccp->cc_filled;
1209 pbp = ccp->cc_pfilled;
1210 objs = ccp->cc_objs;
1211 pobjs = ccp->cc_pobjs;
1212 ccp->cc_filled = NULL;
1213 ccp->cc_pfilled = NULL;
1214 ccp->cc_objs = -1;
1215 ccp->cc_pobjs = -1;
1216 ccp->cc_bktsize = 0;
1217
1218 MCACHE_UNLOCK(&ccp->cc_lock);
1219
1220 if (bp != NULL) {
1221 mcache_bkt_destroy(cp, bp, objs);
1222 }
1223 if (pbp != NULL) {
1224 mcache_bkt_destroy(cp, pbp, pobjs);
1225 }
1226 }
1227
1228 mcache_bkt_ws_zero(cp);
1229 mcache_bkt_ws_reap(cp);
1230 }
1231
1232 /*
1233 * Free one or more objects in the bucket to the slab layer,
1234 * and also free the bucket itself.
1235 */
1236 static void
1237 mcache_bkt_destroy(mcache_t *cp, mcache_bkt_t *bkt, int nobjs)
1238 {
1239 if (nobjs > 0) {
1240 mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];
1241
1242 if (cp->mc_flags & MCF_DEBUG) {
1243 mcache_obj_t *o = top;
1244 int cnt = 0;
1245
1246 /*
1247 * Verify that the chain of objects in the bucket is
1248 * valid. Any mismatch here means a mistake when the
1249 * object(s) were freed to the CPU layer, so we panic.
1250 */
1251 while (o != NULL) {
1252 o = o->obj_next;
1253 ++cnt;
1254 }
1255 if (cnt != nobjs) {
1256 panic("mcache_bkt_destroy: %s cp %p corrupted "
1257 "list in bkt %p (nobjs %d actual %d)\n",
1258 cp->mc_name, (void *)cp, (void *)bkt,
1259 nobjs, cnt);
1260 /* NOTREACHED */
1261 __builtin_unreachable();
1262 }
1263 }
1264
1265 /* Advise the slab layer to purge the object(s) */
1266 (*cp->mc_slab_free)(cp->mc_private, top,
1267 (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
1268 }
1269 mcache_free(bkt->bkt_type->bt_cache, bkt);
1270 }
1271
1272 /*
1273 * Update the bucket layer working set statistics.
1274 */
1275 static void
1276 mcache_bkt_ws_update(mcache_t *cp)
1277 {
1278 MCACHE_LOCK(&cp->mc_bkt_lock);
1279
1280 cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1281 cp->mc_full.bl_min = cp->mc_full.bl_total;
1282 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1283 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1284
1285 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1286 }
1287
1288 /*
1289 * Mark everything as eligible for reaping (working set is zero).
1290 */
1291 static void
1292 mcache_bkt_ws_zero(mcache_t *cp)
1293 {
1294 MCACHE_LOCK(&cp->mc_bkt_lock);
1295
1296 cp->mc_full.bl_reaplimit = cp->mc_full.bl_total;
1297 cp->mc_full.bl_min = cp->mc_full.bl_total;
1298 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_total;
1299 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1300
1301 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1302 }
1303
1304 /*
1305 * Reap all buckets that are beyond the working set, i.e. those left unused since the last working set update.
1306 */
1307 static void
1308 mcache_bkt_ws_reap(mcache_t *cp)
1309 {
1310 mcache_bkt_t *bkt, *nbkt;
1311 uint32_t reap;
1312
1313 reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1314 if (reap != 0) {
1315 (void) mcache_bkt_batch_alloc(cp, &cp->mc_full, &bkt, reap);
1316 while (bkt != NULL) {
1317 nbkt = bkt->bkt_next;
1318 bkt->bkt_next = NULL;
1319 mcache_bkt_destroy(cp, bkt, bkt->bkt_type->bt_bktsize);
1320 bkt = nbkt;
1321 }
1322 }
1323
1324 reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1325 if (reap != 0) {
1326 (void) mcache_bkt_batch_alloc(cp, &cp->mc_empty, &bkt, reap);
1327 while (bkt != NULL) {
1328 nbkt = bkt->bkt_next;
1329 bkt->bkt_next = NULL;
1330 mcache_bkt_destroy(cp, bkt, 0);
1331 bkt = nbkt;
1332 }
1333 }
1334 }
1335
1336 static void
1337 mcache_reap_timeout(thread_call_param_t dummy __unused,
1338 thread_call_param_t arg)
1339 {
1340 volatile UInt32 *flag = arg;
1341
1342 ASSERT(flag == &mcache_reaping);
1343
1344 *flag = 0;
1345 }
1346
1347 static void
1348 mcache_reap_done(void *flag)
1349 {
1350 uint64_t deadline, leeway;
1351
1352 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1353 &deadline);
1354 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1355 NSEC_PER_SEC, &leeway);
1356 thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1357 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1358 }
1359
1360 static void
1361 mcache_reap_start(void *arg)
1362 {
1363 UInt32 *flag = arg;
1364
1365 ASSERT(flag == &mcache_reaping);
1366
1367 mcache_applyall(mcache_cache_reap);
1368 mcache_dispatch(mcache_reap_done, flag);
1369 }
1370
1371 __private_extern__ void
1372 mcache_reap(void)
1373 {
1374 UInt32 *flag = &mcache_reaping;
1375
1376 if (mcache_llock_owner == current_thread() ||
1377 !OSCompareAndSwap(0, 1, flag)) {
1378 return;
1379 }
1380
1381 mcache_dispatch(mcache_reap_start, flag);
1382 }
1383
1384 __private_extern__ void
1385 mcache_reap_now(mcache_t *cp, boolean_t purge)
1386 {
1387 if (purge) {
1388 mcache_bkt_purge(cp);
1389 mcache_cache_bkt_enable(cp);
1390 } else {
1391 mcache_bkt_ws_zero(cp);
1392 mcache_bkt_ws_reap(cp);
1393 }
1394 }
1395
1396 static void
1397 mcache_cache_reap(mcache_t *cp)
1398 {
1399 mcache_bkt_ws_reap(cp);
1400 }
1401
1402 /*
1403 * Perform periodic maintenance on a cache.
1404 */
1405 static void
1406 mcache_cache_update(mcache_t *cp)
1407 {
1408 int need_bkt_resize = 0;
1409 int need_bkt_reenable = 0;
1410
1411 lck_mtx_assert(&mcache_llock, LCK_MTX_ASSERT_OWNED);
1412
1413 mcache_bkt_ws_update(cp);
1414
1415 /*
1416 * Cache resize and post-purge reenable are mutually exclusive.
1417 * If the cache was previously purged, there is no point in
1418 * increasing the bucket size, as the purge indicated memory
1419 * pressure on the system.
1420 */
1421 lck_mtx_lock_spin(&cp->mc_sync_lock);
1422 if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt) {
1423 need_bkt_reenable = 1;
1424 }
1425 lck_mtx_unlock(&cp->mc_sync_lock);
1426
1427 MCACHE_LOCK(&cp->mc_bkt_lock);
1428 /*
1429 * If the contention count is greater than the threshold, and if
1430 * we are not already at the maximum bucket size, increase it.
1431 * Otherwise, if this cache was previously purged by the user
1432 * then we simply reenable it.
1433 */
1434 if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
1435 (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
1436 mcache_bkt_contention && !need_bkt_reenable) {
1437 need_bkt_resize = 1;
1438 }
1439
1440 cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
1441 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1442
1443 if (need_bkt_resize) {
1444 mcache_dispatch(mcache_cache_bkt_resize, cp);
1445 } else if (need_bkt_reenable) {
1446 mcache_dispatch(mcache_cache_enable, cp);
1447 }
1448 }
1449
1450 /*
1451 * Recompute a cache's bucket size. This is an expensive operation
1452 * and should not be done frequently; larger buckets provide for a
1453 * higher transfer rate with the bucket layer, while smaller buckets
1454 * reduce memory consumption.
1455 */
1456 static void
1457 mcache_cache_bkt_resize(void *arg)
1458 {
1459 mcache_t *cp = arg;
1460 mcache_bkttype_t *btp = cp->cache_bkttype;
1461
1462 if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
1463 mcache_bkt_purge(cp);
1464
1465 /*
1466 * Upgrade to the next bucket type with larger bucket size;
1467 * temporarily set the previous contention snapshot to a
1468 * negative number to prevent unnecessary resize request.
1469 */
1470 MCACHE_LOCK(&cp->mc_bkt_lock);
1471 cp->cache_bkttype = ++btp;
1472 cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
1473 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1474
1475 mcache_cache_enable(cp);
1476 }
1477 }
1478
1479 /*
1480 * Reenable a previously disabled cache due to purge.
1481 */
1482 static void
1483 mcache_cache_enable(void *arg)
1484 {
1485 mcache_t *cp = arg;
1486
1487 lck_mtx_lock_spin(&cp->mc_sync_lock);
1488 cp->mc_purge_cnt = 0;
1489 cp->mc_enable_cnt = 0;
1490 lck_mtx_unlock(&cp->mc_sync_lock);
1491
1492 mcache_cache_bkt_enable(cp);
1493 }
1494
1495 static void
1496 mcache_update_timeout(__unused void *arg)
1497 {
1498 uint64_t deadline, leeway;
1499
1500 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1501 &deadline);
1502 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1503 NSEC_PER_SEC, &leeway);
1504 thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1505 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1506 }
1507
1508 static void
1509 mcache_update(thread_call_param_t arg __unused,
1510 thread_call_param_t dummy __unused)
1511 {
1512 mcache_applyall(mcache_cache_update);
1513 mcache_update_timeout(NULL);
1514 }
1515
1516 static void
1517 mcache_applyall(void (*func)(mcache_t *))
1518 {
1519 mcache_t *cp;
1520
1521 MCACHE_LIST_LOCK();
1522 LIST_FOREACH(cp, &mcache_head, mc_list) {
1523 func(cp);
1524 }
1525 MCACHE_LIST_UNLOCK();
1526 }
1527
1528 static void
1529 mcache_dispatch(void (*func)(void *), void *arg)
1530 {
1531 ASSERT(func != NULL);
1532 timeout(func, arg, hz / 1000);
1533 }
1534
1535 __private_extern__ void
1536 mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
1537 struct timeval *base_ts)
1538 {
1539 struct timeval now, base = { .tv_sec = 0, .tv_usec = 0 };
1540 void *stack[MCACHE_STACK_DEPTH + 1];
1541 struct mca_trn *transaction;
1542
1543 transaction = &mca->mca_trns[mca->mca_next_trn];
1544
1545 mca->mca_addr = addr;
1546 mca->mca_cache = cp;
1547
1548 transaction->mca_thread = current_thread();
1549
1550 bzero(stack, sizeof(stack));
1551 transaction->mca_depth = (uint16_t)OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
1552 bcopy(&stack[1], transaction->mca_stack,
1553 sizeof(transaction->mca_stack));
1554
1555 microuptime(&now);
1556 if (base_ts != NULL) {
1557 base = *base_ts;
1558 }
1559 /* tstamp is in ms relative to base_ts */
1560 transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
1561 if ((now.tv_sec - base.tv_sec) > 0) {
1562 transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
1563 }
1564
1565 mca->mca_next_trn =
1566 (mca->mca_next_trn + 1) % mca_trn_max;
1567 }
1568
1569 /*
1570 * N.B.: mcache_set_pattern(), mcache_verify_pattern() and
1571 * mcache_verify_set_pattern() are marked as noinline to prevent the
1572 * compiler from aliasing pointers when they are inlined inside the callers
1573 * (e.g. mcache_audit_free_verify_set()) which would be undefined behavior.
1574 */
1575 __private_extern__ OS_NOINLINE void
1576 mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1577 {
1578 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1579 u_int64_t *buf = (u_int64_t *)buf_arg;
1580
1581 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1582 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1583
1584 while (buf < buf_end) {
1585 *buf++ = pattern;
1586 }
1587 }
1588
1589 __private_extern__ OS_NOINLINE void *
1590 mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1591 {
1592 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1593 u_int64_t *buf;
1594
1595 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1596 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1597
1598 for (buf = buf_arg; buf < buf_end; buf++) {
1599 if (*buf != pattern) {
1600 return buf;
1601 }
1602 }
1603 return NULL;
1604 }
1605
1606 OS_NOINLINE static void *
1607 mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
1608 size_t size)
1609 {
1610 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1611 u_int64_t *buf;
1612
1613 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1614 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1615
1616 for (buf = buf_arg; buf < buf_end; buf++) {
1617 if (*buf != old) {
1618 mcache_set_pattern(old, buf_arg,
1619 (uintptr_t)buf - (uintptr_t)buf_arg);
1620 return buf;
1621 }
1622 *buf = new;
1623 }
1624 return NULL;
1625 }
1626
1627 __private_extern__ void
1628 mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
1629 size_t size)
1630 {
1631 void *addr;
1632 u_int64_t *oaddr64;
1633 mcache_obj_t *next;
1634
1635 addr = (void *)((uintptr_t)base + offset);
1636 next = ((mcache_obj_t *)addr)->obj_next;
1637
1638 /* For the "obj_next" pointer in the buffer */
1639 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
1640 *oaddr64 = MCACHE_FREE_PATTERN;
1641
1642 if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
1643 (caddr_t)base, size)) != NULL) {
1644 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1645 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1646 /* NOTREACHED */
1647 }
1648 ((mcache_obj_t *)addr)->obj_next = next;
1649 }
1650
1651 __private_extern__ void
1652 mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
1653 size_t size)
1654 {
1655 void *addr;
1656 u_int64_t *oaddr64;
1657 mcache_obj_t *next;
1658
1659 addr = (void *)((uintptr_t)base + offset);
1660 next = ((mcache_obj_t *)addr)->obj_next;
1661
1662 /* For the "obj_next" pointer in the buffer */
1663 oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
1664 *oaddr64 = MCACHE_FREE_PATTERN;
1665
1666 if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
1667 MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
1668 mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
1669 (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
1670 /* NOTREACHED */
1671 }
1672 ((mcache_obj_t *)addr)->obj_next = next;
1673 }
1674
1675 #undef panic
1676
1677 #define DUMP_TRN_FMT() \
1678 "%s transaction thread %p saved PC stack (%d deep):\n" \
1679 "\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
1680 "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
1681
1682 #define DUMP_TRN_FIELDS(s, x) \
1683 s, \
1684 mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
1685 mca->mca_trns[x].mca_stack[0], mca->mca_trns[x].mca_stack[1], \
1686 mca->mca_trns[x].mca_stack[2], mca->mca_trns[x].mca_stack[3], \
1687 mca->mca_trns[x].mca_stack[4], mca->mca_trns[x].mca_stack[5], \
1688 mca->mca_trns[x].mca_stack[6], mca->mca_trns[x].mca_stack[7], \
1689 mca->mca_trns[x].mca_stack[8], mca->mca_trns[x].mca_stack[9], \
1690 mca->mca_trns[x].mca_stack[10], mca->mca_trns[x].mca_stack[11], \
1691 mca->mca_trns[x].mca_stack[12], mca->mca_trns[x].mca_stack[13], \
1692 mca->mca_trns[x].mca_stack[14], mca->mca_trns[x].mca_stack[15]
1693
1694 #define MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
1695 #define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
1696
1697 __private_extern__ char *
1698 mcache_dump_mca(char buf[static DUMP_MCA_BUF_SIZE], mcache_audit_t *mca)
1699 {
1700 snprintf(buf, DUMP_MCA_BUF_SIZE,
1701 "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
1702 DUMP_TRN_FMT()
1703 DUMP_TRN_FMT(),
1704
1705 mca, mca->mca_addr, mca->mca_cache,
1706 mca->mca_cache ? mca->mca_cache->mc_name : "?",
1707 mca->mca_next_trn,
1708
1709 DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
1710 DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));
1711
1712 return buf;
1713 }
1714
1715 __private_extern__ void
1716 mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
1717 int64_t expected, int64_t got)
1718 {
1719 char buf[DUMP_MCA_BUF_SIZE];
1720
1721 if (mca == NULL) {
1722 panic("mcache_audit: buffer %p modified after free at "
1723 "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
1724 offset, got, expected);
1725 /* NOTREACHED */
1726 __builtin_unreachable();
1727 }
1728
1729 panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
1730 "(0x%llx instead of 0x%llx)\n%s\n",
1731 addr, offset, got, expected, mcache_dump_mca(buf, mca));
1732 /* NOTREACHED */
1733 __builtin_unreachable();
1734 }
1735
1736 __attribute__((noinline, cold, not_tail_called, noreturn))
1737 __private_extern__ int
1738 assfail(const char *a, const char *f, int l)
1739 {
1740 panic("assertion failed: %s, file: %s, line: %d", a, f, l);
1741 /* NOTREACHED */
1742 __builtin_unreachable();
1743 }
1744