1 /*
2 * Copyright (c) 2006-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * Memory allocator with per-CPU caching, derived from the kmem magazine
31 * concept and implementation as described in the following paper:
32 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
33 * That implementation is Copyright 2006 Sun Microsystems, Inc. All rights
34 * reserved. Use is subject to license terms.
35 *
36 * There are several major differences between this and the original kmem
37 * magazine: this derivative implementation allows for multiple objects to
38 * be allocated and freed from/to the object cache in one call; in addition,
39 * it provides for better flexibility where the user is allowed to define
40 * its own slab allocator (instead of the default zone allocator). Finally,
41 * no object construction/destruction takes place at the moment, although
42 * this could be added in future to improve efficiency.
43 */
44
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/queue.h>
50 #include <sys/kernel.h>
51 #include <sys/systm.h>
52
53 #include <kern/debug.h>
54 #include <kern/zalloc.h>
55 #include <kern/cpu_number.h>
56 #include <kern/locks.h>
57 #include <kern/thread_call.h>
58
59 #include <libkern/libkern.h>
60 #include <libkern/OSAtomic.h>
61 #include <libkern/OSDebug.h>
62
63 #include <mach/vm_param.h>
64 #include <machine/limits.h>
65 #include <machine/machine_routines.h>
66
67 #include <string.h>
68
69 #include <sys/mcache.h>
70
/* Size of an mcache_t carrying n per-CPU structures (mc_cpu is trailing) */
#define MCACHE_SIZE(n) \
	__builtin_offsetof(mcache_t, mc_cpu[n])

/* Allocate extra in case we need to manually align the pointer */
#define MCACHE_ALLOC_SIZE \
	(sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_LINE_SIZE)

/* Per-CPU structure of cache c for the CPU this thread is running on */
#define MCACHE_CPU(c) \
	(mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))

/*
 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
 * to serialize accesses to the global list of caches in the system.
 * They also record the thread currently running in the critical
 * section, so that we can avoid recursive requests to reap the
 * caches when memory runs low.
 */
#define MCACHE_LIST_LOCK() { \
	lck_mtx_lock(&mcache_llock); \
	mcache_llock_owner = current_thread(); \
}

#define MCACHE_LIST_UNLOCK() { \
	mcache_llock_owner = NULL; \
	lck_mtx_unlock(&mcache_llock); \
}

/* Shorthands for the per-cache mutexes */
#define MCACHE_LOCK(l) lck_mtx_lock(l)
#define MCACHE_UNLOCK(l) lck_mtx_unlock(l)
#define MCACHE_LOCK_TRY(l) lck_mtx_try_lock(l)
101
static unsigned int ncpu;			/* number of CPUs (set in mcache_init) */
static unsigned int cache_line_size;		/* memoized CPU cache line size */
static struct thread *mcache_llock_owner;	/* holder of mcache_llock, if any */
static LCK_GRP_DECLARE(mcache_llock_grp, "mcache.list");
static LCK_MTX_DECLARE(mcache_llock, &mcache_llock_grp);
static struct zone *mcache_zone;		/* zone backing mcache_t allocations */
static const uint32_t mcache_reap_interval = 15;	/* seconds between reaps */
static const uint32_t mcache_reap_interval_leeway = 2;
static UInt32 mcache_reaping;			/* nonzero while a reap is in flight */
static int mcache_ready;			/* set once mcache_init() completes */
static int mcache_updating;			/* periodic update timer armed */

/* Contention threshold that triggers a bucket resize */
static int mcache_bkt_contention = 3;
#if DEBUG
static unsigned int mcache_flags = MCF_DEBUG;
#else
static unsigned int mcache_flags = 0;
#endif

int mca_trn_max = MCA_TRN_MAX;

/*
 * Bucket types, from largest to smallest bucket size (first column,
 * bt_bktsize).  The second column (bt_minbuf) is compared against the
 * cache's chunk size to pick a type in mcache_create_common(); the
 * third is presumably the corresponding upper bound used for resizing
 * (TODO: confirm field names against mcache.h), and the last is the
 * cache supplying bucket structures of this type, filled in at init.
 */
static mcache_bkttype_t mcache_bkttype[] = {
	{ 1, 4096, 32768, NULL },
	{ 3, 2048, 16384, NULL },
	{ 7, 1024, 12288, NULL },
	{ 15, 256, 8192, NULL },
	{ 31, 64, 4096, NULL },
	{ 47, 0, 2048, NULL },
	{ 63, 0, 1024, NULL },
	{ 95, 0, 512, NULL },
	{ 143, 0, 256, NULL },
	{ 165, 0, 0, NULL },
};
135
/* Cache creation and the default (zone-backed) slab layer */
static mcache_t *mcache_create_common(const char *, size_t, size_t,
    mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
    mcache_notifyfn_t, void *, u_int32_t, int);
static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
/* CPU layer refill and bucket (depot) layer management */
static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
static void mcache_cpu_batch_refill(mcache_cpu_t *, mcache_bkt_t *, int);
static uint32_t mcache_bkt_batch_alloc(mcache_t *, mcache_bktlist_t *,
    mcache_bkt_t **, uint32_t);
static void mcache_bkt_batch_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
static void mcache_cache_bkt_enable(mcache_t *);
static void mcache_bkt_purge(mcache_t *);
static void mcache_bkt_destroy(mcache_t *, mcache_bkt_t *, int);
static void mcache_bkt_ws_update(mcache_t *);
static void mcache_bkt_ws_zero(mcache_t *);
static void mcache_bkt_ws_reap(mcache_t *);
/* Periodic update/reap machinery (runs via thread calls) */
static void mcache_dispatch(void (*)(void *), void *);
static void mcache_cache_reap(mcache_t *);
static void mcache_cache_update(mcache_t *);
static void mcache_cache_bkt_resize(void *);
static void mcache_cache_enable(void *);
static void mcache_update(thread_call_param_t __unused, thread_call_param_t __unused);
static void mcache_update_timeout(void *);
static void mcache_applyall(void (*)(mcache_t *));
static void mcache_reap_start(void *);
static void mcache_reap_done(void *);
static void mcache_reap_timeout(thread_call_param_t __unused, thread_call_param_t);
static void mcache_notify(mcache_t *, u_int32_t);
static void mcache_purge(void *);

/* Global list of all caches, protected by MCACHE_LIST_LOCK() */
static LIST_HEAD(, mcache) mcache_head;
mcache_t *mcache_audit_cache;	/* cache of mcache_audit_t records */

static thread_call_t mcache_reap_tcall;
static thread_call_t mcache_update_tcall;
173
174 /*
175 * Initialize the framework; this is currently called as part of BSD init.
176 */
__private_extern__ void
mcache_init(void)
{
	mcache_bkttype_t *btp;
	unsigned int i;
	char name[32];

	/* Transaction history depth; the framework requires at least 2 */
	VERIFY(mca_trn_max >= 2);

	ncpu = ml_wait_max_cpus();
	(void) mcache_cache_line_size(); /* prime it */

	/* Thread calls driving the periodic reap and update machinery */
	mcache_reap_tcall = thread_call_allocate(mcache_reap_timeout, NULL);
	mcache_update_tcall = thread_call_allocate(mcache_update, NULL);
	if (mcache_reap_tcall == NULL || mcache_update_tcall == NULL) {
		panic("mcache_init: thread_call_allocate failed");
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* Zone backing the mcache_t structures themselves */
	mcache_zone = zone_create("mcache", MCACHE_ALLOC_SIZE,
	    ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE);

	LIST_INIT(&mcache_head);

	/*
	 * Create one cache per bucket type; these caches supply the
	 * bucket structures used by every other cache's bucket layer.
	 * A bucket holds bt_bktsize object slots plus one extra pointer.
	 */
	for (i = 0; i < sizeof(mcache_bkttype) / sizeof(*btp); i++) {
		btp = &mcache_bkttype[i];
		(void) snprintf(name, sizeof(name), "bkt_%d",
		    btp->bt_bktsize);
		btp->bt_cache = mcache_create(name,
		    (btp->bt_bktsize + 1) * sizeof(void *), 0, 0, MCR_SLEEP);
	}

	/* Allow boot-args to override the compiled-in debug flags */
	PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof(mcache_flags));
	mcache_flags &= MCF_FLAGS_MASK;

	mcache_audit_cache = mcache_create("audit", sizeof(mcache_audit_t),
	    0, 0, MCR_SLEEP);

	/* Buckets were disabled during bootstrap; enable them everywhere */
	mcache_applyall(mcache_cache_bkt_enable);
	mcache_ready = 1;

	printf("mcache: %d CPU(s), %d bytes CPU cache line size\n",
	    ncpu, CPU_CACHE_LINE_SIZE);
}
222
223 /*
224 * Return the global mcache flags.
225 */
226 __private_extern__ unsigned int
mcache_getflags(void)227 mcache_getflags(void)
228 {
229 return mcache_flags;
230 }
231
232 /*
233 * Return the CPU cache line size.
234 */
235 __private_extern__ unsigned int
mcache_cache_line_size(void)236 mcache_cache_line_size(void)
237 {
238 if (cache_line_size == 0) {
239 ml_cpu_info_t cpu_info;
240 ml_cpu_get_info(&cpu_info);
241 cache_line_size = (unsigned int)cpu_info.cache_line_size;
242 }
243 return cache_line_size;
244 }
245
246 /*
247 * Create a cache using the zone allocator as the backend slab allocator.
248 * The caller may specify any alignment for the object; if it specifies 0
249 * the default alignment (MCACHE_ALIGN) will be used.
250 */
251 __private_extern__ mcache_t *
mcache_create(const char * name,size_t bufsize,size_t align,u_int32_t flags,int wait __unused)252 mcache_create(const char *name, size_t bufsize, size_t align,
253 u_int32_t flags, int wait __unused)
254 {
255 return mcache_create_common(name, bufsize, align, mcache_slab_alloc,
256 mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1);
257 }
258
259 /*
260 * Create a cache using a custom backend slab allocator. Since the caller
261 * is responsible for allocation, no alignment guarantee will be provided
262 * by this framework.
263 */
264 __private_extern__ mcache_t *
mcache_create_ext(const char * name,size_t bufsize,mcache_allocfn_t allocfn,mcache_freefn_t freefn,mcache_auditfn_t auditfn,mcache_logfn_t logfn,mcache_notifyfn_t notifyfn,void * arg,u_int32_t flags,int wait __unused)265 mcache_create_ext(const char *name, size_t bufsize,
266 mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
267 mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
268 u_int32_t flags, int wait __unused)
269 {
270 return mcache_create_common(name, bufsize, 0, allocfn,
271 freefn, auditfn, logfn, notifyfn, arg, flags, 0);
272 }
273
274 /*
275 * Common cache creation routine.
276 */
/*
 * Common cache creation routine: allocates and cache-line-aligns the
 * mcache_t, records the slab-layer callbacks, optionally creates a
 * backing zone, sizes the bucket layer, initializes the per-CPU layer,
 * and links the cache onto the global list.
 */
static mcache_t *
mcache_create_common(const char *name, size_t bufsize, size_t align,
    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
    mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
    u_int32_t flags, int need_zone)
{
	mcache_bkttype_t *btp;
	mcache_t *cp = NULL;
	size_t chunksize;
	void *buf, **pbuf;
	unsigned int c;
	char lck_name[64];

	buf = zalloc_flags(mcache_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/*
	 * In case we didn't get a cache-aligned memory, round it up
	 * accordingly. This is needed in order to get the rest of
	 * structure members aligned properly. It also means that
	 * the memory span gets shifted due to the round up, but it
	 * is okay since we've allocated extra space for this.
	 */
	cp = (mcache_t *)
	    P2ROUNDUP((intptr_t)buf + sizeof(void *), CPU_CACHE_LINE_SIZE);
	/* Stash the original (unaligned) address just below cp for zfree */
	pbuf = (void **)((intptr_t)cp - sizeof(void *));
	*pbuf = buf;

	/*
	 * Guaranteed alignment is valid only when we use the internal
	 * slab allocator (currently set to use the zone allocator).
	 */
	if (!need_zone) {
		align = 1;
	} else {
		/* Enforce 64-bit minimum alignment for zone-based buffers */
		if (align == 0) {
			align = MCACHE_ALIGN;
		}
		align = P2ROUNDUP(align, MCACHE_ALIGN);
	}

	/* Alignment must be a power of two */
	if ((align & (align - 1)) != 0) {
		panic("mcache_create: bad alignment %lu", align);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	cp->mc_align = align;
	cp->mc_slab_alloc = allocfn;
	cp->mc_slab_free = freefn;
	cp->mc_slab_audit = auditfn;
	cp->mc_slab_log = logfn;
	cp->mc_slab_notify = notifyfn;
	/* Internal slab layer gets the cache itself as its argument */
	cp->mc_private = need_zone ? cp : arg;
	cp->mc_bufsize = bufsize;
	cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;

	(void) snprintf(cp->mc_name, sizeof(cp->mc_name), "mcache.%s", name);

	(void) snprintf(lck_name, sizeof(lck_name), "%s.cpu", cp->mc_name);
	cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name, LCK_GRP_ATTR_NULL);

	/*
	 * Allocation chunk size is the object's size plus any extra size
	 * needed to satisfy the object's alignment. It is enforced to be
	 * at least the size of an LP64 pointer to simplify auditing and to
	 * handle multiple-element allocation requests, where the elements
	 * returned are linked together in a list.
	 */
	chunksize = MAX(bufsize, sizeof(u_int64_t));
	if (need_zone) {
		VERIFY(align != 0 && (align % MCACHE_ALIGN) == 0);
		/* Room for the backpointer word plus alignment slack */
		chunksize += sizeof(uint64_t) + align;
		chunksize = P2ROUNDUP(chunksize, align);
		cp->mc_slab_zone = zone_create(cp->mc_name, chunksize,
		    ZC_PGZ_USE_GUARDS | ZC_DESTRUCTIBLE);
	}
	cp->mc_chunksize = chunksize;

	/*
	 * Initialize the bucket layer.
	 */
	(void) snprintf(lck_name, sizeof(lck_name), "%s.bkt", cp->mc_name);
	cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
	    LCK_GRP_ATTR_NULL);
	lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp, LCK_ATTR_NULL);

	(void) snprintf(lck_name, sizeof(lck_name), "%s.sync", cp->mc_name);
	cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
	    LCK_GRP_ATTR_NULL);
	lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp, LCK_ATTR_NULL);

	/* Pick the first bucket type whose bt_minbuf the chunk size exceeds */
	for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++) {
		continue;
	}

	cp->cache_bkttype = btp;

	/*
	 * Initialize the CPU layer. Each per-CPU structure is aligned
	 * on the CPU cache line boundary to prevent false sharing.
	 */
	for (c = 0; c < ncpu; c++) {
		mcache_cpu_t *ccp = &cp->mc_cpu[c];

		VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_LINE_SIZE));
		lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp, LCK_ATTR_NULL);
		/* -1 marks the bucket layer as disabled for this CPU */
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
	}

	if (mcache_ready) {
		mcache_cache_bkt_enable(cp);
	}

	/* TODO: dynamically create sysctl for stats */

	MCACHE_LIST_LOCK();
	LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
	MCACHE_LIST_UNLOCK();

	/*
	 * If cache buckets are enabled and this is the first cache
	 * created, start the periodic cache update.
	 */
	if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
		mcache_updating = 1;
		mcache_update_timeout(NULL);
	}
	if (cp->mc_flags & MCF_DEBUG) {
		printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
		    "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
		    arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
	}
	return cp;
}
413
414 /*
415 * Allocate one or more objects from a cache.
416 */
/*
 * Allocate up to 'num' objects from cache 'cp' into '*list' (linked via
 * obj_next), trying the CPU layer, then the bucket layer, then the slab
 * layer.  Returns the number of objects actually allocated; 'wait'
 * carries the MCR_* blocking/retry semantics.
 */
__private_extern__ unsigned int
mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
{
	mcache_cpu_t *ccp;
	mcache_obj_t **top = &(*list);
	mcache_bkt_t *bkt;
	unsigned int need = num;
	boolean_t nwretry = FALSE;

	/* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
	VERIFY((wait & (MCR_NOSLEEP | MCR_FAILOK)) != (MCR_NOSLEEP | MCR_FAILOK));

	ASSERT(list != NULL);
	*list = NULL;

	if (num == 0) {
		return 0;
	}

retry_alloc:
	/* We may not always be running in the same CPU in case of retries */
	ccp = MCACHE_CPU(cp);

	MCACHE_LOCK(&ccp->cc_lock);
	for (;;) {
		/*
		 * If we have an object in the current CPU's filled bucket,
		 * chain the object to any previous objects and return if
		 * we've satisfied the number of requested objects.
		 */
		if (ccp->cc_objs > 0) {
			mcache_obj_t *tail;
			int objs;

			/*
			 * Objects in the bucket are already linked together
			 * with the most recently freed object at the head of
			 * the list; grab as many objects as we can.
			 */
			objs = MIN((unsigned int)ccp->cc_objs, need);
			*list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
			ccp->cc_objs -= objs;
			ccp->cc_alloc += objs;

			/* Terminate the chain after the last object taken */
			tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
			list = &tail->obj_next;
			*list = NULL;

			/* If we got them all, return to caller */
			if ((need -= objs) == 0) {
				MCACHE_UNLOCK(&ccp->cc_lock);

				if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
				    cp->mc_slab_log != NULL) {
					(*cp->mc_slab_log)(num, *top, TRUE);
				}

				if (cp->mc_flags & MCF_DEBUG) {
					goto debug_alloc;
				}

				return num;
			}
		}

		/*
		 * The CPU's filled bucket is empty. If the previous filled
		 * bucket was full, exchange and try again.
		 */
		if (ccp->cc_pobjs > 0) {
			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
			continue;
		}

		/*
		 * If the bucket layer is disabled, allocate from slab. This
		 * can happen either because MCF_NOCPUCACHE is set, or because
		 * the bucket layer is currently being resized.
		 */
		if (ccp->cc_bktsize == 0) {
			break;
		}

		/*
		 * Both of the CPU's buckets are empty; try to get full
		 * bucket(s) from the bucket layer. Upon success, refill
		 * this CPU and place any empty bucket into the empty list.
		 * To prevent potential thrashing, replace both empty buckets
		 * only if the requested count exceeds a bucket's worth of
		 * objects.
		 */
		(void) mcache_bkt_batch_alloc(cp, &cp->mc_full,
		    &bkt, (need <= ccp->cc_bktsize) ? 1 : 2);
		if (bkt != NULL) {
			mcache_bkt_t *bkt_list = NULL;

			/* Collect displaced empty buckets onto bkt_list */
			if (ccp->cc_pfilled != NULL) {
				ccp->cc_pfilled->bkt_next = bkt_list;
				bkt_list = ccp->cc_pfilled;
			}
			if (bkt->bkt_next == NULL) {
				/*
				 * Bucket layer allocation returns only 1
				 * magazine; retain current empty magazine.
				 */
				mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
			} else {
				/*
				 * We got 2 full buckets from the bucket
				 * layer; release the current empty bucket
				 * back to the bucket layer.
				 */
				if (ccp->cc_filled != NULL) {
					ccp->cc_filled->bkt_next = bkt_list;
					bkt_list = ccp->cc_filled;
				}
				mcache_cpu_batch_refill(ccp, bkt,
				    ccp->cc_bktsize);
			}
			mcache_bkt_batch_free(cp, &cp->mc_empty, bkt_list);
			continue;
		}

		/*
		 * The bucket layer has no full buckets; allocate the
		 * object(s) directly from the slab layer.
		 */
		break;
	}
	MCACHE_UNLOCK(&ccp->cc_lock);

	need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);

	/*
	 * If this is a blocking allocation, or if it is non-blocking and
	 * the cache's full bucket is non-empty, then retry the allocation.
	 */
	if (need > 0) {
		if (!(wait & MCR_NONBLOCKING)) {
			atomic_add_32(&cp->mc_wretry_cnt, 1);
			goto retry_alloc;
		} else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
		    !mcache_bkt_isempty(cp)) {
			if (!nwretry) {
				nwretry = TRUE;
			}
			atomic_add_32(&cp->mc_nwretry_cnt, 1);
			goto retry_alloc;
		} else if (nwretry) {
			/* Non-blocking retries were attempted but failed */
			atomic_add_32(&cp->mc_nwfail_cnt, 1);
		}
	}

	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
		(*cp->mc_slab_log)((num - need), *top, TRUE);
	}

	if (!(cp->mc_flags & MCF_DEBUG)) {
		return num - need;
	}

debug_alloc:
	if (cp->mc_flags & MCF_DEBUG) {
		mcache_obj_t **o = top;
		unsigned int n;

		n = 0;
		/*
		 * Verify that the chain of objects have the same count as
		 * what we are about to report to the caller. Any mismatch
		 * here means that the object list is insanely broken and
		 * therefore we must panic.
		 */
		while (*o != NULL) {
			o = &(*o)->obj_next;
			++n;
		}
		if (n != (num - need)) {
			panic("mcache_alloc_ext: %s cp %p corrupted list "
			    "(got %d actual %d)\n", cp->mc_name,
			    (void *)cp, num - need, n);
			/* NOTREACHED */
			__builtin_unreachable();
		}
	}

	/* Invoke the slab layer audit callback if auditing is enabled */
	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
		(*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);
	}

	return num - need;
}
610
611 /*
612 * Allocate a single object from a cache.
613 */
614 __private_extern__ void *
mcache_alloc(mcache_t * cp,int wait)615 mcache_alloc(mcache_t *cp, int wait)
616 {
617 mcache_obj_t *buf;
618
619 (void) mcache_alloc_ext(cp, &buf, 1, wait);
620 return buf;
621 }
622
623 __private_extern__ void
mcache_waiter_inc(mcache_t * cp)624 mcache_waiter_inc(mcache_t *cp)
625 {
626 atomic_add_32(&cp->mc_waiter_cnt, 1);
627 }
628
629 __private_extern__ void
mcache_waiter_dec(mcache_t * cp)630 mcache_waiter_dec(mcache_t *cp)
631 {
632 atomic_add_32(&cp->mc_waiter_cnt, -1);
633 }
634
635 __private_extern__ boolean_t
mcache_bkt_isempty(mcache_t * cp)636 mcache_bkt_isempty(mcache_t *cp)
637 {
638 /*
639 * This isn't meant to accurately tell whether there are
640 * any full buckets in the cache; it is simply a way to
641 * obtain "hints" about the state of the cache.
642 */
643 return cp->mc_full.bl_total == 0;
644 }
645
646 /*
647 * Notify the slab layer about an event.
648 */
649 static void
mcache_notify(mcache_t * cp,u_int32_t event)650 mcache_notify(mcache_t *cp, u_int32_t event)
651 {
652 if (cp->mc_slab_notify != NULL) {
653 (*cp->mc_slab_notify)(cp->mc_private, event);
654 }
655 }
656
657 /*
658 * Purge the cache and disable its buckets.
659 */
660 static void
mcache_purge(void * arg)661 mcache_purge(void *arg)
662 {
663 mcache_t *cp = arg;
664
665 mcache_bkt_purge(cp);
666 /*
667 * We cannot simply call mcache_cache_bkt_enable() from here as
668 * a bucket resize may be in flight and we would cause the CPU
669 * layers of the cache to point to different sizes. Therefore,
670 * we simply increment the enable count so that during the next
671 * periodic cache update the buckets can be reenabled.
672 */
673 lck_mtx_lock_spin(&cp->mc_sync_lock);
674 cp->mc_enable_cnt++;
675 lck_mtx_unlock(&cp->mc_sync_lock);
676 }
677
678 __private_extern__ boolean_t
mcache_purge_cache(mcache_t * cp,boolean_t async)679 mcache_purge_cache(mcache_t *cp, boolean_t async)
680 {
681 /*
682 * Purging a cache that has no per-CPU caches or is already
683 * in the process of being purged is rather pointless.
684 */
685 if (cp->mc_flags & MCF_NOCPUCACHE) {
686 return FALSE;
687 }
688
689 lck_mtx_lock_spin(&cp->mc_sync_lock);
690 if (cp->mc_purge_cnt > 0) {
691 lck_mtx_unlock(&cp->mc_sync_lock);
692 return FALSE;
693 }
694 cp->mc_purge_cnt++;
695 lck_mtx_unlock(&cp->mc_sync_lock);
696
697 if (async) {
698 mcache_dispatch(mcache_purge, cp);
699 } else {
700 mcache_purge(cp);
701 }
702
703 return TRUE;
704 }
705
706 /*
707 * Free a single object to a cache.
708 */
709 __private_extern__ void
mcache_free(mcache_t * cp,void * buf)710 mcache_free(mcache_t *cp, void *buf)
711 {
712 ((mcache_obj_t *)buf)->obj_next = NULL;
713 mcache_free_ext(cp, (mcache_obj_t *)buf);
714 }
715
716 /*
717 * Free one or more objects to a cache.
718 */
/*
 * Free a list of objects (linked via obj_next) back to cache 'cp',
 * preferring the CPU layer, then the bucket layer, and finally the
 * slab layer if no bucket space can be found.
 */
__private_extern__ void
mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
{
	mcache_cpu_t *ccp = MCACHE_CPU(cp);
	mcache_bkttype_t *btp;
	mcache_obj_t *nlist;
	mcache_bkt_t *bkt;

	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL) {
		(*cp->mc_slab_log)(0, list, FALSE);
	}

	/* Invoke the slab layer audit callback if auditing is enabled */
	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL) {
		(*cp->mc_slab_audit)(cp->mc_private, list, FALSE);
	}

	MCACHE_LOCK(&ccp->cc_lock);
	for (;;) {
		/*
		 * If there is space in the current CPU's filled bucket, put
		 * the object there and return once all objects are freed.
		 * Note the cast to unsigned integer takes care of the case
		 * where the bucket layer is disabled (when cc_objs is -1).
		 */
		if ((unsigned int)ccp->cc_objs <
		    (unsigned int)ccp->cc_bktsize) {
			/*
			 * Reverse the list while we place the object into the
			 * bucket; this effectively causes the most recently
			 * freed object(s) to be reused during allocation.
			 */
			nlist = list->obj_next;
			list->obj_next = (ccp->cc_objs == 0) ? NULL :
			    ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
			ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
			ccp->cc_free++;

			if ((list = nlist) != NULL) {
				continue;
			}

			/* We are done; return to caller */
			MCACHE_UNLOCK(&ccp->cc_lock);

			/* If there is a waiter below, notify it */
			if (cp->mc_waiter_cnt > 0) {
				mcache_notify(cp, MCN_RETRYALLOC);
			}
			return;
		}

		/*
		 * The CPU's filled bucket is full. If the previous filled
		 * bucket was empty, exchange and try again.
		 */
		if (ccp->cc_pobjs == 0) {
			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
			continue;
		}

		/*
		 * If the bucket layer is disabled, free to slab. This can
		 * happen either because MCF_NOCPUCACHE is set, or because
		 * the bucket layer is currently being resized.
		 */
		if (ccp->cc_bktsize == 0) {
			break;
		}

		/*
		 * Both of the CPU's buckets are full; try to get empty
		 * buckets from the bucket layer. Upon success, empty this
		 * CPU and place any full bucket into the full list.
		 *
		 * TODO: Because the caller currently doesn't indicate
		 * the number of objects in the list, we choose the more
		 * conservative approach of allocating only 1 empty
		 * bucket (to prevent potential thrashing). Once we
		 * have the object count, we can replace 1 with similar
		 * logic as used in mcache_alloc_ext().
		 */
		(void) mcache_bkt_batch_alloc(cp, &cp->mc_empty, &bkt, 1);
		if (bkt != NULL) {
			mcache_bkt_t *bkt_list = NULL;

			/* Collect displaced full buckets onto bkt_list */
			if (ccp->cc_pfilled != NULL) {
				ccp->cc_pfilled->bkt_next = bkt_list;
				bkt_list = ccp->cc_pfilled;
			}
			if (bkt->bkt_next == NULL) {
				/*
				 * Bucket layer allocation returns only 1
				 * bucket; retain current full bucket.
				 */
				mcache_cpu_refill(ccp, bkt, 0);
			} else {
				/*
				 * We got 2 empty buckets from the bucket
				 * layer; release the current full bucket
				 * back to the bucket layer.
				 */
				if (ccp->cc_filled != NULL) {
					ccp->cc_filled->bkt_next = bkt_list;
					bkt_list = ccp->cc_filled;
				}
				mcache_cpu_batch_refill(ccp, bkt, 0);
			}
			mcache_bkt_batch_free(cp, &cp->mc_full, bkt_list);
			continue;
		}
		btp = cp->cache_bkttype;

		/*
		 * We need an empty bucket to put our freed objects into
		 * but couldn't get an empty bucket from the bucket layer;
		 * attempt to allocate one. We do not want to block for
		 * allocation here, and if the bucket allocation fails
		 * we will simply fall through to the slab layer.
		 */
		MCACHE_UNLOCK(&ccp->cc_lock);
		bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
		MCACHE_LOCK(&ccp->cc_lock);

		if (bkt != NULL) {
			/*
			 * We have an empty bucket, but since we drop the
			 * CPU lock above, the cache's bucket size may have
			 * changed. If so, free the bucket and try again.
			 */
			if (ccp->cc_bktsize != btp->bt_bktsize) {
				MCACHE_UNLOCK(&ccp->cc_lock);
				mcache_free(btp->bt_cache, bkt);
				MCACHE_LOCK(&ccp->cc_lock);
				continue;
			}

			/*
			 * Store it in the bucket object since we'll
			 * need to refer to it during bucket destroy;
			 * we can't safely refer to cache_bkttype as
			 * the bucket lock may not be acquired then.
			 */
			bkt->bkt_type = btp;

			/*
			 * We have an empty bucket of the right size;
			 * add it to the bucket layer and try again.
			 */
			ASSERT(bkt->bkt_next == NULL);
			mcache_bkt_batch_free(cp, &cp->mc_empty, bkt);
			continue;
		}

		/*
		 * The bucket layer has no empty buckets; free the
		 * object(s) directly to the slab layer.
		 */
		break;
	}
	MCACHE_UNLOCK(&ccp->cc_lock);

	/* If there is a waiter below, notify it */
	if (cp->mc_waiter_cnt > 0) {
		mcache_notify(cp, MCN_RETRYALLOC);
	}

	/* Advise the slab layer to purge the object(s) */
	(*cp->mc_slab_free)(cp->mc_private, list,
	    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
}
890
891 /*
892 * Cache destruction routine.
893 */
/*
 * Tear down cache 'cp': unlink it from the global list, drain its CPU
 * and bucket layers, defuse its callbacks, and release its locks and
 * backing memory.  The caller must guarantee no further use of cp.
 */
__private_extern__ void
mcache_destroy(mcache_t *cp)
{
	void **pbuf;

	MCACHE_LIST_LOCK();
	LIST_REMOVE(cp, mc_list);
	MCACHE_LIST_UNLOCK();

	mcache_bkt_purge(cp);

	/*
	 * This cache is dead; there should be no further transaction.
	 * If it's still invoked, make sure that it induces a fault.
	 */
	cp->mc_slab_alloc = NULL;
	cp->mc_slab_free = NULL;
	cp->mc_slab_audit = NULL;

	lck_grp_free(cp->mc_bkt_lock_grp);
	lck_grp_free(cp->mc_cpu_lock_grp);
	lck_grp_free(cp->mc_sync_lock_grp);

	/*
	 * TODO: We need to destroy the zone here, but cannot do it
	 * because there is no such way to achieve that. Until then
	 * the memory allocated for the zone structure is leaked.
	 * Once it is achievable, uncomment these lines:
	 *
	 * if (cp->mc_slab_zone != NULL) {
	 *	zdestroy(cp->mc_slab_zone);
	 *	cp->mc_slab_zone = NULL;
	 * }
	 */

	/* Get the original address since we're about to free it */
	pbuf = (void **)((intptr_t)cp - sizeof(void *));

	zfree(mcache_zone, *pbuf);
}
934
935 /*
936 * Internal slab allocator used as a backend for simple caches. The current
937 * implementation uses the zone allocator for simplicity reasons.
938 */
/*
 * Internal slab allocator used as a backend for simple caches. The current
 * implementation uses the zone allocator for simplicity reasons.  Each
 * object is carved out of a zone chunk: the chunk holds a backpointer to
 * its own start (just below the aligned base) so the free path can hand
 * the original address back to zfree().
 */
static unsigned int
mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
    int wait)
{
#pragma unused(wait)
	mcache_t *cp = arg;
	unsigned int need = num;
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
	u_int32_t flags = cp->mc_flags;
	void *buf, *base, **pbuf;
	mcache_obj_t **list = *plist;

	*list = NULL;

	for (;;) {
		/* Z_NOFAIL: this loop never sees a NULL chunk */
		buf = zalloc_flags(cp->mc_slab_zone, Z_WAITOK | Z_NOFAIL);

		/* Get the aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    cp->mc_align);

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;

		/* The object must fit within the chunk after alignment */
		VERIFY(((intptr_t)base + cp->mc_bufsize) <=
		    ((intptr_t)buf + cp->mc_chunksize));

		/*
		 * If auditing is enabled, patternize the contents of
		 * the buffer starting from the 64-bit aligned base to
		 * the end of the buffer; the length is rounded up to
		 * the nearest 64-bit multiply; this is because we use
		 * 64-bit memory access to set/check the pattern.
		 */
		if (flags & MCF_DEBUG) {
			VERIFY(((intptr_t)base + rsize) <=
			    ((intptr_t)buf + cp->mc_chunksize));
			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
		}

		VERIFY(IS_P2ALIGNED(base, cp->mc_align));
		*list = (mcache_obj_t *)base;

		/* Append to the caller's chain and advance the tail */
		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		/* If we got them all, return to mcache */
		if (--need == 0) {
			break;
		}
	}

	return num - need;
}
997
998 /*
999 * Internal slab deallocator used as a backend for simple caches.
1000 */
static void
mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
{
	/*
	 * Slab-layer deallocator backend: walks the object chain and
	 * returns each underlying zone chunk (recovered from the stashed
	 * pointer below the aligned base) to the backing zone.
	 */
	mcache_t *cp = arg;
	mcache_obj_t *nlist;
	/* Length that was patternized at slab-alloc time (64-bit rounded) */
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
	u_int32_t flags = cp->mc_flags;
	void *base;
	void **pbuf;

	for (;;) {
		/* Detach before the zone reclaims the memory */
		nlist = list->obj_next;
		list->obj_next = NULL;

		base = list;
		VERIFY(IS_P2ALIGNED(base, cp->mc_align));

		/* Get the original address since we're about to free it */
		pbuf = (void **)((intptr_t)base - sizeof(void *));

		VERIFY(((intptr_t)base + cp->mc_bufsize) <=
		    ((intptr_t)*pbuf + cp->mc_chunksize));

		if (flags & MCF_DEBUG) {
			VERIFY(((intptr_t)base + rsize) <=
			    ((intptr_t)*pbuf + cp->mc_chunksize));
			/* Check the free pattern is intact (no write-after-free) */
			mcache_audit_free_verify(NULL, base, 0, rsize);
		}

		/* Free it to zone */
		zfree(cp->mc_slab_zone, *pbuf);

		/* No more objects to free; return to mcache */
		if ((list = nlist) == NULL) {
			break;
		}
	}
}
1039
1040 /*
1041 * Internal slab auditor for simple caches.
1042 */
static void
mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mcache_t *cp = arg;
	/* Patternized region length (64-bit rounded, set at slab alloc) */
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof(u_int64_t));
	void *base, **pbuf;

	while (list != NULL) {
		mcache_obj_t *next = list->obj_next;

		base = list;
		VERIFY(IS_P2ALIGNED(base, cp->mc_align));

		/* Get the original address */
		pbuf = (void **)((intptr_t)base - sizeof(void *));

		VERIFY(((intptr_t)base + rsize) <=
		    ((intptr_t)*pbuf + cp->mc_chunksize));

		if (!alloc) {
			/* Object going free: fill it with the free pattern */
			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
		} else {
			/*
			 * Object being allocated: verify the free pattern
			 * is intact while replacing it with the
			 * uninitialized pattern.
			 */
			mcache_audit_free_verify_set(NULL, base, 0, rsize);
		}

		/*
		 * The pattern write above clobbered obj_next (it lives
		 * inside the buffer); restore the link as we advance.
		 */
		list = list->obj_next = next;
	}
}
1071
1072 /*
1073 * Refill the CPU's buckets with bkt and its follower (if any).
1074 */
1075 static void
mcache_cpu_batch_refill(mcache_cpu_t * ccp,mcache_bkt_t * bkt,int objs)1076 mcache_cpu_batch_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1077 {
1078 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1079 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1080 ASSERT(ccp->cc_bktsize > 0);
1081
1082 ccp->cc_filled = bkt;
1083 ccp->cc_objs = objs;
1084 if (__probable(bkt->bkt_next != NULL)) {
1085 ccp->cc_pfilled = bkt->bkt_next;
1086 ccp->cc_pobjs = objs;
1087 bkt->bkt_next = NULL;
1088 } else {
1089 ASSERT(bkt->bkt_next == NULL);
1090 ccp->cc_pfilled = NULL;
1091 ccp->cc_pobjs = -1;
1092 }
1093 }
1094
1095 /*
1096 * Refill the CPU's filled bucket with bkt and save the previous one.
1097 */
1098 static void
mcache_cpu_refill(mcache_cpu_t * ccp,mcache_bkt_t * bkt,int objs)1099 mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
1100 {
1101 ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
1102 (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
1103 ASSERT(ccp->cc_bktsize > 0);
1104
1105 ccp->cc_pfilled = ccp->cc_filled;
1106 ccp->cc_pobjs = ccp->cc_objs;
1107 ccp->cc_filled = bkt;
1108 ccp->cc_objs = objs;
1109 }
1110
1111 /*
1112 * Get one or more buckets from the bucket layer.
1113 */
1114 static uint32_t
mcache_bkt_batch_alloc(mcache_t * cp,mcache_bktlist_t * blp,mcache_bkt_t ** list,uint32_t num)1115 mcache_bkt_batch_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t **list,
1116 uint32_t num)
1117 {
1118 mcache_bkt_t *bkt_list = NULL;
1119 mcache_bkt_t *bkt;
1120 uint32_t need = num;
1121
1122 ASSERT(list != NULL && need > 0);
1123
1124 if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
1125 /*
1126 * The bucket layer lock is held by another CPU; increase
1127 * the contention count so that we can later resize the
1128 * bucket size accordingly.
1129 */
1130 MCACHE_LOCK(&cp->mc_bkt_lock);
1131 cp->mc_bkt_contention++;
1132 }
1133
1134 while ((bkt = blp->bl_list) != NULL) {
1135 blp->bl_list = bkt->bkt_next;
1136 bkt->bkt_next = bkt_list;
1137 bkt_list = bkt;
1138 if (--blp->bl_total < blp->bl_min) {
1139 blp->bl_min = blp->bl_total;
1140 }
1141 blp->bl_alloc++;
1142 if (--need == 0) {
1143 break;
1144 }
1145 }
1146
1147 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1148
1149 *list = bkt_list;
1150
1151 return num - need;
1152 }
1153
1154 /*
1155 * Return one or more buckets to the bucket layer.
1156 */
1157 static void
mcache_bkt_batch_free(mcache_t * cp,mcache_bktlist_t * blp,mcache_bkt_t * bkt)1158 mcache_bkt_batch_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
1159 {
1160 mcache_bkt_t *nbkt;
1161
1162 MCACHE_LOCK(&cp->mc_bkt_lock);
1163 while (bkt != NULL) {
1164 nbkt = bkt->bkt_next;
1165 bkt->bkt_next = blp->bl_list;
1166 blp->bl_list = bkt;
1167 blp->bl_total++;
1168 bkt = nbkt;
1169 }
1170 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1171 }
1172
1173 /*
1174 * Enable the bucket layer of a cache.
1175 */
1176 static void
mcache_cache_bkt_enable(mcache_t * cp)1177 mcache_cache_bkt_enable(mcache_t *cp)
1178 {
1179 mcache_cpu_t *ccp;
1180 unsigned int cpu;
1181
1182 if (cp->mc_flags & MCF_NOCPUCACHE) {
1183 return;
1184 }
1185
1186 for (cpu = 0; cpu < ncpu; cpu++) {
1187 ccp = &cp->mc_cpu[cpu];
1188 MCACHE_LOCK(&ccp->cc_lock);
1189 ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
1190 MCACHE_UNLOCK(&ccp->cc_lock);
1191 }
1192 }
1193
1194 /*
1195 * Purge all buckets from a cache and disable its bucket layer.
1196 */
static void
mcache_bkt_purge(mcache_t *cp)
{
	mcache_cpu_t *ccp;
	mcache_bkt_t *bp, *pbp;
	int objs, pobjs;
	unsigned int cpu;

	for (cpu = 0; cpu < ncpu; cpu++) {
		ccp = &cp->mc_cpu[cpu];

		MCACHE_LOCK(&ccp->cc_lock);

		/* Detach both CPU buckets and disable this CPU's layer */
		bp = ccp->cc_filled;
		pbp = ccp->cc_pfilled;
		objs = ccp->cc_objs;
		pobjs = ccp->cc_pobjs;
		ccp->cc_filled = NULL;
		ccp->cc_pfilled = NULL;
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
		/* Zero bucket size forces alloc/free to bypass the CPU layer */
		ccp->cc_bktsize = 0;

		MCACHE_UNLOCK(&ccp->cc_lock);

		/* Destroy the detached buckets outside the CPU lock */
		if (bp != NULL) {
			mcache_bkt_destroy(cp, bp, objs);
		}
		if (pbp != NULL) {
			mcache_bkt_destroy(cp, pbp, pobjs);
		}
	}

	/* Zero the working set, then reap everything in the bucket layer */
	mcache_bkt_ws_zero(cp);
	mcache_bkt_ws_reap(cp);
}
1233
1234 /*
1235 * Free one or more objects in the bucket to the slab layer,
1236 * and also free the bucket itself.
1237 */
static void
mcache_bkt_destroy(mcache_t *cp, mcache_bkt_t *bkt, int nobjs)
{
	if (nobjs > 0) {
		/* bkt_obj[] fills bottom-up; the top entry heads the chain */
		mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];

		if (cp->mc_flags & MCF_DEBUG) {
			mcache_obj_t *o = top;
			int cnt = 0;

			/*
			 * Verify that the chain of objects in the bucket is
			 * valid. Any mismatch here means a mistake when the
			 * object(s) were freed to the CPU layer, so we panic.
			 */
			while (o != NULL) {
				o = o->obj_next;
				++cnt;
			}
			if (cnt != nobjs) {
				panic("mcache_bkt_destroy: %s cp %p corrupted "
				    "list in bkt %p (nobjs %d actual %d)\n",
				    cp->mc_name, (void *)cp, (void *)bkt,
				    nobjs, cnt);
				/* NOTREACHED */
				__builtin_unreachable();
			}
		}

		/* Advise the slab layer to purge the object(s) */
		(*cp->mc_slab_free)(cp->mc_private, top,
		    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
	}
	/* Return the (now empty) bucket itself to its bucket cache */
	mcache_free(bkt->bkt_type->bt_cache, bkt);
}
1273
1274 /*
1275 * Update the bucket layer working set statistics.
1276 */
1277 static void
mcache_bkt_ws_update(mcache_t * cp)1278 mcache_bkt_ws_update(mcache_t *cp)
1279 {
1280 MCACHE_LOCK(&cp->mc_bkt_lock);
1281
1282 cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
1283 cp->mc_full.bl_min = cp->mc_full.bl_total;
1284 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
1285 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1286
1287 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1288 }
1289
1290 /*
1291 * Mark everything as eligible for reaping (working set is zero).
1292 */
1293 static void
mcache_bkt_ws_zero(mcache_t * cp)1294 mcache_bkt_ws_zero(mcache_t *cp)
1295 {
1296 MCACHE_LOCK(&cp->mc_bkt_lock);
1297
1298 cp->mc_full.bl_reaplimit = cp->mc_full.bl_total;
1299 cp->mc_full.bl_min = cp->mc_full.bl_total;
1300 cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_total;
1301 cp->mc_empty.bl_min = cp->mc_empty.bl_total;
1302
1303 MCACHE_UNLOCK(&cp->mc_bkt_lock);
1304 }
1305
1306 /*
1307 * Reap all buckets that are beyond the working set.
1308 */
1309 static void
mcache_bkt_ws_reap(mcache_t * cp)1310 mcache_bkt_ws_reap(mcache_t *cp)
1311 {
1312 mcache_bkt_t *bkt, *nbkt;
1313 uint32_t reap;
1314
1315 reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
1316 if (reap != 0) {
1317 (void) mcache_bkt_batch_alloc(cp, &cp->mc_full, &bkt, reap);
1318 while (bkt != NULL) {
1319 nbkt = bkt->bkt_next;
1320 bkt->bkt_next = NULL;
1321 mcache_bkt_destroy(cp, bkt, bkt->bkt_type->bt_bktsize);
1322 bkt = nbkt;
1323 }
1324 }
1325
1326 reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
1327 if (reap != 0) {
1328 (void) mcache_bkt_batch_alloc(cp, &cp->mc_empty, &bkt, reap);
1329 while (bkt != NULL) {
1330 nbkt = bkt->bkt_next;
1331 bkt->bkt_next = NULL;
1332 mcache_bkt_destroy(cp, bkt, 0);
1333 bkt = nbkt;
1334 }
1335 }
1336 }
1337
static void
mcache_reap_timeout(thread_call_param_t dummy __unused,
    thread_call_param_t arg)
{
	volatile UInt32 *flag = arg;

	ASSERT(flag == &mcache_reaping);

	/* Cool-off interval elapsed; allow a new reap cycle to start */
	*flag = 0;
}
1348
1349 static void
mcache_reap_done(void * flag)1350 mcache_reap_done(void *flag)
1351 {
1352 uint64_t deadline, leeway;
1353
1354 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1355 &deadline);
1356 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1357 NSEC_PER_SEC, &leeway);
1358 thread_call_enter_delayed_with_leeway(mcache_reap_tcall, flag,
1359 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1360 }
1361
static void
mcache_reap_start(void *arg)
{
	UInt32 *flag = arg;

	ASSERT(flag == &mcache_reaping);

	/* Reap every cache, then schedule clearing of the reaping flag */
	mcache_applyall(mcache_cache_reap);
	mcache_dispatch(mcache_reap_done, flag);
}
1372
__private_extern__ void
mcache_reap(void)
{
	UInt32 *flag = &mcache_reaping;

	/*
	 * Skip if this thread already holds the cache-list lock (the reap
	 * walk would deadlock) or if a reap is already in flight (the
	 * 0->1 compare-and-swap on the flag fails).
	 */
	if (mcache_llock_owner == current_thread() ||
	    !OSCompareAndSwap(0, 1, flag)) {
		return;
	}

	/* Run asynchronously; reaping may be slow */
	mcache_dispatch(mcache_reap_start, flag);
}
1385
1386 __private_extern__ void
mcache_reap_now(mcache_t * cp,boolean_t purge)1387 mcache_reap_now(mcache_t *cp, boolean_t purge)
1388 {
1389 if (purge) {
1390 mcache_bkt_purge(cp);
1391 mcache_cache_bkt_enable(cp);
1392 } else {
1393 mcache_bkt_ws_zero(cp);
1394 mcache_bkt_ws_reap(cp);
1395 }
1396 }
1397
static void
mcache_cache_reap(mcache_t *cp)
{
	/* Per-cache callback applied by mcache_applyall() during a reap */
	mcache_bkt_ws_reap(cp);
}
1403
/*
 * Performs periodic maintenance on a cache.
 */
static void
mcache_cache_update(mcache_t *cp)
{
	int need_bkt_resize = 0;
	int need_bkt_reenable = 0;

	lck_mtx_assert(&mcache_llock, LCK_MTX_ASSERT_OWNED);

	/* Advance the working-set window on both bucket lists */
	mcache_bkt_ws_update(cp);

	/*
	 * Cache resize and post-purge reenable are mutually exclusive.
	 * If the cache was previously purged, there is no point of
	 * increasing the bucket size as there was an indication of
	 * memory pressure on the system.
	 */
	lck_mtx_lock_spin(&cp->mc_sync_lock);
	if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt) {
		need_bkt_reenable = 1;
	}
	lck_mtx_unlock(&cp->mc_sync_lock);

	MCACHE_LOCK(&cp->mc_bkt_lock);
	/*
	 * If the contention count is greater than the threshold, and if
	 * we are not already at the maximum bucket size, increase it.
	 * Otherwise, if this cache was previously purged by the user
	 * then we simply reenable it.
	 */
	if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
	    (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
	    mcache_bkt_contention && !need_bkt_reenable) {
		need_bkt_resize = 1;
	}

	/* Snapshot contention for the next interval's delta */
	cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
	MCACHE_UNLOCK(&cp->mc_bkt_lock);

	/* Dispatch asynchronously; both operations may block */
	if (need_bkt_resize) {
		mcache_dispatch(mcache_cache_bkt_resize, cp);
	} else if (need_bkt_reenable) {
		mcache_dispatch(mcache_cache_enable, cp);
	}
}
1451
1452 /*
1453 * Recompute a cache's bucket size. This is an expensive operation
1454 * and should not be done frequently; larger buckets provide for a
1455 * higher transfer rate with the bucket while smaller buckets reduce
1456 * the memory consumption.
1457 */
static void
mcache_cache_bkt_resize(void *arg)
{
	mcache_t *cp = arg;
	mcache_bkttype_t *btp = cp->cache_bkttype;

	if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
		/*
		 * Purge first so no buckets of the old size linger once
		 * the cache switches to the larger bucket type.
		 */
		mcache_bkt_purge(cp);

		/*
		 * Upgrade to the next bucket type with larger bucket size;
		 * temporarily set the previous contention snapshot to a
		 * negative number to prevent unnecessary resize request.
		 */
		MCACHE_LOCK(&cp->mc_bkt_lock);
		/* ++btp: advance to the next entry of the bucket-type table */
		cp->cache_bkttype = ++btp;
		cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
		MCACHE_UNLOCK(&cp->mc_bkt_lock);

		mcache_cache_enable(cp);
	}
}
1480
1481 /*
1482 * Reenable a previously disabled cache due to purge.
1483 */
1484 static void
mcache_cache_enable(void * arg)1485 mcache_cache_enable(void *arg)
1486 {
1487 mcache_t *cp = arg;
1488
1489 lck_mtx_lock_spin(&cp->mc_sync_lock);
1490 cp->mc_purge_cnt = 0;
1491 cp->mc_enable_cnt = 0;
1492 lck_mtx_unlock(&cp->mc_sync_lock);
1493
1494 mcache_cache_bkt_enable(cp);
1495 }
1496
1497 static void
mcache_update_timeout(__unused void * arg)1498 mcache_update_timeout(__unused void *arg)
1499 {
1500 uint64_t deadline, leeway;
1501
1502 clock_interval_to_deadline(mcache_reap_interval, NSEC_PER_SEC,
1503 &deadline);
1504 clock_interval_to_absolutetime_interval(mcache_reap_interval_leeway,
1505 NSEC_PER_SEC, &leeway);
1506 thread_call_enter_delayed_with_leeway(mcache_update_tcall, NULL,
1507 deadline, leeway, THREAD_CALL_DELAY_LEEWAY);
1508 }
1509
static void
mcache_update(thread_call_param_t arg __unused,
    thread_call_param_t dummy __unused)
{
	/* Periodic maintenance pass over every cache, then re-arm */
	mcache_applyall(mcache_cache_update);
	mcache_update_timeout(NULL);
}
1517
1518 static void
mcache_applyall(void (* func)(mcache_t *))1519 mcache_applyall(void (*func)(mcache_t *))
1520 {
1521 mcache_t *cp;
1522
1523 MCACHE_LIST_LOCK();
1524 LIST_FOREACH(cp, &mcache_head, mc_list) {
1525 func(cp);
1526 }
1527 MCACHE_LIST_UNLOCK();
1528 }
1529
static void
mcache_dispatch(void (*func)(void *), void *arg)
{
	ASSERT(func != NULL);
	/* Run func(arg) asynchronously, ~1ms from now, off this context */
	timeout(func, arg, hz / 1000);
}
1536
__private_extern__ void
mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp,
    struct timeval *base_ts)
{
	struct timeval now, base = { .tv_sec = 0, .tv_usec = 0 };
	void *stack[MCACHE_STACK_DEPTH + 1];
	struct mca_trn *transaction;

	/* Record into the next slot of the circular transaction log */
	transaction = &mca->mca_trns[mca->mca_next_trn];

	mca->mca_addr = addr;
	mca->mca_cache = cp;

	transaction->mca_thread = current_thread();

	bzero(stack, sizeof(stack));
	/* Capture one extra frame, then drop it (this function itself) */
	transaction->mca_depth = (uint16_t)OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
	bcopy(&stack[1], transaction->mca_stack,
	    sizeof(transaction->mca_stack));

	microuptime(&now);
	if (base_ts != NULL) {
		base = *base_ts;
	}
	/* tstamp is in ms relative to base_ts */
	transaction->mca_tstamp = ((now.tv_usec - base.tv_usec) / 1000);
	if ((now.tv_sec - base.tv_sec) > 0) {
		transaction->mca_tstamp += ((now.tv_sec - base.tv_sec) * 1000);
	}

	/* Advance the circular log index */
	mca->mca_next_trn =
	    (mca->mca_next_trn + 1) % mca_trn_max;
}
1570
1571 /*
1572 * N.B.: mcache_set_pattern(), mcache_verify_pattern() and
1573 * mcache_verify_set_pattern() are marked as noinline to prevent the
1574 * compiler from aliasing pointers when they are inlined inside the callers
1575 * (e.g. mcache_audit_free_verify_set()) which would be undefined behavior.
1576 */
1577 __private_extern__ OS_NOINLINE void
mcache_set_pattern(u_int64_t pattern,void * buf_arg,size_t size)1578 mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1579 {
1580 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1581 u_int64_t *buf = (u_int64_t *)buf_arg;
1582
1583 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1584 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1585
1586 while (buf < buf_end) {
1587 *buf++ = pattern;
1588 }
1589 }
1590
1591 __private_extern__ OS_NOINLINE void *
mcache_verify_pattern(u_int64_t pattern,void * buf_arg,size_t size)1592 mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
1593 {
1594 u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
1595 u_int64_t *buf;
1596
1597 VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
1598 VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));
1599
1600 for (buf = buf_arg; buf < buf_end; buf++) {
1601 if (*buf != pattern) {
1602 return buf;
1603 }
1604 }
1605 return NULL;
1606 }
1607
OS_NOINLINE static void *
mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
    size_t size)
{
	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
	u_int64_t *buf;

	VERIFY(IS_P2ALIGNED(buf_arg, sizeof(u_int64_t)));
	VERIFY(IS_P2ALIGNED(size, sizeof(u_int64_t)));

	for (buf = buf_arg; buf < buf_end; buf++) {
		if (*buf != old) {
			/*
			 * Mismatch: roll back the words already rewritten
			 * so the caller can report the buffer in its
			 * pre-verification state.
			 */
			mcache_set_pattern(old, buf_arg,
			    (uintptr_t)buf - (uintptr_t)buf_arg);
			return buf;
		}
		*buf = new;
	}
	return NULL;
}
1628
__private_extern__ void
mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/*
	 * For the "obj_next" pointer in the buffer: it legitimately
	 * differs from the free pattern (it holds the freelist link),
	 * so stamp the pattern over its word before verifying, and
	 * restore the saved link once verification passes.
	 */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
	    (caddr_t)base, size)) != NULL) {
		/* Buffer modified after free; report the offending word */
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	((mcache_obj_t *)addr)->obj_next = next;
}
1652
__private_extern__ void
mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/*
	 * For the "obj_next" pointer in the buffer: it legitimately
	 * differs from the free pattern (it holds the freelist link),
	 * so stamp the pattern over its word before verifying, and
	 * restore the saved link afterwards.
	 */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof(u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	/* Verify the free pattern while replacing it with "uninitialized" */
	if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
	    MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	((mcache_obj_t *)addr)->obj_next = next;
}
1676
/*
 * From this point on we need the real kernel panic(); undo any
 * assert-oriented remapping of the symbol that may be in effect.
 */
#undef panic

/* Format for one logged transaction: thread, depth and saved PC stack */
#define DUMP_TRN_FMT() \
	"%s transaction thread %p saved PC stack (%d deep):\n" \
	"\t%p, %p, %p, %p, %p, %p, %p, %p\n" \
	"\t%p, %p, %p, %p, %p, %p, %p, %p\n"

/* Argument list matching DUMP_TRN_FMT() for transaction slot x */
#define DUMP_TRN_FIELDS(s, x) \
	s, \
	mca->mca_trns[x].mca_thread, mca->mca_trns[x].mca_depth, \
	mca->mca_trns[x].mca_stack[0], mca->mca_trns[x].mca_stack[1], \
	mca->mca_trns[x].mca_stack[2], mca->mca_trns[x].mca_stack[3], \
	mca->mca_trns[x].mca_stack[4], mca->mca_trns[x].mca_stack[5], \
	mca->mca_trns[x].mca_stack[6], mca->mca_trns[x].mca_stack[7], \
	mca->mca_trns[x].mca_stack[8], mca->mca_trns[x].mca_stack[9], \
	mca->mca_trns[x].mca_stack[10], mca->mca_trns[x].mca_stack[11], \
	mca->mca_trns[x].mca_stack[12], mca->mca_trns[x].mca_stack[13], \
	mca->mca_trns[x].mca_stack[14], mca->mca_trns[x].mca_stack[15]

/* Indices of the most recent and the preceding slot in the circular log */
#define MCA_TRN_LAST ((mca->mca_next_trn + mca_trn_max) % mca_trn_max)
#define MCA_TRN_PREV ((mca->mca_next_trn + mca_trn_max - 1) % mca_trn_max)
1698
__private_extern__ char *
mcache_dump_mca(char buf[static DUMP_MCA_BUF_SIZE], mcache_audit_t *mca)
{
	/*
	 * Render the audit record plus its two most recent transactions
	 * into the caller-supplied buffer and return that buffer.
	 */
	snprintf(buf, DUMP_MCA_BUF_SIZE,
	    "mca %p: addr %p, cache %p (%s) nxttrn %d\n"
	    DUMP_TRN_FMT()
	    DUMP_TRN_FMT(),

	    mca, mca->mca_addr, mca->mca_cache,
	    mca->mca_cache ? mca->mca_cache->mc_name : "?",
	    mca->mca_next_trn,

	    DUMP_TRN_FIELDS("last", MCA_TRN_LAST),
	    DUMP_TRN_FIELDS("previous", MCA_TRN_PREV));

	return buf;
}
1716
__private_extern__ void
mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
    int64_t expected, int64_t got)
{
	char buf[DUMP_MCA_BUF_SIZE];

	/* No audit record: report only the raw corruption details */
	if (mca == NULL) {
		panic("mcache_audit: buffer %p modified after free at "
		    "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
		    offset, got, expected);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* Include the decoded audit record (last two transactions) */
	panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
	    "(0x%llx instead of 0x%llx)\n%s\n",
	    addr, offset, got, expected, mcache_dump_mca(buf, mca));
	/* NOTREACHED */
	__builtin_unreachable();
}
1737
/* Backend for the ASSERT()/VERIFY() macros; panics and never returns */
__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	/* NOTREACHED */
	__builtin_unreachable();
}
1746